commit 402fc48bdef70b5f87822d3b762edc1b623fa64a
parent eb7bc2b8a0295c0b1ac45d284bef58bf990e6995
Author: Ethan Long <ethandavidlong@gmail.com>
Date: Fri, 23 Feb 2024 16:33:08 +1100
Started progress on better visualisation
Everything can convert to a custom Graphviz printer that I wrote for
finite automaton. Next I want to get an interactive web-page working
maybe?
Diffstat:
7 files changed, 343 insertions(+), 34 deletions(-)
diff --git a/src/lib/automaton.rs b/src/lib/automaton.rs
@@ -1,4 +1,5 @@
-use petgraph::{dot::Dot, Directed, Graph};
+use petgraph::{graph::NodeIndex, Directed, Graph};
+use std::collections::BTreeMap;
use std::error::Error;
use std::fmt::Display;
@@ -32,17 +33,74 @@ where
fn parse(encoding: T) -> Result<Self, Box<dyn Error>>;
}
-pub fn to_graphviz_dot<T, U, V>(automaton: &T) -> Result<String, Box<dyn Error>>
+pub struct Dot<V>
where
- T: TryInto<Graph<U, V, Directed>>,
- T: Clone,
- U: std::fmt::Debug,
- V: std::fmt::Debug,
- <T as TryInto<Graph<U, V, Directed>>>::Error: std::error::Error,
- <T as TryInto<Graph<U, V, Directed>>>::Error: 'static,
+ V: Display,
{
- let graph: Graph<U, V, Directed> = automaton.clone().try_into()?;
- Ok(format!("{:?}", Dot::new(&graph)))
+ graph: Graph<u64, V, Directed>,
+ initial_state: (u64, NodeIndex),
+ final_states: BTreeMap<u64, NodeIndex>,
+}
+
+impl<V> Dot<V>
+where
+ V: Display,
+{
+ pub fn new(
+ graph: Graph<u64, V, Directed>,
+ initial_state: (u64, NodeIndex),
+ final_states: BTreeMap<u64, NodeIndex>,
+ ) -> Dot<V> {
+ Dot {
+ graph,
+ initial_state,
+ final_states,
+ }
+ }
+}
+
+impl<V> Display for Dot<V>
+where
+ V: Display,
+{
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ writeln!(f, "digraph {{")?;
+ writeln!(f, " entrypoint [ label = \"\", shape = none ]")?;
+
+ for (id, label) in self
+ .graph
+ .node_indices()
+ .filter_map(|id| Some((id, *self.graph.node_weight(id)?)))
+ {
+ let shape = if self.final_states.contains_key(&label) {
+ "doublecircle"
+ } else {
+ "circle"
+ };
+ writeln!(
+ f,
+ " {} [ label = \"{}\", shape = {} ]",
+ id.index(),
+ label,
+ shape
+ )?;
+ }
+ writeln!(f, " entrypoint -> {}", self.initial_state.1.index())?;
+ for (from, to, transition) in self.graph.edge_indices().filter_map(|ex| {
+ let (from_ix, to_ix) = self.graph.edge_endpoints(ex)?;
+ let label = self.graph.edge_weight(ex)?.to_string();
+ Some((from_ix, to_ix, label))
+ }) {
+ writeln!(
+ f,
+ " {} -> {} [ label = \"{}\" ]",
+ from.index(),
+ to.index(),
+ transition
+ )?;
+ }
+ writeln!(f, "}}")
+ }
}
#[cfg(test)]
diff --git a/src/lib/dfa.rs b/src/lib/dfa.rs
@@ -1,13 +1,13 @@
use core::fmt::Display;
use petgraph::{Directed, Graph};
use std::{
- collections::{HashMap, HashSet},
+ collections::{BTreeMap, HashMap, HashSet},
error::Error,
fmt,
panic::Location,
};
-use crate::{to_graphviz_dot, Automaton, Encodable, FiniteStateAutomaton};
+use crate::{Automaton, Dot, Encodable, FiniteStateAutomaton};
use serde_json::{from_str, Value};
#[derive(Debug)]
@@ -60,6 +60,15 @@ pub enum Transition {
Wildcard,
}
+impl Display for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ Transition::Char(c) => write!(f, "'{}'", c),
+ Transition::Wildcard => write!(f, "•"),
+ }
+ }
+}
+
#[derive(Clone, Debug, PartialEq)]
pub struct DFA {
pub alphabet: HashSet<char>,
@@ -178,7 +187,7 @@ impl Display for DFA {
writeln!(f, "Initial State: {}", initial_state_str)?;
writeln!(f, "Final States: {}", final_states_str)?;
writeln!(f, "Transition Table:\n{}", transition_table_str)?;
- let graphviz_repr = to_graphviz_dot(self);
+ let graphviz_repr: Result<Dot<Transition>, DFAParseError> = self.try_into();
match graphviz_repr {
Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s),
Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e),
@@ -287,9 +296,9 @@ impl Encodable<Value> for DFA {
}
}
-impl TryFrom<DFA> for Graph<u64, Transition, Directed> {
+impl TryFrom<&DFA> for Dot<Transition> {
type Error = DFAParseError;
- fn try_from(dfa: DFA) -> Result<Self, Self::Error> {
+ fn try_from(dfa: &DFA) -> Result<Self, Self::Error> {
let mut graph: Graph<u64, Transition, Directed> = Graph::new();
let states: HashMap<u64, _> = dfa.states.iter().map(|&s| (s, graph.add_node(s))).collect();
@@ -311,6 +320,23 @@ impl TryFrom<DFA> for Graph<u64, Transition, Directed> {
}
}
- Ok(graph)
+ let initial_state_id = *states.get(&dfa.initial_state).ok_or(DFAParseError::error(
+ "Unable to get initial state graph index!",
+ ))?;
+
+ let final_states = dfa
+ .final_states
+ .iter()
+ .map(|s| states.get(s).map(|&ix| (*s, ix)))
+ .collect::<Option<BTreeMap<u64, _>>>()
+ .ok_or(DFAParseError::error(
+ "Unable to get the graph index of a final state!",
+ ))?;
+
+ Ok(Self::new(
+ graph,
+ (dfa.initial_state, initial_state_id),
+ final_states,
+ ))
}
}
diff --git a/src/lib/enfa.rs b/src/lib/enfa.rs
@@ -1,13 +1,13 @@
use crate::{
dfa::{self, DFA},
nfa::{self, NFA},
- to_graphviz_dot, Automaton, Encodable, FiniteStateAutomaton,
+ Automaton, Dot, Encodable, FiniteStateAutomaton,
};
use core::fmt::Display;
use petgraph::{Directed, Graph};
use serde_json::{from_str, Value};
use std::{
- collections::{HashMap, HashSet},
+ collections::{BTreeMap, HashMap, HashSet},
error::Error,
panic::Location,
};
@@ -58,6 +58,16 @@ pub enum Transition {
Wildcard,
}
+impl Display for Transition {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Transition::Char(c) => write!(f, "'{}'", c),
+ Transition::Epsilon => write!(f, "ε"),
+ Transition::Wildcard => write!(f, "•"),
+ }
+ }
+}
+
impl From<nfa::Transition> for Transition {
fn from(transition: nfa::Transition) -> Self {
match transition {
@@ -354,7 +364,7 @@ impl Display for ENFA {
writeln!(f, "Initial State: {}", initial_state_str)?;
writeln!(f, "Final States: {}", final_states_str)?;
writeln!(f, "Transition Table:\n{}", transition_table_str)?;
- let graphviz_repr = to_graphviz_dot(self);
+ let graphviz_repr: Result<Dot<Transition>, ENFAParseError> = self.try_into();
match graphviz_repr {
Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s),
Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e),
@@ -489,9 +499,9 @@ impl Encodable<Value> for ENFA {
}
}
-impl TryFrom<ENFA> for Graph<u64, Transition, Directed> {
+impl TryFrom<&ENFA> for Dot<Transition> {
type Error = ENFAParseError;
- fn try_from(enfa: ENFA) -> Result<Self, Self::Error> {
+ fn try_from(enfa: &ENFA) -> Result<Self, Self::Error> {
let mut graph: Graph<u64, Transition, Directed> = Graph::new();
let states: HashMap<u64, _> = enfa
@@ -522,6 +532,25 @@ impl TryFrom<ENFA> for Graph<u64, Transition, Directed> {
}
}
- Ok(graph)
+ let initial_state_id = *states
+ .get(&enfa.initial_state)
+ .ok_or(ENFAParseError::error(
+ "Unable to get initial state graph index!",
+ ))?;
+
+ let final_states = enfa
+ .final_states
+ .iter()
+ .map(|s| states.get(s).map(|&ix| (*s, ix)))
+ .collect::<Option<BTreeMap<u64, _>>>()
+ .ok_or(ENFAParseError::error(
+ "Unable to get the graph index of a final state!",
+ ))?;
+
+ Ok(Self::new(
+ graph,
+ (enfa.initial_state, initial_state_id),
+ final_states,
+ ))
}
}
diff --git a/src/lib/graph_enfa.rs b/src/lib/graph_enfa.rs
@@ -1,11 +1,11 @@
use crate::{
dfa,
enfa::{self, ENFA},
- to_graphviz_dot, Automaton, FiniteStateAutomaton,
+ Automaton, Dot, FiniteStateAutomaton,
};
use petgraph::{graph::NodeIndex, visit::EdgeRef, Directed, Graph};
use std::{
- collections::{HashMap, HashSet},
+ collections::{BTreeMap, HashMap, HashSet},
error::Error,
fmt::Display,
panic::Location,
@@ -46,6 +46,16 @@ pub enum Transition {
Epsilon,
}
+impl Display for Transition {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Transition::Char(c) => write!(f, "'{}'", c),
+ Transition::Wildcard => write!(f, "•"),
+ Transition::Epsilon => write!(f, "ε"),
+ }
+ }
+}
+
impl From<Transition> for enfa::Transition {
fn from(transition: Transition) -> Self {
match transition {
@@ -396,7 +406,7 @@ impl Display for GraphENFA {
writeln!(f, "States: {}", states_str)?;
writeln!(f, "Initial State: {}", initial_state_str)?;
writeln!(f, "Final States: {}", final_states_str)?;
- let graphviz_repr = to_graphviz_dot(self);
+ let graphviz_repr: Result<Dot<Transition>, GraphENFAError> = self.try_into();
match graphviz_repr {
Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s),
Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e),
@@ -404,8 +414,41 @@ impl Display for GraphENFA {
}
}
-impl From<GraphENFA> for Graph<u64, Transition, Directed> {
- fn from(graph_enfa: GraphENFA) -> Self {
- graph_enfa.graph
+impl From<&GraphENFA> for Graph<u64, Transition, Directed> {
+ fn from(graph_enfa: &GraphENFA) -> Self {
+ graph_enfa.graph.clone()
+ }
+}
+
+impl TryFrom<&GraphENFA> for Dot<Transition> {
+ type Error = GraphENFAError;
+ fn try_from(graph_enfa: &GraphENFA) -> Result<Self, Self::Error> {
+ let states = graph_enfa
+ .states
+ .iter()
+ .map(|s| graph_enfa.state_id_to_index_map.get(s).map(|&ix| (*s, ix)))
+ .collect::<Option<BTreeMap<u64, _>>>()
+ .ok_or(GraphENFAError::error(
+ "Unable to get the graph index of one of the states!",
+ ))?;
+ let initial_state_id =
+ *states
+ .get(&graph_enfa.initial_state)
+ .ok_or(GraphENFAError::error(
+ "Unable to get the graph index of the initial state!",
+ ))?;
+ let final_states = graph_enfa
+ .final_states
+ .iter()
+ .map(|s| states.get(s).map(|&ix| (*s, ix)))
+ .collect::<Option<BTreeMap<u64, _>>>()
+ .ok_or(GraphENFAError::error(
+ "Unable to get the graph index of a final state!",
+ ))?;
+ Ok(Self::new(
+ graph_enfa.into(),
+ (graph_enfa.initial_state, initial_state_id),
+ final_states,
+ ))
}
}
diff --git a/src/lib/nfa.rs b/src/lib/nfa.rs
@@ -1,11 +1,11 @@
use crate::{
dfa::{self, DFA},
- to_graphviz_dot, Automaton, Encodable, FiniteStateAutomaton,
+ Automaton, Dot, Encodable, FiniteStateAutomaton,
};
use core::fmt::Display;
use petgraph::{Directed, Graph};
use std::{
- collections::{BTreeSet, HashMap, HashSet},
+ collections::{BTreeMap, BTreeSet, HashMap, HashSet},
error::Error,
fmt,
panic::Location,
@@ -58,6 +58,15 @@ pub enum Transition {
Wildcard,
}
+impl Display for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ Transition::Char(c) => write!(f, "'{}'", c),
+ Transition::Wildcard => write!(f, "•"),
+ }
+ }
+}
+
// TODO: From doesn't seem to be 2-way... Maybe I'm using it wrong?
impl From<dfa::Transition> for Transition {
fn from(transition: dfa::Transition) -> Self {
@@ -326,7 +335,7 @@ impl Display for NFA {
writeln!(f, "Initial State: {}", initial_state_str)?;
writeln!(f, "Final States: {}", final_states_str)?;
writeln!(f, "Transition Table:\n{}", transition_table_str)?;
- let graphviz_repr = to_graphviz_dot(self);
+ let graphviz_repr: Result<Dot<Transition>, NFAParseError> = self.try_into();
match graphviz_repr {
Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s),
Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e),
@@ -437,9 +446,9 @@ impl Encodable<Value> for NFA {
}
}
-impl TryFrom<NFA> for Graph<u64, Transition, Directed> {
+impl TryFrom<&NFA> for Dot<Transition> {
type Error = NFAParseError;
- fn try_from(nfa: NFA) -> Result<Self, Self::Error> {
+ fn try_from(nfa: &NFA) -> Result<Self, Self::Error> {
let mut graph: Graph<u64, Transition, Directed> = Graph::new();
let states: HashMap<u64, _> = nfa.states.iter().map(|&s| (s, graph.add_node(s))).collect();
@@ -465,6 +474,23 @@ impl TryFrom<NFA> for Graph<u64, Transition, Directed> {
}
}
- Ok(graph)
+ let initial_state_id = *states.get(&nfa.initial_state).ok_or(NFAParseError::error(
+ "Unable to get initial state graph index!",
+ ))?;
+
+ let final_states = nfa
+ .final_states
+ .iter()
+ .map(|s| states.get(s).map(|&ix| (*s, ix)))
+ .collect::<Option<BTreeMap<u64, _>>>()
+ .ok_or(NFAParseError::error(
+ "Unable to get the graph index of a final state!",
+ ))?;
+
+ Ok(Self::new(
+ graph,
+ (nfa.initial_state, initial_state_id),
+ final_states,
+ ))
}
}
diff --git a/src/lib/regex.rs b/src/lib/regex.rs
@@ -264,12 +264,14 @@ impl Regex {
}
}
+/// Tokenisation
impl Encodable<&str> for Regex {
fn parse(regex_str: &str) -> Result<Self, Box<dyn Error>> {
Self::parse(Self::tokenise(regex_str)?)
}
}
+/// Parsing to an AST
impl Encodable<Vec<Either<char, RegexToken>>> for Regex {
fn parse(tokens: Vec<Either<char, RegexToken>>) -> Result<Self, Box<dyn Error>> {
let parse_tree = Regex::make_grammar_tree(tokens)?;
@@ -277,7 +279,9 @@ impl Encodable<Vec<Either<char, RegexToken>>> for Regex {
}
}
+/// Various fns for converting from the AST to an ENFA graph
impl GraphENFA {
+ /// Adds a nonterminal onto the DFA directly after `from`
fn add_non_terminal(
&mut self,
nonterminal: RegexNonTerminal,
@@ -292,6 +296,7 @@ impl GraphENFA {
}
}
+ /// Concats a terminal onto the DFA directly after `from`
fn add_terminal(&mut self, terminal: RegexTerminal, from: u64) -> Result<u64, GraphENFAError> {
let new_state = self.new_state()?;
@@ -352,6 +357,7 @@ impl GraphENFA {
}
}
+/// Convert AST to a graph ENFA, then convert that ENFA to a DFA.
impl Encodable<RegexNonTerminal> for Regex {
fn parse(parse_tree: RegexNonTerminal) -> Result<Self, Box<dyn Error>> {
let (mut enfa, current_state) = GraphENFA::new();
diff --git a/src/lib/tests/regex_tests.rs b/src/lib/tests/regex_tests.rs
@@ -18,6 +18,12 @@ fn tokeniser() {
const PARSE_FAIL_STR: &str = "Unable to parse regex!";
+/// Tests if given a regex that is a concatenation of a bunch of random characters, the regex will recognise itself.
+/// The direct correlation between regex source string and accepted input breaks down when you consider escape sequences, but the concept still holds.
+///
+/// E.g:
+/// - The regex `abdfa` should accept the string `"abdfa"`.
+/// - The regex `a\\cd` should accept the string `"a\cd"`.
#[test]
fn accepts_long_concats() {
let alphabet = [
@@ -46,6 +52,10 @@ fn accepts_long_concats() {
}
}
+/// Tests a bunch of different wildcard scenarios and edge cases.
+///
+/// E.g:
+/// - The regex `a.*a` should recognise all strings starting and ending with `'a'`.
#[test]
fn wildcard_tests() {
let alphabet = [
@@ -72,3 +82,114 @@ fn wildcard_tests() {
assert!(regex.accepts(input_str.as_str()))
}
}
+
+/// Tests Kleene groups, both the Kleene star and the "Kleene plus" sugar.
+///
+/// E.g:
+/// - The regex `a*` should recognise zero or more `'a'`'s.
+/// - The regex `a+` should recognise one or more `'a'`'s.
+/// - The regex `(a*)*` should recognise zero or more groups of zero or more `'a'`'s.
+/// - The regex `(a+)*` should recognise zero or more groups of one or more `'a'`'s.
+/// - The regex `(a+)+` should recognise one or more groups of one or more `'a'`'s.
+/// - The regex `a+b` should recognise one or more `'a'`'s all followed by a `'b'`.
+/// - The regex `(a+b)+` should recognise one or more groups of one or more `'a'`'s followed by a `'b'`.
+#[test]
+fn kleene_tests() {
+ let alphabet = [
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
+ "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
+ "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", " ",
+ ];
+ let mut rng = rand::thread_rng();
+
+ let regex = Regex::parse("a*").expect(PARSE_FAIL_STR);
+ for len in 0..1000 {
+ let s: String = (0..len).map(|_| 'a').collect();
+ assert!(regex.accepts(s.as_str()));
+ }
+
+ let regex = Regex::parse("a+").expect(PARSE_FAIL_STR);
+ assert!(!regex.accepts(""));
+ for len in 1..1000 {
+ let s: String = (0..len).map(|_| 'a').collect();
+ assert!(regex.accepts(s.as_str()));
+ }
+
+ let regex = Regex::parse("(a*)*").expect(PARSE_FAIL_STR);
+ for n_substrings in 0..100 {
+ let s: String = (0..n_substrings)
+ .map(|_| (0..rng.gen_range(1..100)).map(|_| 'a').collect::<String>())
+ .collect();
+ assert!(regex.accepts(s.as_str()));
+ }
+
+ let regex = Regex::parse("(a+)*").expect(PARSE_FAIL_STR);
+ for n_substrings in 0..100 {
+ let s: String = (0..n_substrings)
+ .map(|_| (1..rng.gen_range(1..100)).map(|_| 'a').collect::<String>())
+ .collect();
+ assert!(regex.accepts(s.as_str()));
+ }
+
+ let regex = Regex::parse("(a+)+").expect(PARSE_FAIL_STR);
+ assert!(!regex.accepts(""));
+ for n_substrings in 1..100 {
+ let s: String = (0..n_substrings)
+ .map(|_| (1..rng.gen_range(1..100)).map(|_| 'a').collect::<String>())
+ .collect();
+ assert!(regex.accepts(s.as_str()));
+ }
+
+ let regex = Regex::parse("a+b").expect(PARSE_FAIL_STR);
+ // Should accept any number of `a`'s followed by a `b`
+ for len in 2..1000 {
+ let s: String = (0..len)
+ .map(|i| if i != len - 1 { 'a' } else { 'b' })
+ .collect();
+ assert!(regex.accepts(s.as_str()));
+ }
+ // Should not accept multiple `b`'s
+ for len in 3..1000 {
+ let n_bs = rng.gen_range(2..len);
+ let s: String = (0..len)
+ .map(|i| if i < len - n_bs { 'a' } else { 'b' })
+ .collect();
+ assert!(!regex.accepts(s.as_str()));
+ }
+ // Should not accept just `b`
+ assert!(!regex.accepts("b"));
+ // Should not accept just `a`'s
+ for len in 1..1000 {
+ let s: String = (0..len).map(|_| 'a').collect();
+ assert!(!regex.accepts(s.as_str()));
+ }
+ // Should not accept any other char in the middle of the `a`'s
+ for len in 3..1000 {
+ let rand_index = rng.gen_range(1..len - 1);
+ let s: String = (0..len)
+ .map(|i| {
+ if i == rand_index {
+ alphabet[rng.gen_range(2..alphabet.len())] // Can't be 'a' or 'b'
+ } else if i != len - 1 {
+ "a"
+ } else {
+ "b"
+ }
+ })
+ .collect();
+ assert!(!regex.accepts(s.as_str()));
+ }
+
+ let regex = Regex::parse("(a+b)+").expect(PARSE_FAIL_STR);
+ for n_substrings in 1..100 {
+ let s: String = (0..n_substrings)
+ .map(|_| {
+ let substr_len = rng.gen_range(2..100);
+ (0..substr_len)
+ .map(|i| if i != substr_len - 1 { 'a' } else { 'b' })
+ .collect::<String>()
+ })
+ .collect();
+ assert!(regex.accepts(s.as_str()));
+ }
+}