automaton

An automaton library written in Rust
Log | Files | Refs

commit 402fc48bdef70b5f87822d3b762edc1b623fa64a
parent eb7bc2b8a0295c0b1ac45d284bef58bf990e6995
Author: Ethan Long <ethandavidlong@gmail.com>
Date:   Fri, 23 Feb 2024 16:33:08 +1100

Started progress on better visualisation

Everything can convert to a custom Graphviz printer that I wrote for
finite automaton. Next I want to get an interactive web-page working
maybe?

Diffstat:
Msrc/lib/automaton.rs | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
Msrc/lib/dfa.rs | 38++++++++++++++++++++++++++++++++------
Msrc/lib/enfa.rs | 41+++++++++++++++++++++++++++++++++++------
Msrc/lib/graph_enfa.rs | 55+++++++++++++++++++++++++++++++++++++++++++++++++------
Msrc/lib/nfa.rs | 38++++++++++++++++++++++++++++++++------
Msrc/lib/regex.rs | 6++++++
Msrc/lib/tests/regex_tests.rs | 121+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 343 insertions(+), 34 deletions(-)

diff --git a/src/lib/automaton.rs b/src/lib/automaton.rs @@ -1,4 +1,5 @@ -use petgraph::{dot::Dot, Directed, Graph}; +use petgraph::{graph::NodeIndex, Directed, Graph}; +use std::collections::BTreeMap; use std::error::Error; use std::fmt::Display; @@ -32,17 +33,74 @@ where fn parse(encoding: T) -> Result<Self, Box<dyn Error>>; } -pub fn to_graphviz_dot<T, U, V>(automaton: &T) -> Result<String, Box<dyn Error>> +pub struct Dot<V> where - T: TryInto<Graph<U, V, Directed>>, - T: Clone, - U: std::fmt::Debug, - V: std::fmt::Debug, - <T as TryInto<Graph<U, V, Directed>>>::Error: std::error::Error, - <T as TryInto<Graph<U, V, Directed>>>::Error: 'static, + V: Display, { - let graph: Graph<U, V, Directed> = automaton.clone().try_into()?; - Ok(format!("{:?}", Dot::new(&graph))) + graph: Graph<u64, V, Directed>, + initial_state: (u64, NodeIndex), + final_states: BTreeMap<u64, NodeIndex>, +} + +impl<V> Dot<V> +where + V: Display, +{ + pub fn new( + graph: Graph<u64, V, Directed>, + initial_state: (u64, NodeIndex), + final_states: BTreeMap<u64, NodeIndex>, + ) -> Dot<V> { + Dot { + graph, + initial_state, + final_states, + } + } +} + +impl<V> Display for Dot<V> +where + V: Display, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "digraph {{")?; + writeln!(f, " entrypoint [ label = \"\", shape = none ]")?; + + for (id, label) in self + .graph + .node_indices() + .filter_map(|id| Some((id, *self.graph.node_weight(id)?))) + { + let shape = if self.final_states.contains_key(&label) { + "doublecircle" + } else { + "circle" + }; + writeln!( + f, + " {} [ label = \"{}\", shape = {} ]", + id.index(), + label, + shape + )?; + } + writeln!(f, " entrypoint -> {}", self.initial_state.1.index())?; + for (from, to, transition) in self.graph.edge_indices().filter_map(|ex| { + let (from_ix, to_ix) = self.graph.edge_endpoints(ex)?; + let label = self.graph.edge_weight(ex)?.to_string(); + Some((from_ix, to_ix, label)) + }) { + writeln!( + f, + " {} -> {} [ label = \"{}\" ]", + from.index(), + to.index(), + transition + )?; + } + writeln!(f, "}}") + } } #[cfg(test)] diff --git a/src/lib/dfa.rs b/src/lib/dfa.rs @@ -1,13 +1,13 @@ use core::fmt::Display; use petgraph::{Directed, Graph}; use std::{ - collections::{HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, error::Error, fmt, panic::Location, }; -use crate::{to_graphviz_dot, Automaton, Encodable, FiniteStateAutomaton}; +use crate::{Automaton, Dot, Encodable, FiniteStateAutomaton}; use serde_json::{from_str, Value}; #[derive(Debug)] @@ -60,6 +60,15 @@ pub enum Transition { Wildcard, } +impl Display for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Transition::Char(c) => write!(f, "'{}'", c), + Transition::Wildcard => write!(f, "•"), + } + } +} + #[derive(Clone, Debug, PartialEq)] pub struct DFA { pub alphabet: HashSet<char>, @@ -178,7 +187,7 @@ impl Display for DFA { writeln!(f, "Initial State: {}", initial_state_str)?; writeln!(f, "Final States: {}", final_states_str)?; writeln!(f, "Transition Table:\n{}", transition_table_str)?; - let graphviz_repr = to_graphviz_dot(self); + let graphviz_repr: Result<Dot<Transition>, DFAParseError> = self.try_into(); match graphviz_repr { Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s), Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e), @@ -287,9 +296,9 @@ impl Encodable<Value> for DFA { } } -impl TryFrom<DFA> for Graph<u64, Transition, Directed> { +impl TryFrom<&DFA> for Dot<Transition> { type Error = DFAParseError; - fn try_from(dfa: DFA) -> Result<Self, Self::Error> { + fn try_from(dfa: &DFA) -> Result<Self, Self::Error> { let mut graph: Graph<u64, Transition, Directed> = Graph::new(); let states: HashMap<u64, _> = dfa.states.iter().map(|&s| (s, graph.add_node(s))).collect(); @@ -311,6 +320,23 @@ impl TryFrom<DFA> for Graph<u64, Transition, Directed> { } } - Ok(graph) + let initial_state_id = *states.get(&dfa.initial_state).ok_or(DFAParseError::error( + "Unable to get initial state graph index!", + ))?; + + let final_states = dfa + .final_states + .iter() + .map(|s| states.get(s).map(|&ix| (*s, ix))) + .collect::<Option<BTreeMap<u64, _>>>() + .ok_or(DFAParseError::error( + "Unable to get the graph index of a final state!", + ))?; + + Ok(Self::new( + graph, + (dfa.initial_state, initial_state_id), + final_states, + )) } } diff --git a/src/lib/enfa.rs b/src/lib/enfa.rs @@ -1,13 +1,13 @@ use crate::{ dfa::{self, DFA}, nfa::{self, NFA}, - to_graphviz_dot, Automaton, Encodable, FiniteStateAutomaton, + Automaton, Dot, Encodable, FiniteStateAutomaton, }; use core::fmt::Display; use petgraph::{Directed, Graph}; use serde_json::{from_str, Value}; use std::{ - collections::{HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, error::Error, panic::Location, }; @@ -58,6 +58,16 @@ pub enum Transition { Wildcard, } +impl Display for Transition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Transition::Char(c) => write!(f, "'{}'", c), + Transition::Epsilon => write!(f, "ε"), + Transition::Wildcard => write!(f, "•"), + } + } +} + impl From<nfa::Transition> for Transition { fn from(transition: nfa::Transition) -> Self { match transition { @@ -354,7 +364,7 @@ impl Display for ENFA { writeln!(f, "Initial State: {}", initial_state_str)?; writeln!(f, "Final States: {}", final_states_str)?; writeln!(f, "Transition Table:\n{}", transition_table_str)?; - let graphviz_repr = to_graphviz_dot(self); + let graphviz_repr: Result<Dot<Transition>, ENFAParseError> = self.try_into(); match graphviz_repr { Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s), Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e), @@ -489,9 +499,9 @@ impl Encodable<Value> for ENFA { } } -impl TryFrom<ENFA> for Graph<u64, Transition, Directed> { +impl TryFrom<&ENFA> for Dot<Transition> { type Error = ENFAParseError; - fn try_from(enfa: ENFA) -> Result<Self, Self::Error> { + fn try_from(enfa: &ENFA) -> Result<Self, Self::Error> { let mut graph: Graph<u64, Transition, Directed> = Graph::new(); let states: HashMap<u64, _> = enfa @@ -522,6 +532,25 @@ impl TryFrom<ENFA> for Graph<u64, Transition, Directed> { } } - Ok(graph) + let initial_state_id = *states + .get(&enfa.initial_state) + .ok_or(ENFAParseError::error( + "Unable to get initial state graph index!", + ))?; + + let final_states = enfa + .final_states + .iter() + .map(|s| states.get(s).map(|&ix| (*s, ix))) + .collect::<Option<BTreeMap<u64, _>>>() + .ok_or(ENFAParseError::error( + "Unable to get the graph index of a final state!", + ))?; + + Ok(Self::new( + graph, + (enfa.initial_state, initial_state_id), + final_states, + )) } } diff --git a/src/lib/graph_enfa.rs b/src/lib/graph_enfa.rs @@ -1,11 +1,11 @@ use crate::{ dfa, enfa::{self, ENFA}, - to_graphviz_dot, Automaton, FiniteStateAutomaton, + Automaton, Dot, FiniteStateAutomaton, }; use petgraph::{graph::NodeIndex, visit::EdgeRef, Directed, Graph}; use std::{ - collections::{HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, error::Error, fmt::Display, panic::Location, @@ -46,6 +46,16 @@ pub enum Transition { Epsilon, } +impl Display for Transition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Transition::Char(c) => write!(f, "'{}'", c), + Transition::Wildcard => write!(f, "•"), + Transition::Epsilon => write!(f, "ε"), + } + } +} + impl From<Transition> for enfa::Transition { fn from(transition: Transition) -> Self { match transition { @@ -396,7 +406,7 @@ impl Display for GraphENFA { writeln!(f, "States: {}", states_str)?; writeln!(f, "Initial State: {}", initial_state_str)?; writeln!(f, "Final States: {}", final_states_str)?; - let graphviz_repr = to_graphviz_dot(self); + let graphviz_repr: Result<Dot<Transition>, GraphENFAError> = self.try_into(); match graphviz_repr { Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s), Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e), @@ -404,8 +414,41 @@ impl Display for GraphENFA { } } -impl From<GraphENFA> for Graph<u64, Transition, Directed> { - fn from(graph_enfa: GraphENFA) -> Self { - graph_enfa.graph +impl From<&GraphENFA> for Graph<u64, Transition, Directed> { + fn from(graph_enfa: &GraphENFA) -> Self { + graph_enfa.graph.clone() + } +} + +impl TryFrom<&GraphENFA> for Dot<Transition> { + type Error = GraphENFAError; + fn try_from(graph_enfa: &GraphENFA) -> Result<Self, Self::Error> { + let states = graph_enfa + .states + .iter() + .map(|s| graph_enfa.state_id_to_index_map.get(s).map(|&ix| (*s, ix))) + .collect::<Option<BTreeMap<u64, _>>>() + .ok_or(GraphENFAError::error( + "Unable to get the graph index of one of the states!", + ))?; + let initial_state_id = + *states + .get(&graph_enfa.initial_state) + .ok_or(GraphENFAError::error( + "Unable to get the graph index of the initial state!", + ))?; + let final_states = graph_enfa + .final_states + .iter() + .map(|s| states.get(s).map(|&ix| (*s, ix))) + .collect::<Option<BTreeMap<u64, _>>>() + .ok_or(GraphENFAError::error( + "Unable to get the graph index of a final state!", + ))?; + Ok(Self::new( + graph_enfa.into(), + (graph_enfa.initial_state, initial_state_id), + final_states, + )) } } diff --git a/src/lib/nfa.rs b/src/lib/nfa.rs @@ -1,11 +1,11 @@ use crate::{ dfa::{self, DFA}, - to_graphviz_dot, Automaton, Encodable, FiniteStateAutomaton, + Automaton, Dot, Encodable, FiniteStateAutomaton, }; use core::fmt::Display; use petgraph::{Directed, Graph}; use std::{ - collections::{BTreeSet, HashMap, HashSet}, + collections::{BTreeMap, BTreeSet, HashMap, HashSet}, error::Error, fmt, panic::Location, @@ -58,6 +58,15 @@ pub enum Transition { Wildcard, } +impl Display for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Transition::Char(c) => write!(f, "'{}'", c), + Transition::Wildcard => write!(f, "•"), + } + } +} + // TODO: From doesn't seem to be 2-way... Maybe I'm using it wrong? impl From<dfa::Transition> for Transition { fn from(transition: dfa::Transition) -> Self { @@ -326,7 +335,7 @@ impl Display for NFA { writeln!(f, "Initial State: {}", initial_state_str)?; writeln!(f, "Final States: {}", final_states_str)?; writeln!(f, "Transition Table:\n{}", transition_table_str)?; - let graphviz_repr = to_graphviz_dot(self); + let graphviz_repr: Result<Dot<Transition>, NFAParseError> = self.try_into(); match graphviz_repr { Ok(s) => writeln!(f, "Graphviz Representation:\n{}", s), Err(e) => writeln!(f, "Error getting Graphviz representation: {}", e), @@ -437,9 +446,9 @@ impl Encodable<Value> for NFA { } } -impl TryFrom<NFA> for Graph<u64, Transition, Directed> { +impl TryFrom<&NFA> for Dot<Transition> { type Error = NFAParseError; - fn try_from(nfa: NFA) -> Result<Self, Self::Error> { + fn try_from(nfa: &NFA) -> Result<Self, Self::Error> { let mut graph: Graph<u64, Transition, Directed> = Graph::new(); let states: HashMap<u64, _> = nfa.states.iter().map(|&s| (s, graph.add_node(s))).collect(); @@ -465,6 +474,23 @@ impl TryFrom<NFA> for Graph<u64, Transition, Directed> { } } - Ok(graph) + let initial_state_id = *states.get(&nfa.initial_state).ok_or(NFAParseError::error( + "Unable to get initial state graph index!", + ))?; + + let final_states = nfa + .final_states + .iter() + .map(|s| states.get(s).map(|&ix| (*s, ix))) + .collect::<Option<BTreeMap<u64, _>>>() + .ok_or(NFAParseError::error( + "Unable to get the graph index of a final state!", + ))?; + + Ok(Self::new( + graph, + (nfa.initial_state, initial_state_id), + final_states, + )) } } diff --git a/src/lib/regex.rs b/src/lib/regex.rs @@ -264,12 +264,14 @@ impl Regex { } } +/// Tokenisation impl Encodable<&str> for Regex { fn parse(regex_str: &str) -> Result<Self, Box<dyn Error>> { Self::parse(Self::tokenise(regex_str)?) } } +/// Parsing to an AST impl Encodable<Vec<Either<char, RegexToken>>> for Regex { fn parse(tokens: Vec<Either<char, RegexToken>>) -> Result<Self, Box<dyn Error>> { let parse_tree = Regex::make_grammar_tree(tokens)?; @@ -277,7 +279,9 @@ impl Encodable<Vec<Either<char, RegexToken>>> for Regex { } } +/// Various fns for converting from the AST to an ENFA graph impl GraphENFA { + /// Adds a nonterminal onto the DFA directly after `from` fn add_non_terminal( &mut self, nonterminal: RegexNonTerminal, @@ -292,6 +296,7 @@ impl GraphENFA { } } + /// Concats a terminal onto the DFA directly after `from` fn add_terminal(&mut self, terminal: RegexTerminal, from: u64) -> Result<u64, GraphENFAError> { let new_state = self.new_state()?; @@ -352,6 +357,7 @@ impl GraphENFA { } } +/// Convert AST to a graph ENFA, then convert that ENFA to a DFA. impl Encodable<RegexNonTerminal> for Regex { fn parse(parse_tree: RegexNonTerminal) -> Result<Self, Box<dyn Error>> { let (mut enfa, current_state) = GraphENFA::new(); diff --git a/src/lib/tests/regex_tests.rs b/src/lib/tests/regex_tests.rs @@ -18,6 +18,12 @@ fn tokeniser() { const PARSE_FAIL_STR: &str = "Unable to parse regex!"; +/// Tests if given a regex that is a concatenation of a bunch of random characters, the regex will recognise itself. +/// The direct correlation between regex source string and accepted input breaks down when you consider escape sequences, but the concept still holds. +/// +/// E.g: +/// - The regex `abdfa` should accept the string `"abdfa"`. +/// - The regex `a\\cd` should accept the string `"a\cd"`. #[test] fn accepts_long_concats() { let alphabet = [ @@ -46,6 +52,10 @@ fn accepts_long_concats() { } } +/// Tests a bunch of different wildcard scenarios and edge cases. +/// +/// E.g: +/// - The regex `a.*a` should recognise all strings starting and ending with `'a'`. #[test] fn wildcard_tests() { let alphabet = [ @@ -72,3 +82,114 @@ fn wildcard_tests() { assert!(regex.accepts(input_str.as_str())) } } + +/// Tests Kleene groups, both the Kleene star and the "Kleene plus" sugar. +/// +/// E.g: +/// - The regex `a*` should recognise zero or more `'a'`'s. +/// - The regex `a+` should recognise one or more `'a'`'s. +/// - The regex `(a*)*` should recognise zero or more groups of zero or more `'a'`'s. +/// - The regex `(a+)*` should recognise zero or more groups of one or more `'a'`'s. +/// - The regex `(a+)+` should recognise one or more groups of one or more `'a'`'s. +/// - The regex `a+b` should recognise one or more `'a'`'s all followed by a `'b'`. +/// - The regex `(a+b)+` should recognise one or more groups of one or more `'a'`'s followed by a `'b'`. +#[test] +fn kleene_tests() { + let alphabet = [ + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", + "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", " ", + ]; + let mut rng = rand::thread_rng(); + + let regex = Regex::parse("a*").expect(PARSE_FAIL_STR); + for len in 0..1000 { + let s: String = (0..len).map(|_| 'a').collect(); + assert!(regex.accepts(s.as_str())); + } + + let regex = Regex::parse("a+").expect(PARSE_FAIL_STR); + assert!(!regex.accepts("")); + for len in 1..1000 { + let s: String = (0..len).map(|_| 'a').collect(); + assert!(regex.accepts(s.as_str())); + } + + let regex = Regex::parse("(a*)*").expect(PARSE_FAIL_STR); + for n_substrings in 0..100 { + let s: String = (0..n_substrings) + .map(|_| (0..rng.gen_range(1..100)).map(|_| 'a').collect::<String>()) + .collect(); + assert!(regex.accepts(s.as_str())); + } + + let regex = Regex::parse("(a+)*").expect(PARSE_FAIL_STR); + for n_substrings in 0..100 { + let s: String = (0..n_substrings) + .map(|_| (1..rng.gen_range(1..100)).map(|_| 'a').collect::<String>()) + .collect(); + assert!(regex.accepts(s.as_str())); + } + + let regex = Regex::parse("(a+)+").expect(PARSE_FAIL_STR); + assert!(!regex.accepts("")); + for n_substrings in 1..100 { + let s: String = (0..n_substrings) + .map(|_| (1..rng.gen_range(1..100)).map(|_| 'a').collect::<String>()) + .collect(); + assert!(regex.accepts(s.as_str())); + } + + let regex = Regex::parse("a+b").expect(PARSE_FAIL_STR); + // Should accept any number of `a`'s followed by a `b` + for len in 2..1000 { + let s: String = (0..len) + .map(|i| if i != len - 1 { 'a' } else { 'b' }) + .collect(); + assert!(regex.accepts(s.as_str())); + } + // Should not accept multiple `b`'s + for len in 3..1000 { + let n_bs = rng.gen_range(2..len); + let s: String = (0..len) + .map(|i| if i < len - n_bs { 'a' } else { 'b' }) + .collect(); + assert!(!regex.accepts(s.as_str())); + } + // Should not accept just `b` + assert!(!regex.accepts("b")); + // Should not accept just `a`'s + for len in 1..1000 { + let s: String = (0..len).map(|_| 'a').collect(); + assert!(!regex.accepts(s.as_str())); + } + // Should not accept any other char in the middle of the `a`'s + for len in 3..1000 { + let rand_index = rng.gen_range(1..len - 1); + let s: String = (0..len) + .map(|i| { + if i == rand_index { + alphabet[rng.gen_range(2..alphabet.len())] // Can't be 'a' or 'b' + } else if i != len - 1 { + "a" + } else { + "b" + } + }) + .collect(); + assert!(!regex.accepts(s.as_str())); + } + + let regex = Regex::parse("(a+b)+").expect(PARSE_FAIL_STR); + for n_substrings in 1..100 { + let s: String = (0..n_substrings) + .map(|_| { + let substr_len = rng.gen_range(2..100); + (0..substr_len) + .map(|i| if i != substr_len - 1 { 'a' } else { 'b' }) + .collect::<String>() + }) + .collect(); + assert!(regex.accepts(s.as_str())); + } +}