automaton

An automaton library written in Rust
Log | Files | Refs

commit eb7bc2b8a0295c0b1ac45d284bef58bf990e6995
parent 707d9a5186561bf10c2ca5b5b4f76494cdd992bd
Author: Ethan Long <ethandavidlong@gmail.com>
Date:   Fri, 23 Feb 2024 03:06:52 +1100

Fixed a bug in the NFA->DFA conversion

The NFA->DFA conversion did not take into account the deterministic
nature of the DFA: it takes only one path. This means that in the set
construction, if a wildcard transition exists, then every other transition
must be unioned with that wildcard transition so that all paths are traversed.

Diffstat:
Msrc/bin/regex.rs | 1+
Msrc/lib/enfa.rs | 2--
Msrc/lib/graph_enfa.rs | 3+--
Msrc/lib/nfa.rs | 134++++++++++++++-----------------------------------------------------------------
Msrc/lib/regex.rs | 40----------------------------------------
Msrc/lib/tests/enfa_tests.rs | 2+-
Msrc/lib/tests/regex_tests.rs | 85+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
7 files changed, 90 insertions(+), 177 deletions(-)

diff --git a/src/bin/regex.rs b/src/bin/regex.rs @@ -40,6 +40,7 @@ fn main() -> Result<(), Box<dyn Error>> { let regex = Regex::parse(regex_str.as_str())?; println!("Regex: {}", regex_str); + println!("DFA: {}", regex.automaton); println!("Running on the following input:\n{input_string}"); println!("Accepts: {}", regex.accepts(&input_string)); diff --git a/src/lib/enfa.rs b/src/lib/enfa.rs @@ -135,7 +135,6 @@ impl ENFA { } } - // FIXME: There are states with no entrypoint here! Prune time pub fn as_nfa(&self) -> NFA { let nfa_final_states = self .states @@ -269,7 +268,6 @@ impl Automaton for ENFA { impl FiniteStateAutomaton for ENFA { fn as_dfa(&self) -> Result<DFA, Box<dyn std::error::Error>> { let nfa = self.as_nfa(); - eprintln!("HOLY FUCKERONI IS THE NFA RIGHT?\n{}", nfa); nfa.as_dfa() } diff --git a/src/lib/graph_enfa.rs b/src/lib/graph_enfa.rs @@ -3,7 +3,7 @@ use crate::{ enfa::{self, ENFA}, to_graphviz_dot, Automaton, FiniteStateAutomaton, }; -use petgraph::{dot::Dot, graph::NodeIndex, visit::EdgeRef, Directed, Graph}; +use petgraph::{graph::NodeIndex, visit::EdgeRef, Directed, Graph}; use std::{ collections::{HashMap, HashSet}, error::Error, @@ -342,7 +342,6 @@ impl FiniteStateAutomaton for GraphENFA { } impl Display for GraphENFA { - /// This should only be used for debugging, the graph is not pretty printed, but the output can be used in graphviz to evaluate the ε-NFA. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let mut alphabet_vec: Vec<char> = self.alphabet.iter().copied().collect(); let mut states_vec: Vec<u64> = self.states.iter().copied().collect(); diff --git a/src/lib/nfa.rs b/src/lib/nfa.rs @@ -5,7 +5,7 @@ use crate::{ use core::fmt::Display; use petgraph::{Directed, Graph}; use std::{ - collections::{BTreeSet, HashMap, HashSet, VecDeque}, + collections::{BTreeSet, HashMap, HashSet}, error::Error, fmt, panic::Location, @@ -58,6 +58,7 @@ pub enum Transition { Wildcard, } +// TODO: From doesn't seem to be 2-way... 
Maybe I'm using it wrong? impl From<dfa::Transition> for Transition { fn from(transition: dfa::Transition) -> Self { match transition { @@ -67,7 +68,6 @@ impl From<dfa::Transition> for Transition { } } -// FIXME: WTF IS WRONG WITH FROM AND INTO??? impl From<Transition> for dfa::Transition { fn from(transition: Transition) -> Self { match transition { @@ -147,113 +147,6 @@ impl Automaton for NFA { } impl FiniteStateAutomaton for NFA { - // FIXME: This is broken, very broken. - // I think the bug lies in the fact that the "empty set state"/trap state should be overridden by the wildcard transition (which can be the trap state, so basically always set it to the wildcard transition). - /* - fn as_dfa(&self) -> Result<DFA, Box<dyn Error>> { - let mut states = HashSet::new(); - let initial_dfa_state = 0; - states.insert(initial_dfa_state); - let mut newest_dfa_state = initial_dfa_state; - let mut final_states = HashSet::new(); - let mut transition_table = HashMap::new(); - - let mut dfa_state_to_nfa_set: HashMap<u64, BTreeSet<u64>> = HashMap::new(); - let mut nfa_set_to_dfa_state: HashMap<BTreeSet<u64>, u64> = HashMap::new(); - - nfa_set_to_dfa_state.insert(BTreeSet::from([self.initial_state]), initial_dfa_state); - - let mut possible_inputs: Vec<dfa::Transition> = self - .alphabet - .iter() - .map(|&c| dfa::Transition::Char(c)) - .collect(); - possible_inputs.push(dfa::Transition::Wildcard); - - let wildcard_transitions: HashMap<u64, Vec<u64>> = self - .states - .iter() - .filter_map(|&s| { - self.transition_table - .get(&(s, Transition::Wildcard)) - .map(|dests| (s, dests.to_owned())) - }) - .collect(); - - for &input in self.alphabet.iter() { - let mut nfa_sets_to_be_processed: VecDeque<BTreeSet<u64>> = VecDeque::new(); - let mut processing_nfa_states = BTreeSet::from([self.initial_state]); - let mut nfa_sets_processed: BTreeSet<BTreeSet<u64>> = BTreeSet::new(); - while processing_nfa_states.len() != 0 { - if !nfa_sets_processed.contains(&processing_nfa_states) { - 
for nfa_dests in processing_nfa_states - .iter() - .map(|&nfa_state| { - self.transition_table - .get(&(nfa_state, Transition::Char(input))) - .map_or_else( - || { - wildcard_transitions - .get(&nfa_state) - .map_or_else(|| vec![], Vec::to_owned) - }, - Vec::to_owned, - ) - }) - // This conversion to BTreeSet is shit - .map(Vec::into_iter) - .map(BTreeSet::from_iter) - { - let dfa_from = nfa_set_to_dfa_state - .get(&processing_nfa_states) - .ok_or(NFAParseError::error( - "Unable to get something that should have already been processed!", - ))? - .to_owned(); - let dfa_to = nfa_set_to_dfa_state.clone().get(&nfa_dests).map_or_else( - || { - // New subset! - newest_dfa_state += 1; - states.insert(newest_dfa_state); - dfa_state_to_nfa_set.insert(newest_dfa_state, nfa_dests.clone()); - nfa_set_to_dfa_state.insert(nfa_dests.clone(), newest_dfa_state); - - if nfa_dests.iter().any(|s| self.final_states.contains(s)) { - // This is a final state! - final_states.insert(newest_dfa_state); - } else if nfa_dests.len() == 0 { - // This is the trap state! 
- for &trans in possible_inputs.iter() { - transition_table - .insert((newest_dfa_state, trans), newest_dfa_state); - } - } - newest_dfa_state - }, - u64::to_owned, - ); - - nfa_sets_to_be_processed.push_back(nfa_dests); - - transition_table.insert((dfa_from, dfa::Transition::Char(input)), dfa_to); - } - } - nfa_sets_processed.insert(processing_nfa_states); - processing_nfa_states = nfa_sets_to_be_processed - .pop_front() - .unwrap_or(BTreeSet::new()); - } - } - - Ok(DFA { - alphabet: self.alphabet.clone(), - states, - initial_state: initial_dfa_state, - final_states, - transition_table, - }) - } - */ fn as_dfa(&self) -> Result<dfa::DFA, Box<dyn Error>> { let mut states = HashSet::new(); let initial_state = 0; @@ -296,9 +189,28 @@ impl FiniteStateAutomaton for NFA { for &transition in nfa_alphabet.iter() { let nfa_dests: BTreeSet<u64> = current_nfa_set .iter() - .filter_map(|&s| self.transition_table.get(&(s, transition))) + .filter_map(|&s| { + // We need to take into account the wildcard transition dests and take the union of them with our transition dests (determinism) + let wildcard_transitions = + self.transition_table.get(&(s, Transition::Wildcard)); + if transition == Transition::Wildcard { + wildcard_transitions.map(Vec::to_owned) + } else { + match wildcard_transitions { + Some(wcs) => { + let mut transitions = + self.transition_table.get(&(s, transition))?.to_owned(); + transitions.extend(wcs); + Some(transitions) + } + None => self + .transition_table + .get(&(s, transition)) + .map(Vec::to_owned), + } + } + }) .flatten() - .map(u64::to_owned) .collect(); to_be_processed.insert(nfa_dests.clone()); dfa_transition_to_nfa_sets.insert( diff --git a/src/lib/regex.rs b/src/lib/regex.rs @@ -132,7 +132,6 @@ impl Regex { "Unable to get last group".to_owned(), ))? .push(RegexNonTerminal::Terminal(RegexTerminal::Character(c))), - // FIXME: Wildcards are borked! 
Either::Right(RegexToken::Wildcard) => groups .last_mut() .ok_or(RegexError::ParseError( @@ -140,7 +139,6 @@ impl Regex { ))? .push(RegexNonTerminal::Terminal(RegexTerminal::Wildcard)), - // FIXME: Parens are buggy! Either::Right(RegexToken::LeftGroup) => { groups.push(vec![]); } @@ -172,7 +170,6 @@ impl Regex { } Either::Right(RegexToken::Bar) => { - // FIXME: Unions are borked! in_a_union = true; let exited_group = groups.pop().ok_or(RegexError::ParseError( "Unable to pop a group off".to_owned(), @@ -204,7 +201,6 @@ impl Regex { } Either::Right(RegexToken::KleeneStar) => { - // FIXME: KleeneStars are borked! let last_terminal = groups .last_mut() .ok_or(RegexError::ParseError( @@ -274,12 +270,6 @@ impl Encodable<&str> for Regex { } } -// Found a regular expression LL(1) grammar that I can hopefully handroll a parser for! -// https://blog.robertelder.org/regular-expression-parser-grammar/ -// ^^ Probably won't use that... - -// FIXME: I realise now that a DFA with an alphabet of all possible UTF-8 strings is not going to happen, we're better off making a new kind of DFA that has a looser alphabet requirement. 
- impl Encodable<Vec<Either<char, RegexToken>>> for Regex { fn parse(tokens: Vec<Either<char, RegexToken>>) -> Result<Self, Box<dyn Error>> { let parse_tree = Regex::make_grammar_tree(tokens)?; @@ -287,28 +277,6 @@ impl Encodable<Vec<Either<char, RegexToken>>> for Regex { } } -struct Stack<T> { - stack: Vec<T>, -} - -impl<T> Stack<T> { - fn new() -> Self { - Stack { stack: vec![] } - } - fn peek_mut(&mut self) -> Option<&mut T> { - self.stack.last_mut() - } - fn peek(&self) -> Option<&T> { - self.stack.last() - } - fn push(&mut self, item: T) { - self.stack.push(item) - } - fn pop(&mut self) -> Option<T> { - self.stack.pop() - } -} - impl GraphENFA { fn add_non_terminal( &mut self, @@ -390,16 +358,8 @@ impl Encodable<RegexNonTerminal> for Regex { let final_state = enfa.add_non_terminal(parse_tree.clone(), current_state)?; enfa.set_state_acceptance(final_state, true)?; - eprintln!("Graph ENFA:\n{}", enfa); - - let test_enfa = enfa.as_enfa()?; - - eprintln!("Regular ENFA:\n{}", test_enfa); - let dfa = enfa.as_dfa()?; - eprintln!("DFA:\n{}", dfa); - Ok(Regex { automaton: dfa, parsed: parse_tree, diff --git a/src/lib/tests/enfa_tests.rs b/src/lib/tests/enfa_tests.rs @@ -74,7 +74,7 @@ fn dfa_conversion() { let enfa = ENFA::parse(enfa_enc.as_str()).expect("Unable to parse NFA!"); let as_dfa = enfa.as_dfa().expect("Error converting NFA to DFA!"); - for i in 1..2000 { + for i in 1..500 { let input = rand_string(i, enfa.alphabet.iter().map(char::to_owned).collect()); assert_eq!(enfa.accepts(&input), as_dfa.accepts(&input)); } diff --git a/src/lib/tests/regex_tests.rs b/src/lib/tests/regex_tests.rs @@ -1,31 +1,74 @@ -use either::Either; - -use crate::{ - regex::{Regex, RegexToken}, - Encodable, -}; +use crate::{regex::Regex, Automaton, Encodable}; +use rand::Rng; #[test] fn tokeniser() { + // Not realy sure how I'm going to test this + /* assert_eq!( Regex::tokenise("a*").unwrap(), - vec![Either::Left('a'), Either::Right(RegexToken::KleeneStar)] - ); - assert_eq!( - 
Regex::tokenise("\\*\\\\").unwrap(), - vec![Either::Left('*'), Either::Left('\\')] - ); - assert_eq!( - Regex::tokenise("\\*\\\\*").unwrap(), - vec![ - Either::Left('*'), - Either::Left('\\'), - Either::Right(RegexToken::KleeneStar) - ] + RegexNonTerminal::KleeneGroup(vec![RegexNonTerminal::Terminal(RegexTerminal::Character( + 'a' + ))]) ); + assert_eq!(Regex::tokenise("\\*\\\\").unwrap(), vec![]); + assert_eq!(Regex::tokenise("\\*\\\\*").unwrap(), vec![]); + */ } +const PARSE_FAIL_STR: &str = "Unable to parse regex!"; + #[test] -fn parser() { - Regex::parse(".").expect("Fuck"); +fn accepts_long_concats() { + let alphabet = [ + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", + "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "\\.", + "\\+", "\\*", "\\?", "\\(", "\\)", "\\\\", + ]; + + let mut rng = rand::thread_rng(); + for len in 1..500 { + let regex_str: String = (0..len) + .map(|_| alphabet[rng.gen_range(0..alphabet.len())]) + .collect(); + let mut escaped = false; + let input_str = regex_str + .chars() + .filter(|&c| { + let ret = escaped || c != '\\'; + escaped = !ret; + ret + }) + .collect::<String>(); + let regex = Regex::parse(regex_str.as_str()).expect(PARSE_FAIL_STR); + assert!(regex.accepts(input_str.as_str())); + } +} + +#[test] +fn wildcard_tests() { + let alphabet = [ + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", + "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", " ", + ]; + + let mut rng = rand::thread_rng(); + + let regex = Regex::parse("a.*a").expect(PARSE_FAIL_STR); + + for len in 2..1000 { + let input_str: String = (0..len) + .enumerate() + .map(|(i, _)| { + if i != 0 && i != len - 1 { + 
alphabet[rng.gen_range(0..alphabet.len())] + } else { + "a" + } + }) + .collect(); + assert!(regex.accepts(input_str.as_str())) + } }