automaton

An automaton library written in Rust
Log | Files | Refs

commit eb7bc2b8a0295c0b1ac45d284bef58bf990e6995
parent 707d9a5186561bf10c2ca5b5b4f76494cdd992bd
Author: Ethan Long <ethandavidlong@gmail.com>
Date:   Fri, 23 Feb 2024 03:06:52 +1100

Fixed a bug in the NFA->DFA conversion

The NFA->DFA conversion did not take into account the deterministic
nature of the DFA: it takes only one path. This means that in the set
construction, if a wildcard transition exists, then every other transition
must be unioned with that wildcard transition so that all paths are traversed.

Diffstat:
Msrc/bin/regex.rs | 1+
Msrc/lib/enfa.rs | 2--
Msrc/lib/graph_enfa.rs | 3+--
Msrc/lib/nfa.rs | 134++++++++++++++-----------------------------------------------------------------
Msrc/lib/regex.rs | 40----------------------------------------
Msrc/lib/tests/enfa_tests.rs | 2+-
Msrc/lib/tests/regex_tests.rs | 85+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
7 files changed, 90 insertions(+), 177 deletions(-)

diff --git a/src/bin/regex.rs b/src/bin/regex.rs @@ -40,6 +40,7 @@ fn main() -> Result<(), Box<dyn Error>> { let regex = Regex::parse(regex_str.as_str())?; println!("Regex: {}", regex_str); + println!("DFA: {}", regex.automaton); println!("Running on the following input:\n{input_string}"); println!("Accepts: {}", regex.accepts(&input_string)); diff --git a/src/lib/enfa.rs b/src/lib/enfa.rs @@ -135,7 +135,6 @@ impl ENFA { } } - // FIXME: There are states with no entrypoint here! Prune time pub fn as_nfa(&self) -> NFA { let nfa_final_states = self .states @@ -269,7 +268,6 @@ impl Automaton for ENFA { impl FiniteStateAutomaton for ENFA { fn as_dfa(&self) -> Result<DFA, Box<dyn std::error::Error>> { let nfa = self.as_nfa(); - eprintln!("HOLY FUCKERONI IS THE NFA RIGHT?\n{}", nfa); nfa.as_dfa() } diff --git a/src/lib/graph_enfa.rs b/src/lib/graph_enfa.rs @@ -3,7 +3,7 @@ use crate::{ enfa::{self, ENFA}, to_graphviz_dot, Automaton, FiniteStateAutomaton, }; -use petgraph::{dot::Dot, graph::NodeIndex, visit::EdgeRef, Directed, Graph}; +use petgraph::{graph::NodeIndex, visit::EdgeRef, Directed, Graph}; use std::{ collections::{HashMap, HashSet}, error::Error, @@ -342,7 +342,6 @@ impl FiniteStateAutomaton for GraphENFA { } impl Display for GraphENFA { - /// This should only be used for debugging, the graph is not pretty printed, but the output can be used in graphviz to evaluate the ε-NFA. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let mut alphabet_vec: Vec<char> = self.alphabet.iter().copied().collect(); let mut states_vec: Vec<u64> = self.states.iter().copied().collect(); diff --git a/src/lib/nfa.rs b/src/lib/nfa.rs @@ -5,7 +5,7 @@ use crate::{ use core::fmt::Display; use petgraph::{Directed, Graph}; use std::{ - collections::{BTreeSet, HashMap, HashSet, VecDeque}, + collections::{BTreeSet, HashMap, HashSet}, error::Error, fmt, panic::Location, @@ -58,6 +58,7 @@ pub enum Transition { Wildcard, } +// TODO: From doesn't seem to be 2-way... 
Maybe I'm using it wrong? impl From<dfa::Transition> for Transition { fn from(transition: dfa::Transition) -> Self { match transition { @@ -67,7 +68,6 @@ impl From<dfa::Transition> for Transition { } } -// FIXME: WTF IS WRONG WITH FROM AND INTO??? impl From<Transition> for dfa::Transition { fn from(transition: Transition) -> Self { match transition { @@ -147,113 +147,6 @@ impl Automaton for NFA { } impl FiniteStateAutomaton for NFA { - // FIXME: This is broken, very broken. - // I think the bug lies in the fact that the "empty set state"/trap state should be overridden by the wildcard transition (which can be the trap state, so basically always set it to the wildcard transition). - /* - fn as_dfa(&self) -> Result<DFA, Box<dyn Error>> { - let mut states = HashSet::new(); - let initial_dfa_state = 0; - states.insert(initial_dfa_state); - let mut newest_dfa_state = initial_dfa_state; - let mut final_states = HashSet::new(); - let mut transition_table = HashMap::new(); - - let mut dfa_state_to_nfa_set: HashMap<u64, BTreeSet<u64>> = HashMap::new(); - let mut nfa_set_to_dfa_state: HashMap<BTreeSet<u64>, u64> = HashMap::new(); - - nfa_set_to_dfa_state.insert(BTreeSet::from([self.initial_state]), initial_dfa_state); - - let mut possible_inputs: Vec<dfa::Transition> = self - .alphabet - .iter() - .map(|&c| dfa::Transition::Char(c)) - .collect(); - possible_inputs.push(dfa::Transition::Wildcard); - - let wildcard_transitions: HashMap<u64, Vec<u64>> = self - .states - .iter() - .filter_map(|&s| { - self.transition_table - .get(&(s, Transition::Wildcard)) - .map(|dests| (s, dests.to_owned())) - }) - .collect(); - - for &input in self.alphabet.iter() { - let mut nfa_sets_to_be_processed: VecDeque<BTreeSet<u64>> = VecDeque::new(); - let mut processing_nfa_states = BTreeSet::from([self.initial_state]); - let mut nfa_sets_processed: BTreeSet<BTreeSet<u64>> = BTreeSet::new(); - while processing_nfa_states.len() != 0 { - if !nfa_sets_processed.contains(&processing_nfa_states) { - 
for nfa_dests in processing_nfa_states - .iter() - .map(|&nfa_state| { - self.transition_table - .get(&(nfa_state, Transition::Char(input))) - .map_or_else( - || { - wildcard_transitions - .get(&nfa_state) - .map_or_else(|| vec![], Vec::to_owned) - }, - Vec::to_owned, - ) - }) - // This conversion to BTreeSet is shit - .map(Vec::into_iter) - .map(BTreeSet::from_iter) - { - let dfa_from = nfa_set_to_dfa_state - .get(&processing_nfa_states) - .ok_or(NFAParseError::error( - "Unable to get something that should have already been processed!", - ))? - .to_owned(); - let dfa_to = nfa_set_to_dfa_state.clone().get(&nfa_dests).map_or_else( - || { - // New subset! - newest_dfa_state += 1; - states.insert(newest_dfa_state); - dfa_state_to_nfa_set.insert(newest_dfa_state, nfa_dests.clone()); - nfa_set_to_dfa_state.insert(nfa_dests.clone(), newest_dfa_state); - - if nfa_dests.iter().any(|s| self.final_states.contains(s)) { - // This is a final state! - final_states.insert(newest_dfa_state); - } else if nfa_dests.len() == 0 { - // This is the trap state! 
- for &trans in possible_inputs.iter() { - transition_table - .insert((newest_dfa_state, trans), newest_dfa_state); - } - } - newest_dfa_state - }, - u64::to_owned, - ); - - nfa_sets_to_be_processed.push_back(nfa_dests); - - transition_table.insert((dfa_from, dfa::Transition::Char(input)), dfa_to); - } - } - nfa_sets_processed.insert(processing_nfa_states); - processing_nfa_states = nfa_sets_to_be_processed - .pop_front() - .unwrap_or(BTreeSet::new()); - } - } - - Ok(DFA { - alphabet: self.alphabet.clone(), - states, - initial_state: initial_dfa_state, - final_states, - transition_table, - }) - } - */ fn as_dfa(&self) -> Result<dfa::DFA, Box<dyn Error>> { let mut states = HashSet::new(); let initial_state = 0; @@ -296,9 +189,28 @@ impl FiniteStateAutomaton for NFA { for &transition in nfa_alphabet.iter() { let nfa_dests: BTreeSet<u64> = current_nfa_set .iter() - .filter_map(|&s| self.transition_table.get(&(s, transition))) + .filter_map(|&s| { + // We need to take into account the wildcard transition dests and take the union of them with our transition dests (determinism) + let wildcard_transitions = + self.transition_table.get(&(s, Transition::Wildcard)); + if transition == Transition::Wildcard { + wildcard_transitions.map(Vec::to_owned) + } else { + match wildcard_transitions { + Some(wcs) => { + let mut transitions = + self.transition_table.get(&(s, transition))?.to_owned(); + transitions.extend(wcs); + Some(transitions) + } + None => self + .transition_table + .get(&(s, transition)) + .map(Vec::to_owned), + } + } + }) .flatten() - .map(u64::to_owned) .collect(); to_be_processed.insert(nfa_dests.clone()); dfa_transition_to_nfa_sets.insert( diff --git a/src/lib/regex.rs b/src/lib/regex.rs @@ -132,7 +132,6 @@ impl Regex { "Unable to get last group".to_owned(), ))? .push(RegexNonTerminal::Terminal(RegexTerminal::Character(c))), - // FIXME: Wildcards are borked! 
Either::Right(RegexToken::Wildcard) => groups .last_mut() .ok_or(RegexError::ParseError( @@ -140,7 +139,6 @@ impl Regex { ))? .push(RegexNonTerminal::Terminal(RegexTerminal::Wildcard)), - // FIXME: Parens are buggy! Either::Right(RegexToken::LeftGroup) => { groups.push(vec![]); } @@ -172,7 +170,6 @@ impl Regex { } Either::Right(RegexToken::Bar) => { - // FIXME: Unions are borked! in_a_union = true; let exited_group = groups.pop().ok_or(RegexError::ParseError( "Unable to pop a group off".to_owned(), @@ -204,7 +201,6 @@ impl Regex { } Either::Right(RegexToken::KleeneStar) => { - // FIXME: KleeneStars are borked! let last_terminal = groups .last_mut() .ok_or(RegexError::ParseError( @@ -274,12 +270,6 @@ impl Encodable<&str> for Regex { } } -// Found a regular expression LL(1) grammar that I can hopefully handroll a parser for! -// https://blog.robertelder.org/regular-expression-parser-grammar/ -// ^^ Probably won't use that... - -// FIXME: I realise now that a DFA with an alphabet of all possible UTF-8 strings is not going to happen, we're better off making a new kind of DFA that has a looser alphabet requirement. 
- impl Encodable<Vec<Either<char, RegexToken>>> for Regex { fn parse(tokens: Vec<Either<char, RegexToken>>) -> Result<Self, Box<dyn Error>> { let parse_tree = Regex::make_grammar_tree(tokens)?; @@ -287,28 +277,6 @@ impl Encodable<Vec<Either<char, RegexToken>>> for Regex { } } -struct Stack<T> { - stack: Vec<T>, -} - -impl<T> Stack<T> { - fn new() -> Self { - Stack { stack: vec![] } - } - fn peek_mut(&mut self) -> Option<&mut T> { - self.stack.last_mut() - } - fn peek(&self) -> Option<&T> { - self.stack.last() - } - fn push(&mut self, item: T) { - self.stack.push(item) - } - fn pop(&mut self) -> Option<T> { - self.stack.pop() - } -} - impl GraphENFA { fn add_non_terminal( &mut self, @@ -390,16 +358,8 @@ impl Encodable<RegexNonTerminal> for Regex { let final_state = enfa.add_non_terminal(parse_tree.clone(), current_state)?; enfa.set_state_acceptance(final_state, true)?; - eprintln!("Graph ENFA:\n{}", enfa); - - let test_enfa = enfa.as_enfa()?; - - eprintln!("Regular ENFA:\n{}", test_enfa); - let dfa = enfa.as_dfa()?; - eprintln!("DFA:\n{}", dfa); - Ok(Regex { automaton: dfa, parsed: parse_tree, diff --git a/src/lib/tests/enfa_tests.rs b/src/lib/tests/enfa_tests.rs @@ -74,7 +74,7 @@ fn dfa_conversion() { let enfa = ENFA::parse(enfa_enc.as_str()).expect("Unable to parse NFA!"); let as_dfa = enfa.as_dfa().expect("Error converting NFA to DFA!"); - for i in 1..2000 { + for i in 1..500 { let input = rand_string(i, enfa.alphabet.iter().map(char::to_owned).collect()); assert_eq!(enfa.accepts(&input), as_dfa.accepts(&input)); } diff --git a/src/lib/tests/regex_tests.rs b/src/lib/tests/regex_tests.rs @@ -1,31 +1,74 @@ -use either::Either; - -use crate::{ - regex::{Regex, RegexToken}, - Encodable, -}; +use crate::{regex::Regex, Automaton, Encodable}; +use rand::Rng; #[test] fn tokeniser() { + // Not realy sure how I'm going to test this + /* assert_eq!( Regex::tokenise("a*").unwrap(), - vec![Either::Left('a'), Either::Right(RegexToken::KleeneStar)] - ); - assert_eq!( - 
Regex::tokenise("\\*\\\\").unwrap(), - vec![Either::Left('*'), Either::Left('\\')] - ); - assert_eq!( - Regex::tokenise("\\*\\\\*").unwrap(), - vec![ - Either::Left('*'), - Either::Left('\\'), - Either::Right(RegexToken::KleeneStar) - ] + RegexNonTerminal::KleeneGroup(vec![RegexNonTerminal::Terminal(RegexTerminal::Character( + 'a' + ))]) ); + assert_eq!(Regex::tokenise("\\*\\\\").unwrap(), vec![]); + assert_eq!(Regex::tokenise("\\*\\\\*").unwrap(), vec![]); + */ } +const PARSE_FAIL_STR: &str = "Unable to parse regex!"; + #[test] -fn parser() { - Regex::parse(".").expect("Fuck"); +fn accepts_long_concats() { + let alphabet = [ + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", + "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "\\.", + "\\+", "\\*", "\\?", "\\(", "\\)", "\\\\", + ]; + + let mut rng = rand::thread_rng(); + for len in 1..500 { + let regex_str: String = (0..len) + .map(|_| alphabet[rng.gen_range(0..alphabet.len())]) + .collect(); + let mut escaped = false; + let input_str = regex_str + .chars() + .filter(|&c| { + let ret = escaped || c != '\\'; + escaped = !ret; + ret + }) + .collect::<String>(); + let regex = Regex::parse(regex_str.as_str()).expect(PARSE_FAIL_STR); + assert!(regex.accepts(input_str.as_str())); + } +} + +#[test] +fn wildcard_tests() { + let alphabet = [ + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", + "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", " ", + ]; + + let mut rng = rand::thread_rng(); + + let regex = Regex::parse("a.*a").expect(PARSE_FAIL_STR); + + for len in 2..1000 { + let input_str: String = (0..len) + .enumerate() + .map(|(i, _)| { + if i != 0 && i != len - 1 { + 
alphabet[rng.gen_range(0..alphabet.len())] + } else { + "a" + } + }) + .collect(); + assert!(regex.accepts(input_str.as_str())) + } }