commit eb7bc2b8a0295c0b1ac45d284bef58bf990e6995
parent 707d9a5186561bf10c2ca5b5b4f76494cdd992bd
Author: Ethan Long <ethandavidlong@gmail.com>
Date: Fri, 23 Feb 2024 03:06:52 +1100
Fixed a bug in the NFA->DFA conversion
The NFA->DFA conversion did not take into account the deterministic
nature of the DFA: a DFA takes only one path. This means that in the
subset construction, if a wildcard transition exists, then every other
transition must be unioned with the wildcard transition's destinations
so that all paths are traversed.
Diffstat:
7 files changed, 90 insertions(+), 177 deletions(-)
diff --git a/src/bin/regex.rs b/src/bin/regex.rs
@@ -40,6 +40,7 @@ fn main() -> Result<(), Box<dyn Error>> {
let regex = Regex::parse(regex_str.as_str())?;
println!("Regex: {}", regex_str);
+ println!("DFA: {}", regex.automaton);
println!("Running on the following input:\n{input_string}");
println!("Accepts: {}", regex.accepts(&input_string));
diff --git a/src/lib/enfa.rs b/src/lib/enfa.rs
@@ -135,7 +135,6 @@ impl ENFA {
}
}
- // FIXME: There are states with no entrypoint here! Prune time
pub fn as_nfa(&self) -> NFA {
let nfa_final_states = self
.states
@@ -269,7 +268,6 @@ impl Automaton for ENFA {
impl FiniteStateAutomaton for ENFA {
fn as_dfa(&self) -> Result<DFA, Box<dyn std::error::Error>> {
let nfa = self.as_nfa();
- eprintln!("HOLY FUCKERONI IS THE NFA RIGHT?\n{}", nfa);
nfa.as_dfa()
}
diff --git a/src/lib/graph_enfa.rs b/src/lib/graph_enfa.rs
@@ -3,7 +3,7 @@ use crate::{
enfa::{self, ENFA},
to_graphviz_dot, Automaton, FiniteStateAutomaton,
};
-use petgraph::{dot::Dot, graph::NodeIndex, visit::EdgeRef, Directed, Graph};
+use petgraph::{graph::NodeIndex, visit::EdgeRef, Directed, Graph};
use std::{
collections::{HashMap, HashSet},
error::Error,
@@ -342,7 +342,6 @@ impl FiniteStateAutomaton for GraphENFA {
}
impl Display for GraphENFA {
- /// This should only be used for debugging, the graph is not pretty printed, but the output can be used in graphviz to evaluate the ε-NFA.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut alphabet_vec: Vec<char> = self.alphabet.iter().copied().collect();
let mut states_vec: Vec<u64> = self.states.iter().copied().collect();
diff --git a/src/lib/nfa.rs b/src/lib/nfa.rs
@@ -5,7 +5,7 @@ use crate::{
use core::fmt::Display;
use petgraph::{Directed, Graph};
use std::{
- collections::{BTreeSet, HashMap, HashSet, VecDeque},
+ collections::{BTreeSet, HashMap, HashSet},
error::Error,
fmt,
panic::Location,
@@ -58,6 +58,7 @@ pub enum Transition {
Wildcard,
}
+// TODO: From doesn't seem to be 2-way... Maybe I'm using it wrong?
impl From<dfa::Transition> for Transition {
fn from(transition: dfa::Transition) -> Self {
match transition {
@@ -67,7 +68,6 @@ impl From<dfa::Transition> for Transition {
}
}
-// FIXME: WTF IS WRONG WITH FROM AND INTO???
impl From<Transition> for dfa::Transition {
fn from(transition: Transition) -> Self {
match transition {
@@ -147,113 +147,6 @@ impl Automaton for NFA {
}
impl FiniteStateAutomaton for NFA {
- // FIXME: This is broken, very broken.
- // I think the bug lies in the fact that the "empty set state"/trap state should be overridden by the wildcard transition (which can be the trap state, so basically always set it to the wildcard transition).
- /*
- fn as_dfa(&self) -> Result<DFA, Box<dyn Error>> {
- let mut states = HashSet::new();
- let initial_dfa_state = 0;
- states.insert(initial_dfa_state);
- let mut newest_dfa_state = initial_dfa_state;
- let mut final_states = HashSet::new();
- let mut transition_table = HashMap::new();
-
- let mut dfa_state_to_nfa_set: HashMap<u64, BTreeSet<u64>> = HashMap::new();
- let mut nfa_set_to_dfa_state: HashMap<BTreeSet<u64>, u64> = HashMap::new();
-
- nfa_set_to_dfa_state.insert(BTreeSet::from([self.initial_state]), initial_dfa_state);
-
- let mut possible_inputs: Vec<dfa::Transition> = self
- .alphabet
- .iter()
- .map(|&c| dfa::Transition::Char(c))
- .collect();
- possible_inputs.push(dfa::Transition::Wildcard);
-
- let wildcard_transitions: HashMap<u64, Vec<u64>> = self
- .states
- .iter()
- .filter_map(|&s| {
- self.transition_table
- .get(&(s, Transition::Wildcard))
- .map(|dests| (s, dests.to_owned()))
- })
- .collect();
-
- for &input in self.alphabet.iter() {
- let mut nfa_sets_to_be_processed: VecDeque<BTreeSet<u64>> = VecDeque::new();
- let mut processing_nfa_states = BTreeSet::from([self.initial_state]);
- let mut nfa_sets_processed: BTreeSet<BTreeSet<u64>> = BTreeSet::new();
- while processing_nfa_states.len() != 0 {
- if !nfa_sets_processed.contains(&processing_nfa_states) {
- for nfa_dests in processing_nfa_states
- .iter()
- .map(|&nfa_state| {
- self.transition_table
- .get(&(nfa_state, Transition::Char(input)))
- .map_or_else(
- || {
- wildcard_transitions
- .get(&nfa_state)
- .map_or_else(|| vec![], Vec::to_owned)
- },
- Vec::to_owned,
- )
- })
- // This conversion to BTreeSet is shit
- .map(Vec::into_iter)
- .map(BTreeSet::from_iter)
- {
- let dfa_from = nfa_set_to_dfa_state
- .get(&processing_nfa_states)
- .ok_or(NFAParseError::error(
- "Unable to get something that should have already been processed!",
- ))?
- .to_owned();
- let dfa_to = nfa_set_to_dfa_state.clone().get(&nfa_dests).map_or_else(
- || {
- // New subset!
- newest_dfa_state += 1;
- states.insert(newest_dfa_state);
- dfa_state_to_nfa_set.insert(newest_dfa_state, nfa_dests.clone());
- nfa_set_to_dfa_state.insert(nfa_dests.clone(), newest_dfa_state);
-
- if nfa_dests.iter().any(|s| self.final_states.contains(s)) {
- // This is a final state!
- final_states.insert(newest_dfa_state);
- } else if nfa_dests.len() == 0 {
- // This is the trap state!
- for &trans in possible_inputs.iter() {
- transition_table
- .insert((newest_dfa_state, trans), newest_dfa_state);
- }
- }
- newest_dfa_state
- },
- u64::to_owned,
- );
-
- nfa_sets_to_be_processed.push_back(nfa_dests);
-
- transition_table.insert((dfa_from, dfa::Transition::Char(input)), dfa_to);
- }
- }
- nfa_sets_processed.insert(processing_nfa_states);
- processing_nfa_states = nfa_sets_to_be_processed
- .pop_front()
- .unwrap_or(BTreeSet::new());
- }
- }
-
- Ok(DFA {
- alphabet: self.alphabet.clone(),
- states,
- initial_state: initial_dfa_state,
- final_states,
- transition_table,
- })
- }
- */
fn as_dfa(&self) -> Result<dfa::DFA, Box<dyn Error>> {
let mut states = HashSet::new();
let initial_state = 0;
@@ -296,9 +189,28 @@ impl FiniteStateAutomaton for NFA {
for &transition in nfa_alphabet.iter() {
let nfa_dests: BTreeSet<u64> = current_nfa_set
.iter()
- .filter_map(|&s| self.transition_table.get(&(s, transition)))
+ .filter_map(|&s| {
+ // We need to take into account the wildcard transition dests and take the union of them with our transition dests (determinism)
+ let wildcard_transitions =
+ self.transition_table.get(&(s, Transition::Wildcard));
+ if transition == Transition::Wildcard {
+ wildcard_transitions.map(Vec::to_owned)
+ } else {
+ match wildcard_transitions {
+ Some(wcs) => {
+ let mut transitions =
+ self.transition_table.get(&(s, transition))?.to_owned();
+ transitions.extend(wcs);
+ Some(transitions)
+ }
+ None => self
+ .transition_table
+ .get(&(s, transition))
+ .map(Vec::to_owned),
+ }
+ }
+ })
.flatten()
- .map(u64::to_owned)
.collect();
to_be_processed.insert(nfa_dests.clone());
dfa_transition_to_nfa_sets.insert(
diff --git a/src/lib/regex.rs b/src/lib/regex.rs
@@ -132,7 +132,6 @@ impl Regex {
"Unable to get last group".to_owned(),
))?
.push(RegexNonTerminal::Terminal(RegexTerminal::Character(c))),
- // FIXME: Wildcards are borked!
Either::Right(RegexToken::Wildcard) => groups
.last_mut()
.ok_or(RegexError::ParseError(
@@ -140,7 +139,6 @@ impl Regex {
))?
.push(RegexNonTerminal::Terminal(RegexTerminal::Wildcard)),
- // FIXME: Parens are buggy!
Either::Right(RegexToken::LeftGroup) => {
groups.push(vec![]);
}
@@ -172,7 +170,6 @@ impl Regex {
}
Either::Right(RegexToken::Bar) => {
- // FIXME: Unions are borked!
in_a_union = true;
let exited_group = groups.pop().ok_or(RegexError::ParseError(
"Unable to pop a group off".to_owned(),
@@ -204,7 +201,6 @@ impl Regex {
}
Either::Right(RegexToken::KleeneStar) => {
- // FIXME: KleeneStars are borked!
let last_terminal = groups
.last_mut()
.ok_or(RegexError::ParseError(
@@ -274,12 +270,6 @@ impl Encodable<&str> for Regex {
}
}
-// Found a regular expression LL(1) grammar that I can hopefully handroll a parser for!
-// https://blog.robertelder.org/regular-expression-parser-grammar/
-// ^^ Probably won't use that...
-
-// FIXME: I realise now that a DFA with an alphabet of all possible UTF-8 strings is not going to happen, we're better off making a new kind of DFA that has a looser alphabet requirement.
-
impl Encodable<Vec<Either<char, RegexToken>>> for Regex {
fn parse(tokens: Vec<Either<char, RegexToken>>) -> Result<Self, Box<dyn Error>> {
let parse_tree = Regex::make_grammar_tree(tokens)?;
@@ -287,28 +277,6 @@ impl Encodable<Vec<Either<char, RegexToken>>> for Regex {
}
}
-struct Stack<T> {
- stack: Vec<T>,
-}
-
-impl<T> Stack<T> {
- fn new() -> Self {
- Stack { stack: vec![] }
- }
- fn peek_mut(&mut self) -> Option<&mut T> {
- self.stack.last_mut()
- }
- fn peek(&self) -> Option<&T> {
- self.stack.last()
- }
- fn push(&mut self, item: T) {
- self.stack.push(item)
- }
- fn pop(&mut self) -> Option<T> {
- self.stack.pop()
- }
-}
-
impl GraphENFA {
fn add_non_terminal(
&mut self,
@@ -390,16 +358,8 @@ impl Encodable<RegexNonTerminal> for Regex {
let final_state = enfa.add_non_terminal(parse_tree.clone(), current_state)?;
enfa.set_state_acceptance(final_state, true)?;
- eprintln!("Graph ENFA:\n{}", enfa);
-
- let test_enfa = enfa.as_enfa()?;
-
- eprintln!("Regular ENFA:\n{}", test_enfa);
-
let dfa = enfa.as_dfa()?;
- eprintln!("DFA:\n{}", dfa);
-
Ok(Regex {
automaton: dfa,
parsed: parse_tree,
diff --git a/src/lib/tests/enfa_tests.rs b/src/lib/tests/enfa_tests.rs
@@ -74,7 +74,7 @@ fn dfa_conversion() {
let enfa = ENFA::parse(enfa_enc.as_str()).expect("Unable to parse NFA!");
let as_dfa = enfa.as_dfa().expect("Error converting NFA to DFA!");
- for i in 1..2000 {
+ for i in 1..500 {
let input = rand_string(i, enfa.alphabet.iter().map(char::to_owned).collect());
assert_eq!(enfa.accepts(&input), as_dfa.accepts(&input));
}
diff --git a/src/lib/tests/regex_tests.rs b/src/lib/tests/regex_tests.rs
@@ -1,31 +1,74 @@
-use either::Either;
-
-use crate::{
- regex::{Regex, RegexToken},
- Encodable,
-};
+use crate::{regex::Regex, Automaton, Encodable};
+use rand::Rng;
#[test]
fn tokeniser() {
+ // Not realy sure how I'm going to test this
+ /*
assert_eq!(
Regex::tokenise("a*").unwrap(),
- vec![Either::Left('a'), Either::Right(RegexToken::KleeneStar)]
- );
- assert_eq!(
- Regex::tokenise("\\*\\\\").unwrap(),
- vec![Either::Left('*'), Either::Left('\\')]
- );
- assert_eq!(
- Regex::tokenise("\\*\\\\*").unwrap(),
- vec![
- Either::Left('*'),
- Either::Left('\\'),
- Either::Right(RegexToken::KleeneStar)
- ]
+ RegexNonTerminal::KleeneGroup(vec![RegexNonTerminal::Terminal(RegexTerminal::Character(
+ 'a'
+ ))])
);
+ assert_eq!(Regex::tokenise("\\*\\\\").unwrap(), vec![]);
+ assert_eq!(Regex::tokenise("\\*\\\\*").unwrap(), vec![]);
+ */
}
+const PARSE_FAIL_STR: &str = "Unable to parse regex!";
+
#[test]
-fn parser() {
- Regex::parse(".").expect("Fuck");
+fn accepts_long_concats() {
+ let alphabet = [
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
+ "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
+ "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "\\.",
+ "\\+", "\\*", "\\?", "\\(", "\\)", "\\\\",
+ ];
+
+ let mut rng = rand::thread_rng();
+ for len in 1..500 {
+ let regex_str: String = (0..len)
+ .map(|_| alphabet[rng.gen_range(0..alphabet.len())])
+ .collect();
+ let mut escaped = false;
+ let input_str = regex_str
+ .chars()
+ .filter(|&c| {
+ let ret = escaped || c != '\\';
+ escaped = !ret;
+ ret
+ })
+ .collect::<String>();
+ let regex = Regex::parse(regex_str.as_str()).expect(PARSE_FAIL_STR);
+ assert!(regex.accepts(input_str.as_str()));
+ }
+}
+
+#[test]
+fn wildcard_tests() {
+ let alphabet = [
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
+ "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
+ "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", " ",
+ ];
+
+ let mut rng = rand::thread_rng();
+
+ let regex = Regex::parse("a.*a").expect(PARSE_FAIL_STR);
+
+ for len in 2..1000 {
+ let input_str: String = (0..len)
+ .enumerate()
+ .map(|(i, _)| {
+ if i != 0 && i != len - 1 {
+ alphabet[rng.gen_range(0..alphabet.len())]
+ } else {
+ "a"
+ }
+ })
+ .collect();
+ assert!(regex.accepts(input_str.as_str()))
+ }
}