automaton

An automaton library written in Rust
Log | Files | Refs

commit c069a2bea6e0514c55a12cc86c439f6cd0fa32b0
parent 822ddeb51d020c593aec273138cbbf35842097f0
Author: Ethan Long <ethandavidlong@gmail.com>
Date:   Sun,  4 Feb 2024 23:28:47 +1100

Made some more progress. Things are still buggy AF

I scrapped the notion of even bothering to have a separate field for
"loose transitions", that was stupid. I have instead decided to make
everything use just use the `Transition` type that depends on the
automaton structure.

I know the Regex->Graph ENFA parsing is going good, as the graphviz
visualisations look good. So I just need to clean up the implementations
and get rid of bugs etc.

Diffstat:
Msrc/lib/dfa.rs | 41++++++++++++++++++++++++++---------------
Msrc/lib/enfa.rs | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Msrc/lib/graph_enfa.rs | 83+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Msrc/lib/nfa.rs | 193+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Msrc/lib/regex.rs | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
5 files changed, 384 insertions(+), 219 deletions(-)

diff --git a/src/lib/dfa.rs b/src/lib/dfa.rs @@ -53,21 +53,30 @@ impl DFAParseError { impl Error for DFAParseError {} +#[derive(Hash, Debug, PartialEq, Eq, Clone, Copy)] +pub enum Transition { + Char(char), + Wildcard, +} + #[derive(Clone, Debug, PartialEq)] pub struct DFA { pub alphabet: HashSet<char>, pub states: HashSet<u64>, pub initial_state: u64, pub final_states: HashSet<u64>, - pub transition_table: HashMap<(u64, char), u64>, - pub loose_transitions: HashMap<u64, u64>, + pub transition_table: HashMap<(u64, Transition), u64>, } impl Automaton for DFA { fn accepts(&self, input: &str) -> bool { let mut current_state: u64 = self.initial_state; for c in input.chars() { - if !self.alphabet.contains(&c) && !self.loose_transitions.contains_key(&current_state) { + if !self.alphabet.contains(&c) + && !self + .transition_table + .contains_key(&(current_state, Transition::Wildcard)) + { DFAParseError::print_warn(&format!( "Input character {} is not in the alphabet of the DFA", c @@ -75,8 +84,10 @@ impl Automaton for DFA { } match self .transition_table - .get(&(current_state, c)) - .or(self.loose_transitions.get(&current_state)) + .get(&(current_state, Transition::Char(c))) + .or(self + .transition_table + .get(&(current_state, Transition::Wildcard))) { Some(state) => current_state = state.to_owned(), None => { @@ -162,18 +173,19 @@ impl Display for DFA { format!("{}{}{}", st, n, ending) }); - let index_map: HashMap<char, usize> = alphabet_vec + let index_map: HashMap<Transition, usize> = alphabet_vec .iter() .enumerate() - .map(|(index, char)| (char.to_owned(), index)) + .map(|(index, &char)| (Transition::Char(char), index)) .collect(); let mut transition_table_hdr: String = alphabet_vec .iter() .map(|c| format!("\t{}\t|", c)) .fold("\t|".to_owned(), |st, substr| format!("{}{}", st, substr)); - transition_table_hdr.push_str("\tLoose"); + transition_table_hdr.push_str("\tWildcard"); + // FIXME: I Think this won't work xD let mut transition_map: HashMap<u64, Vec<u64>> = HashMap::new(); for ((from, input), to) in self.transition_table.iter() { @@ -189,15 +201,15 @@ impl Display for DFA { let transition_table_str: String = transition_map .iter() - .fold(transition_table_hdr, |st, (input, states)| { + .fold(transition_table_hdr, |st, (&from, states)| { let dests = states .into_iter() .fold("".to_owned(), |st, dst| format!("{}{}\t|\t", st, dst)); let wildcard = self - .loose_transitions - .get(input) + .transition_table + .get(&(from, Transition::Wildcard)) .map_or("∅".to_owned(), |&dest| dest.to_string()); - format!("{}\n{}\t|\t{}{}", st, input, dests, wildcard) + format!("{}\n{}\t|\t{}{}", st, from, dests, wildcard) }); writeln!(f, "Alphabet: {}", alphabet_str)?; @@ -260,7 +272,7 @@ impl Encodable<Value> for DFA { "Could not convert the final states to u64", ))?; - let mut transition_table: HashMap<(u64, char), u64> = HashMap::new(); + let mut transition_table: HashMap<(u64, Transition), u64> = HashMap::new(); for row in json["transition_table"] .as_array() .ok_or(DFAParseError::error( @@ -295,7 +307,7 @@ impl Encodable<Value> for DFA { "Unable to parse the transition destinations in the table", ))?; for (input, dest) in dests.iter() { - transition_table.insert((from, *input), *dest); + transition_table.insert((from, Transition::Char(*input)), *dest); } } @@ -305,7 +317,6 @@ impl Encodable<Value> for DFA { final_states, transition_table, initial_state, - loose_transitions: HashMap::new(), }) } } diff --git a/src/lib/enfa.rs b/src/lib/enfa.rs @@ -1,4 +1,8 @@ -use crate::{dfa::DFA, nfa::NFA, Automaton, Encodable, FiniteStateAutomaton}; +use crate::{ + dfa::{self, DFA}, + nfa::{self, NFA}, + Automaton, Encodable, FiniteStateAutomaton, +}; use core::fmt::Display; use serde_json::{from_str, Value}; use std::{ @@ -8,7 +12,7 @@ use std::{ }; #[derive(Debug)] -struct ENFAParseError { +pub struct ENFAParseError { reason: String, } @@ -46,6 +50,42 @@ impl ENFAParseError { impl Error for ENFAParseError {} +#[derive(Hash, Debug, PartialEq, Eq, Clone, Copy)] +pub enum Transition { + Char(char), + Wildcard, + Epsilon, +} + +impl From<nfa::Transition> for Transition { + fn from(transition: nfa::Transition) -> Self { + match transition { + nfa::Transition::Char(c) => Self::Char(c), + nfa::Transition::Wildcard => Self::Wildcard, + } + } +} + +impl From<dfa::Transition> for Transition { + fn from(transition: dfa::Transition) -> Self { + match transition { + dfa::Transition::Char(c) => Self::Char(c), + dfa::Transition::Wildcard => Self::Wildcard, + } + } +} + +impl TryFrom<Transition> for nfa::Transition { + type Error = ENFAParseError; + fn try_from(transition: Transition) -> Result<Self, Self::Error> { + match transition { + Transition::Char(c) => Ok(nfa::Transition::Char(c)), + Transition::Wildcard => Ok(nfa::Transition::Wildcard), + Transition::Epsilon => Err(ENFAParseError::error("Unable to convert ENFA epsilon transition to NFA transition, NFAs have no notion of an epsilon transition!")) + } + } +} + // ε-NFAs! #[derive(Clone, Debug, PartialEq)] pub struct ENFA { @@ -53,17 +93,20 @@ pub struct ENFA { states: HashSet<u64>, initial_state: u64, final_states: HashSet<u64>, - transition_table: HashMap<(u64, Option<char>), Vec<u64>>, - loose_transitions: HashMap<u64, u64>, + transition_table: HashMap<(u64, Transition), Vec<u64>>, } impl ENFA { fn eclose(&self, state: u64, visited: Option<&mut HashSet<u64>>) -> HashSet<u64> { let mut reachable: HashSet<u64> = HashSet::new(); - let mut deez = HashSet::new(); - let visited = visited.unwrap_or(&mut deez); + let mut visited_set = HashSet::new(); // Lifetime hack. There has to be a better way + let visited = visited.unwrap_or(&mut visited_set); reachable.insert(state); - for state in self.transition_table.get(&(state, None)).unwrap_or(&vec![]) { + for state in self + .transition_table + .get(&(state, Transition::Epsilon)) + .unwrap_or(&vec![]) + { if !visited.contains(state) { visited.insert(*state); for state in self.eclose(*state, Some(visited)).iter() { @@ -80,8 +123,7 @@ impl ENFA { states: HashSet<u64>, initial_state: u64, final_states: HashSet<u64>, - transition_table: HashMap<(u64, Option<char>), Vec<u64>>, - loose_transitions: HashMap<u64, u64>, + transition_table: HashMap<(u64, Transition), Vec<u64>>, ) -> ENFA { ENFA { alphabet, @@ -89,9 +131,68 @@ impl ENFA { initial_state, final_states, transition_table, - loose_transitions, } } + + pub fn as_nfa(&self) -> NFA { + // TODO: Implement this properly + let nfa_final_states = self + .states + .iter() + .filter(|&&starting_state| { + self.eclose(starting_state, None) + .into_iter() + .any(|epsilon_accessible| self.final_states.contains(&epsilon_accessible)) + }) + .map(u64::to_owned) + .collect(); + + let mut nfa_transition_table: HashMap<(u64, nfa::Transition), Vec<u64>> = HashMap::new(); + + let enfa_state_accepted_input_map: HashMap<u64, HashSet<Transition>> = HashMap::new(); + + for (from, transition) in self.transition_table.keys() { + let mut transitions = enfa_state_accepted_input_map + .get(from) + .map_or(HashSet::new(), HashSet::to_owned); + transitions.insert(*transition); + } + + for &state in self.states.iter() { + let eclose = self.eclose(state, None); + for (enfa_from, transition) in eclose + .into_iter() + .flat_map(|s| { + enfa_state_accepted_input_map + .get(&s) + .map_or(HashSet::new(), HashSet::to_owned) + .into_iter() + .map(move |t| (s, t)) + }) + .filter_map(|(s, t)| nfa::Transition::try_from(t).map(|t| (s, t)).ok()) + { + let mut enfa_to = self + .transition_table + .get(&(enfa_from, Transition::from(transition))) + .map_or(vec![], Vec::to_owned); + let nfa_to = nfa_transition_table.get_mut(&(state, transition)); + match nfa_to { + Some(v) => v.append(&mut enfa_to), + None => { + nfa_transition_table.insert((state, transition), enfa_to); + } + } + } + } + + NFA::new( + self.alphabet.clone(), + self.states.clone(), + self.initial_state, + nfa_final_states, + nfa_transition_table, + ) + } } impl Automaton for ENFA { @@ -103,24 +204,32 @@ impl Automaton for ENFA { for c in input.chars() { if !self.alphabet.contains(&c) - && !current_states - .iter() - .any(|s| self.loose_transitions.contains_key(s)) + && !current_states.iter().any(|&s| { + self.transition_table + .contains_key(&(s, Transition::Wildcard)) + }) { ENFAParseError::print_warn(&format!( - "Input character {} is not in the alphabet of the DFA", + "Input character {} is not in the alphabet of the ENFA", c )); } for state in current_states.iter() { for newstate in self .transition_table - .get(&(*state, Some(c))) + .get(&(*state, Transition::Char(c))) + // FIXME: This is ugly .map_or( - self.loose_transitions - .get(state) - .map_or(vec![], |&s| vec![s]), - |v| v.to_owned(), + self.transition_table + .get(&(*state, Transition::Wildcard)) + .map_or(vec![], Vec::to_owned), + |v| { + let mut transitions = v.to_owned(); + self.transition_table + .get(&(*state, Transition::Wildcard)) + .map(|s| transitions.append(&mut s.clone())); + transitions + }, ) .iter() .flat_map(|s| self.eclose(*s, None)) @@ -139,43 +248,9 @@ impl Automaton for ENFA { } impl FiniteStateAutomaton for ENFA { - // TODO: Check that this is all g - // FIXED?: BUG: If a final state is within close(state), then state is also a final state. - // TODO: Check that there are no other bugs fn as_dfa(&self) -> Result<DFA, Box<dyn std::error::Error>> { - let mut nfa_transition_table = HashMap::new(); - for (from, input) in self - .transition_table - .keys() - .filter_map(|(f, i)| i.and_then(|c| Some((*f, c)))) - { - let new_to: Vec<u64> = self - .eclose(from, None) - .iter() - .filter_map(|v| self.transition_table.get(&(*v, Some(input)))) - .flatten() - .map(u64::to_owned) - .collect(); - nfa_transition_table.insert((from, input), new_to); - } - let final_states = self - .states - .iter() - .filter(|v| { - self.eclose(**v, None) - .iter() - .any(|s| self.final_states.contains(s)) - }) - .map(u64::to_owned) - .collect(); - let nfa = NFA::new( - self.alphabet.clone(), - self.states.clone(), - self.initial_state, - final_states, - nfa_transition_table, - self.loose_transitions.clone(), - ); + let nfa = self.as_nfa(); + eprintln!("HOLY FUCKERONI IS THE NFA RIGHT? {}", nfa); nfa.as_dfa() } @@ -186,7 +261,7 @@ impl FiniteStateAutomaton for ENFA { let transition_table = dfa .transition_table .iter() - .map(|((from, input), to)| ((*from, Some(*input)), vec![*to])) + .map(|((from, input), to)| ((*from, Transition::from(*input)), vec![*to])) .collect(); Ok(ENFA { alphabet: dfa.alphabet, @@ -194,7 +269,6 @@ impl FiniteStateAutomaton for ENFA { initial_state: dfa.initial_state, final_states: dfa.final_states, transition_table, - loose_transitions: dfa.loose_transitions, }) } } @@ -270,11 +344,11 @@ impl Display for ENFA { .unwrap_or(&vec![Vec::new(); self.alphabet.len() + 1]) .to_owned(); match input { - Some(char) => { + Transition::Char(char) => { let new_index = index_map.get(char).ok_or(core::fmt::Error)?; new_vec[*new_index] = to.to_owned(); } - None => { + _ => { new_vec[self.alphabet.len()] = to.to_owned(); } } @@ -300,9 +374,9 @@ impl Display for ENFA { }); // TODO: Check that this is correct let wildcard_map: String = self - .loose_transitions - .get(input) - .map_or("∅".to_owned(), |&dest| dest.to_string()); + .transition_table + .get(&(*input, Transition::Wildcard)) + .map_or("∅".to_owned(), |dest| format!("{:?}", dest)); format!("{}\n{}\t|\t{}\t|\t{}", acc, input, dests, wildcard_map) }); @@ -364,7 +438,7 @@ impl Encodable<Value> for ENFA { "Could not convert the final states to u64", ))?; - let mut transition_table: HashMap<(u64, Option<char>), Vec<u64>> = HashMap::new(); + let mut transition_table: HashMap<(u64, Transition), Vec<u64>> = HashMap::new(); for row in json["transition_table"] .as_array() @@ -423,10 +497,10 @@ impl Encodable<Value> for ENFA { "Unable to parse the ε-closure destinations in the table", ))?; for (input, dests) in dest_sets.iter() { - transition_table.insert((from, Some(*input)), dests.to_owned()); + transition_table.insert((from, Transition::Char(*input)), dests.to_owned()); } for transition in epsilon_transitions.iter() { - transition_table.insert((from, None), transition.to_owned()); + transition_table.insert((from, Transition::Epsilon), transition.to_owned()); } } @@ -437,7 +511,6 @@ impl Encodable<Value> for ENFA { initial_state, final_states, transition_table, - loose_transitions: HashMap::new(), }) } } diff --git a/src/lib/graph_enfa.rs b/src/lib/graph_enfa.rs @@ -1,4 +1,8 @@ -use crate::{enfa::ENFA, Automaton, FiniteStateAutomaton}; +use crate::{ + dfa, + enfa::{self, ENFA}, + Automaton, FiniteStateAutomaton, +}; use petgraph::{dot::Dot, graph::NodeIndex, visit::EdgeRef, Directed, Graph}; use std::{ collections::{HashMap, HashSet}, @@ -35,14 +39,49 @@ impl GraphENFAError { impl Error for GraphENFAError {} +#[derive(Hash, Debug, PartialEq, Eq, Copy, Clone)] +pub enum Transition { + Char(char), + Wildcard, + Epsilon, +} + +impl From<Transition> for enfa::Transition { + fn from(transition: Transition) -> Self { + match transition { + Transition::Char(c) => enfa::Transition::Char(c), + Transition::Wildcard => enfa::Transition::Wildcard, + Transition::Epsilon => enfa::Transition::Epsilon, + } + } +} + +impl From<enfa::Transition> for Transition { + fn from(transition: enfa::Transition) -> Self { + match transition { + enfa::Transition::Char(c) => Transition::Char(c), + enfa::Transition::Wildcard => Transition::Wildcard, + enfa::Transition::Epsilon => Transition::Epsilon, + } + } +} + +impl From<dfa::Transition> for Transition { + fn from(transition: dfa::Transition) -> Self { + match transition { + dfa::Transition::Char(c) => Transition::Char(c), + dfa::Transition::Wildcard => Transition::Wildcard, + } + } +} + /// Intended as an easy way of constructing an ε-NFA in bits, adding onto it much like you'd build an ε-NFA on a whiteboard. pub struct GraphENFA { alphabet: HashSet<char>, states: HashSet<u64>, initial_state: u64, final_states: HashSet<u64>, - graph: Graph<u64, Option<char>, Directed>, - loose_transitions: HashMap<u64, u64>, + graph: Graph<u64, Transition, Directed>, state_id_to_index_map: HashMap<u64, NodeIndex>, current_state_id: u64, } @@ -68,7 +107,6 @@ impl GraphENFA { initial_state: initial_state_id, final_states: HashSet::new(), graph, - loose_transitions: HashMap::new(), state_id_to_index_map, current_state_id: initial_state_id, }, @@ -155,7 +193,7 @@ impl GraphENFA { &mut self, from_id: u64, to_id: u64, - input: Option<char>, + input: Transition, ) -> Result<(), GraphENFAError> { if !self.states.contains(&from_id) || !self.states.contains(&to_id) { return Err(GraphENFAError::error( @@ -164,7 +202,7 @@ impl GraphENFA { } match input { - Some(c) => { + Transition::Char(c) => { if !self.alphabet.contains(&c) { self.alphabet.insert(c); } @@ -193,29 +231,10 @@ impl GraphENFA { Ok(()) } - pub fn insert_loose_transition( - &mut self, - from_id: u64, - to_id: u64, - ) -> Result<(), GraphENFAError> { - if !self.states.contains(&from_id) || !self.states.contains(&to_id) { - return Err(GraphENFAError::error( - "Tried adding a loose transition to/from a state that doesn't exist!", - )); - } - if self.loose_transitions.contains_key(&from_id) { - return Err(GraphENFAError::error( - "There is already a loose transition from the specified from location!", - )); - } - self.loose_transitions.insert(from_id, to_id); - Ok(()) - } - /// Convert into a regular ε-NFA backed by a `HashMap` instead of a `Graph` /// Whilst this can be used, the `as_dfa` method should be used if you're going to evaluate string acceptance. pub fn as_enfa(&self) -> Result<ENFA, Box<dyn Error>> { - let mut transition_table = HashMap::new(); + let mut transition_table: HashMap<(u64, enfa::Transition), Vec<u64>> = HashMap::new(); let mut current_states = vec![self.initial_state]; let mut next_states = vec![]; let mut visited = HashSet::new(); @@ -231,7 +250,7 @@ impl GraphENFA { .to_owned(); for edge in self.graph.edges(current_state_index) { let mut transitions = transition_table - .get(&(current_state, *edge.weight())) + .get(&(current_state, Transition::into(*edge.weight()))) .map_or(vec![], |v: &Vec<u64>| v.to_owned()); let dest = self .graph @@ -244,8 +263,12 @@ impl GraphENFA { if !visited.contains(&dest) { next_states.push(dest); } - transition_table.insert((current_state, *edge.weight()), transitions); + transition_table.insert( + (current_state, Transition::into(*edge.weight())), + transitions, + ); } + visited.insert(current_state); } current_states = next_states; @@ -258,7 +281,6 @@ impl GraphENFA { self.initial_state, self.final_states.clone(), transition_table, - self.loose_transitions.clone(), )) } } @@ -295,7 +317,7 @@ impl FiniteStateAutomaton for GraphENFA { .get(&to) .map_or_else(|| graph.add_node(to), |n| n.to_owned()); state_id_to_index_map.insert(to, to_index); - graph.add_edge(from_index, to_index, Some(input)); + graph.add_edge(from_index, to_index, Transition::from(input)); } let current_state_id = *dfa @@ -310,7 +332,6 @@ impl FiniteStateAutomaton for GraphENFA { initial_state: dfa.initial_state, final_states: dfa.final_states, graph, - loose_transitions: dfa.loose_transitions, state_id_to_index_map, current_state_id, }) diff --git a/src/lib/nfa.rs b/src/lib/nfa.rs @@ -1,7 +1,10 @@ -use crate::{dfa::DFA, Automaton, Encodable, FiniteStateAutomaton}; +use crate::{ + dfa::{self, DFA}, + Automaton, Encodable, FiniteStateAutomaton, +}; use core::fmt::Display; use std::{ - collections::{BTreeSet, HashMap, HashSet}, + collections::{BTreeSet, HashMap, HashSet, VecDeque}, error::Error, fmt, panic::Location, @@ -48,14 +51,28 @@ impl NFAParseError { impl Error for NFAParseError {} +#[derive(Hash, Debug, PartialEq, Eq, Clone, Copy)] +pub enum Transition { + Char(char), + Wildcard, +} + +impl From<dfa::Transition> for Transition { + fn from(transition: dfa::Transition) -> Self { + match transition { + dfa::Transition::Char(c) => Transition::Char(c), + dfa::Transition::Wildcard => Transition::Wildcard, + } + } +} + #[derive(Clone, Debug, PartialEq)] pub struct NFA { pub alphabet: HashSet<char>, states: HashSet<u64>, initial_state: u64, final_states: HashSet<u64>, - transition_table: HashMap<(u64, char), Vec<u64>>, - loose_transitions: HashMap<u64, u64>, + transition_table: HashMap<(u64, Transition), Vec<u64>>, } impl NFA { @@ -64,8 +81,7 @@ impl NFA { states: HashSet<u64>, initial_state: u64, final_states: HashSet<u64>, - transition_table: HashMap<(u64, char), Vec<u64>>, - loose_transitions: HashMap<u64, u64>, + transition_table: HashMap<(u64, Transition), Vec<u64>>, ) -> Self { NFA { alphabet, @@ -73,7 +89,6 @@ impl NFA { initial_state, final_states, transition_table, - loose_transitions, } } } @@ -84,9 +99,10 @@ impl Automaton for NFA { let mut new_states: Vec<u64> = Vec::new(); for c in input.chars() { if !self.alphabet.contains(&c) - && !current_states - .iter() - .any(|s| self.loose_transitions.contains_key(s)) + && !current_states.iter().any(|&s| { + self.transition_table + .contains_key(&(s, Transition::Wildcard)) + }) { NFAParseError::print_warn(&format!( "Input character {} is not in the alphabet of the DFA", @@ -95,9 +111,11 @@ impl Automaton for NFA { return false; } for i in 0..current_states.len() { - match current_states.get(i).map_or(None, |s| { - self.transition_table.get(&(*s, c)).map_or( - self.loose_transitions.get(s).and_then(|s| Some(vec![*s])), + match current_states.get(i).map_or(None, |&s| { + self.transition_table.get(&(s, Transition::Char(c))).map_or( + self.transition_table + .get(&(s, Transition::Wildcard)) + .and_then(|s| Some(s.to_owned())), |v| Some(v.to_owned()), ) }) { @@ -119,76 +137,87 @@ impl Automaton for NFA { impl FiniteStateAutomaton for NFA { fn as_dfa(&self) -> Result<DFA, Box<dyn Error>> { - let mut current_newstate = self.initial_state; - let mut newest_newstate = current_newstate; - // Subset construction algo - let mut state_forward_map: HashMap<BTreeSet<u64>, u64> = HashMap::new(); - let mut state_reverse_map: HashMap<u64, BTreeSet<u64>> = HashMap::new(); - let mut transition_table: HashMap<(u64, char), u64> = HashMap::new(); - - state_reverse_map.insert(self.initial_state, BTreeSet::from([self.initial_state])); - state_forward_map.insert(BTreeSet::from([self.initial_state]), self.initial_state); - - while current_newstate <= newest_newstate { - for c in self.alphabet.iter() { - let current_states = state_reverse_map - .clone() - .get(&current_newstate) - .ok_or(NFAParseError::error( - "We should have already added this state into the reverse map!", - ))? - .to_owned(); - let to: BTreeSet<u64> = current_states - .iter() - .flat_map(|st| { - self.transition_table - .get(&(*st, *c)) - .unwrap_or(&vec![]) - .to_owned() - .iter() - .map(u64::to_owned) - .collect::<Vec<u64>>() - }) - .collect(); - - match state_forward_map.get(&to) { - Some(state) => { - transition_table.insert((current_newstate, *c), *state); - } - None => { - newest_newstate += 1; - state_forward_map.insert(to.clone(), newest_newstate); - state_reverse_map.insert(newest_newstate, to.clone()); - transition_table.insert((current_newstate, *c), newest_newstate); - if to.len() == 0 { - // Trap state! - for c in self.alphabet.iter() { - transition_table.insert((newest_newstate, *c), newest_newstate); - } - } + // TODO: Implement this function + let mut states = HashSet::new(); + let initial_dfa_state = 0; + let mut newest_dfa_state = initial_dfa_state; + let mut final_states = HashSet::new(); + let mut transition_table = HashMap::new(); + + let mut dfa_state_to_nfa_set: HashMap<u64, BTreeSet<u64>> = HashMap::new(); + let mut nfa_set_to_dfa_state: HashMap<BTreeSet<u64>, u64> = HashMap::new(); + + let mut possible_inputs: Vec<dfa::Transition> = self + .alphabet + .iter() + .map(|&c| dfa::Transition::Char(c)) + .collect(); + possible_inputs.push(dfa::Transition::Wildcard); + + for &input in possible_inputs.iter() { + let mut nfa_sets_to_be_processed: VecDeque<BTreeSet<u64>> = VecDeque::new(); + let mut processing_nfa_states = BTreeSet::from([self.initial_state]); + let mut nfa_sets_processed: BTreeSet<BTreeSet<u64>> = BTreeSet::new(); + while processing_nfa_states.len() != 0 { + if !nfa_sets_processed.contains(&processing_nfa_states) { + for nfa_dests in processing_nfa_states + .iter() + .map(|&nfa_state| { + self.transition_table + .get(&(nfa_state, Transition::from(input))) + .map_or(vec![], Vec::to_owned) + }) + //This conversion to BTreeSet is shit + .map(Vec::into_iter) + .map(BTreeSet::from_iter) + { + let dfa_from = nfa_set_to_dfa_state + .get(&processing_nfa_states) + .ok_or(NFAParseError::error( + "Unable to get something that should have already been processed!", + ))? + .to_owned(); + let dfa_to = nfa_set_to_dfa_state.clone().get(&nfa_dests).map_or_else( + || { + // New subset! + newest_dfa_state += 1; + states.insert(newest_dfa_state); + dfa_state_to_nfa_set.insert(newest_dfa_state, nfa_dests.clone()); + nfa_set_to_dfa_state.insert(nfa_dests.clone(), newest_dfa_state); + + if nfa_dests.iter().any(|s| self.final_states.contains(s)) { + // This is a final state! + final_states.insert(newest_dfa_state); + } else if nfa_dests.len() == 0 { + // This is the trap state! + for &trans in possible_inputs.iter() { + transition_table + .insert((newest_dfa_state, trans), newest_dfa_state); + } + } + newest_dfa_state + }, + u64::to_owned, + ); + + nfa_sets_to_be_processed.push_back(nfa_dests); + + transition_table.insert((dfa_from, input), dfa_to); } } + nfa_sets_processed.insert(processing_nfa_states); + processing_nfa_states = nfa_sets_to_be_processed + .pop_front() + .unwrap_or(BTreeSet::new()); } - current_newstate += 1; } - let states = (self.initial_state..newest_newstate).collect(); - let final_states: HashSet<u64> = state_reverse_map - .iter() - .filter(|(_, old_states)| { - old_states - .iter() - .any(|old_state| self.final_states.contains(old_state)) - }) - .map(|(new_state, _)| new_state.to_owned()) - .collect(); Ok(DFA { alphabet: self.alphabet.clone(), states, - initial_state: self.initial_state, + initial_state: initial_dfa_state, final_states, transition_table, - loose_transitions: self.loose_transitions.clone(), }) } @@ -199,7 +228,7 @@ impl FiniteStateAutomaton for NFA { let transition_table = dfa .transition_table .iter() - .map(|(from_input, to)| (*from_input, vec![*to])) + .map(|(&(from, input), &to)| ((from, Transition::from(input)), vec![to])) .collect(); Ok(NFA { alphabet: dfa.alphabet, @@ -207,7 +236,6 @@ impl FiniteStateAutomaton for NFA { initial_state: dfa.initial_state, final_states: dfa.final_states, transition_table, - loose_transitions: dfa.loose_transitions, }) } } @@ -270,10 +298,10 @@ impl Display for NFA { let mut transition_map: HashMap<u64, Vec<Vec<u64>>> = HashMap::new(); - let index_map: HashMap<char, usize> = alphabet_vec + let index_map: HashMap<Transition, usize> = alphabet_vec .iter() .enumerate() - .map(|(index, char)| (char.to_owned(), index)) + .map(|(index, char)| (Transition::Char(char.to_owned()), index)) .collect(); for ((from, input), to) in self.transition_table.iter() { @@ -304,9 +332,9 @@ impl Display for NFA { format!("{acc}{}\t|\t", dest_sets_str) }); let wildcard = self - .loose_transitions - .get(input) - .map_or("∅".to_owned(), |&dest| dest.to_string()); + .transition_table + .get(&(*input, Transition::Wildcard)) + .map_or("∅".to_owned(), |dest| format!("{:?}", dest)); format!("{}\n{}\t|\t{}\t|\t{}", st, input, dests, wildcard) }); @@ -366,7 +394,7 @@ impl Encodable<Value> for NFA { "Could not convert the final states to u64", ))?; - let mut transition_table: HashMap<(u64, char), Vec<u64>> = HashMap::new(); + let mut transition_table: HashMap<(u64, Transition), Vec<u64>> = HashMap::new(); for row in json["transition_table"] .as_array() .ok_or(NFAParseError::error( @@ -407,7 +435,7 @@ impl Encodable<Value> for NFA { "Unable to parse the transition destinations in the table", ))?; for (input, dests) in dest_sets.iter() { - transition_table.insert((from, *input), dests.to_owned()); + transition_table.insert((from, Transition::Char(*input)), dests.to_owned()); } } @@ -417,7 +445,6 @@ impl Encodable<Value> for NFA { initial_state, final_states, transition_table, - loose_transitions: HashMap::new(), }) } } diff --git a/src/lib/regex.rs b/src/lib/regex.rs @@ -1,6 +1,6 @@ use crate::{ dfa::DFA, - graph_enfa::{GraphENFA, GraphENFAError}, + graph_enfa::{GraphENFA, GraphENFAError, Transition}, Automaton, Encodable, FiniteStateAutomaton, }; use either::Either; @@ -249,14 +249,22 @@ impl Regex { } if groups.len() != 1 { - Err(RegexError::ParseError( + return Err(RegexError::ParseError( "Ended the grammar tree within a group!".to_owned(), - )) - } else { - Ok(RegexNonTerminal::Group(groups.pop().ok_or( - RegexError::ParseError("Ended the grammar tree without the main group!".to_owned()), - )?)) + )); } + if in_a_union { + // Catch the outermost union if it exists + let last_elem = groups.pop().ok_or(RegexError::ParseError( + "Ended the grammar tree without the main group!".to_owned(), + ))?; + or_group.push(RegexNonTerminal::Group(last_elem)); + return Ok(RegexNonTerminal::Union(or_group)); + } + + Ok(RegexNonTerminal::Group(groups.pop().ok_or( + RegexError::ParseError("Ended the grammar tree without the main group!".to_owned()), + )?)) } } @@ -279,8 +287,29 @@ impl Encodable<Vec<Either<char, RegexToken>>> for Regex { } } +struct Stack<T> { + stack: Vec<T>, +} + +impl<T> Stack<T> { + fn new() -> Self { + Stack { stack: vec![] } + } + fn peek_mut(&mut self) -> Option<&mut T> { + self.stack.last_mut() + } + fn peek(&self) -> Option<&T> { + self.stack.last() + } + fn push(&mut self, item: T) { + self.stack.push(item) + } + fn pop(&mut self) -> Option<T> { + self.stack.pop() + } +} + impl GraphENFA { - // TODO: Build up an ENFA using the parse tree fn add_non_terminal( &mut self, nonterminal: RegexNonTerminal, @@ -299,9 +328,13 @@ impl GraphENFA { let new_state = self.new_state()?; match terminal { - RegexTerminal::Character(c) => self.insert_transition(from, new_state, Some(c)), - RegexTerminal::Wildcard => self.insert_loose_transition(from, new_state), - RegexTerminal::Epsilon => self.insert_transition(from, new_state, None), + RegexTerminal::Character(c) => { + self.insert_transition(from, new_state, Transition::Char(c)) + } + RegexTerminal::Wildcard => { + self.insert_transition(from, new_state, Transition::Wildcard) + } + RegexTerminal::Epsilon => self.insert_transition(from, new_state, Transition::Epsilon), }?; Ok(new_state) @@ -328,15 +361,9 @@ impl GraphENFA { kleene_group: Vec<RegexNonTerminal>, from: u64, ) -> Result<u64, GraphENFAError> { - // FIXME: KLEENE GROUPS ARE BORKED! - let start_state = self.add_terminal(RegexTerminal::Epsilon, from)?; - let end_state = self.new_state()?; - self.insert_transition(start_state, end_state, None)?; - self.insert_transition(end_state, start_state, None)?; - - let last_state_in_group = self.add_group(kleene_group, start_state)?; - - self.insert_transition(last_state_in_group, end_state, None)?; + let end_state = self.add_group(kleene_group, from)?; + self.insert_transition(from, end_state, Transition::Epsilon)?; + self.insert_transition(end_state, from, Transition::Epsilon)?; Ok(end_state) } @@ -347,13 +374,11 @@ impl GraphENFA { union: Vec<RegexNonTerminal>, from: u64, ) -> Result<u64, GraphENFAError> { - // FIXME: Unions are buggy, they are looking for the end of scope ')', but won't work as just something like a|b - let start_state = self.add_terminal(RegexTerminal::Epsilon, from)?; let end_state = self.new_state()?; for nonterminal in union { - let entry_state = self.add_terminal(RegexTerminal::Epsilon, start_state)?; + let entry_state = self.add_terminal(RegexTerminal::Epsilon, from)?; let exit_state = self.add_non_terminal(nonterminal, entry_state)?; - self.insert_transition(exit_state, end_state, None)?; + self.insert_transition(exit_state, end_state, Transition::Epsilon)?; } Ok(end_state) } @@ -365,8 +390,16 @@ impl Encodable<RegexNonTerminal> for Regex { let final_state = enfa.add_non_terminal(parse_tree.clone(), current_state)?; enfa.set_state_acceptance(final_state, true)?; + eprintln!("DEBUG: {}", enfa); + + let test_enfa = enfa.as_enfa()?; + + eprintln!("DEBUG2: {}", test_enfa); + let dfa = enfa.as_dfa()?; + eprintln!("DEBUG3: {}", dfa); + Ok(Regex { automaton: dfa, parsed: parse_tree,