automaton

An automaton library written in Rust
Log | Files | Refs

commit 013bd2e9723438a78cea692d71872a65a968a1ba
parent 535924654494262d9585cf2ee309cd268a6b54c0
Author: Ethan Long <ethandavidlong@gmail.com>
Date:   Sun,  4 Feb 2024 00:41:29 +1100

After a lot of thinking, regex has been implemented!

It is definitely buggy though! From first glances, I can already see
that wildcards are not supported. More testing to come soon™

Diffstat:
MCargo.lock | 46++++++++++++++++++++++++++++++++++++++++++++++
MCargo.toml | 11+++++++++++
Msrc/bin/dfa.rs | 12++++++------
Msrc/bin/enfa.rs | 30++++++++++++++++++------------
Msrc/bin/nfa.rs | 26++++++++++++++++----------
Asrc/bin/regex.rs | 46++++++++++++++++++++++++++++++++++++++++++++++
Msrc/lib/automaton.rs | 17+++++++++++------
Msrc/lib/dfa.rs | 46++++++++++++++++++++++++++++++----------------
Msrc/lib/enfa.rs | 138+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Asrc/lib/graph_enfa.rs | 378+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/lib/nfa.rs | 42+++++++++++++++++++++++++++++++-----------
Asrc/lib/regex.rs | 370+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/lib/tests/regex_tests.rs | 31+++++++++++++++++++++++++++++++
13 files changed, 1097 insertions(+), 96 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock @@ -6,6 +6,8 @@ version = 3 name = "automaton" version = "0.1.0" dependencies = [ + "either", + "petgraph", "rand", "serde", "serde_json", @@ -18,6 +20,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] name = "getrandom" version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -29,6 +49,22 @@ dependencies = [ ] [[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" + +[[package]] +name = "indexmap" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "433de089bd45971eecf4668ee0ee8f4cec17db4f8bd8f7bc3197a6ce37aa7d9b" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] name = "itoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -41,6 +77,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" [[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/Cargo.toml b/Cargo.toml @@ -6,9 +6,11 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +either = "1.9.0" rand = "0.8.5" serde = "1.0.193" serde_json = "1.0.108" +petgraph = "0.6.4" [lib] name = "automaton" @@ -36,4 +38,13 @@ test = false bench = false doc = false proc-macro = false +required-features = [] + +[[bin]] +name = "enfa" +path = "src/bin/enfa.rs" +test = false +bench = false +doc = false +proc-macro = false required-features = [] \ No newline at end of file diff --git a/src/bin/dfa.rs b/src/bin/dfa.rs @@ -29,21 +29,21 @@ fn usage(prog_name: &str) -> String { fn main() -> Result<(), Box<dyn Error>> { let args: Vec<String> = env::args().collect(); - if args.len() != 3 { + if args.len() < 2 { return Err(Box::new(CommandLineError::cause(&usage( args.get(0).map_or("automaton", |s| s), )))); } - let dfa_json = fs::read_to_string(args.get(1).unwrap_or(&"".to_owned()))?; - let input_string = args - .get(2) - .ok_or(CommandLineError::cause("No input string provided!"))?; + let dfa_json = fs::read_to_string(args.get(1).ok_or(CommandLineError::cause( + "Could not get filename from commandline args", + ))?)?; + let input_string: String = args.get(2).map_or("".to_owned(), String::to_owned); let dfa: DFA = DFA::parse(dfa_json.as_str())?; println!("DFA:\n{dfa}"); println!("Running on the following input:\n{input_string}"); - println!("Accepts: {}", dfa.accepts(input_string)); + println!("Accepts: {}", dfa.accepts(&input_string)); Ok(()) } diff --git a/src/bin/enfa.rs b/src/bin/enfa.rs @@ -23,30 +23,36 @@ impl CommandLineError { impl Error for CommandLineError {} fn usage(prog_name: &str) -> String { - format!("Usage: {prog_name} <NFA filename> <input string>") + format!("Usage: {prog_name} <NFA filename> <input string> <USE RAW NFA>") } fn main() -> Result<(), Box<dyn Error>> { let args: Vec<String> = env::args().collect(); - if args.len() != 3 { + if args.len() < 2 { return Err(Box::new(CommandLineError::cause(&usage( args.get(0).map_or("automaton", |s| s), )))); } - let enfa_json = fs::read_to_string(args.get(1).unwrap_or(&"".to_owned()))?; + let enfa_json = fs::read_to_string(args.get(1).ok_or(CommandLineError::cause("Could not get filename from commandline args"))?)?; let input_string = args .get(2) - .ok_or(CommandLineError::cause("No input string provided!"))?; - - let nfa: ENFA = ENFA::parse(enfa_json.as_str())?; - println!("NFA:\n{nfa}"); - println!("Running on the following input:\n{input_string}"); - println!("Accepts: {}", nfa.accepts(input_string)); - let dfa: DFA = DFA::convert(Box::new(nfa))?; - println!("Converted to DFA, accepts: {}", dfa.accepts(input_string)); - println!("DFA:\n{dfa}"); + .map_or("".to_owned(), String::to_owned); + + let enfa: ENFA = ENFA::parse(enfa_json.as_str())?; + let automaton: Box<dyn Automaton> = match args.get(3) { + None => { + println!("Converting to DFA"); + Box::new(DFA::convert(Box::new(enfa))?) + } + Some(_) => { + Box::new(enfa) + } + }; + println!("Automaton:\n{}", automaton); + println!("Running on the following input:\n\"{}\"", input_string); + println!("Accepts: {}", automaton.accepts(&input_string)); Ok(()) } diff --git a/src/bin/nfa.rs b/src/bin/nfa.rs @@ -23,30 +23,36 @@ impl CommandLineError { impl Error for CommandLineError {} fn usage(prog_name: &str) -> String { - format!("Usage: {prog_name} <NFA filename> <input string>") + format!("Usage: {prog_name} <NFA filename> <input string> <USE RAW NFA>") } fn main() -> Result<(), Box<dyn Error>> { let args: Vec<String> = env::args().collect(); - if args.len() != 3 { + if args.len() < 2 { return Err(Box::new(CommandLineError::cause(&usage( args.get(0).map_or("automaton", |s| s), )))); } - let nfa_json = fs::read_to_string(args.get(1).unwrap_or(&"".to_owned()))?; + let nfa_json = fs::read_to_string(args.get(1).ok_or(CommandLineError::cause("Could not get filename from commandline args"))?)?; let input_string = args .get(2) - .ok_or(CommandLineError::cause("No input string provided!"))?; + .map_or("".to_owned(), String::to_owned); let nfa: NFA = NFA::parse(nfa_json.as_str())?; - println!("NFA:\n{nfa}"); - println!("Running on the following input:\n{input_string}"); - println!("Accepts: {}", nfa.accepts(input_string)); - let dfa: DFA = DFA::convert(Box::new(nfa))?; - println!("Converted to DFA, accepts: {}", dfa.accepts(input_string)); - println!("DFA:\n{dfa}"); + let automaton: Box<dyn Automaton> = match args.get(3) { + None => { + println!("Converting to DFA"); + Box::new(DFA::convert(Box::new(nfa))?) + }, + Some(_) => { + Box::new(nfa) + } + }; + println!("Automaton:\n{}", automaton); + println!("Running on the following input:\n\"{}\"", input_string); + println!("Accepts: {}", automaton.accepts(&input_string)); Ok(()) } diff --git a/src/bin/regex.rs b/src/bin/regex.rs @@ -0,0 +1,46 @@ +use automaton::{regex::Regex, Automaton, Encodable}; +use std::{env, error::Error, fmt::Display}; + +#[derive(Debug)] +struct CommandLineError { + reason: String, +} + +impl Display for CommandLineError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.reason) + } +} + +impl CommandLineError { + fn cause(reason: &str) -> Self { + CommandLineError { + reason: reason.to_owned(), + } + } +} + +impl Error for CommandLineError {} + +fn usage(prog_name: &str) -> String { + format!("Usage: {prog_name} <regex string> <input string>") +} + +fn main() -> Result<(), Box<dyn Error>> { + let args: Vec<String> = env::args().collect(); + + if args.len() < 2 { + return Err(Box::new(CommandLineError::cause(&usage( + args.get(0).map_or("automaton", |s| s), + )))); + } + + let regex_str: String = args.get(1).map_or("".to_owned(), String::to_owned); + let input_string: String = args.get(2).map_or("".to_owned(), String::to_owned); + + let regex = Regex::parse(regex_str.as_str())?; + println!("Running on the following input:\n{input_string}"); + println!("Accepts: {}", regex.accepts(&input_string)); + + Ok(()) +} diff --git a/src/lib/automaton.rs b/src/lib/automaton.rs @@ -1,10 +1,13 @@ use std::error::Error; +use std::fmt::Display; pub mod dfa; -pub mod nfa; pub mod enfa; +pub mod graph_enfa; +pub mod nfa; +pub mod regex; -pub trait Automaton { +pub trait Automaton: Display { fn accepts(&self, input: &str) -> bool; } @@ -15,7 +18,7 @@ pub trait FiniteStateAutomaton: Automaton { Self: Sized; fn convert(automaton: Box<dyn FiniteStateAutomaton>) -> Result<Self, Box<dyn Error>> where - Self: Sized + Self: Sized, { Self::from_dfa(automaton.as_dfa()?) } @@ -30,10 +33,12 @@ where #[cfg(test)] mod tests { - mod dfa_tests; - mod nfa_tests; - mod enfa_tests; mod testlib { pub mod util; } + + mod dfa_tests; + mod enfa_tests; + mod nfa_tests; + mod regex_tests; } diff --git a/src/lib/dfa.rs b/src/lib/dfa.rs @@ -60,20 +60,24 @@ pub struct DFA { pub initial_state: u64, pub final_states: HashSet<u64>, pub transition_table: HashMap<(u64, char), u64>, + pub loose_transitions: HashMap<u64, u64>, } impl Automaton for DFA { fn accepts(&self, input: &str) -> bool { let mut current_state: u64 = self.initial_state; for c in input.chars() { - if !self.alphabet.contains(&c) { + if !self.alphabet.contains(&c) && !self.loose_transitions.contains_key(&current_state) { DFAParseError::print_warn(&format!( "Input character {} is not in the alphabet of the DFA", c )); - return false; } - match self.transition_table.get(&(current_state, c)) { + match self + .transition_table + .get(&(current_state, c)) + .or(self.loose_transitions.get(&current_state)) + { Some(state) => current_state = state.to_owned(), None => { DFAParseError::print_error( @@ -121,26 +125,26 @@ impl Display for DFA { alphabet_vec .iter() .enumerate() - .fold("[".to_owned(), |str, (index, c)| { + .fold("[".to_owned(), |st, (index, c)| { let ending = if index == self.alphabet.len() - 1 { "]" } else { ", " }; - format!("{str}{c}{ending}") + format!("{}{}{}", st, c, ending) }); let states_str: String = states_vec .iter() .enumerate() - .fold("[".to_owned(), |str, (index, n)| { + .fold("[".to_owned(), |st, (index, n)| { let ending = if index == self.states.len() - 1 { "]" } else { ", " }; - format!("{str}{n}{ending}") + format!("{}{}{}", st, n, ending) }); let initial_state_str: String = self.initial_state.to_string(); @@ -149,13 +153,13 @@ impl Display for DFA { final_states_vec .iter() .enumerate() - .fold("[".to_owned(), |str, (index, n)| { + .fold("[".to_owned(), |st, (index, n)| { let ending = if index == self.final_states.len() - 1 { "]" } else { ", " }; - format!("{str}{n}{ending}") + format!("{}{}{}", st, n, ending) }); let index_map: HashMap<char, usize> = alphabet_vec @@ -164,10 +168,11 @@ impl Display for DFA { .map(|(index, char)| (char.to_owned(), index)) .collect(); - let transition_table_hdr: String = alphabet_vec + let mut transition_table_hdr: String = alphabet_vec .iter() - .map(|c| format!("\t{c}\t|")) - .fold("\t|".to_owned(), |str, substr| format!("{str}{substr}")); + .map(|c| format!("\t{}\t|", c)) + .fold("\t|".to_owned(), |st, substr| format!("{}{}", st, substr)); + transition_table_hdr.push_str("\tLoose"); let mut transition_map: HashMap<u64, Vec<u64>> = HashMap::new(); @@ -184,14 +189,22 @@ impl Display for DFA { let transition_table_str: String = transition_map .iter() - .fold(transition_table_hdr, |str, (input, states)| { + .fold(transition_table_hdr, |st, (input, states)| { let dests = states .into_iter() - .fold("".to_owned(), |str, dst| format!("{str}{dst}\t|\t")); - format!("{str}\n{input}\t|\t{dests}") + .fold("".to_owned(), |st, dst| format!("{}{}\t|\t", st, dst)); + let wildcard = self + .loose_transitions + .get(input) + .map_or("∅".to_owned(), |&dest| dest.to_string()); + format!("{}\n{}\t|\t{}{}", st, input, dests, wildcard) }); - writeln!(f, "Alphabet: {alphabet_str}\nStates: {states_str}\nInitial State: {initial_state_str}\nFinal States: {final_states_str}\nTransition Table:\n{transition_table_str}") + writeln!(f, "Alphabet: {}", alphabet_str)?; + writeln!(f, "States: {}", states_str)?; + writeln!(f, "Initial State: {}", initial_state_str)?; + writeln!(f, "Final States: {}", final_states_str)?; + writeln!(f, "Transition Table:\n{}", transition_table_str) } } @@ -292,6 +305,7 @@ impl Encodable<Value> for DFA { final_states, transition_table, initial_state, + loose_transitions: HashMap::new(), }) } } diff --git a/src/lib/enfa.rs b/src/lib/enfa.rs @@ -1,10 +1,11 @@ +use crate::{dfa::DFA, nfa::NFA, Automaton, Encodable, FiniteStateAutomaton}; +use core::fmt::Display; +use serde_json::{from_str, Value}; use std::{ collections::{HashMap, HashSet}, - panic::Location, error::Error, + error::Error, + panic::Location, }; -use crate::{Automaton, dfa::DFA, Encodable, FiniteStateAutomaton, nfa::NFA}; -use core::fmt::Display; -use serde_json::{from_str, Value}; #[derive(Debug)] struct ENFAParseError { @@ -53,10 +54,10 @@ pub struct ENFA { initial_state: u64, final_states: HashSet<u64>, transition_table: HashMap<(u64, Option<char>), Vec<u64>>, + loose_transitions: HashMap<u64, u64>, } impl ENFA { - // FIXME: BUG: Circular epsilon closures kill this fn eclose(&self, state: u64, visited: Option<&mut HashSet<u64>>) -> HashSet<u64> { let mut reachable: HashSet<u64> = HashSet::new(); let mut deez = HashSet::new(); @@ -73,24 +74,57 @@ impl ENFA { } reachable } + + pub fn new( + alphabet: HashSet<char>, + states: HashSet<u64>, + initial_state: u64, + final_states: HashSet<u64>, + transition_table: HashMap<(u64, Option<char>), Vec<u64>>, + loose_transitions: HashMap<u64, u64>, + ) -> ENFA { + ENFA { + alphabet, + states, + initial_state, + final_states, + transition_table, + loose_transitions, + } + } } impl Automaton for ENFA { // TODO: Check that this is all g fn accepts(&self, input: &str) -> bool { - let mut current_states: HashSet<u64> = self.eclose(self.initial_state, None).into_iter().collect(); + let mut current_states: HashSet<u64> = + self.eclose(self.initial_state, None).into_iter().collect(); let mut new_states: HashSet<u64> = HashSet::new(); for c in input.chars() { - if !self.alphabet.contains(&c) { + if !self.alphabet.contains(&c) + && !current_states + .iter() + .any(|s| self.loose_transitions.contains_key(s)) + { ENFAParseError::print_warn(&format!( "Input character {} is not in the alphabet of the DFA", c )); - return false; } for state in current_states.iter() { - for newstate in self.transition_table.get(&(*state, Some(c))).unwrap_or(&vec![]).iter().flat_map(|s| self.eclose(*s, None)) { + for newstate in self + .transition_table + .get(&(*state, Some(c))) + .map_or( + self.loose_transitions + .get(state) + .map_or(vec![], |&s| vec![s]), + |v| v.to_owned(), + ) + .iter() + .flat_map(|s| self.eclose(*s, None)) + { new_states.insert(newstate); } } @@ -110,7 +144,11 @@ impl FiniteStateAutomaton for ENFA { // TODO: Check that there are no other bugs fn as_dfa(&self) -> Result<DFA, Box<dyn std::error::Error>> { let mut nfa_transition_table = HashMap::new(); - for (from, input) in self.transition_table.keys().filter_map(|(f, i)| i.and_then(|c| Some((*f, c)))) { + for (from, input) in self + .transition_table + .keys() + .filter_map(|(f, i)| i.and_then(|c| Some((*f, c)))) + { let new_to: Vec<u64> = self .eclose(from, None) .iter() @@ -120,14 +158,30 @@ impl FiniteStateAutomaton for ENFA { .collect(); nfa_transition_table.insert((from, input), new_to); } - let final_states = self.states.iter().filter(|v| self.eclose(**v, None).iter().any(|s| self.final_states.contains(s))).map(u64::to_owned).collect(); - let nfa = NFA::new(self.alphabet.clone(), self.states.clone(), self.initial_state, final_states, nfa_transition_table); + let final_states = self + .states + .iter() + .filter(|v| { + self.eclose(**v, None) + .iter() + .any(|s| self.final_states.contains(s)) + }) + .map(u64::to_owned) + .collect(); + let nfa = NFA::new( + self.alphabet.clone(), + self.states.clone(), + self.initial_state, + final_states, + nfa_transition_table, + self.loose_transitions.clone(), + ); nfa.as_dfa() } - + fn from_dfa(dfa: crate::dfa::DFA) -> Result<Self, Box<dyn std::error::Error>> where - Self: Sized + Self: Sized, { let transition_table = dfa .transition_table @@ -140,6 +194,7 @@ impl FiniteStateAutomaton for ENFA { initial_state: dfa.initial_state, final_states: dfa.final_states, transition_table, + loose_transitions: dfa.loose_transitions, }) } } @@ -199,6 +254,7 @@ impl Display for ENFA { .map(|c| format!("\t{c}\t|")) .fold("\t|".to_owned(), |str, substr| format!("{str}{substr}")); transition_table_hdr.push_str("\tε\t|"); + transition_table_hdr.push_str("\tWildcard\t|"); let mut transition_map: HashMap<u64, Vec<Vec<u64>>> = HashMap::new(); @@ -225,24 +281,30 @@ impl Display for ENFA { transition_map.insert(from.to_owned(), new_vec); } - let transition_table_str = transition_map - .iter() - .fold(transition_table_hdr, |acc, (input, state_sets)|{ - let dests = state_sets.into_iter().fold("".to_owned(), |acc, dsts| { - let dest_sets_str: String = if dsts.len() == 0 { + let transition_table_str = + transition_map + .iter() + .fold(transition_table_hdr, |acc, (input, state_sets)| { + let dests = state_sets.into_iter().fold("".to_owned(), |acc, dsts| { + let dest_sets_str: String = if dsts.len() == 0 { "∅".to_owned() - } else { + } else { dsts.iter() .enumerate() .fold("{".to_owned(), |acc, (index, dst)| { let end = if index == dsts.len() - 1 { "}" } else { ", " }; format!("{}{}{}", acc, dst, end) }) - }; - format!("{}{}\t|\t", acc, dest_sets_str) + }; + format!("{}{}\t|\t", acc, dest_sets_str) + }); + // TODO: Check that this is correct + let wildcard_map: String = self + .loose_transitions + .get(input) + .map_or("∅".to_owned(), |&dest| dest.to_string()); + format!("{}\n{}\t|\t{}\t|\t{}", acc, input, dests, wildcard_map) }); - format!("{}\n{}\t|\t{}", acc, input, dests) - }); writeln!(f, "Alphabet: {}", alphabet_str)?; writeln!(f, "States: {}", states_str)?; @@ -330,7 +392,13 @@ impl Encodable<Value> for ENFA { let dest_sets: Vec<(char, Vec<u64>)> = alphabet_vec .iter() - .zip(transitions.iter().skip(1).take(alphabet_size).map(Value::as_array)) + .zip( + transitions + .iter() + .skip(1) + .take(alphabet_size) + .map(Value::as_array), + ) .map(|(input, states)| { states.and_then(|vs| { vs.iter() @@ -341,19 +409,18 @@ impl Encodable<Value> for ENFA { }) .collect::<Option<_>>() .ok_or(ENFAParseError::error( - "Unable to parse the transition destinations in the table" + "Unable to parse the transition destinations in the table", ))?; - let epsilon_transitions: Vec<Vec<u64>> = transitions.iter().skip(alphabet_size + 1).map(Value::as_array) + let epsilon_transitions: Vec<Vec<u64>> = transitions + .iter() + .skip(alphabet_size + 1) + .map(Value::as_array) .map(|states| { - states.and_then(|vs| { - vs.iter() - .map(Value::as_u64) - .collect::<Option<_>>() - }) + states.and_then(|vs| vs.iter().map(Value::as_u64).collect::<Option<_>>()) }) .collect::<Option<_>>() .ok_or(ENFAParseError::error( - "Unable to parse the ε-closure destinations in the table" + "Unable to parse the ε-closure destinations in the table", ))?; for (input, dests) in dest_sets.iter() { transition_table.insert((from, Some(*input)), dests.to_owned()); @@ -362,14 +429,15 @@ impl Encodable<Value> for ENFA { transition_table.insert((from, None), transition.to_owned()); } } - + //panic!("Not yet implemented!"); Ok(ENFA { alphabet, states, initial_state, final_states, - transition_table + transition_table, + loose_transitions: HashMap::new(), }) } } diff --git a/src/lib/graph_enfa.rs b/src/lib/graph_enfa.rs @@ -0,0 +1,378 @@ +use crate::{enfa::ENFA, Automaton, FiniteStateAutomaton}; +use petgraph::{dot::Dot, graph::NodeIndex, visit::EdgeRef, Directed, Graph}; +use std::{ + collections::{HashMap, HashSet}, + error::Error, + fmt::Display, + panic::Location, +}; + +#[derive(Debug)] +pub struct GraphENFAError { + reason: String, +} + +impl Display for GraphENFAError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.reason) + } +} + +impl GraphENFAError { + #[track_caller] + fn error_str(reason: &str, loc: Option<&Location>) -> String { + let location = loc.unwrap_or(Location::caller()); + format!("[ENFA::{} ERROR] {}", location, reason) + } + #[track_caller] + fn error(reason: &str) -> Self { + let location = Location::caller(); + GraphENFAError { + reason: GraphENFAError::error_str(reason, Some(location)), + } + } +} + +impl Error for GraphENFAError {} + +/// Intended as an easy way of constructing an ε-NFA in bits, adding onto it much like you'd build an ε-NFA on a whiteboard. +pub struct GraphENFA { + alphabet: HashSet<char>, + states: HashSet<u64>, + initial_state: u64, + final_states: HashSet<u64>, + graph: Graph<u64, Option<char>, Directed>, + loose_transitions: HashMap<u64, u64>, + state_id_to_index_map: HashMap<u64, NodeIndex>, + current_state_id: u64, +} + +impl GraphENFA { + /// Create a new graph ε-NFA + pub fn new() -> (Self, u64) { + let mut states = HashSet::new(); + let mut state_id_to_index_map = HashMap::new(); + let mut graph = Graph::new(); + + let initial_state_id = 0; + + let initial_state_index = graph.add_node(initial_state_id); + + states.insert(initial_state_id); + state_id_to_index_map.insert(initial_state_id, initial_state_index); + + ( + GraphENFA { + alphabet: HashSet::new(), + states, + initial_state: initial_state_id, + final_states: HashSet::new(), + graph, + loose_transitions: HashMap::new(), + state_id_to_index_map, + current_state_id: initial_state_id, + }, + initial_state_id, + ) + } + + /// Set the acceptance of a state, will return an error if the state is already in the desired acceptance or the state doesn't exist in the ε-NFA. + pub fn set_state_acceptance( + &mut self, + state_id: u64, + accept: bool, + ) -> Result<(), GraphENFAError> { + if !self.states.contains(&state_id) { + return Err(GraphENFAError::error("There's no state with that ID!")); + } + + if accept && !self.final_states.contains(&state_id) && self.final_states.insert(state_id) // adding to final states + || !accept // Removing from final states + && self.final_states.contains(&state_id) + && self.final_states.remove(&state_id) + { + Ok(()) + } else { + Err(GraphENFAError::error( + "The state was already in the desired acceptance!", + )) + } + } + + /// Changes the initial state of the ε-NFA to the state with ID `state_id`, this will remove the other initial state. Returns `Ok` of the previous initial state, or an error describing the failure + pub fn set_initial_state(&mut self, state_id: u64) -> Result<u64, GraphENFAError> { + if !self.states.contains(&state_id) { + return Err(GraphENFAError::error( + "There is no state with the specified ID", + )); + } + + let prev_initial_state_id = self.initial_state; + self.initial_state = state_id; + + Ok(prev_initial_state_id) + } + + /// Inserts a new state with no transitions into the ε-NFA with the ID `state_id` + fn insert_state(&mut self, state_id: u64) -> Result<(), GraphENFAError> { + if self.states.contains(&state_id) { + return Err(GraphENFAError::error( + "There's already a state with that ID", + )); + } + + let node_index = self.graph.add_node(state_id); + if !self + .state_id_to_index_map + .insert(state_id, node_index) + .is_none() + { + return Err(GraphENFAError::error( + "There's already a state with that ID", + )); + } + + if !self.states.insert(state_id) { + return Err(GraphENFAError::error( + "There's already a state with that ID", + )); + } + Ok(()) + } + + /// Inserts a new state with no transitions into the ε-NFA with the ID of the max existing ID + 1. + pub fn new_state(&mut self) -> Result<u64, GraphENFAError> { + self.current_state_id += 1; + + self.insert_state(self.current_state_id)?; + + Ok(self.current_state_id) + } + + /// Add a transition from one state to another, returns an error if either of the states doesn't exist. + /// An epsilon transition is done using the `None` of the `Option<char>` enum. + pub fn insert_transition( + &mut self, + from_id: u64, + to_id: u64, + input: Option<char>, + ) -> Result<(), GraphENFAError> { + if !self.states.contains(&from_id) || !self.states.contains(&to_id) { + return Err(GraphENFAError::error( + "Tried adding a transition to/from a state that doesn't exist!", + )); + } + + match input { + Some(c) => { + if !self.alphabet.contains(&c) { + self.alphabet.insert(c); + } + } + _ => (), + } + + let from_index = self + .state_id_to_index_map + .get(&from_id) + .ok_or(GraphENFAError::error( + "Couldn't get the index of the from state!", + ))? + .to_owned(); + + let to_index = self + .state_id_to_index_map + .get(&to_id) + .ok_or(GraphENFAError::error( + "Couldn't get the index of the to state!", + ))? + .to_owned(); + + self.graph.add_edge(from_index, to_index, input); + + Ok(()) + } + + pub fn insert_loose_transition( + &mut self, + from_id: u64, + to_id: u64, + ) -> Result<(), GraphENFAError> { + if !self.states.contains(&from_id) || !self.states.contains(&to_id) { + return Err(GraphENFAError::error( + "Tried adding a loose transition to/from a state that doesn't exist!", + )); + } + if self.loose_transitions.contains_key(&from_id) { + return Err(GraphENFAError::error( + "There is already a loose transition from the specified from location!", + )); + } + self.loose_transitions.insert(from_id, to_id); + Ok(()) + } + + /// Convert into a regular ε-NFA backed by a `HashMap` instead of a `Graph` + /// Whilst this can be used, the `as_dfa` method should be used if you're going to evaluate string acceptance. + pub fn as_enfa(&self) -> Result<ENFA, Box<dyn Error>> { + let mut transition_table = HashMap::new(); + let mut current_states = vec![self.initial_state]; + let mut next_states = vec![]; + let mut visited = HashSet::new(); + + while current_states.len() != 0 { + for &current_state in current_states.iter() { + let current_state_index = self + .state_id_to_index_map + .get(&current_state) + .ok_or(GraphENFAError::error( + "Couldn't get graph index of a state!", + ))? + .to_owned(); + for edge in self.graph.edges(current_state_index) { + let mut transitions = transition_table + .get(&(current_state, *edge.weight())) + .map_or(vec![], |v: &Vec<u64>| v.to_owned()); + let dest = self + .graph + .node_weight(edge.target()) + .ok_or(GraphENFAError::error( + "Error converting from graph to regular NFA!", + ))? + .to_owned(); + transitions.push(dest); + if !visited.contains(&dest) { + next_states.push(dest); + } + transition_table.insert((current_state, *edge.weight()), transitions); + } + visited.insert(current_state); + } + current_states = next_states; + next_states = vec![]; + } + + Ok(ENFA::new( + self.alphabet.clone(), + self.states.clone(), + self.initial_state, + self.final_states.clone(), + transition_table, + self.loose_transitions.clone(), + )) + } +} + +impl Automaton for GraphENFA { + // This should never be used + fn accepts(&self, input: &str) -> bool { + let dfa = self.as_dfa().expect("Woops!"); + dfa.accepts(input) + } +} + +impl FiniteStateAutomaton for GraphENFA { + /// Converts the `Graph` ε-NFA into a regular ε-NFA backed by a `HashMap`, then converts that to a DFA + fn as_dfa(&self) -> std::result::Result<crate::dfa::DFA, Box<dyn Error>> { + let enfa = self.as_enfa()?; + enfa.as_dfa() + } + + /// This should never be used... Unless you really badly want to modify an existing DFA. + fn from_dfa(dfa: crate::dfa::DFA) -> std::result::Result<Self, Box<dyn Error>> + where + Self: Sized, + { + let mut state_id_to_index_map: HashMap<u64, NodeIndex> = HashMap::new(); + let mut graph = Graph::new(); + + for ((from, input), to) in dfa.transition_table { + let from_index = state_id_to_index_map + .get(&from) + .map_or_else(|| graph.add_node(from), |n| n.to_owned()); + state_id_to_index_map.insert(from, from_index); + let to_index = state_id_to_index_map + .get(&to) + .map_or_else(|| graph.add_node(to), |n| n.to_owned()); + state_id_to_index_map.insert(to, to_index); + graph.add_edge(from_index, to_index, Some(input)); + } + + let current_state_id = *dfa + .states + .iter() + .max() + .ok_or(GraphENFAError::error("Unable to get the current state ID"))?; + + Ok(GraphENFA { + alphabet: dfa.alphabet, + states: dfa.states, + initial_state: dfa.initial_state, + final_states: dfa.final_states, + graph, + loose_transitions: dfa.loose_transitions, + state_id_to_index_map, + current_state_id, + }) + } +} + +impl Display for GraphENFA { + /// This should only be used for debugging, the graph is not pretty printed, but the output can be used in graphviz to evaluate the ε-NFA. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut alphabet_vec: Vec<char> = self.alphabet.iter().copied().collect(); + let mut states_vec: Vec<u64> = self.states.iter().copied().collect(); + let mut final_states_vec: Vec<u64> = self.final_states.iter().copied().collect(); + + alphabet_vec.sort(); + states_vec.sort(); + final_states_vec.sort(); + + let alphabet_str: String = + alphabet_vec + .iter() + .enumerate() + .fold("[".to_owned(), |st, (index, c)| { + let ending = if index == self.alphabet.len() - 1 { + "]" + } else { + ", " + }; + format!("{}{}{}", st, c, ending) + }); + + let states_str: String = + states_vec + .iter() + .enumerate() + .fold("[".to_owned(), |st, (index, n)| { + let ending = if index == self.states.len() - 1 { + "]" + } else { + ", " + }; + format!("{}{}{}", st, n, ending) + }); + + let initial_state_str: String = self.initial_state.to_string(); + + let final_states_str: String = + final_states_vec + .iter() + .enumerate() + .fold("[".to_owned(), |st, (index, n)| { + let ending = if index == self.final_states.len() - 1 { + "]" + } else { + ", " + }; + format!("{}{}{}", st, n, ending) + }); + + writeln!(f, "Alphabet: {}", alphabet_str)?; + writeln!(f, "States: {}", states_str)?; + writeln!(f, "Initial State: {}", initial_state_str)?; + writeln!(f, "Final States: {}", final_states_str)?; + writeln!(f, "{:?}", Dot::new(&self.graph)) + } +} diff --git a/src/lib/nfa.rs b/src/lib/nfa.rs @@ -55,6 +55,7 @@ pub struct NFA { initial_state: u64, final_states: HashSet<u64>, transition_table: HashMap<(u64, char), Vec<u64>>, + loose_transitions: HashMap<u64, u64>, } impl NFA { @@ -64,13 +65,15 @@ impl NFA { initial_state: u64, final_states: HashSet<u64>, transition_table: HashMap<(u64, char), Vec<u64>>, + loose_transitions: HashMap<u64, u64>, ) -> Self { NFA { alphabet, states, initial_state, final_states, - transition_table + transition_table, + loose_transitions, } } } @@ -80,7 +83,11 @@ impl Automaton for NFA { let mut current_states: Vec<u64> = vec![self.initial_state]; let mut new_states: Vec<u64> = Vec::new(); for c in input.chars() { - if !self.alphabet.contains(&c) { + if !self.alphabet.contains(&c) + && !current_states + .iter() + .any(|s| self.loose_transitions.contains_key(s)) + { NFAParseError::print_warn(&format!( "Input character {} is not in the alphabet of the DFA", c @@ -88,10 +95,12 @@ impl Automaton for NFA { return false; } for i in 0..current_states.len() { - match current_states - .get(i) - .map_or(None, |s| self.transition_table.get(&(*s, c))) - { + match current_states.get(i).map_or(None, |s| { + self.transition_table.get(&(*s, c)).map_or( + self.loose_transitions.get(s).and_then(|s| Some(vec![*s])), + |v| Some(v.to_owned()), + ) + }) { Some(states) => { new_states.append(&mut states.clone()); } @@ -179,6 +188,7 @@ impl FiniteStateAutomaton for NFA { initial_state: self.initial_state, final_states, transition_table, + loose_transitions: self.loose_transitions.clone(), }) } @@ -197,6 +207,7 @@ impl FiniteStateAutomaton for NFA { initial_state: dfa.initial_state, final_states: dfa.final_states, transition_table, + loose_transitions: dfa.loose_transitions, }) } } @@ -251,10 +262,11 @@ impl Display for NFA { format!("{str}{c}{ending}") }); - let transition_table_hdr: String = alphabet_vec + let mut transition_table_hdr: String = alphabet_vec .iter() .map(|c| format!("\t{c}\t|")) .fold("\t|".to_owned(), |str, substr| format!("{str}{substr}")); + transition_table_hdr.push_str("\tWildcard\t|"); let mut transition_map: HashMap<u64, Vec<Vec<u64>>> = HashMap::new(); @@ -277,7 +289,7 @@ impl Display for NFA { let transition_table_str: String = transition_map .iter() - .fold(transition_table_hdr, |str, (input, state_sets)| { + .fold(transition_table_hdr, |st, (input, state_sets)| { let dests = state_sets.into_iter().fold("".to_owned(), |acc, dsts| { let dest_sets_str: String = if dsts.len() == 0 { "∅".to_owned() @@ -291,10 +303,18 @@ impl Display for NFA { }; format!("{acc}{}\t|\t", dest_sets_str) }); - format!("{str}\n{input}\t|\t{dests}") + let wildcard = self + .loose_transitions + .get(input) + .map_or("∅".to_owned(), |&dest| dest.to_string()); + format!("{}\n{}\t|\t{}\t|\t{}", st, input, dests, wildcard) }); - writeln!(f, "Alphabet: {alphabet_str}\nStates: {states_str}\nInitial State: {initial_state_str}\nFinal States: {final_states_str}\nTransition Table:\n{transition_table_str}") + writeln!(f, "Alphabet: {}", alphabet_str)?; + writeln!(f, "States: {}", states_str)?; + writeln!(f, "Initial State: {}", initial_state_str)?; + writeln!(f, "Final States: {}", final_states_str)?; + writeln!(f, "Transition Table:\n{}", transition_table_str) } } @@ -391,13 +411,13 @@ impl Encodable<Value> for NFA { } } - //assert!(false, "FUNCTION NOT YET IMPLEMENTED!"); Ok(NFA { alphabet, states, initial_state, final_states, transition_table, + loose_transitions: HashMap::new(), }) } } diff --git a/src/lib/regex.rs b/src/lib/regex.rs @@ -0,0 +1,370 @@ +use crate::{ + dfa::DFA, + graph_enfa::{GraphENFA, GraphENFAError}, + Automaton, Encodable, FiniteStateAutomaton, +}; +use either::Either; +use std::{error::Error, fmt::Display}; + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum RegexToken { + KleeneStar, // * + Wildcard, // . + Bar, // | + LeftGroup, // ( + RightGroup, // ) + // The following are just syntactic sugar that I'll use + Optional, // ? + KleenePlus, // + +} + +#[derive(Debug)] +pub enum RegexError { + UnrecognisedEscape(String), + ParseError(String), +} + +impl Display for RegexError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self { + Self::UnrecognisedEscape(s) => write!(f, "Unrecognised escape sequence \"{}\"", s), + Self::ParseError(s) => write!(f, "PARSE ERROR! {}", s), + } + } +} + +impl Error for RegexError {} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum RegexTerminal { + Character(char), + Wildcard, + Epsilon, +} + +#[derive(Clone, PartialEq, Eq, Debug)] +pub enum RegexNonTerminal { + Terminal(RegexTerminal), + Group(Vec<RegexNonTerminal>), + KleeneGroup(Vec<RegexNonTerminal>), + Union(Vec<RegexNonTerminal>), +} + +pub struct Regex { + pub parsed: RegexNonTerminal, + pub automaton: DFA, +} + +impl Display for Regex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "") + } +} + +impl Automaton for Regex { + fn accepts(&self, input: &str) -> bool { + self.automaton.accepts(input) + } +} + +impl<T: FiniteStateAutomaton> From<Regex> for Result<T, Box<dyn Error>> { + fn from(regex: Regex) -> Result<T, Box<dyn Error>> { + T::convert(Box::new(regex.automaton)) + } +} + +impl Regex { + pub fn tokenise(regex_str: &str) -> Result<Vec<Either<char, RegexToken>>, RegexError> { + let mut escape_next = false; + regex_str + .chars() + .into_iter() + .collect::<Vec<char>>() + .iter() + .filter_map(|c| match (escape_next, c) { + (true, c) => { + escape_next = false; + match *c { + '*' => Some(Ok(Either::Left('*'))), + '.' => Some(Ok(Either::Left('.'))), + '|' => Some(Ok(Either::Left('|'))), + '(' => Some(Ok(Either::Left('('))), + ')' => Some(Ok(Either::Left(')'))), + '?' => Some(Ok(Either::Left('?'))), + '+' => Some(Ok(Either::Left('+'))), + '\\' => Some(Ok(Either::Left('\\'))), + _ => Some(Err(RegexError::UnrecognisedEscape(format!("\\{}", c)))), + } + } + (false, c) => match *c { + '*' => Some(Ok(Either::Right(RegexToken::KleeneStar))), + '.' => Some(Ok(Either::Right(RegexToken::Wildcard))), + '|' => Some(Ok(Either::Right(RegexToken::Bar))), + '(' => Some(Ok(Either::Right(RegexToken::LeftGroup))), + ')' => Some(Ok(Either::Right(RegexToken::RightGroup))), + '?' => Some(Ok(Either::Right(RegexToken::Optional))), + '+' => Some(Ok(Either::Right(RegexToken::KleenePlus))), + '\\' => { + escape_next = true; + None + } + c => Some(Ok(Either::Left(c))), + }, + }) + .collect() + } + + /// This function is written like a mess! + fn make_grammar_tree( + regex_tokens: Vec<Either<char, RegexToken>>, + ) -> Result<RegexNonTerminal, RegexError> { + let mut groups: Vec<Vec<RegexNonTerminal>> = vec![]; + groups.push(vec![]); + + let mut in_an_or_group = false; + let mut or_group: Vec<RegexNonTerminal> = vec![]; + + for token in regex_tokens.into_iter() { + match token { + Either::Left(c) => groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(RegexNonTerminal::Terminal(RegexTerminal::Character(c))), + Either::Right(RegexToken::Wildcard) => groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(RegexNonTerminal::Terminal(RegexTerminal::Wildcard)), + + Either::Right(RegexToken::LeftGroup) => { + groups.push(vec![]); + } + Either::Right(RegexToken::RightGroup) => { + if !in_an_or_group { + let exited_group = groups.pop().ok_or(RegexError::ParseError( + "Unable to pop a group off".to_owned(), + ))?; + groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(RegexNonTerminal::Group(exited_group)); + } else { + let exited_group = groups.pop().ok_or(RegexError::ParseError( + "Unable to pop a group off".to_owned(), + ))?; + or_group.push(RegexNonTerminal::Group(exited_group)); + groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(RegexNonTerminal::Union(or_group)); + in_an_or_group = false; + or_group = vec![]; + } + } + + Either::Right(RegexToken::Bar) => { + in_an_or_group = true; + let exited_group = groups.pop().ok_or(RegexError::ParseError( + "Unable to pop a group off".to_owned(), + ))?; + or_group.push(RegexNonTerminal::Group(exited_group)); + groups.push(vec![]); + } + + // Sugar! + Either::Right(RegexToken::Optional) => { + let last_terminal = groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .pop() + .ok_or(RegexError::ParseError( + "Unable to get last terminal".to_owned(), + ))?; + groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(RegexNonTerminal::Union(vec![ + last_terminal, + RegexNonTerminal::Terminal(RegexTerminal::Epsilon), + ])); + } + + Either::Right(RegexToken::KleeneStar) => { + let last_terminal = groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .pop() + .ok_or(RegexError::ParseError( + "Unable to get last terminal".to_owned(), + ))?; + groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(RegexNonTerminal::KleeneGroup(vec![last_terminal])); + } + // Sugar! + Either::Right(RegexToken::KleenePlus) => { + let last_terminal = groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .pop() + .ok_or(RegexError::ParseError( + "Unable to get last terminal".to_owned(), + ))?; + groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(last_terminal.clone()); + groups + .last_mut() + .ok_or(RegexError::ParseError( + "Unable to get last group".to_owned(), + ))? + .push(RegexNonTerminal::KleeneGroup(vec![last_terminal])); + } + } + } + + if groups.len() != 1 { + Err(RegexError::ParseError( + "Ended the grammar tree within a group!".to_owned(), + )) + } else { + Ok(RegexNonTerminal::Group(groups.pop().ok_or( + RegexError::ParseError("Ended the grammar tree without the main group!".to_owned()), + )?)) + } + } +} + +impl Encodable<&str> for Regex { + fn parse(regex_str: &str) -> Result<Self, Box<dyn Error>> { + Self::parse(Self::tokenise(regex_str)?) + } +} + +// Found a regular expression LL(1) grammar that I can hopefully handroll a parser for! +// https://blog.robertelder.org/regular-expression-parser-grammar/ +// ^^ Probably won't use that... + +// FIXME: I realise now that a DFA with an alphabet of all possible UTF-8 strings is not going to happen, we're better off making a new kind of DFA that has a looser alphabet requirement. + +impl Encodable<Vec<Either<char, RegexToken>>> for Regex { + fn parse(tokens: Vec<Either<char, RegexToken>>) -> Result<Self, Box<dyn Error>> { + let parse_tree = Regex::make_grammar_tree(tokens)?; + Self::parse(parse_tree) + } +} + +impl GraphENFA { + // TODO: Build up an ENFA using the parse tree + fn add_non_terminal( + &mut self, + nonterminal: RegexNonTerminal, + from: u64, + ) -> Result<u64, GraphENFAError> { + let current_state = from; + match nonterminal { + RegexNonTerminal::Terminal(t) => self.add_terminal(t, current_state), + RegexNonTerminal::Group(g) => self.add_group(g, current_state), + RegexNonTerminal::KleeneGroup(kg) => self.add_kleene_group(kg, current_state), + RegexNonTerminal::Union(un) => self.add_union(un, current_state), + } + } + + fn add_terminal(&mut self, terminal: RegexTerminal, from: u64) -> Result<u64, GraphENFAError> { + let new_state = self.new_state()?; + + match terminal { + RegexTerminal::Character(c) => self.insert_transition(from, new_state, Some(c)), + RegexTerminal::Wildcard => self.insert_loose_transition(from, new_state), + RegexTerminal::Epsilon => self.insert_transition(from, new_state, None), + }?; + + Ok(new_state) + } + + /// Adds a group of nonterminals concatenated together to the ENFA + fn add_group( + &mut self, + group: Vec<RegexNonTerminal>, + from: u64, + ) -> Result<u64, GraphENFAError> { + let mut current_state = from; + for nonterminal in group { + current_state = self.add_non_terminal(nonterminal, current_state)?; + } + Ok(current_state) + } + + /// Adds a group of nonterminals concatenated together within a Kleene star to the ENFA + /// + /// Just a warning, this will return the state on the end of the chain, but there will be states with higher IDs than the end of the chain due to the nature of how this is built. + fn add_kleene_group( + &mut self, + kleene_group: Vec<RegexNonTerminal>, + from: u64, + ) -> Result<u64, GraphENFAError> { + let start_state = self.add_terminal(RegexTerminal::Epsilon, from)?; + let end_state = self.new_state()?; + self.insert_transition(start_state, end_state, None)?; + self.insert_transition(end_state, start_state, None)?; + + let last_state_in_group = self.add_group(kleene_group, start_state)?; + + self.insert_transition(last_state_in_group, end_state, None)?; + + Ok(end_state) + } + + /// Adds a union of nonterminals that are each separately possible to the ENFA + fn add_union( + &mut self, + union: Vec<RegexNonTerminal>, + from: u64, + ) -> Result<u64, GraphENFAError> { + // TODO: Implement this + let start_state = self.add_terminal(RegexTerminal::Epsilon, from)?; + let end_state = self.new_state()?; + for nonterminal in union { + let entry_state = self.add_terminal(RegexTerminal::Epsilon, start_state)?; + let exit_state = self.add_non_terminal(nonterminal, entry_state)?; + self.insert_transition(exit_state, end_state, None)?; + } + Ok(end_state) + } +} + +impl Encodable<RegexNonTerminal> for Regex { + fn parse(parse_tree: RegexNonTerminal) -> Result<Self, Box<dyn Error>> { + let (mut enfa, current_state) = GraphENFA::new(); + let final_state = enfa.add_non_terminal(parse_tree.clone(), current_state)?; + enfa.set_state_acceptance(final_state, true)?; + + let dfa = enfa.as_dfa()?; + + Ok(Regex { + automaton: dfa, + parsed: parse_tree, + }) + } +} diff --git a/src/lib/tests/regex_tests.rs b/src/lib/tests/regex_tests.rs @@ -0,0 +1,31 @@ +use either::Either; + +use crate::{ + regex::{Regex, RegexToken}, + Encodable, +}; + +#[test] +fn tokeniser() { + assert_eq!( + Regex::tokenise("a*").unwrap(), + vec![Either::Left('a'), Either::Right(RegexToken::KleeneStar)] + ); + assert_eq!( + Regex::tokenise("\\*\\\\").unwrap(), + vec![Either::Left('*'), Either::Left('\\')] + ); + assert_eq!( + Regex::tokenise("\\*\\\\*").unwrap(), + vec![ + Either::Left('*'), + Either::Left('\\'), + Either::Right(RegexToken::KleeneStar) + ] + ); +} + +#[test] +fn parser() { + Regex::parse(".").expect("Fuck"); +}