regex.rs (12735B)
1 use crate::{ 2 dfa::DFA, 3 graph_enfa::{GraphENFA, Transition}, 4 Automaton, AutomatonError, Encodable, FiniteStateAutomaton, 5 }; 6 use either::Either; 7 use std::fmt::Display; 8 9 #[derive(Clone, Copy, PartialEq, Eq, Debug)] 10 pub enum RegexToken { 11 KleeneStar, // * 12 Wildcard, // . 13 Bar, // | 14 LeftGroup, // ( 15 RightGroup, // ) 16 // The following are just syntactic sugar that I'll use 17 Optional, // ? 18 KleenePlus, // + 19 } 20 21 #[derive(Clone, Copy, PartialEq, Eq, Debug)] 22 pub enum RegexTerminal { 23 Character(char), 24 Wildcard, 25 Epsilon, 26 } 27 28 #[derive(Clone, PartialEq, Eq, Debug)] 29 pub enum RegexNonTerminal { 30 Terminal(RegexTerminal), 31 Group(Vec<RegexNonTerminal>), 32 KleeneGroup(Vec<RegexNonTerminal>), 33 Union(Vec<RegexNonTerminal>), 34 } 35 36 pub struct Regex { 37 pub parsed: RegexNonTerminal, 38 pub automaton: DFA, 39 } 40 41 impl Display for Regex { 42 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 43 write!(f, "") 44 } 45 } 46 47 impl Automaton for Regex { 48 fn accepts(&self, input: &str) -> bool { 49 self.automaton.accepts(input) 50 } 51 } 52 53 impl<T: FiniteStateAutomaton> From<Regex> for Result<T, AutomatonError> { 54 fn from(regex: Regex) -> Result<T, AutomatonError> { 55 T::convert(Box::new(regex.automaton)) 56 } 57 } 58 59 impl Regex { 60 pub fn tokenise(regex_str: &str) -> Result<Vec<Either<char, RegexToken>>, AutomatonError> { 61 let mut escape_next = false; 62 regex_str 63 .chars() 64 .into_iter() 65 .collect::<Vec<char>>() 66 .iter() 67 .filter_map(|c| match (escape_next, c) { 68 (true, c) => { 69 escape_next = false; 70 match *c { 71 '*' => Some(Ok(Either::Left('*'))), 72 '.' => Some(Ok(Either::Left('.'))), 73 '|' => Some(Ok(Either::Left('|'))), 74 '(' => Some(Ok(Either::Left('('))), 75 ')' => Some(Ok(Either::Left(')'))), 76 '?' => Some(Ok(Either::Left('?'))), 77 '+' => Some(Ok(Either::Left('+'))), 78 '\\' => Some(Ok(Either::Left('\\'))), 79 _ => Some(Err(AutomatonError::tokenise_error(&format!( 80 "Unrecognised escape sequence: \\{}", 81 c 82 )))), 83 } 84 } 85 (false, c) => match *c { 86 '*' => Some(Ok(Either::Right(RegexToken::KleeneStar))), 87 '.' => Some(Ok(Either::Right(RegexToken::Wildcard))), 88 '|' => Some(Ok(Either::Right(RegexToken::Bar))), 89 '(' => Some(Ok(Either::Right(RegexToken::LeftGroup))), 90 ')' => Some(Ok(Either::Right(RegexToken::RightGroup))), 91 '?' => Some(Ok(Either::Right(RegexToken::Optional))), 92 '+' => Some(Ok(Either::Right(RegexToken::KleenePlus))), 93 '\\' => { 94 escape_next = true; 95 None 96 } 97 c => Some(Ok(Either::Left(c))), 98 }, 99 }) 100 .collect() 101 } 102 103 /// This function is written like a mess! 104 fn make_grammar_tree( 105 regex_tokens: Vec<Either<char, RegexToken>>, 106 ) -> Result<RegexNonTerminal, AutomatonError> { 107 let mut groups: Vec<Vec<RegexNonTerminal>> = vec![]; 108 groups.push(vec![]); 109 110 let mut in_a_union = false; 111 let mut or_group: Vec<RegexNonTerminal> = vec![]; 112 113 for token in regex_tokens.into_iter() { 114 match token { 115 Either::Left(c) => groups 116 .last_mut() 117 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 118 .push(RegexNonTerminal::Terminal(RegexTerminal::Character(c))), 119 Either::Right(RegexToken::Wildcard) => groups 120 .last_mut() 121 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 122 .push(RegexNonTerminal::Terminal(RegexTerminal::Wildcard)), 123 124 Either::Right(RegexToken::LeftGroup) => { 125 groups.push(vec![]); 126 } 127 Either::Right(RegexToken::RightGroup) => { 128 if !in_a_union { 129 let exited_group = groups 130 .pop() 131 .ok_or(AutomatonError::parse_error("Unable to pop a group off"))?; 132 groups 133 .last_mut() 134 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 135 .push(RegexNonTerminal::Group(exited_group)); 136 } else { 137 let exited_group = groups 138 .pop() 139 .ok_or(AutomatonError::parse_error("Unable to pop a group off"))?; 140 or_group.push(RegexNonTerminal::Group(exited_group)); 141 groups 142 .last_mut() 143 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 144 .push(RegexNonTerminal::Union(or_group)); 145 in_a_union = false; 146 or_group = vec![]; 147 } 148 } 149 150 Either::Right(RegexToken::Bar) => { 151 in_a_union = true; 152 let exited_group = groups 153 .pop() 154 .ok_or(AutomatonError::parse_error("Unable to pop a group off"))?; 155 or_group.push(RegexNonTerminal::Group(exited_group)); 156 groups.push(vec![]); 157 } 158 159 // Sugar! 160 Either::Right(RegexToken::Optional) => { 161 let last_terminal = groups 162 .last_mut() 163 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 164 .pop() 165 .ok_or(AutomatonError::parse_error("Unable to get last terminal"))?; 166 groups 167 .last_mut() 168 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 169 .push(RegexNonTerminal::Union(vec![ 170 last_terminal, 171 RegexNonTerminal::Terminal(RegexTerminal::Epsilon), 172 ])); 173 } 174 175 Either::Right(RegexToken::KleeneStar) => { 176 let last_terminal = groups 177 .last_mut() 178 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 179 .pop() 180 .ok_or(AutomatonError::parse_error("Unable to get last terminal"))?; 181 groups 182 .last_mut() 183 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 184 .push(RegexNonTerminal::KleeneGroup(vec![last_terminal])); 185 } 186 // Sugar! 187 Either::Right(RegexToken::KleenePlus) => { 188 let last_terminal = groups 189 .last_mut() 190 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 191 .pop() 192 .ok_or(AutomatonError::parse_error("Unable to get last terminal"))?; 193 groups 194 .last_mut() 195 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 196 .push(last_terminal.clone()); 197 groups 198 .last_mut() 199 .ok_or(AutomatonError::parse_error("Unable to get last group"))? 200 .push(RegexNonTerminal::KleeneGroup(vec![last_terminal])); 201 } 202 } 203 } 204 205 if groups.len() != 1 { 206 return Err(AutomatonError::parse_error( 207 "Ended the grammar tree within a group!", 208 )); 209 } 210 if in_a_union { 211 // Catch the outermost union if it exists 212 let last_elem = groups.pop().ok_or(AutomatonError::parse_error( 213 "Ended the grammar tree without the main group!", 214 ))?; 215 or_group.push(RegexNonTerminal::Group(last_elem)); 216 return Ok(RegexNonTerminal::Union(or_group)); 217 } 218 219 Ok(RegexNonTerminal::Group(groups.pop().ok_or( 220 AutomatonError::parse_error("Ended the grammar tree without the main group!"), 221 )?)) 222 } 223 } 224 225 /// Tokenisation 226 impl Encodable<&str> for Regex { 227 fn parse(regex_str: &str) -> Result<Self, AutomatonError> { 228 Self::parse(Self::tokenise(regex_str)?) 229 } 230 } 231 232 /// Parsing to an AST 233 type TokenVec = Vec<Either<char, RegexToken>>; 234 impl Encodable<TokenVec> for Regex { 235 fn parse(tokens: TokenVec) -> Result<Self, AutomatonError> { 236 let parse_tree = Regex::make_grammar_tree(tokens)?; 237 Self::parse(parse_tree) 238 } 239 } 240 241 /// Various fns for converting from the AST to an ENFA graph 242 impl GraphENFA { 243 /// Adds a nonterminal onto the DFA directly after `from` 244 pub fn add_non_terminal( 245 &mut self, 246 nonterminal: RegexNonTerminal, 247 from: u64, 248 ) -> Result<u64, AutomatonError> { 249 let current_state = from; 250 match nonterminal { 251 RegexNonTerminal::Terminal(t) => self.add_terminal(t, current_state), 252 RegexNonTerminal::Group(g) => self.add_group(g, current_state), 253 RegexNonTerminal::KleeneGroup(kg) => self.add_kleene_group(kg, current_state), 254 RegexNonTerminal::Union(un) => self.add_union(un, current_state), 255 } 256 } 257 258 /// Concats a terminal onto the DFA directly after `from` 259 fn add_terminal(&mut self, terminal: RegexTerminal, from: u64) -> Result<u64, AutomatonError> { 260 let new_state = self.new_state()?; 261 262 match terminal { 263 RegexTerminal::Character(c) => { 264 self.insert_transition(from, new_state, Transition::Char(c)) 265 } 266 RegexTerminal::Wildcard => { 267 self.insert_transition(from, new_state, Transition::Wildcard) 268 } 269 RegexTerminal::Epsilon => self.insert_transition(from, new_state, Transition::Epsilon), 270 }?; 271 272 Ok(new_state) 273 } 274 275 /// Adds a group of nonterminals concatenated together to the ENFA 276 fn add_group( 277 &mut self, 278 group: Vec<RegexNonTerminal>, 279 from: u64, 280 ) -> Result<u64, AutomatonError> { 281 let mut current_state = from; 282 for nonterminal in group { 283 current_state = self.add_non_terminal(nonterminal, current_state)?; 284 } 285 Ok(current_state) 286 } 287 288 /// Adds a group of nonterminals concatenated together within a Kleene star to the ENFA 289 /// 290 /// Just a warning, this will return the state on the end of the chain, but there will be states with higher IDs than the end of the chain due to the nature of how this is built. 291 fn add_kleene_group( 292 &mut self, 293 kleene_group: Vec<RegexNonTerminal>, 294 from: u64, 295 ) -> Result<u64, AutomatonError> { 296 let end_state = self.add_group(kleene_group, from)?; 297 self.insert_transition(from, end_state, Transition::Epsilon)?; 298 self.insert_transition(end_state, from, Transition::Epsilon)?; 299 300 Ok(end_state) 301 } 302 303 /// Adds a union of nonterminals that are each separately possible to the ENFA 304 fn add_union( 305 &mut self, 306 union: Vec<RegexNonTerminal>, 307 from: u64, 308 ) -> Result<u64, AutomatonError> { 309 let end_state = self.new_state()?; 310 for nonterminal in union { 311 let entry_state = self.add_terminal(RegexTerminal::Epsilon, from)?; 312 let exit_state = self.add_non_terminal(nonterminal, entry_state)?; 313 self.insert_transition(exit_state, end_state, Transition::Epsilon)?; 314 } 315 Ok(end_state) 316 } 317 } 318 319 /// Convert AST to a graph ENFA, then convert that ENFA to a DFA. 320 impl Encodable<RegexNonTerminal> for Regex { 321 fn parse(parse_tree: RegexNonTerminal) -> Result<Self, AutomatonError> { 322 let (mut enfa, current_state) = GraphENFA::new(); 323 let final_state = enfa.add_non_terminal(parse_tree.clone(), current_state)?; 324 enfa.set_state_acceptance(final_state, true)?; 325 326 let dfa = enfa.as_dfa()?; 327 328 Ok(Regex { 329 automaton: dfa, 330 parsed: parse_tree, 331 }) 332 } 333 }