automaton

An automaton library & basic programs written in Rust
git clone git://git.ethandl.dev/automaton
Log | Files | Refs | README

regex.rs (12735B)


      1 use crate::{
      2     dfa::DFA,
      3     graph_enfa::{GraphENFA, Transition},
      4     Automaton, AutomatonError, Encodable, FiniteStateAutomaton,
      5 };
      6 use either::Either;
      7 use std::fmt::Display;
      8 
      9 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
     10 pub enum RegexToken {
     11     KleeneStar, // *
     12     Wildcard,   // .
     13     Bar,        // |
     14     LeftGroup,  // (
     15     RightGroup, // )
     16     // The following are just syntactic sugar that I'll use
     17     Optional,   // ?
     18     KleenePlus, // +
     19 }
     20 
     21 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
     22 pub enum RegexTerminal {
     23     Character(char),
     24     Wildcard,
     25     Epsilon,
     26 }
     27 
     28 #[derive(Clone, PartialEq, Eq, Debug)]
     29 pub enum RegexNonTerminal {
     30     Terminal(RegexTerminal),
     31     Group(Vec<RegexNonTerminal>),
     32     KleeneGroup(Vec<RegexNonTerminal>),
     33     Union(Vec<RegexNonTerminal>),
     34 }
     35 
     36 pub struct Regex {
     37     pub parsed: RegexNonTerminal,
     38     pub automaton: DFA,
     39 }
     40 
     41 impl Display for Regex {
     42     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
     43         write!(f, "")
     44     }
     45 }
     46 
     47 impl Automaton for Regex {
     48     fn accepts(&self, input: &str) -> bool {
     49         self.automaton.accepts(input)
     50     }
     51 }
     52 
     53 impl<T: FiniteStateAutomaton> From<Regex> for Result<T, AutomatonError> {
     54     fn from(regex: Regex) -> Result<T, AutomatonError> {
     55         T::convert(Box::new(regex.automaton))
     56     }
     57 }
     58 
     59 impl Regex {
     60     pub fn tokenise(regex_str: &str) -> Result<Vec<Either<char, RegexToken>>, AutomatonError> {
     61         let mut escape_next = false;
     62         regex_str
     63             .chars()
     64             .into_iter()
     65             .collect::<Vec<char>>()
     66             .iter()
     67             .filter_map(|c| match (escape_next, c) {
     68                 (true, c) => {
     69                     escape_next = false;
     70                     match *c {
     71                         '*' => Some(Ok(Either::Left('*'))),
     72                         '.' => Some(Ok(Either::Left('.'))),
     73                         '|' => Some(Ok(Either::Left('|'))),
     74                         '(' => Some(Ok(Either::Left('('))),
     75                         ')' => Some(Ok(Either::Left(')'))),
     76                         '?' => Some(Ok(Either::Left('?'))),
     77                         '+' => Some(Ok(Either::Left('+'))),
     78                         '\\' => Some(Ok(Either::Left('\\'))),
     79                         _ => Some(Err(AutomatonError::tokenise_error(&format!(
     80                             "Unrecognised escape sequence: \\{}",
     81                             c
     82                         )))),
     83                     }
     84                 }
     85                 (false, c) => match *c {
     86                     '*' => Some(Ok(Either::Right(RegexToken::KleeneStar))),
     87                     '.' => Some(Ok(Either::Right(RegexToken::Wildcard))),
     88                     '|' => Some(Ok(Either::Right(RegexToken::Bar))),
     89                     '(' => Some(Ok(Either::Right(RegexToken::LeftGroup))),
     90                     ')' => Some(Ok(Either::Right(RegexToken::RightGroup))),
     91                     '?' => Some(Ok(Either::Right(RegexToken::Optional))),
     92                     '+' => Some(Ok(Either::Right(RegexToken::KleenePlus))),
     93                     '\\' => {
     94                         escape_next = true;
     95                         None
     96                     }
     97                     c => Some(Ok(Either::Left(c))),
     98                 },
     99             })
    100             .collect()
    101     }
    102 
    103     /// This function is written like a mess!
    104     fn make_grammar_tree(
    105         regex_tokens: Vec<Either<char, RegexToken>>,
    106     ) -> Result<RegexNonTerminal, AutomatonError> {
    107         let mut groups: Vec<Vec<RegexNonTerminal>> = vec![];
    108         groups.push(vec![]);
    109 
    110         let mut in_a_union = false;
    111         let mut or_group: Vec<RegexNonTerminal> = vec![];
    112 
    113         for token in regex_tokens.into_iter() {
    114             match token {
    115                 Either::Left(c) => groups
    116                     .last_mut()
    117                     .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    118                     .push(RegexNonTerminal::Terminal(RegexTerminal::Character(c))),
    119                 Either::Right(RegexToken::Wildcard) => groups
    120                     .last_mut()
    121                     .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    122                     .push(RegexNonTerminal::Terminal(RegexTerminal::Wildcard)),
    123 
    124                 Either::Right(RegexToken::LeftGroup) => {
    125                     groups.push(vec![]);
    126                 }
    127                 Either::Right(RegexToken::RightGroup) => {
    128                     if !in_a_union {
    129                         let exited_group = groups
    130                             .pop()
    131                             .ok_or(AutomatonError::parse_error("Unable to pop a group off"))?;
    132                         groups
    133                             .last_mut()
    134                             .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    135                             .push(RegexNonTerminal::Group(exited_group));
    136                     } else {
    137                         let exited_group = groups
    138                             .pop()
    139                             .ok_or(AutomatonError::parse_error("Unable to pop a group off"))?;
    140                         or_group.push(RegexNonTerminal::Group(exited_group));
    141                         groups
    142                             .last_mut()
    143                             .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    144                             .push(RegexNonTerminal::Union(or_group));
    145                         in_a_union = false;
    146                         or_group = vec![];
    147                     }
    148                 }
    149 
    150                 Either::Right(RegexToken::Bar) => {
    151                     in_a_union = true;
    152                     let exited_group = groups
    153                         .pop()
    154                         .ok_or(AutomatonError::parse_error("Unable to pop a group off"))?;
    155                     or_group.push(RegexNonTerminal::Group(exited_group));
    156                     groups.push(vec![]);
    157                 }
    158 
    159                 // Sugar!
    160                 Either::Right(RegexToken::Optional) => {
    161                     let last_terminal = groups
    162                         .last_mut()
    163                         .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    164                         .pop()
    165                         .ok_or(AutomatonError::parse_error("Unable to get last terminal"))?;
    166                     groups
    167                         .last_mut()
    168                         .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    169                         .push(RegexNonTerminal::Union(vec![
    170                             last_terminal,
    171                             RegexNonTerminal::Terminal(RegexTerminal::Epsilon),
    172                         ]));
    173                 }
    174 
    175                 Either::Right(RegexToken::KleeneStar) => {
    176                     let last_terminal = groups
    177                         .last_mut()
    178                         .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    179                         .pop()
    180                         .ok_or(AutomatonError::parse_error("Unable to get last terminal"))?;
    181                     groups
    182                         .last_mut()
    183                         .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    184                         .push(RegexNonTerminal::KleeneGroup(vec![last_terminal]));
    185                 }
    186                 // Sugar!
    187                 Either::Right(RegexToken::KleenePlus) => {
    188                     let last_terminal = groups
    189                         .last_mut()
    190                         .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    191                         .pop()
    192                         .ok_or(AutomatonError::parse_error("Unable to get last terminal"))?;
    193                     groups
    194                         .last_mut()
    195                         .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    196                         .push(last_terminal.clone());
    197                     groups
    198                         .last_mut()
    199                         .ok_or(AutomatonError::parse_error("Unable to get last group"))?
    200                         .push(RegexNonTerminal::KleeneGroup(vec![last_terminal]));
    201                 }
    202             }
    203         }
    204 
    205         if groups.len() != 1 {
    206             return Err(AutomatonError::parse_error(
    207                 "Ended the grammar tree within a group!",
    208             ));
    209         }
    210         if in_a_union {
    211             // Catch the outermost union if it exists
    212             let last_elem = groups.pop().ok_or(AutomatonError::parse_error(
    213                 "Ended the grammar tree without the main group!",
    214             ))?;
    215             or_group.push(RegexNonTerminal::Group(last_elem));
    216             return Ok(RegexNonTerminal::Union(or_group));
    217         }
    218 
    219         Ok(RegexNonTerminal::Group(groups.pop().ok_or(
    220             AutomatonError::parse_error("Ended the grammar tree without the main group!"),
    221         )?))
    222     }
    223 }
    224 
    225 /// Tokenisation
    226 impl Encodable<&str> for Regex {
    227     fn parse(regex_str: &str) -> Result<Self, AutomatonError> {
    228         Self::parse(Self::tokenise(regex_str)?)
    229     }
    230 }
    231 
    232 /// Parsing to an AST
    233 type TokenVec = Vec<Either<char, RegexToken>>;
    234 impl Encodable<TokenVec> for Regex {
    235     fn parse(tokens: TokenVec) -> Result<Self, AutomatonError> {
    236         let parse_tree = Regex::make_grammar_tree(tokens)?;
    237         Self::parse(parse_tree)
    238     }
    239 }
    240 
    241 /// Various fns for converting from the AST to an ENFA graph
    242 impl GraphENFA {
    243     /// Adds a nonterminal onto the DFA directly after `from`
    244     pub fn add_non_terminal(
    245         &mut self,
    246         nonterminal: RegexNonTerminal,
    247         from: u64,
    248     ) -> Result<u64, AutomatonError> {
    249         let current_state = from;
    250         match nonterminal {
    251             RegexNonTerminal::Terminal(t) => self.add_terminal(t, current_state),
    252             RegexNonTerminal::Group(g) => self.add_group(g, current_state),
    253             RegexNonTerminal::KleeneGroup(kg) => self.add_kleene_group(kg, current_state),
    254             RegexNonTerminal::Union(un) => self.add_union(un, current_state),
    255         }
    256     }
    257 
    258     /// Concats a terminal onto the DFA directly after `from`
    259     fn add_terminal(&mut self, terminal: RegexTerminal, from: u64) -> Result<u64, AutomatonError> {
    260         let new_state = self.new_state()?;
    261 
    262         match terminal {
    263             RegexTerminal::Character(c) => {
    264                 self.insert_transition(from, new_state, Transition::Char(c))
    265             }
    266             RegexTerminal::Wildcard => {
    267                 self.insert_transition(from, new_state, Transition::Wildcard)
    268             }
    269             RegexTerminal::Epsilon => self.insert_transition(from, new_state, Transition::Epsilon),
    270         }?;
    271 
    272         Ok(new_state)
    273     }
    274 
    275     /// Adds a group of nonterminals concatenated together to the ENFA
    276     fn add_group(
    277         &mut self,
    278         group: Vec<RegexNonTerminal>,
    279         from: u64,
    280     ) -> Result<u64, AutomatonError> {
    281         let mut current_state = from;
    282         for nonterminal in group {
    283             current_state = self.add_non_terminal(nonterminal, current_state)?;
    284         }
    285         Ok(current_state)
    286     }
    287 
    288     /// Adds a group of nonterminals concatenated together within a Kleene star to the ENFA
    289     ///
    290     /// Just a warning, this will return the state on the end of the chain, but there will be states with higher IDs than the end of the chain due to the nature of how this is built.
    291     fn add_kleene_group(
    292         &mut self,
    293         kleene_group: Vec<RegexNonTerminal>,
    294         from: u64,
    295     ) -> Result<u64, AutomatonError> {
    296         let end_state = self.add_group(kleene_group, from)?;
    297         self.insert_transition(from, end_state, Transition::Epsilon)?;
    298         self.insert_transition(end_state, from, Transition::Epsilon)?;
    299 
    300         Ok(end_state)
    301     }
    302 
    303     /// Adds a union of nonterminals that are each separately possible to the ENFA
    304     fn add_union(
    305         &mut self,
    306         union: Vec<RegexNonTerminal>,
    307         from: u64,
    308     ) -> Result<u64, AutomatonError> {
    309         let end_state = self.new_state()?;
    310         for nonterminal in union {
    311             let entry_state = self.add_terminal(RegexTerminal::Epsilon, from)?;
    312             let exit_state = self.add_non_terminal(nonterminal, entry_state)?;
    313             self.insert_transition(exit_state, end_state, Transition::Epsilon)?;
    314         }
    315         Ok(end_state)
    316     }
    317 }
    318 
    319 /// Convert AST to a graph ENFA, then convert that ENFA to a DFA.
    320 impl Encodable<RegexNonTerminal> for Regex {
    321     fn parse(parse_tree: RegexNonTerminal) -> Result<Self, AutomatonError> {
    322         let (mut enfa, current_state) = GraphENFA::new();
    323         let final_state = enfa.add_non_terminal(parse_tree.clone(), current_state)?;
    324         enfa.set_state_acceptance(final_state, true)?;
    325 
    326         let dfa = enfa.as_dfa()?;
    327 
    328         Ok(Regex {
    329             automaton: dfa,
    330             parsed: parse_tree,
    331         })
    332     }
    333 }