Refactor the Flexer (#972)

This commit is contained in:
Ara Adkins 2020-07-08 14:20:00 +01:00 committed by GitHub
parent 6ba038c800
commit fd3e3df92c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 575 additions and 429 deletions

View File

@ -1,7 +1,9 @@
//! Exports API for construction of Nondeterminist and Deterministic Finite State Automata. //! Provides an API for the construction of finite state automata, in both their deterministic and
//! non-deterministic forms.
pub mod alphabet; pub mod alphabet;
pub mod dfa; pub mod dfa;
pub mod nfa; pub mod nfa;
pub mod pattern; pub mod pattern;
pub mod state; pub mod state;
pub mod symbol;

View File

@ -1,7 +1,8 @@
//! Exports an alphabet (set of all valid input symbols) for Finite State Automata (NFA and DFA). //! Exports an alphabet for an arbitrary finite state automaton.
use crate::automata::state::Symbol; use crate::automata::symbol::Symbol;
use crate::prelude::*;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::ops::RangeInclusive; use std::ops::RangeInclusive;
@ -11,44 +12,72 @@ use std::ops::RangeInclusive;
// === Alphabet === // === Alphabet ===
// ================ // ================
/// An alphabet describes a set of all the valid input symbols that a given finite state automata /// A representation of the distinct intervals over the input alphabet for a given finite state
/// (NFA or DFA) can operate over. /// automaton.
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton ///
/// The alphabet is meant to be represented as an interval. That is, if `a` and `b` are in alphabet, /// These intervals are defined by a set of _divisions_ of the input alphabet, where each division
/// then any symbol from `a..=b` is in alphabet too. /// is represented as a point in that alphabet. This is necessary to allow for efficient encoding of
/// state transitions that trigger not just on _one_, but potentially on _many_ of the input
/// symbols in the automaton's alphabet.
///
/// This is best explained by way of example. Consider the original unbounded alphabet:
///
/// ```text
/// ... a b c d e f g h ... z ...
/// ```
///
/// We want to add a rule that matches on the interval `[b, d]`. This results in there being three
/// intervals on the alphabet, as there are two divisions (annotated below):
///
/// ```text
///      ... a | b c d | e f g h ... z ...
/// div:       1       2
/// seg:   1       2             3
/// ```
///
/// If we then add a rule that matches on the interval `[d, f]`, we end up with five intervals on
/// the alphabet, with four divisions (annotated below):
///
/// ```text
///      ... a | b c | d | e f | g h ... z ...
/// div:       1     2   3     4
/// seg:   1      2    3    4          5
/// ```
///
/// This type tracks these divisions explicitly for an input alphabet defined for all automata in
/// this library as `0u32..=u32::max_value()`.
#[derive(Clone,Debug,PartialEq,Eq)] #[derive(Clone,Debug,PartialEq,Eq)]
pub struct Alphabet { #[allow(missing_docs)]
/// The interval of all valid input symbols. The interval is further divided into subintervals pub struct Segmentation {
/// (i.e. `[a,z,A,Z]` should be understood as `[a..=z,z..=A,A..=Z]`), in order to efficiently pub divisions: BTreeSet<Symbol>
/// encode state transitions that trigger not just on one but a whole range of symbols
/// (i.e. `a..=z`)
pub symbols: BTreeSet<Symbol>
} }
impl Default for Alphabet { impl Segmentation {
fn default() -> Self {
Alphabet {symbols:[Symbol{val:0}].iter().cloned().collect()}
}
}
impl Alphabet {
/// Inserts a range of symbols into the alphabet. /// Inserts a range of symbols into the alphabet.
pub fn insert(&mut self, range:RangeInclusive<Symbol>) { pub fn insert(&mut self, range:RangeInclusive<Symbol>) {
// The symbol range is associated with transition in automata. Therefore we: self.divisions.insert(Symbol::from(range.start()));
// Mark the symbol with the new transition. self.divisions.insert(Symbol{val:range.end().val + 1});
self.symbols.insert(Symbol{val:range.start().val});
// Mark the symbol without the new transition.
self.symbols.insert(Symbol{val:range.end().val + 1});
// This way each symbol in alphabet corresponds to a unique set of transitions.
} }
}
impl From<Vec<u32>> for Alphabet { /// Creates a [`Segmentation`] from an input set of divisions.
fn from(vec:Vec<u32>) -> Self { pub fn from_divisions(divisions:&[u32]) -> Self {
let mut dict = Self::default(); let mut dict = Self::default();
for val in vec { for val in divisions {
dict.symbols.insert(Symbol{val}); dict.divisions.insert(Symbol::from(*val));
} }
dict dict
} }
} }
// === Trait Impls ===
impl Default for Segmentation {
fn default() -> Self {
let mut divisions: BTreeSet<Symbol> = default();
// The existence of the default (0) member in the set is assumed by the implementation of
// the NFA -> DFA conversion.
divisions.insert(default());
Segmentation { divisions }
}
}

View File

@ -1,61 +1,59 @@
//! Exports the structure for Deterministic Finite Automata. //! The structure for defining deterministic finite automata.
use crate::automata::alphabet::Alphabet; use crate::automata::alphabet;
use crate::automata::state; use crate::automata::state;
use crate::data::matrix::Matrix; use crate::data::matrix::Matrix;
// ===================================== // =====================================
// === Deterministic Finite Automata === // === Deterministic Finite Automata ===
// ===================================== // =====================================
/// Function callback for an arbitrary state of finite automata. /// The definition of a [DFA](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) for a
/// It contains name of Rust procedure that is meant to be executed after encountering a pattern /// given set of symbols, states, and transitions.
/// (declared in `group::Rule.pattern`). ///
#[derive(Clone,Debug,PartialEq,Eq)] /// A DFA is a finite state automaton that accepts or rejects a given sequence of symbols by
pub struct Callback { /// executing on a sequence of states _uniquely_ determined by the sequence of input symbols.
/// TODO[jv] Write better explanation after implementing rust code generation. ///
/// Priority is used during rust code generation. /// ```text
pub priority: usize, /// ┌───┐ 'D' ┌───┐ 'F' ┌───┐ 'A' ┌───┐
/// Name of Rust method that will be called when executing this callback. /// │ 0 │──────▶│ 1 │──────▶│ 2 │──────▶│ 3 │
pub name: String, /// └───┘ └───┘ └───┘ └───┘
} /// ```
#[derive(Clone,Debug,Default,Eq,PartialEq)]
/// DFA automata with a set of symbols, states and transitions.
/// Deterministic Finite Automata is a finite-state machine that accepts or rejects a given sequence
/// of symbols, by running through a state sequence uniquely determined by the input symbol sequence.
/// ___ ___ ___ ___
/// | 0 | -- 'D' --> | 1 | -- 'F' --> | 2 | -- 'A' --> | 3 |
/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct DFA { pub struct DFA {
/// Finite set of all valid input symbols. /// A set of disjoint intervals over the allowable input alphabet.
pub alphabet: Alphabet, pub alphabet_segmentation: alphabet::Segmentation,
/// Transition matrix of deterministic finite state automata. /// The transition matrix for the DFA.
/// It contains next state for each pair of state and input symbol - (state,symbol) => new state. ///
/// For example, a transition matrix for automata that accepts string "ABABAB...." would look /// It represents a function of type `(state, symbol) -> state`, returning the identifier for
/// like this: /// the new state.
/// states ///
/// | | A | B | <- symbols /// For example, the transition matrix for an automaton that accepts the language
/// | 0 | 1 | - | /// `{"AB"}*` would appear as follows, with `-` denoting
/// | 1 | - | 0 | /// [the invalid state](state::INVALID). The leftmost column encodes the input state, while the
/// Where `-` denotes `state::INVALID`. /// topmost row encodes the input symbols.
pub links: Matrix<state::Id>, ///
/// Stores callback for each state (if it has one). /// | | A | B |
pub callbacks: Vec<Option<Callback>>, /// |:-:|:-:|:-:|
/// | 0 | 1 | - |
/// | 1 | - | 0 |
///
pub links: Matrix<state::Identifier>,
/// A collection of callbacks for each state (indexable in order).
pub callbacks: Vec<Option<RuleExecutable>>,
} }
impl From<Vec<Vec<usize>>> for Matrix<state::Id> {
// === Trait Impls ===
impl From<Vec<Vec<usize>>> for Matrix<state::Identifier> {
fn from(input:Vec<Vec<usize>>) -> Self { fn from(input:Vec<Vec<usize>>) -> Self {
let rows = input.len(); let rows = input.len();
let columns = if rows == 0 {0} else {input[0].len()}; let columns = if rows == 0 {0} else {input[0].len()};
let mut matrix = Self::new(rows,columns); let mut matrix = Self::new(rows,columns);
for row in 0..rows { for row in 0..rows {
for column in 0..columns { for column in 0..columns {
matrix[(row,column)] = state::Id{id:input[row][column]}; matrix[(row,column)] = state::Identifier::from(input[row][column]);
} }
} }
matrix matrix
@ -64,25 +62,45 @@ impl From<Vec<Vec<usize>>> for Matrix<state::Id> {
// =========== // ================
// == Tests == // === Callback ===
// =========== // ================
/// The callback associated with an arbitrary state of a finite automaton.
///
/// It contains the rust code that is intended to be executed after encountering a
/// [`pattern`](super::pattern::Pattern) that causes the associated state transition. This pattern
/// is declared in [`Rule.pattern`](crate::group::rule::Rule::pattern).
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct RuleExecutable {
/// A description of the priority with which the callback is constructed during codegen.
pub priority: usize,
/// The rust code that will be executed when running this callback.
pub code: String,
}
// =============
// === Tests ===
// =============
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
use super::*;
use crate::automata::state; use crate::automata::state;
const I:usize = state::INVALID.id; use super::*;
const INVALID:usize = state::Identifier::INVALID.id;
/// DFA automata that accepts newline '\n'. /// DFA automata that accepts newline '\n'.
pub fn newline() -> DFA { pub fn newline() -> DFA {
DFA { DFA {
alphabet: Alphabet::from(vec![10,11]), alphabet_segmentation: alphabet::Segmentation::from_divisions(&[10,11]),
links: Matrix::from(vec![vec![I,1,I], vec![I,I,I]]), links: Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
callbacks: vec![ callbacks: vec![
None, None,
Some(Callback{priority:2,name:"group0_rule0".into()}), Some(RuleExecutable {priority:2, code:"group0_rule0".into()}),
], ],
} }
} }
@ -90,11 +108,11 @@ pub mod tests {
/// DFA automata that accepts any letter a..=z. /// DFA automata that accepts any letter a..=z.
pub fn letter() -> DFA { pub fn letter() -> DFA {
DFA { DFA {
alphabet: Alphabet::from(vec![97,123]), alphabet_segmentation: alphabet::Segmentation::from_divisions(&[97,123]),
links: Matrix::from(vec![vec![I,1,I], vec![I,I,I]]), links: Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
callbacks: vec![ callbacks: vec![
None, None,
Some(Callback{priority:2,name:"group0_rule0".into()}), Some(RuleExecutable {priority:2, code:"group0_rule0".into()}),
], ],
} }
} }
@ -102,16 +120,16 @@ pub mod tests {
/// DFA automata that accepts any number of spaces ' '. /// DFA automata that accepts any number of spaces ' '.
pub fn spaces() -> DFA { pub fn spaces() -> DFA {
DFA { DFA {
alphabet: Alphabet::from(vec![0,32,33]), alphabet_segmentation: alphabet::Segmentation::from_divisions(&[0,32,33]),
links: Matrix::from(vec![ links: Matrix::from(vec![
vec![I,1,I], vec![INVALID,1,INVALID],
vec![I,2,I], vec![INVALID,2,INVALID],
vec![I,2,I], vec![INVALID,2,INVALID],
]), ]),
callbacks: vec![ callbacks: vec![
None, None,
Some(Callback{priority:3,name:"group0_rule0".into()}), Some(RuleExecutable {priority:3, code:"group0_rule0".into()}),
Some(Callback{priority:3,name:"group0_rule0".into()}), Some(RuleExecutable {priority:3, code:"group0_rule0".into()}),
], ],
} }
} }
@ -119,18 +137,18 @@ pub mod tests {
/// DFA automata that accepts one letter a..=z or any number of spaces. /// DFA automata that accepts one letter a..=z or any number of spaces.
pub fn letter_and_spaces() -> DFA { pub fn letter_and_spaces() -> DFA {
DFA { DFA {
alphabet: Alphabet::from(vec![32,33,97,123]), alphabet_segmentation: alphabet::Segmentation::from_divisions(&[32,33,97,123]),
links: Matrix::from(vec![ links: Matrix::from(vec![
vec![I,1,I,2,I], vec![INVALID,1,INVALID,2,INVALID],
vec![I,3,I,I,I], vec![INVALID,3,INVALID,INVALID,INVALID],
vec![I,I,I,I,I], vec![INVALID,INVALID,INVALID,INVALID,INVALID],
vec![I,3,I,I,I], vec![INVALID,3,INVALID,INVALID,INVALID],
]), ]),
callbacks: vec![ callbacks: vec![
None, None,
Some(Callback{priority:4,name:"group0_rule1".into()}), Some(RuleExecutable {priority:4, code:"group0_rule1".into()}),
Some(Callback{priority:4,name:"group0_rule0".into()}), Some(RuleExecutable {priority:4, code:"group0_rule0".into()}),
Some(Callback{priority:4,name:"group0_rule1".into()}), Some(RuleExecutable {priority:4, code:"group0_rule1".into()}),
], ],
} }
} }

View File

@ -1,80 +1,93 @@
//! Implementation of Nondeterministic Finite Automata and it's conversion to DFA. //! The structure for defining non-deterministic finite automata.
use crate::automata::alphabet::Alphabet; use crate::automata::alphabet;
use crate::automata::dfa::RuleExecutable;
use crate::automata::dfa::DFA; use crate::automata::dfa::DFA;
use crate::automata::dfa::Callback; use crate::automata::pattern::Pattern;
use crate::automata::state::Link;
use crate::automata::state::Symbol;
use crate::automata::state::State; use crate::automata::state::State;
use crate::automata::state::Transition;
use crate::automata::state; use crate::automata::state;
use crate::automata::symbol::Symbol;
use crate::data::matrix::Matrix; use crate::data::matrix::Matrix;
use std::collections::HashMap;
use std::collections::BTreeSet;
use std::ops::RangeInclusive;
use crate::automata::pattern::Pattern;
use itertools::Itertools; use itertools::Itertools;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::ops::RangeInclusive;
use crate::prelude::*;
// ========================================
// === Nondeterministic Finite Automata ===
// ========================================
/// Type alias for a state Id based on set of states. // =========================================
/// It is used during NFA -> DFA transformation where multiple states can merge together, // === Non-Deterministic Finite Automata ===
/// thanks to epsilon links. // =========================================
type StateSetId = BTreeSet<state::Id>;
/// NFA automata with a set of symbols, states and transitions. /// A state identifier based on a set of states.
/// Nondeterministic Finite Automata is a finite-state machine that accepts or rejects a given ///
/// sequence of symbols. /// This is used during the NFA -> DFA transformation, where multiple states can merge together due
/// Compared to `DFA`, NFA can transition into multiple new states without reading any symbol /// to the collapsing of epsilon transitions.
/// (so called epsilon link / transition), type StateSetId = BTreeSet<state::Identifier>;
/// ___ ___ ___ ___ ___
/// | 0 | -- 'N' --> | 1 | ----> | 2 | -- 'F' --> | 3 | -- 'A' --> | 4 | /// The definition of a [NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) for a
/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ /// given set of symbols, states, and transitions (specifically a NFA with ε-moves).
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton ///
/// A NFA is a finite state automaton that accepts or rejects a given sequence of symbols. In
/// contrast with a DFA, the NFA may transition between states _without_ reading any new symbol
/// through use of
/// [epsilon links](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton#NFA_with_%CE%B5-moves).
///
/// ```text
/// ┌───┐  'N'  ┌───┐   ┌───┐  'F'  ┌───┐   ┌───┐  'A'  ┌───┐
/// │ 0 │──────▶│ 1 │──▶│ 2 │──────▶│ 3 │──▶│ 4 │──────▶│ 5 │
/// └───┘       └───┘ ε └───┘       └───┘ ε └───┘       └───┘
/// ```
#[derive(Clone,Debug,Default,PartialEq,Eq)] #[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct NFA { pub struct NFA {
/// Finite set of all valid input symbols. /// A set of disjoint intervals over the input alphabet.
pub alphabet: Alphabet, pub alphabet_segmentation: alphabet::Segmentation,
/// Set of named NFA states with (epsilon) transitions. /// A set of named NFA states, with (epsilon) transitions.
pub states: Vec<State>, pub states: Vec<State>,
} }
impl NFA { impl NFA {
/// Adds a new state to NFA and returns it's Id. /// Adds a new state to the NFA and returns its identifier.
pub fn new_state(&mut self) -> state::Id { pub fn new_state(&mut self) -> state::Identifier {
let id = self.states.len(); let id = self.states.len();
self.states.push(State::default()); self.states.push(State::default());
state::Id {id} state::Identifier {id}
} }
/// Creates an epsilon transition between two states. /// Creates an epsilon transition between two states.
/// Whenever the automata happens to be in `source` state it can immediatelly move to ///
/// `target` state (but does not have to). /// Whenever the automaton happens to be in `source` state it can immediately transition to the
pub fn connect(&mut self, source:state::Id, target:state::Id) { /// `target` state. It is, however, not _required_ to do so.
pub fn connect(&mut self, source:state::Identifier, target:state::Identifier) {
self.states[source.id].epsilon_links.push(target); self.states[source.id].epsilon_links.push(target);
} }
/// Creates an ordinary transition (for a range of symbols) between two states. /// Creates an ordinary transition for a range of symbols.
/// If any symbol from such range happens to be on input when the automata is in `source` ///
/// state, it will immediatelly move to `target` state. /// If any symbol from such range happens to be the input when the automaton is in the `source`
pub fn connect_by /// state, it will immediately transition to the `target` state.
(&mut self, source:state::Id, target:state::Id, symbols:&RangeInclusive<Symbol>) { pub fn connect_via
self.alphabet.insert(symbols.clone()); ( &mut self
self.states[source.id].links.push(Link{symbols:symbols.clone(), target}); , source:state::Identifier
, target_state:state::Identifier
, symbols:&RangeInclusive<Symbol>) {
self.alphabet_segmentation.insert(symbols.clone());
self.states[source.id].links.push(Transition{symbols:symbols.clone(), target_state});
} }
/// Transforms pattern to NFA. /// Transforms a pattern to an NFA using the algorithm described
/// The algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI /// [here](https://www.youtube.com/watch?v=RYNN-tb9WxI).
pub fn new_pattern(&mut self, source:state::Id, pattern:&Pattern) -> state::Id { pub fn new_pattern(&mut self, source:state::Identifier, pattern:&Pattern) -> state::Identifier {
let current = self.new_state(); let current = self.new_state();
self.connect(source,current); self.connect(source,current);
match pattern { match pattern {
Pattern::Range(range) => { Pattern::Range(range) => {
let state = self.new_state(); let state = self.new_state();
self.connect_by(current,state,range); self.connect_via(current,state,range);
state state
}, },
Pattern::Many(body) => { Pattern::Many(body) => {
@ -101,18 +114,15 @@ impl NFA {
} }
} }
/// Merges states that are connected by epsilon links, using an algorithm based on the one shown
// === NFA -> DFA === /// [here](https://www.youtube.com/watch?v=taClnxU-nao).
/// Merges states that are connected by epsilon links.
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao
fn eps_matrix(&self) -> Vec<StateSetId> { fn eps_matrix(&self) -> Vec<StateSetId> {
fn fill_eps_matrix fn fill_eps_matrix
( nfa : &NFA ( nfa : &NFA
, states : &mut Vec<StateSetId> , states : &mut Vec<StateSetId>
, computed : &mut Vec<bool> , computed : &mut Vec<bool>
, visited : &mut Vec<bool> , visited : &mut Vec<bool>
, state : state::Id , state : state::Identifier
) { ) {
let mut state_set = StateSetId::new(); let mut state_set = StateSetId::new();
let mut circular = false; let mut circular = false;
@ -138,18 +148,17 @@ impl NFA {
let mut computed = vec![false; self.states.len()]; let mut computed = vec![false; self.states.len()];
for id in 0..self.states.len() { for id in 0..self.states.len() {
let mut visited = vec![false; states.len()]; let mut visited = vec![false; states.len()];
fill_eps_matrix(self,&mut states,&mut computed,&mut visited,state::Id{id}); fill_eps_matrix(self,&mut states,&mut computed,&mut visited,state::Identifier{id});
} }
states states
} }
/// Computes a transition matrix (state X symbol => state) for NFA. /// Computes a transition matrix `(state, symbol) => state` for the NFA, ignoring epsilon links.
/// Ignores epsilon links. fn nfa_matrix(&self) -> Matrix<state::Identifier> {
fn nfa_matrix(&self) -> Matrix<state::Id> { let mut matrix = Matrix::new(self.states.len(),self.alphabet_segmentation.divisions.len());
let mut matrix = Matrix::new(self.states.len(),self.alphabet.symbols.len());
for (state_ix, source) in self.states.iter().enumerate() { for (state_ix, source) in self.states.iter().enumerate() {
let targets = source.targets(&self.alphabet); let targets = source.targets(&self.alphabet_segmentation);
for (voc_ix, &target) in targets.iter().enumerate() { for (voc_ix, &target) in targets.iter().enumerate() {
matrix[(state_ix,voc_ix)] = target; matrix[(state_ix,voc_ix)] = target;
} }
@ -159,26 +168,27 @@ impl NFA {
} }
impl From<&NFA> for DFA { impl From<&NFA> for DFA {
/// Transforms NFA into DFA.
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao /// Transforms an NFA into a DFA, based on the algorithm described
/// [here](https://www.youtube.com/watch?v=taClnxU-nao).
fn from(nfa:&NFA) -> Self { fn from(nfa:&NFA) -> Self {
let nfa_mat = nfa.nfa_matrix(); let nfa_mat = nfa.nfa_matrix();
let eps_mat = nfa.eps_matrix(); let eps_mat = nfa.eps_matrix();
let mut dfa_mat = Matrix::new(0,nfa.alphabet.symbols.len()); let mut dfa_mat = Matrix::new(0,nfa.alphabet_segmentation.divisions.len());
let mut dfa_eps_ixs = Vec::<StateSetId>::new(); let mut dfa_eps_ixs = Vec::<StateSetId>::new();
let mut dfa_eps_map = HashMap::<StateSetId,state::Id>::new(); let mut dfa_eps_map = HashMap::<StateSetId,state::Identifier>::new();
dfa_eps_ixs.push(eps_mat[0].clone()); dfa_eps_ixs.push(eps_mat[0].clone());
dfa_eps_map.insert(eps_mat[0].clone(), state::Id{id:0}); dfa_eps_map.insert(eps_mat[0].clone(),state::Identifier::from(0));
let mut i = 0; let mut i = 0;
while i < dfa_eps_ixs.len() { while i < dfa_eps_ixs.len() {
dfa_mat.new_row(); dfa_mat.new_row();
for voc_ix in 0..nfa.alphabet.symbols.len() { for voc_ix in 0..nfa.alphabet_segmentation.divisions.len() {
let mut eps_set = StateSetId::new(); let mut eps_set = StateSetId::new();
for &eps_ix in &dfa_eps_ixs[i] { for &eps_ix in &dfa_eps_ixs[i] {
let tgt = nfa_mat[(eps_ix.id,voc_ix)]; let tgt = nfa_mat[(eps_ix.id,voc_ix)];
if tgt != state::INVALID { if tgt != state::Identifier::INVALID {
eps_set.extend(eps_mat[tgt.id].iter()); eps_set.extend(eps_mat[tgt.id].iter());
} }
} }
@ -186,7 +196,7 @@ impl From<&NFA> for DFA {
dfa_mat[(i,voc_ix)] = match dfa_eps_map.get(&eps_set) { dfa_mat[(i,voc_ix)] = match dfa_eps_map.get(&eps_set) {
Some(&id) => id, Some(&id) => id,
None => { None => {
let id = state::Id {id:dfa_eps_ixs.len()}; let id = state::Identifier {id:dfa_eps_ixs.len()};
dfa_eps_ixs.push(eps_set.clone()); dfa_eps_ixs.push(eps_set.clone());
dfa_eps_map.insert(eps_set,id); dfa_eps_map.insert(eps_set,id);
id id
@ -200,14 +210,17 @@ impl From<&NFA> for DFA {
let mut callbacks = vec![None; dfa_eps_ixs.len()]; let mut callbacks = vec![None; dfa_eps_ixs.len()];
let priority = dfa_eps_ixs.len(); let priority = dfa_eps_ixs.len();
for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() { for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() {
let has_name = |&key:&state::Id| nfa.states[key.id].name.is_some(); let has_name = |&key:&state::Identifier| nfa.states[key.id].name.is_some();
if let Some(eps) = epss.into_iter().find(has_name) { if let Some(eps) = epss.into_iter().find(has_name) {
let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap(); let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap();
callbacks[dfa_ix] = Some(Callback {name:rule,priority}); callbacks[dfa_ix] = Some(RuleExecutable{ code:rule,priority});
} }
} }
DFA {alphabet:nfa.alphabet.clone(),links:dfa_mat,callbacks} let alphabet_segmentation = nfa.alphabet_segmentation.clone();
let links = dfa_mat;
DFA{alphabet_segmentation,links,callbacks}
} }
} }
@ -218,13 +231,13 @@ impl From<&NFA> for DFA {
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
extern crate test; extern crate test;
use crate::automata::dfa; use crate::automata::dfa;
use super::*; use super::*;
use test::Bencher; use test::Bencher;
/// NFA automata that accepts newline '\n'. /// NFA that accepts a newline '\n'.
pub fn newline() -> NFA { pub fn newline() -> NFA {
NFA { NFA {
states: vec![ states: vec![
@ -233,11 +246,11 @@ pub mod tests {
State::from(vec![3]).named("group0_rule0"), State::from(vec![3]).named("group0_rule0"),
State::default(), State::default(),
], ],
alphabet: Alphabet::from(vec![10,11]), alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![10, 11].as_slice()),
} }
} }
/// NFA automata that accepts any letter a..=z. /// NFA that accepts any letter in the range a..=z.
pub fn letter() -> NFA { pub fn letter() -> NFA {
NFA { NFA {
states: vec![ states: vec![
@ -246,11 +259,11 @@ pub mod tests {
State::from(vec![3]).named("group0_rule0"), State::from(vec![3]).named("group0_rule0"),
State::default(), State::default(),
], ],
alphabet: Alphabet::from(vec![97,123]), alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![97, 123].as_slice()),
} }
} }
/// NFA automata that accepts any number of spaces ' '. /// NFA that accepts any number of spaces ' '.
pub fn spaces() -> NFA { pub fn spaces() -> NFA {
NFA { NFA {
states: vec![ states: vec![
@ -265,11 +278,11 @@ pub mod tests {
State::from(vec![5,9]).named("group0_rule0"), State::from(vec![5,9]).named("group0_rule0"),
State::default(), State::default(),
], ],
alphabet: Alphabet::from(vec![0,32,33]), alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![0, 32, 33].as_slice()),
} }
} }
/// NFA automata that accepts one letter a..=z or many spaces ' '. /// NFA that accepts one letter a..=z or many spaces ' '.
pub fn letter_and_spaces() -> NFA { pub fn letter_and_spaces() -> NFA {
NFA { NFA {
states: vec![ states: vec![
@ -286,10 +299,10 @@ pub mod tests {
State::from(vec![7,11]).named("group0_rule1"), State::from(vec![7,11]).named("group0_rule1"),
State::default(), State::default(),
], ],
alphabet: Alphabet::from(vec![32,33,97,123]), alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![32, 33, 97, 123].as_slice()),
} }
} }
#[test] #[test]
fn test_to_dfa_newline() { fn test_to_dfa_newline() {
assert_eq!(DFA::from(&newline()),dfa::tests::newline()); assert_eq!(DFA::from(&newline()),dfa::tests::newline());

View File

@ -1,7 +1,6 @@
//! Simple API for constructing regex patterns that are used in parser implementation. //! Simple API for constructing regex patterns that are used in parser implementation.
use crate::parser; use crate::automata::symbol::Symbol;
use crate::automata::state::Symbol;
use core::iter; use core::iter;
use itertools::Itertools; use itertools::Itertools;
@ -9,25 +8,127 @@ use std::ops::BitAnd;
use std::ops::BitOr; use std::ops::BitOr;
use std::ops::RangeInclusive; use std::ops::RangeInclusive;
use Pattern::*;
// ============= // =============
// == Pattern == // == Pattern ==
// ============= // =============
/// Simple regex pattern. /// A representation of a simple regular pattern.
#[derive(Clone,Debug)] #[derive(Clone,Debug)]
pub enum Pattern { pub enum Pattern {
/// Pattern that triggers on any symbol from given range. /// The pattern that triggers on any symbol from the given range.
Range(RangeInclusive<Symbol>), Range(RangeInclusive<Symbol>),
/// Pattern that triggers on any given pattern from sequence. /// The pattern that triggers on any given pattern from a sequence.
Or(Vec<Pattern>), Or(Vec<Pattern>),
/// Pattern that triggers when a sequence of patterns is encountered. /// The pattern that triggers when a sequence of patterns is encountered.
And(Vec<Pattern>), And(Vec<Pattern>),
/// Pattern that triggers on 0..N repetitions of given pattern. /// The pattern that triggers on 0..N repetitions of given pattern.
Many(Box<Pattern>) Many(Box<Pattern>)
} }
use Pattern::*; impl Pattern {
/// A pattern that never triggers.
pub fn never() -> Self {
Pattern::symbols(Symbol::from(1)..=Symbol::from(0))
}
/// A pattern that always triggers.
pub fn always() -> Self {
Pattern::symbols(Symbol::from(u32::min_value())..=Symbol::from(u32::max_value()))
}
/// A pattern that triggers on any character.
pub fn any_char() -> Self {
Pattern::symbols(Symbol::from(0)..=Symbol::from(u32::max_value()))
}
/// A pattern that triggers on 0..N repetitions of the pattern described by `self`.
pub fn many(self) -> Self {
Many(Box::new(self))
}
/// A pattern that triggers on 1..N repetitions of the pattern described by `self`.
pub fn many1(self) -> Self {
self.clone() & self.many()
}
/// A pattern that triggers on 0..=1 repetitions of the pattern described by `self`.
pub fn opt(self) -> Self {
self | Self::always()
}
/// A pattern that triggers on the given character.
pub fn char(character:char) -> Self {
Self::symbol(Symbol::from(character))
}
/// A pattern that triggers on the given symbol.
pub fn symbol(symbol:Symbol) -> Self {
Pattern::symbols(symbol..=symbol)
}
/// A pattern that triggers on any of the provided `symbols`.
pub fn symbols(symbols:RangeInclusive<Symbol>) -> Self {
Pattern::Range(symbols)
}
/// A pattern that triggers at the end of the file.
pub fn eof() -> Self {
Self::symbol(Symbol::EOF_CODE)
}
/// A pattern that triggers on any character in the provided `range`.
pub fn range(range:RangeInclusive<char>) -> Self {
Pattern::symbols(Symbol::from(*range.start())..=Symbol::from(*range.end()))
}
/// Pattern that triggers when sequence of characters given by `chars` is encountered.
pub fn all(chars:&str) -> Self {
chars.chars().fold(Self::never(), |pat,char| pat & Self::char(char))
}
/// The pattern that triggers on any character contained in `chars`.
pub fn any(chars:&str) -> Self {
chars.chars().fold(Self::never(), |pat,char| pat | Self::char(char))
}
/// The pattern that doesn't trigger on any character contained in `chars`.
pub fn none(chars:&str) -> Self {
let max = u32::max_value();
let char_iter = chars.chars().map(|char| char as u32);
let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
let mut codes = char_iter2.collect_vec();
codes.sort();
codes.iter().tuple_windows().fold(Self::never(), |pat,(start,end)| {
if end < start {pat} else {
pat | Pattern::symbols(Symbol::from(*start)..=Symbol::from(*end))
}
})
}
/// The pattern that triggers on any character but `char`.
pub fn not(char:char) -> Self {
Self::none(&char.to_string())
}
/// The pattern that triggers on `num` repetitions of `pat`.
pub fn repeat(pat:Pattern, num:usize) -> Self {
(0..num).fold(Self::always(), |p,_| p & pat.clone())
}
/// Pattern that triggers on `min`..`max` repetitions of `pat`.
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
(min..max).fold(Self::never(), |p,n| p | Self::repeat(pat.clone(),n))
}
}
// === Trait Impls ====
impl BitOr<Pattern> for Pattern { impl BitOr<Pattern> for Pattern {
type Output = Pattern; type Output = Pattern;
@ -52,101 +153,3 @@ impl BitAnd<Pattern> for Pattern {
} }
} }
} }
impl Pattern {
/// Pattern that never triggers.
pub fn never() -> Self {
Pattern::symbols(1..=0)
}
/// Pattern that always triggers.
pub fn always() -> Self {
Pattern::symbols(u32::min_value()..=u32::max_value())
}
/// Pattern that triggers on any char.
pub fn any_char() -> Self {
Pattern::symbols(0..=u32::max_value())
}
/// Pattern that triggers on 0..N repetitions of given pattern.
pub fn many(self) -> Self {
Many(Box::new(self))
}
/// Pattern that triggers on 1..N repetitions of given pattern.
pub fn many1(self) -> Self {
self.clone() & self.many()
}
/// Pattern that triggers on 0..=1 repetitions of given pattern.
pub fn opt(self) -> Self {
self | Self::always()
}
/// Pattern that triggers on given symbol
pub fn symbol(symbol:u32) -> Self {
Pattern::symbols(symbol..=symbol)
}
/// Pattern that triggers on any of the given symbols.
pub fn symbols(symbols:RangeInclusive<u32>) -> Self {
Pattern::Range(Symbol{val:*symbols.start()}..=Symbol{val:*symbols.end()})
}
/// Pattern that triggers on end of file.
pub fn eof() -> Self {
Self::symbol(parser::EOF_CODE.val)
}
/// Pattern that triggers on given character.
pub fn char(char:char) -> Self {
Self::symbol(char as u32)
}
/// Pattern that triggers on any of the given characters.
pub fn range(chars:RangeInclusive<char>) -> Self {
Pattern::symbols((*chars.start() as u32)..=(*chars.end() as u32))
}
/// Pattern that triggers when sequence of characters is encountered.
pub fn all(chars:&str) -> Self {
chars.chars().fold(Self::never(), |pat,char| pat & Self::char(char))
}
/// Pattern that triggers on any characters from given sequence.
pub fn any(chars:&str) -> Self {
chars.chars().fold(Self::never(), |pat,char| pat | Self::char(char))
}
/// Pattern that doesn't trigger on any given character from given sequence.
pub fn none(chars:&str) -> Self {
let max = u32::max_value();
let char_iter = chars.chars().map(|char| char as u32);
let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
let mut codes = char_iter2.collect_vec();
codes.sort();
codes.iter().tuple_windows().fold(Self::never(), |pat,(start,end)| {
if end < start {pat} else {
pat | Pattern::symbols(*start..=*end)
}
})
}
/// Pattern that triggers on any character but the one given.
pub fn not(char:char) -> Self {
Self::none(&char.to_string())
}
/// Pattern that triggers on N repetitions of given pattern.
pub fn repeat(pat:Pattern, num:usize) -> Self {
(0..num).fold(Self::always(), |p,_| p & pat.clone())
}
/// Pattern that triggers on MIN..MAX repetitions of given pattern.
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
(min..max).fold(Self::never(), |p,n| p | Self::repeat(pat.clone(),n))
}
}

View File

@ -1,70 +1,31 @@
//! This module exports State implementation for Nondeterministic Finite Automata. //! This module exports State implementation for Nondeterministic Finite Automata.
use crate::automata::alphabet::Alphabet; use crate::automata::alphabet;
use crate::automata::state; use crate::automata::symbol::Symbol;
use std::ops::RangeInclusive; use crate::prelude::*;
// ======================= // ===========
// == State Of Automata == // == State ==
// ======================= // ===========
/// Flag for invalid state. /// A named state for a [`super::nfa::NFA`].
/// When finite automata gets into invalid state the input sequence of symbols is rejected.
pub const INVALID:Id = Id {id:usize::max_value()};
// TODO [AA] Extract this. Turn it into using `char
/// Newtype wrapper for finite automata input symbol.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
pub struct Symbol {
#[allow(missing_docs)]
pub val: u32
}
// TODO [AA] Define some constants on char
/// Newtype wrapper for finite automata state ID.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
pub struct Id {
#[allow(missing_docs)]
pub id: usize
}
impl Default for Id {
/// Returns state::INVALID. This is because every finite automata has an invalid state
/// and because all transitions in automata transition matrix lead to invalid state by default.
fn default() -> Self {
state::INVALID
}
}
/// Named NFA state with a set of transitions (links).
#[derive(Clone,Debug,Default,PartialEq,Eq)] #[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct State { pub struct State {
/// Set of transitions that don't require any symbol to trigger. /// A set of transitions that can trigger without consuming a symbol (ε-transitions).
/// I.E. If there is an epsilon link from state A to state B, then whenever we are in state A, pub epsilon_links: Vec<Identifier>,
/// we can freely move to state B. /// The set of transitions that trigger while consuming a specific symbol.
pub epsilon_links: Vec<Id>, ///
/// Set of transitions that trigger with specific symbol on input. /// When triggered, the automaton will transition to the [`Transition::target_state`].
/// When triggered, the automata will transition to the `link.target`. pub links: Vec<Transition>,
pub links: Vec<Link>, /// The name of the state.
/// Name of the state. ///
/// We use it to autogenerate a call to Rust method with same name. /// This is used to auto-generate a call to the rust method of the same name.
pub name: Option<String>, pub name: Option<String>,
} }
/// A transition to new automata state
/// that requires specific symbol on automata input to trigger.
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct Link {
/// Any symbol from the range will trigger this link.
pub symbols: RangeInclusive<Symbol>,
/// A state that is visited, after the link is triggered.
pub target: Id,
}
impl State { impl State {
/// Updater for field `name`. Returns updated state. /// Updater for field `name`. Returns updated state.
pub fn named(mut self, name:&str) -> Self { pub fn named(mut self, name:&str) -> Self {
@ -73,29 +34,32 @@ impl State {
} }
/// Returns transition (next state) for each symbol in alphabet. /// Returns transition (next state) for each symbol in alphabet.
pub fn targets(&self, alphabet:&Alphabet) -> Vec<Id> { pub fn targets(&self, alphabet:&alphabet::Segmentation) -> Vec<Identifier> {
let mut targets = vec![]; let mut targets = vec![];
let mut index = 0; let mut index = 0;
let mut links = self.links.clone(); let mut links = self.links.clone();
links.sort_by_key(|link| *link.symbols.start()); links.sort_by_key(|link| *link.symbols.start());
for &symbol in &alphabet.symbols { for &symbol in &alphabet.divisions {
while links.len() > index && *links[index].symbols.end() < symbol { while links.len() > index && *links[index].symbols.end() < symbol {
index += 1; index += 1;
} }
if links.len() <= index || *links[index].symbols.start() > symbol { if links.len() <= index || *links[index].symbols.start() > symbol {
targets.push(state::INVALID); targets.push(Identifier::INVALID);
} else { } else {
targets.push(links[index].target); targets.push(links[index].target_state);
} }
} }
targets targets
} }
} }
// === Trait Impls ====
impl From<Vec<usize>> for State { impl From<Vec<usize>> for State {
/// Creates a state with epsilon links. /// Creates a state with epsilon links.
fn from(vec:Vec<usize>) -> Self { fn from(vec:Vec<usize>) -> Self {
let epsilon_links = vec.iter().cloned().map(|id| Id{id}).collect(); let epsilon_links = vec.iter().cloned().map(|id| Identifier {id}).collect();
State {epsilon_links,..Default::default()} State {epsilon_links,..Default::default()}
} }
} }
@ -106,9 +70,60 @@ impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
let link = |(range, id): (RangeInclusive<u32>, usize)| { let link = |(range, id): (RangeInclusive<u32>, usize)| {
let start = Symbol{val:*range.start()}; let start = Symbol{val:*range.start()};
let end = Symbol{val:*range.end()}; let end = Symbol{val:*range.end()};
Link {symbols: start..=end, target: Id{ id }} Transition {symbols: start..=end, target_state: Identifier { id }}
}; };
let links = vec.iter().cloned().map(link).collect(); let links = vec.iter().cloned().map(link).collect();
State {links,..Default::default()} State {links,..Default::default()}
} }
} }
// ================
// == Identifier ==
// ================
/// The identifier of a state in an arbitrary finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
pub struct Identifier {
    /// The numeric value of the identifier.
    pub id: usize
}

impl Identifier {
    /// The identifier of the invalid state, encoded as the largest `usize` value.
    ///
    /// When in an invalid state, a finite automaton will reject the sequence of input symbols.
    pub const INVALID:Self = Self{id:usize::max_value()};
}

// === Trait Impls ===

impl Default for Identifier {
    /// Returns [`Identifier::INVALID`]. Every finite automaton has an invalid state, and all
    /// transitions in an automaton's transition matrix lead to the invalid state by default.
    fn default() -> Self {
        Self::INVALID
    }
}

impl From<usize> for Identifier {
    /// Wraps the raw index `id` in an identifier.
    fn from(id: usize) -> Self {
        Self{id}
    }
}
// ==================
// === Transition ===
// ==================
/// A transition between states in a finite automaton that must consume a symbol to trigger.
///
/// A transition pairs the symbols that trigger it with the state that is entered afterwards.
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct Transition {
/// The inclusive range of symbols on which this transition will trigger.
pub symbols: RangeInclusive<Symbol>,
/// The identifier of the state that is entered after the transition has triggered.
pub target_state: Identifier,
}

View File

@ -0,0 +1,49 @@
//! Defines a Symbol that is operated on by the finite automata.
// ==============
// === Symbol ===
// ==============
/// An input symbol to a finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
pub struct Symbol {
#[allow(missing_docs)]
pub val: u32
}
impl Symbol {
/// A representation of the end of the file.
pub const EOF_CODE:Symbol = Symbol{val:u32::max_value()};
/// A representation of the null symbol.
pub const NULL:Symbol = Symbol{val:0};
}
// === Trait Impls ===
impl Default for Symbol {
fn default() -> Self {
Symbol::NULL
}
}
impl From<u32> for Symbol {
fn from(val:u32) -> Symbol {
Symbol{val}
}
}
impl From<char> for Symbol {
fn from(val:char) -> Symbol {
Symbol{val:val as u32}
}
}
impl From<&Symbol> for Symbol {
fn from(symb: &Symbol) -> Self {
Symbol{val:symb.val}
}
}

View File

@ -1,3 +1,3 @@
//! Generic datastructures, with multiple usecases. //! Generic data-structures to support multiple use-cases.
pub mod matrix; pub mod matrix;

View File

@ -1,4 +1,6 @@
//! Efficient representation of 2D matrix. //! An efficient representation of a 2D matrix.
use enso_prelude::default;
use std::ops::Index; use std::ops::Index;
use std::ops::IndexMut; use std::ops::IndexMut;
@ -9,17 +11,38 @@ use std::ops::IndexMut;
// == Matrix == // == Matrix ==
// ============ // ============
/// Efficient 2D matrix implemented on top of vector. /// An efficient 2D matrix implemented on top of [`std::vec::Vec`].
#[derive(Clone,Debug,Default,PartialEq,Eq)] #[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct Matrix<T> { pub struct Matrix<T> {
/// The number of rows in matrix. /// The number of rows in the matrix.
rows: usize, rows: usize,
/// The number of columns in matrix. /// The number of columns in the matrix.
columns: usize, columns: usize,
/// Matrix implemented with vector. /// The matrix.
matrix: Vec<T>, matrix: Vec<T>,
} }
impl<T:Default> Matrix<T> {
    /// Constructs a matrix with the dimensions given by `rows` and `columns`.
    ///
    /// The backing vector holds exactly `rows * columns` cells, each initialised with the
    /// default value of `T`.
    pub fn new(rows:usize, columns:usize) -> Self {
        // The cell count is taken from the requested dimensions rather than from the vector's
        // capacity: `Vec::with_capacity` only guarantees *at least* the requested capacity, and
        // for zero-sized `T` the capacity is `usize::MAX`.
        let matrix = (0..rows*columns).map(|_| default()).collect();
        Self{rows,columns,matrix}
    }

    /// Adds a new row to the matrix `self`, filled with default values.
    ///
    /// The row is appended after the existing rows, and the row count grows by one.
    pub fn new_row(&mut self) {
        self.matrix.extend((0..self.columns).map(|_| default()));
        self.rows += 1;
    }
}
// === Trait Impls ===
impl<T> Index<(usize,usize)> for Matrix<T> { impl<T> Index<(usize,usize)> for Matrix<T> {
type Output = T; type Output = T;
@ -33,22 +56,3 @@ impl<T> IndexMut<(usize,usize)> for Matrix<T> {
&mut self.matrix[index.0*self.columns+index.1] &mut self.matrix[index.0*self.columns+index.1]
} }
} }
impl<T:Default> Matrix<T> {
/// Constructs a new matrix for given number of rows and columns.
pub fn new(rows:usize, columns:usize) -> Self {
let mut matrix = Vec::with_capacity(rows*columns);
for _ in 0..matrix.capacity() {
matrix.push(Default::default())
}
Self{rows,columns,matrix}
}
/// Adds a new row to matrix, filled with default values.
pub fn new_row(&mut self) {
for _ in 0..self.columns {
self.matrix.push(Default::default());
}
self.rows += 1;
}
}

View File

@ -1,4 +1,4 @@
//! This module exports API for grouping multiple rules (Rust callbacks with regex pattern) together. //! This module provides an API for grouping multiple flexer rules.
use crate::automata::pattern::Pattern; use crate::automata::pattern::Pattern;
use crate::automata::nfa::NFA; use crate::automata::nfa::NFA;
@ -14,41 +14,59 @@ pub mod rule;
// == Group == // == Group ==
// =========== // ===========
/// Struct that group rules together. It also inherits rules from parent group (if it has one). /// A group is a structure for associating multiple rules with each other, and is the basic building
/// Groups are the basic building block of flexer: /// block of the flexer.
/// Flexer internally keeps a stack of groups, only one of them active at a time. ///
/// Each group contains set of regex patterns and callbacks (together called `Rule`). /// A group consists of the following:
/// Whenever a rule.pattern from active group is matched with part of input the associated ///
/// rule.callback is executed, which in turn may exit the current groupor enter a new one. /// - A set of [`Rule`s](Rule), each containing a regex pattern and associated callback.
/// This allows us to nicely model a situation, where certain part of program (like a string literal) /// - Inherited rules from a parent group, if such a group exists.
/// should have very different parsing rules than other (for example body of function). ///
/// Note that the input is first matched with first added rule, then with the second etc. /// Internally, the flexer maintains a stack of groups, where only one group can be active at any
/// Therefore, if two rules overlap, only the callback of the first added rule will be executed. /// given time. Rules are matched _in order_, and hence overlaps are handled by the order in which
/// the rules are matched, with the first callback being triggered.
///
/// Whenever a [`rule.pattern`](Rule::pattern) from the active group is matched against part of the
/// input, the associated [`rule.callback`](Rule::callback) is executed. This callback may exit the
/// current group or even enter a new one. As a result, groups allow us to elegantly model a
/// situation where certain parts of a program (e.g. within a string literal) have very different
/// lexing rules than other portions of a program (e.g. the body of a function).
#[derive(Clone,Debug,Default)] #[derive(Clone,Debug,Default)]
pub struct Group { pub struct Group {
/// Unique ID. /// A unique identifier for the group.
pub id: usize, pub id: usize,
/// Custom name which is used for debugging. /// A name for the group (useful in debugging).
pub name: String, pub name: String,
/// Parent which we inherit rules from. /// The parent group from which rules are inherited.
pub parent: Option<Box<Group>>, pub parent: Option<Box<Group>>,
/// Set of regex patterns with associated callbacks. /// A set of flexer rules.
pub rules: Vec<Rule>, pub rules: Vec<Rule>,
} }
impl Group { impl Group {
/// Adds new rule (regex pattern with associated callback) to group.
/// Adds a new rule to the current group.
pub fn add_rule(&mut self, rule:Rule) { pub fn add_rule(&mut self, rule:Rule) {
self.rules.push(rule) self.rules.push(rule)
} }
/// Returns rule builder for given pattern. /// Returns a rule builder for the given pattern.
/// TODO[jv] better describe it's purpose once we agree on correct API.
pub fn rule(&mut self, pattern:Pattern) -> rule::Builder<impl FnMut(Rule) + '_> { pub fn rule(&mut self, pattern:Pattern) -> rule::Builder<impl FnMut(Rule) + '_> {
rule::Builder{pattern,callback:move |rule| self.add_rule(rule)} rule::Builder{pattern, callback:move |rule| self.add_rule(rule)}
} }
/// All rules including parent rules. /// The canonical name for a given rule.
fn callback_name(&self, rule_ix:usize) -> String {
format!("group{}_rule{}", self.id, rule_ix)
}
}
// === Getters ===
impl Group {
/// The full set of rules, including parent rules.
pub fn rules(&self) -> Vec<&Rule> { pub fn rules(&self) -> Vec<&Rule> {
let mut parent = &self.parent; let mut parent = &self.parent;
let mut rules = (&self.rules).iter().collect_vec(); let mut rules = (&self.rules).iter().collect_vec();
@ -58,16 +76,16 @@ impl Group {
} }
rules rules
} }
/// Canonical name of given rule.
fn callback_name(&self, rule_ix:usize) -> String {
format!("group{}_rule{}",self.id,rule_ix)
}
} }
// === Trait Impls ===
impl From<&Group> for NFA { impl From<&Group> for NFA {
/// Transforms Group to NFA. /// Transforms the input group into an NFA.
/// Algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI ///
/// The algorithm is based on this algorithm for
/// [converting a regular expression to an NFA](https://www.youtube.com/watch?v=RYNN-tb9WxI).
fn from(group:&Group) -> Self { fn from(group:&Group) -> Self {
let mut nfa = NFA::default(); let mut nfa = NFA::default();
let start = nfa.new_state(); let start = nfa.new_state();
@ -83,6 +101,7 @@ impl From<&Group> for NFA {
} }
// ============= // =============
// === Tests === // === Tests ===
// ============= // =============

View File

@ -1,5 +1,8 @@
//! An API for declaring Rust callbacks for encountered regex patterns. //! An API for declaring rust-code callbacks to be executed when a given pattern is matched.
//! //!
//! A flexer rule is a [`crate::automata::pattern`] associated with rust code to be executed as a
//! callback.
use crate::automata::pattern::Pattern; use crate::automata::pattern::Pattern;
@ -8,29 +11,28 @@ use crate::automata::pattern::Pattern;
// == Rule == // == Rule ==
// ========== // ==========
/// A rule is a pair of regex pattern and callback. /// A flexer rule.
/// The intention is to run the callback after encountering given pattern.
#[derive(Clone,Debug)] #[derive(Clone,Debug)]
pub struct Rule { pub struct Rule {
/// Pattern that triggers the callback. /// The pattern that triggers the callback.
pub pattern: Pattern, pub pattern: Pattern,
/// Callback containing stringified Rust code. /// The code to execute when [`Rule::pattern`] matches, containing rust code as a
/// [`std::string::String`].
pub callback: String, pub callback: String,
} }
/// Builder that allows us to add `Rule` to `Group` in a nice way. /// A builder that allows us to add a [`Rule`] to [`crate::group::Group`] in an elegant way.
/// It is possible this structure won't be useful in rust, since borrow checker will likely influence
/// the final API of rule construction.
#[derive(Clone,Debug)] #[derive(Clone,Debug)]
pub struct Builder<Callback> { pub struct Builder<Callback> {
/// Pattern that triggers the callback. /// The pattern that triggers the callback.
pub pattern: Pattern, pub pattern: Pattern,
/// Callback containing a closure.
/// The callback containing a closure
pub callback: Callback, pub callback: Callback,
} }
impl<F:FnMut(Rule)> Builder<F> { impl<F:FnMut(Rule)> Builder<F> {
/// Feeds the input that triggered regex pattern to callback. /// Feeds the input that triggered the [`Builder::pattern`] to the [`Builder::callback`].
pub fn run(&mut self, program:String){ pub fn run(&mut self, program:String){
let rule = Rule {pattern:self.pattern.clone(),callback:program}; let rule = Rule {pattern:self.pattern.clone(),callback:program};
(self.callback)(rule); (self.callback)(rule);

View File

@ -8,10 +8,14 @@
#![warn(unsafe_code)] #![warn(unsafe_code)]
#![warn(unused_import_braces)] #![warn(unused_import_braces)]
//! This module exports simple parser based on Deterministic Finite State Automata for regular //! This module exports the API for defining a simple lexer based on a deterministic finite state
//! grammars (anything parsable with regex patterns). //! automaton.
//!
//! These lexers are capable of lexing any regular grammar, with some extensions to allow working
//! with context sensitive (e.g. indentation-aware) syntax.
pub mod automata; pub mod automata;
pub mod group; pub mod group;
pub mod parser;
pub mod data; pub mod data;
pub use enso_prelude as prelude;

View File

@ -1,14 +0,0 @@
//! The entry point of flexer. It (is going to) contain API for parsing an input string based on
//! group of regex patterns.
use crate::automata::state::Symbol;
// ============
// == Parser ==
// ============
/// End Of File - This symbol is inserted at the end of each parser input.
/// We can use the maximum value of u32, because no `char` (unicode scalar) can hold this value.
pub const EOF_CODE:Symbol = Symbol{val:u32::max_value()};

View File

@ -56,7 +56,9 @@ object Macro {
val clsDef = c.parse(s"final class __Parser__ extends $tree2") val clsDef = c.parse(s"final class __Parser__ extends $tree2")
val tgtDef = addGroupDefs.transform(clsDef) val tgtDef = addGroupDefs.transform(clsDef)
c.Expr[() => P](q"$tgtDef; () => { new __Parser__ () }")
}
val finalCode = q"$tgtDef; () => { new __Parser__() }"
c.Expr[() => P](finalCode)
}
} }