mirror of
https://github.com/enso-org/enso.git
synced 2024-11-23 08:08:34 +03:00
Refactor the Flexer (#972)
This commit is contained in:
parent
6ba038c800
commit
fd3e3df92c
@ -1,7 +1,9 @@
|
||||
//! Exports API for construction of Nondeterminist and Deterministic Finite State Automata.
|
||||
//! Provides an API for the construction of finite state automata, in both their deterministic and
|
||||
//! non-deterministic forms.
|
||||
|
||||
pub mod alphabet;
|
||||
pub mod dfa;
|
||||
pub mod nfa;
|
||||
pub mod pattern;
|
||||
pub mod state;
|
||||
pub mod symbol;
|
||||
|
@ -1,7 +1,8 @@
|
||||
//! Exports an alphabet (set of all valid input symbols) for Finite State Automata (NFA and DFA).
|
||||
//! Exports an alphabet for an arbitrary finite state automaton.
|
||||
|
||||
use crate::automata::state::Symbol;
|
||||
use crate::automata::symbol::Symbol;
|
||||
|
||||
use crate::prelude::*;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
@ -11,44 +12,72 @@ use std::ops::RangeInclusive;
|
||||
// === Alphabet ===
|
||||
// ================
|
||||
|
||||
/// An alphabet describes a set of all the valid input symbols that a given finite state automata
|
||||
/// (NFA or DFA) can operate over.
|
||||
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
|
||||
/// The alphabet is meant to be represented as an interval. That is, if `a` and `b` are in alphabet,
|
||||
/// then any symbol from `a..=b` is in alphabet too.
|
||||
/// A representation of the distinct intervals over the input alphabet for a given finite state
|
||||
/// automaton.
|
||||
///
|
||||
/// These intervals are defined by a set of _divisions_ of the input alphabet, where each division
|
||||
/// is represented as a point in that alphabet. This is necessary to allow for efficient encoding of
|
||||
/// state transitions that trigger not just on _one_, but potentially on _many_ of the input
|
||||
/// symbols in the automaton's alphabet.
|
||||
///
|
||||
/// This is best explained by way of example. Consider the original unbounded alphabet:
|
||||
///
|
||||
/// ```text
|
||||
/// ... a b c d e f g h ... z ...
|
||||
/// ```
|
||||
///
|
||||
/// We want to add a rule that matches on the interval `[b, d]`. This results in there being three
|
||||
/// intervals on the alphabet, as there are two divisions (annotated below):
|
||||
///
|
||||
/// ```text
|
||||
/// ... a | b c d | e f g h ... z ...
|
||||
/// div: 1 2
|
||||
/// seg: 1 2 3
|
||||
/// ```
|
||||
///
|
||||
/// If we then add a rule that matches on the interval `[d, f]`, we end up with five intervals on
|
||||
/// the alphabet, with four divisions (annotated below):
|
||||
///
|
||||
/// ```text
|
||||
/// ... a | b c | d | e f | g h ... z ...
|
||||
/// div: 1 2 3 4
|
||||
/// seg: 1 2 3 4 5
|
||||
/// ```
|
||||
///
|
||||
/// This type tracks these divisions explicitly for an input alphabet defined for all automata in
|
||||
/// this library as `0u32..=u32::max_value()`.
|
||||
#[derive(Clone,Debug,PartialEq,Eq)]
|
||||
pub struct Alphabet {
|
||||
/// The interval of all valid input symbols. The interval is further divided into subintervals
|
||||
/// (i.e. `[a,z,A,Z]` should be understood as `[a..=z,z..=A,A..=Z]`), in order to efficiently
|
||||
/// encode state transitions that trigger not just on one but a whole range of symbols
|
||||
/// (i.e. `a..=z`)
|
||||
pub symbols: BTreeSet<Symbol>
|
||||
#[allow(missing_docs)]
|
||||
pub struct Segmentation {
|
||||
pub divisions: BTreeSet<Symbol>
|
||||
}
|
||||
|
||||
impl Default for Alphabet {
|
||||
fn default() -> Self {
|
||||
Alphabet {symbols:[Symbol{val:0}].iter().cloned().collect()}
|
||||
}
|
||||
}
|
||||
|
||||
impl Alphabet {
|
||||
impl Segmentation {
|
||||
/// Inserts a range of symbols into the alphabet.
|
||||
pub fn insert(&mut self, range:RangeInclusive<Symbol>) {
|
||||
// The symbol range is associated with transition in automata. Therefore we:
|
||||
// Mark the symbol with the new transition.
|
||||
self.symbols.insert(Symbol{val:range.start().val});
|
||||
// Mark the symbol without the new transition.
|
||||
self.symbols.insert(Symbol{val:range.end().val + 1});
|
||||
// This way each symbol in alphabet corresponds to a unique set of transitions.
|
||||
self.divisions.insert(Symbol::from(range.start()));
|
||||
self.divisions.insert(Symbol{val:range.end().val + 1});
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u32>> for Alphabet {
|
||||
fn from(vec:Vec<u32>) -> Self {
|
||||
/// Creates a [`Segmentation`] from an input set of divisions.
|
||||
pub fn from_divisions(divisions:&[u32]) -> Self {
|
||||
let mut dict = Self::default();
|
||||
for val in vec {
|
||||
dict.symbols.insert(Symbol{val});
|
||||
for val in divisions {
|
||||
dict.divisions.insert(Symbol::from(*val));
|
||||
}
|
||||
dict
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// === Trait Impls ===
|
||||
|
||||
impl Default for Segmentation {
|
||||
fn default() -> Self {
|
||||
let mut divisions: BTreeSet<Symbol> = default();
|
||||
// The existence of the default (0) member in the set is assumed by the implementation of
|
||||
// the NFA -> DFA conversion.
|
||||
divisions.insert(default());
|
||||
Segmentation { divisions }
|
||||
}
|
||||
}
|
||||
|
@ -1,61 +1,59 @@
|
||||
//! Exports the structure for Deterministic Finite Automata.
|
||||
//! The structure for defining deterministic finite automata.
|
||||
|
||||
use crate::automata::alphabet::Alphabet;
|
||||
use crate::automata::alphabet;
|
||||
use crate::automata::state;
|
||||
use crate::data::matrix::Matrix;
|
||||
|
||||
|
||||
|
||||
// =====================================
|
||||
// === Deterministic Finite Automata ===
|
||||
// =====================================
|
||||
|
||||
/// Function callback for an arbitrary state of finite automata.
|
||||
/// It contains name of Rust procedure that is meant to be executed after encountering a pattern
|
||||
/// (declared in `group::Rule.pattern`).
|
||||
#[derive(Clone,Debug,PartialEq,Eq)]
|
||||
pub struct Callback {
|
||||
/// TODO[jv] Write better explanation after implementing rust code generation.
|
||||
/// Priority is used during rust code generation.
|
||||
pub priority: usize,
|
||||
/// Name of Rust method that will be called when executing this callback.
|
||||
pub name: String,
|
||||
}
|
||||
|
||||
/// DFA automata with a set of symbols, states and transitions.
|
||||
/// Deterministic Finite Automata is a finite-state machine that accepts or rejects a given sequence
|
||||
/// of symbols, by running through a state sequence uniquely determined by the input symbol sequence.
|
||||
/// ___ ___ ___ ___
|
||||
/// | 0 | -- 'D' --> | 1 | -- 'F' --> | 2 | -- 'A' --> | 3 |
|
||||
/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾
|
||||
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
|
||||
|
||||
#[derive(Clone,Debug,Default,PartialEq,Eq)]
|
||||
/// The definition of a [DFA](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) for a
|
||||
/// given set of symbols, states, and transitions.
|
||||
///
|
||||
/// A DFA is a finite state automaton that accepts or rejects a given sequence of symbols by
|
||||
/// executing on a sequence of states _uniquely_ determined by the sequence of input symbols.
|
||||
///
|
||||
/// ```text
|
||||
/// ┌───┐ 'D' ┌───┐ 'F' ┌───┐ 'A' ┌───┐
|
||||
/// │ 0 │──────▶│ 1 │──────▶│ 2 │──────▶│ 3 │
|
||||
/// └───┘ └───┘ └───┘ └───┘
|
||||
/// ```
|
||||
#[derive(Clone,Debug,Default,Eq,PartialEq)]
|
||||
pub struct DFA {
|
||||
/// Finite set of all valid input symbols.
|
||||
pub alphabet: Alphabet,
|
||||
/// Transition matrix of deterministic finite state automata.
|
||||
/// It contains next state for each pair of state and input symbol - (state,symbol) => new state.
|
||||
/// For example, a transition matrix for automata that accepts string "ABABAB...." would look
|
||||
/// like this:
|
||||
/// states
|
||||
/// | | A | B | <- symbols
|
||||
/// | 0 | 1 | - |
|
||||
/// | 1 | - | 0 |
|
||||
/// Where `-` denotes `state::INVALID`.
|
||||
pub links: Matrix<state::Id>,
|
||||
/// Stores callback for each state (if it has one).
|
||||
pub callbacks: Vec<Option<Callback>>,
|
||||
/// A set of disjoint intervals over the allowable input alphabet.
|
||||
pub alphabet_segmentation: alphabet::Segmentation,
|
||||
/// The transition matrix for the DFA.
|
||||
///
|
||||
/// It represents a function of type `(state, symbol) -> state`, returning the identifier for
|
||||
/// the new state.
|
||||
///
|
||||
/// For example, the transition matrix for an automaton that accepts the language
|
||||
/// `("AB")*` (i.e. `ABABAB...`) would appear as follows, with `-` denoting
|
||||
/// [the invalid state](state::INVALID). The leftmost column encodes the input state, while the
|
||||
/// topmost row encodes the input symbols.
|
||||
///
|
||||
/// | | A | B |
|
||||
/// |:-:|:-:|:-:|
|
||||
/// | 0 | 1 | - |
|
||||
/// | 1 | - | 0 |
|
||||
///
|
||||
pub links: Matrix<state::Identifier>,
|
||||
/// A collection of callbacks for each state (indexable in order)
|
||||
pub callbacks: Vec<Option<RuleExecutable>>,
|
||||
}
|
||||
|
||||
impl From<Vec<Vec<usize>>> for Matrix<state::Id> {
|
||||
|
||||
// === Trait Impls ===
|
||||
|
||||
impl From<Vec<Vec<usize>>> for Matrix<state::Identifier> {
|
||||
fn from(input:Vec<Vec<usize>>) -> Self {
|
||||
let rows = input.len();
|
||||
let columns = if rows == 0 {0} else {input[0].len()};
|
||||
let mut matrix = Self::new(rows,columns);
|
||||
for row in 0..rows {
|
||||
for column in 0..columns {
|
||||
matrix[(row,column)] = state::Id{id:input[row][column]};
|
||||
matrix[(row,column)] = state::Identifier::from(input[row][column]);
|
||||
}
|
||||
}
|
||||
matrix
|
||||
@ -64,25 +62,45 @@ impl From<Vec<Vec<usize>>> for Matrix<state::Id> {
|
||||
|
||||
|
||||
|
||||
// ===========
|
||||
// == Tests ==
|
||||
// ===========
|
||||
// ================
|
||||
// === Callback ===
|
||||
// ================
|
||||
|
||||
/// The callback associated with an arbitrary state of a finite automaton.
|
||||
///
|
||||
/// It contains the rust code that is intended to be executed after encountering a
|
||||
/// [`pattern`](super::pattern::Pattern) that causes the associated state transition. This pattern
|
||||
/// is declared in [`Rule.pattern`](crate::group::rule::Rule::pattern).
|
||||
#[derive(Clone,Debug,PartialEq,Eq)]
|
||||
pub struct RuleExecutable {
|
||||
/// A description of the priority with which the callback is constructed during codegen.
|
||||
pub priority: usize,
|
||||
/// The rust code that will be executed when running this callback.
|
||||
pub code: String,
|
||||
}
|
||||
|
||||
|
||||
|
||||
// =============
|
||||
// === Tests ===
|
||||
// =============
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use super::*;
|
||||
use crate::automata::state;
|
||||
|
||||
const I:usize = state::INVALID.id;
|
||||
use super::*;
|
||||
|
||||
const INVALID:usize = state::Identifier::INVALID.id;
|
||||
|
||||
/// DFA automata that accepts newline '\n'.
|
||||
pub fn newline() -> DFA {
|
||||
DFA {
|
||||
alphabet: Alphabet::from(vec![10,11]),
|
||||
links: Matrix::from(vec![vec![I,1,I], vec![I,I,I]]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(&[10,11]),
|
||||
links: Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
|
||||
callbacks: vec![
|
||||
None,
|
||||
Some(Callback{priority:2,name:"group0_rule0".into()}),
|
||||
Some(RuleExecutable {priority:2, code:"group0_rule0".into()}),
|
||||
],
|
||||
}
|
||||
}
|
||||
@ -90,11 +108,11 @@ pub mod tests {
|
||||
/// DFA automata that accepts any letter a..=z.
|
||||
pub fn letter() -> DFA {
|
||||
DFA {
|
||||
alphabet: Alphabet::from(vec![97,123]),
|
||||
links: Matrix::from(vec![vec![I,1,I], vec![I,I,I]]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(&[97,123]),
|
||||
links: Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
|
||||
callbacks: vec![
|
||||
None,
|
||||
Some(Callback{priority:2,name:"group0_rule0".into()}),
|
||||
Some(RuleExecutable {priority:2, code:"group0_rule0".into()}),
|
||||
],
|
||||
}
|
||||
}
|
||||
@ -102,16 +120,16 @@ pub mod tests {
|
||||
/// DFA automata that accepts any number of spaces ' '.
|
||||
pub fn spaces() -> DFA {
|
||||
DFA {
|
||||
alphabet: Alphabet::from(vec![0,32,33]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(&[0,32,33]),
|
||||
links: Matrix::from(vec![
|
||||
vec![I,1,I],
|
||||
vec![I,2,I],
|
||||
vec![I,2,I],
|
||||
vec![INVALID,1,INVALID],
|
||||
vec![INVALID,2,INVALID],
|
||||
vec![INVALID,2,INVALID],
|
||||
]),
|
||||
callbacks: vec![
|
||||
None,
|
||||
Some(Callback{priority:3,name:"group0_rule0".into()}),
|
||||
Some(Callback{priority:3,name:"group0_rule0".into()}),
|
||||
Some(RuleExecutable {priority:3, code:"group0_rule0".into()}),
|
||||
Some(RuleExecutable {priority:3, code:"group0_rule0".into()}),
|
||||
],
|
||||
}
|
||||
}
|
||||
@ -119,18 +137,18 @@ pub mod tests {
|
||||
/// DFA automata that accepts one letter a..=z or many spaces.
|
||||
pub fn letter_and_spaces() -> DFA {
|
||||
DFA {
|
||||
alphabet: Alphabet::from(vec![32,33,97,123]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(&[32,33,97,123]),
|
||||
links: Matrix::from(vec![
|
||||
vec![I,1,I,2,I],
|
||||
vec![I,3,I,I,I],
|
||||
vec![I,I,I,I,I],
|
||||
vec![I,3,I,I,I],
|
||||
vec![INVALID,1,INVALID,2,INVALID],
|
||||
vec![INVALID,3,INVALID,INVALID,INVALID],
|
||||
vec![INVALID,INVALID,INVALID,INVALID,INVALID],
|
||||
vec![INVALID,3,INVALID,INVALID,INVALID],
|
||||
]),
|
||||
callbacks: vec![
|
||||
None,
|
||||
Some(Callback{priority:4,name:"group0_rule1".into()}),
|
||||
Some(Callback{priority:4,name:"group0_rule0".into()}),
|
||||
Some(Callback{priority:4,name:"group0_rule1".into()}),
|
||||
Some(RuleExecutable {priority:4, code:"group0_rule1".into()}),
|
||||
Some(RuleExecutable {priority:4, code:"group0_rule0".into()}),
|
||||
Some(RuleExecutable {priority:4, code:"group0_rule1".into()}),
|
||||
],
|
||||
}
|
||||
}
|
||||
|
@ -1,80 +1,93 @@
|
||||
//! Implementation of Nondeterministic Finite Automata and it's conversion to DFA.
|
||||
//! The structure for defining non-deterministic finite automata.
|
||||
|
||||
use crate::automata::alphabet::Alphabet;
|
||||
use crate::automata::alphabet;
|
||||
use crate::automata::dfa::RuleExecutable;
|
||||
use crate::automata::dfa::DFA;
|
||||
use crate::automata::dfa::Callback;
|
||||
use crate::automata::state::Link;
|
||||
use crate::automata::state::Symbol;
|
||||
use crate::automata::pattern::Pattern;
|
||||
use crate::automata::state::State;
|
||||
use crate::automata::state::Transition;
|
||||
use crate::automata::state;
|
||||
use crate::automata::symbol::Symbol;
|
||||
use crate::data::matrix::Matrix;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::RangeInclusive;
|
||||
use crate::automata::pattern::Pattern;
|
||||
use itertools::Itertools;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use crate::prelude::*;
|
||||
|
||||
|
||||
// ========================================
|
||||
// === Nondeterministic Finite Automata ===
|
||||
// ========================================
|
||||
|
||||
/// Type alias for a state Id based on set of states.
|
||||
/// It is used during NFA -> DFA transformation where multiple states can merge together,
|
||||
/// thanks to epsilon links.
|
||||
type StateSetId = BTreeSet<state::Id>;
|
||||
// =========================================
|
||||
// === Non-Deterministic Finite Automata ===
|
||||
// =========================================
|
||||
|
||||
/// NFA automata with a set of symbols, states and transitions.
|
||||
/// Nondeterministic Finite Automata is a finite-state machine that accepts or rejects a given
|
||||
/// sequence of symbols.
|
||||
/// Compared to `DFA`, NFA can transition into multiple new states without reading any symbol
|
||||
/// (so called epsilon link / transition),
|
||||
/// ___ ___ ___ ___ ___
|
||||
/// | 0 | -- 'N' --> | 1 | ----> | 2 | -- 'F' --> | 3 | -- 'A' --> | 4 |
|
||||
/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾
|
||||
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
|
||||
/// A state identifier based on a set of states.
|
||||
///
|
||||
/// This is used during the NFA -> DFA transformation, where multiple states can merge together due
|
||||
/// to the collapsing of epsilon transitions.
|
||||
type StateSetId = BTreeSet<state::Identifier>;
|
||||
|
||||
/// The definition of a [NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) for a
|
||||
/// given set of symbols, states, and transitions (specifically a NFA with ε-moves).
|
||||
///
|
||||
/// A NFA is a finite state automaton that accepts or rejects a given sequence of symbols. In
|
||||
/// contrast with a DFA, the NFA may transition between states _without_ reading any new symbol
|
||||
/// through use of
|
||||
/// [epsilon links](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton#NFA_with_%CE%B5-moves).
|
||||
///
|
||||
/// ```text
|
||||
/// ┌───┐ 'N' ┌───┐ ┌───┐ 'F' ┌───┐ ┌───┐ 'A' ┌───┐
|
||||
/// │ 0 │──────▶│ 1 │──▶│ 2 │──────▶│ 3 │──▶│ 4 │──────▶│ 5 │
|
||||
/// └───┘ └───┘ ε └───┘ └───┘ ε └───┘ └───┘
|
||||
/// ```
|
||||
#[derive(Clone,Debug,Default,PartialEq,Eq)]
|
||||
pub struct NFA {
|
||||
/// Finite set of all valid input symbols.
|
||||
pub alphabet: Alphabet,
|
||||
/// Set of named NFA states with (epsilon) transitions.
|
||||
/// A set of disjoint intervals over the input alphabet.
|
||||
pub alphabet_segmentation: alphabet::Segmentation,
|
||||
/// A set of named NFA states, with (epsilon) transitions.
|
||||
pub states: Vec<State>,
|
||||
}
|
||||
|
||||
impl NFA {
|
||||
/// Adds a new state to NFA and returns it's Id.
|
||||
pub fn new_state(&mut self) -> state::Id {
|
||||
/// Adds a new state to the NFA and returns its identifier.
|
||||
pub fn new_state(&mut self) -> state::Identifier {
|
||||
let id = self.states.len();
|
||||
self.states.push(State::default());
|
||||
state::Id {id}
|
||||
state::Identifier {id}
|
||||
}
|
||||
|
||||
/// Creates an epsilon transition between two states.
|
||||
/// Whenever the automata happens to be in `source` state it can immediatelly move to
|
||||
/// `target` state (but does not have to).
|
||||
pub fn connect(&mut self, source:state::Id, target:state::Id) {
|
||||
///
|
||||
/// Whenever the automaton happens to be in `source` state it can immediately transition to the
|
||||
/// `target` state. It is, however, not _required_ to do so.
|
||||
pub fn connect(&mut self, source:state::Identifier, target:state::Identifier) {
|
||||
self.states[source.id].epsilon_links.push(target);
|
||||
}
|
||||
|
||||
/// Creates an ordinary transition (for a range of symbols) between two states.
|
||||
/// If any symbol from such range happens to be on input when the automata is in `source`
|
||||
/// state, it will immediatelly move to `target` state.
|
||||
pub fn connect_by
|
||||
(&mut self, source:state::Id, target:state::Id, symbols:&RangeInclusive<Symbol>) {
|
||||
self.alphabet.insert(symbols.clone());
|
||||
self.states[source.id].links.push(Link{symbols:symbols.clone(), target});
|
||||
/// Creates an ordinary transition for a range of symbols.
|
||||
///
|
||||
/// If any symbol from such range happens to be the input when the automaton is in the `source`
|
||||
/// state, it will immediately transition to the `target` state.
|
||||
pub fn connect_via
|
||||
( &mut self
|
||||
, source:state::Identifier
|
||||
, target_state:state::Identifier
|
||||
, symbols:&RangeInclusive<Symbol>) {
|
||||
self.alphabet_segmentation.insert(symbols.clone());
|
||||
self.states[source.id].links.push(Transition{symbols:symbols.clone(), target_state});
|
||||
}
|
||||
|
||||
/// Transforms pattern to NFA.
|
||||
/// The algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI
|
||||
pub fn new_pattern(&mut self, source:state::Id, pattern:&Pattern) -> state::Id {
|
||||
/// Transforms a pattern to an NFA using the algorithm described
|
||||
/// [here](https://www.youtube.com/watch?v=RYNN-tb9WxI).
|
||||
pub fn new_pattern(&mut self, source:state::Identifier, pattern:&Pattern) -> state::Identifier {
|
||||
let current = self.new_state();
|
||||
self.connect(source,current);
|
||||
match pattern {
|
||||
Pattern::Range(range) => {
|
||||
let state = self.new_state();
|
||||
self.connect_by(current,state,range);
|
||||
self.connect_via(current,state,range);
|
||||
state
|
||||
},
|
||||
Pattern::Many(body) => {
|
||||
@ -101,18 +114,15 @@ impl NFA {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// === NFA -> DFA ===
|
||||
|
||||
/// Merges states that are connected by epsilon links.
|
||||
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao
|
||||
/// Merges states that are connected by epsilon links, using an algorithm based on the one shown
|
||||
/// [here](https://www.youtube.com/watch?v=taClnxU-nao).
|
||||
fn eps_matrix(&self) -> Vec<StateSetId> {
|
||||
fn fill_eps_matrix
|
||||
( nfa : &NFA
|
||||
, states : &mut Vec<StateSetId>
|
||||
, computed : &mut Vec<bool>
|
||||
, visited : &mut Vec<bool>
|
||||
, state : state::Id
|
||||
, state : state::Identifier
|
||||
) {
|
||||
let mut state_set = StateSetId::new();
|
||||
let mut circular = false;
|
||||
@ -138,18 +148,17 @@ impl NFA {
|
||||
let mut computed = vec![false; self.states.len()];
|
||||
for id in 0..self.states.len() {
|
||||
let mut visited = vec![false; states.len()];
|
||||
fill_eps_matrix(self,&mut states,&mut computed,&mut visited,state::Id{id});
|
||||
fill_eps_matrix(self,&mut states,&mut computed,&mut visited,state::Identifier{id});
|
||||
}
|
||||
states
|
||||
}
|
||||
|
||||
/// Computes a transition matrix (state X symbol => state) for NFA.
|
||||
/// Ignores epsilon links.
|
||||
fn nfa_matrix(&self) -> Matrix<state::Id> {
|
||||
let mut matrix = Matrix::new(self.states.len(),self.alphabet.symbols.len());
|
||||
/// Computes a transition matrix `(state, symbol) => state` for the NFA, ignoring epsilon links.
|
||||
fn nfa_matrix(&self) -> Matrix<state::Identifier> {
|
||||
let mut matrix = Matrix::new(self.states.len(),self.alphabet_segmentation.divisions.len());
|
||||
|
||||
for (state_ix, source) in self.states.iter().enumerate() {
|
||||
let targets = source.targets(&self.alphabet);
|
||||
let targets = source.targets(&self.alphabet_segmentation);
|
||||
for (voc_ix, &target) in targets.iter().enumerate() {
|
||||
matrix[(state_ix,voc_ix)] = target;
|
||||
}
|
||||
@ -159,26 +168,27 @@ impl NFA {
|
||||
}
|
||||
|
||||
impl From<&NFA> for DFA {
|
||||
/// Transforms NFA into DFA.
|
||||
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao
|
||||
|
||||
/// Transforms an NFA into a DFA, based on the algorithm described
|
||||
/// [here](https://www.youtube.com/watch?v=taClnxU-nao).
|
||||
fn from(nfa:&NFA) -> Self {
|
||||
let nfa_mat = nfa.nfa_matrix();
|
||||
let eps_mat = nfa.eps_matrix();
|
||||
let mut dfa_mat = Matrix::new(0,nfa.alphabet.symbols.len());
|
||||
let mut dfa_mat = Matrix::new(0,nfa.alphabet_segmentation.divisions.len());
|
||||
let mut dfa_eps_ixs = Vec::<StateSetId>::new();
|
||||
let mut dfa_eps_map = HashMap::<StateSetId,state::Id>::new();
|
||||
let mut dfa_eps_map = HashMap::<StateSetId,state::Identifier>::new();
|
||||
|
||||
dfa_eps_ixs.push(eps_mat[0].clone());
|
||||
dfa_eps_map.insert(eps_mat[0].clone(), state::Id{id:0});
|
||||
dfa_eps_map.insert(eps_mat[0].clone(),state::Identifier::from(0));
|
||||
|
||||
let mut i = 0;
|
||||
while i < dfa_eps_ixs.len() {
|
||||
dfa_mat.new_row();
|
||||
for voc_ix in 0..nfa.alphabet.symbols.len() {
|
||||
for voc_ix in 0..nfa.alphabet_segmentation.divisions.len() {
|
||||
let mut eps_set = StateSetId::new();
|
||||
for &eps_ix in &dfa_eps_ixs[i] {
|
||||
let tgt = nfa_mat[(eps_ix.id,voc_ix)];
|
||||
if tgt != state::INVALID {
|
||||
if tgt != state::Identifier::INVALID {
|
||||
eps_set.extend(eps_mat[tgt.id].iter());
|
||||
}
|
||||
}
|
||||
@ -186,7 +196,7 @@ impl From<&NFA> for DFA {
|
||||
dfa_mat[(i,voc_ix)] = match dfa_eps_map.get(&eps_set) {
|
||||
Some(&id) => id,
|
||||
None => {
|
||||
let id = state::Id {id:dfa_eps_ixs.len()};
|
||||
let id = state::Identifier {id:dfa_eps_ixs.len()};
|
||||
dfa_eps_ixs.push(eps_set.clone());
|
||||
dfa_eps_map.insert(eps_set,id);
|
||||
id
|
||||
@ -200,14 +210,17 @@ impl From<&NFA> for DFA {
|
||||
let mut callbacks = vec![None; dfa_eps_ixs.len()];
|
||||
let priority = dfa_eps_ixs.len();
|
||||
for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() {
|
||||
let has_name = |&key:&state::Id| nfa.states[key.id].name.is_some();
|
||||
let has_name = |&key:&state::Identifier| nfa.states[key.id].name.is_some();
|
||||
if let Some(eps) = epss.into_iter().find(has_name) {
|
||||
let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap();
|
||||
callbacks[dfa_ix] = Some(Callback {name:rule,priority});
|
||||
let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap();
|
||||
callbacks[dfa_ix] = Some(RuleExecutable{ code:rule,priority});
|
||||
}
|
||||
}
|
||||
|
||||
DFA {alphabet:nfa.alphabet.clone(),links:dfa_mat,callbacks}
|
||||
let alphabet_segmentation = nfa.alphabet_segmentation.clone();
|
||||
let links = dfa_mat;
|
||||
|
||||
DFA{alphabet_segmentation,links,callbacks}
|
||||
}
|
||||
}
|
||||
|
||||
@ -218,13 +231,13 @@ impl From<&NFA> for DFA {
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
extern crate test;
|
||||
|
||||
|
||||
use crate::automata::dfa;
|
||||
|
||||
use super::*;
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
|
||||
/// NFA automata that accepts newline '\n'.
|
||||
/// NFA that accepts a newline '\n'.
|
||||
pub fn newline() -> NFA {
|
||||
NFA {
|
||||
states: vec![
|
||||
@ -233,11 +246,11 @@ pub mod tests {
|
||||
State::from(vec![3]).named("group0_rule0"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet: Alphabet::from(vec![10,11]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![10, 11].as_slice()),
|
||||
}
|
||||
}
|
||||
|
||||
/// NFA automata that accepts any letter a..=z.
|
||||
/// NFA that accepts any letter in the range a..=z.
|
||||
pub fn letter() -> NFA {
|
||||
NFA {
|
||||
states: vec![
|
||||
@ -246,11 +259,11 @@ pub mod tests {
|
||||
State::from(vec![3]).named("group0_rule0"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet: Alphabet::from(vec![97,123]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![97, 123].as_slice()),
|
||||
}
|
||||
}
|
||||
|
||||
/// NFA automata that accepts any number of spaces ' '.
|
||||
/// NFA that accepts any number of spaces ' '.
|
||||
pub fn spaces() -> NFA {
|
||||
NFA {
|
||||
states: vec![
|
||||
@ -265,11 +278,11 @@ pub mod tests {
|
||||
State::from(vec![5,9]).named("group0_rule0"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet: Alphabet::from(vec![0,32,33]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![0, 32, 33].as_slice()),
|
||||
}
|
||||
}
|
||||
|
||||
/// NFA automata that accepts one letter a..=z or many spaces ' '.
|
||||
/// NFA that accepts one letter a..=z or many spaces ' '.
|
||||
pub fn letter_and_spaces() -> NFA {
|
||||
NFA {
|
||||
states: vec![
|
||||
@ -286,10 +299,10 @@ pub mod tests {
|
||||
State::from(vec![7,11]).named("group0_rule1"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet: Alphabet::from(vec![32,33,97,123]),
|
||||
alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![32, 33, 97, 123].as_slice()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_to_dfa_newline() {
|
||||
assert_eq!(DFA::from(&newline()),dfa::tests::newline());
|
||||
|
@ -1,7 +1,6 @@
|
||||
//! Simple API for constructing regex patterns that are used in parser implementation.
|
||||
|
||||
use crate::parser;
|
||||
use crate::automata::state::Symbol;
|
||||
use crate::automata::symbol::Symbol;
|
||||
|
||||
use core::iter;
|
||||
use itertools::Itertools;
|
||||
@ -9,25 +8,127 @@ use std::ops::BitAnd;
|
||||
use std::ops::BitOr;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use Pattern::*;
|
||||
|
||||
|
||||
|
||||
// =============
|
||||
// == Pattern ==
|
||||
// =============
|
||||
|
||||
/// Simple regex pattern.
|
||||
/// A representation of a simple regular pattern.
|
||||
#[derive(Clone,Debug)]
|
||||
pub enum Pattern {
|
||||
/// Pattern that triggers on any symbol from given range.
|
||||
/// The pattern that triggers on any symbol from the given range.
|
||||
Range(RangeInclusive<Symbol>),
|
||||
/// Pattern that triggers on any given pattern from sequence.
|
||||
/// The pattern that triggers on any given pattern from a sequence.
|
||||
Or(Vec<Pattern>),
|
||||
/// Pattern that triggers when a sequence of patterns is encountered.
|
||||
/// The pattern that triggers when a sequence of patterns is encountered.
|
||||
And(Vec<Pattern>),
|
||||
/// Pattern that triggers on 0..N repetitions of given pattern.
|
||||
/// The pattern that triggers on 0..N repetitions of given pattern.
|
||||
Many(Box<Pattern>)
|
||||
}
|
||||
|
||||
use Pattern::*;
|
||||
impl Pattern {
|
||||
|
||||
/// A pattern that never triggers.
|
||||
pub fn never() -> Self {
|
||||
Pattern::symbols(Symbol::from(1)..=Symbol::from(0))
|
||||
}
|
||||
|
||||
/// A pattern that always triggers.
|
||||
pub fn always() -> Self {
|
||||
Pattern::symbols(Symbol::from(u32::min_value())..=Symbol::from(u32::max_value()))
|
||||
}
|
||||
|
||||
/// A pattern that triggers on any character.
|
||||
pub fn any_char() -> Self {
|
||||
Pattern::symbols(Symbol::from(0)..=Symbol::from(u32::max_value()))
|
||||
}
|
||||
|
||||
/// A pattern that triggers on 0..N repetitions of the pattern described by `self`.
|
||||
pub fn many(self) -> Self {
|
||||
Many(Box::new(self))
|
||||
}
|
||||
|
||||
/// A pattern that triggers on 1..N repetitions of the pattern described by `self`.
|
||||
pub fn many1(self) -> Self {
|
||||
self.clone() & self.many()
|
||||
}
|
||||
|
||||
/// A pattern that triggers on 0..=1 repetitions of the pattern described by `self`.
|
||||
pub fn opt(self) -> Self {
|
||||
self | Self::always()
|
||||
}
|
||||
|
||||
/// A pattern that triggers on the given character.
|
||||
pub fn char(character:char) -> Self {
|
||||
Self::symbol(Symbol::from(character))
|
||||
}
|
||||
|
||||
/// A pattern that triggers on the given symbol.
|
||||
pub fn symbol(symbol:Symbol) -> Self {
|
||||
Pattern::symbols(symbol..=symbol)
|
||||
}
|
||||
|
||||
/// A pattern that triggers on any of the provided `symbols`.
|
||||
pub fn symbols(symbols:RangeInclusive<Symbol>) -> Self {
|
||||
Pattern::Range(symbols)
|
||||
}
|
||||
|
||||
/// A pattern that triggers at the end of the file.
|
||||
pub fn eof() -> Self {
|
||||
Self::symbol(Symbol::EOF_CODE)
|
||||
}
|
||||
|
||||
/// A pattern that triggers on any character in the provided `range`.
|
||||
pub fn range(range:RangeInclusive<char>) -> Self {
|
||||
Pattern::symbols(Symbol::from(*range.start())..=Symbol::from(*range.end()))
|
||||
}
|
||||
|
||||
/// Pattern that triggers when sequence of characters given by `chars` is encountered.
|
||||
pub fn all(chars:&str) -> Self {
|
||||
chars.chars().fold(Self::never(), |pat,char| pat & Self::char(char))
|
||||
}
|
||||
|
||||
/// The pattern that triggers on any characters contained in `chars`.
|
||||
pub fn any(chars:&str) -> Self {
|
||||
chars.chars().fold(Self::never(), |pat,char| pat | Self::char(char))
|
||||
}
|
||||
|
||||
/// The pattern that doesn't trigger on any character contained in `chars`.
|
||||
pub fn none(chars:&str) -> Self {
|
||||
let max = u32::max_value();
|
||||
let char_iter = chars.chars().map(|char| char as u32);
|
||||
let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
|
||||
let mut codes = char_iter2.collect_vec();
|
||||
|
||||
codes.sort();
|
||||
codes.iter().tuple_windows().fold(Self::never(), |pat,(start,end)| {
|
||||
if end < start {pat} else {
|
||||
pat | Pattern::symbols(Symbol::from(*start)..=Symbol::from(*end))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// The pattern that triggers on any character but `char`.
|
||||
pub fn not(char:char) -> Self {
|
||||
Self::none(&char.to_string())
|
||||
}
|
||||
|
||||
/// The pattern that triggers on `num` repetitions of `pat`.
|
||||
pub fn repeat(pat:Pattern, num:usize) -> Self {
|
||||
(0..num).fold(Self::always(), |p,_| p & pat.clone())
|
||||
}
|
||||
|
||||
/// Pattern that triggers on `min`..`max` repetitions of `pat`.
|
||||
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
|
||||
(min..max).fold(Self::never(), |p,n| p | Self::repeat(pat.clone(),n))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// === Trait Impls ===
|
||||
|
||||
impl BitOr<Pattern> for Pattern {
|
||||
type Output = Pattern;
|
||||
@ -52,101 +153,3 @@ impl BitAnd<Pattern> for Pattern {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Pattern {
|
||||
|
||||
/// Pattern that never triggers.
|
||||
pub fn never() -> Self {
|
||||
Pattern::symbols(1..=0)
|
||||
}
|
||||
|
||||
/// Pattern that always triggers.
|
||||
pub fn always() -> Self {
|
||||
Pattern::symbols(u32::min_value()..=u32::max_value())
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any char.
|
||||
pub fn any_char() -> Self {
|
||||
Pattern::symbols(0..=u32::max_value())
|
||||
}
|
||||
|
||||
/// Pattern that triggers on 0..N repetitions of given pattern.
|
||||
pub fn many(self) -> Self {
|
||||
Many(Box::new(self))
|
||||
}
|
||||
|
||||
/// Pattern that triggers on 1..N repetitions of given pattern.
|
||||
pub fn many1(self) -> Self {
|
||||
self.clone() & self.many()
|
||||
}
|
||||
|
||||
/// Pattern that triggers on 0..=1 repetitions of given pattern.
|
||||
pub fn opt(self) -> Self {
|
||||
self | Self::always()
|
||||
}
|
||||
|
||||
/// Pattern that triggers on given symbol
|
||||
pub fn symbol(symbol:u32) -> Self {
|
||||
Pattern::symbols(symbol..=symbol)
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any of the given symbols.
|
||||
pub fn symbols(symbols:RangeInclusive<u32>) -> Self {
|
||||
Pattern::Range(Symbol{val:*symbols.start()}..=Symbol{val:*symbols.end()})
|
||||
}
|
||||
|
||||
/// Pattern that triggers on end of file.
|
||||
pub fn eof() -> Self {
|
||||
Self::symbol(parser::EOF_CODE.val)
|
||||
}
|
||||
|
||||
/// Pattern that triggers on given character.
|
||||
pub fn char(char:char) -> Self {
|
||||
Self::symbol(char as u32)
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any of the given characters.
|
||||
pub fn range(chars:RangeInclusive<char>) -> Self {
|
||||
Pattern::symbols((*chars.start() as u32)..=(*chars.end() as u32))
|
||||
}
|
||||
|
||||
/// Pattern that triggers when sequence of characters is encountered.
|
||||
pub fn all(chars:&str) -> Self {
|
||||
chars.chars().fold(Self::never(), |pat,char| pat & Self::char(char))
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any characters from given sequence.
|
||||
pub fn any(chars:&str) -> Self {
|
||||
chars.chars().fold(Self::never(), |pat,char| pat | Self::char(char))
|
||||
}
|
||||
|
||||
/// Pattern that doesn't trigger on any given character from given sequence.
|
||||
pub fn none(chars:&str) -> Self {
|
||||
let max = u32::max_value();
|
||||
let char_iter = chars.chars().map(|char| char as u32);
|
||||
let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
|
||||
let mut codes = char_iter2.collect_vec();
|
||||
|
||||
codes.sort();
|
||||
codes.iter().tuple_windows().fold(Self::never(), |pat,(start,end)| {
|
||||
if end < start {pat} else {
|
||||
pat | Pattern::symbols(*start..=*end)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any character but the one given.
|
||||
pub fn not(char:char) -> Self {
|
||||
Self::none(&char.to_string())
|
||||
}
|
||||
|
||||
/// Pattern that triggers on N repetitions of given pattern.
|
||||
pub fn repeat(pat:Pattern, num:usize) -> Self {
|
||||
(0..num).fold(Self::always(), |p,_| p & pat.clone())
|
||||
}
|
||||
|
||||
/// Pattern that triggers on MIN..MAX repetitions of given pattern.
|
||||
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
|
||||
(min..max).fold(Self::never(), |p,n| p | Self::repeat(pat.clone(),n))
|
||||
}
|
||||
}
|
||||
|
@ -1,70 +1,31 @@
|
||||
//! This module exports State implementation for Nondeterministic Finite Automata.
|
||||
|
||||
use crate::automata::alphabet::Alphabet;
|
||||
use crate::automata::state;
|
||||
use crate::automata::alphabet;
|
||||
use crate::automata::symbol::Symbol;
|
||||
|
||||
use std::ops::RangeInclusive;
|
||||
use crate::prelude::*;
|
||||
|
||||
|
||||
|
||||
// =======================
|
||||
// == State Of Automata ==
|
||||
// =======================
|
||||
// ===========
|
||||
// == State ==
|
||||
// ===========
|
||||
|
||||
/// Flag for invalid state.
|
||||
/// When finite automata gets into invalid state the input sequence of symbols is rejected.
|
||||
pub const INVALID:Id = Id {id:usize::max_value()};
|
||||
|
||||
// TODO [AA] Extract this. Turn it into using `char
|
||||
/// Newtype wrapper for finite automata input symbol.
|
||||
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
|
||||
pub struct Symbol {
|
||||
#[allow(missing_docs)]
|
||||
pub val: u32
|
||||
}
|
||||
|
||||
// TODO [AA] Define some constants on char
|
||||
|
||||
/// Newtype wrapper for finite automata state ID.
|
||||
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
|
||||
pub struct Id {
|
||||
#[allow(missing_docs)]
|
||||
pub id: usize
|
||||
}
|
||||
|
||||
impl Default for Id {
|
||||
/// Returns state::INVALID. This is because every finite automata has an invalid state
|
||||
/// and because all transitions in automata transition matrix lead to invalid state by default.
|
||||
fn default() -> Self {
|
||||
state::INVALID
|
||||
}
|
||||
}
|
||||
|
||||
/// Named NFA state with a set of transitions (links).
|
||||
/// A named state for a [`super::nfa::NFA`].
|
||||
#[derive(Clone,Debug,Default,PartialEq,Eq)]
|
||||
pub struct State {
|
||||
/// Set of transitions that don't require any symbol to trigger.
|
||||
/// I.E. If there is an epsilon link from state A to state B, then whenever we are in state A,
|
||||
/// we can freely move to state B.
|
||||
pub epsilon_links: Vec<Id>,
|
||||
/// Set of transitions that trigger with specific symbol on input.
|
||||
/// When triggered, the automata will transition to the `link.target`.
|
||||
pub links: Vec<Link>,
|
||||
/// Name of the state.
|
||||
/// We use it to autogenerate a call to Rust method with same name.
|
||||
/// A set of transitions that can trigger without consuming a symbol (ε-transitions).
|
||||
pub epsilon_links: Vec<Identifier>,
|
||||
/// The set of transitions that trigger while consuming a specific symbol.
|
||||
///
|
||||
/// When triggered, the automaton will transition to the [`Transition::target_state`].
|
||||
pub links: Vec<Transition>,
|
||||
/// The name of the state.
|
||||
///
|
||||
/// This is used to auto-generate a call to the rust method of the same name.
|
||||
pub name: Option<String>,
|
||||
}
|
||||
|
||||
/// A transition to new automata state
|
||||
/// that requires specific symbol on automata input to trigger.
|
||||
#[derive(Clone,Debug,PartialEq,Eq)]
|
||||
pub struct Link {
|
||||
/// Any symbol from the range will trigger this link.
|
||||
pub symbols: RangeInclusive<Symbol>,
|
||||
/// A state that is visited, after the link is triggered.
|
||||
pub target: Id,
|
||||
}
|
||||
|
||||
impl State {
|
||||
/// Updater for field `name`. Returns updated state.
|
||||
pub fn named(mut self, name:&str) -> Self {
|
||||
@ -73,29 +34,32 @@ impl State {
|
||||
}
|
||||
|
||||
/// Returns transition (next state) for each symbol in alphabet.
|
||||
pub fn targets(&self, alphabet:&Alphabet) -> Vec<Id> {
|
||||
pub fn targets(&self, alphabet:&alphabet::Segmentation) -> Vec<Identifier> {
|
||||
let mut targets = vec![];
|
||||
let mut index = 0;
|
||||
let mut links = self.links.clone();
|
||||
links.sort_by_key(|link| *link.symbols.start());
|
||||
for &symbol in &alphabet.symbols {
|
||||
for &symbol in &alphabet.divisions {
|
||||
while links.len() > index && *links[index].symbols.end() < symbol {
|
||||
index += 1;
|
||||
}
|
||||
if links.len() <= index || *links[index].symbols.start() > symbol {
|
||||
targets.push(state::INVALID);
|
||||
targets.push(Identifier::INVALID);
|
||||
} else {
|
||||
targets.push(links[index].target);
|
||||
targets.push(links[index].target_state);
|
||||
}
|
||||
}
|
||||
targets
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// === Trait Impls ===
|
||||
|
||||
impl From<Vec<usize>> for State {
|
||||
/// Creates a state with epsilon links.
|
||||
fn from(vec:Vec<usize>) -> Self {
|
||||
let epsilon_links = vec.iter().cloned().map(|id| Id{id}).collect();
|
||||
let epsilon_links = vec.iter().cloned().map(|id| Identifier {id}).collect();
|
||||
State {epsilon_links,..Default::default()}
|
||||
}
|
||||
}
|
||||
@ -106,9 +70,60 @@ impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
|
||||
let link = |(range, id): (RangeInclusive<u32>, usize)| {
|
||||
let start = Symbol{val:*range.start()};
|
||||
let end = Symbol{val:*range.end()};
|
||||
Link {symbols: start..=end, target: Id{ id }}
|
||||
Transition {symbols: start..=end, target_state: Identifier { id }}
|
||||
};
|
||||
let links = vec.iter().cloned().map(link).collect();
|
||||
State {links,..Default::default()}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ================
|
||||
// == Identifier ==
|
||||
// ================
|
||||
|
||||
/// The identifier of a state in an arbitrary finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
#[allow(missing_docs)]
pub struct Identifier {
    pub id: usize
}

impl Identifier {
    /// The identifier that represents the invalid state.
    ///
    /// A finite automaton that is in the invalid state rejects its sequence of input symbols.
    pub const INVALID:Identifier = Identifier{id:usize::MAX};
}


// === Trait Impls ===

impl Default for Identifier {
    /// Returns [`Identifier::INVALID`]. This is because every finite automaton has an invalid
    /// state, and because all transitions in the automaton's transition matrix lead to the
    /// invalid state by default.
    fn default() -> Self {
        Self::INVALID
    }
}

impl From<usize> for Identifier {
    fn from(id:usize) -> Self {
        Self{id}
    }
}
|
||||
|
||||
|
||||
|
||||
// ==================
// === Transition ===
// ==================
|
||||
|
||||
/// A transition between states in a finite automaton that must consume a symbol to trigger.
|
||||
#[derive(Clone,Debug,PartialEq,Eq)]
|
||||
pub struct Transition {
|
||||
/// The range of symbols on which this transition will trigger.
|
||||
pub symbols: RangeInclusive<Symbol>,
|
||||
/// The state that is entered after the transition has triggered.
|
||||
pub target_state: Identifier,
|
||||
}
|
||||
|
49
lib/rust/flexer/src/automata/symbol.rs
Normal file
49
lib/rust/flexer/src/automata/symbol.rs
Normal file
@ -0,0 +1,49 @@
|
||||
//! Defines a Symbol that is operated on by the finite automata.
|
||||
|
||||
|
||||
|
||||
// ==============
|
||||
// === Symbol ===
|
||||
// ==============
|
||||
|
||||
/// A single input symbol that is consumed by a finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
pub struct Symbol {
    #[allow(missing_docs)]
    pub val: u32
}

impl Symbol {
    /// The symbol that represents the end of the file.
    pub const EOF_CODE:Symbol = Symbol{val:u32::MAX};

    /// The symbol that represents the null symbol.
    pub const NULL:Symbol = Symbol{val:0};
}


// === Trait Impls ===

impl Default for Symbol {
    /// The default symbol is [`Symbol::NULL`].
    fn default() -> Self {
        Self::NULL
    }
}

impl From<u32> for Symbol {
    fn from(val:u32) -> Symbol {
        Self{val}
    }
}

impl From<char> for Symbol {
    fn from(val:char) -> Symbol {
        Self{val:u32::from(val)}
    }
}

impl From<&Symbol> for Symbol {
    fn from(symb:&Symbol) -> Self {
        *symb
    }
}
|
@ -1,3 +1,3 @@
|
||||
//! Generic datastructures, with multiple usecases.
|
||||
//! Generic data-structures to support multiple use-cases.
|
||||
|
||||
pub mod matrix;
|
||||
pub mod matrix;
|
||||
|
@ -1,4 +1,6 @@
|
||||
//! Efficient representation of 2D matrix.
|
||||
//! An efficient representation of a 2D matrix.
|
||||
|
||||
use enso_prelude::default;
|
||||
|
||||
use std::ops::Index;
|
||||
use std::ops::IndexMut;
|
||||
@ -9,17 +11,38 @@ use std::ops::IndexMut;
|
||||
// == Matrix ==
|
||||
// ============
|
||||
|
||||
/// An efficient 2D matrix implemented on top of [`std::vec::Vec`].
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct Matrix<T> {
    /// The number of rows in the matrix.
    rows: usize,
    /// The number of columns in the matrix.
    columns: usize,
    /// The elements of the matrix, stored row after row in a single flat vector.
    matrix: Vec<T>,
}

impl<T:Default> Matrix<T> {
    /// Constructs a matrix with the dimensions given by `rows` and `columns`, in which every
    /// element is set to `T::default()`.
    ///
    /// # Panics
    ///
    /// Panics if `rows * columns` does not fit into a `usize`.
    pub fn new(rows:usize, columns:usize) -> Self {
        // The number of elements is derived from the requested dimensions, and not from
        // `Vec::capacity`: the capacity is only guaranteed to be _at least_ what was requested
        // (and is `usize::MAX` for zero-sized `T`), so it must not be used as the length.
        let size = rows.checked_mul(columns).expect("The matrix dimensions overflow `usize`.");
        let matrix:Vec<T> = std::iter::repeat_with(T::default).take(size).collect();
        Self{rows,columns,matrix}
    }

    /// Adds a new row to the matrix `self`, filled with default values.
    pub fn new_row(&mut self) {
        let new_row = std::iter::repeat_with(T::default).take(self.columns);
        self.matrix.extend(new_row);
        self.rows += 1;
    }
}
|
||||
|
||||
|
||||
// === Trait Impls ===
|
||||
|
||||
impl<T> Index<(usize,usize)> for Matrix<T> {
|
||||
type Output = T;
|
||||
@ -33,22 +56,3 @@ impl<T> IndexMut<(usize,usize)> for Matrix<T> {
|
||||
&mut self.matrix[index.0*self.columns+index.1]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T:Default> Matrix<T> {
|
||||
/// Constructs a new matrix for given number of rows and columns.
|
||||
pub fn new(rows:usize, columns:usize) -> Self {
|
||||
let mut matrix = Vec::with_capacity(rows*columns);
|
||||
for _ in 0..matrix.capacity() {
|
||||
matrix.push(Default::default())
|
||||
}
|
||||
Self{rows,columns,matrix}
|
||||
}
|
||||
|
||||
/// Adds a new row to matrix, filled with default values.
|
||||
pub fn new_row(&mut self) {
|
||||
for _ in 0..self.columns {
|
||||
self.matrix.push(Default::default());
|
||||
}
|
||||
self.rows += 1;
|
||||
}
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
//! This module exports API for grouping multiple rules (Rust callbacks with regex pattern) together.
|
||||
//! This module provides an API for grouping multiple flexer rules.
|
||||
|
||||
use crate::automata::pattern::Pattern;
|
||||
use crate::automata::nfa::NFA;
|
||||
@ -14,41 +14,59 @@ pub mod rule;
|
||||
// == Group ==
|
||||
// ===========
|
||||
|
||||
/// Struct that group rules together. It also inherits rules from parent group (if it has one).
|
||||
/// Groups are the basic building block of flexer:
|
||||
/// Flexer internally keeps a stack of groups, only one of them active at a time.
|
||||
/// Each group contains set of regex patterns and callbacks (together called `Rule`).
|
||||
/// Whenever a rule.pattern from active group is matched with part of input the associated
|
||||
/// rule.callback is executed, which in turn may exit the current groupor enter a new one.
|
||||
/// This allows us to nicely model a situation, where certain part of program (like a string literal)
|
||||
/// should have very different parsing rules than other (for example body of function).
|
||||
/// Note that the input is first matched with first added rule, then with the second etc.
|
||||
/// Therefore, if two rules overlap, only the callback of the first added rule will be executed.
|
||||
/// A group is a structure for associating multiple rules with each other, and is the basic building
|
||||
/// block of the flexer.
|
||||
///
|
||||
/// A group consists of the following:
|
||||
///
|
||||
/// - A set of [`Rule`s](Rule), each containing a regex pattern and associated callback.
|
||||
/// - Inherited rules from a parent group, if such a group exists.
|
||||
///
|
||||
/// Internally, the flexer maintains a stack of groups, where only one group can be active at any
|
||||
/// given time. Rules are matched _in order_, and hence overlaps are handled by the order in which
|
||||
/// the rules are matched, with the first callback being triggered.
|
||||
///
|
||||
/// Whenever a [`rule.pattern`](Rule::pattern) from the active group is matched against part of the
|
||||
/// input, the associated [`rule.callback`](Rule::callback) is executed. This callback may exit the
|
||||
/// current group or even enter a new one. As a result, groups allow us to elegantly model a
|
||||
/// situation where certain parts of a program (e.g. within a string literal) have very different
|
||||
/// lexing rules than other portions of a program (e.g. the body of a function).
|
||||
#[derive(Clone,Debug,Default)]
|
||||
pub struct Group {
|
||||
/// Unique ID.
|
||||
/// A unique identifier for the group.
|
||||
pub id: usize,
|
||||
/// Custom name which is used for debugging.
|
||||
/// A name for the group (useful in debugging).
|
||||
pub name: String,
|
||||
/// Parent which we inherit rules from.
|
||||
/// The parent group from which rules are inherited.
|
||||
pub parent: Option<Box<Group>>,
|
||||
/// Set of regex patterns with associated callbacks.
|
||||
/// A set of flexer rules.
|
||||
pub rules: Vec<Rule>,
|
||||
}
|
||||
|
||||
impl Group {
|
||||
/// Adds new rule (regex pattern with associated callback) to group.
|
||||
|
||||
/// Adds a new rule to the current group.
|
||||
pub fn add_rule(&mut self, rule:Rule) {
|
||||
self.rules.push(rule)
|
||||
}
|
||||
|
||||
/// Returns rule builder for given pattern.
|
||||
/// TODO[jv] better describe it's purpose once we agree on correct API.
|
||||
/// Returns a rule builder for the given pattern.
|
||||
pub fn rule(&mut self, pattern:Pattern) -> rule::Builder<impl FnMut(Rule) + '_> {
|
||||
rule::Builder{pattern,callback:move |rule| self.add_rule(rule)}
|
||||
rule::Builder{pattern, callback:move |rule| self.add_rule(rule)}
|
||||
}
|
||||
|
||||
/// All rules including parent rules.
|
||||
/// The canonical name for a given rule.
|
||||
fn callback_name(&self, rule_ix:usize) -> String {
|
||||
format!("group{}_rule{}", self.id, rule_ix)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// === Getters ===
|
||||
|
||||
impl Group {
|
||||
|
||||
/// The full set of rules, including parent rules.
|
||||
pub fn rules(&self) -> Vec<&Rule> {
|
||||
let mut parent = &self.parent;
|
||||
let mut rules = (&self.rules).iter().collect_vec();
|
||||
@ -58,16 +76,16 @@ impl Group {
|
||||
}
|
||||
rules
|
||||
}
|
||||
|
||||
/// Canonical name of given rule.
|
||||
fn callback_name(&self, rule_ix:usize) -> String {
|
||||
format!("group{}_rule{}",self.id,rule_ix)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// === Trait Impls ===
|
||||
|
||||
impl From<&Group> for NFA {
|
||||
/// Transforms Group to NFA.
|
||||
/// Algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI
|
||||
/// Transforms the input group into an NFA.
|
||||
///
|
||||
/// The algorithm is based on this algorithm for
|
||||
/// [converting a regular expression to an NFA](https://www.youtube.com/watch?v=RYNN-tb9WxI).
|
||||
fn from(group:&Group) -> Self {
|
||||
let mut nfa = NFA::default();
|
||||
let start = nfa.new_state();
|
||||
@ -83,6 +101,7 @@ impl From<&Group> for NFA {
|
||||
}
|
||||
|
||||
|
||||
|
||||
// =============
|
||||
// === Tests ===
|
||||
// =============
|
||||
|
@ -1,5 +1,8 @@
|
||||
//! An API for declaring Rust callbacks for encountered regex patterns.
|
||||
//! An API for declaring rust-code callbacks to be executed when a given pattern is matched.
|
||||
//!
|
||||
//! A flexer rule is a [`crate::automata::pattern`] associated with rust code to be executed as a
|
||||
//! callback.
|
||||
|
||||
use crate::automata::pattern::Pattern;
|
||||
|
||||
|
||||
@ -8,29 +11,28 @@ use crate::automata::pattern::Pattern;
|
||||
// == Rule ==
|
||||
// ==========
|
||||
|
||||
/// A rule is a pair of regex pattern and callback.
|
||||
/// The intention is to run the callback after encountering given pattern.
|
||||
/// A flexer rule.
|
||||
#[derive(Clone,Debug)]
|
||||
pub struct Rule {
|
||||
/// Pattern that triggers the callback.
|
||||
/// The pattern that triggers the callback.
|
||||
pub pattern: Pattern,
|
||||
/// Callback containing stringified Rust code.
|
||||
/// The code to execute when [`Rule::pattern`] matches, containing rust code as a
|
||||
/// [`std::string::String`].
|
||||
pub callback: String,
|
||||
}
|
||||
|
||||
/// Builder that allows us to add `Rule` to `Group` in a nice way.
|
||||
/// It is possible this structure won't be useful in rust, since borrow checker will likely influence
|
||||
/// the final API of rule construction.
|
||||
/// A builder that allows us to add a [`Rule`] to [`crate::group::Group`] in an elegant way.
|
||||
#[derive(Clone,Debug)]
|
||||
pub struct Builder<Callback> {
|
||||
/// Pattern that triggers the callback.
|
||||
/// The pattern that triggers the callback.
|
||||
pub pattern: Pattern,
|
||||
/// Callback containing a closure.
|
||||
|
||||
/// The callback containing a closure
|
||||
pub callback: Callback,
|
||||
}
|
||||
|
||||
impl<F:FnMut(Rule)> Builder<F> {
|
||||
/// Feeds the input that triggered regex pattern to callback.
|
||||
/// Feeds the input that triggered the [`Builder::pattern`] to the [`Builder::callback`].
|
||||
pub fn run(&mut self, program:String){
|
||||
let rule = Rule {pattern:self.pattern.clone(),callback:program};
|
||||
(self.callback)(rule);
|
||||
|
@ -8,10 +8,14 @@
|
||||
#![warn(unsafe_code)]
|
||||
#![warn(unused_import_braces)]
|
||||
|
||||
//! This module exports simple parser based on Deterministic Finite State Automata for regular
|
||||
//! grammars (anything parsable with regex patterns).
|
||||
//! This module exports the API for defining a simple lexer based on a deterministic finite state
|
||||
//! automaton.
|
||||
//!
|
||||
//! These lexers are capable of lexing any regular grammar, with some extensions to allow working
|
||||
//! with context sensitive (e.g. indentation-aware) syntax.
|
||||
|
||||
pub mod automata;
|
||||
pub mod group;
|
||||
pub mod parser;
|
||||
pub mod data;
|
||||
|
||||
pub use enso_prelude as prelude;
|
||||
|
@ -1,14 +0,0 @@
|
||||
//! The entry point of flexer. It (is going to) contain API for parsing an input string based on
|
||||
//! group of regex patterns.
|
||||
|
||||
use crate::automata::state::Symbol;
|
||||
|
||||
|
||||
|
||||
// ============
|
||||
// == Parser ==
|
||||
// ============
|
||||
|
||||
/// End Of File - This symbol is inserted at the end of each parser input.
|
||||
/// We can use the maximum value of u32, because no `char` (unicode scalar) can hold this value.
|
||||
pub const EOF_CODE:Symbol = Symbol{val:u32::max_value()};
|
@ -56,7 +56,9 @@ object Macro {
|
||||
|
||||
val clsDef = c.parse(s"final class __Parser__ extends $tree2")
|
||||
val tgtDef = addGroupDefs.transform(clsDef)
|
||||
c.Expr[() => P](q"$tgtDef; () => { new __Parser__ () }")
|
||||
}
|
||||
|
||||
val finalCode = q"$tgtDef; () => { new __Parser__() }"
|
||||
|
||||
c.Expr[() => P](finalCode)
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user