From ae7b4c0dce9ca32e940d8a35d5a65e104dacf688 Mon Sep 17 00:00:00 2001 From: Ara Adkins Date: Thu, 25 Jun 2020 13:06:08 +0100 Subject: [PATCH] Initial parser docs setup (#936) --- docs/parser/README.md | 17 ++ docs/parser/architecture.md | 50 ++++++ docs/parser/ast.md | 13 ++ docs/parser/construct-resolution.md | 13 ++ docs/parser/flexer.md | 13 ++ docs/parser/jvm-object-generation.md | 13 ++ docs/parser/lexer.md | 13 ++ docs/parser/macro-resolution.md | 13 ++ docs/parser/parser-driver.md | 13 ++ docs/parser/parser.md | 153 +++++++++++++++++ docs/parser/reader.md | 13 ++ docs/parser/tech-analysis.md | 2 +- parser/flexer/src/automata.rs | 3 +- parser/flexer/src/automata/alphabet.rs | 45 ++--- parser/flexer/src/automata/dfa.rs | 125 ++++++-------- parser/flexer/src/automata/nfa.rs | 229 +++++++++++-------------- parser/flexer/src/automata/pattern.rs | 124 ++++++------- parser/flexer/src/automata/state.rs | 74 ++++---- parser/flexer/src/data.rs | 2 +- parser/flexer/src/data/matrix.rs | 33 ++-- parser/flexer/src/group.rs | 126 ++++++-------- parser/flexer/src/group/rule.rs | 25 ++- parser/flexer/src/lib.rs | 6 +- parser/flexer/src/parser.rs | 13 +- 24 files changed, 672 insertions(+), 459 deletions(-) create mode 100644 docs/parser/architecture.md create mode 100644 docs/parser/ast.md create mode 100644 docs/parser/construct-resolution.md create mode 100644 docs/parser/flexer.md create mode 100644 docs/parser/jvm-object-generation.md create mode 100644 docs/parser/lexer.md create mode 100644 docs/parser/macro-resolution.md create mode 100644 docs/parser/parser-driver.md create mode 100644 docs/parser/parser.md create mode 100644 docs/parser/reader.md diff --git a/docs/parser/README.md b/docs/parser/README.md index ecc046ae32..c97e3ccdcd 100644 --- a/docs/parser/README.md +++ b/docs/parser/README.md @@ -23,3 +23,20 @@ below: - [**Tech Analysis:**](./tech-analysis.md) A brief overview of the reasons for the implementation technologies for the parser. 
+- [**Parser Architecture:**](./architecture.md) An overview of the architecture + of the parser as a whole. +- [**Flexer:**](./flexer.md) An overview of the design and architecture of the + flexer, a generic, DFA-based lexing engine. +- [**Lexer:**](./lexer.md) The Enso lexer, responsible for tokenising the input + stream of source code. +- [**Macro Resolution:**](./macro-resolution.md) The system for defining and + resolving macros on the token stream. +- [**Construct Resolution:**](./construct-resolution.md) The system for + resolving higher-level language constructs in the AST to produce a useful + output. +- [**Parser Driver:**](./parser-driver.md) +- [**AST:**](./ast.md) The parser AST. +- [**JVM Object Generation:**](./jvm-object-generation.md) The process for + generating the ast representation on the JVM via reflection. +- [**Reading Source Code:**](./reader.md) The flexible architecture for reading + source code into the lexer. diff --git a/docs/parser/architecture.md b/docs/parser/architecture.md new file mode 100644 index 0000000000..acefa7799d --- /dev/null +++ b/docs/parser/architecture.md @@ -0,0 +1,50 @@ +--- +layout: developer-doc +title: Parser Architecture Overview +category: parser +tags: [parser, architecture] +order: 2 +--- + +# Parser Architecture Overview +The Enso parser is designed in a highly modular fashion, with separate crates +responsible for the component's various responsibilities. The main components of +the parser are described below. + + + +- [Overall Architecture](#overall-architecture) +- [Reader](#reader) +- [Flexer](#flexer) +- [Lexer](#lexer) +- [Macro Resolution](#macro-resolution) +- [Operator Resolution](#operator-resolution) +- [Construct Resolution](#construct-resolution) +- [Parser Driver](#parser-driver) + - [AST](#ast) +- [JVM Object Generation](#jvm-object-generation) + + + +## Overall Architecture +The overall architecture of the parser subsystem can be visualised as follows. 
+ +## Reader + +## Flexer + +## Lexer + +## Macro Resolution + +## Operator Resolution + +## Construct Resolution + +## Parser Driver + +### AST + +## JVM Object Generation + +- Should wrap the parser as a whole into a new module, built for the engine diff --git a/docs/parser/ast.md b/docs/parser/ast.md new file mode 100644 index 0000000000..e6d624281d --- /dev/null +++ b/docs/parser/ast.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: AST +category: parser +tags: [parser, ast] +order: 8 +--- + +# AST + + + + diff --git a/docs/parser/construct-resolution.md b/docs/parser/construct-resolution.md new file mode 100644 index 0000000000..8d4d243f12 --- /dev/null +++ b/docs/parser/construct-resolution.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: Construct Resolution +category: parser +tags: [parser, construct, resolution] +order: 6 +--- + +# Construct Resolution + + + + diff --git a/docs/parser/flexer.md b/docs/parser/flexer.md new file mode 100644 index 0000000000..58eb87a7ca --- /dev/null +++ b/docs/parser/flexer.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: Flexer +category: syntax +tags: [parser, flexer, lexer, dfa] +order: 3 +--- + +# Flexer + + + + diff --git a/docs/parser/jvm-object-generation.md b/docs/parser/jvm-object-generation.md new file mode 100644 index 0000000000..22473b1f09 --- /dev/null +++ b/docs/parser/jvm-object-generation.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: JVM Object Generation +category: parser +tags: [parser, jvm, object-generation] +order: 9 +--- + +# JVM Object Generation + + + + diff --git a/docs/parser/lexer.md b/docs/parser/lexer.md new file mode 100644 index 0000000000..f32301da96 --- /dev/null +++ b/docs/parser/lexer.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: Lexer +category: syntax +tags: [parser, lexer] +order: 4 +--- + +# Lexer + + + + diff --git a/docs/parser/macro-resolution.md b/docs/parser/macro-resolution.md new file mode 100644 index 
0000000000..d68cbea54e --- /dev/null +++ b/docs/parser/macro-resolution.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: Macro Resolution +category: parser +tags: [parser, macro, resolution] +order: 5 +--- + +# Macro Resolution + + + + diff --git a/docs/parser/parser-driver.md b/docs/parser/parser-driver.md new file mode 100644 index 0000000000..2b3f7d6a0d --- /dev/null +++ b/docs/parser/parser-driver.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: Parser Driver +category: parser +tags: [parser, driver] +order: 7 +--- + +# Parser Driver + + + + diff --git a/docs/parser/parser.md b/docs/parser/parser.md new file mode 100644 index 0000000000..39f574bec1 --- /dev/null +++ b/docs/parser/parser.md @@ -0,0 +1,153 @@ +# Parser Design + +## 1. Lexer (Code -> Token Stream) + +- Lexer needs to be generic over the input stream encoding to support utf-16 + coming from the JVM. +- Is there any use case that requires the lexer to read an actual file? +- The prelude needs to be released to crates.io otherwise we're going to rapidly + get out of sync. +- I don't think it makes sense to have separate `Var` and `Cons` identifiers. We + should instead have `Name`, with functions `is_referrent` and `is_variable`. + This better mirrors how the language actually treats names. +- What actually is the flexer? +- What should the AST look like? + +Lexer reads source file (lazily, line by line) or uses in-memory `&str` and produces token stream of `Var`, `Cons`, `Opr`, `Number`, `Text`, `Invalid`, and `Block`. Please note that `Block` is part of the token stream on purpose. It is important that the source code is easy to parse visually, so if you see a block, it should be a block. Discovering blocks in lexer allows us to prevent all other parts of parser, like macros, from breaking this assumption. Moreover, it makes the design of the following stages a lot simpler. Enso lexer should always succeed, on any input stream (token stream could contain `Invalid` tokens). 
+ +Lexer is defined using Rust procedural macro system. We are using procedural macros, because the lexer definition produces a Rust code (pastes it "in-place" of the macro usage). Let's consider a very simple lexer definition: + +```rust +use crate::prelude::*; // Needs to be a released crate + +use flexer; +use flexer::Flexer; + + + +// ============= +// === Token === +// ============= + +pub struct Token { + location : flexer::Location, + ast : TokenAst, +} + +enum TokenAst { + Var(ImString), + Cons(ImString), + Blank, + ... +} + +impl Token { + pub fn new(location:Location, ast:TokenAst) -> Self { + Self {location,ast} + } + + pub fn var(location:Location, name:impl Into) -> Self { + let ast = TokenAst::Var(name.into()); + Self::new(location,ast) + } + + ... +} + + + +// ============= +// === Lexer === +// ============= + +#[derive(Debug,Default)] +struct Lexer { + current : Option, + tokens : Vec, + state : Flexer::State +} + +impl Lexer { + fn on_ident(&mut self, tok:Token) { + self.current = Some(tok); + self.state.push(self.ident_sfx_check); + } + + fn on_ident_err_sfx(&mut self) { + println!("OH NO!") + } + + fn on_no_ident_err_sfx(&mut self) { + let current = std::mem::take(&mut self.current).unwrap(); + self.tokens.push_back(current); + } +} + +impl Flexer::Definition Lexer { + fn state (& self) -> & flexer::State { & self.state } + fn state_mut (&mut self) -> &mut flexer::State { &mut self.state } +} + +pub fn lexer_source_code() -> String { + let lexer = Flexer::::new(); + + let chr = alphaNum | '_'; + let blank = Pattern::from('_'); + let body = chr.many >> '\''.many(); + let var = lowerLetter >> body; + let cons = upperLetter >> body; + let breaker = "^`!@#$%^&*()-=+[]{}|;:<>,./ \t\r\n\\"; + + let sfx_check = lexer.add(State("Identifier Suffix Check")); + + lexer.rule(lexer.root,var,"self.on_ident(Token::var(self.start_location,self.current_match()))"); + 
lexer.rule(lexer.root,cons,"self.on_ident(token::cons(self.start_location,self.current_match()))"); + lexer.rule(lexer.root,blank,"self.on_ident(token::blank(self.start_location))"); + lexer.rule(sfx_check,err_sfx,"self.on_ident_err_sfx()"); + lexer.rule(sfx_check,Flexer::always,"self.on_no_ident_err_sfx()"); + ... + lexer.generate_specialized_code() +} + +``` + +The idea here is that we are describing regexp-like patterns and tell what should happen when the pattern is matched. For example, after matching the `var` pattern, the code `self.on_ident(ast::Var)` should be evaluated. The code is passed as string, because it will be part of the generated, highly specialized, very fast lexer. + +Technically, the patterns are first translated to a state machine, and then to a bunch of if-then-else statements in such a way, that parsing is always `O(n)` where `n` is the input size. Logically, the regular expressions are matched top-bottom and the first fully-matched expression is chosen (unlike in the popular lexer generator flex, which uses longest match instead). After the expression is chosen, the associated function is executed and the process starts over again till the end of the input stream. Only the rules from the currently active state are considered. State is just a named (for debug purposes only) set of rules. Lexer always starts with the `lexer.root` state. You can make other state active by running (from within Flexer instance) `state.push(new_state)`, and pop it using `state.pop()`. + +The `lexer.generate_specialized_code` first works in a few steps: + +1. It takes all rules and states and generates an NFA state machine. +2. It generates DFA state machine using some custom optimizations to make sure that the regexps are matched in order and the associated code chunks are not lost. +3. It generates a highly tailored lexer `Engine` struct. One of the fields of the engine is the `Lexer` struct we defined above. 
The engine contains a main "loop" which consumes char by char, evaluates a big if-then-else machinery generated from the NFA, and evaluates functions from the `Lexer`. Please note that the functions start with `self`, that's because `Engine` implements `Deref` and `DerefMut` to `Lexer`. + +The generation of the if-then-else code block is not defined in this document, but can be observed by: + +1. Inspecting the current code in Scala. +2. Printing the Java code generated by current Scala Flexer implementation. +3. Talking with @wdanilo about it. + + + +## 2. Macro Resolution (Token Stream -> Chunked AST Stream incl space-unaware AST) + +To be described in detail taking into consideration all current use cases. For the current documentation of macro resolution, take a look here: https://github.com/luna/enso/blob/main/lib/syntax/specialization/shared/src/main/scala/org/enso/syntax/text/Parser.scala + +Before implementing this step, we need to talk about handling of space-unaware AST (the AST produced by user-macros). + + + +## 3. Operator Resolution (Chunked AST Stream -> Chunked AST Stream with Opr Apps) + +Using modified [Shunting-yard algorithm](https://en.wikipedia.org/wiki/Shunting-yard_algorithm). The algorithm is modified to support sections. The Scala implementation is here: https://github.com/luna/enso/blob/main/lib/syntax/definition/src/main/scala/org/enso/syntax/text/prec/Operator.scala . Unfortunately, we cannot use recursion in Rust, so it needs to be re-worked. + + + +## 4. Finalization and Special Rules Discovery (Chunked AST Stream with Opr Apps -> AST) + +To be described in detail taking into consideration all current use cases. 
+ + + + + diff --git a/docs/parser/reader.md b/docs/parser/reader.md new file mode 100644 index 0000000000..efd3483431 --- /dev/null +++ b/docs/parser/reader.md @@ -0,0 +1,13 @@ +--- +layout: developer-doc +title: Reading Source Code +category: parser +tags: [parser, reader] +order: 9 +--- + +# Reading Source Code + + + + diff --git a/docs/parser/tech-analysis.md b/docs/parser/tech-analysis.md index 07ebca6fb7..9a59bedf42 100644 --- a/docs/parser/tech-analysis.md +++ b/docs/parser/tech-analysis.md @@ -17,7 +17,7 @@ the parser. - [Technology Requirements for the Parser](#technology-requirements-for-the-parser) - [Issues With the Previous Implementation](#issues-with-the-previous-implementation) - [Choosing Rust](#choosing-rust) - - [Downsides of Rust](#downsides-of-rust) + - [Downsides of Rust](#downsides-of-rust) diff --git a/parser/flexer/src/automata.rs b/parser/flexer/src/automata.rs index 6cec71e732..816e55a04e 100644 --- a/parser/flexer/src/automata.rs +++ b/parser/flexer/src/automata.rs @@ -1,5 +1,4 @@ -//! Exports API for construction of Nondeterminist and Deterministic Finite -//! State Automata. +//! Exports API for construction of Nondeterminist and Deterministic Finite State Automata. pub mod alphabet; pub mod dfa; diff --git a/parser/flexer/src/automata/alphabet.rs b/parser/flexer/src/automata/alphabet.rs index d5f7183d93..c41356b0a2 100644 --- a/parser/flexer/src/automata/alphabet.rs +++ b/parser/flexer/src/automata/alphabet.rs @@ -1,52 +1,45 @@ -//! Exports an alphabet (set of all valid input symbols) for Finite State -//! Automata (NFA and DFA). +//! Exports an alphabet (set of all valid input symbols) for Finite State Automata (NFA and DFA). use crate::automata::state::Symbol; use std::collections::BTreeSet; use std::ops::RangeInclusive; + + // ================ // === Alphabet === // ================ -/// An alphabet describes a set of all the valid input symbols that a given -/// finite state automata (NFA or DFA) can operate over. 
+/// An alphabet describes a set of all the valid input symbols that a given finite state automata +/// (NFA or DFA) can operate over. /// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton -/// The alphabet is meant to be represented as an interval. That is, if `a` and -/// `b` are in alphabet, then any symbol from `a..=b` is in alphabet too. -#[derive(Clone, Debug, PartialEq, Eq)] +/// The alphabet is meant to be represented as an interval. That is, if `a` and `b` are in alphabet, +/// then any symbol from `a..=b` is in alphabet too. +#[derive(Clone,Debug,PartialEq,Eq)] pub struct Alphabet { - /// The interval of all valid input symbols. The interval is further - /// divided into subintervals (i.e. `[a,z,A,Z]` should be understood as - /// `[a..=z,z..=A,A..=Z]`), in order to efficiently encode state - /// transitions that trigger not just on one but a whole range of symbols + /// The interval of all valid input symbols. The interval is further divided into subintervals + /// (i.e. `[a,z,A,Z]` should be understood as `[a..=z,z..=A,A..=Z]`), in order to efficiently + /// encode state transitions that trigger not just on one but a whole range of symbols /// (i.e. `a..=z`) - pub symbols:BTreeSet, + pub symbols: BTreeSet } impl Default for Alphabet { fn default() -> Self { - Alphabet { - symbols:[Symbol { val:0 }].iter().cloned().collect(), - } + Alphabet {symbols:[Symbol{val:0}].iter().cloned().collect()} } } impl Alphabet { /// Inserts a range of symbols into the alphabet. pub fn insert(&mut self, range:RangeInclusive) { - // The symbol range is associated with transition in automata. Therefore - // we: Mark the symbol with the new transition. - self.symbols.insert(Symbol { - val:range.start().val, - }); + // The symbol range is associated with transition in automata. Therefore we: + // Mark the symbol with the new transition. + self.symbols.insert(Symbol{val:range.start().val}); // Mark the symbol without the new transition. 
- self.symbols.insert(Symbol { - val:range.end().val + 1, - }); - // This way each symbol in alphabet corresponds to a unique set of - // transitions. + self.symbols.insert(Symbol{val:range.end().val + 1}); + // This way each symbol in alphabet corresponds to a unique set of transitions. } } @@ -54,7 +47,7 @@ impl From> for Alphabet { fn from(vec:Vec) -> Self { let mut dict = Self::default(); for val in vec { - dict.symbols.insert(Symbol { val }); + dict.symbols.insert(Symbol{val}); } dict } diff --git a/parser/flexer/src/automata/dfa.rs b/parser/flexer/src/automata/dfa.rs index a55441a94c..62772d1c6f 100644 --- a/parser/flexer/src/automata/dfa.rs +++ b/parser/flexer/src/automata/dfa.rs @@ -4,64 +4,66 @@ use crate::automata::alphabet::Alphabet; use crate::automata::state; use crate::data::matrix::Matrix; + + // ===================================== // === Deterministic Finite Automata === // ===================================== /// Function callback for an arbitrary state of finite automata. -/// It contains name of Rust procedure that is meant to be executed after -/// encountering a pattern (declared in `group::Rule.pattern`). -#[derive(Clone, Debug, PartialEq, Eq)] +/// It contains name of Rust procedure that is meant to be executed after encountering a pattern +/// (declared in `group::Rule.pattern`). +#[derive(Clone,Debug,PartialEq,Eq)] pub struct Callback { - /// TODO[jv] Write better explanation after implementing rust code - /// generation. Priority is used during rust code generation. - pub priority:usize, + /// TODO[jv] Write better explanation after implementing rust code generation. + /// Priority is used during rust code generation. + pub priority: usize, /// Name of Rust method that will be called when executing this callback. - pub name:String, + pub name: String, } /// DFA automata with a set of symbols, states and transitions. 
-/// Deterministic Finite Automata is a finite-state machine that accepts or -/// rejects a given sequence of symbols, by running through a state sequence -/// uniquely determined by the input symbol sequence. ___ ___ -/// ___ ___ | 0 | -- 'D' --> | 1 | -- 'F' --> | 2 | -- 'A' --> | 3 -/// | ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ +/// Deterministic Finite Automata is a finite-state machine that accepts or rejects a given sequence +/// of symbols, by running through a state sequence uniquely determined by the input symbol sequence. +/// ___ ___ ___ ___ +/// | 0 | -- 'D' --> | 1 | -- 'F' --> | 2 | -- 'A' --> | 3 | +/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ /// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton -#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[derive(Clone,Debug,Default,PartialEq,Eq)] pub struct DFA { /// Finite set of all valid input symbols. - pub alphabet:Alphabet, + pub alphabet: Alphabet, /// Transition matrix of deterministic finite state automata. - /// It contains next state for each pair of state and input symbol - - /// (state,symbol) => new state. For example, a transition matrix for - /// automata that accepts string "ABABAB...." would look like this: + /// It contains next state for each pair of state and input symbol - (state,symbol) => new state. + /// For example, a transition matrix for automata that accepts string "ABABAB...." would look + /// like this: /// states /// | | A | B | <- symbols /// | 0 | 1 | - | /// | 1 | - | 0 | /// Where `-` denotes `state::INVALID`. - pub links:Matrix, + pub links: Matrix, /// Stores callback for each state (if it has one). 
- pub callbacks:Vec>, + pub callbacks: Vec>, } impl From>> for Matrix { fn from(input:Vec>) -> Self { - let rows = input.len(); - let columns = if rows == 0 { 0 } else { input[0].len() }; - let mut matrix = Self::new(rows, columns); + let rows = input.len(); + let columns = if rows == 0 {0} else {input[0].len()}; + let mut matrix = Self::new(rows,columns); for row in 0..rows { for column in 0..columns { - matrix[(row, column)] = state::Id { - id:input[row][column], - }; + matrix[(row,column)] = state::Id{id:input[row][column]}; } } matrix } } + + // =========== // == Tests == // =========== @@ -76,14 +78,11 @@ pub mod tests { /// DFA automata that accepts newline '\n'. pub fn newline() -> DFA { DFA { - alphabet:Alphabet::from(vec![10, 11]), - links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]), - callbacks:vec![ + alphabet: Alphabet::from(vec![10,11]), + links: Matrix::from(vec![vec![I,1,I], vec![I,I,I]]), + callbacks: vec![ None, - Some(Callback { - priority:2, - name:"group0_rule0".into(), - }), + Some(Callback{priority:2,name:"group0_rule0".into()}), ], } } @@ -91,14 +90,11 @@ pub mod tests { /// DFA automata that accepts any letter a..=z. pub fn letter() -> DFA { DFA { - alphabet:Alphabet::from(vec![97, 123]), - links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]), - callbacks:vec![ + alphabet: Alphabet::from(vec![97,123]), + links: Matrix::from(vec![vec![I,1,I], vec![I,I,I]]), + callbacks: vec![ None, - Some(Callback { - priority:2, - name:"group0_rule0".into(), - }), + Some(Callback{priority:2,name:"group0_rule0".into()}), ], } } @@ -106,22 +102,16 @@ pub mod tests { /// DFA automata that accepts any number of spaces ' '. 
pub fn spaces() -> DFA { DFA { - alphabet:Alphabet::from(vec![0, 32, 33]), - links:Matrix::from(vec![ - vec![I, 1, I], - vec![I, 2, I], - vec![I, 2, I], + alphabet: Alphabet::from(vec![0,32,33]), + links: Matrix::from(vec![ + vec![I,1,I], + vec![I,2,I], + vec![I,2,I], ]), - callbacks:vec![ + callbacks: vec![ None, - Some(Callback { - priority:3, - name:"group0_rule0".into(), - }), - Some(Callback { - priority:3, - name:"group0_rule0".into(), - }), + Some(Callback{priority:3,name:"group0_rule0".into()}), + Some(Callback{priority:3,name:"group0_rule0".into()}), ], } } @@ -129,27 +119,18 @@ pub mod tests { /// DFA automata that accepts one letter a..=z or any many spaces. pub fn letter_and_spaces() -> DFA { DFA { - alphabet:Alphabet::from(vec![32, 33, 97, 123]), - links:Matrix::from(vec![ - vec![I, 1, I, 2, I], - vec![I, 3, I, I, I], - vec![I, I, I, I, I], - vec![I, 3, I, I, I], + alphabet: Alphabet::from(vec![32,33,97,123]), + links: Matrix::from(vec![ + vec![I,1,I,2,I], + vec![I,3,I,I,I], + vec![I,I,I,I,I], + vec![I,3,I,I,I], ]), - callbacks:vec![ + callbacks: vec![ None, - Some(Callback { - priority:4, - name:"group0_rule1".into(), - }), - Some(Callback { - priority:4, - name:"group0_rule0".into(), - }), - Some(Callback { - priority:4, - name:"group0_rule1".into(), - }), + Some(Callback{priority:4,name:"group0_rule1".into()}), + Some(Callback{priority:4,name:"group0_rule0".into()}), + Some(Callback{priority:4,name:"group0_rule1".into()}), ], } } diff --git a/parser/flexer/src/automata/nfa.rs b/parser/flexer/src/automata/nfa.rs index 38d9d5c190..75a7182f8c 100644 --- a/parser/flexer/src/automata/nfa.rs +++ b/parser/flexer/src/automata/nfa.rs @@ -1,45 +1,45 @@ -//! Implementation of Nondeterministic Finite Automata and it's conversion to -//! DFA. +//! Implementation of Nondeterministic Finite Automata and it's conversion to DFA. 
use crate::automata::alphabet::Alphabet; -use crate::automata::dfa::Callback; use crate::automata::dfa::DFA; -use crate::automata::state; +use crate::automata::dfa::Callback; use crate::automata::state::Link; -use crate::automata::state::State; use crate::automata::state::Symbol; +use crate::automata::state::State; +use crate::automata::state; use crate::data::matrix::Matrix; +use std::collections::HashMap; +use std::collections::BTreeSet; +use std::ops::RangeInclusive; use crate::automata::pattern::Pattern; use itertools::Itertools; -use std::collections::BTreeSet; -use std::collections::HashMap; -use std::ops::RangeInclusive; + // ======================================== // === Nondeterministic Finite Automata === // ======================================== /// Type alias for a state Id based on set of states. -/// It is used during NFA -> DFA transformation where multiple states can merge -/// together, thanks to epsilon links. +/// It is used during NFA -> DFA transformation where multiple states can merge together, +/// thanks to epsilon links. type StateSetId = BTreeSet; /// NFA automata with a set of symbols, states and transitions. -/// Nondeterministic Finite Automata is a finite-state machine that accepts or -/// rejects a given sequence of symbols. -/// Compared to `DFA`, NFA can transition into multiple new states without -/// reading any symbol (so called epsilon link / transition), +/// Nondeterministic Finite Automata is a finite-state machine that accepts or rejects a given +/// sequence of symbols. 
+/// Compared to `DFA`, NFA can transition into multiple new states without reading any symbol +/// (so called epsilon link / transition), /// ___ ___ ___ ___ ___ /// | 0 | -- 'N' --> | 1 | ----> | 2 | -- 'F' --> | 3 | -- 'A' --> | 4 | /// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ /// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton -#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[derive(Clone,Debug,Default,PartialEq,Eq)] pub struct NFA { /// Finite set of all valid input symbols. - pub alphabet:Alphabet, + pub alphabet: Alphabet, /// Set of named NFA states with (epsilon) transitions. - pub states:Vec, + pub states: Vec, } impl NFA { @@ -47,94 +47,80 @@ impl NFA { pub fn new_state(&mut self) -> state::Id { let id = self.states.len(); self.states.push(State::default()); - state::Id { id } + state::Id {id} } /// Creates an epsilon transition between two states. - /// Whenever the automata happens to be in `source` state it can - /// immediatelly move to `target` state (but does not have to). + /// Whenever the automata happens to be in `source` state it can immediatelly move to + /// `target` state (but does not have to). pub fn connect(&mut self, source:state::Id, target:state::Id) { self.states[source.id].epsilon_links.push(target); } - /// Creates an ordinary transition (for a range of symbols) between two - /// states. If any symbol from such range happens to be on input when - /// the automata is in `source` state, it will immediatelly move to - /// `target` state. - pub fn connect_by( - &mut self, - source:state::Id, - target:state::Id, - symbols:&RangeInclusive, - ) { + /// Creates an ordinary transition (for a range of symbols) between two states. + /// If any symbol from such range happens to be on input when the automata is in `source` + /// state, it will immediatelly move to `target` state. 
+ pub fn connect_by + (&mut self, source:state::Id, target:state::Id, symbols:&RangeInclusive) { self.alphabet.insert(symbols.clone()); - self.states[source.id].links.push(Link { - symbols:symbols.clone(), - target, - }); + self.states[source.id].links.push(Link{symbols:symbols.clone(), target}); } /// Transforms pattern to NFA. /// The algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI - pub fn new_pattern( - &mut self, - source:state::Id, - pattern:&Pattern, - ) -> state::Id { + pub fn new_pattern(&mut self, source:state::Id, pattern:&Pattern) -> state::Id { let current = self.new_state(); - self.connect(source, current); + self.connect(source,current); match pattern { Pattern::Range(range) => { let state = self.new_state(); - self.connect_by(current, state, range); + self.connect_by(current,state,range); state - } + }, Pattern::Many(body) => { let s1 = self.new_state(); - let s2 = self.new_pattern(s1, body); + let s2 = self.new_pattern(s1,body); let s3 = self.new_state(); - self.connect(current, s1); - self.connect(current, s3); - self.connect(s2, s3); - self.connect(s3, s1); + self.connect(current,s1); + self.connect(current,s3); + self.connect(s2,s3); + self.connect(s3,s1); s3 - } - Pattern::And(patterns) => patterns - .iter() - .fold(current, |s, pat| self.new_pattern(s, pat)), + }, + Pattern::And(patterns) => { + patterns.iter().fold(current,|s,pat| self.new_pattern(s,pat)) + }, Pattern::Or(patterns) => { - let states = patterns - .iter() - .map(|pat| self.new_pattern(current, pat)) - .collect_vec(); - let end = self.new_state(); + let states = patterns.iter().map(|pat| self.new_pattern(current,pat)).collect_vec(); + let end = self.new_state(); for state in states { - self.connect(state, end); + self.connect(state,end); } end } } } + // === NFA -> DFA === /// Merges states that are connected by epsilon links. 
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao fn eps_matrix(&self) -> Vec { - fn fill_eps_matrix( - nfa:&NFA, - states:&mut Vec, - computed:&mut Vec, - visited:&mut Vec, - state:state::Id, + fn fill_eps_matrix + ( nfa : &NFA + , states : &mut Vec + , computed : &mut Vec + , visited : &mut Vec + , state : state::Id ) { let mut state_set = StateSetId::new(); - let mut circular = false; + let mut circular = false; visited[state.id] = true; state_set.insert(state); for &target in &nfa.states[state.id].epsilon_links { if !visited[target.id] { - fill_eps_matrix(nfa, states, computed, visited, target); + fill_eps_matrix(nfa,states,computed,visited,target); } state_set.insert(target); state_set.extend(states[target.id].iter()); @@ -148,17 +134,11 @@ impl NFA { states[state.id] = state_set; } - let mut states = vec![StateSetId::new(); self.states.len()]; + let mut states = vec![StateSetId::new(); self.states.len()]; let mut computed = vec![false; self.states.len()]; for id in 0..self.states.len() { let mut visited = vec![false; states.len()]; - fill_eps_matrix( - self, - &mut states, - &mut computed, - &mut visited, - state::Id { id }, - ); + fill_eps_matrix(self,&mut states,&mut computed,&mut visited,state::Id{id}); } states } @@ -166,13 +146,12 @@ impl NFA { /// Computes a transition matrix (state X symbol => state) for NFA. /// Ignores epsilon links. fn nfa_matrix(&self) -> Matrix { - let mut matrix = - Matrix::new(self.states.len(), self.alphabet.symbols.len()); + let mut matrix = Matrix::new(self.states.len(),self.alphabet.symbols.len()); for (state_ix, source) in self.states.iter().enumerate() { let targets = source.targets(&self.alphabet); for (voc_ix, &target) in targets.iter().enumerate() { - matrix[(state_ix, voc_ix)] = target; + matrix[(state_ix,voc_ix)] = target; } } matrix @@ -183,37 +162,35 @@ impl From<&NFA> for DFA { /// Transforms NFA into DFA. 
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao fn from(nfa:&NFA) -> Self { - let nfa_mat = nfa.nfa_matrix(); - let eps_mat = nfa.eps_matrix(); - let mut dfa_mat = Matrix::new(0, nfa.alphabet.symbols.len()); + let nfa_mat = nfa.nfa_matrix(); + let eps_mat = nfa.eps_matrix(); + let mut dfa_mat = Matrix::new(0,nfa.alphabet.symbols.len()); let mut dfa_eps_ixs = Vec::::new(); - let mut dfa_eps_map = HashMap::::new(); + let mut dfa_eps_map = HashMap::::new(); dfa_eps_ixs.push(eps_mat[0].clone()); - dfa_eps_map.insert(eps_mat[0].clone(), state::Id { id:0 }); + dfa_eps_map.insert(eps_mat[0].clone(), state::Id{id:0}); let mut i = 0; - while i < dfa_eps_ixs.len() { + while i < dfa_eps_ixs.len() { dfa_mat.new_row(); for voc_ix in 0..nfa.alphabet.symbols.len() { let mut eps_set = StateSetId::new(); for &eps_ix in &dfa_eps_ixs[i] { - let tgt = nfa_mat[(eps_ix.id, voc_ix)]; + let tgt = nfa_mat[(eps_ix.id,voc_ix)]; if tgt != state::INVALID { eps_set.extend(eps_mat[tgt.id].iter()); } } if !eps_set.is_empty() { - dfa_mat[(i, voc_ix)] = match dfa_eps_map.get(&eps_set) { + dfa_mat[(i,voc_ix)] = match dfa_eps_map.get(&eps_set) { Some(&id) => id, None => { - let id = state::Id { - id:dfa_eps_ixs.len(), - }; + let id = state::Id {id:dfa_eps_ixs.len()}; dfa_eps_ixs.push(eps_set.clone()); - dfa_eps_map.insert(eps_set, id); + dfa_eps_map.insert(eps_set,id); id - } + }, }; } } @@ -221,23 +198,16 @@ impl From<&NFA> for DFA { } let mut callbacks = vec![None; dfa_eps_ixs.len()]; - let priority = dfa_eps_ixs.len(); + let priority = dfa_eps_ixs.len(); for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() { let has_name = |&key:&state::Id| nfa.states[key.id].name.is_some(); if let Some(eps) = epss.into_iter().find(has_name) { - let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap(); - callbacks[dfa_ix] = Some(Callback { - name:rule, - priority, - }); + let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap(); + callbacks[dfa_ix] = Some(Callback 
{name:rule,priority}); } } - DFA { - alphabet:nfa.alphabet.clone(), - links:dfa_mat, - callbacks, - } + DFA {alphabet:nfa.alphabet.clone(),links:dfa_mat,callbacks} } } @@ -248,99 +218,96 @@ impl From<&NFA> for DFA { #[cfg(test)] pub mod tests { extern crate test; - + use crate::automata::dfa; - - use super::*; + + use super::*; use test::Bencher; /// NFA automata that accepts newline '\n'. pub fn newline() -> NFA { NFA { - states:vec![ + states: vec![ State::from(vec![1]), - State::from(vec![(10..=10, 2)]), + State::from(vec![(10..=10,2)]), State::from(vec![3]).named("group0_rule0"), State::default(), ], - alphabet:Alphabet::from(vec![10, 11]), + alphabet: Alphabet::from(vec![10,11]), } } /// NFA automata that accepts any letter a..=z. pub fn letter() -> NFA { NFA { - states:vec![ + states: vec![ State::from(vec![1]), - State::from(vec![(97..=122, 2)]), + State::from(vec![(97..=122,2)]), State::from(vec![3]).named("group0_rule0"), State::default(), ], - alphabet:Alphabet::from(vec![97, 123]), + alphabet: Alphabet::from(vec![97,123]), } } /// NFA automata that accepts any number of spaces ' '. pub fn spaces() -> NFA { NFA { - states:vec![ + states: vec![ State::from(vec![1]), State::from(vec![2]), - State::from(vec![(32..=32, 3)]), + State::from(vec![(32..=32,3)]), State::from(vec![4]), - State::from(vec![5, 8]), + State::from(vec![5,8]), State::from(vec![6]), - State::from(vec![(32..=32, 7)]), + State::from(vec![(32..=32,7)]), State::from(vec![8]), - State::from(vec![5, 9]).named("group0_rule0"), + State::from(vec![5,9]).named("group0_rule0"), State::default(), ], - alphabet:Alphabet::from(vec![0, 32, 33]), + alphabet: Alphabet::from(vec![0,32,33]), } } /// NFA automata that accepts one letter a..=z or many spaces ' '. 
pub fn letter_and_spaces() -> NFA { NFA { - states:vec![ - State::from(vec![1, 3]), - State::from(vec![(97..=122, 2)]), + states: vec![ + State::from(vec![1,3]), + State::from(vec![(97..=122,2)]), State::from(vec![11]).named("group0_rule0"), State::from(vec![4]), - State::from(vec![(32..=32, 5)]), + State::from(vec![(32..=32,5)]), State::from(vec![6]), - State::from(vec![7, 10]), + State::from(vec![7,10]), State::from(vec![8]), - State::from(vec![(32..=32, 9)]), + State::from(vec![(32..=32,9)]), State::from(vec![10]), - State::from(vec![7, 11]).named("group0_rule1"), + State::from(vec![7,11]).named("group0_rule1"), State::default(), ], - alphabet:Alphabet::from(vec![32, 33, 97, 123]), + alphabet: Alphabet::from(vec![32,33,97,123]), } } - + #[test] fn test_to_dfa_newline() { - assert_eq!(DFA::from(&newline()), dfa::tests::newline()); + assert_eq!(DFA::from(&newline()),dfa::tests::newline()); } #[test] fn test_to_dfa_letter() { - assert_eq!(DFA::from(&letter()), dfa::tests::letter()); + assert_eq!(DFA::from(&letter()),dfa::tests::letter()); } #[test] fn test_to_dfa_spaces() { - assert_eq!(DFA::from(&spaces()), dfa::tests::spaces()); + assert_eq!(DFA::from(&spaces()),dfa::tests::spaces()); } #[test] fn test_to_dfa_letter_and_spaces() { - assert_eq!( - DFA::from(&letter_and_spaces()), - dfa::tests::letter_and_spaces() - ); + assert_eq!(DFA::from(&letter_and_spaces()),dfa::tests::letter_and_spaces()); } #[bench] diff --git a/parser/flexer/src/automata/pattern.rs b/parser/flexer/src/automata/pattern.rs index 1bf2fea954..d65b3bbcae 100644 --- a/parser/flexer/src/automata/pattern.rs +++ b/parser/flexer/src/automata/pattern.rs @@ -1,8 +1,7 @@ -//! Simple API for constructing regex patterns that are used in parser -//! implementation. +//! Simple API for constructing regex patterns that are used in parser implementation. 
-use crate::automata::state::Symbol; use crate::parser; +use crate::automata::state::Symbol; use core::iter; use itertools::Itertools; @@ -10,12 +9,13 @@ use std::ops::BitAnd; use std::ops::BitOr; use std::ops::RangeInclusive; + // ============= // == Pattern == // ============= /// Simple regex pattern. -#[derive(Clone, Debug)] +#[derive(Clone,Debug)] pub enum Pattern { /// Pattern that triggers on any symbol from given range. Range(RangeInclusive), @@ -24,56 +24,41 @@ pub enum Pattern { /// Pattern that triggers when a sequence of patterns is encountered. And(Vec), /// Pattern that triggers on 0..N repetitions of given pattern. - Many(Box), + Many(Box) } use Pattern::*; impl BitOr for Pattern { type Output = Pattern; - fn bitor(self, rhs:Pattern) -> Self::Output { + fn bitor(self, rhs: Pattern) -> Self::Output { match (self, rhs) { - (Or(mut lhs), Or(rhs)) => { - lhs.extend(rhs); - Or(lhs) - } - (Or(mut lhs), rhs) => { - lhs.push(rhs); - Or(lhs) - } - (lhs, Or(mut rhs)) => { - rhs.push(lhs); - Or(rhs) - } - (lhs, rhs) => Or(vec![lhs, rhs]), + (Or(mut lhs), Or( rhs)) => {lhs.extend(rhs) ; Or(lhs)}, + (Or(mut lhs), rhs ) => {lhs.push(rhs) ; Or(lhs)}, + (lhs , Or(mut rhs)) => {rhs.push(lhs) ; Or(rhs)}, + (lhs , rhs ) => Or(vec![lhs,rhs]), } } } impl BitAnd for Pattern { type Output = Pattern; - fn bitand(self, rhs:Pattern) -> Self::Output { + fn bitand(self, rhs: Pattern) -> Self::Output { match (self, rhs) { - (And(mut lhs), And(rhs)) => { - lhs.extend(rhs); - And(lhs) - } - (And(mut lhs), rhs) => { - lhs.push(rhs); - And(lhs) - } - (lhs, And(mut rhs)) => { - rhs.push(lhs); - And(rhs) - } - (lhs, rhs) => And(vec![lhs, rhs]), + (And(mut lhs), And( rhs)) => {lhs.extend(rhs) ; And(lhs)}, + (And(mut lhs), rhs ) => {lhs.push(rhs) ; And(lhs)}, + (lhs , And(mut rhs)) => {rhs.push(lhs) ; And(rhs)}, + (lhs , rhs ) => And(vec![lhs,rhs]), } } } impl Pattern { + /// Pattern that never triggers. 
- pub fn never() -> Self { Pattern::symbols(1..=0) } + pub fn never() -> Self { + Pattern::symbols(1..=0) + } /// Pattern that always triggers. pub fn always() -> Self { @@ -81,34 +66,44 @@ impl Pattern { } /// Pattern that triggers on any char. - pub fn any_char() -> Self { Pattern::symbols(0..=u32::max_value()) } + pub fn any_char() -> Self { + Pattern::symbols(0..=u32::max_value()) + } /// Pattern that triggers on 0..N repetitions of given pattern. - pub fn many(self) -> Self { Many(Box::new(self)) } + pub fn many(self) -> Self { + Many(Box::new(self)) + } /// Pattern that triggers on 1..N repetitions of given pattern. - pub fn many1(self) -> Self { self.clone() & self.many() } + pub fn many1(self) -> Self { + self.clone() & self.many() + } /// Pattern that triggers on 0..=1 repetitions of given pattern. - pub fn opt(self) -> Self { self | Self::always() } + pub fn opt(self) -> Self { + self | Self::always() + } /// Pattern that triggers on given symbol - pub fn symbol(symbol:u32) -> Self { Pattern::symbols(symbol..=symbol) } + pub fn symbol(symbol:u32) -> Self { + Pattern::symbols(symbol..=symbol) + } /// Pattern that triggers on any of the given symbols. pub fn symbols(symbols:RangeInclusive) -> Self { - Pattern::Range( - Symbol { - val:*symbols.start(), - }..=Symbol { val:*symbols.end() }, - ) + Pattern::Range(Symbol{val:*symbols.start()}..=Symbol{val:*symbols.end()}) } /// Pattern that triggers on end of file. - pub fn eof() -> Self { Self::symbol(parser::EOF_CODE.val) } + pub fn eof() -> Self { + Self::symbol(parser::EOF_CODE.val) + } /// Pattern that triggers on given character. - pub fn char(char:char) -> Self { Self::symbol(char as u32) } + pub fn char(char:char) -> Self { + Self::symbol(char as u32) + } /// Pattern that triggers on any of the given characters. pub fn range(chars:RangeInclusive) -> Self { @@ -117,48 +112,41 @@ impl Pattern { /// Pattern that triggers when sequence of characters is encountered. 
pub fn all(chars:&str) -> Self { - chars - .chars() - .fold(Self::never(), |pat, char| pat & Self::char(char)) + chars.chars().fold(Self::never(), |pat,char| pat & Self::char(char)) } /// Pattern that triggers on any characters from given sequence. pub fn any(chars:&str) -> Self { - chars - .chars() - .fold(Self::never(), |pat, char| pat | Self::char(char)) + chars.chars().fold(Self::never(), |pat,char| pat | Self::char(char)) } /// Pattern that doesn't trigger on any given character from given sequence. pub fn none(chars:&str) -> Self { - let max = u32::max_value(); - let char_iter = chars.chars().map(|char| char as u32); + let max = u32::max_value(); + let char_iter = chars.chars().map(|char| char as u32); let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max)); - let mut codes = char_iter2.collect_vec(); + let mut codes = char_iter2.collect_vec(); codes.sort(); - codes - .iter() - .tuple_windows() - .fold(Self::never(), |pat, (start, end)| { - if end < start { - pat - } else { - pat | Pattern::symbols(*start..=*end) - } - }) + codes.iter().tuple_windows().fold(Self::never(), |pat,(start,end)| { + if end < start {pat} else { + pat | Pattern::symbols(*start..=*end) + } + }) } /// Pattern that triggers on any character but the one given. - pub fn not(char:char) -> Self { Self::none(&char.to_string()) } + pub fn not(char:char) -> Self { + Self::none(&char.to_string()) + } /// Pattern that triggers on N repetitions of given pattern. pub fn repeat(pat:Pattern, num:usize) -> Self { - (0..num).fold(Self::always(), |p, _| p & pat.clone()) + (0..num).fold(Self::always(), |p,_| p & pat.clone()) } /// Pattern that triggers on MIN..MAX repetitions of given pattern. 
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self { - (min..max).fold(Self::never(), |p, n| p | Self::repeat(pat.clone(), n)) + (min..max).fold(Self::never(), |p,n| p | Self::repeat(pat.clone(),n)) } } diff --git a/parser/flexer/src/automata/state.rs b/parser/flexer/src/automata/state.rs index 4bff4ac2fc..9d9edba506 100644 --- a/parser/flexer/src/automata/state.rs +++ b/parser/flexer/src/automata/state.rs @@ -1,66 +1,65 @@ -//! This module exports State implementation for Nondeterministic Finite -//! Automata. +//! This module exports State implementation for Nondeterministic Finite Automata. use crate::automata::alphabet::Alphabet; use crate::automata::state; use std::ops::RangeInclusive; + + // ======================= // == State Of Automata == // ======================= /// Flag for invalid state. -/// When finite automata gets into invalid state the input sequence of symbols -/// is rejected. -pub const INVALID:Id = Id { - id:usize::max_value(), -}; +/// When finite automata gets into invalid state the input sequence of symbols is rejected. +pub const INVALID:Id = Id {id:usize::max_value()}; /// Newtype wrapper for finite automata input symbol. -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)] pub struct Symbol { #[allow(missing_docs)] - pub val:u32, + pub val: u32 } /// Newtype wrapper for finite automata state ID. -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)] pub struct Id { #[allow(missing_docs)] - pub id:usize, + pub id: usize } impl Default for Id { - /// Returns state::INVALID. This is because every finite automata has an - /// invalid state and because all transitions in automata transition - /// matrix lead to invalid state by default. - fn default() -> Self { state::INVALID } + /// Returns state::INVALID. 
This is because every finite automata has an invalid state + /// and because all transitions in automata transition matrix lead to invalid state by default. + fn default() -> Self { + state::INVALID + } } /// Named NFA state with a set of transitions (links). -#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[derive(Clone,Debug,Default,PartialEq,Eq)] pub struct State { /// Set of transitions that don't require any symbol to trigger. - /// I.E. If there is an epsilon link from state A to state B, then whenever - /// we are in state A, we can freely move to state B. - pub epsilon_links:Vec, + /// I.E. If there is an epsilon link from state A to state B, then whenever we are in state A, + /// we can freely move to state B. + pub epsilon_links: Vec, /// Set of transitions that trigger with specific symbol on input. /// When triggered, the automata will transition to the `link.target`. - pub links:Vec, + pub links: Vec, /// Name of the state. /// We use it to autogenerate a call to Rust method with same name. - pub name:Option, + pub name: Option, } /// A transition to new automata state /// that requires specific symbol on automata input to trigger. -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone,Debug,PartialEq,Eq)] pub struct Link { /// Any symbol from the range will trigger this link. - pub symbols:RangeInclusive, + pub symbols: RangeInclusive, /// A state that is visited, after the link is triggered. - pub target:Id, + pub target: Id, } impl State { @@ -73,8 +72,8 @@ impl State { /// Returns transition (next state) for each symbol in alphabet. 
pub fn targets(&self, alphabet:&Alphabet) -> Vec { let mut targets = vec![]; - let mut index = 0; - let mut links = self.links.clone(); + let mut index = 0; + let mut links = self.links.clone(); links.sort_by_key(|link| *link.symbols.start()); for &symbol in &alphabet.symbols { while links.len() > index && *links[index].symbols.end() < symbol { @@ -93,29 +92,20 @@ impl State { impl From> for State { /// Creates a state with epsilon links. fn from(vec:Vec) -> Self { - let epsilon_links = vec.iter().cloned().map(|id| Id { id }).collect(); - State { - epsilon_links, - ..Default::default() - } + let epsilon_links = vec.iter().cloned().map(|id| Id{id}).collect(); + State {epsilon_links,..Default::default()} } } impl From, usize)>> for State { /// Creates a state with ordinary links. fn from(vec:Vec<(RangeInclusive, usize)>) -> Self { - let link = |(range, id):(RangeInclusive, usize)| { - let start = Symbol { val:*range.start() }; - let end = Symbol { val:*range.end() }; - Link { - symbols:start..=end, - target:Id { id }, - } + let link = |(range, id): (RangeInclusive, usize)| { + let start = Symbol{val:*range.start()}; + let end = Symbol{val:*range.end()}; + Link {symbols: start..=end, target: Id{ id }} }; let links = vec.iter().cloned().map(link).collect(); - State { - links, - ..Default::default() - } + State {links,..Default::default()} } } diff --git a/parser/flexer/src/data.rs b/parser/flexer/src/data.rs index a2bb4e66f3..8df7015f12 100644 --- a/parser/flexer/src/data.rs +++ b/parser/flexer/src/data.rs @@ -1,3 +1,3 @@ //! Generic datastructures, with multiple usecases. 
-pub mod matrix; +pub mod matrix; \ No newline at end of file diff --git a/parser/flexer/src/data/matrix.rs b/parser/flexer/src/data/matrix.rs index f1964b48e2..19bc96418a 100644 --- a/parser/flexer/src/data/matrix.rs +++ b/parser/flexer/src/data/matrix.rs @@ -3,46 +3,45 @@ use std::ops::Index; use std::ops::IndexMut; + + // ============ // == Matrix == // ============ /// Efficient 2D matrix implemented on top of vector. -#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[derive(Clone,Debug,Default,PartialEq,Eq)] pub struct Matrix { /// The number of rows in matrix. - rows:usize, + rows: usize, /// The number of columns in matrix. - columns:usize, + columns: usize, /// Matrix implemented with vector. - matrix:Vec, + matrix: Vec, } -impl Index<(usize, usize)> for Matrix { + +impl Index<(usize,usize)> for Matrix { type Output = T; - fn index(&self, index:(usize, usize)) -> &T { - &self.matrix[index.0 * self.columns + index.1] + fn index(&self, index:(usize,usize)) -> &T { + &self.matrix[index.0*self.columns+index.1] } } -impl IndexMut<(usize, usize)> for Matrix { - fn index_mut(&mut self, index:(usize, usize)) -> &mut T { - &mut self.matrix[index.0 * self.columns + index.1] +impl IndexMut<(usize,usize)> for Matrix { + fn index_mut(&mut self, index:(usize,usize)) -> &mut T { + &mut self.matrix[index.0*self.columns+index.1] } } impl Matrix { /// Constructs a new matrix for given number of rows and columns. pub fn new(rows:usize, columns:usize) -> Self { - let mut matrix = Vec::with_capacity(rows * columns); + let mut matrix = Vec::with_capacity(rows*columns); for _ in 0..matrix.capacity() { matrix.push(Default::default()) } - Self { - rows, - columns, - matrix, - } + Self{rows,columns,matrix} } /// Adds a new row to matrix, filled with default values. 
@@ -52,4 +51,4 @@ impl Matrix { } self.rows += 1; } -} +} \ No newline at end of file diff --git a/parser/flexer/src/group.rs b/parser/flexer/src/group.rs index 6e89c080fc..f26674d46d 100644 --- a/parser/flexer/src/group.rs +++ b/parser/flexer/src/group.rs @@ -1,62 +1,57 @@ -//! This module exports API for grouping multiple rules (Rust callbacks with -//! regex pattern) together. +//! This module exports API for grouping multiple rules (Rust callbacks with regex pattern) together. -use crate::automata::nfa::NFA; use crate::automata::pattern::Pattern; +use crate::automata::nfa::NFA; use crate::group::rule::Rule; use itertools::Itertools; pub mod rule; + + // =========== // == Group == // =========== -/// Struct that group rules together. It also inherits rules from parent group -/// (if it has one). Groups are the basic building block of flexer: -/// Flexer internally keeps a stack of groups, only one of them active at a -/// time. Each group contains set of regex patterns and callbacks (together -/// called `Rule`). Whenever a rule.pattern from active group is matched with -/// part of input the associated rule.callback is executed, which in turn may -/// exit the current groupor enter a new one. This allows us to nicely model a -/// situation, where certain part of program (like a string literal) should have -/// very different parsing rules than other (for example body of function). Note -/// that the input is first matched with first added rule, then with the second -/// etc. Therefore, if two rules overlap, only the callback of the first added -/// rule will be executed. -#[derive(Clone, Debug, Default)] +/// Struct that group rules together. It also inherits rules from parent group (if it has one). +/// Groups are the basic building block of flexer: +/// Flexer internally keeps a stack of groups, only one of them active at a time. +/// Each group contains set of regex patterns and callbacks (together called `Rule`). 
+/// Whenever a rule.pattern from active group is matched with part of input the associated +/// rule.callback is executed, which in turn may exit the current group or enter a new one. +/// This allows us to nicely model a situation, where certain part of program (like a string literal) +/// should have very different parsing rules than other (for example body of function). +/// Note that the input is first matched with first added rule, then with the second etc. +/// Therefore, if two rules overlap, only the callback of the first added rule will be executed. +#[derive(Clone,Debug,Default)] pub struct Group { /// Unique ID. - pub id:usize, + pub id: usize, /// Custom name which is used for debugging. - pub name:String, + pub name: String, /// Parent which we inherit rules from. - pub parent:Option>, + pub parent: Option>, /// Set of regex patterns with associated callbacks. - pub rules:Vec, + pub rules: Vec, } impl Group { /// Adds new rule (regex pattern with associated callback) to group. - pub fn add_rule(&mut self, rule:Rule) { self.rules.push(rule) } + pub fn add_rule(&mut self, rule:Rule) { + self.rules.push(rule) + } /// Returns rule builder for given pattern. /// TODO[jv] better describe it's purpose once we agree on correct API. - pub fn rule( - &mut self, - pattern:Pattern, - ) -> rule::Builder { - rule::Builder { - pattern, - callback:move |rule| self.add_rule(rule), - } + pub fn rule(&mut self, pattern:Pattern) -> rule::Builder { + rule::Builder{pattern,callback:move |rule| self.add_rule(rule)} } /// All rules including parent rules. pub fn rules(&self) -> Vec<&Rule> { let mut parent = &self.parent; - let mut rules = (&self.rules).iter().collect_vec(); + let mut rules = (&self.rules).iter().collect_vec(); while let Some(state) = parent { rules.extend((&state.rules).iter()); parent = &state.parent; @@ -66,7 +61,7 @@ impl Group { /// Canonical name of given rule. 
fn callback_name(&self, rule_ix:usize) -> String { - format!("group{}_rule{}", self.id, rule_ix) + format!("group{}_rule{}",self.id,rule_ix) } } @@ -75,10 +70,10 @@ impl From<&Group> for NFA { /// Algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI fn from(group:&Group) -> Self { let mut nfa = NFA::default(); - let start = nfa.new_state(); - let build = |rule:&Rule| nfa.new_pattern(start, &rule.pattern); - let states = group.rules().into_iter().map(build).collect_vec(); - let end = nfa.new_state(); + let start = nfa.new_state(); + let build = |rule:&Rule| nfa.new_pattern(start,&rule.pattern); + let states = group.rules().into_iter().map(build).collect_vec(); + let end = nfa.new_state(); for (ix, state) in states.into_iter().enumerate() { nfa.states[state.id].name = Some(group.callback_name(ix)); nfa.connect(state, end); @@ -87,6 +82,7 @@ impl From<&Group> for NFA { } } + // ============= // === Tests === // ============= @@ -99,75 +95,56 @@ pub mod tests { use crate::automata::nfa; use crate::automata::nfa::NFA; use crate::automata::pattern::Pattern; - use crate::group::rule::Rule; use crate::group::Group; + use crate::group::rule::Rule; use std::default::Default; use test::Bencher; fn newline() -> Group { - let pattern = Pattern::char('\n'); - let mut group = Group::default(); + let pattern = Pattern::char('\n'); + let mut group = Group::default(); - group.add_rule(Rule { - pattern, - callback:"".into(), - }); + group.add_rule(Rule{pattern,callback:"".into()}); group } fn letter() -> Group { - let pattern = Pattern::range('a'..='z'); - let mut group = Group::default(); + let pattern = Pattern::range('a'..='z'); + let mut group = Group::default(); - group.add_rule(Rule { - pattern, - callback:"".into(), - }); + group.add_rule(Rule{pattern,callback:"".into()}); group } fn spaces() -> Group { - let pattern = Pattern::char(' ').many1(); - let mut group = Group::default(); + let pattern = Pattern::char(' ').many1(); + let mut group = Group::default(); - 
group.add_rule(Rule { - pattern, - callback:"".into(), - }); + group.add_rule(Rule{pattern,callback:"".into()}); group } fn letter_and_spaces() -> Group { - let letter = Pattern::range('a'..='z'); - let spaces = Pattern::char(' ').many1(); - let mut group = Group::default(); + let letter = Pattern::range('a'..='z'); + let spaces = Pattern::char(' ').many1(); + let mut group = Group::default(); - group.add_rule(Rule { - pattern:letter, - callback:"".into(), - }); - group.add_rule(Rule { - pattern:spaces, - callback:"".into(), - }); + group.add_rule(Rule{pattern:letter,callback:"".into()}); + group.add_rule(Rule{pattern:spaces,callback:"".into()}); group } fn hundred_rules() -> Group { - let pattern = - Pattern::all("The quick brown fox jumps over the lazy dog!!"); - let mut group = Group::default(); + let pattern = Pattern::all("The quick brown fox jumps over the lazy dog!!"); + let mut group = Group::default(); for _ in 0..100 { - group.add_rule(Rule { - pattern:pattern.clone(), - callback:"".into(), - }) + group.add_rule(Rule{pattern:pattern.clone(),callback:"".into()}) } group } @@ -189,10 +166,7 @@ pub mod tests { #[test] fn test_to_nfa_letter_and_spaces() { - assert_eq!( - NFA::from(&letter_and_spaces()), - nfa::tests::letter_and_spaces() - ); + assert_eq!(NFA::from(&letter_and_spaces()), nfa::tests::letter_and_spaces()); } #[bench] diff --git a/parser/flexer/src/group/rule.rs b/parser/flexer/src/group/rule.rs index d5f90e1083..006b374d80 100644 --- a/parser/flexer/src/group/rule.rs +++ b/parser/flexer/src/group/rule.rs @@ -2,38 +2,37 @@ //! use crate::automata::pattern::Pattern; + + // ========== // == Rule == // ========== /// A rule is a pair of regex pattern and callback. /// The intention is to run the callback after encountering given pattern. -#[derive(Clone, Debug)] +#[derive(Clone,Debug)] pub struct Rule { /// Pattern that triggers the callback. - pub pattern:Pattern, + pub pattern: Pattern, /// Callback containing stringified Rust code. 
- pub callback:String, + pub callback: String, } /// Builder that allows us to add `Rule` to `Group` in a nice way. -/// It is possible this structure won't be useful in rust, since borrow checker -/// will likely influence the final API of rule construction. -#[derive(Clone, Debug)] +/// It is possible this structure won't be useful in rust, since borrow checker will likely influence +/// the final API of rule construction. +#[derive(Clone,Debug)] pub struct Builder { /// Pattern that triggers the callback. - pub pattern:Pattern, + pub pattern: Pattern, /// Callback containing a closure. - pub callback:Callback, + pub callback: Callback, } impl Builder { /// Feeds the input that triggered regex pattern to callback. - pub fn run(&mut self, program:String) { - let rule = Rule { - pattern:self.pattern.clone(), - callback:program, - }; + pub fn run(&mut self, program:String){ + let rule = Rule {pattern:self.pattern.clone(),callback:program}; (self.callback)(rule); } } diff --git a/parser/flexer/src/lib.rs b/parser/flexer/src/lib.rs index b73d02cb7c..00b59515ba 100644 --- a/parser/flexer/src/lib.rs +++ b/parser/flexer/src/lib.rs @@ -8,10 +8,10 @@ #![warn(unsafe_code)] #![warn(unused_import_braces)] -//! This module exports simple parser based on Deterministic Finite State -//! Automata for regular grammars (anything parsable with regex patterns). +//! This module exports simple parser based on Deterministic Finite State Automata for regular +//! grammars (anything parsable with regex patterns). pub mod automata; -pub mod data; pub mod group; pub mod parser; +pub mod data; diff --git a/parser/flexer/src/parser.rs b/parser/flexer/src/parser.rs index 9c4213ff02..585174634e 100644 --- a/parser/flexer/src/parser.rs +++ b/parser/flexer/src/parser.rs @@ -1,15 +1,14 @@ -//! The entry point of flexer. It (is going to) contain API for parsing an input -//! string based on group of regex patterns. +//! The entry point of flexer. 
It (is going to) contain API for parsing an input string based on +//! group of regex patterns. use crate::automata::state::Symbol; + + // ============ // == Parser == // ============ /// End Of File - This symbol is inserted at the end of each parser input. -/// We can use the maximum value of u32, because no `char` (unicode scalar) can -/// hold this value. -pub const EOF_CODE:Symbol = Symbol { - val:u32::max_value(), -}; +/// We can use the maximum value of u32, because no `char` (unicode scalar) can hold this value. +pub const EOF_CODE:Symbol = Symbol{val:u32::max_value()};