From f7d4ef546aa43a3cb084264bc69a8b9d7643ef95 Mon Sep 17 00:00:00 2001 From: Ara Adkins Date: Tue, 16 Jun 2020 17:18:11 +0100 Subject: [PATCH] Prepare the repo for working on rust code (#841) --- .cargo/config | 4 + .github/PULL_REQUEST_TEMPLATE.md | 2 +- .github/settings.yml | 2 +- .github/workflows/rust.yml | 55 ++++ .github/workflows/scala.yml | 6 +- .gitignore | 40 +-- .rustfmt.toml | 33 +++ Cargo.toml | 25 ++ docs/README.md | 2 +- docs/parser/README.md | 25 ++ docs/parser/tech-analysis.md | 68 +++++ parser/flexer/Cargo.toml | 27 ++ parser/flexer/README.md | 4 + parser/flexer/build.rs | 1 + parser/flexer/src/automata.rs | 8 + parser/flexer/src/automata/alphabet.rs | 61 +++++ parser/flexer/src/automata/dfa.rs | 156 +++++++++++ parser/flexer/src/automata/nfa.rs | 365 +++++++++++++++++++++++++ parser/flexer/src/automata/pattern.rs | 164 +++++++++++ parser/flexer/src/automata/state.rs | 121 ++++++++ parser/flexer/src/data.rs | 3 + parser/flexer/src/data/matrix.rs | 55 ++++ parser/flexer/src/group.rs | 222 +++++++++++++++ parser/flexer/src/group/rule.rs | 39 +++ parser/flexer/src/lib.rs | 17 ++ parser/flexer/src/parser.rs | 15 + 26 files changed, 1482 insertions(+), 38 deletions(-) create mode 100644 .cargo/config create mode 100644 .github/workflows/rust.yml create mode 100644 .rustfmt.toml create mode 100644 Cargo.toml create mode 100644 docs/parser/README.md create mode 100644 docs/parser/tech-analysis.md create mode 100644 parser/flexer/Cargo.toml create mode 100644 parser/flexer/README.md create mode 100644 parser/flexer/build.rs create mode 100644 parser/flexer/src/automata.rs create mode 100644 parser/flexer/src/automata/alphabet.rs create mode 100644 parser/flexer/src/automata/dfa.rs create mode 100644 parser/flexer/src/automata/nfa.rs create mode 100644 parser/flexer/src/automata/pattern.rs create mode 100644 parser/flexer/src/automata/state.rs create mode 100644 parser/flexer/src/data.rs create mode 100644 parser/flexer/src/data/matrix.rs create mode 
100644 parser/flexer/src/group.rs create mode 100644 parser/flexer/src/group/rule.rs create mode 100644 parser/flexer/src/lib.rs create mode 100644 parser/flexer/src/parser.rs diff --git a/.cargo/config b/.cargo/config new file mode 100644 index 0000000000..c7960922a8 --- /dev/null +++ b/.cargo/config @@ -0,0 +1,4 @@ + +[build] +target-dir = "target/rust/" +rustflags = [] diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index aba880a7b1..143a8d8062 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -14,5 +14,5 @@ Please include the following checklist in your PR: - [ ] The documentation has been updated if necessary. -- [ ] All code conforms to the [Scala](https://github.com/luna/enso/blob/main/docs/style-guide/scala.md) and [Java](https://github.com/luna/enso/blob/main/docs/style-guide/java.md) style guides. +- [ ] All code conforms to the [Scala](https://github.com/luna/enso/blob/main/docs/style-guide/scala.md), [Java](https://github.com/luna/enso/blob/main/docs/style-guide/java.md), and [Rust](https://github.com/luna/enso/blob/main/docs/style-guide/rust.md) style guides. - [ ] All code has been tested where possible. diff --git a/.github/settings.yml b/.github/settings.yml index d265643818..055a73152e 100644 --- a/.github/settings.yml +++ b/.github/settings.yml @@ -204,7 +204,7 @@ branches: required_status_checks: # Require branches to be up to date before merging. 
strict: true - contexts: ["Test (macOS-latest)", "Test (ubuntu-latest)", "license/cla"] + contexts: ["Test Engine (macOS-latest)", "Test Engine (ubuntu-latest)", "Test Parser (macOS-latest)", "Test Parser (ubuntu-latest)", " Test Parser (windows-latest) ", "license/cla"] enforce_admins: null restrictions: null diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000000..5244f1bc7a --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,55 @@ +name: Parser CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ "*" ] + +env: + wasmpackVersion: 0.8.1 + +jobs: + test: + name: Test Parser + runs-on: ${{ matrix.os }} + timeout-minutes: 10 + strategy: + matrix: + os: [macOS-latest, ubuntu-latest, windows-latest] + fail-fast: false + steps: + - name: Checkout Parser Sources + uses: actions/checkout@v2 + + # Install Tooling + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2020-06-09 + override: true + - name: Install wasm-pack + uses: actions-rs/cargo@v1 + with: + command: install + args: wasm-pack --version ${{ env.wasmpackVersion }} + + # Caches + - name: Cache Cargo Registry + uses: actions/cache@v2 + with: + path: ~/.cargo/registry + key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**Cargo.toml') }} + restore-keys: ${{ runner.os }}-cargo-registry + - name: Cache Cargo Test + uses: actions/cache@v2 + with: + path: ./target/rust + key: ${{ runner.os }}-cargo-build-${{ hashFiles('**Cargo.toml') }} + restore-keys: ${{ runner.os }}-cargo-build + + # Tests + - name: Test Parser + uses: actions-rs/cargo@v1 + with: + command: test diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index a79d7d8937..38c5120699 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -1,4 +1,4 @@ -name: Enso CI +name: Engine CI on: push: @@ -27,7 +27,7 @@ env: jobs: # This job is responsible for testing the codebase test: - name: Test + name: Test Engine 
runs-on: ${{ matrix.os }} timeout-minutes: 30 strategy: @@ -93,7 +93,7 @@ jobs: # This job is responsible for building the artifacts build: - name: Build + name: Build Engine runs-on: ubuntu-latest timeout-minutes: 30 steps: diff --git a/.gitignore b/.gitignore index 10afaf8320..6fbcb5fa43 100644 --- a/.gitignore +++ b/.gitignore @@ -5,15 +5,9 @@ graal_dumps -########## -## Java ## -########## - -*.class - -########### -## Scala ## -########### +######### +## JVM ## +######### graal_dumps/ target/ @@ -25,6 +19,8 @@ target/ ########## Cargo.lock +**/*.rs.bk +wasm-pack.log ############# ## Haskell ## @@ -32,28 +28,12 @@ Cargo.lock dist cabal-dev -*.o -*.hi -*.chi -*.chs.h -*.dyn_o -*.dyn_hi -.hpc -.hsenv -.cabal-sandbox/ -cabal.sandbox.config -*.cabal -*.prof -*.aux -*.hp -*.DS_Store .stack-work/ ############ ## System ## ############ -# OSX .DS_Store ############ @@ -70,6 +50,7 @@ cabal.sandbox.config ###################### .idea/ +.vscode/ *.swp .projections.json @@ -83,6 +64,7 @@ scaladoc/ ####################### ## Benchmark Reports ## ####################### + bench-report.xml ############## @@ -97,10 +79,4 @@ bench-report.xml ######### .editorconfig -.bloop - - -######### -## NPM ## -######### -node_modules/ +.bloop/ diff --git a/.rustfmt.toml b/.rustfmt.toml new file mode 100644 index 0000000000..7f37c27ec4 --- /dev/null +++ b/.rustfmt.toml @@ -0,0 +1,33 @@ + +# General Configuration +unstable_features = true +max_width = 80 +error_on_line_overflow = true +newline_style = "Unix" + +# Operators +binop_separator = "Front" + +# Whitespace +blank_lines_upper_bound = 1 + +# Code Layout +brace_style = "SameLineWhere" +combine_control_expr = true +empty_item_single_line = true +fn_single_line = true +format_strings = true +inline_attribute_width = 80 +space_before_colon = false +space_after_colon = false +type_punctuation_density = "Wide" + +# Comments +comment_width = 80 +wrap_comments = true +format_code_in_doc_comments = true +normalize_comments = true + +# Macros 
+format_macro_matchers = true +format_macro_bodies = true diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..47897e224a --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,25 @@ +[workspace] + +members = [ + "parser/flexer" +] + +[profile.dev] +opt-level = 0 +lto = false +debug = true + +[profile.release] +opt-level = 3 +lto = true +debug = false + +[profile.bench] +opt-level = 3 +lto = true +debug = false + +[profile.test] +opt-level = 0 +lto = false +debug = true diff --git a/docs/README.md b/docs/README.md index 4aecb560a1..47a837da0b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -43,4 +43,4 @@ It is broken up into categories as follows: - [**Syntax:**](./syntax) A specification of Enso's syntax. - [**Types:**](./types) A specification of Enso's type system and type theory. - [**Debugger:**](./debugger) A specification of Enso's debugger. - +- [**Parser:**](./parser) Design and specification of the Enso parser. diff --git a/docs/parser/README.md b/docs/parser/README.md new file mode 100644 index 0000000000..ecc046ae32 --- /dev/null +++ b/docs/parser/README.md @@ -0,0 +1,25 @@ +--- +layout: docs-index +title: Enso's Parser +category: summary +tags: [parser, readme] +order: 0 +--- + +# Enso's Parser +The parser is one of the most crucial components of the Enso runtime in that +_all_ code that a user writes must be parsed. This means that a good parser is +fast, responsive, and lightweight; it shouldn't baulk at having thousands of +lines of code thrown at it. + +Enso's parser, however, is very special. In order to support interactive use it +has to narrow down the scope of a syntax error as much as possible, while still +providing useful output for the compiler around the rest of the parse errors. +This feature makes it more complex than many common parsers, so making this work +while still preserving performance is of paramount importance. 
+ +The various components of the parser's design and architecture are described +below: + +- [**Tech Analysis:**](./tech-analysis.md) A brief overview of the reasons for + the implementation technologies for the parser. diff --git a/docs/parser/tech-analysis.md b/docs/parser/tech-analysis.md new file mode 100644 index 0000000000..07ebca6fb7 --- /dev/null +++ b/docs/parser/tech-analysis.md @@ -0,0 +1,68 @@ +--- +layout: developer-doc +title: Technology Analysis +category: syntax +tags: [parser, tech-analysis] +order: 1 +--- + +# Parser Technology Analysis +As the Enso parser has some fairly unique requirements placed upon it, the +choice of implementation technology is of paramount importance. Choosing the +correct technology ensures that we can meet all of the requirements placed upon +the parser. + + + +- [Technology Requirements for the Parser](#technology-requirements-for-the-parser) +- [Issues With the Previous Implementation](#issues-with-the-previous-implementation) +- [Choosing Rust](#choosing-rust) + - [Downsides of Rust](#downsides-of-rust) + + + +## Technology Requirements for the Parser +As the parser has to work both for the Engine and for the IDE, it has a strange +set of requirements: + +- The implementation language must be able to run on native platforms, as well + as in the browser via WASM (not JavaScript due to the marshalling overhead). +- The implementation language should permit _excellent_ native performance on + both native and web platforms, by giving implementers fine-grained control + over memory usage. +- The implementation language must be able to target all primary platforms: + macOS, Linux and Windows. 
+ +## Issues With the Previous Implementation +The previous implementation of the parser was implemented in Scala, and had some +serious issues that have necessitated this rewrite: + +- **Performance:** The structures used to implement the parser proved inherently + difficult for a JIT to optimise, making performance far worse than expected on + the JVM. +- **ScalaJS Sub-Optimal Code Generation:** The JavaScript generated by ScalaJS + was very suboptimal for these structures, making the parser _even_ slower when + run in the browser. +- **JS as a Browser Target:** To transfer textual data between WASM and JS + incurs a significant marshalling overhead. As the IDE primarily works with + textual operations under the hood, this proved to be a significant slowdown. + +## Choosing Rust +Rust, then, is an obvious choice for the following reasons: + +- It can be compiled _natively_ into the IDE binary, providing them with + excellent performance. +- As a native language it can use JNI to directly create JVM objects on the JVM + heap, for use by the compiler. +- As a native language it can be called directly via JNI. +- There is potential in the future for employing Graal's LLVM bitcode + interpreter to execute the parser safely in a non-native context. + +### Downsides of Rust +This is not to say that choosing rust doesn't come with some compromises: + +- It significantly complicates the CI pipeline for the engine, as we will have + to build native artefacts for use by the runtime itself. +- As a non-JVM language, the complexity of working with it from Scala and Java + is increased. We will need to maintain a full definition of the AST in Scala + to permit the compiler to work properly with it. 
diff --git a/parser/flexer/Cargo.toml b/parser/flexer/Cargo.toml new file mode 100644 index 0000000000..9d46d331d2 --- /dev/null +++ b/parser/flexer/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "flexer" +version = "0.0.1" +authors = [ + "Enso Team ", + "Ara Adkins , +} + +impl Default for Alphabet { + fn default() -> Self { + Alphabet { + symbols:[Symbol { val:0 }].iter().cloned().collect(), + } + } +} + +impl Alphabet { + /// Inserts a range of symbols into the alphabet. + pub fn insert(&mut self, range:RangeInclusive) { + // The symbol range is associated with transition in automata. Therefore + // we: Mark the symbol with the new transition. + self.symbols.insert(Symbol { + val:range.start().val, + }); + // Mark the symbol without the new transition. + self.symbols.insert(Symbol { + val:range.end().val + 1, + }); + // This way each symbol in alphabet corresponds to a unique set of + // transitions. + } +} + +impl From> for Alphabet { + fn from(vec:Vec) -> Self { + let mut dict = Self::default(); + for val in vec { + dict.symbols.insert(Symbol { val }); + } + dict + } +} diff --git a/parser/flexer/src/automata/dfa.rs b/parser/flexer/src/automata/dfa.rs new file mode 100644 index 0000000000..a55441a94c --- /dev/null +++ b/parser/flexer/src/automata/dfa.rs @@ -0,0 +1,156 @@ +//! Exports the structure for Deterministic Finite Automata. + +use crate::automata::alphabet::Alphabet; +use crate::automata::state; +use crate::data::matrix::Matrix; + +// ===================================== +// === Deterministic Finite Automata === +// ===================================== + +/// Function callback for an arbitrary state of finite automata. +/// It contains name of Rust procedure that is meant to be executed after +/// encountering a pattern (declared in `group::Rule.pattern`). +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Callback { + /// TODO[jv] Write better explanation after implementing rust code + /// generation. Priority is used during rust code generation. 
+ pub priority:usize, + /// Name of Rust method that will be called when executing this callback. + pub name:String, +} + +/// DFA automata with a set of symbols, states and transitions. +/// Deterministic Finite Automata is a finite-state machine that accepts or +/// rejects a given sequence of symbols, by running through a state sequence +/// uniquely determined by the input symbol sequence. ___ ___ +/// ___ ___ | 0 | -- 'D' --> | 1 | -- 'F' --> | 2 | -- 'A' --> | 3 +/// | ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ +/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct DFA { + /// Finite set of all valid input symbols. + pub alphabet:Alphabet, + /// Transition matrix of deterministic finite state automata. + /// It contains next state for each pair of state and input symbol - + /// (state,symbol) => new state. For example, a transition matrix for + /// automata that accepts string "ABABAB...." would look like this: + /// states + /// | | A | B | <- symbols + /// | 0 | 1 | - | + /// | 1 | - | 0 | + /// Where `-` denotes `state::INVALID`. + pub links:Matrix, + /// Stores callback for each state (if it has one). + pub callbacks:Vec>, +} + +impl From>> for Matrix { + fn from(input:Vec>) -> Self { + let rows = input.len(); + let columns = if rows == 0 { 0 } else { input[0].len() }; + let mut matrix = Self::new(rows, columns); + for row in 0..rows { + for column in 0..columns { + matrix[(row, column)] = state::Id { + id:input[row][column], + }; + } + } + matrix + } +} + +// =========== +// == Tests == +// =========== + +#[cfg(test)] +pub mod tests { + use super::*; + use crate::automata::state; + + const I:usize = state::INVALID.id; + + /// DFA automata that accepts newline '\n'. 
+ pub fn newline() -> DFA { + DFA { + alphabet:Alphabet::from(vec![10, 11]), + links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]), + callbacks:vec![ + None, + Some(Callback { + priority:2, + name:"group0_rule0".into(), + }), + ], + } + } + + /// DFA automata that accepts any letter a..=z. + pub fn letter() -> DFA { + DFA { + alphabet:Alphabet::from(vec![97, 123]), + links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]), + callbacks:vec![ + None, + Some(Callback { + priority:2, + name:"group0_rule0".into(), + }), + ], + } + } + + /// DFA automata that accepts any number of spaces ' '. + pub fn spaces() -> DFA { + DFA { + alphabet:Alphabet::from(vec![0, 32, 33]), + links:Matrix::from(vec![ + vec![I, 1, I], + vec![I, 2, I], + vec![I, 2, I], + ]), + callbacks:vec![ + None, + Some(Callback { + priority:3, + name:"group0_rule0".into(), + }), + Some(Callback { + priority:3, + name:"group0_rule0".into(), + }), + ], + } + } + + /// DFA automata that accepts one letter a..=z or any many spaces. + pub fn letter_and_spaces() -> DFA { + DFA { + alphabet:Alphabet::from(vec![32, 33, 97, 123]), + links:Matrix::from(vec![ + vec![I, 1, I, 2, I], + vec![I, 3, I, I, I], + vec![I, I, I, I, I], + vec![I, 3, I, I, I], + ]), + callbacks:vec![ + None, + Some(Callback { + priority:4, + name:"group0_rule1".into(), + }), + Some(Callback { + priority:4, + name:"group0_rule0".into(), + }), + Some(Callback { + priority:4, + name:"group0_rule1".into(), + }), + ], + } + } +} diff --git a/parser/flexer/src/automata/nfa.rs b/parser/flexer/src/automata/nfa.rs new file mode 100644 index 0000000000..38d9d5c190 --- /dev/null +++ b/parser/flexer/src/automata/nfa.rs @@ -0,0 +1,365 @@ +//! Implementation of Nondeterministic Finite Automata and it's conversion to +//! DFA. 
+ +use crate::automata::alphabet::Alphabet; +use crate::automata::dfa::Callback; +use crate::automata::dfa::DFA; +use crate::automata::state; +use crate::automata::state::Link; +use crate::automata::state::State; +use crate::automata::state::Symbol; +use crate::data::matrix::Matrix; + +use crate::automata::pattern::Pattern; +use itertools::Itertools; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::ops::RangeInclusive; + +// ======================================== +// === Nondeterministic Finite Automata === +// ======================================== + +/// Type alias for a state Id based on set of states. +/// It is used during NFA -> DFA transformation where multiple states can merge +/// together, thanks to epsilon links. +type StateSetId = BTreeSet; + +/// NFA automata with a set of symbols, states and transitions. +/// Nondeterministic Finite Automata is a finite-state machine that accepts or +/// rejects a given sequence of symbols. +/// Compared to `DFA`, NFA can transition into multiple new states without +/// reading any symbol (so called epsilon link / transition), +/// ___ ___ ___ ___ ___ +/// | 0 | -- 'N' --> | 1 | ----> | 2 | -- 'F' --> | 3 | -- 'A' --> | 4 | +/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ +/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct NFA { + /// Finite set of all valid input symbols. + pub alphabet:Alphabet, + /// Set of named NFA states with (epsilon) transitions. + pub states:Vec, +} + +impl NFA { + /// Adds a new state to NFA and returns it's Id. + pub fn new_state(&mut self) -> state::Id { + let id = self.states.len(); + self.states.push(State::default()); + state::Id { id } + } + + /// Creates an epsilon transition between two states. + /// Whenever the automata happens to be in `source` state it can + /// immediatelly move to `target` state (but does not have to). 
+ pub fn connect(&mut self, source:state::Id, target:state::Id) { + self.states[source.id].epsilon_links.push(target); + } + + /// Creates an ordinary transition (for a range of symbols) between two + /// states. If any symbol from such range happens to be on input when + /// the automata is in `source` state, it will immediatelly move to + /// `target` state. + pub fn connect_by( + &mut self, + source:state::Id, + target:state::Id, + symbols:&RangeInclusive, + ) { + self.alphabet.insert(symbols.clone()); + self.states[source.id].links.push(Link { + symbols:symbols.clone(), + target, + }); + } + + /// Transforms pattern to NFA. + /// The algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI + pub fn new_pattern( + &mut self, + source:state::Id, + pattern:&Pattern, + ) -> state::Id { + let current = self.new_state(); + self.connect(source, current); + match pattern { + Pattern::Range(range) => { + let state = self.new_state(); + self.connect_by(current, state, range); + state + } + Pattern::Many(body) => { + let s1 = self.new_state(); + let s2 = self.new_pattern(s1, body); + let s3 = self.new_state(); + self.connect(current, s1); + self.connect(current, s3); + self.connect(s2, s3); + self.connect(s3, s1); + s3 + } + Pattern::And(patterns) => patterns + .iter() + .fold(current, |s, pat| self.new_pattern(s, pat)), + Pattern::Or(patterns) => { + let states = patterns + .iter() + .map(|pat| self.new_pattern(current, pat)) + .collect_vec(); + let end = self.new_state(); + for state in states { + self.connect(state, end); + } + end + } + } + } + + // === NFA -> DFA === + + /// Merges states that are connected by epsilon links. 
+ /// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao + fn eps_matrix(&self) -> Vec { + fn fill_eps_matrix( + nfa:&NFA, + states:&mut Vec, + computed:&mut Vec, + visited:&mut Vec, + state:state::Id, + ) { + let mut state_set = StateSetId::new(); + let mut circular = false; + visited[state.id] = true; + state_set.insert(state); + for &target in &nfa.states[state.id].epsilon_links { + if !visited[target.id] { + fill_eps_matrix(nfa, states, computed, visited, target); + } + state_set.insert(target); + state_set.extend(states[target.id].iter()); + if !computed[target.id] { + circular = true + } + } + if !circular { + computed[state.id] = true + } + states[state.id] = state_set; + } + + let mut states = vec![StateSetId::new(); self.states.len()]; + let mut computed = vec![false; self.states.len()]; + for id in 0..self.states.len() { + let mut visited = vec![false; states.len()]; + fill_eps_matrix( + self, + &mut states, + &mut computed, + &mut visited, + state::Id { id }, + ); + } + states + } + + /// Computes a transition matrix (state X symbol => state) for NFA. + /// Ignores epsilon links. + fn nfa_matrix(&self) -> Matrix { + let mut matrix = + Matrix::new(self.states.len(), self.alphabet.symbols.len()); + + for (state_ix, source) in self.states.iter().enumerate() { + let targets = source.targets(&self.alphabet); + for (voc_ix, &target) in targets.iter().enumerate() { + matrix[(state_ix, voc_ix)] = target; + } + } + matrix + } +} + +impl From<&NFA> for DFA { + /// Transforms NFA into DFA. 
+ /// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao + fn from(nfa:&NFA) -> Self { + let nfa_mat = nfa.nfa_matrix(); + let eps_mat = nfa.eps_matrix(); + let mut dfa_mat = Matrix::new(0, nfa.alphabet.symbols.len()); + let mut dfa_eps_ixs = Vec::::new(); + let mut dfa_eps_map = HashMap::::new(); + + dfa_eps_ixs.push(eps_mat[0].clone()); + dfa_eps_map.insert(eps_mat[0].clone(), state::Id { id:0 }); + + let mut i = 0; + while i < dfa_eps_ixs.len() { + dfa_mat.new_row(); + for voc_ix in 0..nfa.alphabet.symbols.len() { + let mut eps_set = StateSetId::new(); + for &eps_ix in &dfa_eps_ixs[i] { + let tgt = nfa_mat[(eps_ix.id, voc_ix)]; + if tgt != state::INVALID { + eps_set.extend(eps_mat[tgt.id].iter()); + } + } + if !eps_set.is_empty() { + dfa_mat[(i, voc_ix)] = match dfa_eps_map.get(&eps_set) { + Some(&id) => id, + None => { + let id = state::Id { + id:dfa_eps_ixs.len(), + }; + dfa_eps_ixs.push(eps_set.clone()); + dfa_eps_map.insert(eps_set, id); + id + } + }; + } + } + i += 1; + } + + let mut callbacks = vec![None; dfa_eps_ixs.len()]; + let priority = dfa_eps_ixs.len(); + for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() { + let has_name = |&key:&state::Id| nfa.states[key.id].name.is_some(); + if let Some(eps) = epss.into_iter().find(has_name) { + let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap(); + callbacks[dfa_ix] = Some(Callback { + name:rule, + priority, + }); + } + } + + DFA { + alphabet:nfa.alphabet.clone(), + links:dfa_mat, + callbacks, + } + } +} + +// =========== +// == Tests == +// =========== + +#[cfg(test)] +pub mod tests { + extern crate test; + + use crate::automata::dfa; + + use super::*; + use test::Bencher; + + /// NFA automata that accepts newline '\n'. 
+ pub fn newline() -> NFA { + NFA { + states:vec![ + State::from(vec![1]), + State::from(vec![(10..=10, 2)]), + State::from(vec![3]).named("group0_rule0"), + State::default(), + ], + alphabet:Alphabet::from(vec![10, 11]), + } + } + + /// NFA automata that accepts any letter a..=z. + pub fn letter() -> NFA { + NFA { + states:vec![ + State::from(vec![1]), + State::from(vec![(97..=122, 2)]), + State::from(vec![3]).named("group0_rule0"), + State::default(), + ], + alphabet:Alphabet::from(vec![97, 123]), + } + } + + /// NFA automata that accepts any number of spaces ' '. + pub fn spaces() -> NFA { + NFA { + states:vec![ + State::from(vec![1]), + State::from(vec![2]), + State::from(vec![(32..=32, 3)]), + State::from(vec![4]), + State::from(vec![5, 8]), + State::from(vec![6]), + State::from(vec![(32..=32, 7)]), + State::from(vec![8]), + State::from(vec![5, 9]).named("group0_rule0"), + State::default(), + ], + alphabet:Alphabet::from(vec![0, 32, 33]), + } + } + + /// NFA automata that accepts one letter a..=z or many spaces ' '. 
+ pub fn letter_and_spaces() -> NFA { + NFA { + states:vec![ + State::from(vec![1, 3]), + State::from(vec![(97..=122, 2)]), + State::from(vec![11]).named("group0_rule0"), + State::from(vec![4]), + State::from(vec![(32..=32, 5)]), + State::from(vec![6]), + State::from(vec![7, 10]), + State::from(vec![8]), + State::from(vec![(32..=32, 9)]), + State::from(vec![10]), + State::from(vec![7, 11]).named("group0_rule1"), + State::default(), + ], + alphabet:Alphabet::from(vec![32, 33, 97, 123]), + } + } + + #[test] + fn test_to_dfa_newline() { + assert_eq!(DFA::from(&newline()), dfa::tests::newline()); + } + + #[test] + fn test_to_dfa_letter() { + assert_eq!(DFA::from(&letter()), dfa::tests::letter()); + } + + #[test] + fn test_to_dfa_spaces() { + assert_eq!(DFA::from(&spaces()), dfa::tests::spaces()); + } + + #[test] + fn test_to_dfa_letter_and_spaces() { + assert_eq!( + DFA::from(&letter_and_spaces()), + dfa::tests::letter_and_spaces() + ); + } + + #[bench] + fn bench_to_dfa_newline(bencher:&mut Bencher) { + bencher.iter(|| DFA::from(&newline())) + } + + #[bench] + fn bench_to_dfa_letter(bencher:&mut Bencher) { + bencher.iter(|| DFA::from(&letter())) + } + + #[bench] + fn bench_to_dfa_spaces(bencher:&mut Bencher) { + bencher.iter(|| DFA::from(&spaces())) + } + + #[bench] + fn bench_to_dfa_letter_and_spaces(bencher:&mut Bencher) { + bencher.iter(|| DFA::from(&letter_and_spaces())) + } +} diff --git a/parser/flexer/src/automata/pattern.rs b/parser/flexer/src/automata/pattern.rs new file mode 100644 index 0000000000..1bf2fea954 --- /dev/null +++ b/parser/flexer/src/automata/pattern.rs @@ -0,0 +1,164 @@ +//! Simple API for constructing regex patterns that are used in parser +//! implementation. + +use crate::automata::state::Symbol; +use crate::parser; + +use core::iter; +use itertools::Itertools; +use std::ops::BitAnd; +use std::ops::BitOr; +use std::ops::RangeInclusive; + +// ============= +// == Pattern == +// ============= + +/// Simple regex pattern. 
+#[derive(Clone, Debug)] +pub enum Pattern { + /// Pattern that triggers on any symbol from given range. + Range(RangeInclusive), + /// Pattern that triggers on any given pattern from sequence. + Or(Vec), + /// Pattern that triggers when a sequence of patterns is encountered. + And(Vec), + /// Pattern that triggers on 0..N repetitions of given pattern. + Many(Box), +} + +use Pattern::*; + +impl BitOr for Pattern { + type Output = Pattern; + fn bitor(self, rhs:Pattern) -> Self::Output { + match (self, rhs) { + (Or(mut lhs), Or(rhs)) => { + lhs.extend(rhs); + Or(lhs) + } + (Or(mut lhs), rhs) => { + lhs.push(rhs); + Or(lhs) + } + (lhs, Or(mut rhs)) => { + rhs.push(lhs); + Or(rhs) + } + (lhs, rhs) => Or(vec![lhs, rhs]), + } + } +} + +impl BitAnd for Pattern { + type Output = Pattern; + fn bitand(self, rhs:Pattern) -> Self::Output { + match (self, rhs) { + (And(mut lhs), And(rhs)) => { + lhs.extend(rhs); + And(lhs) + } + (And(mut lhs), rhs) => { + lhs.push(rhs); + And(lhs) + } + (lhs, And(mut rhs)) => { + rhs.push(lhs); + And(rhs) + } + (lhs, rhs) => And(vec![lhs, rhs]), + } + } +} + +impl Pattern { + /// Pattern that never triggers. + pub fn never() -> Self { Pattern::symbols(1..=0) } + + /// Pattern that always triggers. + pub fn always() -> Self { + Pattern::symbols(u32::min_value()..=u32::max_value()) + } + + /// Pattern that triggers on any char. + pub fn any_char() -> Self { Pattern::symbols(0..=u32::max_value()) } + + /// Pattern that triggers on 0..N repetitions of given pattern. + pub fn many(self) -> Self { Many(Box::new(self)) } + + /// Pattern that triggers on 1..N repetitions of given pattern. + pub fn many1(self) -> Self { self.clone() & self.many() } + + /// Pattern that triggers on 0..=1 repetitions of given pattern. + pub fn opt(self) -> Self { self | Self::always() } + + /// Pattern that triggers on given symbol + pub fn symbol(symbol:u32) -> Self { Pattern::symbols(symbol..=symbol) } + + /// Pattern that triggers on any of the given symbols. 
+ pub fn symbols(symbols:RangeInclusive) -> Self { + Pattern::Range( + Symbol { + val:*symbols.start(), + }..=Symbol { val:*symbols.end() }, + ) + } + + /// Pattern that triggers on end of file. + pub fn eof() -> Self { Self::symbol(parser::EOF_CODE.val) } + + /// Pattern that triggers on given character. + pub fn char(char:char) -> Self { Self::symbol(char as u32) } + + /// Pattern that triggers on any of the given characters. + pub fn range(chars:RangeInclusive) -> Self { + Pattern::symbols((*chars.start() as u32)..=(*chars.end() as u32)) + } + + /// Pattern that triggers when sequence of characters is encountered. + pub fn all(chars:&str) -> Self { + chars + .chars() + .fold(Self::never(), |pat, char| pat & Self::char(char)) + } + + /// Pattern that triggers on any characters from given sequence. + pub fn any(chars:&str) -> Self { + chars + .chars() + .fold(Self::never(), |pat, char| pat | Self::char(char)) + } + + /// Pattern that doesn't trigger on any given character from given sequence. + pub fn none(chars:&str) -> Self { + let max = u32::max_value(); + let char_iter = chars.chars().map(|char| char as u32); + let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max)); + let mut codes = char_iter2.collect_vec(); + + codes.sort(); + codes + .iter() + .tuple_windows() + .fold(Self::never(), |pat, (start, end)| { + if end < start { + pat + } else { + pat | Pattern::symbols(*start..=*end) + } + }) + } + + /// Pattern that triggers on any character but the one given. + pub fn not(char:char) -> Self { Self::none(&char.to_string()) } + + /// Pattern that triggers on N repetitions of given pattern. + pub fn repeat(pat:Pattern, num:usize) -> Self { + (0..num).fold(Self::always(), |p, _| p & pat.clone()) + } + + /// Pattern that triggers on MIN..MAX repetitions of given pattern. 
+ pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self { + (min..max).fold(Self::never(), |p, n| p | Self::repeat(pat.clone(), n)) + } +} diff --git a/parser/flexer/src/automata/state.rs b/parser/flexer/src/automata/state.rs new file mode 100644 index 0000000000..4bff4ac2fc --- /dev/null +++ b/parser/flexer/src/automata/state.rs @@ -0,0 +1,121 @@ +//! This module exports State implementation for Nondeterministic Finite +//! Automata. + +use crate::automata::alphabet::Alphabet; +use crate::automata::state; + +use std::ops::RangeInclusive; + +// ======================= +// == State Of Automata == +// ======================= + +/// Flag for invalid state. +/// When finite automata gets into invalid state the input sequence of symbols +/// is rejected. +pub const INVALID:Id = Id { + id:usize::max_value(), +}; + +/// Newtype wrapper for finite automata input symbol. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Symbol { + #[allow(missing_docs)] + pub val:u32, +} + +/// Newtype wrapper for finite automata state ID. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Id { + #[allow(missing_docs)] + pub id:usize, +} + +impl Default for Id { + /// Returns state::INVALID. This is because every finite automata has an + /// invalid state and because all transitions in automata transition + /// matrix lead to invalid state by default. + fn default() -> Self { state::INVALID } +} + +/// Named NFA state with a set of transitions (links). +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct State { + /// Set of transitions that don't require any symbol to trigger. + /// I.E. If there is an epsilon link from state A to state B, then whenever + /// we are in state A, we can freely move to state B. + pub epsilon_links:Vec, + /// Set of transitions that trigger with specific symbol on input. + /// When triggered, the automata will transition to the `link.target`. 
+    pub links:Vec<Link>,
+    /// Name of the state.
+    /// We use it to autogenerate a call to Rust method with the same name.
+    pub name:Option<String>,
+}
+
+/// A transition to a new automata state
+/// that requires specific symbol on automata input to trigger.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct Link {
+    /// Any symbol from the range will trigger this link.
+    pub symbols:RangeInclusive<Symbol>,
+    /// A state that is visited, after the link is triggered.
+    pub target:Id,
+}
+
+impl State {
+    /// Updater for field `name`. Returns updated state.
+    pub fn named(mut self, name:&str) -> Self {
+        self.name = Some(name.to_owned());
+        self
+    }
+
+    /// Returns transition (next state) for each symbol in alphabet.
+    pub fn targets(&self, alphabet:&Alphabet) -> Vec<Id> {
+        let mut targets = vec![];
+        let mut index = 0;
+        let mut links = self.links.clone();
+        links.sort_by_key(|link| *link.symbols.start());
+        for &symbol in &alphabet.symbols {
+            while links.len() > index && *links[index].symbols.end() < symbol {
+                index += 1;
+            }
+            if links.len() <= index || *links[index].symbols.start() > symbol {
+                targets.push(state::INVALID);
+            } else {
+                targets.push(links[index].target);
+            }
+        }
+        targets
+    }
+}
+
+impl From<Vec<usize>> for State {
+    /// Creates a state with epsilon links.
+    fn from(vec:Vec<usize>) -> Self {
+        let epsilon_links = vec.iter().cloned().map(|id| Id { id }).collect();
+        State {
+            epsilon_links,
+            ..Default::default()
+        }
+    }
+}
+
+impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
+    /// Creates a state with ordinary links.
+    fn from(vec:Vec<(RangeInclusive<u32>, usize)>) -> Self {
+        let link = |(range, id):(RangeInclusive<u32>, usize)| {
+            let start = Symbol { val:*range.start() };
+            let end = Symbol { val:*range.end() };
+            Link {
+                symbols:start..=end,
+                target:Id { id },
+            }
+        };
+        let links = vec.iter().cloned().map(link).collect();
+        State {
+            links,
+            ..Default::default()
+        }
+    }
+}
diff --git a/parser/flexer/src/data.rs b/parser/flexer/src/data.rs
new file mode 100644
index 0000000000..a2bb4e66f3
--- /dev/null
+++ b/parser/flexer/src/data.rs
@@ -0,0 +1,3 @@
+//! Generic data structures with multiple use cases.
+
+pub mod matrix;
diff --git a/parser/flexer/src/data/matrix.rs b/parser/flexer/src/data/matrix.rs
new file mode 100644
index 0000000000..f1964b48e2
--- /dev/null
+++ b/parser/flexer/src/data/matrix.rs
@@ -0,0 +1,55 @@
+//! Efficient representation of 2D matrix.
+
+use std::ops::Index;
+use std::ops::IndexMut;
+
+// ============
+// == Matrix ==
+// ============
+
+/// Efficient 2D matrix implemented on top of vector.
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
+pub struct Matrix<T> {
+    /// The number of rows in matrix.
+    rows:usize,
+    /// The number of columns in matrix.
+    columns:usize,
+    /// Matrix implemented with vector.
+    matrix:Vec<T>,
+}
+
+impl<T> Index<(usize, usize)> for Matrix<T> {
+    type Output = T;
+    fn index(&self, index:(usize, usize)) -> &T {
+        &self.matrix[index.0 * self.columns + index.1]
+    }
+}
+
+impl<T> IndexMut<(usize, usize)> for Matrix<T> {
+    fn index_mut(&mut self, index:(usize, usize)) -> &mut T {
+        &mut self.matrix[index.0 * self.columns + index.1]
+    }
+}
+
+impl<T:Default> Matrix<T> {
+    /// Constructs a new matrix for given number of rows and columns.
+    pub fn new(rows:usize, columns:usize) -> Self {
+        let mut matrix = Vec::with_capacity(rows * columns);
+        for _ in 0..matrix.capacity() {
+            matrix.push(Default::default())
+        }
+        Self {
+            rows,
+            columns,
+            matrix,
+        }
+    }
+
+    /// Adds a new row to matrix, filled with default values.
+    pub fn new_row(&mut self) {
+        for _ in 0..self.columns {
+            self.matrix.push(Default::default());
+        }
+        self.rows += 1;
+    }
+}
diff --git a/parser/flexer/src/group.rs b/parser/flexer/src/group.rs
new file mode 100644
index 0000000000..6e89c080fc
--- /dev/null
+++ b/parser/flexer/src/group.rs
@@ -0,0 +1,222 @@
+//! This module exports API for grouping multiple rules (Rust callbacks with
+//! regex pattern) together.
+
+use crate::automata::nfa::NFA;
+use crate::automata::pattern::Pattern;
+use crate::group::rule::Rule;
+
+use itertools::Itertools;
+
+pub mod rule;
+
+// ===========
+// == Group ==
+// ===========
+
+/// Struct that groups rules together. It also inherits rules from parent group
+/// (if it has one). Groups are the basic building block of flexer:
+/// Flexer internally keeps a stack of groups, only one of them active at a
+/// time. Each group contains set of regex patterns and callbacks (together
+/// called `Rule`). Whenever a rule.pattern from active group is matched with
+/// part of input, the associated rule.callback is executed, which in turn may
+/// exit the current group or enter a new one. This allows us to nicely model a
+/// situation, where certain part of program (like a string literal) should have
+/// very different parsing rules than other (for example body of function). Note
+/// that the input is first matched with first added rule, then with the second
+/// etc. Therefore, if two rules overlap, only the callback of the first added
+/// rule will be executed.
+#[derive(Clone, Debug, Default)]
+pub struct Group {
+    /// Unique ID.
+    pub id:usize,
+    /// Custom name which is used for debugging.
+    pub name:String,
+    /// Parent which we inherit rules from.
+    pub parent:Option<Box<Group>>,
+    /// Set of regex patterns with associated callbacks.
+    pub rules:Vec<Rule>,
+}
+
+impl Group {
+    /// Adds new rule (regex pattern with associated callback) to group.
+ pub fn add_rule(&mut self, rule:Rule) { self.rules.push(rule) } + + /// Returns rule builder for given pattern. + /// TODO[jv] better describe it's purpose once we agree on correct API. + pub fn rule( + &mut self, + pattern:Pattern, + ) -> rule::Builder { + rule::Builder { + pattern, + callback:move |rule| self.add_rule(rule), + } + } + + /// All rules including parent rules. + pub fn rules(&self) -> Vec<&Rule> { + let mut parent = &self.parent; + let mut rules = (&self.rules).iter().collect_vec(); + while let Some(state) = parent { + rules.extend((&state.rules).iter()); + parent = &state.parent; + } + rules + } + + /// Canonical name of given rule. + fn callback_name(&self, rule_ix:usize) -> String { + format!("group{}_rule{}", self.id, rule_ix) + } +} + +impl From<&Group> for NFA { + /// Transforms Group to NFA. + /// Algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI + fn from(group:&Group) -> Self { + let mut nfa = NFA::default(); + let start = nfa.new_state(); + let build = |rule:&Rule| nfa.new_pattern(start, &rule.pattern); + let states = group.rules().into_iter().map(build).collect_vec(); + let end = nfa.new_state(); + for (ix, state) in states.into_iter().enumerate() { + nfa.states[state.id].name = Some(group.callback_name(ix)); + nfa.connect(state, end); + } + nfa + } +} + +// ============= +// === Tests === +// ============= + +#[cfg(test)] +pub mod tests { + extern crate test; + + use crate::automata::dfa::DFA; + use crate::automata::nfa; + use crate::automata::nfa::NFA; + use crate::automata::pattern::Pattern; + use crate::group::rule::Rule; + use crate::group::Group; + + use std::default::Default; + use test::Bencher; + + fn newline() -> Group { + let pattern = Pattern::char('\n'); + let mut group = Group::default(); + + group.add_rule(Rule { + pattern, + callback:"".into(), + }); + + group + } + + fn letter() -> Group { + let pattern = Pattern::range('a'..='z'); + let mut group = Group::default(); + + group.add_rule(Rule { + pattern, 
+ callback:"".into(), + }); + + group + } + + fn spaces() -> Group { + let pattern = Pattern::char(' ').many1(); + let mut group = Group::default(); + + group.add_rule(Rule { + pattern, + callback:"".into(), + }); + + group + } + + fn letter_and_spaces() -> Group { + let letter = Pattern::range('a'..='z'); + let spaces = Pattern::char(' ').many1(); + let mut group = Group::default(); + + group.add_rule(Rule { + pattern:letter, + callback:"".into(), + }); + group.add_rule(Rule { + pattern:spaces, + callback:"".into(), + }); + + group + } + + fn hundred_rules() -> Group { + let pattern = + Pattern::all("The quick brown fox jumps over the lazy dog!!"); + let mut group = Group::default(); + + for _ in 0..100 { + group.add_rule(Rule { + pattern:pattern.clone(), + callback:"".into(), + }) + } + group + } + + #[test] + fn test_to_nfa_newline() { + assert_eq!(NFA::from(&newline()), nfa::tests::newline()); + } + + #[test] + fn test_to_nfa_letter() { + assert_eq!(NFA::from(&letter()), nfa::tests::letter()); + } + + #[test] + fn test_to_nfa_spaces() { + assert_eq!(NFA::from(&spaces()), nfa::tests::spaces()); + } + + #[test] + fn test_to_nfa_letter_and_spaces() { + assert_eq!( + NFA::from(&letter_and_spaces()), + nfa::tests::letter_and_spaces() + ); + } + + #[bench] + fn bench_to_nfa_newline(bencher:&mut Bencher) { + bencher.iter(|| NFA::from(&newline())) + } + + #[bench] + fn bench_to_nfa_letter(bencher:&mut Bencher) { + bencher.iter(|| NFA::from(&letter())) + } + + #[bench] + fn bench_to_nfa_spaces(bencher:&mut Bencher) { + bencher.iter(|| NFA::from(&spaces())) + } + + #[bench] + fn bench_to_nfa_letter_and_spaces(bencher:&mut Bencher) { + bencher.iter(|| NFA::from(&letter_and_spaces())) + } + + #[bench] + fn bench_hundred_rules(bencher:&mut Bencher) { + bencher.iter(|| DFA::from(&NFA::from(&hundred_rules()))); + } +} diff --git a/parser/flexer/src/group/rule.rs b/parser/flexer/src/group/rule.rs new file mode 100644 index 0000000000..d5f90e1083 --- /dev/null +++ 
b/parser/flexer/src/group/rule.rs
@@ -0,0 +1,39 @@
+//! An API for declaring Rust callbacks for encountered regex patterns.
+//!
+use crate::automata::pattern::Pattern;
+
+// ==========
+// == Rule ==
+// ==========
+
+/// A rule is a pair of regex pattern and callback.
+/// The intention is to run the callback after encountering given pattern.
+#[derive(Clone, Debug)]
+pub struct Rule {
+    /// Pattern that triggers the callback.
+    pub pattern:Pattern,
+    /// Callback containing stringified Rust code.
+    pub callback:String,
+}
+
+/// Builder that allows us to add `Rule` to `Group` in a nice way.
+/// It is possible this structure won't be useful in rust, since borrow checker
+/// will likely influence the final API of rule construction.
+#[derive(Clone, Debug)]
+pub struct Builder<Callback> {
+    /// Pattern that triggers the callback.
+    pub pattern:Pattern,
+    /// Callback containing a closure.
+    pub callback:Callback,
+}
+
+impl<Callback:FnMut(Rule)> Builder<Callback> {
+    /// Feeds the input that triggered regex pattern to callback.
+    pub fn run(&mut self, program:String) {
+        let rule = Rule {
+            pattern:self.pattern.clone(),
+            callback:program,
+        };
+        (self.callback)(rule);
+    }
+}
diff --git a/parser/flexer/src/lib.rs b/parser/flexer/src/lib.rs
new file mode 100644
index 0000000000..b73d02cb7c
--- /dev/null
+++ b/parser/flexer/src/lib.rs
@@ -0,0 +1,17 @@
+#![feature(test)]
+#![deny(unconditional_recursion)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(trivial_casts)]
+#![warn(trivial_numeric_casts)]
+#![warn(unsafe_code)]
+#![warn(unused_import_braces)]
+
+//! This module exports simple parser based on Deterministic Finite State
+//! Automata for regular grammars (anything parsable with regex patterns).
+ +pub mod automata; +pub mod data; +pub mod group; +pub mod parser; diff --git a/parser/flexer/src/parser.rs b/parser/flexer/src/parser.rs new file mode 100644 index 0000000000..9c4213ff02 --- /dev/null +++ b/parser/flexer/src/parser.rs @@ -0,0 +1,15 @@ +//! The entry point of flexer. It (is going to) contain API for parsing an input +//! string based on group of regex patterns. + +use crate::automata::state::Symbol; + +// ============ +// == Parser == +// ============ + +/// End Of File - This symbol is inserted at the end of each parser input. +/// We can use the maximum value of u32, because no `char` (unicode scalar) can +/// hold this value. +pub const EOF_CODE:Symbol = Symbol { + val:u32::max_value(), +};