Prepare the repo for working on rust code (#841)

This commit is contained in:
Ara Adkins 2020-06-16 17:18:11 +01:00 committed by GitHub
parent 2f404b7f08
commit f7d4ef546a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 1482 additions and 38 deletions

4
.cargo/config Normal file
View File

@ -0,0 +1,4 @@
[build]
target-dir = "target/rust/"
rustflags = []

View File

@ -14,5 +14,5 @@
Please include the following checklist in your PR:
- [ ] The documentation has been updated if necessary.
- [ ] All code conforms to the [Scala](https://github.com/luna/enso/blob/main/docs/style-guide/scala.md) and [Java](https://github.com/luna/enso/blob/main/docs/style-guide/java.md) style guides.
- [ ] All code conforms to the [Scala](https://github.com/luna/enso/blob/main/docs/style-guide/scala.md), [Java](https://github.com/luna/enso/blob/main/docs/style-guide/java.md), and [Rust](https://github.com/luna/enso/blob/main/docs/style-guide/rust.md) style guides.
- [ ] All code has been tested where possible.

View File

@ -204,7 +204,7 @@ branches:
required_status_checks:
# Require branches to be up to date before merging.
strict: true
contexts: ["Test (macOS-latest)", "Test (ubuntu-latest)", "license/cla"]
contexts: ["Test Engine (macOS-latest)", "Test Engine (ubuntu-latest)", "Test Parser (macOS-latest)", "Test Parser (ubuntu-latest)", " Test Parser (windows-latest) ", "license/cla"]
enforce_admins: null
restrictions: null

55
.github/workflows/rust.yml vendored Normal file
View File

@ -0,0 +1,55 @@
name: Parser CI
on:
push:
branches: [ main ]
pull_request:
branches: [ "*" ]
env:
wasmpackVersion: 0.8.1
jobs:
test:
name: Test Parser
runs-on: ${{ matrix.os }}
timeout-minutes: 10
strategy:
matrix:
os: [macOS-latest, ubuntu-latest, windows-latest]
fail-fast: false
steps:
- name: Checkout Parser Sources
uses: actions/checkout@v2
# Install Tooling
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
toolchain: nightly-2020-06-09
override: true
- name: Install wasm-pack
uses: actions-rs/cargo@v1
with:
command: install
args: wasm-pack --version ${{ env.wasmpackVersion }}
# Caches
- name: Cache Cargo Registry
uses: actions/cache@v2
with:
path: ~/.cargo/registry
key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.toml') }}
restore-keys: ${{ runner.os }}-cargo-registry
- name: Cache Cargo Test
uses: actions/cache@v2
with:
path: ./target/rust
key: ${{ runner.os }}-cargo-build-${{ hashFiles('**/Cargo.toml') }}
restore-keys: ${{ runner.os }}-cargo-build
# Tests
- name: Test Parser
uses: actions-rs/cargo@v1
with:
command: test

View File

@ -1,4 +1,4 @@
name: Enso CI
name: Engine CI
on:
push:
@ -27,7 +27,7 @@ env:
jobs:
# This job is responsible for testing the codebase
test:
name: Test
name: Test Engine
runs-on: ${{ matrix.os }}
timeout-minutes: 30
strategy:
@ -93,7 +93,7 @@ jobs:
# This job is responsible for building the artifacts
build:
name: Build
name: Build Engine
runs-on: ubuntu-latest
timeout-minutes: 30
steps:

40
.gitignore vendored
View File

@ -5,15 +5,9 @@
graal_dumps
##########
## Java ##
##########
*.class
###########
## Scala ##
###########
#########
## JVM ##
#########
graal_dumps/
target/
@ -25,6 +19,8 @@ target/
##########
Cargo.lock
**/*.rs.bk
wasm-pack.log
#############
## Haskell ##
@ -32,28 +28,12 @@ Cargo.lock
dist
cabal-dev
*.o
*.hi
*.chi
*.chs.h
*.dyn_o
*.dyn_hi
.hpc
.hsenv
.cabal-sandbox/
cabal.sandbox.config
*.cabal
*.prof
*.aux
*.hp
*.DS_Store
.stack-work/
############
## System ##
############
# OSX
.DS_Store
############
@ -70,6 +50,7 @@ cabal.sandbox.config
######################
.idea/
.vscode/
*.swp
.projections.json
@ -83,6 +64,7 @@ scaladoc/
#######################
## Benchmark Reports ##
#######################
bench-report.xml
##############
@ -97,10 +79,4 @@ bench-report.xml
#########
.editorconfig
.bloop
#########
## NPM ##
#########
node_modules/
.bloop/

33
.rustfmt.toml Normal file
View File

@ -0,0 +1,33 @@
# General Configuration
unstable_features = true
max_width = 80
error_on_line_overflow = true
newline_style = "Unix"
# Operators
binop_separator = "Front"
# Whitespace
blank_lines_upper_bound = 1
# Code Layout
brace_style = "SameLineWhere"
combine_control_expr = true
empty_item_single_line = true
fn_single_line = true
format_strings = true
inline_attribute_width = 80
space_before_colon = false
space_after_colon = false
type_punctuation_density = "Wide"
# Comments
comment_width = 80
wrap_comments = true
format_code_in_doc_comments = true
normalize_comments = true
# Macros
format_macro_matchers = true
format_macro_bodies = true

25
Cargo.toml Normal file
View File

@ -0,0 +1,25 @@
[workspace]
members = [
"parser/flexer"
]
[profile.dev]
opt-level = 0
lto = false
debug = true
[profile.release]
opt-level = 3
lto = true
debug = false
[profile.bench]
opt-level = 3
lto = true
debug = false
[profile.test]
opt-level = 0
lto = false
debug = true

View File

@ -43,4 +43,4 @@ It is broken up into categories as follows:
- [**Syntax:**](./syntax) A specification of Enso's syntax.
- [**Types:**](./types) A specification of Enso's type system and type theory.
- [**Debugger:**](./debugger) A specification of Enso's debugger.
- [**Parser:**](./parser) Design and specification of the Enso parser.

25
docs/parser/README.md Normal file
View File

@ -0,0 +1,25 @@
---
layout: docs-index
title: Enso's Parser
category: summary
tags: [parser, readme]
order: 0
---
# Enso's Parser
The parser is one of the most crucial components of the Enso runtime in that
_all_ code that a user writes must be parsed. This means that a good parser is
fast, responsive, and lightweight; it shouldn't baulk at having thousands of
lines of code thrown at it.
Enso's parser, however, is very special. In order to support interactive use it
has to narrow down the scope of a syntax error as much as possible, while still
providing useful output for the compiler around the rest of the parse errors.
This feature makes it more complex than many common parsers, so making this work
while still preserving performance is of paramount importance.
The various components of the parser's design and architecture are described
below:
- [**Tech Analysis:**](./tech-analysis.md) A brief overview of the reasons for
the implementation technologies for the parser.

View File

@ -0,0 +1,68 @@
---
layout: developer-doc
title: Technology Analysis
category: syntax
tags: [parser, tech-analysis]
order: 1
---
# Parser Technology Analysis
As the Enso parser has some fairly unique requirements placed upon it, the
choice of implementation technology is of paramount importance. Choosing the
correct technology ensures that we can meet all of the requirements placed upon
the parser.
<!-- MarkdownTOC levels="2,3" autolink="true" -->
- [Technology Requirements for the Parser](#technology-requirements-for-the-parser)
- [Issues With the Previous Implementation](#issues-with-the-previous-implementation)
- [Choosing Rust](#choosing-rust)
- [Downsides of Rust](#downsides-of-rust)
<!-- /MarkdownTOC -->
## Technology Requirements for the Parser
As the parser has to work both for the Engine and for the IDE, it has a strange
set of requirements:
- The implementation language must be able to run on native platforms, as well
as in the browser via WASM (not JavaScript due to the marshalling overhead).
- The implementation language should permit _excellent_ native performance on
both native and web platforms, by giving implementers fine-grained control
over memory usage.
- The implementation language must be able to target all primary platforms:
macOS, Linux and Windows.
## Issues With the Previous Implementation
The previous implementation of the parser was implemented in Scala, and had some
serious issues that have necessitated this rewrite:
- **Performance:** The structures used to implement the parser proved inherently
difficult for a JIT to optimise, making performance far worse than expected on
the JVM.
- **ScalaJS Sub-Optimal Code Generation:** The JavaScript generated by ScalaJS
was very suboptimal for these structures, making the parser _even_ slower when
run in the browser.
- **JS as a Browser Target:** To transfer textual data between WASM and JS
incurs a significant marshalling overhead. As the IDE primarily works with
textual operations under the hood, this proved to be a significant slowdown.
## Choosing Rust
Rust, then, is an obvious choice for the following reasons:
- It can be compiled _natively_ into the IDE binary, providing them with
excellent performance.
- As a native language it can use JNI to directly create JVM objects on the JVM
heap, for use by the compiler.
- As a native language it can be called directly via JNI.
- There is potential in the future for employing Graal's LLVM bitcode
interpreter to execute the parser safely in a non-native context.
### Downsides of Rust
This is not to say that choosing Rust doesn't come with some compromises:
- It significantly complicates the CI pipeline for the engine, as we will have
to build native artefacts for use by the runtime itself.
- As a non-JVM language, the complexity of working with it from Scala and Java
is increased. We will need to maintain a full definition of the AST in Scala
to permit the compiler to work properly with it.

27
parser/flexer/Cargo.toml Normal file
View File

@ -0,0 +1,27 @@
[package]
name = "flexer"
version = "0.0.1"
authors = [
"Enso Team <enso-dev@enso.org>",
"Ara Adkins <ara.adkins@enso.org"
]
edition = "2018"
description = "A finite-automata-based lexing engine."
readme = "README.md"
homepage = "https://github.com/luna/enso"
repository = "https://github.com/luna/enso"
license-file = "../../LICENSE"
keywords = ["lexer", "finite-automata"]
publish = false
[lib]
name = "flexer"
crate-type = ["dylib", "rlib"]
test = true
bench = true
[dependencies]
itertools = "0.8"

4
parser/flexer/README.md Normal file
View File

@ -0,0 +1,4 @@
# Flexer
This library provides a finite-automata-based lexing engine that can flexibly
tokenize an input stream.

1
parser/flexer/build.rs Normal file
View File

@ -0,0 +1 @@
fn main() {}

View File

@ -0,0 +1,8 @@
//! Exports the API for construction of Nondeterministic and Deterministic
//! Finite State Automata.
pub mod alphabet;
pub mod dfa;
pub mod nfa;
pub mod pattern;
pub mod state;

View File

@ -0,0 +1,61 @@
//! Exports an alphabet (set of all valid input symbols) for Finite State
//! Automata (NFA and DFA).
use crate::automata::state::Symbol;
use std::collections::BTreeSet;
use std::ops::RangeInclusive;
// ================
// === Alphabet ===
// ================
/// An alphabet describes the set of all valid input symbols that a given
/// finite state automaton (NFA or DFA) can operate over.
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
///
/// The alphabet is represented as a sorted set of interval division points:
/// all symbols between two neighbouring entries behave identically with
/// respect to state transitions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Alphabet {
/// The interval of all valid input symbols. The interval is further
/// divided into subintervals (i.e. `[a,z,A,Z]` should be understood as
/// `[a..=z,z..=A,A..=Z]`), in order to efficiently encode state
/// transitions that trigger not just on one but a whole range of symbols
/// (i.e. `a..=z`)
pub symbols:BTreeSet<Symbol>,
}
impl Default for Alphabet {
fn default() -> Self {
Alphabet {
symbols:[Symbol { val:0 }].iter().cloned().collect(),
}
}
}
impl Alphabet {
    /// Inserts a range of symbols into the alphabet.
    ///
    /// Each symbol range is associated with a transition in the automaton.
    /// We therefore record the first symbol covered by the new transition
    /// and the first symbol just past it, so that every division point in
    /// the alphabet corresponds to a unique set of transitions.
    pub fn insert(&mut self, range:RangeInclusive<Symbol>) {
        let covered = Symbol { val:range.start().val };
        let past_end = Symbol { val:range.end().val + 1 };
        self.symbols.insert(covered);
        self.symbols.insert(past_end);
    }
}
impl From<Vec<u32>> for Alphabet {
    /// Builds an alphabet from raw division-point values, keeping the
    /// default division at 0.
    fn from(vec:Vec<u32>) -> Self {
        let mut alphabet = Self::default();
        alphabet.symbols.extend(vec.into_iter().map(|val| Symbol { val }));
        alphabet
    }
}

View File

@ -0,0 +1,156 @@
//! Exports the structure for Deterministic Finite Automata.
use crate::automata::alphabet::Alphabet;
use crate::automata::state;
use crate::data::matrix::Matrix;
// =====================================
// === Deterministic Finite Automata ===
// =====================================
/// Function callback for an arbitrary state of finite automata.
/// It contains the name of the Rust procedure that is meant to be executed
/// after encountering a pattern (declared in `group::Rule.pattern`).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Callback {
/// TODO[jv] Write better explanation after implementing rust code
/// generation. Priority is used during rust code generation.
pub priority:usize,
/// Name of Rust method that will be called when executing this callback.
pub name:String,
}
/// DFA automata with a set of symbols, states and transitions.
/// Deterministic Finite Automata is a finite-state machine that accepts or
/// rejects a given sequence of symbols, by running through a state sequence
/// uniquely determined by the input symbol sequence.
///
/// For example, an automaton that accepts the string "DFA":
///
/// ```text
///  ___          ___          ___          ___
/// | 0 | -'D'-> | 1 | -'F'-> | 2 | -'A'-> | 3 |
///  ‾‾‾          ‾‾‾          ‾‾‾          ‾‾‾
/// ```
///
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct DFA {
/// Finite set of all valid input symbols.
pub alphabet:Alphabet,
/// Transition matrix of deterministic finite state automata.
/// It contains next state for each pair of state and input symbol -
/// (state,symbol) => new state. For example, a transition matrix for
/// automata that accepts string "ABABAB...." would look like this:
/// states
/// | | A | B | <- symbols
/// | 0 | 1 | - |
/// | 1 | - | 0 |
/// Where `-` denotes `state::INVALID`.
pub links:Matrix<state::Id>,
/// Stores callback for each state (if it has one).
pub callbacks:Vec<Option<Callback>>,
}
impl From<Vec<Vec<usize>>> for Matrix<state::Id> {
    /// Builds a transition matrix from nested vectors of raw state ids.
    /// The column count is taken from the first row; all rows are expected
    /// to be at least that long.
    fn from(input:Vec<Vec<usize>>) -> Self {
        let rows = input.len();
        let columns = input.first().map_or(0, |row| row.len());
        let mut matrix = Self::new(rows, columns);
        for (row_ix, row) in input.iter().enumerate() {
            for col_ix in 0..columns {
                matrix[(row_ix, col_ix)] = state::Id { id:row[col_ix] };
            }
        }
        matrix
    }
}
// ===========
// == Tests ==
// ===========
#[cfg(test)]
pub mod tests {
use super::*;
use crate::automata::state;
// Shorthand for the invalid-state id, used to keep the transition-matrix
// literals below readable.
const I:usize = state::INVALID.id;
// In each fixture, every `links` row is a DFA state and every column is an
// alphabet division; `callbacks` is indexed by state.
/// DFA automata that accepts newline '\n'.
pub fn newline() -> DFA {
DFA {
alphabet:Alphabet::from(vec![10, 11]),
links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]),
callbacks:vec![
None,
Some(Callback {
priority:2,
name:"group0_rule0".into(),
}),
],
}
}
/// DFA automata that accepts any letter a..=z.
pub fn letter() -> DFA {
DFA {
alphabet:Alphabet::from(vec![97, 123]),
links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]),
callbacks:vec![
None,
Some(Callback {
priority:2,
name:"group0_rule0".into(),
}),
],
}
}
/// DFA automata that accepts any number of spaces ' '.
pub fn spaces() -> DFA {
DFA {
alphabet:Alphabet::from(vec![0, 32, 33]),
links:Matrix::from(vec![
vec![I, 1, I],
vec![I, 2, I],
vec![I, 2, I],
]),
callbacks:vec![
None,
Some(Callback {
priority:3,
name:"group0_rule0".into(),
}),
Some(Callback {
priority:3,
name:"group0_rule0".into(),
}),
],
}
}
/// DFA automata that accepts one letter a..=z or any many spaces.
pub fn letter_and_spaces() -> DFA {
DFA {
alphabet:Alphabet::from(vec![32, 33, 97, 123]),
links:Matrix::from(vec![
vec![I, 1, I, 2, I],
vec![I, 3, I, I, I],
vec![I, I, I, I, I],
vec![I, 3, I, I, I],
]),
callbacks:vec![
None,
Some(Callback {
priority:4,
name:"group0_rule1".into(),
}),
Some(Callback {
priority:4,
name:"group0_rule0".into(),
}),
Some(Callback {
priority:4,
name:"group0_rule1".into(),
}),
],
}
}
}

View File

@ -0,0 +1,365 @@
//! Implementation of Nondeterministic Finite Automata and its conversion to
//! DFA.
use crate::automata::alphabet::Alphabet;
use crate::automata::dfa::Callback;
use crate::automata::dfa::DFA;
use crate::automata::state;
use crate::automata::state::Link;
use crate::automata::state::State;
use crate::automata::state::Symbol;
use crate::data::matrix::Matrix;
use crate::automata::pattern::Pattern;
use itertools::Itertools;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::ops::RangeInclusive;
// ========================================
// === Nondeterministic Finite Automata ===
// ========================================
/// Type alias for a state Id based on a set of states.
/// It is used during the NFA -> DFA transformation, where multiple NFA
/// states can merge into a single DFA state thanks to epsilon links.
type StateSetId = BTreeSet<state::Id>;
/// NFA automata with a set of symbols, states and transitions.
/// Nondeterministic Finite Automata is a finite-state machine that accepts or
/// rejects a given sequence of symbols.
/// Compared to `DFA`, an NFA can transition into multiple new states without
/// reading any symbol (a so-called epsilon link / transition), e.g.:
/// ___ ___ ___ ___ ___
/// | 0 | -- 'N' --> | 1 | ----> | 2 | -- 'F' --> | 3 | -- 'A' --> | 4 |
/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾
/// More information at: https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct NFA {
/// Finite set of all valid input symbols.
pub alphabet:Alphabet,
/// Set of named NFA states with (epsilon) transitions.
pub states:Vec<State>,
}
impl NFA {
/// Adds a new state to the NFA and returns its Id.
pub fn new_state(&mut self) -> state::Id {
let id = self.states.len();
self.states.push(State::default());
state::Id { id }
}
/// Creates an epsilon transition between two states.
/// Whenever the automata happens to be in `source` state it can
/// immediately move to `target` state (but does not have to).
pub fn connect(&mut self, source:state::Id, target:state::Id) {
self.states[source.id].epsilon_links.push(target);
}
/// Creates an ordinary transition (for a range of symbols) between two
/// states. If any symbol from such range happens to be on input when
/// the automata is in `source` state, it will immediately move to
/// `target` state.
pub fn connect_by(
&mut self,
source:state::Id,
target:state::Id,
symbols:&RangeInclusive<Symbol>,
) {
// Registering the range keeps the alphabet's division points in sync
// with the transitions that exist in the automaton.
self.alphabet.insert(symbols.clone());
self.states[source.id].links.push(Link {
symbols:symbols.clone(),
target,
});
}
/// Transforms a pattern into a sub-NFA attached after `source`, returning
/// the sub-NFA's accepting state.
/// The algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI
/// (Thompson-style construction).
pub fn new_pattern(
&mut self,
source:state::Id,
pattern:&Pattern,
) -> state::Id {
let current = self.new_state();
self.connect(source, current);
match pattern {
// A symbol range becomes a single symbol-triggered transition.
Pattern::Range(range) => {
let state = self.new_state();
self.connect_by(current, state, range);
state
}
// Kleene star: epsilon links allow skipping the body entirely
// (current -> s3) or looping it any number of times (s3 -> s1).
Pattern::Many(body) => {
let s1 = self.new_state();
let s2 = self.new_pattern(s1, body);
let s3 = self.new_state();
self.connect(current, s1);
self.connect(current, s3);
self.connect(s2, s3);
self.connect(s3, s1);
s3
}
// Concatenation: thread the accepting state of each sub-pattern
// into the next one, in order.
Pattern::And(patterns) => patterns
.iter()
.fold(current, |s, pat| self.new_pattern(s, pat)),
// Alternation: build every branch from `current` and join all of
// their accepting states into a fresh common end state.
Pattern::Or(patterns) => {
let states = patterns
.iter()
.map(|pat| self.new_pattern(current, pat))
.collect_vec();
let end = self.new_state();
for state in states {
self.connect(state, end);
}
end
}
}
}
// === NFA -> DFA ===
/// Computes, for every state, the set of states reachable via epsilon
/// links (the epsilon closure, including the state itself).
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao
fn eps_matrix(&self) -> Vec<StateSetId> {
// Recursive DFS over epsilon links. `visited` guards against cycles in
// the current traversal; `computed` memoises closures that are already
// final so they can be reused across starting states.
fn fill_eps_matrix(
nfa:&NFA,
states:&mut Vec<StateSetId>,
computed:&mut Vec<bool>,
visited:&mut Vec<bool>,
state:state::Id,
) {
let mut state_set = StateSetId::new();
let mut circular = false;
visited[state.id] = true;
state_set.insert(state);
for &target in &nfa.states[state.id].epsilon_links {
if !visited[target.id] {
fill_eps_matrix(nfa, states, computed, visited, target);
}
state_set.insert(target);
state_set.extend(states[target.id].iter());
if !computed[target.id] {
circular = true
}
}
// A closure involved in a cycle may be incomplete, so it must not be
// marked as memoised.
if !circular {
computed[state.id] = true
}
states[state.id] = state_set;
}
let mut states = vec![StateSetId::new(); self.states.len()];
let mut computed = vec![false; self.states.len()];
for id in 0..self.states.len() {
let mut visited = vec![false; states.len()];
fill_eps_matrix(
self,
&mut states,
&mut computed,
&mut visited,
state::Id { id },
);
}
states
}
/// Computes a transition matrix (state X symbol => state) for NFA.
/// Ignores epsilon links.
fn nfa_matrix(&self) -> Matrix<state::Id> {
let mut matrix =
Matrix::new(self.states.len(), self.alphabet.symbols.len());
for (state_ix, source) in self.states.iter().enumerate() {
let targets = source.targets(&self.alphabet);
for (voc_ix, &target) in targets.iter().enumerate() {
matrix[(state_ix, voc_ix)] = target;
}
}
matrix
}
}
impl From<&NFA> for DFA {
/// Transforms NFA into DFA using the subset (powerset) construction.
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao
fn from(nfa:&NFA) -> Self {
let nfa_mat = nfa.nfa_matrix();
let eps_mat = nfa.eps_matrix();
let mut dfa_mat = Matrix::new(0, nfa.alphabet.symbols.len());
// Each DFA state corresponds to a set of NFA states (an epsilon
// closure); `dfa_eps_ixs` lists them in discovery order and
// `dfa_eps_map` maps each set back to its DFA state id.
let mut dfa_eps_ixs = Vec::<StateSetId>::new();
let mut dfa_eps_map = HashMap::<StateSetId, state::Id>::new();
// The DFA start state is the epsilon closure of the NFA start state.
dfa_eps_ixs.push(eps_mat[0].clone());
dfa_eps_map.insert(eps_mat[0].clone(), state::Id { id:0 });
// Worklist loop: `dfa_eps_ixs` grows as new state sets are discovered,
// and `i` walks over it until no unprocessed DFA state remains.
let mut i = 0;
while i < dfa_eps_ixs.len() {
dfa_mat.new_row();
for voc_ix in 0..nfa.alphabet.symbols.len() {
// Union of the closures of every NFA state reachable on this
// symbol from the current DFA state's member set.
let mut eps_set = StateSetId::new();
for &eps_ix in &dfa_eps_ixs[i] {
let tgt = nfa_mat[(eps_ix.id, voc_ix)];
if tgt != state::INVALID {
eps_set.extend(eps_mat[tgt.id].iter());
}
}
if !eps_set.is_empty() {
// Reuse the id of an already-seen state set, or allocate a new
// DFA state for it.
dfa_mat[(i, voc_ix)] = match dfa_eps_map.get(&eps_set) {
Some(&id) => id,
None => {
let id = state::Id {
id:dfa_eps_ixs.len(),
};
dfa_eps_ixs.push(eps_set.clone());
dfa_eps_map.insert(eps_set, id);
id
}
};
}
}
i += 1;
}
// Attach a callback to every DFA state whose member set contains a
// named NFA state (the first named member found wins).
let mut callbacks = vec![None; dfa_eps_ixs.len()];
let priority = dfa_eps_ixs.len();
for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() {
let has_name = |&key:&state::Id| nfa.states[key.id].name.is_some();
if let Some(eps) = epss.into_iter().find(has_name) {
let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap();
callbacks[dfa_ix] = Some(Callback {
name:rule,
priority,
});
}
}
DFA {
alphabet:nfa.alphabet.clone(),
links:dfa_mat,
callbacks,
}
}
}
// ===========
// == Tests ==
// ===========
#[cfg(test)]
pub mod tests {
extern crate test;
use crate::automata::dfa;
use super::*;
use test::Bencher;
// In the `states` literals below, `State::from(vec![ids])` builds epsilon
// links, while `State::from(vec![(range, id)])` builds symbol links.
/// NFA automata that accepts newline '\n'.
pub fn newline() -> NFA {
NFA {
states:vec![
State::from(vec![1]),
State::from(vec![(10..=10, 2)]),
State::from(vec![3]).named("group0_rule0"),
State::default(),
],
alphabet:Alphabet::from(vec![10, 11]),
}
}
/// NFA automata that accepts any letter a..=z.
pub fn letter() -> NFA {
NFA {
states:vec![
State::from(vec![1]),
State::from(vec![(97..=122, 2)]),
State::from(vec![3]).named("group0_rule0"),
State::default(),
],
alphabet:Alphabet::from(vec![97, 123]),
}
}
/// NFA automata that accepts any number of spaces ' '.
pub fn spaces() -> NFA {
NFA {
states:vec![
State::from(vec![1]),
State::from(vec![2]),
State::from(vec![(32..=32, 3)]),
State::from(vec![4]),
State::from(vec![5, 8]),
State::from(vec![6]),
State::from(vec![(32..=32, 7)]),
State::from(vec![8]),
State::from(vec![5, 9]).named("group0_rule0"),
State::default(),
],
alphabet:Alphabet::from(vec![0, 32, 33]),
}
}
/// NFA automata that accepts one letter a..=z or many spaces ' '.
pub fn letter_and_spaces() -> NFA {
NFA {
states:vec![
State::from(vec![1, 3]),
State::from(vec![(97..=122, 2)]),
State::from(vec![11]).named("group0_rule0"),
State::from(vec![4]),
State::from(vec![(32..=32, 5)]),
State::from(vec![6]),
State::from(vec![7, 10]),
State::from(vec![8]),
State::from(vec![(32..=32, 9)]),
State::from(vec![10]),
State::from(vec![7, 11]).named("group0_rule1"),
State::default(),
],
alphabet:Alphabet::from(vec![32, 33, 97, 123]),
}
}
// Each conversion test checks the subset construction against the
// hand-written DFA fixtures in `dfa::tests`.
#[test]
fn test_to_dfa_newline() {
assert_eq!(DFA::from(&newline()), dfa::tests::newline());
}
#[test]
fn test_to_dfa_letter() {
assert_eq!(DFA::from(&letter()), dfa::tests::letter());
}
#[test]
fn test_to_dfa_spaces() {
assert_eq!(DFA::from(&spaces()), dfa::tests::spaces());
}
#[test]
fn test_to_dfa_letter_and_spaces() {
assert_eq!(
DFA::from(&letter_and_spaces()),
dfa::tests::letter_and_spaces()
);
}
#[bench]
fn bench_to_dfa_newline(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&newline()))
}
#[bench]
fn bench_to_dfa_letter(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&letter()))
}
#[bench]
fn bench_to_dfa_spaces(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&spaces()))
}
#[bench]
fn bench_to_dfa_letter_and_spaces(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&letter_and_spaces()))
}
}

View File

@ -0,0 +1,164 @@
//! Simple API for constructing regex patterns that are used in parser
//! implementation.
use crate::automata::state::Symbol;
use crate::parser;
use core::iter;
use itertools::Itertools;
use std::ops::BitAnd;
use std::ops::BitOr;
use std::ops::RangeInclusive;
// =============
// == Pattern ==
// =============
/// Simple regex pattern.
#[derive(Clone, Debug)]
pub enum Pattern {
/// Pattern that triggers on any symbol from the given range.
Range(RangeInclusive<Symbol>),
/// Pattern that triggers on any one of the given alternative patterns.
Or(Vec<Pattern>),
/// Pattern that triggers when a sequence of patterns is encountered.
And(Vec<Pattern>),
/// Pattern that triggers on 0..N repetitions of given pattern.
Many(Box<Pattern>),
}
// Brings the variant names into scope so the operators and constructors
// below can use them unqualified.
use Pattern::*;
impl BitOr<Pattern> for Pattern {
    type Output = Pattern;
    /// Builds the alternative of two patterns, flattening nested `Or`
    /// nodes so that `a | b | c` yields a single `Or` with three children.
    fn bitor(self, rhs:Pattern) -> Self::Output {
        match (self, rhs) {
            (Or(mut alts), Or(more)) => {
                alts.extend(more);
                Or(alts)
            }
            (Or(mut alts), other) => {
                alts.push(other);
                Or(alts)
            }
            (other, Or(mut alts)) => {
                alts.push(other);
                Or(alts)
            }
            (first, second) => Or(vec![first, second]),
        }
    }
}
impl BitAnd<Pattern> for Pattern {
    type Output = Pattern;
    /// Builds the concatenation of two patterns, flattening nested `And`
    /// nodes so that `a & b & c` yields a single `And` whose children are
    /// kept in match order.
    fn bitand(self, rhs:Pattern) -> Self::Output {
        match (self, rhs) {
            (And(mut lhs), And(rhs)) => {
                lhs.extend(rhs);
                And(lhs)
            }
            (And(mut lhs), rhs) => {
                lhs.push(rhs);
                And(lhs)
            }
            // `lhs` must be *prepended*: `And` encodes an ordered sequence,
            // so appending it (as `rhs.push(lhs)` did previously) would turn
            // `a & (b & c)` into the sequence `b, c, a` instead of `a, b, c`.
            (lhs, And(mut rhs)) => {
                rhs.insert(0, lhs);
                And(rhs)
            }
            (lhs, rhs) => And(vec![lhs, rhs]),
        }
    }
}
impl Pattern {
/// Pattern that never triggers (encoded as the empty symbol range `1..=0`).
pub fn never() -> Self { Pattern::symbols(1..=0) }
/// Pattern that triggers on any single symbol.
pub fn always() -> Self {
Pattern::symbols(u32::min_value()..=u32::max_value())
}
/// Pattern that triggers on any char.
pub fn any_char() -> Self { Pattern::symbols(0..=u32::max_value()) }
/// Pattern that triggers on 0..N repetitions of given pattern.
pub fn many(self) -> Self { Many(Box::new(self)) }
/// Pattern that triggers on 1..N repetitions of given pattern.
pub fn many1(self) -> Self { self.clone() & self.many() }
/// Pattern that triggers on 0..=1 repetitions of given pattern.
/// NOTE(review): `always()` matches any single symbol rather than the
/// empty string, so this reads as "self, or any one symbol" — confirm
/// this is the intended encoding of optionality.
pub fn opt(self) -> Self { self | Self::always() }
/// Pattern that triggers on the given symbol.
pub fn symbol(symbol:u32) -> Self { Pattern::symbols(symbol..=symbol) }
/// Pattern that triggers on any of the given symbols.
pub fn symbols(symbols:RangeInclusive<u32>) -> Self {
Pattern::Range(
Symbol {
val:*symbols.start(),
}..=Symbol { val:*symbols.end() },
)
}
/// Pattern that triggers on end of file.
pub fn eof() -> Self { Self::symbol(parser::EOF_CODE.val) }
/// Pattern that triggers on given character.
pub fn char(char:char) -> Self { Self::symbol(char as u32) }
/// Pattern that triggers on any character from the given range.
pub fn range(chars:RangeInclusive<char>) -> Self {
Pattern::symbols((*chars.start() as u32)..=(*chars.end() as u32))
}
/// Pattern that triggers when the given sequence of characters is
/// encountered.
pub fn all(chars:&str) -> Self {
chars
.chars()
.fold(Self::never(), |pat, char| pat & Self::char(char))
}
/// Pattern that triggers on any single character from the given sequence.
pub fn any(chars:&str) -> Self {
chars
.chars()
.fold(Self::never(), |pat, char| pat | Self::char(char))
}
/// Pattern that doesn't trigger on any given character from given sequence.
pub fn none(chars:&str) -> Self {
// Sort the excluded code points together with the 0 and u32::MAX
// sentinels, then accept every range that lies between neighbouring
// entries (the excluded points themselves sit on range boundaries).
let max = u32::max_value();
let char_iter = chars.chars().map(|char| char as u32);
let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
let mut codes = char_iter2.collect_vec();
codes.sort();
codes
.iter()
.tuple_windows()
.fold(Self::never(), |pat, (start, end)| {
if end < start {
pat
} else {
pat | Pattern::symbols(*start..=*end)
}
})
}
/// Pattern that triggers on any character but the one given.
pub fn not(char:char) -> Self { Self::none(&char.to_string()) }
/// Pattern that triggers on N repetitions of given pattern.
/// NOTE(review): the fold starts from `always()`, so `repeat(pat, 0)`
/// matches any single symbol rather than the empty string — confirm.
pub fn repeat(pat:Pattern, num:usize) -> Self {
(0..num).fold(Self::always(), |p, _| p & pat.clone())
}
/// Pattern that triggers on MIN..MAX repetitions of given pattern.
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
(min..max).fold(Self::never(), |p, n| p | Self::repeat(pat.clone(), n))
}
}

View File

@ -0,0 +1,121 @@
//! This module exports State implementation for Nondeterministic Finite
//! Automata.
use crate::automata::alphabet::Alphabet;
use crate::automata::state;
use std::ops::RangeInclusive;
// =======================
// == State Of Automata ==
// =======================
/// Flag for the invalid state.
/// When a finite automaton gets into the invalid state, the input sequence
/// of symbols is rejected.
pub const INVALID:Id = Id {
id:usize::max_value(),
};
/// Newtype wrapper for a finite automaton input symbol.
/// The ordering derives allow symbols to be stored in sorted sets
/// (e.g. `Alphabet::symbols`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Symbol {
#[allow(missing_docs)]
pub val:u32,
}
/// Newtype wrapper for a finite automaton state ID (an index into the
/// automaton's state vector).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Id {
#[allow(missing_docs)]
pub id:usize,
}
impl Default for Id {
/// Returns state::INVALID. This is because every finite automaton has an
/// invalid state and because all transitions in the automaton's transition
/// matrix lead to the invalid state by default.
fn default() -> Self { state::INVALID }
}
/// Named NFA state with a set of transitions (links).
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct State {
/// Set of transitions that don't require any symbol to trigger.
/// I.e. if there is an epsilon link from state A to state B, then whenever
/// we are in state A, we can freely move to state B.
pub epsilon_links:Vec<Id>,
/// Set of transitions that trigger with a specific symbol on input.
/// When triggered, the automaton will transition to `link.target`.
pub links:Vec<Link>,
/// Name of the state.
/// Used to autogenerate a call to the Rust method of the same name.
pub name:Option<String>,
}
/// A transition to a new automaton state
/// that requires a specific symbol on the automaton's input to trigger.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Link {
/// Any symbol from the range will trigger this link.
pub symbols:RangeInclusive<Symbol>,
/// The state that is visited after the link is triggered.
pub target:Id,
}
impl State {
/// Updater for field `name`. Returns the updated state.
pub fn named(mut self, name:&str) -> Self {
self.name = Some(name.to_owned());
self
}
/// Returns the transition (next state) for each symbol in the alphabet.
/// NOTE(review): the sweep below assumes link symbol ranges don't overlap
/// — with overlapping links only the earliest-starting one is considered.
pub fn targets(&self, alphabet:&Alphabet) -> Vec<Id> {
let mut targets = vec![];
let mut index = 0;
// Sorting links by range start lets us sweep the (sorted) alphabet and
// the links together with a single advancing index.
let mut links = self.links.clone();
links.sort_by_key(|link| *link.symbols.start());
for &symbol in &alphabet.symbols {
// Skip links whose range ends before the current symbol.
while links.len() > index && *links[index].symbols.end() < symbol {
index += 1;
}
// No link covers `symbol` => transition to the invalid state.
if links.len() <= index || *links[index].symbols.start() > symbol {
targets.push(state::INVALID);
} else {
targets.push(links[index].target);
}
}
targets
}
}
impl From<Vec<usize>> for State {
    /// Creates a state whose only transitions are epsilon links to the
    /// given state ids.
    fn from(vec:Vec<usize>) -> Self {
        let epsilon_links = vec.into_iter().map(|id| Id { id }).collect();
        State { epsilon_links, ..Default::default() }
    }
}
impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
    /// Creates a state with ordinary (symbol-triggered) links, one per
    /// `(symbol range, target id)` pair.
    fn from(vec:Vec<(RangeInclusive<u32>, usize)>) -> Self {
        let mut links = Vec::with_capacity(vec.len());
        for (range, id) in vec {
            let symbols =
                Symbol { val:*range.start() }..=Symbol { val:*range.end() };
            links.push(Link { symbols, target:Id { id } });
        }
        State { links, ..Default::default() }
    }
}

View File

@ -0,0 +1,3 @@
//! Generic datastructures, with multiple usecases.
pub mod matrix;

View File

@ -0,0 +1,55 @@
//! Efficient representation of 2D matrix.
use std::ops::Index;
use std::ops::IndexMut;
// ============
// == Matrix ==
// ============
/// Efficient 2D matrix implemented on top of vector.
///
/// Rows are stored contiguously: element `(row, column)` lives at flat
/// offset `row * columns + column` in the backing vector.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Matrix<T> {
    /// The number of rows in matrix.
    rows:usize,
    /// The number of columns in matrix.
    columns:usize,
    /// Matrix implemented with vector.
    matrix:Vec<T>,
}

impl<T> Index<(usize, usize)> for Matrix<T> {
    type Output = T;
    /// Returns a reference to the element at `(row, column)`.
    /// Note that only the flattened offset is bounds-checked, so an
    /// out-of-range column that still lands inside the backing vector will
    /// silently read from the next row.
    fn index(&self, index:(usize, usize)) -> &T {
        &self.matrix[index.0 * self.columns + index.1]
    }
}

impl<T> IndexMut<(usize, usize)> for Matrix<T> {
    /// Returns a mutable reference to the element at `(row, column)`.
    fn index_mut(&mut self, index:(usize, usize)) -> &mut T {
        &mut self.matrix[index.0 * self.columns + index.1]
    }
}

impl<T:Default> Matrix<T> {
    /// Constructs a new matrix for given number of rows and columns, filled
    /// with default values.
    pub fn new(rows:usize, columns:usize) -> Self {
        let mut matrix = Vec::with_capacity(rows * columns);
        // Fill exactly `rows * columns` cells. The previous implementation
        // iterated over `matrix.capacity()`, which `Vec::with_capacity` only
        // guarantees to be *at least* the requested size — iterating it could
        // overfill the vector and desynchronize it from `rows * columns`.
        matrix.resize_with(rows * columns, Default::default);
        Self {
            rows,
            columns,
            matrix,
        }
    }
    /// Adds a new row to matrix, filled with default values.
    pub fn new_row(&mut self) {
        let new_len = self.matrix.len() + self.columns;
        self.matrix.resize_with(new_len, Default::default);
        self.rows += 1;
    }
}

222
parser/flexer/src/group.rs Normal file
View File

@ -0,0 +1,222 @@
//! This module exports API for grouping multiple rules (Rust callbacks with
//! regex pattern) together.
use crate::automata::nfa::NFA;
use crate::automata::pattern::Pattern;
use crate::group::rule::Rule;
use itertools::Itertools;
pub mod rule;
// ===========
// == Group ==
// ===========
/// Struct that groups rules together. It also inherits rules from parent group
/// (if it has one). Groups are the basic building block of flexer:
/// Flexer internally keeps a stack of groups, only one of them active at a
/// time. Each group contains a set of regex patterns and callbacks (together
/// called `Rule`). Whenever a rule.pattern from the active group is matched
/// with part of the input, the associated rule.callback is executed, which in
/// turn may exit the current group or enter a new one. This allows us to
/// nicely model a situation where a certain part of a program (like a string
/// literal) should have very different parsing rules than others (for example
/// the body of a function). Note that the input is first matched with the
/// first added rule, then with the second etc. Therefore, if two rules
/// overlap, only the callback of the first added rule will be executed.
#[derive(Clone, Debug, Default)]
pub struct Group {
    /// Unique ID.
    pub id:usize,
    /// Custom name which is used for debugging.
    pub name:String,
    /// Parent which we inherit rules from.
    pub parent:Option<Box<Group>>,
    /// Set of regex patterns with associated callbacks.
    pub rules:Vec<Rule>,
}
impl Group {
    /// Adds new rule (regex pattern with associated callback) to group.
    pub fn add_rule(&mut self, rule:Rule) { self.rules.push(rule) }
    /// Returns rule builder for given pattern.
    /// TODO[jv] better describe its purpose once we agree on correct API.
    pub fn rule(
        &mut self,
        pattern:Pattern,
    ) -> rule::Builder<impl FnMut(Rule) + '_> {
        rule::Builder {
            pattern,
            callback:move |rule| self.add_rule(rule),
        }
    }
    /// All rules including parent rules.
    ///
    /// Own rules come first, followed by the parent's (and its ancestors'),
    /// matching the "first added rule wins" precedence documented on `Group`.
    pub fn rules(&self) -> Vec<&Rule> {
        let mut parent = &self.parent;
        // `self.rules.iter()` — the previous `(&self.rules).iter()` borrows
        // were needless; auto-ref makes them redundant.
        let mut rules = self.rules.iter().collect_vec();
        while let Some(state) = parent {
            rules.extend(state.rules.iter());
            parent = &state.parent;
        }
        rules
    }
    /// Canonical name of given rule, used to name the generated Rust
    /// callback method (stored in `State::name`).
    fn callback_name(&self, rule_ix:usize) -> String {
        format!("group{}_rule{}", self.id, rule_ix)
    }
}
impl From<&Group> for NFA {
    /// Transforms Group to NFA.
    /// Algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI
    fn from(group:&Group) -> Self {
        let mut nfa = NFA::default();
        let start = nfa.new_state();
        // One NFA branch per rule, each starting from the shared start state.
        let states:Vec<_> = group
            .rules()
            .into_iter()
            .map(|rule| nfa.new_pattern(start, &rule.pattern))
            .collect();
        let end = nfa.new_state();
        // Name each branch's final state after its rule's callback and join
        // all branches into the shared end state.
        for (ix, state) in states.into_iter().enumerate() {
            nfa.states[state.id].name = Some(group.callback_name(ix));
            nfa.connect(state, end);
        }
        nfa
    }
}
// =============
// === Tests ===
// =============
#[cfg(test)]
pub mod tests {
    extern crate test;
    use crate::automata::dfa::DFA;
    use crate::automata::nfa;
    use crate::automata::nfa::NFA;
    use crate::automata::pattern::Pattern;
    use crate::group::rule::Rule;
    use crate::group::Group;
    use std::default::Default;
    use test::Bencher;
    /// Group with a single rule matching the newline character.
    /// All helpers here use an empty callback string, since only the shape of
    /// the generated automata is compared in these tests.
    fn newline() -> Group {
        let pattern = Pattern::char('\n');
        let mut group = Group::default();
        group.add_rule(Rule {
            pattern,
            callback:"".into(),
        });
        group
    }
    /// Group with a single rule matching one character in 'a'..='z'.
    fn letter() -> Group {
        let pattern = Pattern::range('a'..='z');
        let mut group = Group::default();
        group.add_rule(Rule {
            pattern,
            callback:"".into(),
        });
        group
    }
    /// Group with a single rule matching one-or-more space characters.
    fn spaces() -> Group {
        let pattern = Pattern::char(' ').many1();
        let mut group = Group::default();
        group.add_rule(Rule {
            pattern,
            callback:"".into(),
        });
        group
    }
    /// Group with two rules (letter first, then spaces), exercising the
    /// multi-rule NFA construction.
    fn letter_and_spaces() -> Group {
        let letter = Pattern::range('a'..='z');
        let spaces = Pattern::char(' ').many1();
        let mut group = Group::default();
        group.add_rule(Rule {
            pattern:letter,
            callback:"".into(),
        });
        group.add_rule(Rule {
            pattern:spaces,
            callback:"".into(),
        });
        group
    }
    /// Group with the same literal pattern repeated as 100 separate rules,
    /// used only to benchmark conversion on a larger input.
    fn hundred_rules() -> Group {
        let pattern =
            Pattern::all("The quick brown fox jumps over the lazy dog!!");
        let mut group = Group::default();
        for _ in 0..100 {
            group.add_rule(Rule {
                pattern:pattern.clone(),
                callback:"".into(),
            })
        }
        group
    }
    // Each test checks that `NFA::from(&group)` matches the reference
    // automaton constructed by hand in `nfa::tests`.
    #[test]
    fn test_to_nfa_newline() {
        assert_eq!(NFA::from(&newline()), nfa::tests::newline());
    }
    #[test]
    fn test_to_nfa_letter() {
        assert_eq!(NFA::from(&letter()), nfa::tests::letter());
    }
    #[test]
    fn test_to_nfa_spaces() {
        assert_eq!(NFA::from(&spaces()), nfa::tests::spaces());
    }
    #[test]
    fn test_to_nfa_letter_and_spaces() {
        assert_eq!(
            NFA::from(&letter_and_spaces()),
            nfa::tests::letter_and_spaces()
        );
    }
    #[bench]
    fn bench_to_nfa_newline(bencher:&mut Bencher) {
        bencher.iter(|| NFA::from(&newline()))
    }
    #[bench]
    fn bench_to_nfa_letter(bencher:&mut Bencher) {
        bencher.iter(|| NFA::from(&letter()))
    }
    #[bench]
    fn bench_to_nfa_spaces(bencher:&mut Bencher) {
        bencher.iter(|| NFA::from(&spaces()))
    }
    #[bench]
    fn bench_to_nfa_letter_and_spaces(bencher:&mut Bencher) {
        bencher.iter(|| NFA::from(&letter_and_spaces()))
    }
    // Benchmarks the full NFA -> DFA pipeline on the 100-rule group.
    #[bench]
    fn bench_hundred_rules(bencher:&mut Bencher) {
        bencher.iter(|| DFA::from(&NFA::from(&hundred_rules())));
    }
}

View File

@ -0,0 +1,39 @@
//! An API for declaring Rust callbacks for encountered regex patterns.
//!
use crate::automata::pattern::Pattern;
// ==========
// == Rule ==
// ==========
/// A rule is a pair of regex pattern and callback.
/// The intention is to run the callback after encountering the given pattern.
#[derive(Clone, Debug)]
pub struct Rule {
    /// Pattern that triggers the callback.
    pub pattern:Pattern,
    /// Callback containing stringified Rust code; it is spliced verbatim into
    /// the generated lexer rather than executed directly.
    pub callback:String,
}
/// Builder that allows us to add a `Rule` to a `Group` in a nice way.
/// It is possible this structure won't be useful in rust, since the borrow
/// checker will likely influence the final API of rule construction.
#[derive(Clone, Debug)]
pub struct Builder<Callback> {
    /// Pattern that triggers the callback.
    pub pattern:Pattern,
    /// Callback containing a closure; invoked by `run` with the built `Rule`.
    pub callback:Callback,
}
impl<F:FnMut(Rule)> Builder<F> {
    /// Builds a `Rule` pairing this builder's pattern with the given
    /// stringified Rust code, then hands it to the stored callback.
    pub fn run(&mut self, program:String) {
        let pattern = self.pattern.clone();
        let rule = Rule {
            callback:program,
            pattern,
        };
        (self.callback)(rule);
    }
}

17
parser/flexer/src/lib.rs Normal file
View File

@ -0,0 +1,17 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This module exports simple parser based on Deterministic Finite State
//! Automata for regular grammars (anything parsable with regex patterns).
pub mod automata;
pub mod data;
pub mod group;
pub mod parser;

View File

@ -0,0 +1,15 @@
//! The entry point of flexer. It (is going to) contain API for parsing an input
//! string based on group of regex patterns.
use crate::automata::state::Symbol;
// ============
// == Parser ==
// ============
/// End Of File - This symbol is inserted at the end of each parser input.
/// We can use the maximum value of u32, because no `char` (unicode scalar) can
/// hold this value (the largest unicode scalar value is 0x10FFFF).
pub const EOF_CODE:Symbol = Symbol {
    val:u32::max_value(),
};