mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 03:32:23 +03:00
Prepare the repo for working on rust code (#841)
This commit is contained in:
parent
2f404b7f08
commit
f7d4ef546a
4
.cargo/config
Normal file
4
.cargo/config
Normal file
@ -0,0 +1,4 @@
|
||||
|
||||
[build]
|
||||
target-dir = "target/rust/"
|
||||
rustflags = []
|
2
.github/PULL_REQUEST_TEMPLATE.md
vendored
2
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -14,5 +14,5 @@
|
||||
Please include the following checklist in your PR:
|
||||
|
||||
- [ ] The documentation has been updated if necessary.
|
||||
- [ ] All code conforms to the [Scala](https://github.com/luna/enso/blob/main/docs/style-guide/scala.md) and [Java](https://github.com/luna/enso/blob/main/docs/style-guide/java.md) style guides.
|
||||
- [ ] All code conforms to the [Scala](https://github.com/luna/enso/blob/main/docs/style-guide/scala.md), [Java](https://github.com/luna/enso/blob/main/docs/style-guide/java.md), and [Rust](https://github.com/luna/enso/blob/main/docs/style-guide/rust.md) style guides.
|
||||
- [ ] All code has been tested where possible.
|
||||
|
2
.github/settings.yml
vendored
2
.github/settings.yml
vendored
@ -204,7 +204,7 @@ branches:
|
||||
required_status_checks:
|
||||
# Require branches to be up to date before merging.
|
||||
strict: true
|
||||
contexts: ["Test (macOS-latest)", "Test (ubuntu-latest)", "license/cla"]
|
||||
contexts: ["Test Engine (macOS-latest)", "Test Engine (ubuntu-latest)", "Test Parser (macOS-latest)", "Test Parser (ubuntu-latest)", " Test Parser (windows-latest) ", "license/cla"]
|
||||
enforce_admins: null
|
||||
restrictions: null
|
||||
|
||||
|
55
.github/workflows/rust.yml
vendored
Normal file
55
.github/workflows/rust.yml
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
name: Parser CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ "*" ]
|
||||
|
||||
env:
|
||||
wasmpackVersion: 0.8.1
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: Test Parser
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: 10
|
||||
strategy:
|
||||
matrix:
|
||||
os: [macOS-latest, ubuntu-latest, windows-latest]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- name: Checkout Parser Sources
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# Install Tooling
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: nightly-2020-06-09
|
||||
override: true
|
||||
- name: Install wasm-pack
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: install
|
||||
args: wasm-pack --version ${{ env.wasmpackVersion }}
|
||||
|
||||
# Caches
|
||||
- name: Cache Cargo Registry
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ~/.cargo/registry
|
||||
key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**Cargo.toml') }}
|
||||
restore-keys: ${{ runner.os }}-cargo-registry
|
||||
- name: Cache Cargo Test
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ./target/rust
|
||||
key: ${{ runner.os }}-cargo-build-${{ hashFiles('**Cargo.toml') }}
|
||||
restore-keys: ${{ runner.os }}-cargo-build
|
||||
|
||||
# Tests
|
||||
- name: Test Parser
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: test
|
6
.github/workflows/scala.yml
vendored
6
.github/workflows/scala.yml
vendored
@ -1,4 +1,4 @@
|
||||
name: Enso CI
|
||||
name: Engine CI
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -27,7 +27,7 @@ env:
|
||||
jobs:
|
||||
# This job is responsible for testing the codebase
|
||||
test:
|
||||
name: Test
|
||||
name: Test Engine
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
@ -93,7 +93,7 @@ jobs:
|
||||
|
||||
# This job is responsible for building the artifacts
|
||||
build:
|
||||
name: Build
|
||||
name: Build Engine
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
|
40
.gitignore
vendored
40
.gitignore
vendored
@ -5,15 +5,9 @@
|
||||
|
||||
graal_dumps
|
||||
|
||||
##########
|
||||
## Java ##
|
||||
##########
|
||||
|
||||
*.class
|
||||
|
||||
###########
|
||||
## Scala ##
|
||||
###########
|
||||
#########
|
||||
## JVM ##
|
||||
#########
|
||||
|
||||
graal_dumps/
|
||||
target/
|
||||
@ -25,6 +19,8 @@ target/
|
||||
##########
|
||||
|
||||
Cargo.lock
|
||||
**/*.rs.bk
|
||||
wasm-pack.log
|
||||
|
||||
#############
|
||||
## Haskell ##
|
||||
@ -32,28 +28,12 @@ Cargo.lock
|
||||
|
||||
dist
|
||||
cabal-dev
|
||||
*.o
|
||||
*.hi
|
||||
*.chi
|
||||
*.chs.h
|
||||
*.dyn_o
|
||||
*.dyn_hi
|
||||
.hpc
|
||||
.hsenv
|
||||
.cabal-sandbox/
|
||||
cabal.sandbox.config
|
||||
*.cabal
|
||||
*.prof
|
||||
*.aux
|
||||
*.hp
|
||||
*.DS_Store
|
||||
.stack-work/
|
||||
|
||||
############
|
||||
## System ##
|
||||
############
|
||||
|
||||
# OSX
|
||||
.DS_Store
|
||||
|
||||
############
|
||||
@ -70,6 +50,7 @@ cabal.sandbox.config
|
||||
######################
|
||||
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
.projections.json
|
||||
|
||||
@ -83,6 +64,7 @@ scaladoc/
|
||||
#######################
|
||||
## Benchmark Reports ##
|
||||
#######################
|
||||
|
||||
bench-report.xml
|
||||
|
||||
##############
|
||||
@ -97,10 +79,4 @@ bench-report.xml
|
||||
#########
|
||||
|
||||
.editorconfig
|
||||
.bloop
|
||||
|
||||
|
||||
#########
|
||||
## NPM ##
|
||||
#########
|
||||
node_modules/
|
||||
.bloop/
|
||||
|
33
.rustfmt.toml
Normal file
33
.rustfmt.toml
Normal file
@ -0,0 +1,33 @@
|
||||
|
||||
# General Configuration
|
||||
unstable_features = true
|
||||
max_width = 80
|
||||
error_on_line_overflow = true
|
||||
newline_style = "Unix"
|
||||
|
||||
# Operators
|
||||
binop_separator = "Front"
|
||||
|
||||
# Whitespace
|
||||
blank_lines_upper_bound = 1
|
||||
|
||||
# Code Layout
|
||||
brace_style = "SameLineWhere"
|
||||
combine_control_expr = true
|
||||
empty_item_single_line = true
|
||||
fn_single_line = true
|
||||
format_strings = true
|
||||
inline_attribute_width = 80
|
||||
space_before_colon = false
|
||||
space_after_colon = false
|
||||
type_punctuation_density = "Wide"
|
||||
|
||||
# Comments
|
||||
comment_width = 80
|
||||
wrap_comments = true
|
||||
format_code_in_doc_comments = true
|
||||
normalize_comments = true
|
||||
|
||||
# Macros
|
||||
format_macro_matchers = true
|
||||
format_macro_bodies = true
|
25
Cargo.toml
Normal file
25
Cargo.toml
Normal file
@ -0,0 +1,25 @@
|
||||
[workspace]
|
||||
|
||||
members = [
|
||||
"parser/flexer"
|
||||
]
|
||||
|
||||
[profile.dev]
|
||||
opt-level = 0
|
||||
lto = false
|
||||
debug = true
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
lto = true
|
||||
debug = false
|
||||
|
||||
[profile.bench]
|
||||
opt-level = 3
|
||||
lto = true
|
||||
debug = false
|
||||
|
||||
[profile.test]
|
||||
opt-level = 0
|
||||
lto = false
|
||||
debug = true
|
@ -43,4 +43,4 @@ It is broken up into categories as follows:
|
||||
- [**Syntax:**](./syntax) A specification of Enso's syntax.
|
||||
- [**Types:**](./types) A specification of Enso's type system and type theory.
|
||||
- [**Debugger:**](./debugger) A specification of Enso's debugger.
|
||||
|
||||
- [**Parser:**](./parser) Design and specification of the Enso parser.
|
||||
|
25
docs/parser/README.md
Normal file
25
docs/parser/README.md
Normal file
@ -0,0 +1,25 @@
|
||||
---
|
||||
layout: docs-index
|
||||
title: Enso's Parser
|
||||
category: summary
|
||||
tags: [parser, readme]
|
||||
order: 0
|
||||
---
|
||||
|
||||
# Enso's Parser
|
||||
The parser is one of the most crucial components of the Enso runtime in that
|
||||
_all_ code that a user writes must be parsed. This means that a good parser is
|
||||
fast, responsive, and lightweight; it shouldn't baulk at having thousands of
|
||||
lines of code thrown at it.
|
||||
|
||||
Enso's parser, however, is very special. In order to support interactive use it
|
||||
has to narrow down the scope of a syntax error as much as possible, while still
|
||||
providing useful output for the compiler around the rest of the parse errors.
|
||||
This feature makes it more complex than many common parsers, so making this work
|
||||
while still preserving performance is of paramount importance.
|
||||
|
||||
The various components of the parser's design and architecture are described
|
||||
below:
|
||||
|
||||
- [**Tech Analysis:**](./tech-analysis.md) A brief overview of the reasons for
|
||||
the implementation technologies for the parser.
|
68
docs/parser/tech-analysis.md
Normal file
68
docs/parser/tech-analysis.md
Normal file
@ -0,0 +1,68 @@
|
||||
---
|
||||
layout: developer-doc
|
||||
title: Technology Analysis
|
||||
category: syntax
|
||||
tags: [parser, tech-analysis]
|
||||
order: 1
|
||||
---
|
||||
|
||||
# Parser Technology Analysis
|
||||
As the Enso parser has some fairly unique requirements placed upon it, the
|
||||
choice of implementation technology is of paramount importance. Choosing the
|
||||
correct technology ensures that we can meet all of the requirements placed upon
|
||||
the parser.
|
||||
|
||||
<!-- MarkdownTOC levels="2,3" autolink="true" -->
|
||||
|
||||
- [Technology Requirements for the Parser](#technology-requirements-for-the-parser)
|
||||
- [Issues With the Previous Implementation](#issues-with-the-previous-implementation)
|
||||
- [Choosing Rust](#choosing-rust)
|
||||
- [Downsides of Rust](#downsides-of-rust)
|
||||
|
||||
<!-- /MarkdownTOC -->
|
||||
|
||||
## Technology Requirements for the Parser
|
||||
As the parser has to work both for the Engine and for the IDE, it has a strange
|
||||
set of requirements:
|
||||
|
||||
- The implementation language must be able to run on native platforms, as well
|
||||
as in the browser via WASM (not JavaScript due to the marshalling overhead).
|
||||
- The implementation language should permit _excellent_ native performance on
|
||||
both native and web platforms, by giving implementers fine-grained control
|
||||
over memory usage.
|
||||
- The implementation language must be able to target all primary platforms:
|
||||
macOS, Linux and Windows.
|
||||
|
||||
## Issues With the Previous Implementation
|
||||
The previous implementation of the parser was implemented in Scala, and had some
|
||||
serious issues that have necessitated this rewrite:
|
||||
|
||||
- **Performance:** The structures used to implement the parser proved inherently
|
||||
difficult for a JIT to optimise, making performance far worse than expected on
|
||||
the JVM.
|
||||
- **ScalaJS Sub-Optimal Code Generation:** The JavaScript generated by ScalaJS
|
||||
was very suboptimal for these structures, making the parser _even_ slower when
|
||||
run in the browser.
|
||||
- **JS as a Browser Target:** To transfer textual data between WASM and JS
|
||||
incurs a significant marshalling overhead. As the IDE primarily works with
|
||||
textual operations under the hood, this proved to be a significant slowdown.
|
||||
|
||||
## Choosing Rust
|
||||
Rust, then, is an obvious choice for the following reasons:
|
||||
|
||||
- It can be compiled _natively_ into the IDE binary, providing them with
|
||||
excellent performance.
|
||||
- As a native language it can use JNI to directly create JVM objects on the JVM
|
||||
heap, for use by the compiler.
|
||||
- As a native language it can be called directly via JNI.
|
||||
- There is potential in the future for employing Graal's LLVM bitcode
|
||||
interpreter to execute the parser safely in a non-native context.
|
||||
|
||||
### Downsides of Rust
|
||||
This is not to say that choosing rust doesn't come with some compromises:
|
||||
|
||||
- It significantly complicates the CI pipeline for the engine, as we will have
|
||||
to build native artefacts for use by the runtime itself.
|
||||
- As a non-JVM language, the complexity of working with it from Scala and Java
|
||||
is increased. We will need to maintain a full definition of the AST in Scala
|
||||
to permit the compiler to work properly with it.
|
27
parser/flexer/Cargo.toml
Normal file
27
parser/flexer/Cargo.toml
Normal file
@ -0,0 +1,27 @@
|
||||
[package]
|
||||
name = "flexer"
|
||||
version = "0.0.1"
|
||||
authors = [
|
||||
"Enso Team <enso-dev@enso.org>",
|
||||
"Ara Adkins <ara.adkins@enso.org"
|
||||
]
|
||||
edition = "2018"
|
||||
|
||||
description = "A finite-automata-based lexing engine."
|
||||
readme = "README.md"
|
||||
homepage = "https://github.com/luna/enso"
|
||||
repository = "https://github.com/luna/enso"
|
||||
license-file = "../../LICENSE"
|
||||
|
||||
keywords = ["lexer", "finite-automata"]
|
||||
|
||||
publish = false
|
||||
|
||||
[lib]
|
||||
name = "flexer"
|
||||
crate-type = ["dylib", "rlib"]
|
||||
test = true
|
||||
bench = true
|
||||
|
||||
[dependencies]
|
||||
itertools = "0.8"
|
4
parser/flexer/README.md
Normal file
4
parser/flexer/README.md
Normal file
@ -0,0 +1,4 @@
|
||||
# Flexer
|
||||
This library provides a finite-automata-based lexing engine that can flexibly
|
||||
tokenize an input stream.
|
||||
|
1
parser/flexer/build.rs
Normal file
1
parser/flexer/build.rs
Normal file
@ -0,0 +1 @@
|
||||
fn main() {}
|
8
parser/flexer/src/automata.rs
Normal file
8
parser/flexer/src/automata.rs
Normal file
@ -0,0 +1,8 @@
|
||||
//! Exports API for construction of Nondeterminist and Deterministic Finite
|
||||
//! State Automata.
|
||||
|
||||
pub mod alphabet;
|
||||
pub mod dfa;
|
||||
pub mod nfa;
|
||||
pub mod pattern;
|
||||
pub mod state;
|
61
parser/flexer/src/automata/alphabet.rs
Normal file
61
parser/flexer/src/automata/alphabet.rs
Normal file
@ -0,0 +1,61 @@
|
||||
//! Exports an alphabet (set of all valid input symbols) for Finite State
|
||||
//! Automata (NFA and DFA).
|
||||
|
||||
use crate::automata::state::Symbol;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
// ================
|
||||
// === Alphabet ===
|
||||
// ================
|
||||
|
||||
/// An alphabet describes a set of all the valid input symbols that a given
|
||||
/// finite state automata (NFA or DFA) can operate over.
|
||||
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
|
||||
/// The alphabet is meant to be represented as an interval. That is, if `a` and
|
||||
/// `b` are in alphabet, then any symbol from `a..=b` is in alphabet too.
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub struct Alphabet {
|
||||
/// The interval of all valid input symbols. The interval is further
|
||||
/// divided into subintervals (i.e. `[a,z,A,Z]` should be understood as
|
||||
/// `[a..=z,z..=A,A..=Z]`), in order to efficiently encode state
|
||||
/// transitions that trigger not just on one but a whole range of symbols
|
||||
/// (i.e. `a..=z`)
|
||||
pub symbols:BTreeSet<Symbol>,
|
||||
}
|
||||
|
||||
impl Default for Alphabet {
|
||||
fn default() -> Self {
|
||||
Alphabet {
|
||||
symbols:[Symbol { val:0 }].iter().cloned().collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Alphabet {
|
||||
/// Inserts a range of symbols into the alphabet.
|
||||
pub fn insert(&mut self, range:RangeInclusive<Symbol>) {
|
||||
// The symbol range is associated with transition in automata. Therefore
|
||||
// we: Mark the symbol with the new transition.
|
||||
self.symbols.insert(Symbol {
|
||||
val:range.start().val,
|
||||
});
|
||||
// Mark the symbol without the new transition.
|
||||
self.symbols.insert(Symbol {
|
||||
val:range.end().val + 1,
|
||||
});
|
||||
// This way each symbol in alphabet corresponds to a unique set of
|
||||
// transitions.
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u32>> for Alphabet {
|
||||
fn from(vec:Vec<u32>) -> Self {
|
||||
let mut dict = Self::default();
|
||||
for val in vec {
|
||||
dict.symbols.insert(Symbol { val });
|
||||
}
|
||||
dict
|
||||
}
|
||||
}
|
156
parser/flexer/src/automata/dfa.rs
Normal file
156
parser/flexer/src/automata/dfa.rs
Normal file
@ -0,0 +1,156 @@
|
||||
//! Exports the structure for Deterministic Finite Automata.
|
||||
|
||||
use crate::automata::alphabet::Alphabet;
|
||||
use crate::automata::state;
|
||||
use crate::data::matrix::Matrix;
|
||||
|
||||
// =====================================
|
||||
// === Deterministic Finite Automata ===
|
||||
// =====================================
|
||||
|
||||
/// Function callback for an arbitrary state of finite automata.
|
||||
/// It contains name of Rust procedure that is meant to be executed after
|
||||
/// encountering a pattern (declared in `group::Rule.pattern`).
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub struct Callback {
|
||||
/// TODO[jv] Write better explanation after implementing rust code
|
||||
/// generation. Priority is used during rust code generation.
|
||||
pub priority:usize,
|
||||
/// Name of Rust method that will be called when executing this callback.
|
||||
pub name:String,
|
||||
}
|
||||
|
||||
/// DFA automata with a set of symbols, states and transitions.
|
||||
/// Deterministic Finite Automata is a finite-state machine that accepts or
|
||||
/// rejects a given sequence of symbols, by running through a state sequence
|
||||
/// uniquely determined by the input symbol sequence. ___ ___
|
||||
/// ___ ___ | 0 | -- 'D' --> | 1 | -- 'F' --> | 2 | -- 'A' --> | 3
|
||||
/// | ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾
|
||||
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
pub struct DFA {
|
||||
/// Finite set of all valid input symbols.
|
||||
pub alphabet:Alphabet,
|
||||
/// Transition matrix of deterministic finite state automata.
|
||||
/// It contains next state for each pair of state and input symbol -
|
||||
/// (state,symbol) => new state. For example, a transition matrix for
|
||||
/// automata that accepts string "ABABAB...." would look like this:
|
||||
/// states
|
||||
/// | | A | B | <- symbols
|
||||
/// | 0 | 1 | - |
|
||||
/// | 1 | - | 0 |
|
||||
/// Where `-` denotes `state::INVALID`.
|
||||
pub links:Matrix<state::Id>,
|
||||
/// Stores callback for each state (if it has one).
|
||||
pub callbacks:Vec<Option<Callback>>,
|
||||
}
|
||||
|
||||
impl From<Vec<Vec<usize>>> for Matrix<state::Id> {
|
||||
fn from(input:Vec<Vec<usize>>) -> Self {
|
||||
let rows = input.len();
|
||||
let columns = if rows == 0 { 0 } else { input[0].len() };
|
||||
let mut matrix = Self::new(rows, columns);
|
||||
for row in 0..rows {
|
||||
for column in 0..columns {
|
||||
matrix[(row, column)] = state::Id {
|
||||
id:input[row][column],
|
||||
};
|
||||
}
|
||||
}
|
||||
matrix
|
||||
}
|
||||
}
|
||||
|
||||
// ===========
|
||||
// == Tests ==
|
||||
// ===========
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use super::*;
|
||||
use crate::automata::state;
|
||||
|
||||
const I:usize = state::INVALID.id;
|
||||
|
||||
/// DFA automata that accepts newline '\n'.
|
||||
pub fn newline() -> DFA {
|
||||
DFA {
|
||||
alphabet:Alphabet::from(vec![10, 11]),
|
||||
links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]),
|
||||
callbacks:vec![
|
||||
None,
|
||||
Some(Callback {
|
||||
priority:2,
|
||||
name:"group0_rule0".into(),
|
||||
}),
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// DFA automata that accepts any letter a..=z.
|
||||
pub fn letter() -> DFA {
|
||||
DFA {
|
||||
alphabet:Alphabet::from(vec![97, 123]),
|
||||
links:Matrix::from(vec![vec![I, 1, I], vec![I, I, I]]),
|
||||
callbacks:vec![
|
||||
None,
|
||||
Some(Callback {
|
||||
priority:2,
|
||||
name:"group0_rule0".into(),
|
||||
}),
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// DFA automata that accepts any number of spaces ' '.
|
||||
pub fn spaces() -> DFA {
|
||||
DFA {
|
||||
alphabet:Alphabet::from(vec![0, 32, 33]),
|
||||
links:Matrix::from(vec![
|
||||
vec![I, 1, I],
|
||||
vec![I, 2, I],
|
||||
vec![I, 2, I],
|
||||
]),
|
||||
callbacks:vec![
|
||||
None,
|
||||
Some(Callback {
|
||||
priority:3,
|
||||
name:"group0_rule0".into(),
|
||||
}),
|
||||
Some(Callback {
|
||||
priority:3,
|
||||
name:"group0_rule0".into(),
|
||||
}),
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/// DFA automata that accepts one letter a..=z or any many spaces.
|
||||
pub fn letter_and_spaces() -> DFA {
|
||||
DFA {
|
||||
alphabet:Alphabet::from(vec![32, 33, 97, 123]),
|
||||
links:Matrix::from(vec![
|
||||
vec![I, 1, I, 2, I],
|
||||
vec![I, 3, I, I, I],
|
||||
vec![I, I, I, I, I],
|
||||
vec![I, 3, I, I, I],
|
||||
]),
|
||||
callbacks:vec![
|
||||
None,
|
||||
Some(Callback {
|
||||
priority:4,
|
||||
name:"group0_rule1".into(),
|
||||
}),
|
||||
Some(Callback {
|
||||
priority:4,
|
||||
name:"group0_rule0".into(),
|
||||
}),
|
||||
Some(Callback {
|
||||
priority:4,
|
||||
name:"group0_rule1".into(),
|
||||
}),
|
||||
],
|
||||
}
|
||||
}
|
||||
}
|
365
parser/flexer/src/automata/nfa.rs
Normal file
365
parser/flexer/src/automata/nfa.rs
Normal file
@ -0,0 +1,365 @@
|
||||
//! Implementation of Nondeterministic Finite Automata and it's conversion to
|
||||
//! DFA.
|
||||
|
||||
use crate::automata::alphabet::Alphabet;
|
||||
use crate::automata::dfa::Callback;
|
||||
use crate::automata::dfa::DFA;
|
||||
use crate::automata::state;
|
||||
use crate::automata::state::Link;
|
||||
use crate::automata::state::State;
|
||||
use crate::automata::state::Symbol;
|
||||
use crate::data::matrix::Matrix;
|
||||
|
||||
use crate::automata::pattern::Pattern;
|
||||
use itertools::Itertools;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
// ========================================
|
||||
// === Nondeterministic Finite Automata ===
|
||||
// ========================================
|
||||
|
||||
/// Type alias for a state Id based on set of states.
|
||||
/// It is used during NFA -> DFA transformation where multiple states can merge
|
||||
/// together, thanks to epsilon links.
|
||||
type StateSetId = BTreeSet<state::Id>;
|
||||
|
||||
/// NFA automata with a set of symbols, states and transitions.
|
||||
/// Nondeterministic Finite Automata is a finite-state machine that accepts or
|
||||
/// rejects a given sequence of symbols.
|
||||
/// Compared to `DFA`, NFA can transition into multiple new states without
|
||||
/// reading any symbol (so called epsilon link / transition),
|
||||
/// ___ ___ ___ ___ ___
|
||||
/// | 0 | -- 'N' --> | 1 | ----> | 2 | -- 'F' --> | 3 | -- 'A' --> | 4 |
|
||||
/// ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾ ‾‾‾
|
||||
/// More information at: https://en.wikipedia.org/wiki/Deterministic_finite_automaton
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
pub struct NFA {
|
||||
/// Finite set of all valid input symbols.
|
||||
pub alphabet:Alphabet,
|
||||
/// Set of named NFA states with (epsilon) transitions.
|
||||
pub states:Vec<State>,
|
||||
}
|
||||
|
||||
impl NFA {
|
||||
/// Adds a new state to NFA and returns it's Id.
|
||||
pub fn new_state(&mut self) -> state::Id {
|
||||
let id = self.states.len();
|
||||
self.states.push(State::default());
|
||||
state::Id { id }
|
||||
}
|
||||
|
||||
/// Creates an epsilon transition between two states.
|
||||
/// Whenever the automata happens to be in `source` state it can
|
||||
/// immediatelly move to `target` state (but does not have to).
|
||||
pub fn connect(&mut self, source:state::Id, target:state::Id) {
|
||||
self.states[source.id].epsilon_links.push(target);
|
||||
}
|
||||
|
||||
/// Creates an ordinary transition (for a range of symbols) between two
|
||||
/// states. If any symbol from such range happens to be on input when
|
||||
/// the automata is in `source` state, it will immediatelly move to
|
||||
/// `target` state.
|
||||
pub fn connect_by(
|
||||
&mut self,
|
||||
source:state::Id,
|
||||
target:state::Id,
|
||||
symbols:&RangeInclusive<Symbol>,
|
||||
) {
|
||||
self.alphabet.insert(symbols.clone());
|
||||
self.states[source.id].links.push(Link {
|
||||
symbols:symbols.clone(),
|
||||
target,
|
||||
});
|
||||
}
|
||||
|
||||
/// Transforms pattern to NFA.
|
||||
/// The algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI
|
||||
pub fn new_pattern(
|
||||
&mut self,
|
||||
source:state::Id,
|
||||
pattern:&Pattern,
|
||||
) -> state::Id {
|
||||
let current = self.new_state();
|
||||
self.connect(source, current);
|
||||
match pattern {
|
||||
Pattern::Range(range) => {
|
||||
let state = self.new_state();
|
||||
self.connect_by(current, state, range);
|
||||
state
|
||||
}
|
||||
Pattern::Many(body) => {
|
||||
let s1 = self.new_state();
|
||||
let s2 = self.new_pattern(s1, body);
|
||||
let s3 = self.new_state();
|
||||
self.connect(current, s1);
|
||||
self.connect(current, s3);
|
||||
self.connect(s2, s3);
|
||||
self.connect(s3, s1);
|
||||
s3
|
||||
}
|
||||
Pattern::And(patterns) => patterns
|
||||
.iter()
|
||||
.fold(current, |s, pat| self.new_pattern(s, pat)),
|
||||
Pattern::Or(patterns) => {
|
||||
let states = patterns
|
||||
.iter()
|
||||
.map(|pat| self.new_pattern(current, pat))
|
||||
.collect_vec();
|
||||
let end = self.new_state();
|
||||
for state in states {
|
||||
self.connect(state, end);
|
||||
}
|
||||
end
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// === NFA -> DFA ===
|
||||
|
||||
/// Merges states that are connected by epsilon links.
|
||||
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao
|
||||
fn eps_matrix(&self) -> Vec<StateSetId> {
|
||||
fn fill_eps_matrix(
|
||||
nfa:&NFA,
|
||||
states:&mut Vec<StateSetId>,
|
||||
computed:&mut Vec<bool>,
|
||||
visited:&mut Vec<bool>,
|
||||
state:state::Id,
|
||||
) {
|
||||
let mut state_set = StateSetId::new();
|
||||
let mut circular = false;
|
||||
visited[state.id] = true;
|
||||
state_set.insert(state);
|
||||
for &target in &nfa.states[state.id].epsilon_links {
|
||||
if !visited[target.id] {
|
||||
fill_eps_matrix(nfa, states, computed, visited, target);
|
||||
}
|
||||
state_set.insert(target);
|
||||
state_set.extend(states[target.id].iter());
|
||||
if !computed[target.id] {
|
||||
circular = true
|
||||
}
|
||||
}
|
||||
if !circular {
|
||||
computed[state.id] = true
|
||||
}
|
||||
states[state.id] = state_set;
|
||||
}
|
||||
|
||||
let mut states = vec![StateSetId::new(); self.states.len()];
|
||||
let mut computed = vec![false; self.states.len()];
|
||||
for id in 0..self.states.len() {
|
||||
let mut visited = vec![false; states.len()];
|
||||
fill_eps_matrix(
|
||||
self,
|
||||
&mut states,
|
||||
&mut computed,
|
||||
&mut visited,
|
||||
state::Id { id },
|
||||
);
|
||||
}
|
||||
states
|
||||
}
|
||||
|
||||
/// Computes a transition matrix (state X symbol => state) for NFA.
|
||||
/// Ignores epsilon links.
|
||||
fn nfa_matrix(&self) -> Matrix<state::Id> {
|
||||
let mut matrix =
|
||||
Matrix::new(self.states.len(), self.alphabet.symbols.len());
|
||||
|
||||
for (state_ix, source) in self.states.iter().enumerate() {
|
||||
let targets = source.targets(&self.alphabet);
|
||||
for (voc_ix, &target) in targets.iter().enumerate() {
|
||||
matrix[(state_ix, voc_ix)] = target;
|
||||
}
|
||||
}
|
||||
matrix
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&NFA> for DFA {
|
||||
/// Transforms NFA into DFA.
|
||||
/// The algorithm is based on: https://www.youtube.com/watch?v=taClnxU-nao
|
||||
fn from(nfa:&NFA) -> Self {
|
||||
let nfa_mat = nfa.nfa_matrix();
|
||||
let eps_mat = nfa.eps_matrix();
|
||||
let mut dfa_mat = Matrix::new(0, nfa.alphabet.symbols.len());
|
||||
let mut dfa_eps_ixs = Vec::<StateSetId>::new();
|
||||
let mut dfa_eps_map = HashMap::<StateSetId, state::Id>::new();
|
||||
|
||||
dfa_eps_ixs.push(eps_mat[0].clone());
|
||||
dfa_eps_map.insert(eps_mat[0].clone(), state::Id { id:0 });
|
||||
|
||||
let mut i = 0;
|
||||
while i < dfa_eps_ixs.len() {
|
||||
dfa_mat.new_row();
|
||||
for voc_ix in 0..nfa.alphabet.symbols.len() {
|
||||
let mut eps_set = StateSetId::new();
|
||||
for &eps_ix in &dfa_eps_ixs[i] {
|
||||
let tgt = nfa_mat[(eps_ix.id, voc_ix)];
|
||||
if tgt != state::INVALID {
|
||||
eps_set.extend(eps_mat[tgt.id].iter());
|
||||
}
|
||||
}
|
||||
if !eps_set.is_empty() {
|
||||
dfa_mat[(i, voc_ix)] = match dfa_eps_map.get(&eps_set) {
|
||||
Some(&id) => id,
|
||||
None => {
|
||||
let id = state::Id {
|
||||
id:dfa_eps_ixs.len(),
|
||||
};
|
||||
dfa_eps_ixs.push(eps_set.clone());
|
||||
dfa_eps_map.insert(eps_set, id);
|
||||
id
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
let mut callbacks = vec![None; dfa_eps_ixs.len()];
|
||||
let priority = dfa_eps_ixs.len();
|
||||
for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() {
|
||||
let has_name = |&key:&state::Id| nfa.states[key.id].name.is_some();
|
||||
if let Some(eps) = epss.into_iter().find(has_name) {
|
||||
let rule = nfa.states[eps.id].name.as_ref().cloned().unwrap();
|
||||
callbacks[dfa_ix] = Some(Callback {
|
||||
name:rule,
|
||||
priority,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
DFA {
|
||||
alphabet:nfa.alphabet.clone(),
|
||||
links:dfa_mat,
|
||||
callbacks,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ===========
|
||||
// == Tests ==
|
||||
// ===========
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
extern crate test;
|
||||
|
||||
use crate::automata::dfa;
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
|
||||
/// NFA automata that accepts newline '\n'.
|
||||
pub fn newline() -> NFA {
|
||||
NFA {
|
||||
states:vec![
|
||||
State::from(vec![1]),
|
||||
State::from(vec![(10..=10, 2)]),
|
||||
State::from(vec![3]).named("group0_rule0"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet:Alphabet::from(vec![10, 11]),
|
||||
}
|
||||
}
|
||||
|
||||
/// NFA automata that accepts any letter a..=z.
|
||||
pub fn letter() -> NFA {
|
||||
NFA {
|
||||
states:vec![
|
||||
State::from(vec![1]),
|
||||
State::from(vec![(97..=122, 2)]),
|
||||
State::from(vec![3]).named("group0_rule0"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet:Alphabet::from(vec![97, 123]),
|
||||
}
|
||||
}
|
||||
|
||||
/// NFA automata that accepts any number of spaces ' '.
|
||||
pub fn spaces() -> NFA {
|
||||
NFA {
|
||||
states:vec![
|
||||
State::from(vec![1]),
|
||||
State::from(vec![2]),
|
||||
State::from(vec![(32..=32, 3)]),
|
||||
State::from(vec![4]),
|
||||
State::from(vec![5, 8]),
|
||||
State::from(vec![6]),
|
||||
State::from(vec![(32..=32, 7)]),
|
||||
State::from(vec![8]),
|
||||
State::from(vec![5, 9]).named("group0_rule0"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet:Alphabet::from(vec![0, 32, 33]),
|
||||
}
|
||||
}
|
||||
|
||||
/// NFA automata that accepts one letter a..=z or many spaces ' '.
|
||||
pub fn letter_and_spaces() -> NFA {
|
||||
NFA {
|
||||
states:vec![
|
||||
State::from(vec![1, 3]),
|
||||
State::from(vec![(97..=122, 2)]),
|
||||
State::from(vec![11]).named("group0_rule0"),
|
||||
State::from(vec![4]),
|
||||
State::from(vec![(32..=32, 5)]),
|
||||
State::from(vec![6]),
|
||||
State::from(vec![7, 10]),
|
||||
State::from(vec![8]),
|
||||
State::from(vec![(32..=32, 9)]),
|
||||
State::from(vec![10]),
|
||||
State::from(vec![7, 11]).named("group0_rule1"),
|
||||
State::default(),
|
||||
],
|
||||
alphabet:Alphabet::from(vec![32, 33, 97, 123]),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_dfa_newline() {
|
||||
assert_eq!(DFA::from(&newline()), dfa::tests::newline());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_dfa_letter() {
|
||||
assert_eq!(DFA::from(&letter()), dfa::tests::letter());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_dfa_spaces() {
|
||||
assert_eq!(DFA::from(&spaces()), dfa::tests::spaces());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_dfa_letter_and_spaces() {
|
||||
assert_eq!(
|
||||
DFA::from(&letter_and_spaces()),
|
||||
dfa::tests::letter_and_spaces()
|
||||
);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_dfa_newline(bencher:&mut Bencher) {
|
||||
bencher.iter(|| DFA::from(&newline()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_dfa_letter(bencher:&mut Bencher) {
|
||||
bencher.iter(|| DFA::from(&letter()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_dfa_spaces(bencher:&mut Bencher) {
|
||||
bencher.iter(|| DFA::from(&spaces()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_dfa_letter_and_spaces(bencher:&mut Bencher) {
|
||||
bencher.iter(|| DFA::from(&letter_and_spaces()))
|
||||
}
|
||||
}
|
164
parser/flexer/src/automata/pattern.rs
Normal file
164
parser/flexer/src/automata/pattern.rs
Normal file
@ -0,0 +1,164 @@
|
||||
//! Simple API for constructing regex patterns that are used in parser
|
||||
//! implementation.
|
||||
|
||||
use crate::automata::state::Symbol;
|
||||
use crate::parser;
|
||||
|
||||
use core::iter;
|
||||
use itertools::Itertools;
|
||||
use std::ops::BitAnd;
|
||||
use std::ops::BitOr;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
// =============
|
||||
// == Pattern ==
|
||||
// =============
|
||||
|
||||
/// Simple regex pattern.
#[derive(Clone, Debug)]
pub enum Pattern {
    /// Pattern that triggers on any symbol from given range.
    Range(RangeInclusive<Symbol>),
    /// Pattern that triggers on any given pattern from sequence.
    /// (Alternative: matches when at least one subpattern matches.)
    Or(Vec<Pattern>),
    /// Pattern that triggers when a sequence of patterns is encountered.
    /// (Concatenation: the subpatterns must match one after another.)
    And(Vec<Pattern>),
    /// Pattern that triggers on 0..N repetitions of given pattern.
    Many(Box<Pattern>),
}
|
||||
|
||||
use Pattern::*;
|
||||
|
||||
impl BitOr<Pattern> for Pattern {
|
||||
type Output = Pattern;
|
||||
fn bitor(self, rhs:Pattern) -> Self::Output {
|
||||
match (self, rhs) {
|
||||
(Or(mut lhs), Or(rhs)) => {
|
||||
lhs.extend(rhs);
|
||||
Or(lhs)
|
||||
}
|
||||
(Or(mut lhs), rhs) => {
|
||||
lhs.push(rhs);
|
||||
Or(lhs)
|
||||
}
|
||||
(lhs, Or(mut rhs)) => {
|
||||
rhs.push(lhs);
|
||||
Or(rhs)
|
||||
}
|
||||
(lhs, rhs) => Or(vec![lhs, rhs]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BitAnd<Pattern> for Pattern {
|
||||
type Output = Pattern;
|
||||
fn bitand(self, rhs:Pattern) -> Self::Output {
|
||||
match (self, rhs) {
|
||||
(And(mut lhs), And(rhs)) => {
|
||||
lhs.extend(rhs);
|
||||
And(lhs)
|
||||
}
|
||||
(And(mut lhs), rhs) => {
|
||||
lhs.push(rhs);
|
||||
And(lhs)
|
||||
}
|
||||
(lhs, And(mut rhs)) => {
|
||||
rhs.push(lhs);
|
||||
And(rhs)
|
||||
}
|
||||
(lhs, rhs) => And(vec![lhs, rhs]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Pattern {
|
||||
/// Pattern that never triggers.
|
||||
pub fn never() -> Self { Pattern::symbols(1..=0) }
|
||||
|
||||
/// Pattern that always triggers.
|
||||
pub fn always() -> Self {
|
||||
Pattern::symbols(u32::min_value()..=u32::max_value())
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any char.
|
||||
pub fn any_char() -> Self { Pattern::symbols(0..=u32::max_value()) }
|
||||
|
||||
/// Pattern that triggers on 0..N repetitions of given pattern.
|
||||
pub fn many(self) -> Self { Many(Box::new(self)) }
|
||||
|
||||
/// Pattern that triggers on 1..N repetitions of given pattern.
|
||||
pub fn many1(self) -> Self { self.clone() & self.many() }
|
||||
|
||||
/// Pattern that triggers on 0..=1 repetitions of given pattern.
|
||||
pub fn opt(self) -> Self { self | Self::always() }
|
||||
|
||||
/// Pattern that triggers on given symbol
|
||||
pub fn symbol(symbol:u32) -> Self { Pattern::symbols(symbol..=symbol) }
|
||||
|
||||
/// Pattern that triggers on any of the given symbols.
|
||||
pub fn symbols(symbols:RangeInclusive<u32>) -> Self {
|
||||
Pattern::Range(
|
||||
Symbol {
|
||||
val:*symbols.start(),
|
||||
}..=Symbol { val:*symbols.end() },
|
||||
)
|
||||
}
|
||||
|
||||
/// Pattern that triggers on end of file.
|
||||
pub fn eof() -> Self { Self::symbol(parser::EOF_CODE.val) }
|
||||
|
||||
/// Pattern that triggers on given character.
|
||||
pub fn char(char:char) -> Self { Self::symbol(char as u32) }
|
||||
|
||||
/// Pattern that triggers on any of the given characters.
|
||||
pub fn range(chars:RangeInclusive<char>) -> Self {
|
||||
Pattern::symbols((*chars.start() as u32)..=(*chars.end() as u32))
|
||||
}
|
||||
|
||||
/// Pattern that triggers when sequence of characters is encountered.
|
||||
pub fn all(chars:&str) -> Self {
|
||||
chars
|
||||
.chars()
|
||||
.fold(Self::never(), |pat, char| pat & Self::char(char))
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any characters from given sequence.
|
||||
pub fn any(chars:&str) -> Self {
|
||||
chars
|
||||
.chars()
|
||||
.fold(Self::never(), |pat, char| pat | Self::char(char))
|
||||
}
|
||||
|
||||
/// Pattern that doesn't trigger on any given character from given sequence.
|
||||
pub fn none(chars:&str) -> Self {
|
||||
let max = u32::max_value();
|
||||
let char_iter = chars.chars().map(|char| char as u32);
|
||||
let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
|
||||
let mut codes = char_iter2.collect_vec();
|
||||
|
||||
codes.sort();
|
||||
codes
|
||||
.iter()
|
||||
.tuple_windows()
|
||||
.fold(Self::never(), |pat, (start, end)| {
|
||||
if end < start {
|
||||
pat
|
||||
} else {
|
||||
pat | Pattern::symbols(*start..=*end)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Pattern that triggers on any character but the one given.
|
||||
pub fn not(char:char) -> Self { Self::none(&char.to_string()) }
|
||||
|
||||
/// Pattern that triggers on N repetitions of given pattern.
|
||||
pub fn repeat(pat:Pattern, num:usize) -> Self {
|
||||
(0..num).fold(Self::always(), |p, _| p & pat.clone())
|
||||
}
|
||||
|
||||
/// Pattern that triggers on MIN..MAX repetitions of given pattern.
|
||||
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
|
||||
(min..max).fold(Self::never(), |p, n| p | Self::repeat(pat.clone(), n))
|
||||
}
|
||||
}
|
121
parser/flexer/src/automata/state.rs
Normal file
121
parser/flexer/src/automata/state.rs
Normal file
@ -0,0 +1,121 @@
|
||||
//! This module exports State implementation for Nondeterministic Finite
|
||||
//! Automata.
|
||||
|
||||
use crate::automata::alphabet::Alphabet;
|
||||
use crate::automata::state;
|
||||
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
// =======================
|
||||
// == State Of Automata ==
|
||||
// =======================
|
||||
|
||||
/// Flag for invalid state.
/// When finite automata gets into invalid state the input sequence of symbols
/// is rejected.
/// `usize::max_value()` serves as a sentinel ID that no real state receives.
pub const INVALID:Id = Id {
    id:usize::max_value(),
};

/// Newtype wrapper for finite automata input symbol.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Symbol {
    // Raw `u32` value of the symbol; `parser::EOF_CODE` uses
    // `u32::max_value()`, which no `char` can hold.
    #[allow(missing_docs)]
    pub val:u32,
}

/// Newtype wrapper for finite automata state ID.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Id {
    // Index of the state within the automaton's state vector.
    #[allow(missing_docs)]
    pub id:usize,
}

impl Default for Id {
    /// Returns state::INVALID. This is because every finite automata has an
    /// invalid state and because all transitions in automata transition
    /// matrix lead to invalid state by default.
    fn default() -> Self { state::INVALID }
}
|
||||
|
||||
/// Named NFA state with a set of transitions (links).
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct State {
    /// Set of transitions that don't require any symbol to trigger.
    /// I.E. If there is an epsilon link from state A to state B, then whenever
    /// we are in state A, we can freely move to state B.
    pub epsilon_links:Vec<Id>,
    /// Set of transitions that trigger with specific symbol on input.
    /// When triggered, the automata will transition to the `link.target`.
    /// (See `targets` for how these are resolved against an alphabet.)
    pub links:Vec<Link>,
    /// Name of the state.
    /// We use it to autogenerate a call to Rust method with same name.
    /// `None` for anonymous (non-accepting) states.
    pub name:Option<String>,
}

/// A transition to new automata state
/// that requires specific symbol on automata input to trigger.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Link {
    /// Any symbol from the range will trigger this link.
    /// The range is inclusive at both ends.
    pub symbols:RangeInclusive<Symbol>,
    /// A state that is visited, after the link is triggered.
    pub target:Id,
}
|
||||
|
||||
impl State {
|
||||
/// Updater for field `name`. Returns updated state.
|
||||
pub fn named(mut self, name:&str) -> Self {
|
||||
self.name = Some(name.to_owned());
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns transition (next state) for each symbol in alphabet.
|
||||
pub fn targets(&self, alphabet:&Alphabet) -> Vec<Id> {
|
||||
let mut targets = vec![];
|
||||
let mut index = 0;
|
||||
let mut links = self.links.clone();
|
||||
links.sort_by_key(|link| *link.symbols.start());
|
||||
for &symbol in &alphabet.symbols {
|
||||
while links.len() > index && *links[index].symbols.end() < symbol {
|
||||
index += 1;
|
||||
}
|
||||
if links.len() <= index || *links[index].symbols.start() > symbol {
|
||||
targets.push(state::INVALID);
|
||||
} else {
|
||||
targets.push(links[index].target);
|
||||
}
|
||||
}
|
||||
targets
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<usize>> for State {
|
||||
/// Creates a state with epsilon links.
|
||||
fn from(vec:Vec<usize>) -> Self {
|
||||
let epsilon_links = vec.iter().cloned().map(|id| Id { id }).collect();
|
||||
State {
|
||||
epsilon_links,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
|
||||
/// Creates a state with ordinary links.
|
||||
fn from(vec:Vec<(RangeInclusive<u32>, usize)>) -> Self {
|
||||
let link = |(range, id):(RangeInclusive<u32>, usize)| {
|
||||
let start = Symbol { val:*range.start() };
|
||||
let end = Symbol { val:*range.end() };
|
||||
Link {
|
||||
symbols:start..=end,
|
||||
target:Id { id },
|
||||
}
|
||||
};
|
||||
let links = vec.iter().cloned().map(link).collect();
|
||||
State {
|
||||
links,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
3
parser/flexer/src/data.rs
Normal file
3
parser/flexer/src/data.rs
Normal file
@ -0,0 +1,3 @@
|
||||
//! Generic data structures with multiple use cases.
|
||||
|
||||
pub mod matrix;
|
55
parser/flexer/src/data/matrix.rs
Normal file
55
parser/flexer/src/data/matrix.rs
Normal file
@ -0,0 +1,55 @@
|
||||
//! Efficient representation of 2D matrix.
|
||||
|
||||
use std::ops::Index;
|
||||
use std::ops::IndexMut;
|
||||
|
||||
// ============
|
||||
// == Matrix ==
|
||||
// ============
|
||||
|
||||
/// Efficient 2D matrix implemented on top of a flat vector in row-major
/// order: the element at `(row, column)` lives at index
/// `row * columns + column`.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Matrix<T> {
    /// The number of rows in matrix.
    rows:usize,
    /// The number of columns in matrix.
    columns:usize,
    /// Row-major storage of all matrix elements.
    matrix:Vec<T>,
}

impl<T> Index<(usize, usize)> for Matrix<T> {
    type Output = T;

    /// Returns a reference to the element at `(row, column)`.
    /// Panics when the computed flat index falls outside the storage vector.
    fn index(&self, index:(usize, usize)) -> &T {
        &self.matrix[index.0 * self.columns + index.1]
    }
}

impl<T> IndexMut<(usize, usize)> for Matrix<T> {
    /// Returns a mutable reference to the element at `(row, column)`.
    /// Panics when the computed flat index falls outside the storage vector.
    fn index_mut(&mut self, index:(usize, usize)) -> &mut T {
        &mut self.matrix[index.0 * self.columns + index.1]
    }
}

impl<T:Default> Matrix<T> {
    /// Constructs a `rows` x `columns` matrix with every cell set to
    /// `T::default()`.
    pub fn new(rows:usize, columns:usize) -> Self {
        // `resize_with` allocates once and fills in place, replacing the
        // previous manual reserve-and-push loop.
        let mut matrix = Vec::new();
        matrix.resize_with(rows * columns, T::default);
        Self {
            rows,
            columns,
            matrix,
        }
    }

    /// Adds a new row to matrix, filled with default values.
    pub fn new_row(&mut self) {
        self.matrix.resize_with(self.matrix.len() + self.columns, T::default);
        self.rows += 1;
    }
}
|
222
parser/flexer/src/group.rs
Normal file
222
parser/flexer/src/group.rs
Normal file
@ -0,0 +1,222 @@
|
||||
//! This module exports API for grouping multiple rules (Rust callbacks with
|
||||
//! regex pattern) together.
|
||||
|
||||
use crate::automata::nfa::NFA;
|
||||
use crate::automata::pattern::Pattern;
|
||||
use crate::group::rule::Rule;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
pub mod rule;
|
||||
|
||||
// ===========
|
||||
// == Group ==
|
||||
// ===========
|
||||
|
||||
/// Struct that groups rules together. It also inherits rules from its parent
/// group (if it has one). Groups are the basic building block of flexer:
/// Flexer internally keeps a stack of groups, only one of them active at a
/// time. Each group contains a set of regex patterns and callbacks (together
/// called `Rule`). Whenever a rule.pattern from the active group is matched
/// with part of the input, the associated rule.callback is executed, which in
/// turn may exit the current group or enter a new one. This allows us to
/// nicely model a situation where a certain part of a program (like a string
/// literal) should have very different parsing rules than another (for
/// example the body of a function). Note that the input is first matched with
/// the first added rule, then with the second etc. Therefore, if two rules
/// overlap, only the callback of the first added rule will be executed.
#[derive(Clone, Debug, Default)]
pub struct Group {
    /// Unique ID.
    pub id:usize,
    /// Custom name which is used for debugging.
    pub name:String,
    /// Parent which we inherit rules from.
    pub parent:Option<Box<Group>>,
    /// Set of regex patterns with associated callbacks.
    pub rules:Vec<Rule>,
}
|
||||
|
||||
impl Group {
|
||||
/// Adds new rule (regex pattern with associated callback) to group.
|
||||
pub fn add_rule(&mut self, rule:Rule) { self.rules.push(rule) }
|
||||
|
||||
/// Returns rule builder for given pattern.
|
||||
/// TODO[jv] better describe it's purpose once we agree on correct API.
|
||||
pub fn rule(
|
||||
&mut self,
|
||||
pattern:Pattern,
|
||||
) -> rule::Builder<impl FnMut(Rule) + '_> {
|
||||
rule::Builder {
|
||||
pattern,
|
||||
callback:move |rule| self.add_rule(rule),
|
||||
}
|
||||
}
|
||||
|
||||
/// All rules including parent rules.
|
||||
pub fn rules(&self) -> Vec<&Rule> {
|
||||
let mut parent = &self.parent;
|
||||
let mut rules = (&self.rules).iter().collect_vec();
|
||||
while let Some(state) = parent {
|
||||
rules.extend((&state.rules).iter());
|
||||
parent = &state.parent;
|
||||
}
|
||||
rules
|
||||
}
|
||||
|
||||
/// Canonical name of given rule.
|
||||
fn callback_name(&self, rule_ix:usize) -> String {
|
||||
format!("group{}_rule{}", self.id, rule_ix)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Group> for NFA {
|
||||
/// Transforms Group to NFA.
|
||||
/// Algorithm is based on: https://www.youtube.com/watch?v=RYNN-tb9WxI
|
||||
fn from(group:&Group) -> Self {
|
||||
let mut nfa = NFA::default();
|
||||
let start = nfa.new_state();
|
||||
let build = |rule:&Rule| nfa.new_pattern(start, &rule.pattern);
|
||||
let states = group.rules().into_iter().map(build).collect_vec();
|
||||
let end = nfa.new_state();
|
||||
for (ix, state) in states.into_iter().enumerate() {
|
||||
nfa.states[state.id].name = Some(group.callback_name(ix));
|
||||
nfa.connect(state, end);
|
||||
}
|
||||
nfa
|
||||
}
|
||||
}
|
||||
|
||||
// =============
|
||||
// === Tests ===
|
||||
// =============
|
||||
|
||||
#[cfg(test)]
pub mod tests {
    extern crate test;

    use crate::automata::dfa::DFA;
    use crate::automata::nfa;
    use crate::automata::nfa::NFA;
    use crate::automata::pattern::Pattern;
    use crate::group::rule::Rule;
    use crate::group::Group;

    use std::default::Default;
    use test::Bencher;

    /// Builds a group holding one rule (with an empty callback) per given
    /// pattern, added in the order given.
    fn group_of(patterns:Vec<Pattern>) -> Group {
        let mut group = Group::default();
        for pattern in patterns {
            group.add_rule(Rule {
                pattern,
                callback:"".into(),
            });
        }
        group
    }

    fn newline() -> Group { group_of(vec![Pattern::char('\n')]) }

    fn letter() -> Group { group_of(vec![Pattern::range('a'..='z')]) }

    fn spaces() -> Group { group_of(vec![Pattern::char(' ').many1()]) }

    fn letter_and_spaces() -> Group {
        let letter = Pattern::range('a'..='z');
        let spaces = Pattern::char(' ').many1();
        group_of(vec![letter, spaces])
    }

    fn hundred_rules() -> Group {
        let pattern =
            Pattern::all("The quick brown fox jumps over the lazy dog!!");
        group_of(vec![pattern; 100])
    }

    #[test]
    fn test_to_nfa_newline() {
        let converted = NFA::from(&newline());
        assert_eq!(converted, nfa::tests::newline());
    }

    #[test]
    fn test_to_nfa_letter() {
        let converted = NFA::from(&letter());
        assert_eq!(converted, nfa::tests::letter());
    }

    #[test]
    fn test_to_nfa_spaces() {
        let converted = NFA::from(&spaces());
        assert_eq!(converted, nfa::tests::spaces());
    }

    #[test]
    fn test_to_nfa_letter_and_spaces() {
        let converted = NFA::from(&letter_and_spaces());
        assert_eq!(converted, nfa::tests::letter_and_spaces());
    }

    #[bench]
    fn bench_to_nfa_newline(bencher:&mut Bencher) {
        bencher.iter(|| {
            let group = newline();
            NFA::from(&group)
        })
    }

    #[bench]
    fn bench_to_nfa_letter(bencher:&mut Bencher) {
        bencher.iter(|| {
            let group = letter();
            NFA::from(&group)
        })
    }

    #[bench]
    fn bench_to_nfa_spaces(bencher:&mut Bencher) {
        bencher.iter(|| {
            let group = spaces();
            NFA::from(&group)
        })
    }

    #[bench]
    fn bench_to_nfa_letter_and_spaces(bencher:&mut Bencher) {
        bencher.iter(|| {
            let group = letter_and_spaces();
            NFA::from(&group)
        })
    }

    #[bench]
    fn bench_hundred_rules(bencher:&mut Bencher) {
        bencher.iter(|| {
            let group = hundred_rules();
            DFA::from(&NFA::from(&group))
        });
    }
}
|
39
parser/flexer/src/group/rule.rs
Normal file
39
parser/flexer/src/group/rule.rs
Normal file
@ -0,0 +1,39 @@
|
||||
//! An API for declaring Rust callbacks for encountered regex patterns.
|
||||
//!
|
||||
use crate::automata::pattern::Pattern;
|
||||
|
||||
// ==========
|
||||
// == Rule ==
|
||||
// ==========
|
||||
|
||||
/// A rule is a pair of regex pattern and callback.
/// The intention is to run the callback after encountering given pattern.
#[derive(Clone, Debug)]
pub struct Rule {
    /// Pattern that triggers the callback.
    pub pattern:Pattern,
    /// Callback containing stringified Rust code.
    /// (The code generator emits it verbatim into the generated lexer.)
    pub callback:String,
}

/// Builder that allows us to add `Rule` to `Group` in a nice way.
/// It is possible this structure won't be useful in rust, since borrow checker
/// will likely influence the final API of rule construction.
#[derive(Clone, Debug)]
pub struct Builder<Callback> {
    /// Pattern that triggers the callback.
    pub pattern:Pattern,
    /// Callback containing a closure.
    /// Typically `FnMut(Rule)` — see the `run` method below.
    pub callback:Callback,
}
|
||||
|
||||
impl<F:FnMut(Rule)> Builder<F> {
|
||||
/// Feeds the input that triggered regex pattern to callback.
|
||||
pub fn run(&mut self, program:String) {
|
||||
let rule = Rule {
|
||||
pattern:self.pattern.clone(),
|
||||
callback:program,
|
||||
};
|
||||
(self.callback)(rule);
|
||||
}
|
||||
}
|
17
parser/flexer/src/lib.rs
Normal file
17
parser/flexer/src/lib.rs
Normal file
@ -0,0 +1,17 @@
|
||||
#![feature(test)]
|
||||
#![deny(unconditional_recursion)]
|
||||
#![warn(missing_copy_implementations)]
|
||||
#![warn(missing_debug_implementations)]
|
||||
#![warn(missing_docs)]
|
||||
#![warn(trivial_casts)]
|
||||
#![warn(trivial_numeric_casts)]
|
||||
#![warn(unsafe_code)]
|
||||
#![warn(unused_import_braces)]
|
||||
|
||||
//! This module exports simple parser based on Deterministic Finite State
|
||||
//! Automata for regular grammars (anything parsable with regex patterns).
|
||||
|
||||
pub mod automata;
|
||||
pub mod data;
|
||||
pub mod group;
|
||||
pub mod parser;
|
15
parser/flexer/src/parser.rs
Normal file
15
parser/flexer/src/parser.rs
Normal file
@ -0,0 +1,15 @@
|
||||
//! The entry point of flexer. It (is going to) contain API for parsing an input
|
||||
//! string based on group of regex patterns.
|
||||
|
||||
use crate::automata::state::Symbol;
|
||||
|
||||
// ============
|
||||
// == Parser ==
|
||||
// ============
|
||||
|
||||
/// End Of File - This symbol is inserted at the end of each parser input.
/// We can use the maximum value of u32, because no `char` (unicode scalar) can
/// hold this value (unicode scalar values are at most 0x10FFFF).
pub const EOF_CODE:Symbol = Symbol {
    val:u32::max_value(),
};
|
Loading…
Reference in New Issue
Block a user