From 43f7b838f97a78b77000716e1f8107cb6cbb6088 Mon Sep 17 00:00:00 2001 From: Ara Adkins Date: Thu, 13 Aug 2020 13:23:01 +0100 Subject: [PATCH] Implement code generation for the flexer (#1058) --- .gitignore | 2 +- Cargo.toml | 37 +- docs/parser/flexer.md | 309 ++--- lib/rust/enso-logger/Cargo.toml | 2 +- lib/rust/enso-logger/src/enabled.rs | 5 +- lib/rust/enso-prelude/src/vec.rs | 5 +- lib/rust/flexer-testing/definition/Cargo.toml | 15 + lib/rust/flexer-testing/definition/src/lib.rs | 259 ++++ lib/rust/flexer-testing/generation/Cargo.toml | 20 + lib/rust/flexer-testing/generation/build.rs | 32 + .../generation/src/generated.rs | 3 + lib/rust/flexer-testing/generation/src/lib.rs | 19 + .../generation/tests/test_generated_lexer.rs | 110 ++ lib/rust/flexer/Cargo.toml | 10 +- lib/rust/flexer/src/automata/alphabet.rs | 55 +- lib/rust/flexer/src/automata/dfa.rs | 61 +- lib/rust/flexer/src/automata/nfa.rs | 44 +- lib/rust/flexer/src/automata/pattern.rs | 21 +- .../flexer/src/automata/pattern/macros.rs | 28 + lib/rust/flexer/src/automata/state.rs | 24 +- lib/rust/flexer/src/automata/symbol.rs | 21 +- lib/rust/flexer/src/data/matrix.rs | 25 +- lib/rust/flexer/src/generate.rs | 528 ++++++++ lib/rust/flexer/src/group.rs | 126 +- lib/rust/flexer/src/group/rule.rs | 4 +- lib/rust/flexer/src/lib.rs | 1187 ++++++++++++++++- .../flexer/tests/flexer_generated_api_test.rs | 522 -------- .../tests/flexer_lexer_definition_test.rs | 188 --- .../flexer/tests/test_invalid_definitions.rs | 414 ++++++ lib/rust/lazy-reader/Cargo.toml | 3 +- lib/rust/lazy-reader/src/lib.rs | 363 +++-- lib/rust/lexer/definition/Cargo.toml | 24 - lib/rust/lexer/definition/src/lexer.rs | 22 - lib/rust/lexer/definition/src/lib.rs | 13 - lib/rust/lexer/generation/Cargo.toml | 29 - lib/rust/lexer/generation/build.rs | 20 - lib/rust/lexer/generation/src/lexer.rs | 3 - lib/rust/lexer/generation/src/lib.rs | 9 - lib/rust/lexer/tests/Cargo.toml | 20 - lib/rust/lexer/tests/src/main.rs | 33 - .../src/main/scala/org/enso/flexer/Spec.scala | 2 +- .../syntax/text/spec/ParserDefSmall.scala | 132 -- .../enso/syntax/text/spec/__Parser__.scala | 553 -------- 43 files changed, 3178 insertions(+), 2124 deletions(-) create mode 100644 lib/rust/flexer-testing/definition/Cargo.toml create mode 100644 lib/rust/flexer-testing/definition/src/lib.rs create mode 100644 lib/rust/flexer-testing/generation/Cargo.toml create mode 100644 lib/rust/flexer-testing/generation/build.rs create mode 100644 lib/rust/flexer-testing/generation/src/generated.rs create mode 100644 lib/rust/flexer-testing/generation/src/lib.rs create mode 100644 lib/rust/flexer-testing/generation/tests/test_generated_lexer.rs create mode 100644 lib/rust/flexer/src/automata/pattern/macros.rs create mode 100644 lib/rust/flexer/src/generate.rs delete mode 100644 lib/rust/flexer/tests/flexer_generated_api_test.rs delete mode 100644 lib/rust/flexer/tests/flexer_lexer_definition_test.rs create mode 100644 lib/rust/flexer/tests/test_invalid_definitions.rs delete mode 100644 lib/rust/lexer/definition/Cargo.toml delete mode 100644 lib/rust/lexer/definition/src/lexer.rs delete mode 100644 lib/rust/lexer/definition/src/lib.rs delete mode 100644 lib/rust/lexer/generation/Cargo.toml delete mode 100644 lib/rust/lexer/generation/build.rs delete mode 100644 lib/rust/lexer/generation/src/lexer.rs delete mode 100644 lib/rust/lexer/generation/src/lib.rs delete mode 100644 lib/rust/lexer/tests/Cargo.toml delete mode 100644 lib/rust/lexer/tests/src/main.rs delete mode 100644 
lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDefSmall.scala delete mode 100644 lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/__Parser__.scala diff --git a/.gitignore b/.gitignore index d18c15851e8..d5af837f5a4 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,7 @@ target/ Cargo.lock **/*.rs.bk wasm-pack.log -lib/rust/lexer/generation/src/lexer-engine.rs +generated/ ############# ## Haskell ## diff --git a/Cargo.toml b/Cargo.toml index 638bd1c0787..feb84d1f200 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ + "lib/rust/ast", "lib/rust/enso-data", "lib/rust/enso-generics", "lib/rust/enso-logger", @@ -10,30 +11,32 @@ members = [ "lib/rust/enso-shapely/impl", "lib/rust/enso-shapely/macros", "lib/rust/flexer", - "lib/rust/lexer/definition", - "lib/rust/lexer/generation", - "lib/rust/lexer/tests", + "lib/rust/flexer-testing/definition", + "lib/rust/flexer-testing/generation", "lib/rust/lazy-reader", - "lib/rust/ast", ] [profile.dev] -opt-level = 0 -lto = false -debug = true +opt-level = 0 +lto = false +debug = true +debug-assertions = true [profile.release] -opt-level = 3 -lto = true -debug = false -panic = 'abort' +opt-level = 3 +lto = true +debug = false +panic = 'abort' +debug-assertions = false [profile.bench] -opt-level = 3 -lto = true -debug = false +opt-level = 3 +lto = true +debug = false +debug-assertions = false [profile.test] -opt-level = 0 -lto = false -debug = true +opt-level = 0 +lto = false +debug = true +debug-assertions = true diff --git a/docs/parser/flexer.md b/docs/parser/flexer.md index d8953eaff89..3d400fb5104 100644 --- a/docs/parser/flexer.md +++ b/docs/parser/flexer.md @@ -8,93 +8,132 @@ order: 3 # Flexer -The flexer is a finite-automata-based engine for generating lexers. Akin to -`flex` and other lexer generators, it is given a definition as a series of rules -from which it then generates code for a highly-optimised lexer. +The flexer is a finite-automata-based engine for the definition and generation +of lexers. Akin to `flex`, and other lexer generators, the user may use it to +define a series of rules for lexing their language, which are then used by the +flexer to generate a highly-efficient lexer implementation. + +Where the flexer differs from other programs in this space, however, is the +power that it gives users. When matching a rule, the flexer allows its users to +execute _arbitrary_ Rust code, which may even manipulate the lexer's state and +position. This means that the languages that can be lexed by the flexer extend +from the simplest regular grammars right up to unrestricted grammars (but please +don't write a programming language whose syntax falls into this category). It +also differs in that it chooses the first complete match for a rule, rather than +the longest one, which makes lexers much easier to define and maintain. + +For detailed library documentation, please see the +[crate documentation](../../lib/rust/flexer/src/lib.rs) itself. This includes a +comprehensive tutorial on how to define a lexer using the flexer. 
-- [Pattern Description](#pattern-description)
-- [State Management](#state-management)
+- [The Lexing Process](#the-lexing-process)
+- [Lexing Rules](#lexing-rules)
+  - [Groups](#groups)
+  - [Patterns](#patterns)
+  - [Transition Functions](#transition-functions)
- [Code Generation](#code-generation)
  - [Automated Code Generation](#automated-code-generation)
-  - [Notes on Code Generation](#notes-on-code-generation)
  - [Structuring the Flexer Code](#structuring-the-flexer-code)
-  - [Supporting the Definition of Lexers](#supporting-the-definition-of-lexers)
  - [Supporting Code Generation](#supporting-code-generation)
-- [An Example](#an-example)

-## Pattern Description
+## The Lexing Process

-The definition of a lexer using the flexer library consists of a set of rules
-for how to behave when matching portions of syntax. These rules behave as
-follows:
+In the flexer, the lexing process proceeds from the top to the bottom of the
+user-defined rules, and selects the first expression that _matches fully_. Once
+a pattern has been matched against the input, the associated code is executed
+and the process starts again until the input stream has been consumed.

-- A rule describes a regex-like pattern.
-- It also describes the code to be executed when the pattern is matched.
+This point about _matching fully_ is particularly important to keep in mind, as
+it differs from other lexer generators that tend to prefer the _longest_ match
+instead.
+
+## Lexing Rules
+
+A lexing rule for the flexer is a combination of three things:
+
+1. A group.
+2. A pattern.
+3. A transition function.
+
+An example of defining a rule is as follows:

```rust
-pub fn lexer_definition() -> String {
-    // ...
-
-    let chr = alphaNum | '_';
-    let blank = Pattern::from('_')
-
-    lexer.rule(lexer.root,blank,"self.on_ident(token::blank(self.start_location))");
-
-    // ...
+fn define() -> Self {
+    let mut lexer = TestLexer::new();
+    let a_word = Pattern::char('a').many1();
+    let root_group_id = lexer.initial_state;
+    let root_group = lexer.groups_mut().group_mut(root_group_id);
+    // Here is the rule definition.
+    root_group.create_rule(&a_word,"self.on_first_word(reader)");
+    lexer
}
```

-A pattern, such as `chr`, or `blank` is a description of the characters that
-should be matched for that pattern to match. The flexer library provides a set
-of basic matchers for doing this.
+### Groups

-A `lexer.rule(...)` definition consists of the following parts:
+A group is a mechanism that the flexer provides for grouping related rules
+together. The flexer has a concept of a "state stack", which records the
+currently active state, and which can be manipulated by the user-defined
+[transition functions](#transition-functions).

-- A state, used for grouping rules and named for debugging (see the section on
-  [state management](#state-management) below).
-- A pattern, as described above.
-- The code that is executed when the pattern matches.
+A state can be made active by using `flexer::push_state(state)`, and can be
+deactivated by using `flexer::pop_state(state)` or
+`flexer::pop_states_until(state)`. In addition, states may also have _parents_,
+from which they can inherit rules. This is fantastic for removing the need to
+repeat yourself when defining the lexer.
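
As a sketch, this is what state manipulation looks like inside a transition
function. It is taken from the test lexer used to exercise the code generator
elsewhere in this patch; `seen_first_word_state` is a group identifier stored in
that lexer's state:

```rust
pub fn on_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
    let str = self.current_match.clone();
    let ast = Token::Word(str);
    self.output.push(ast);
    // Activate the group that handles input after the first word.
    let id = self.seen_first_word_state;
    self.push_state(id);
}
```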
-## State Management
+### Patterns

-States in the flexer engine provide a mechanism for grouping sets of rules
-together known as `State`. At any given time, only rules from the _active_ state
-are considered by the lexer.
+Rules are defined to match _patterns_. Patterns are regular-grammar-like
+descriptions of the textual content (as characters) that should be matched. For
+a description of the various patterns provided by the flexer, see
+[pattern.rs](../../lib/rust/flexer/src/automata/pattern.rs).

-- States are named for purposes of debugging.
-- You can activate another state from within the flexer instance by using
-  `state.push(new_state)`.
-- You can deactivate the topmost state by using `state.pop()`.
+When a pattern is matched, the associated
+[transition function](#transition-functions) is executed.
+
+### Transition Functions
+
+The transition function is a piece of arbitrary Rust code that is executed when
+the pattern for a given rule is matched by the flexer. This code may perform
+arbitrary manipulations of the lexer state, and is where the majority of the
+power of the flexer stems from.

## Code Generation

-The patterns in a lexer definition are used to generate a highly-efficient and
-specialised lexer. This translation process works as follows:
+While it would be possible to interpret the flexer definition directly at
+runtime, this would involve far too much dynamism and non-cache-local lookup
+to be at all fast.

-1. All rules are taken and used to generate an NFA.
-2. A DFA is generated from the NFA using the standard
-   [subset construction](https://en.wikipedia.org/wiki/Powerset_construction)
-   algorithm, but with some additional optimisations that ensure the following
-   properties hold:
-   - Patterns are matched in the order that they are defined.
-   - The associated code chunks are maintained properly.
-   - Lexing is `O(n)`, where `n` is the size of the input.
-3. The DFA is used to generate the code for a lexer `Engine` struct, containing
-   the `Lexer` definition.
+Instead, the flexer includes
+[`generate.rs`](../../lib/rust/flexer/src/generate.rs), a library for generating
+highly-specialized lexer implementations based on the definition provided by the
+user. The transformation that it implements operates as follows for each group
+of rules.

-The `Engine` generated through this process contains a main loop that consumes
-the input stream character-by-character, evaluating a big switch generated from
-the DFA using functions from the `Lexer`.
+1. The set of rules in a group is used to generate a
+   [Nondeterministic Finite Automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton)
+   (NFA).
+2. The NFA is transformed into a
+   [Deterministic Finite Automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton)
+   (DFA), using a variant of the standard
+   [powerset construction](https://en.wikipedia.org/wiki/Powerset_construction)
+   algorithm. This variant has been modified to ensure that the following
+   additional properties hold:
+   - Patterns are matched in the order in which they are defined.
+   - The associated transition functions are maintained correctly through the
+     transformation.
+   - The lexing process is `O(n)`, where `n` is the size of the input.
+3. The DFA is then used to generate the Rust code that implements the lexer.

-Lexing proceeds from top-to-bottom of the rules, and the first expression that
-_matches fully_ is chosen. This differs from other common lexer generators, as
-they mostly choose the _longest_ match instead. Once the pattern is matched, the
-associated code is executed and the process starts over again until the input
-stream has been consumed.
+The generated lexer contains a main loop that consumes the input stream
+character-by-character, evaluating what is effectively a big `match` expression
+that processes the input to evaluate the user-provided transition functions as
+appropriate.
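
As a sketch, the top level of that dispatch has roughly the following shape.
This is an illustration only, assuming a lexer with two groups; the
`dispatch_in_state_*` functions are themselves generated, one per group, from
that group's DFA:

```rust
fn step<R:LazyReader>(&mut self, next_state:SubStateId, reader:&mut R) -> StageStatus {
    // Dispatch on the identifier of the currently active group.
    let current_state:usize = self.current_state().into();
    match current_state {
        0 => self.dispatch_in_state_0(next_state,reader),
        1 => self.dispatch_in_state_1(next_state,reader),
        _ => unreachable_panic!("Unreachable state reached in lexer."),
    }
}
```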
### Automated Code Generation

@@ -103,63 +142,46 @@ implementation (the generated engine), it is necessary to create a separate
crate for the generated engine that has the lexer definition as one of its
dependencies.

-This separation enables a call to `flexer.generate_specialized_code()` in
+This separation enables a call to `flexer::State::specialize()` in the crate's
`build.rs` (or a macro) during compilation. The output can be stored in a new
-file i.e. `lexer-engine.rs` and exported from the library with
-`include!("lexer-engine.rs")`. The project structure therefore appears as
-follows:
+file, e.g. `engine.rs`, and exported from the library as needed. The project
+structure would therefore appear as follows.

```
- lib/rust/lexer/
  - definition/
    - src/
-     - lexer.rs
+     - lib.rs
   - cargo.toml
 - generation/
   - src/
-     - lexer.rs <-- include!("lexer-engine.rs")
-     - build.rs <-- calls `lexer_definition::lexer_source_code()`
--      and saves its output to `src/lexer-engine.rs`
+     - engine.rs <-- the generated file
+     - lib.rs <-- `pub mod engine`
+     - build.rs <-- calls `flexer::State::specialize()` and saves its output to
+       `src/engine.rs`
   - cargo.toml <-- lexer-definition is in dependencies and build-dependencies
```

With this design, `flexer::State::specialize()` is going to be executed
on each rebuild of `lexer/generation`. Therefore, `generation` should contain
-only the minimum amount of logic (i.e. tests should be in separate crate) and
-its dependencies should optimally involve only such code which directly
-influences the content of generated code (in order to minimize the unnecessary
-calls to expensive flexer specialization).
-
-### Notes on Code Generation
-
-The following properties are likely to hold for the code generation machinery.
-
-- The vast majority of the code generated by the flexer is going to be the same
-  for all lexers.
-- The primary generation is in `consume_next_character`, which takes a `Lexer`
-  as an argument.
+only the minimum amount of logic, and should endeavor to minimize any
+unnecessary dependencies to avoid recompiling too often.
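
For reference, a minimal `build.rs` along these lines, condensed from the test
generation crate added by this patch (`flexer-testing/generation/build.rs`),
looks as follows:

```rust
use std::fs::File;
use std::io::prelude::*;
use flexer_test_definition::TestLexer;
use flexer::Definition;
use flexer::State;

fn main() -> std::io::Result<()> {
    // Define the lexer, and generate its specialized engine as a string of
    // Rust source code.
    let lexer  = TestLexer::define();
    let engine = lexer.specialize().unwrap();
    // Save the generated code so that the library can export it.
    let mut file = File::create("src/generated/engine.rs")?;
    file.write_all(engine.as_bytes())
}
```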
## Structuring the Flexer Code

In order to unify the API between the definition and generated usages of the
flexer, the API is separated into the following components:

-- **Flexer:** The main flexer definition itself, providing functionality common
-  to the definition and implementation of all lexers.
-- **FlexerState:** The stateful components of a lexer definition. This trait is
+- `Flexer`: The main flexer definition itself, providing functionality common to
+  the definition and implementation of all lexers.
+- `flexer::State`: The stateful components of a lexer definition. This trait is
  implemented for a particular lexer definition, allowing the user to store
  arbitrary data in their lexer, as needed.
- **User-Defined Lexer:** The user can then define a lexer that _wraps_ the
-  flexer, specialised to the particular `FlexerState` that the user has defined.
-  It is recommended to implement `Deref` and `DerefMut` between the defined
-  lexer and the `Flexer`, to allow for ease of use.
-
-### Supporting the Definition of Lexers
-
-> The actionables for this section are:
->
-> - Fill it in as the generation solidifies.
+  flexer, specialised to the particular `flexer::State` that the user has
+  defined. It is recommended to implement `Deref` and `DerefMut` between the
+  defined lexer and the `Flexer`, to allow for ease of use.

### Supporting Code Generation

@@ -170,106 +192,3 @@ _define_ the lexer, and be used by the generated code from that definition.

For an example of how these components are used in the generated lexer, please
see [`generated_api_test`](../../lib/rust/flexer/tests/generated_api_test.rs).
-
-## An Example
-
-The following code provides a sketchy example of the intended API for the flexer
-code generation using the definition of a simple lexer.
-
-```rust
-use crate::prelude::*;
-
-use flexer;
-use flexer::Flexer;
-
-
-
-// =============
-// === Token ===
-// =============
-
-pub struct Token {
-    location : flexer::Location,
-    ast      : TokenAst,
-}
-
-enum TokenAst {
-    Var(ImString),
-    Cons(ImString),
-    Blank,
-    ...
-}
-
-impl Token {
-    pub fn new(location:Location, ast:TokenAst) -> Self {
-        Self {location,ast}
-    }
-
-    pub fn var(location:Location, name:impl Into) -> Self {
-        let ast = TokenAst::Var(name.into());
-        Self::new(location,ast)
-    }
-
-    ...
-}
-
-
-
-// =============
-// === Lexer ===
-// =============
-
-#[derive(Debug,Default)]
-struct Lexer {
-    current : Option,
-    tokens  : Vec,
-    state   : T
-}
-
-impl Lexer {
-    fn on_ident(&mut self, tok:Token) {
-        self.current = Some(tok);
-        self.state.push(self.ident_sfx_check);
-    }
-
-    fn on_ident_err_sfx(&mut self) {
-        println!("OH NO!")
-    }
-
-    fn on_no_ident_err_sfx(&mut self) {
-        let current = std::mem::take(&mut self.current).unwrap();
-        self.tokens.push_back(current);
-    }
-}
-
-impl Flexer::Definition Lexer {
-    fn state     (&    self) -> &    flexer::State { &    self.state }
-    fn state_mut (&mut self) -> &mut flexer::State { &mut self.state }
-}
-
-pub fn lexer_source_code() -> String {
-    let lexer = Flexer::>::new();
-
-    let chr     = alphaNum | '_';
-    let blank   = Pattern::from('_');
-    let body    = chr.many >> '\''.many();
-    let var     = lowerLetter >> body;
-    let cons    = upperLetter >> body;
-    let breaker = "^`!@#$%^&*()-=+[]{}|;:<>,./ \t\r\n\\";
-
-    let sfx_check = lexer.add(State("Identifier Suffix Check"));
-
-    lexer.rule(lexer.root,var,"self.on_ident(Token::var(self.start_location,self.current_match()))");
-    lexer.rule(lexer.root,cons,"self.on_ident(token::cons(self.start_location,self.current_match()))");
-    lexer.rule(lexer.root,blank,"self.on_ident(token::blank(self.start_location))");
-    lexer.rule(sfx_check,err_sfx,"self.on_ident_err_sfx()");
-    lexer.rule(sfx_check,Flexer::always,"self.on_no_ident_err_sfx()");
-    ...
-    lexer.generate_specialized_code() // This code needs to become a source file, probably via build.rs
-}
-```
-
-Some things to note:
-
-- The function definitions in `Lexer` take `self` as their first argument
-  because `Engine` implements `Deref` and `DerefMut` to `Lexer`.
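
Under the new API, the equivalent of the removed example's wrapping pattern is
sketched below; it mirrors the test lexer added elsewhere in this patch, where
`TestState`, `TokenStream`, and `Logger` are that lexer's state, output, and
logger types:

```rust
/// The user-defined lexer, wrapping the flexer.
pub struct TestLexer {
    lexer:Flexer<TestState,TokenStream,Logger>
}

impl Deref for TestLexer {
    type Target = Flexer<TestState,TokenStream,Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}

impl DerefMut for TestLexer {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.lexer
    }
}
```

With this in place, transition functions written on the user-defined lexer can
call the flexer's functionality (e.g. `self.push_state(...)`) directly through
the `Deref` implementations.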
diff --git a/lib/rust/enso-logger/Cargo.toml b/lib/rust/enso-logger/Cargo.toml index c2b00ce7c68..0b657fed391 100644 --- a/lib/rust/enso-logger/Cargo.toml +++ b/lib/rust/enso-logger/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "logger" +name = "enso-logger" version = "0.1.0" authors = ["Enso Team "] edition = "2018" diff --git a/lib/rust/enso-logger/src/enabled.rs b/lib/rust/enso-logger/src/enabled.rs index b082fc6073f..cb85067ecb4 100644 --- a/lib/rust/enso-logger/src/enabled.rs +++ b/lib/rust/enso-logger/src/enabled.rs @@ -59,14 +59,15 @@ impl AnyLogger for Logger { let indent = default(); Self {path,indent} } - fn path (&self) -> &str { &self.path } + + fn path (&self) -> &str { &self.path } fn trace (&self, msg:impl Message) { println!("{}",self.format(msg)) } fn debug (&self, msg:impl Message) { println!("{}",self.format(msg)) } fn info (&self, msg:impl Message) { println!("{}",self.format(msg)) } fn warning (&self, msg:impl Message) { println!("[WARNING] {}",self.format(msg)) } fn error (&self, msg:impl Message) { println!("[ERROR] {}",self.format(msg)) } fn group_begin (&self, msg:impl Message) { println!("{}",self.format(msg)); self.inc_indent() } - fn group_end (&self) { self.dec_indent() } + fn group_end (&self) { self.dec_indent() } } diff --git a/lib/rust/enso-prelude/src/vec.rs b/lib/rust/enso-prelude/src/vec.rs index d8757308014..717c808deec 100644 --- a/lib/rust/enso-prelude/src/vec.rs +++ b/lib/rust/enso-prelude/src/vec.rs @@ -1,12 +1,13 @@ //! This module defines utilities for working with the [`std::vec::Vec`] type. +use failure::_core::hint::unreachable_unchecked; + + // ============== // === VecOps === // ============== -use failure::_core::hint::unreachable_unchecked; - pub trait VecOps { type Item; diff --git a/lib/rust/flexer-testing/definition/Cargo.toml b/lib/rust/flexer-testing/definition/Cargo.toml new file mode 100644 index 00000000000..d8140aac9a1 --- /dev/null +++ b/lib/rust/flexer-testing/definition/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "flexer-test-definition" +version = "0.1.0" +authors = ["Enso Team "] +edition = "2018" + +publish = false + +[lib] +crate-type = ["cdylib", "rlib"] +test = true +bench = true + +[dependencies] +flexer = { path = "../../flexer", version = "0.1.0" } diff --git a/lib/rust/flexer-testing/definition/src/lib.rs b/lib/rust/flexer-testing/definition/src/lib.rs new file mode 100644 index 00000000000..d5dcbde041b --- /dev/null +++ b/lib/rust/flexer-testing/definition/src/lib.rs @@ -0,0 +1,259 @@ +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] +#![allow(unused_imports)] +#![allow(clippy::all)] + +//! This file contains the code defining a lexer for the following small language. Due to the way in +//! which the code-generation from the flexer is used, it has to be defined in a separate crate from +//! the site at which it's used. For the actual tests of this code, please see +//! `flexer-testing/generation`. +//! +//! The language here is being defined as follows: +//! +//! a-word = 'a'+; +//! b-word = 'b'+; +//! word = a-word | b-word; +//! space = ' '; +//! spaced-word = space, word; +//! language = word, spaced-word*; +//! +//! Please note that there is a fair amount of duplicated code between this test and the +//! `lexer_generated_api_test` file. This is to present the full view of what each portion of the +//! 
process looks like.
+
+use flexer::prelude::*;
+
+use flexer::*;
+use flexer;
+use flexer::automata::pattern::Pattern;
+use flexer::group::Registry;
+use flexer::prelude::logger::Disabled;
+use flexer::prelude::reader::BookmarkManager;
+use flexer::prelude::reader::decoder::DecoderUTF8;
+
+
+
+// ====================
+// === Type Aliases ===
+// ====================
+
+type Logger = Disabled;
+
+
+
+// ===========
+// === AST ===
+// ===========
+
+/// A very simple AST, sufficient for the simple language being defined.
+#[derive(Clone,Debug,PartialEq)]
+pub enum Token {
+    /// A word from the input, consisting of a sequence of all `a` or all `b`.
+    Word(String),
+    /// A token that the lexer is unable to recognise.
+    Unrecognized(String),
+}
+impl Token {
+    /// Construct a new word token.
+    pub fn word(name:impl Into<String>) -> Token {
+        Token::Word(name.into())
+    }
+
+    /// Construct a new unrecognized token.
+    pub fn unrecognized(name:impl Into<String>) -> Token {
+        Token::Unrecognized(name.into())
+    }
+}
+
+/// A representation of a stream of tokens.
+#[allow(missing_docs)]
+#[derive(Clone,Debug,Default,PartialEq)]
+pub struct TokenStream {
+    tokens:Vec<Token>
+}
+
+impl TokenStream {
+    /// Append the provided token to the token stream.
+    pub fn push(&mut self,token:Token) {
+        self.tokens.push(token);
+    }
+}
+
+
+// === Trait Impls ===
+
+impl From<Vec<Token>> for TokenStream {
+    fn from(tokens: Vec<Token>) -> Self {
+        TokenStream {tokens}
+    }
+}
+
+
+
+// ==================
+// === Test Lexer ===
+// ==================
+
+/// The definition of a test lexer for the above-described language.
+#[derive(Debug)]
+pub struct TestLexer {
+    lexer:Flexer<TestState,TokenStream,Logger>
+}
+
+impl Deref for TestLexer {
+    type Target = Flexer<TestState,TokenStream,Logger>;
+    fn deref(&self) -> &Self::Target {
+        &self.lexer
+    }
+}
+
+impl DerefMut for TestLexer {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.lexer
+    }
+}
+
+impl TestLexer {
+    /// Creates a new instance of this lexer.
+    pub fn new() -> Self {
+        let logger = Logger::new("TestLexer");
+        let lexer = Flexer::new(logger);
+        TestLexer{lexer}
+    }
+}
+
+/// Implementations of functionality used by the lexer.
+///
+/// These functions are provided by the user, by hand, and must all take a reader.
+#[allow(missing_docs)]
+impl TestLexer {
+    pub fn on_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
+        let str = self.current_match.clone();
+        let ast = Token::Word(str);
+        self.output.push(ast);
+        let id = self.seen_first_word_state;
+        self.push_state(id);
+    }
+
+    pub fn on_spaced_word<R:LazyReader>(&mut self, _reader:&mut R) {
+        let str = self.current_match.clone();
+        let ast = Token::Word(String::from(str.trim()));
+        self.output.push(ast);
+    }
+
+    pub fn on_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
+        let ast = Token::Unrecognized(self.current_match.clone());
+        self.output.push(ast);
+    }
+
+    pub fn on_err_suffix<R:LazyReader>(&mut self, reader:&mut R) {
+        self.on_err_suffix_first_word(reader);
+        self.pop_state();
+    }
+
+    pub fn on_no_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) {}
+
+    pub fn on_no_err_suffix<R:LazyReader>(&mut self, reader:&mut R) {
+        self.on_no_err_suffix_first_word(reader);
+        self.pop_state();
+    }
+}
+
+
+// === Trait Impls ===
+
+impl flexer::Definition for TestLexer {
+    fn define() -> Self {
+        let mut lexer = TestLexer::new();
+
+        let a_word = Pattern::char('a').many1();
+        let b_word = Pattern::char('b').many1();
+        let space = Pattern::char(' ');
+        let spaced_a_word = &space >> &a_word;
+        let spaced_b_word = &space >> &b_word;
+        let any = Pattern::any();
+        let end = Pattern::eof();
+
+        let root_group_id = lexer.initial_state;
+        let root_group = lexer.groups_mut().group_mut(root_group_id);
+        root_group.create_rule(&a_word,"self.on_first_word(reader)");
+        root_group.create_rule(&b_word,"self.on_first_word(reader)");
+        root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)");
+        root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)");
+
+        let seen_first_word_group_id = lexer.seen_first_word_state;
+        let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id);
+        seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)");
+        seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)");
+        seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)");
+        seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)");
+
+        lexer
+    }
+
+    fn groups(&self) -> &Registry {
+        self.lexer.groups()
+    }
+}
+
+
+
+// ===================
+// === Lexer State ===
+// ===================
+
+/// The stateful components of the test lexer.
+#[derive(Debug)]
+pub struct TestState {
+    /// The registry for groups in the lexer.
+    lexer_states:group::Registry,
+    /// The initial state of the lexer.
+    initial_state:group::Identifier,
+    /// The state entered when the first word has been seen.
+    seen_first_word_state:group::Identifier,
+    /// The bookmarks for this lexer.
+    bookmarks:BookmarkManager
+}
+
+
+// === Trait Impls ===
+
+impl flexer::State for TestState {
+    fn new() -> Self {
+        let mut lexer_states = group::Registry::default();
+        let initial_state = lexer_states.define_group("ROOT",None);
+        let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None);
+        let bookmarks = BookmarkManager::new();
+        Self{lexer_states,initial_state,seen_first_word_state,bookmarks}
+    }
+
+    fn initial_state(&self) -> group::Identifier {
+        self.initial_state
+    }
+
+    fn groups(&self) -> &group::Registry {
+        &self.lexer_states
+    }
+
+    fn groups_mut(&mut self) -> &mut group::Registry {
+        &mut self.lexer_states
+    }
+
+    fn bookmarks(&self) -> &BookmarkManager {
+        &self.bookmarks
+    }
+
+    fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
+        &mut self.bookmarks
+    }
+
+    fn specialize(&self) -> Result<String,GenError> {
+        generate::specialize(self,"TestLexer","TokenStream")
+    }
+}
diff --git a/lib/rust/flexer-testing/generation/Cargo.toml b/lib/rust/flexer-testing/generation/Cargo.toml
new file mode 100644
index 00000000000..2b1ca75a9b2
--- /dev/null
+++ b/lib/rust/flexer-testing/generation/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "flexer-test-generation"
+version = "0.1.0"
+authors = ["Enso Team "]
+edition = "2018"
+
+publish = false
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+test = true
+bench = true
+
+[dependencies]
+flexer = { path = "../../flexer" , version = "0.1.0" }
+flexer-test-definition = { path = "../definition", version = "0.1.0" }
+
+[build-dependencies]
+flexer = { path = "../../flexer" , version = "0.1.0" }
+flexer-test-definition = { path = "../definition", version = "0.1.0" }
diff --git a/lib/rust/flexer-testing/generation/build.rs b/lib/rust/flexer-testing/generation/build.rs
new file mode 100644
index 00000000000..154a49db7be
--- /dev/null
+++ b/lib/rust/flexer-testing/generation/build.rs
@@ -0,0 +1,32 @@
+use std::fs::File;
+use std::io::prelude::*;
+use flexer_test_definition::TestLexer;
+use flexer::Definition;
+use flexer::State;
+
+
+
+/// Generates the lexer engine and saves the result into the file `src/generated/engine.rs`.
+///
+/// The content of the generated file can be used with the `include!` macro.
+fn generate_engine() -> std::io::Result<()> {
+    let definition_path = "../definition/src/lib.rs";
+    let output_directory = "src/generated";
+    let _ = std::fs::create_dir(output_directory);
+    let output_path = "src/generated/engine.rs";
+    let definition_error = format!("The lexer definition should exist at {}.",definition_path);
+    let output_error = format!("Cannot open output file at {}.",output_path);
+    let mut lexer_def = File::open(definition_path).expect(definition_error.as_str());
+    let mut contents = String::new();
+    let mut file = File::create(output_path).expect(output_error.as_str());
+    let lexer = TestLexer::define();
+    let engine = lexer.specialize().unwrap();
+    lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition.");
+    file.write_all(contents.as_bytes()).expect("Unable to write lexer definition.");
+    file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization.");
+    Ok(())
+}
+
+fn main() -> std::io::Result<()> {
+    generate_engine()
+}
diff --git a/lib/rust/flexer-testing/generation/src/generated.rs b/lib/rust/flexer-testing/generation/src/generated.rs
new file mode 100644
index 00000000000..99ac31885e5
--- /dev/null
+++ b/lib/rust/flexer-testing/generation/src/generated.rs
@@ -0,0 +1,3 @@
+//! This module serves to re-export the generated lexer.
+
+pub mod engine;
diff --git a/lib/rust/flexer-testing/generation/src/lib.rs b/lib/rust/flexer-testing/generation/src/lib.rs
new file mode 100644
index 00000000000..02eb98737a3
--- /dev/null
+++ b/lib/rust/flexer-testing/generation/src/lib.rs
@@ -0,0 +1,19 @@
+//! This library exposes the specialized version of the test lexer.
+//!
+//! Its sole purpose is to avoid the lexer definition getting out of sync with its implementation
+//! (the generated engine), which requires the engine to live in a separate crate.
+//!
+//! This separation enables generation of the lexer source code with `build.rs` during
+//! compilation. Its output is then stored in a new file `engine.rs` and exported by
+//! `generated.rs`.
+
+#![feature(test)]
+#![deny(unconditional_recursion)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(trivial_casts)]
+#![warn(trivial_numeric_casts)]
+#![warn(unsafe_code)]
+#![warn(unused_import_braces)]
+
+pub mod generated;
diff --git a/lib/rust/flexer-testing/generation/tests/test_generated_lexer.rs b/lib/rust/flexer-testing/generation/tests/test_generated_lexer.rs
new file mode 100644
index 00000000000..d1dace86a92
--- /dev/null
+++ b/lib/rust/flexer-testing/generation/tests/test_generated_lexer.rs
@@ -0,0 +1,110 @@
+#![feature(test)]
+#![deny(unconditional_recursion)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(trivial_casts)]
+#![warn(trivial_numeric_casts)]
+#![warn(unsafe_code)]
+#![warn(unused_import_braces)]
+
+//! This file contains tests for the generated lexer.
+
+use flexer::prelude::*;
+
+use flexer::prelude::reader::decoder::DecoderUTF8;
+use flexer_test_generation::generated::engine::TestLexer;
+use flexer_test_generation::generated::engine::Token;
+use flexer_test_generation::generated::engine::TokenStream;
+
+
+
+// =============
+// === Tests ===
+// =============
+
+/// Executes the test on the provided input string slice.
+fn run_test_on(str:impl AsRef<str>) -> TokenStream {
+    // Hardcoded for ease of use here.
+ let reader = Reader::new(str.as_ref().as_bytes(), DecoderUTF8()); + let mut lexer = TestLexer::new(); + let run_result = lexer.run(reader); + + match run_result.kind { + flexer::ResultKind::Success => run_result.tokens, + _ => default() + } +} + +#[test] +fn test_single_a_word() { + let input = "aaaaa"; + let expected_output = TokenStream::from(vec![Token::word(input)]); + let result = run_test_on(input); + assert_eq!(result, expected_output); +} + +#[test] +fn test_single_b_word() { + let input = "bbbbb"; + let expected_output = TokenStream::from(vec![Token::word(input)]); + let result = run_test_on(input); + assert_eq!(result, expected_output); +} + +#[test] +fn test_two_word() { + let input = "aaaaa bbbbb"; + let expected_output = TokenStream::from(vec![Token::word("aaaaa"), Token::word("bbbbb")]); + let result = run_test_on(input); + assert_eq!(result, expected_output); +} + +#[test] +fn test_multi_word() { + let input = "bbb aa a b bbbbb aa"; + let expected_output = TokenStream::from(vec![ + Token::word("bbb"), + Token::word("aa"), + Token::word("a"), + Token::word("b"), + Token::word("bbbbb"), + Token::word("aa") + ]); + let result = run_test_on(input); + assert_eq!(result, expected_output); +} + +#[test] +fn test_invalid_single_word() { + let input = "c"; + let expected_output = TokenStream::from(vec![Token::unrecognized(input)]); + let result = run_test_on(input); + assert_eq!(result, expected_output); +} + +#[test] +fn test_multi_word_invalid() { + let input = "aaaaaa c bbbbbb"; + let expected_output = TokenStream::from(vec![ + Token::word("aaaaaa"), + Token::unrecognized(" "), + Token::unrecognized("c"), + Token::unrecognized(" "), + Token::word("bbbbbb"), + ]); + let result = run_test_on(input); + assert_eq!(result, expected_output); +} + +#[test] +fn test_end_invalid() { + let input = "bbbbbb c"; + let expected_output = TokenStream::from(vec![ + Token::word("bbbbbb"), + Token::unrecognized(" "), + Token::unrecognized("c"), + ]); + let result = run_test_on(input); + assert_eq!(result, expected_output); +} diff --git a/lib/rust/flexer/Cargo.toml b/lib/rust/flexer/Cargo.toml index 133716fb991..a7578d224ca 100644 --- a/lib/rust/flexer/Cargo.toml +++ b/lib/rust/flexer/Cargo.toml @@ -22,10 +22,16 @@ test = true bench = true [dependencies] -enso-prelude = { path = "../enso-prelude", version = "0.1.0" } +enso-logger = { path = "../enso-logger", version = "0.1.0" } +enso-prelude = { path = "../enso-prelude", version = "0.1.0" } +lazy-reader = { path = "../lazy-reader", version = "0.1.0" } +enso-macro-utils = { path = "../enso-macro-utils", version = "0.1.0" } + itertools = "0.8" -lazy-reader = { path = "../lazy-reader", version = "0.1.0" } +proc-macro2 = "1.0.19" nonempty = "0.1.5" +quote = "1.0" +syn = { version = "1.0.12", features = ["full", "extra-traits", "visit-mut", "visit", "parsing", "printing"] } unicode-segmentation = "1.6.0" wasm-bindgen = "0.2" diff --git a/lib/rust/flexer/src/automata/alphabet.rs b/lib/rust/flexer/src/automata/alphabet.rs index 977e1e1cf83..decd8d0ce76 100644 --- a/lib/rust/flexer/src/automata/alphabet.rs +++ b/lib/rust/flexer/src/automata/alphabet.rs @@ -1,8 +1,9 @@ //! Exports an alphabet for an arbitrary finite state automaton. 
+use crate::prelude::*;
+
 use crate::automata::symbol::Symbol;

-use crate::prelude::*;
 use std::collections::BTreeSet;
 use std::ops::RangeInclusive;

@@ -49,15 +50,15 @@ use std::ops::RangeInclusive;
 #[derive(Clone,Debug,PartialEq,Eq)]
 #[allow(missing_docs)]
 pub struct Segmentation {
-    pub divisions: BTreeSet<Symbol>
+    pub divisions:BTreeSet<Symbol>
 }

 impl Segmentation {
     /// Inserts a range of symbols into the alphabet.
     pub fn insert(&mut self, range:RangeInclusive<Symbol>) {
         self.divisions.insert(Symbol::from(range.start()));
-        if range.end().val != Symbol::EOF_CODE.val {
-            self.divisions.insert(Symbol{val:range.end().val + 1});
+        if range.end().value != Symbol::EOF_CODE.value {
+            self.divisions.insert(Symbol{value:range.end().value + 1});
         }
     }

@@ -69,6 +70,11 @@ impl Segmentation {
         }
         dict
     }
+
+    /// Obtains the divisions in the alphabet segmentation as a vector.
+    pub fn divisions_as_vec(&self) -> Vec<Division> {
+        self.divisions.iter().copied().enumerate().map(From::from).collect()
+    }
 }

@@ -80,6 +86,45 @@ impl Default for Segmentation {
         // The existence of the default (0) member in the set is assumed by the implementation of
         // the NFA -> DFA conversion.
         divisions.insert(default());
-        Segmentation { divisions }
+        Segmentation{divisions}
     }
 }
+
+
+
+// ================
+// === Division ===
+// ================
+
+/// A division of the alphabet used by the lexer.
+#[derive(Copy,Clone,Debug,PartialEq,Eq)]
+pub struct Division {
+    /// The position of the division.
+    pub position : usize,
+    /// The symbol at which it divides the alphabet.
+    pub symbol : Symbol,
+}
+
+impl Division {
+    /// Create a new division.
+    pub fn new(position:usize, symbol:Symbol) -> Division {
+        Division{position,symbol}
+    }
+}
+
+
+// === Trait Impls ===
+
+impl Into<(usize,Symbol)> for Division {
+    fn into(self) -> (usize, Symbol) {
+        (self.position,self.symbol)
+    }
+}
+
+impl From<(usize,Symbol)> for Division {
+    fn from((position, symbol): (usize, Symbol)) -> Self {
+        Division::new(position,symbol)
+    }
+}
+
+
diff --git a/lib/rust/flexer/src/automata/dfa.rs b/lib/rust/flexer/src/automata/dfa.rs
index 691b46306bf..ef51a276d5b 100644
--- a/lib/rust/flexer/src/automata/dfa.rs
+++ b/lib/rust/flexer/src/automata/dfa.rs
@@ -22,7 +22,7 @@ use crate::data::matrix::Matrix;
 #[derive(Clone,Debug,Default,Eq,PartialEq)]
 pub struct DFA {
     /// A set of disjoint intervals over the allowable input alphabet.
-    pub alphabet_segmentation: alphabet::Segmentation,
+    pub alphabet_segmentation:alphabet::Segmentation,
     /// The transition matrix for the DFA.
     ///
     /// It represents a function of type `(state, symbol) -> state`, returning the identifier for
@@ -38,9 +38,22 @@ pub struct DFA {
     /// |  0  |  1  |  -  |
     /// |  1  |  -  |  0  |
     ///
-    pub links: Matrix<state::Identifier>,
+    pub links:Matrix<state::Identifier>,
     /// A collection of callbacks for each state (indexable in order)
-    pub callbacks: Vec<Option<RuleExecutable>>,
+    pub callbacks:Vec<Option<RuleExecutable>>,
+}
+
+impl DFA {
+    /// Check whether the DFA has a rule for the target state.
+    ///
+    /// This method should only be used in generated code, where its invariants are already checked.
+    ///
+    /// # Panics
+    ///
+    /// If no callback exists for `target_state`.
+    pub fn has_rule_for(&self, target_state:state::Identifier) -> bool {
+        self.callbacks.get(target_state.id).unwrap().is_some()
+    }
 }

@@ -74,9 +87,9 @@ impl From<Vec<Vec<usize>>> for Matrix<state::Identifier> {
 #[derive(Clone,Debug,PartialEq,Eq)]
 pub struct RuleExecutable {
     /// A description of the priority with which the callback is constructed during codegen.
-    pub priority: usize,
+    pub priority:usize,
     /// The rust code that will be executed when running this callback.
- pub code: String, + pub code:String, } impl RuleExecutable { @@ -104,11 +117,11 @@ pub mod tests { /// DFA automata that accepts newline '\n'. pub fn newline() -> DFA { DFA { - alphabet_segmentation: alphabet::Segmentation::from_divisions(&[10,11]), - links: Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]), - callbacks: vec![ + alphabet_segmentation:alphabet::Segmentation::from_divisions(&[10,11]), + links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]), + callbacks:vec![ None, - Some(RuleExecutable {priority:2, code:"group0_rule0".into()}), + Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}), ], } } @@ -116,11 +129,11 @@ pub mod tests { /// DFA automata that accepts any letter a..=z. pub fn letter() -> DFA { DFA { - alphabet_segmentation: alphabet::Segmentation::from_divisions(&[97,123]), - links: Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]), - callbacks: vec![ + alphabet_segmentation:alphabet::Segmentation::from_divisions(&[97,123]), + links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]), + callbacks:vec![ None, - Some(RuleExecutable {priority:2, code:"group0_rule0".into()}), + Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}), ], } } @@ -128,16 +141,16 @@ pub mod tests { /// DFA automata that accepts any number of spaces ' '. pub fn spaces() -> DFA { DFA { - alphabet_segmentation: alphabet::Segmentation::from_divisions(&[0,32,33]), - links: Matrix::from(vec![ + alphabet_segmentation:alphabet::Segmentation::from_divisions(&[0,32,33]), + links:Matrix::from(vec![ vec![INVALID,1,INVALID], vec![INVALID,2,INVALID], vec![INVALID,2,INVALID], ]), - callbacks: vec![ + callbacks:vec![ None, - Some(RuleExecutable {priority:3, code:"group0_rule0".into()}), - Some(RuleExecutable {priority:3, code:"group0_rule0".into()}), + Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}), + Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}), ], } } @@ -145,18 +158,18 @@ pub mod tests { /// DFA automata that accepts one letter a..=z or any many spaces. pub fn letter_and_spaces() -> DFA { DFA { - alphabet_segmentation: alphabet::Segmentation::from_divisions(&[32,33,97,123]), - links: Matrix::from(vec![ + alphabet_segmentation:alphabet::Segmentation::from_divisions(&[32,33,97,123]), + links:Matrix::from(vec![ vec![INVALID, 1,INVALID, 2,INVALID], vec![INVALID, 3,INVALID,INVALID,INVALID], vec![INVALID,INVALID,INVALID,INVALID,INVALID], vec![INVALID, 3,INVALID,INVALID,INVALID], ]), - callbacks: vec![ + callbacks:vec![ None, - Some(RuleExecutable {priority:4, code:"group0_rule1".into()}), - Some(RuleExecutable {priority:4, code:"group0_rule0".into()}), - Some(RuleExecutable {priority:4, code:"group0_rule1".into()}), + Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}), + Some(RuleExecutable {priority:4, code:"group_0_rule_0".into()}), + Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}), ], } } diff --git a/lib/rust/flexer/src/automata/nfa.rs b/lib/rust/flexer/src/automata/nfa.rs index 79624b7c888..bdf09c72e22 100644 --- a/lib/rust/flexer/src/automata/nfa.rs +++ b/lib/rust/flexer/src/automata/nfa.rs @@ -1,8 +1,8 @@ //! The structure for defining non-deterministic finite automata. 
 use crate::automata::alphabet;
-use crate::automata::dfa::RuleExecutable;
 use crate::automata::dfa::DFA;
+use crate::automata::dfa::RuleExecutable;
 use crate::automata::pattern::Pattern;
 use crate::automata::state::State;
 use crate::automata::state::Transition;
@@ -45,9 +45,9 @@ type StateSetId = BTreeSet<state::Identifier>;
 #[derive(Clone,Debug,Default,PartialEq,Eq)]
 pub struct NFA {
     /// A set of disjoint intervals over the input alphabet.
-    pub alphabet_segmentation: alphabet::Segmentation,
+    pub alphabet_segmentation:alphabet::Segmentation,
     /// A set of named NFA states, with (epsilon) transitions.
-    pub states: Vec<State>,
+    pub states:Vec<State>,
 }

 impl NFA {
@@ -55,7 +55,7 @@
     pub fn new_state(&mut self) -> state::Identifier {
         let id = self.states.len();
         self.states.push(State::default());
-        state::Identifier {id}
+        state::Identifier{id}
     }

     /// Creates an epsilon transition between two states.
@@ -72,9 +72,10 @@
     /// state, it will immediately transition to the `target` state.
     pub fn connect_via
     ( &mut self
-    , source:state::Identifier
-    , target_state:state::Identifier
-    , symbols:&RangeInclusive<Symbol>) {
+    , source : state::Identifier
+    , target_state : state::Identifier
+    , symbols : &RangeInclusive<Symbol>
+    ) {
         self.alphabet_segmentation.insert(symbols.clone());
         self.states[source.id].links.push(Transition{symbols:symbols.clone(),target_state});
     }
@@ -159,6 +160,9 @@
         }
     }

+
+// === Trait Impls ===
+
 impl From<&NFA> for DFA {

     /// Transforms an NFA into a DFA, based on the algorithm described
@@ -233,33 +237,33 @@ pub mod tests {
     /// NFA that accepts a newline '\n'.
     pub fn newline() -> NFA {
         NFA {
-            states: vec![
+            states:vec![
                 State::from(vec![1]),
                 State::from(vec![(10..=10,2)]),
-                State::from(vec![3]).named("group0_rule0"),
+                State::from(vec![3]).named("group_0_rule_0"),
                 State::default(),
             ],
-            alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![10, 11].as_slice()),
+            alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![10, 11].as_slice()),
         }
     }

     /// NFA that accepts any letter in the range a..=z.
     pub fn letter() -> NFA {
         NFA {
-            states: vec![
+            states:vec![
                 State::from(vec![1]),
                 State::from(vec![(97..=122,2)]),
-                State::from(vec![3]).named("group0_rule0"),
+                State::from(vec![3]).named("group_0_rule_0"),
                 State::default(),
             ],
-            alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![97, 123].as_slice()),
+            alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![97, 123].as_slice()),
         }
     }

     /// NFA that accepts any number of spaces ' '.
     pub fn spaces() -> NFA {
         NFA {
-            states: vec![
+            states:vec![
                 State::from(vec![1]),
                 State::from(vec![2]),
                 State::from(vec![(32..=32,3)]),
                 State::from(vec![4]),
                 State::from(vec![5,8]),
                 State::from(vec![6]),
                 State::from(vec![(32..=32,7)]),
                 State::from(vec![8]),
-                State::from(vec![5,9]).named("group0_rule0"),
+                State::from(vec![5,9]).named("group_0_rule_0"),
                 State::default(),
             ],
-            alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![0, 32, 33].as_slice()),
+            alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![0, 32, 33].as_slice()),
         }
     }

     /// NFA that accepts one letter a..=z or many spaces ' '.
pub fn letter_and_spaces() -> NFA { NFA { - states: vec![ + states:vec![ State::from(vec![1,3]), State::from(vec![(97..=122,2)]), - State::from(vec![11]).named("group0_rule0"), + State::from(vec![11]).named("group_0_rule_0"), State::from(vec![4]), State::from(vec![(32..=32,5)]), State::from(vec![6]), @@ -289,10 +293,10 @@ pub mod tests { State::from(vec![8]), State::from(vec![(32..=32,9)]), State::from(vec![10]), - State::from(vec![7,11]).named("group0_rule1"), + State::from(vec![7,11]).named("group_0_rule_1"), State::default(), ], - alphabet_segmentation: alphabet::Segmentation::from_divisions(vec![32, 33, 97, 123].as_slice()), + alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![32, 33, 97, 123].as_slice()), } } diff --git a/lib/rust/flexer/src/automata/pattern.rs b/lib/rust/flexer/src/automata/pattern.rs index d5f76101ed5..75380d7553e 100644 --- a/lib/rust/flexer/src/automata/pattern.rs +++ b/lib/rust/flexer/src/automata/pattern.rs @@ -1,5 +1,8 @@ //! Simple API for constructing regex patterns that are used in parser implementation. +#[macro_use] +mod macros; + use crate::automata::symbol::Symbol; use core::iter; @@ -132,24 +135,26 @@ impl Pattern { impl BitOr for Pattern { type Output = Pattern; - fn bitor(self, rhs: Pattern) -> Self::Output { + fn bitor(self, rhs:Pattern) -> Self::Output { match (self, rhs) { - (Or(mut lhs), Or( rhs)) => {lhs.extend(rhs) ; Or(lhs)}, - (Or(mut lhs), rhs ) => {lhs.push(rhs) ; Or(lhs)}, - (lhs , Or(mut rhs)) => {rhs.push(lhs) ; Or(rhs)}, + (Or(mut lhs), Or( rhs)) => {lhs.extend(rhs) ; Or(lhs)}, + (Or(mut lhs), rhs ) => {lhs.push(rhs) ; Or(lhs)}, + (lhs , Or(mut rhs)) => {rhs.insert(0,lhs) ; Or(rhs)}, (lhs , rhs ) => Or(vec![lhs,rhs]), } } } +gen_ref_versions!(Pattern,BitOr,bitor); impl Shr for Pattern { type Output = Pattern; - fn shr(self, rhs: Pattern) -> Self::Output { + fn shr(self, rhs:Pattern) -> Self::Output { match (self, rhs) { - (Seq(mut lhs), Seq(rhs)) => {lhs.extend(rhs) ; Seq(lhs)}, - (Seq(mut lhs), rhs ) => {lhs.push(rhs) ; Seq(lhs)}, - (lhs , Seq(mut rhs)) => {rhs.push(lhs) ; Seq(rhs)}, + (Seq(mut lhs), Seq(rhs) ) => {lhs.extend(rhs) ; Seq(lhs)}, + (Seq(mut lhs), rhs ) => {lhs.push(rhs) ; Seq(lhs)}, + (lhs , Seq(mut rhs)) => {rhs.insert(0,lhs) ; Seq(rhs)}, (lhs , rhs ) => Seq(vec![lhs, rhs]), } } } +gen_ref_versions!(Pattern,Shr,shr); diff --git a/lib/rust/flexer/src/automata/pattern/macros.rs b/lib/rust/flexer/src/automata/pattern/macros.rs new file mode 100644 index 00000000000..5e43b948d6e --- /dev/null +++ b/lib/rust/flexer/src/automata/pattern/macros.rs @@ -0,0 +1,28 @@ +//! Useful macros for defining operators over patterns. + +/// Generates versions of an operator taking various combinations of by-reference and by-value. +#[macro_export] +macro_rules! 
gen_ref_versions {
+    ($ty_name:ty,$opr_name:ident,$fn_name:ident) => (
+        impl $opr_name<&$ty_name> for &$ty_name {
+            type Output = $ty_name;
+            fn $fn_name(self, rhs:&$ty_name) -> Self::Output {
+                self.clone().$fn_name(rhs.clone())
+            }
+        }
+
+        impl $opr_name<&$ty_name> for $ty_name {
+            type Output = $ty_name;
+            fn $fn_name(self, rhs:&$ty_name) -> Self::Output {
+                self.$fn_name(rhs.clone())
+            }
+        }
+
+        impl $opr_name<$ty_name> for &$ty_name {
+            type Output = $ty_name;
+            fn $fn_name(self, rhs:$ty_name) -> Self::Output {
+                self.clone().$fn_name(rhs)
+            }
+        }
+    )
+}
diff --git a/lib/rust/flexer/src/automata/state.rs b/lib/rust/flexer/src/automata/state.rs
index 03538ba4d2b..16c7588998d 100644
--- a/lib/rust/flexer/src/automata/state.rs
+++ b/lib/rust/flexer/src/automata/state.rs
@@ -15,15 +15,17 @@ use crate::prelude::*;
 #[derive(Clone,Debug,Default,PartialEq,Eq)]
 pub struct State {
     /// A set of transitions that can trigger without consuming a symbol (ε-transitions).
-    pub epsilon_links: Vec<Identifier>,
+    pub epsilon_links:Vec<Identifier>,
     /// The set of transitions that trigger while consuming a specific symbol.
     ///
     /// When triggered, the automaton will transition to the [`Transition::target_state`].
-    pub links: Vec<Transition>,
+    pub links:Vec<Transition>,
     /// The name of the state.
     ///
     /// This is used to auto-generate a call to the rust method of the same name.
-    pub name: Option<String>,
+    pub name:Option<String>,
+    /// The function to call when evaluating the state.
+    pub callback:String
 }

 impl State {
@@ -59,8 +61,8 @@
 impl From<Vec<usize>> for State {
     /// Creates a state with epsilon links.
     fn from(vec:Vec<usize>) -> Self {
-        let epsilon_links = vec.iter().cloned().map(|id| Identifier {id}).collect();
-        State {epsilon_links,..Default::default()}
+        let epsilon_links = vec.iter().cloned().map(|id| Identifier{id}).collect();
+        State{epsilon_links,..Default::default()}
     }
 }

@@ -68,12 +70,12 @@
 impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
     /// Creates a state with ordinary links.
     fn from(vec:Vec<(RangeInclusive<u32>, usize)>) -> Self {
         let link = |(range, id): (RangeInclusive<u32>, usize)| {
-            let start = Symbol{val:*range.start()};
-            let end = Symbol{val:*range.end()};
-            Transition {symbols: start..=end, target_state: Identifier { id }}
+            let start = Symbol{value:*range.start()};
+            let end = Symbol{value:*range.end()};
+            Transition{symbols:start..=end,target_state:Identifier{id}}
         };
         let links = vec.iter().cloned().map(link).collect();
-        State {links,..Default::default()}
+        State{links,..Default::default()}
     }
 }

@@ -128,7 +130,7 @@ impl From<usize> for Identifier {
 #[derive(Clone,Debug,PartialEq,Eq)]
 pub struct Transition {
     /// The range of symbols on which this transition will trigger.
-    pub symbols: RangeInclusive<Symbol>,
+    pub symbols:RangeInclusive<Symbol>,
     /// The state that is entered after the transition has triggered.
-    pub target_state: Identifier,
+    pub target_state:Identifier,
 }
diff --git a/lib/rust/flexer/src/automata/symbol.rs b/lib/rust/flexer/src/automata/symbol.rs
index fdaa56875e1..295ee369cfc 100644
--- a/lib/rust/flexer/src/automata/symbol.rs
+++ b/lib/rust/flexer/src/automata/symbol.rs
@@ -9,16 +9,16 @@
 /// An input symbol to a finite automaton.
 #[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
 pub struct Symbol {
-    #[allow(missing_docs)]
-    pub val: u32
+    /// The 4-byte representation of the symbol.
+    pub value:u32
 }

 impl Symbol {
     /// A representation of the end of the file.
-    pub const EOF_CODE:Symbol = Symbol{val:u32::max_value()};
+    pub const EOF_CODE:Symbol = Symbol{value:u32::max_value()};

     /// A representation of the null symbol.
-    pub const NULL:Symbol = Symbol{val:0};
+    pub const NULL:Symbol = Symbol{value:0};
 }

@@ -31,19 +31,20 @@ impl Default for Symbol {
 }

 impl From<u32> for Symbol {
-    fn from(val:u32) -> Symbol {
-        Symbol{val}
+    fn from(value:u32) -> Symbol {
+        Symbol{value}
     }
 }

 impl From<char> for Symbol {
-    fn from(val:char) -> Symbol {
-        Symbol{val:val as u32}
+    fn from(value:char) -> Symbol {
+        Symbol{value:value as u32}
     }
 }

 impl From<&Symbol> for Symbol {
-    fn from(symb: &Symbol) -> Self {
-        Symbol{val:symb.val}
+    fn from(symbol:&Symbol) -> Self {
+        let value = symbol.value;
+        Symbol{value}
     }
 }
diff --git a/lib/rust/flexer/src/data/matrix.rs b/lib/rust/flexer/src/data/matrix.rs
index 790032c9ccc..f7590e591b4 100644
--- a/lib/rust/flexer/src/data/matrix.rs
+++ b/lib/rust/flexer/src/data/matrix.rs
@@ -1,6 +1,6 @@
 //! An efficient representation of a 2D matrix.

-use enso_prelude::default;
+use crate::prelude::*;

 use std::ops::Index;
 use std::ops::IndexMut;

@@ -15,11 +15,28 @@ use std::ops::IndexMut;
 #[derive(Clone,Debug,Default,PartialEq,Eq)]
 pub struct Matrix<T> {
     /// The number of rows in the matrix.
-    rows: usize,
+    rows:usize,
     /// The number of columns in the matrix.
-    columns: usize,
+    columns:usize,
     /// The matrix.
-    matrix: Vec<T>,
+    matrix:Vec<T>,
+}
+
+impl<T> Matrix<T> {
+    /// Get the number of rows in the matrix.
+    pub fn rows(&self) -> usize {
+        self.rows
+    }
+
+    /// Get the number of columns in the matrix.
+    pub fn columns(&self) -> usize {
+        self.columns
+    }
+
+    /// Obtain the indices for the rows in this matrix.
+    pub fn row_indices(&self) -> Range<usize> {
+        0..self.rows()
+    }
 }

 impl<T:Default> Matrix<T> {
diff --git a/lib/rust/flexer/src/generate.rs b/lib/rust/flexer/src/generate.rs
new file mode 100644
index 00000000000..4abc2e79b69
--- /dev/null
+++ b/lib/rust/flexer/src/generate.rs
@@ -0,0 +1,528 @@
+//! This file contains utilities for generating rust code from lexer definitions, allowing the
+//! flexer to be specialised for a specific language.
+
+use crate::prelude::*;
+use quote::*;
+use syn::*;
+
+use crate::automata::dfa::DFA;
+use crate::automata::dfa::RuleExecutable;
+use crate::automata::state::Identifier;
+use crate::automata::state::State;
+use crate::group::Group;
+use crate::group;
+
+use enso_macro_utils::repr;
+use proc_macro2::Literal;
+use std::hash::BuildHasher;
+use std::result::Result;
+use std::fmt;
+
+use crate as flexer;
+
+
+
+// =======================
+// === Code Generation ===
+// =======================
+
+/// Generate specialized code for the provided lexer `definition`.
+///
+/// This specialized code is a highly-optimised and tailored lexer that dispatches based on simple
+/// code-point switches, with no dynamic lookup. This means that it is very fast, and very low
+/// overhead.
+pub fn specialize
+( definition : &impl flexer::State
+, state_type_name : impl Into<String>
+, output_type_name : impl Into<String>
+) -> Result<String,GenError> {
+    let group_registry = definition.groups();
+    let mut body_items = Vec::new();
+    body_items.push(run_function(output_type_name)?);
+    body_items.push(run_current_state_function());
+    body_items.push(step(group_registry));
+    for group in group_registry.all().iter() {
+        body_items.extend(automaton_for_group(group,group_registry)?)
+    }
+    let result = wrap_in_impl_for(state_type_name,body_items)?;
+    let code = show_code(&result);
+    Ok(code)
+}
+
+
+// === Whole-Lexer Codegen Utilities ===
+
+/// Wrap the provided implementation items into an `impl` block for the provided `state_name` type.
+pub fn wrap_in_impl_for +( state_name : impl Into +, body : Vec +) -> Result { + let state_name:Ident = str_to_ident(state_name.into().as_str())?; + let mut tree:ItemImpl = parse_quote! { + #[allow(missing_docs,dead_code)] + impl #state_name {} + }; + tree.items.extend(body); + Ok(tree) +} + +/// Generate the `run` function for the specialized lexer. +/// +/// This function is what the user of the lexer will call +pub fn run_function(output_type_name:impl Into) -> Result { + let output_type_name:Ident = str_to_ident(output_type_name)?; + let tree:ImplItem = parse_quote! { + pub fn run(&mut self, mut reader:R) -> LexingResult<#output_type_name> { + reader.advance_char(&mut self.bookmarks); + while self.run_current_state(&mut reader) == StageStatus::ExitSuccess {} + match self.status { + StageStatus::ExitFinished => LexingResult::success( + mem::take(&mut self.output) + ), + StageStatus::ExitFail => LexingResult::failure( + mem::take(&mut self.output) + ), + _ => LexingResult::partial(mem::take(&mut self.output)) + } + } + }; + Ok(tree) +} + +/// Generate the function responsible for executing the lexer in its current state. +pub fn run_current_state_function() -> ImplItem { + let tree:ImplItem = parse_quote! { + fn run_current_state(&mut self, reader:&mut R) -> StageStatus { + self.status = StageStatus::Initial; + + // Runs until reaching a state that no longer says to continue. + while let Some(next_state) = self.status.continue_as() { + self.logger.info(||format!("Current character is {:?}.",reader.character())); + self.logger.info(||format!("Continuing in {:?}.",next_state)); + self.status = self.step(next_state,reader); + + if reader.finished() { + self.status = StageStatus::ExitFinished + } + + if self.status.should_continue() { + match reader.character().char { + Ok(char) => { + reader.append_result(char); + self.logger.info(||format!("Result is {:?}.",reader.result())); + }, + Err(flexer::prelude::reader::Error::EndOfGroup) => { + let current_state = self.current_state(); + let group_name = self.groups().group(current_state).name.as_str(); + let err = format!("Missing rules for state {}.", group_name); + self.logger.error(err.as_str()); + panic!(err) + } + Err(_) => { + self.logger.error("Unexpected error!"); + panic!("Unexpected error!") + } + } + reader.advance_char(&mut self.bookmarks); + } + } + + self.status + } + }; + tree +} + +/// Generate the `step` function for the lexer. +/// +/// This function is responsible for dispatching based on the current state, consuming a character, +/// and returning the state to transition to. +pub fn step(groups:&group::Registry) -> ImplItem { + let arms = groups.all().iter().map(|g| step_match_arm(g.id.into())).collect_vec(); + parse_quote! { + fn step(&mut self, next_state:SubStateId, reader:&mut R) -> StageStatus { + let current_state:usize = self.current_state().into(); + match current_state { + #(#arms)* + _ => unreachable_panic!("Unreachable state reached in lexer."), + } + } + } +} + +/// Generate a match arm for the step function. +/// +/// There is one match arm per lexer state. +pub fn step_match_arm(number:usize) -> Arm { + let literal = Literal::usize_unsuffixed(number); + let function_name_str = format!("dispatch_in_state_{}",number); + let func_name:Ident = parse_str(function_name_str.as_str()).unwrap(); + let arm:Arm = parse_quote! 
{
+        #literal => self.#func_name(next_state,reader),
+    };
+    arm
+}
+
+
+// === Generation for a Specific Lexer State ===
+
+/// Generate the functions that implement the lexer automaton for a given lexer state.
+pub fn automaton_for_group
+( group    : &Group
+, registry : &group::Registry
+) -> Result<Vec<ImplItem>,GenError> {
+    let nfa = registry.to_nfa_from(group.id);
+    let mut rules = Vec::with_capacity(nfa.states.len());
+    for state in nfa.states.iter() {
+        if state.name.is_some() {
+            rules.push(rule_for_state(state)?);
+        }
+    }
+    let mut dfa             = DFA::from(&nfa);
+    let dispatch_for_dfa    = dispatch_in_state(&dfa,group.id.into())?;
+    let mut dfa_transitions = transitions_for_dfa(&mut dfa,group.id.into())?;
+    dfa_transitions.push(dispatch_for_dfa);
+    dfa_transitions.extend(rules);
+    Ok(dfa_transitions)
+}
+
+/// Generate a set of transition functions for the provided `dfa`, with identifier `id`.
+pub fn transitions_for_dfa(dfa:&mut DFA, id:usize) -> Result<Vec<ImplItem>,GenError> {
+    let mut state_has_overlapping_rules:HashMap<usize,bool> = HashMap::new();
+    state_has_overlapping_rules.insert(0,false);
+    let state_names:Vec<_> =
+        dfa.links.row_indices().map(|ix| (ix, name_for_step(id, ix))).collect();
+    let mut transitions = Vec::with_capacity(state_names.len());
+    for (ix,name) in state_names.into_iter() {
+        transitions.push(transition_for_dfa(dfa,name,ix,&mut state_has_overlapping_rules)?)
+    }
+    Ok(transitions)
+}
+
+/// Generate a specific transition function for the DFA state `state_ix`.
+#[allow(clippy::implicit_hasher)]
+pub fn transition_for_dfa
+( dfa             : &mut DFA
+, transition_name : Ident
+, state_ix        : usize
+, has_overlaps    : &mut HashMap<usize,bool>
+) -> Result<ImplItem,GenError> {
+    let match_expr:Expr   = match_for_transition(dfa,state_ix,has_overlaps)?;
+    let function:ImplItem = parse_quote! {
+        fn #transition_name<R:LazyReader>(&mut self, reader:&mut R) -> StageStatus {
+            #match_expr
+        }
+    };
+    Ok(function)
+}
+
+/// Generate the pattern match for a given transition function.
+pub fn match_for_transition
+( dfa          : &mut DFA
+, state_ix     : usize
+, has_overlaps : &mut HashMap<usize,bool>
+) -> Result<Expr,GenError> {
+    let overlaps          = *has_overlaps.get(&state_ix).unwrap_or(&false);
+    let state             = dfa.callbacks.get(state_ix).expect("Internal error.").clone();
+    let mut trigger_state = dfa.links[(state_ix,0)];
+    let mut range_start   = u32::min_value();
+    let divisions:Vec<_>  = dfa.alphabet_segmentation.divisions_as_vec();
+    let mut branches      = Vec::with_capacity(divisions.len());
+    for division in divisions.into_iter() {
+        let ix                = division.position;
+        let sym               = division.symbol;
+        let new_trigger_state = dfa.links[(state_ix,ix)];
+        if new_trigger_state != trigger_state {
+            let range_end             = if sym.value != 0 { sym.value - 1 } else { sym.value };
+            let current_trigger_state = trigger_state;
+            let current_range_start   = range_start;
+            trigger_state             = new_trigger_state;
+            range_start               = sym.value;
+            let body =
+                branch_body(dfa,current_trigger_state,&state,has_overlaps,overlaps)?;
+            branches.push(Branch::new(Some(current_range_start..=range_end),body))
+        }
+    }
+    let catch_all_branch_body = branch_body(dfa,trigger_state,&state,has_overlaps,overlaps)?;
+    let catch_all_branch      = Branch::new(None,catch_all_branch_body);
+    branches.push(catch_all_branch);
+    let arms:Vec<Arm> = branches.into_iter().map(Into::into).collect();
+    let mut match_expr:ExprMatch = parse_quote! {
+        match u32::from(reader.character()) {
+            #(#arms)*
+        }
+    };
+    match_expr.arms = arms;
+    Ok(Expr::Match(match_expr))
+}
+
+/// Generate the branch body for a transition in the DFA.
+pub fn branch_body +( dfa : &mut DFA +, target_state : Identifier +, maybe_state : &Option +, has_overlaps : &mut HashMap +, rules_overlap : bool +) -> Result { + if target_state == Identifier::INVALID { + match maybe_state { + None => { + Ok(parse_quote! {{ + StageStatus::ExitFail + }}) + }, + Some(rule_exec) => { + let rule:Expr = match parse_str(rule_exec.code.as_str()) { + Ok(rule) => rule, + Err(_) => return Err(GenError::BadExpression(rule_exec.code.clone())) + }; + if rules_overlap { + Ok(parse_quote! {{ + let rule_bookmark = self.bookmarks.rule_bookmark; + let matched_bookmark = self.bookmarks.matched_bookmark; + self.bookmarks.rewind(rule_bookmark,reader); + self.current_match = reader.pop_result(); + self.#rule(reader); + self.bookmarks.bookmark(matched_bookmark,reader); + StageStatus::ExitSuccess + }}) + } else { + Ok(parse_quote! {{ + let matched_bookmark = self.bookmarks.matched_bookmark; + self.current_match = reader.pop_result(); + self.#rule(reader); + self.bookmarks.bookmark(matched_bookmark,reader); + StageStatus::ExitSuccess + }}) + } + } + } + } else { + let target_state_has_no_rule = match maybe_state { + Some(state) => if !dfa.has_rule_for(target_state) { + dfa.callbacks[target_state.id] = Some(state.clone()); + has_overlaps.insert(target_state.id,true); + true + } else { + false + }, + None => false + }; + + let state_id = Literal::usize_unsuffixed(target_state.id); + let ret:Expr = parse_quote! { + StageStatus::ContinueWith(#state_id.into()) + }; + + if target_state_has_no_rule && !rules_overlap { + Ok(parse_quote! {{ + let rule_bookmark = self.bookmarks.rule_bookmark; + self.bookmarks.bookmark(rule_bookmark,reader); + #ret + }}) + } else { + Ok(parse_quote! {{ + #ret + }}) + } + } +} + +/// Generate the dispatch function for a given lexer state. +/// +/// This dispatch function is responsible for dispatching based on the sub-state of any given lexer +/// state, and is the main part of implementing the actual lexer transitions. +pub fn dispatch_in_state(dfa:&DFA, id:usize) -> Result { + let dispatch_name:Ident = str_to_ident(format!("dispatch_in_state_{}",id))?; + let state_names = dfa.links.row_indices().map(|ix| (ix, name_for_step(id,ix))).collect_vec(); + let mut branches = Vec::with_capacity(state_names.len()); + for (ix,name) in state_names.into_iter() { + let literal = Literal::usize_unsuffixed(ix); + let arm:Arm = parse_quote! { + #literal => self.#name(reader), + }; + branches.push(arm); + } + + let pattern_match:ExprMatch = parse_quote! { + match new_state_index.into() { + #(#branches)* + _ => unreachable_panic!("Unreachable state reached in lexer.") + } + }; + let func:ImplItem = parse_quote! { + fn #dispatch_name + ( &mut self + , new_state_index:SubStateId + , reader:&mut R + ) -> StageStatus { + #pattern_match + } + }; + + Ok(func) +} + +/// Generate a name for a given step function. +pub fn name_for_step(in_state:usize, to_state:usize) -> Ident { + let name_str = format!("state_{}_to_{}",in_state,to_state); + parse_str(name_str.as_str()).expect("Impossible to not be a valid identifier.") +} + +/// Generate an executable rule function for a given lexer state. 
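Putting `name_for_step`, `dispatch_in_state`, and the transition generators together, the emitted code for a lexer with a single group has roughly the following shape. This is an illustration only, not literal output of this patch; the real code depends entirely on the DFA.

```
// Sketch of generated output for group 0 (hypothetical states and ranges).
fn dispatch_in_state_0<R:LazyReader>
(&mut self, new_state_index:SubStateId, reader:&mut R) -> StageStatus {
    match new_state_index.into() {
        0 => self.state_0_to_0(reader),
        1 => self.state_0_to_1(reader),
        _ => unreachable_panic!("Unreachable state reached in lexer.")
    }
}

// `name_for_step(0,1)` produced the identifier `state_0_to_1`.
fn state_0_to_1<R:LazyReader>(&mut self, reader:&mut R) -> StageStatus {
    match u32::from(reader.character()) {
        // On 'a', continue in sub-state 2 of this group.
        97..=97 => {
            StageStatus::ContinueWith(2.into())
        },
        // Otherwise, execute the matched rule and exit this stage.
        _ => {
            let matched_bookmark = self.bookmarks.matched_bookmark;
            self.current_match   = reader.pop_result();
            self.group_0_rule_0(reader);
            self.bookmarks.bookmark(matched_bookmark,reader);
            StageStatus::ExitSuccess
        }
    }
}
```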
+pub fn rule_for_state(state:&State) -> Result { + match &state.name { + None => unreachable_panic!("Rule for state requested, but state has none."), + Some(name) => { + let rule_name = str_to_ident(name)?; + let code:Expr = match parse_str(state.callback.as_str()) { + Ok(expr) => expr, + Err(_) => return Err(GenError::BadExpression(state.callback.clone())) + }; + if !has_reader_arg(&code) { + return Err(GenError::BadCallbackArgument) + } + + let tree:ImplItem = parse_quote! { + fn #rule_name(&mut self, reader:&mut R) { + #code + } + }; + Ok(tree) + } + } +} + +/// Checks if the given `expr` is a call with a single argument "reader" being passed. +#[allow(clippy::cmp_owned)] +pub fn has_reader_arg(expr:&Expr) -> bool { + match expr { + Expr::MethodCall(expr) => match expr.args.first() { + Some(Expr::Path(path)) => { + match path.path.segments.first() { + Some(segment) => { + segment.ident.to_string() == "reader" + } + _ => false + } + } + _ => false + }, + Expr::Call(expr) => match expr.args.first() { + Some(Expr::Path(path)) => { + match path.path.segments.first() { + Some(segment) => { + segment.ident.to_string() == "reader" + } + _ => false + } + } + _ => false + } + _ => false + } +} + + + +// ================ +// === GenError === +// ================ + +/// Errors that arise during code generation. +#[derive(Clone,Debug,PartialEq)] +pub enum GenError { + /// The callback function does not take a single argument `reader`. + BadCallbackArgument, + /// The provided string is not a valid rust identifier. + BadIdentifier(String), + /// The provided expression isn't a valid rust expression. + BadExpression(String), + /// The provided string is not a valid rust literal. + BadLiteral(String), +} + + +// === Trait Impls === + +impl Display for GenError { + fn fmt(&self, f:&mut fmt::Formatter<'_>) -> fmt::Result { + match self { + GenError::BadCallbackArgument => write!(f, + "Bad argument to a callback function. It must take a single argument `reader`." + ), + GenError::BadIdentifier(str) => write!(f,"`{}` is not a valid rust identifier.",str), + GenError::BadExpression(str) => write!(f,"`{}` is not a valid rust expression.",str), + GenError::BadLiteral(str) => write!(f,"`{}` is not a valid rust literal.",str), + } + } +} + + + +// ============== +// === Branch === +// ============== + +/// A representation of a dispatch branch for helping to generate pattern arms. +#[allow(missing_docs)] +#[derive(Clone,Debug,PartialEq)] +struct Branch { + pub range:Option>, + pub body:Block +} + +impl Branch { + /// Create a new branch, from the provided `range` and with `body` as the code it executes. + pub fn new(range:Option>, body:Block) -> Branch { + Branch {range,body} + } +} + + +// === Trait Impls === + +impl Into for Branch { + fn into(self) -> Arm { + let body = self.body; + match self.range { + Some(range) => { + let range_start = Literal::u32_unsuffixed(*range.start()); + let range_end = Literal::u32_unsuffixed(*range.end()); + if range.start() == range.end() { + parse_quote! { + #range_start => #body, + } + } else { + parse_quote! { + #range_start..=#range_end => #body, + } + } + } + None => parse_quote! { + _ => #body, + } + } + } +} + + + +// ================= +// === Utilities === +// ================= + +/// Convert a string to an identifier. +pub fn str_to_ident(str:impl Into) -> Result { + let string = str.into(); + match parse_str(string.as_ref()) { + Ok(literal) => Ok(literal), + Err(_) => Err(GenError::BadIdentifier(string)) + } +} + +/// Convert the syntax tree into a string. 
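As a quick illustration of the utilities above (a sketch, assuming `syn` and this module's items are in scope): `str_to_ident` validates and converts a string into an identifier, reporting failure via `GenError`, while `show_code` renders a syntax tree back into source text.

```
// `str_to_ident` rejects anything that isn't a valid Rust identifier.
let ident = str_to_ident("state_1_to_2").expect("A valid identifier.");
assert!(str_to_ident("not an identifier").is_err());

// `show_code` turns a syntax tree back into a source string.
let item:syn::ItemFn = syn::parse_quote! {
    fn #ident() {}
};
assert!(show_code(&item).contains("state_1_to_2"));
```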
+pub fn show_code(tokens:&impl ToTokens) -> String { + repr(tokens) +} + + diff --git a/lib/rust/flexer/src/group.rs b/lib/rust/flexer/src/group.rs index 37a14f3c366..5b615cafc99 100644 --- a/lib/rust/flexer/src/group.rs +++ b/lib/rust/flexer/src/group.rs @@ -21,14 +21,18 @@ pub mod rule; #[derive(Clone,Debug,Default)] pub struct Registry { /// The groups defined for the lexer. - groups: Vec + groups:Vec } impl Registry { /// Defines a new group of rules for the lexer with the specified `name` and `parent`. /// /// It returns the identifier of the newly-created group. - pub fn define_group(&mut self, name:impl Into, parent_index:Option) -> Identifier { + pub fn define_group + ( &mut self + , name : impl Into + , parent_index : Option + ) -> Identifier { let id = self.next_id(); let group = Group::new(id,name.into(),parent_index); self.groups.push(group); @@ -47,8 +51,7 @@ impl Registry { /// /// Panics if `group_id` refers to a nonexistent group. pub fn create_rule(&mut self, group:Identifier, pattern:&Pattern, callback:impl AsRef) { - let err = format!("The provided group_id {:?} is invalid.",group); - let group = self.group_mut(group).expect(&err); + let group = self.group_mut(group); group.create_rule(pattern,callback.as_ref()); } @@ -56,8 +59,7 @@ impl Registry { /// /// Panics if `group_id` refers to a nonexistent group. pub fn add_rule(&mut self, group:Identifier, rule:Rule) { - let err = format!("The provided group_id {:?} is invalid.",group); - let group = self.group_mut(group).expect(&err); + let group = self.group_mut(group); group.add_rule(rule); } @@ -65,54 +67,65 @@ impl Registry { /// by `group_id` as active. /// /// This set of rules includes the rules inherited from any parent groups. - pub fn rules_for(&self, group:Identifier) -> Option> { - self.group(group).map(|group| { - let mut parent = group.parent_index.and_then(|ix|self.group(ix.into())); - let mut rules = (&group.rules).iter().collect_vec(); - while let Some(parent_group) = parent { - if parent_group.id == group.id { - panic!("There should not be cycles in parent links for lexer groups.") - } - rules.extend((&parent_group.rules).iter()); - parent = parent_group.parent_index.and_then(|ix|self.group(ix.into())); + pub fn rules_for(&self, group:Identifier) -> Vec<&Rule> { + let group_handle = self.group(group); + let mut parent = group_handle.parent_index.map(|p| self.group(p)); + let mut rules = (&group_handle.rules).iter().collect_vec(); + while let Some(parent_group) = parent { + if parent_group.id == group_handle.id { + panic!("There should not be cycles in parent links for lexer groups.") } - rules - }) + rules.extend((&parent_group.rules).iter()); + parent = parent_group.parent_index.map(|p| self.group(p)); + } + rules } /// Obtains a reference to the group for the given `group_id`. - pub fn group(&self, group:Identifier) -> Option<&Group> { - self.groups.get(group.val) + /// + /// As group identifiers can only be created by use of this `Registry`, this will always + /// succeed. + pub fn group(&self, group:Identifier) -> &Group { + self.groups.get(group.0).expect("The group must exist.") } /// Obtains a mutable reference to the group for the given `group_id`. - pub fn group_mut(&mut self, group:Identifier) -> Option<&mut Group> { - self.groups.get_mut(group.val) + /// + /// As group identifiers can only be created by use of this `Registry`, this will always + /// succeed. 
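The effect of these registry API changes is easiest to see in use. A minimal sketch follows (method names as introduced in this patch; with the `Option`-returning variants gone, no `expect` calls are needed at the call site):

```
use flexer::automata::pattern::Pattern;
use flexer::group;

let mut registry = group::Registry::default();
let root  = registry.define_group("ROOT",None);
let child = registry.define_group("CHILD",Some(root));

// Rules attach to a group through the registry...
registry.create_rule(root,&Pattern::char('a'),"self.on_a(reader)");

// ...and lookups are now infallible, as identifiers can only come from here.
let child_rules = registry.rules_for(child);
assert_eq!(child_rules.len(),1); // The rule inherited from ROOT.
```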
+    pub fn group_mut(&mut self, group:Identifier) -> &mut Group {
+        self.groups.get_mut(group.0).expect("The group should exist.")
     }
 
     /// Converts the group identified by `group_id` into an NFA.
     ///
-    /// Returns `None` if the group does not exist, or if the conversion fails.
-    pub fn to_nfa_from(&self, group:Identifier) -> Option<NFA> {
-        let group = self.group(group);
-        group.map(|group| {
-            let mut nfa = NFA::default();
-            let start = nfa.new_state();
-            let build = |rule:&Rule| nfa.new_pattern(start,&rule.pattern);
-            let rules = self.rules_for(group.id).expect("Group exists.");
-            let states = rules.into_iter().map(build).collect_vec();
-            let end = nfa.new_state();
-            for (ix,state) in states.into_iter().enumerate() {
-                nfa.states[state.id].name = Some(group.callback_name(ix));
-                nfa.connect(state,end);
-            }
-            nfa
-        })
+    /// As group identifiers can only be created by use of this `Registry`, the conversion will
+    /// always succeed.
+    pub fn to_nfa_from(&self, group:Identifier) -> NFA {
+        let group     = self.group(group);
+        let mut nfa   = NFA::default();
+        let start     = nfa.new_state();
+        let build     = |rule:&Rule| nfa.new_pattern(start,&rule.pattern);
+        let rules     = self.rules_for(group.id);
+        let callbacks = rules.iter().map(|r| r.callback.clone()).collect_vec();
+        let states    = rules.into_iter().map(build).collect_vec();
+        let end       = nfa.new_state();
+        for (ix,state) in states.into_iter().enumerate() {
+            nfa.states[state.id].name     = Some(group.callback_name(ix));
+            nfa.states[state.id].callback = callbacks.get(ix).unwrap().clone();
+            nfa.connect(state,end);
+        }
+        nfa
     }
 
     /// Generates the next group identifier for this registry.
     fn next_id(&self) -> Identifier {
-        Identifier::new(self.groups.len())
+        let val = self.groups.len();
+        Identifier(val)
+    }
+
+    /// Get an immutable reference to the groups contained within the registry.
+    pub fn all(&self) -> &Vec<Group> {
+        &self.groups
     }
 }
 
@@ -125,35 +138,26 @@ impl Registry {
 
 /// An identifier for a group.
 #[allow(missing_docs)]
 #[derive(Copy,Clone,Debug,Default,Eq,PartialEq)]
-pub struct Identifier {
-    val:usize
-}
-
-impl Identifier {
-    /// Creates a new identifier.
-    pub fn new(val:usize) -> Identifier {
-        Identifier{val}
-    }
-}
+pub struct Identifier(usize);
 
 
 // === Trait Impls ===
 
 impl From<usize> for Identifier {
     fn from(id:usize) -> Self {
-        Identifier::new(id)
+        Identifier(id)
     }
 }
 
 impl From<&usize> for Identifier {
     fn from(id:&usize) -> Self {
-        Identifier::new(*id)
+        Identifier(*id)
    }
 }
 
 impl Into<usize> for Identifier {
     fn into(self) -> usize {
-        self.val
+        self.0
    }
 }
 
@@ -183,21 +187,21 @@ impl Into<usize> for Identifier {
 #[derive(Clone,Debug,Default)]
 pub struct Group {
     /// A unique identifier for the group.
-    pub id: Identifier,
+    pub id:Identifier,
     /// A name for the group (useful in debugging).
-    pub name: String,
+    pub name:String,
     /// The parent group from which rules are inherited.
     ///
     /// It is ensured that the group is held mutably.
-    pub parent_index: Option<usize>,
+    pub parent_index:Option<Identifier>,
     /// A set of flexer rules.
-    pub rules: Vec<Rule>,
+    pub rules:Vec<Rule>,
 }
 
 impl Group {
     /// Creates a new group.
-    pub fn new(id:Identifier, name:impl Into<String>, parent_index:Option<usize>) -> Self {
+    pub fn new(id:Identifier, name:impl Into<String>, parent_index:Option<Identifier>) -> Self {
         let rules = Vec::new();
         Group{id,name:name.into(),parent_index,rules}
     }
@@ -216,7 +220,7 @@ impl Group {
     /// The canonical name for a given rule.
pub fn callback_name(&self, rule_ix:usize) -> String { - format!("group{}_rule{}",self.id.val,rule_ix) + format!("group_{}_rule_{}",self.id.0,rule_ix) } } @@ -297,23 +301,23 @@ pub mod tests { #[test] fn test_to_nfa_newline() { - assert_eq!(newline().to_nfa_from(default()),Some(nfa::tests::newline())); + assert_eq!(newline().to_nfa_from(default()),nfa::tests::newline()); } #[test] fn test_to_nfa_letter() { - assert_eq!(letter().to_nfa_from(default()),Some(nfa::tests::letter())); + assert_eq!(letter().to_nfa_from(default()),nfa::tests::letter()); } #[test] fn test_to_nfa_spaces() { - assert_eq!(spaces().to_nfa_from(default()),Some(nfa::tests::spaces())); + assert_eq!(spaces().to_nfa_from(default()),nfa::tests::spaces()); } #[test] fn test_to_nfa_letter_and_spaces() { let expected = nfa::tests::letter_and_spaces(); - assert_eq!(letter_and_spaces().to_nfa_from(default()),Some(expected)); + assert_eq!(letter_and_spaces().to_nfa_from(default()),expected); } #[bench] diff --git a/lib/rust/flexer/src/group/rule.rs b/lib/rust/flexer/src/group/rule.rs index f59ff23b6ac..daa1b0e56c5 100644 --- a/lib/rust/flexer/src/group/rule.rs +++ b/lib/rust/flexer/src/group/rule.rs @@ -15,7 +15,7 @@ use crate::automata::pattern::Pattern; #[derive(Clone,Debug)] pub struct Rule { /// The pattern that triggers the callback. - pub pattern: Pattern, + pub pattern:Pattern, /// The code to execute when [`Rule::pattern`] matches, containing rust code as a /// [`std::string::String`]. @@ -23,7 +23,7 @@ pub struct Rule { /// This code will be called directly from a method defined on your Lexer (the one that contains /// a [`crate::Flexer`] instance. To this end, the code you provide as a string must be valid in /// that context. - pub callback: String, + pub callback:String, } impl Rule { diff --git a/lib/rust/flexer/src/lib.rs b/lib/rust/flexer/src/lib.rs index dcb146e0228..fe2b31b2cbb 100644 --- a/lib/rust/flexer/src/lib.rs +++ b/lib/rust/flexer/src/lib.rs @@ -11,22 +11,1024 @@ //! This module exports the API for defining a simple lexer based on a deterministic finite state //! automaton. //! -//! These lexers are capable of lexing any regular grammar, with some extensions to allow working -//! with context sensitive (e.g. indentation-aware) syntax. - -// TODO [AA] Logging https://github.com/enso-org/ide/blob/main/src/rust/ide/src/model/execution_context/synchronized.rs#L45 +//! Lexers defined using the Flexer are capable of lexing languages of significant complexity, and +//! while they're defined in a simple way (akin to regular grammars), they can work even with +//! context-sensitive languages. +//! +//! The process of defining a lexer involves the user doing the following: +//! +//! 1. Creating a `Lexer` type that wraps the [`Flexer`]. +//! 2. Creating a `State` type, to hold the user-defined lexing state. +//! 3. Implementing [`State`] for the `State` type. +//! 4. Implementing [`Definition`] for the `Lexer`, along with lexing transition rules to lex the +//! language. +//! +//! The result of defining a lexer using the flexer is a hybrid of the code written using this +//! library, and also the code that this library generates to specialize your lexer. +//! +//! # Writing a Lexer +//! +//! As the Flexer is a library for writing lexers, it would be remiss of us not to include a worked +//! example for how to define a lexer. The following example defines a lexer for a small language, +//! and shows you how to integrate the flexer code generation step with your project's build. +//! +//! ## The Language +//! +//! 
We're going to define a lexer for a very simple language, represented by the following
+//! [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) grammar.
+//!
+//! ```plain
+//! a-word      = 'a'+;
+//! b-word      = 'b'+;
+//! word        = a-word | b-word;
+//! space       = ' ';
+//! spaced-word = space, word;
+//! language    = word, spaced-word*;
+//! ```
+//!
+//! ## The Lexer's Output
+//!
+//! Every lexer needs the ability to write a stream of tokens as its output. A flexer-based lexer
+//! can use any type that it wants as its output type, but this language is going to use a very
+//! simple `Token` type, wrapped into a `TokenStream`.
+//!
+//! ```
+//! #[derive(Clone)]
+//! pub enum Token {
+//!     /// A word from the input, consisting of a sequence of all `a` or all `b`.
+//!     Word(String),
+//!     /// A token that the lexer is unable to recognise.
+//!     Unrecognized(String)
+//! }
+//!
+//! #[derive(Clone,Default)]
+//! pub struct TokenStream {
+//!     tokens:Vec<Token>
+//! }
+//!
+//! impl TokenStream {
+//!     pub fn push(&mut self,token:Token) {
+//!         self.tokens.push(token)
+//!     }
+//! }
+//! ```
+//!
+//! These tokens will be inserted into the token stream by our lexer as it recognises valid
+//! portions of our language.
+//!
+//! Whatever you choose as the `Output` type of your lexer, it will need to implement both
+//! [`std::clone::Clone`] and [`std::default::Default`].
+//!
+//! ## The Lexer's State
+//!
+//! Every Flexer-based lexer operates over a state that holds all of the user-defined state
+//! information required to define the particular lexer. This state type must conform to the
+//! [`State`] trait, which defines important functionality that it must provide to the flexer.
+//!
+//! In our language, we only want to be able to match words with a preceding space character once
+//! we've seen an initial word that doesn't have one. To this end, we need a state in our lexer to
+//! record that we've 'seen' the first word. As required by the [`State`] trait, we also need to
+//! provide the flexer with an initial state, the state registry, and the bookmarks we use.
+//!
+//! ```
+//! use flexer::group;
+//! use flexer::prelude::reader::BookmarkManager;
+//! use flexer::State;
+//! #
+//! #
+//! # // === Token ===
+//! #
+//! # #[derive(Clone)]
+//! # pub enum Token {
+//! #     /// A word from the input, consisting of a sequence of all `a` or all `b`.
+//! #     Word(String),
+//! #     /// A token that the lexer is unable to recognise.
+//! #     Unrecognized(String)
+//! # }
+//! #
+//! # #[derive(Clone,Default)]
+//! # pub struct TokenStream {
+//! #     tokens:Vec<Token>
+//! # }
+//! #
+//! # impl TokenStream {
+//! #     pub fn push(&mut self,token:Token) {
+//! #         self.tokens.push(token)
+//! #     }
+//! # }
+//!
+//!
+//! // === LexerState ===
+//!
+//! #[derive(Debug)]
+//! pub struct LexerState {
+//!     /// The registry for groups in the lexer.
+//!     lexer_states:group::Registry,
+//!     /// The initial state of the lexer.
+//!     initial_state:group::Identifier,
+//!     /// The state entered when the first word has been seen.
+//!     seen_first_word_state:group::Identifier,
+//!     /// The bookmarks for this lexer.
+//!     bookmarks:BookmarkManager
+//! }
+//! ```
+//!
+//! The flexer library provides useful functionality to help with defining your lexer state, such
+//! as [`group::Registry`] for containing the various states through which your lexer may
+//! transition, and [`prelude::reader::BookmarkManager`] for storing bookmarks.
+//!
+//! > ### Bookmarks
+//!
> In order to enable arbitrary lookahead, the flexer provides a system for "bookmarking" a point +//! > in the input stream so that the lexer may return to it later. In fact, this mechanism is used +//! > _by default_ in the implementation to deal with overlapping rules, and so the +//! > [`prelude::reader::BookmarkManager`] provides some bookmarks for you by default. +//! > +//! > As a user, however, you can define additional bookmarks as part of your state, and mark or +//! > return to them as part of your lexer's transition functions (more on this below). +//! +//! Now that we have our state type, we need to define an implementation of [`State`] for it. This +//! is a mostly trivial exercise, but two functions ([`State::new()`] and [`State::specialize`]) +//! require special attention. We'll look at both below. +//! +//! ``` +//! use flexer::generate; +//! # use flexer::group; +//! use flexer::generate::GenError; +//! # use flexer::prelude::reader::BookmarkManager; +//! # use flexer::State; +//! # +//! # +//! # // === Token === +//! # +//! # #[derive(Clone)] +//! # pub enum Token { +//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. +//! # Word(String), +//! # /// A token that the lexer is unable to recognise. +//! # Unrecognized(String) +//! # } +//! # +//! # #[derive(Clone,Default)] +//! # pub struct TokenStream { +//! # tokens:Vec +//! # } +//! # +//! # impl TokenStream { +//! # pub fn push(&mut self,token:Token) { +//! # self.tokens.push(token) +//! # } +//! # } +//! # +//! # +//! # // === LexerState === +//! # +//! # #[derive(Debug)] +//! # pub struct LexerState { +//! # /// The registry for groups in the lexer. +//! # lexer_states:group::Registry, +//! # /// The initial state of the lexer. +//! # initial_state:group::Identifier, +//! # /// The state entered when the first word has been seen. +//! # seen_first_word_state:group::Identifier, +//! # /// The bookmarks for this lexer. +//! # bookmarks:BookmarkManager +//! # } +//! +//! impl flexer::State for LexerState { +//! fn new() -> Self { +//! // Here we construct all of the elements needed for our lexer state. This function can +//! // contain arbitrarily complex logic and is only called once at initialization time. +//! let mut lexer_states = group::Registry::default(); +//! let initial_state = lexer_states.define_group("ROOT",None); +//! let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); +//! let bookmarks = BookmarkManager::new(); +//! Self{lexer_states,initial_state,seen_first_word_state,bookmarks} +//! } +//! +//! fn initial_state(&self) -> group::Identifier { +//! self.initial_state +//! } +//! +//! fn groups(&self) -> &group::Registry { +//! &self.lexer_states +//! } +//! +//! fn groups_mut(&mut self) -> &mut group::Registry { +//! &mut self.lexer_states +//! } +//! +//! fn bookmarks(&self) -> &BookmarkManager { +//! &self.bookmarks +//! } +//! +//! fn bookmarks_mut(&mut self) -> &mut BookmarkManager { +//! &mut self.bookmarks +//! } +//! +//! fn specialize(&self) -> Result { +//! // It is very important to pass both the type name of your lexer and your output +//! // correctly here. This function should always be implemented as a call to the +//! // below-used function. +//! generate::specialize(self,"TestLexer","Token") +//! } +//! } +//! ``` +//! +//! ## Defining the Lexer Type +//! +//! With our state type defined, we now have the prerequisites for defining the lexer itself! +//! +//! 
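Before moving on to the lexer type itself, here is a brief sketch of how the bookmarks mentioned above can be used from a transition function. Only the `bookmark`/`rewind` calls and the built-in `matched_bookmark` are APIs that appear in this patch; the rule function itself is hypothetical.

```
pub fn on_some_rule<R:LazyReader>(&mut self, reader:&mut R) {
    // Remember the current position in the input...
    let bookmark = self.bookmarks.matched_bookmark;
    self.bookmarks.bookmark(bookmark,reader);
    // ...do some speculative work with the reader...
    // ...and then rewind the reader to the remembered position.
    self.bookmarks.rewind(bookmark,reader);
}
```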
The notion behind the way we define lexers in the flexer is to use a chain of +//! [`std::ops::Deref`] implementations to make the disparate parts feel like a cohesive whole. +//! The [`Flexer`] itself already implements deref to your state type, so all that remains is to do +//! the following: +//! +//! 1. Define your lexer struct itself, containing an instance of the [`Flexer`], parametrised by +//! your state and output types. +//! +//! ``` +//! use flexer::Flexer; +//! # use flexer::generate; +//! # use flexer::group; +//! # use flexer::prelude::GenError; +//! use flexer::prelude::logger::Disabled; +//! # use flexer::prelude::reader::BookmarkManager; +//! # use flexer::State; +//! +//! type Logger = Disabled; +//! # +//! # +//! # // === Token === +//! # +//! # #[derive(Clone)] +//! # pub enum Token { +//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. +//! # Word(String), +//! # /// A token that the lexer is unable to recognise. +//! # Unrecognized(String) +//! # } +//! # +//! # #[derive(Clone,Default)] +//! # pub struct TokenStream { +//! # tokens:Vec +//! # } +//! # +//! # impl TokenStream { +//! # pub fn push(&mut self,token:Token) { +//! # self.tokens.push(token) +//! # } +//! # } +//! # +//! # +//! # // === LexerState === +//! # +//! # #[derive(Debug)] +//! # pub struct LexerState { +//! # /// The registry for groups in the lexer. +//! # lexer_states:group::Registry, +//! # /// The initial state of the lexer. +//! # initial_state:group::Identifier, +//! # /// The state entered when the first word has been seen. +//! # seen_first_word_state:group::Identifier, +//! # /// The bookmarks for this lexer. +//! # bookmarks:BookmarkManager +//! # } +//! # +//! # impl flexer::State for LexerState { +//! # fn new() -> Self { +//! # // Here we construct all of the elements needed for our lexer state. This function can +//! # // contain arbitrarily complex logic and is only called once at initialization time. +//! # let mut lexer_states = group::Registry::default(); +//! # let initial_state = lexer_states.define_group("ROOT",None); +//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); +//! # let bookmarks = BookmarkManager::new(); +//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} +//! # } +//! # +//! # fn initial_state(&self) -> group::Identifier { +//! # self.initial_state +//! # } +//! # +//! # fn groups(&self) -> &group::Registry { +//! # &self.lexer_states +//! # } +//! # +//! # fn groups_mut(&mut self) -> &mut group::Registry { +//! # &mut self.lexer_states +//! # } +//! # +//! # fn bookmarks(&self) -> &BookmarkManager { +//! # &self.bookmarks +//! # } +//! # +//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { +//! # &mut self.bookmarks +//! # } +//! # +//! # fn specialize(&self) -> Result { +//! # // It is very important to pass both the type name of your lexer and your output +//! # // correctly here. This function should always be implemented as a call to the +//! # // below-used function. +//! # generate::specialize(self,"TestLexer","Token") +//! # } +//! # } +//! +//! +//! // === Lexer === +//! +//! pub struct Lexer { +//! lexer:Flexer +//! } +//! ``` +//! +//! You'll note that the `Flexer` also takes a logging implementation from the Enso logging library +//! as a type parameter. This lets the client of the library configure the behaviour of logging in +//! their lexer. We recommend aliasing the current logger type (as shown above) for ease of use. +//! +//! 2. 
Implement a `new()` function for your lexer. +//! +//! ``` +//! # use flexer::Flexer; +//! # use flexer::generate; +//! # use flexer::group; +//! use flexer::prelude::AnyLogger; +//! # use flexer::prelude::GenError; +//! # use flexer::prelude::logger::Disabled; +//! # use flexer::prelude::reader::BookmarkManager; +//! # use flexer::State; +//! # +//! # type Logger = Disabled; +//! # +//! # +//! # // === Token === +//! # +//! # #[derive(Clone)] +//! # pub enum Token { +//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. +//! # Word(String), +//! # /// A token that the lexer is unable to recognise. +//! # Unrecognized(String) +//! # } +//! # +//! # #[derive(Clone,Default)] +//! # pub struct TokenStream { +//! # tokens:Vec +//! # } +//! # +//! # impl TokenStream { +//! # pub fn push(&mut self,token:Token) { +//! # self.tokens.push(token) +//! # } +//! # } +//! # +//! # +//! # // === LexerState === +//! # +//! # #[derive(Debug)] +//! # pub struct LexerState { +//! # /// The registry for groups in the lexer. +//! # lexer_states:group::Registry, +//! # /// The initial state of the lexer. +//! # initial_state:group::Identifier, +//! # /// The state entered when the first word has been seen. +//! # seen_first_word_state:group::Identifier, +//! # /// The bookmarks for this lexer. +//! # bookmarks:BookmarkManager +//! # } +//! # +//! # impl flexer::State for LexerState { +//! # fn new() -> Self { +//! # // Here we construct all of the elements needed for our lexer state. This function can +//! # // contain arbitrarily complex logic and is only called once at initialization time. +//! # let mut lexer_states = group::Registry::default(); +//! # let initial_state = lexer_states.define_group("ROOT",None); +//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); +//! # let bookmarks = BookmarkManager::new(); +//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} +//! # } +//! # +//! # fn initial_state(&self) -> group::Identifier { +//! # self.initial_state +//! # } +//! # +//! # fn groups(&self) -> &group::Registry { +//! # &self.lexer_states +//! # } +//! # +//! # fn groups_mut(&mut self) -> &mut group::Registry { +//! # &mut self.lexer_states +//! # } +//! # +//! # fn bookmarks(&self) -> &BookmarkManager { +//! # &self.bookmarks +//! # } +//! # +//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { +//! # &mut self.bookmarks +//! # } +//! # +//! # fn specialize(&self) -> Result { +//! # // It is very important to pass both the type name of your lexer and your output +//! # // correctly here. This function should always be implemented as a call to the +//! # // below-used function. +//! # generate::specialize(self,"TestLexer","Token") +//! # } +//! # } +//! # +//! # +//! # // === Lexer === +//! # +//! # pub struct Lexer { +//! # lexer:Flexer +//! # } +//! +//! impl Lexer { +//! pub fn new() -> Self { +//! let lexer = Flexer::new(Logger::new("Lexer")); +//! Lexer{lexer} +//! } +//! } +//! ``` +//! +//! 3. Define [`std::ops::Deref`] and [`std::ops::DerefMut`] for your lexer. +//! +//! ``` +//! # use flexer::Flexer; +//! # use flexer::generate; +//! # use flexer::group; +//! # use flexer::prelude::AnyLogger; +//! # use flexer::prelude::GenError; +//! # use flexer::prelude::logger::Disabled; +//! # use flexer::prelude::reader::BookmarkManager; +//! # use flexer::State; +//! use std::ops::Deref; +//! use std::ops::DerefMut; +//! # +//! # type Logger = Disabled; +//! # +//! # +//! # // === Token === +//! # +//! 
# #[derive(Clone)] +//! # pub enum Token { +//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. +//! # Word(String), +//! # /// A token that the lexer is unable to recognise. +//! # Unrecognized(String) +//! # } +//! # +//! # #[derive(Clone,Default)] +//! # pub struct TokenStream { +//! # tokens:Vec +//! # } +//! # +//! # impl TokenStream { +//! # pub fn push(&mut self,token:Token) { +//! # self.tokens.push(token) +//! # } +//! # } +//! # +//! # +//! # // === LexerState === +//! # +//! # #[derive(Debug)] +//! # pub struct LexerState { +//! # /// The registry for groups in the lexer. +//! # lexer_states:group::Registry, +//! # /// The initial state of the lexer. +//! # initial_state:group::Identifier, +//! # /// The state entered when the first word has been seen. +//! # seen_first_word_state:group::Identifier, +//! # /// The bookmarks for this lexer. +//! # bookmarks:BookmarkManager +//! # } +//! # +//! # impl flexer::State for LexerState { +//! # fn new() -> Self { +//! # // Here we construct all of the elements needed for our lexer state. This function can +//! # // contain arbitrarily complex logic and is only called once at initialization time. +//! # let mut lexer_states = group::Registry::default(); +//! # let initial_state = lexer_states.define_group("ROOT",None); +//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); +//! # let bookmarks = BookmarkManager::new(); +//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} +//! # } +//! # +//! # fn initial_state(&self) -> group::Identifier { +//! # self.initial_state +//! # } +//! # +//! # fn groups(&self) -> &group::Registry { +//! # &self.lexer_states +//! # } +//! # +//! # fn groups_mut(&mut self) -> &mut group::Registry { +//! # &mut self.lexer_states +//! # } +//! # +//! # fn bookmarks(&self) -> &BookmarkManager { +//! # &self.bookmarks +//! # } +//! # +//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { +//! # &mut self.bookmarks +//! # } +//! # +//! # fn specialize(&self) -> Result { +//! # // It is very important to pass both the type name of your lexer and your output +//! # // correctly here. This function should always be implemented as a call to the +//! # // below-used function. +//! # generate::specialize(self,"TestLexer","Token") +//! # } +//! # } +//! # +//! # +//! # // === Lexer === +//! # +//! # pub struct Lexer { +//! # lexer:Flexer +//! # } +//! # +//! # impl Lexer { +//! # pub fn new() -> Self { +//! # let lexer = Flexer::new(Logger::new("Lexer")); +//! # Lexer{lexer} +//! # } +//! # } +//! +//! impl Deref for Lexer { +//! type Target = Flexer ; +//! fn deref(&self) -> &Self::Target { +//! &self.lexer +//! } +//! } +//! impl DerefMut for Lexer { +//! fn deref_mut(&mut self) -> &mut Self::Target { +//! &mut self.lexer +//! } +//! } +//! ``` +//! +//! You'll note that here we've instantiated the flexer with a `Logger`. This is used for providing +//! debug information during development, and can be accessed from all scopes of your lexer. In +//! release mode, however, logging calls at the "trace", "debug", and "info" levels are optimised +//! away. +//! +//! ## Defining the Lexing Rules +//! +//! Flexer-based lexers operate by matching on a series of [`automata::pattern::Pattern`]s that +//! describe the language that it is trying to lex. It combines these patterns with "transition +//! functions" that may execute arbitrary code when a pattern matches on the lexer's input. +//! +//! 
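The patterns themselves compose through ordinary operators. A small sketch of the combinators used in the rules below (all of these appear in this patch; the reference-taking operator forms are likely the ones produced by the `gen_ref_versions!` macro earlier in the diff):

```
use flexer::automata::pattern::Pattern;

let a_word        = Pattern::char('a').many1(); // One or more 'a' characters.
let space         = Pattern::char(' ');
let spaced_a_word = &space >> &a_word;          // Sequencing: ' ' followed by a-word.
let any           = Pattern::any();             // Any single symbol.
let end           = Pattern::eof();             // The end of the input.
```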
In order to define the lexing rules, we need to implement [`Definition`] for our lexer, +//! particularly the [`Definition::define()`] function. +//! +//! ``` +//! use flexer::automata::pattern::Pattern; +//! # use flexer::Flexer; +//! # use flexer::generate; +//! use flexer::group::Registry; +//! # use flexer::group; +//! # use flexer::prelude::AnyLogger; +//! # use flexer::prelude::GenError; +//! # use flexer::prelude::logger::Disabled; +//! # use flexer::prelude::reader::BookmarkManager; +//! # use flexer::State; +//! use flexer; +//! # use std::ops::Deref; +//! # use std::ops::DerefMut; +//! # +//! # type Logger = Disabled; +//! # +//! # +//! # // === Token === +//! # +//! # #[derive(Clone)] +//! # pub enum Token { +//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. +//! # Word(String), +//! # /// A token that the lexer is unable to recognise. +//! # Unrecognized(String) +//! # } +//! # +//! # #[derive(Clone,Default)] +//! # pub struct TokenStream { +//! # tokens:Vec +//! # } +//! # +//! # impl TokenStream { +//! # pub fn push(&mut self,token:Token) { +//! # self.tokens.push(token) +//! # } +//! # } +//! # +//! # +//! # // === LexerState === +//! # +//! # #[derive(Debug)] +//! # pub struct LexerState { +//! # /// The registry for groups in the lexer. +//! # lexer_states:group::Registry, +//! # /// The initial state of the lexer. +//! # initial_state:group::Identifier, +//! # /// The state entered when the first word has been seen. +//! # seen_first_word_state:group::Identifier, +//! # /// The bookmarks for this lexer. +//! # bookmarks:BookmarkManager +//! # } +//! # +//! # impl flexer::State for LexerState { +//! # fn new() -> Self { +//! # // Here we construct all of the elements needed for our lexer state. This function can +//! # // contain arbitrarily complex logic and is only called once at initialization time. +//! # let mut lexer_states = group::Registry::default(); +//! # let initial_state = lexer_states.define_group("ROOT",None); +//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); +//! # let bookmarks = BookmarkManager::new(); +//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} +//! # } +//! # +//! # fn initial_state(&self) -> group::Identifier { +//! # self.initial_state +//! # } +//! # +//! # fn groups(&self) -> &group::Registry { +//! # &self.lexer_states +//! # } +//! # +//! # fn groups_mut(&mut self) -> &mut group::Registry { +//! # &mut self.lexer_states +//! # } +//! # +//! # fn bookmarks(&self) -> &BookmarkManager { +//! # &self.bookmarks +//! # } +//! # +//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { +//! # &mut self.bookmarks +//! # } +//! # +//! # fn specialize(&self) -> Result { +//! # // It is very important to pass both the type name of your lexer and your output +//! # // correctly here. This function should always be implemented as a call to the +//! # // below-used function. +//! # generate::specialize(self,"TestLexer","Token") +//! # } +//! # } +//! # +//! # +//! # // === Lexer === +//! # +//! # pub struct Lexer { +//! # lexer:Flexer +//! # } +//! # +//! # impl Lexer { +//! # pub fn new() -> Self { +//! # let lexer = Flexer::new(Logger::new("Lexer")); +//! # Lexer{lexer} +//! # } +//! # } +//! # +//! # impl Deref for Lexer { +//! # type Target = Flexer ; +//! # fn deref(&self) -> &Self::Target { +//! # &self.lexer +//! # } +//! # } +//! # impl DerefMut for Lexer { +//! # fn deref_mut(&mut self) -> &mut Self::Target { +//! # &mut self.lexer +//! # } +//! # } +//! 
+//! impl flexer::Definition for Lexer {
+//!     fn define() -> Self {
+//!         // First we instantiate our lexer. Definitions take place _directly_ on the lexer, and
+//!         // manipulate runtime state.
+//!         let mut lexer = Self::new();
+//!
+//!         // Then, we define the patterns that we're going to use. For an overview of the
+//!         // pattern APIs, see the [`automata::pattern::Pattern`] module.
+//!         let a_word        = Pattern::char('a').many1();
+//!         let b_word        = Pattern::char('b').many1();
+//!         let space         = Pattern::char(' ');
+//!         let spaced_a_word = &space >> &a_word;
+//!         let spaced_b_word = &space >> &b_word;
+//!         let any           = Pattern::any();
+//!         let end           = Pattern::eof();
+//!
+//!         // Next, we define groups of lexer rules. This uses the groups that we've defined in
+//!         // our lexer's state, and the patterns we've defined above.
+//!         let root_group_id = lexer.initial_state;
+//!         let root_group    = lexer.groups_mut().group_mut(root_group_id);
+//!         root_group.create_rule(&a_word,"self.on_first_word(reader)");
+//!         root_group.create_rule(&b_word,"self.on_first_word(reader)");
+//!         root_group.create_rule(&end,   "self.on_no_err_suffix_first_word(reader)");
+//!         root_group.create_rule(&any,   "self.on_err_suffix_first_word(reader)");
+//!
+//!         let seen_first_word_group_id = lexer.seen_first_word_state;
+//!         let seen_first_word_group    = lexer.groups_mut().group_mut(seen_first_word_group_id);
+//!         seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)");
+//!         seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)");
+//!         seen_first_word_group.create_rule(&end,          "self.on_no_err_suffix(reader)");
+//!         seen_first_word_group.create_rule(&any,          "self.on_err_suffix(reader)");
+//!
+//!         lexer
+//!     }
+//!
+//!     /// This function just returns the lexer's groups.
+//!     fn groups(&self) -> &Registry {
+//!         self.lexer.groups()
+//!     }
+//! }
+//! ```
+//!
+//! > ### Transition Functions
+//! > You may be wondering why the transition functions are specified as strings. This allows us to
+//! > generate highly-efficient, specialized code for your lexer once you define it. More on this
+//! > later.
+//!
+//! A [`group::Group`] in the lexer is like a state that operates on a stack. A transition function
+//! can arbitrarily activate or deactivate a group on the flexer's stack, allowing you to perform
+//! context-sensitive lexing behaviour. For more information (including on how to use parent groups
+//! to inherit rules), see the relevant module.
+//!
+//! For more information on the [`automata::pattern::Pattern`] APIs used above, please see the
+//! relevant module in this crate.
+//!
+//! ## Defining the Transition Functions
+//!
+//! You'll have noticed that, up above, we told the rules to use a bunch of transition functions
+//! that we've not yet talked about. These functions can be defined anywhere you like, as long as
+//! they are in scope in the file in which you are defining your lexer. We do, however, recommend
+//! defining them on your lexer itself, so that they can access and manipulate the lexer's state,
+//! and that's what we're going to do here.
+//!
+//! ```
+//! # use flexer::automata::pattern::Pattern;
+//! # use flexer::Flexer;
+//! # use flexer::generate;
+//! # use flexer::group::Registry;
+//! # use flexer::group;
+//! # use flexer::prelude::AnyLogger;
+//! use flexer::prelude::LazyReader;
+//! # use flexer::prelude::GenError;
+//! # use flexer::prelude::logger::Disabled;
+//! # use flexer::prelude::reader::BookmarkManager;
+//! # use flexer::State;
+//! # use flexer;
+//! # use std::ops::Deref;
+//! # use std::ops::DerefMut;
+//! #
# type Logger = Disabled; +//! # +//! # +//! # // === Token === +//! # +//! # #[derive(Clone)] +//! # pub enum Token { +//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. +//! # Word(String), +//! # /// A token that the lexer is unable to recognise. +//! # Unrecognized(String) +//! # } +//! # +//! # #[derive(Clone,Default)] +//! # pub struct TokenStream { +//! # tokens:Vec +//! # } +//! # +//! # impl TokenStream { +//! # pub fn push(&mut self,token:Token) { +//! # self.tokens.push(token) +//! # } +//! # } +//! # +//! # +//! # // === LexerState === +//! # +//! # #[derive(Debug)] +//! # pub struct LexerState { +//! # /// The registry for groups in the lexer. +//! # lexer_states:group::Registry, +//! # /// The initial state of the lexer. +//! # initial_state:group::Identifier, +//! # /// The state entered when the first word has been seen. +//! # seen_first_word_state:group::Identifier, +//! # /// The bookmarks for this lexer. +//! # bookmarks:BookmarkManager +//! # } +//! # +//! # impl flexer::State for LexerState { +//! # fn new() -> Self { +//! # // Here we construct all of the elements needed for our lexer state. This function can +//! # // contain arbitrarily complex logic and is only called once at initialization time. +//! # let mut lexer_states = group::Registry::default(); +//! # let initial_state = lexer_states.define_group("ROOT",None); +//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); +//! # let bookmarks = BookmarkManager::new(); +//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} +//! # } +//! # +//! # fn initial_state(&self) -> group::Identifier { +//! # self.initial_state +//! # } +//! # +//! # fn groups(&self) -> &group::Registry { +//! # &self.lexer_states +//! # } +//! # +//! # fn groups_mut(&mut self) -> &mut group::Registry { +//! # &mut self.lexer_states +//! # } +//! # +//! # fn bookmarks(&self) -> &BookmarkManager { +//! # &self.bookmarks +//! # } +//! # +//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { +//! # &mut self.bookmarks +//! # } +//! # +//! # fn specialize(&self) -> Result { +//! # // It is very important to pass both the type name of your lexer and your output +//! # // correctly here. This function should always be implemented as a call to the +//! # // below-used function. +//! # generate::specialize(self,"TestLexer","Token") +//! # } +//! # } +//! # +//! # +//! # // === Lexer === +//! # +//! # pub struct Lexer { +//! # lexer:Flexer +//! # } +//! # +//! # impl Lexer { +//! # pub fn new() -> Self { +//! # let lexer = Flexer::new(Logger::new("Lexer")); +//! # Lexer{lexer} +//! # } +//! # } +//! # +//! # impl Deref for Lexer { +//! # type Target = Flexer ; +//! # fn deref(&self) -> &Self::Target { +//! # &self.lexer +//! # } +//! # } +//! # impl DerefMut for Lexer { +//! # fn deref_mut(&mut self) -> &mut Self::Target { +//! # &mut self.lexer +//! # } +//! # } +//! # +//! # impl flexer::Definition for Lexer { +//! # fn define() -> Self { +//! # // First we instantiate our lexer. Definitions take place _directly_ on the lexer, and +//! # // manipulate runtime state. +//! # let mut lexer = Self::new(); +//! # +//! # // Then, we define the patterns that we're going to use. For an overview of the p +//! # let a_word = Pattern::char('a').many1(); +//! # let b_word = Pattern::char('b').many1(); +//! # let space = Pattern::char(' '); +//! # let spaced_a_word = &space >> &a_word; +//! # let spaced_b_word = &space >> &b_word; +//! # let any = Pattern::any(); +//! 
# let end = Pattern::eof(); +//! # +//! # // Next, we define groups of lexer rules. This uses the groups that we've defined in our +//! # // lexer's state, and the patterns we've defined above. +//! # let root_group_id = lexer.initial_state; +//! # let root_group = lexer.groups_mut().group_mut(root_group_id); +//! # root_group.create_rule(&a_word,"self.on_first_word(reader)"); +//! # root_group.create_rule(&b_word,"self.on_first_word(reader)"); +//! # root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)"); +//! # root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)"); +//! # +//! # let seen_first_word_group_id = lexer.seen_first_word_state; +//! # let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id); +//! # seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)"); +//! # seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)"); +//! # seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)"); +//! # seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)"); +//! # +//! # lexer +//! # } +//! # +//! # /// This function just returns the lexer's groups. +//! # fn groups(&self) -> &Registry { +//! # self.lexer.groups() +//! # } +//! # } +//! +//! impl Lexer { +//! pub fn on_first_word(&mut self, _reader:&mut R) { +//! let str = self.current_match.clone(); +//! let ast = Token::Word(str); +//! self.output.push(ast); +//! let id = self.seen_first_word_state; +//! self.push_state(id); +//! } +//! +//! pub fn on_spaced_word(&mut self, _reader:&mut R) { +//! let str = self.current_match.clone(); +//! let ast = Token::Word(String::from(str.trim())); +//! self.output.push(ast); +//! } +//! +//! pub fn on_err_suffix_first_word(&mut self, _reader:&mut R) { +//! let ast = Token::Unrecognized(self.current_match.clone()); +//! self.output.push(ast); +//! } +//! +//! pub fn on_err_suffix(&mut self, reader:&mut R) { +//! self.on_err_suffix_first_word(reader); +//! self.pop_state(); +//! } +//! +//! pub fn on_no_err_suffix_first_word(&mut self, _reader:&mut R) {} +//! +//! pub fn on_no_err_suffix(&mut self, reader:&mut R) { +//! self.on_no_err_suffix_first_word(reader); +//! self.pop_state(); +//! } +//! } +//! ``` +//! +//! > ### Magic Transition Functions +//! > The transition functions are the 'secret sauce', so to speak, of the Flexer. They are called +//! > when a rule matches, and allow arbitrary code to manipulate the lexer. This means that the +//! > flexer can be used to define very complex grammars while still keeping a simple interface and +//! > ensuring performant execution. +//! +//! You'll note that all of these functions have a couple of things in common: +//! +//! 1. They have a type parameter `R` that conforms to the [`prelude::LazyReader`] trait. +//! 2. They take an argument of type `R`, that is the reader over which the lexer is running. +//! +//! Both of these, combined, allow the transition functions to manipulate the text being read by the +//! lexer. +//! +//! ## Specializing the Lexer +//! +//! In order to actually _use_ the lexer that you've defined, you need to specialize it to the rules +//! that you define. Unfortunately, `cargo` doesn't have support for post-build hooks, and so this +//! is a little more involved than we'd like it to be. +//! +//! 1. Create a file that performs the definition of the lexer as above. It can use multiple files +//! in its crate as long as they are publicly exposed. +//! 2. 
Create a separate cargo project that has a prebuild hook in its `build.rs`. +//! 3. In that build.rs, you need to: +//! 1. Import the lexer definition and instantiate it using `::define()`. +//! 2. Call [`State::specialize()`] on the resultant lexer. This will generate a string that +//! contains the optimised lexer implementation. +//! 3. Write both the generated code and the code from the original lexer definition into an +//! output file. +//! 4. Re-export this output file from your cargo project's `lib.rs`. +//! +//! The process of specialization will generate quite a bit of code, but most importantly it will +//! generate `pub fn run(&mut self, mut reader:R) -> Result`, where `Output` +//! is your lexer's token type. All of these functions are defined on your lexer type (the one whose +//! name is provided to `specialize()`. +//! +//! ## In Summary +//! +//! The flexer allows its clients to define highly optimised lexer implementations that are capable +//! of lexing languages of a high complexity. use crate::prelude::*; -use lazy_reader::LazyReader; +use crate::generate::GenError; +use prelude::logger::AnyLogger; +use prelude::reader::BookmarkManager; pub mod automata; pub mod data; +pub mod generate; pub mod group; -#[allow(missing_docs)] +/// Useful libraries for working with the flexer. pub mod prelude { + pub use crate::generate::GenError; pub use enso_prelude::*; + pub use lazy_reader::LazyReader; + pub use lazy_reader::Reader; + pub use logger::AnyLogger; + + /// The lazy reader library. + pub mod reader { + pub use lazy_reader::*; + } + + /// The Enso logging library. + pub mod logger { + pub use enso_logger::*; + pub use enso_logger::disabled::Logger as Disabled; + pub use enso_logger::enabled::Logger as Enabled; + } } @@ -39,9 +1041,6 @@ mod constants { /// The number of 'frames' to reserve in the state stack, aiming to avoid re-allocation in hot /// code paths. pub const STATE_STACK_RESERVATION:usize = 1024; - /// The size of the output buffer (in tokens) to reserve, aiming to avoid re-allocation of the - /// output buffer for common usage cases. - pub const OUTPUT_BUFFER_RESERVATION:usize = 1024; } @@ -62,44 +1061,46 @@ mod constants { /// state. The user may cause the lexer to transition between states by pushing and popping states /// on the stack, thus allowing a much more flexible lexing engine than pure regular grammars. #[derive(Clone,Debug)] -pub struct Flexer { +pub struct Flexer { /// The stack of states that are active during lexer execution. - pub state_stack: NonEmptyVec, - /// A reader for the input. - pub reader: Reader, + pub state_stack:NonEmptyVec, /// The result of the current stage of the DFA. - pub status: StageStatus, + pub status:StageStatus, /// The tokens that have been lexed. - pub output: Vec, + pub output:Output, /// The text of the current match of the lexer. - pub current_match: String, + pub current_match:String, + /// A logger for the flexer, accessible in user definitions. + pub logger:Logger, /// The definition of the user-provided state for the lexer. - definition: Definition, + definition:Definition, } -impl Flexer -where Definition:State, Reader:LazyReader { +impl Flexer +where Definition : State, + Logger : AnyLogger, + Output : Default { /// Create a new lexer instance. 
-impl<Definition,Output,Reader> Flexer<Definition,Output,Reader>
-where Definition:State, Reader:LazyReader {
+impl<Definition,Output,Logger> Flexer<Definition,Output,Logger>
+where Definition : State,
+      Logger     : AnyLogger,
+      Output     : Default {
     /// Create a new lexer instance.
-    pub fn new(mut reader:Reader) -> Flexer<Definition,Output,Reader> {
+    pub fn new(parent_logger:impl AnyLogger) -> Flexer<Definition,Output,Logger> {
+        let logger           = <Logger>::sub(&parent_logger,"Flexer");
         let status           = default();
-        let mut output       = Vec::default();
-        let definition       = Definition::new(&mut reader);
+        let output           = default();
+        let definition       = Definition::new();
         let initial_state_id = definition.initial_state();
         let mut state_stack  = NonEmptyVec::singleton(initial_state_id);
         let current_match    = default();
 
         state_stack.reserve(constants::STATE_STACK_RESERVATION);
-        output.reserve(constants::OUTPUT_BUFFER_RESERVATION);
-        state_stack.push(initial_state_id);
-        Flexer {state_stack,reader,status,output,definition,current_match}
+        Flexer{state_stack,status,output,definition,current_match,logger}
     }
 }
 
-/// This block is things that are part of the lexer's interface and functionality.
-impl<Definition,Output,Reader> Flexer<Definition,Output,Reader>
-where Definition:State, Output:Clone {
+impl<Definition,Output,Logger> Flexer<Definition,Output,Logger>
+where Definition : State,
+      Output     : Clone,
+      Logger     : AnyLogger {
     /// Get the lexer result.
-    pub fn result(&mut self) -> &Vec<Output> {
+    pub fn result(&mut self) -> &Output {
         &self.output
     }
 
@@ -115,6 +1116,7 @@ where Definition:State, Output:Clone {
     /// Tell the lexer to enter the state described by `state`.
     pub fn push_state(&mut self, state:group::Identifier) {
+        self.logger.info(||format!("Pushing state {:?}",state));
         self.state_stack.push(state);
     }
 
@@ -122,7 +1124,9 @@
     ///
     /// It will never end the initial state of the lexer.
     pub fn pop_state(&mut self) -> Option<group::Identifier> {
-        self.state_stack.pop()
+        let result = self.state_stack.pop();
+        self.logger.info(||format!("Popped state {:?}",result));
+        result
     }
 
     /// End states until the specified `state` is reached, leaving the lexer in `state`.
@@ -132,8 +1136,10 @@
     pub fn pop_states_until(&mut self, state:group::Identifier) -> Vec<group::Identifier> {
         let non_opt_root_state_position =
             self.state_stack.iter().positions(|elem| *elem == state).last().unwrap_or(0);
-        let range = (non_opt_root_state_position + 1)..self.state_stack.len();
-        self.state_stack.drain(range).collect()
+        let range  = (non_opt_root_state_position + 1)..self.state_stack.len();
+        let states = self.state_stack.drain(range).collect();
+        self.logger.info(||format!("Popped states {:?}",states));
+        states
     }
 
     /// Check if the lexer is currently in the state described by `state`.
@@ -144,14 +1150,14 @@
 
 // === Trait Impls ===
 
-impl<Definition,Output,Reader> Deref for Flexer<Definition,Output,Reader> {
+impl<Definition,Output,Logger> Deref for Flexer<Definition,Output,Logger> {
     type Target = Definition;
     fn deref(&self) -> &Self::Target {
         &self.definition
     }
 }
 
-impl<Definition,Output,Reader> DerefMut for Flexer<Definition,Output,Reader> {
+impl<Definition,Output,Logger> DerefMut for Flexer<Definition,Output,Logger> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         &mut self.definition
     }
 }
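To make the state-stack discipline above concrete, here is a sketch of a pair of user-written transition functions in the style of the documentation example at the top of this file. `Lexer` and its `block_state` field are assumed, illustrative names from a hypothetical lexer definition, not part of the flexer API.

```rust
impl Lexer {
    /// Enter the block state when a rule marking the start of a block matches.
    pub fn on_block_start<R:LazyReader>(&mut self, _reader:&mut R) {
        let id = self.block_state;
        self.push_state(id);
    }

    /// Leave the block state when the block ends. `pop_state` refuses to pop
    /// the initial state, so this is safe even on unbalanced input.
    pub fn on_block_end<R:LazyReader>(&mut self, _reader:&mut R) {
        self.pop_state();
    }
}
```
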
@@ -159,6 +1165,44 @@
 
 
 
+// ==================
+// === SubStateId ===
+// ==================
+
+/// An identifier for a sub-state of the lexer to transition to.
+#[derive(Copy,Clone,Debug,Default,PartialEq)]
+pub struct SubStateId(usize);
+
+impl SubStateId {
+    /// Create a new `SubStateId` with the specified value.
+    pub fn new(val:usize) -> SubStateId {
+        SubStateId(val)
+    }
+}
+
+
+// === Trait Impls ===
+
+impl From<usize> for SubStateId {
+    fn from(val:usize) -> Self {
+        SubStateId::new(val)
+    }
+}
+
+impl From<&usize> for SubStateId {
+    fn from(val:&usize) -> Self {
+        SubStateId::new(*val)
+    }
+}
+
+impl Into<usize> for SubStateId {
+    fn into(self) -> usize {
+        self.0
+    }
+}
+
+
+
 // ===================
 // === StageStatus ===
 // ===================
@@ -175,7 +1219,7 @@
     /// A single step of the DFA has executed successfully.
     ExitFinished,
     /// The lexer should continue, transitioning to the included state.
-    ContinueWith(group::Identifier)
+    ContinueWith(SubStateId)
 }
 
 impl StageStatus {
@@ -185,9 +1229,9 @@
     }
 
     /// Obtain the state to which the lexer should transition, iff the lexer should continue.
-    pub fn continue_as(&self) -> Option<group::Identifier> {
+    pub fn continue_as(&self) -> Option<SubStateId> {
         match self {
-            StageStatus::Initial           => Some(group::Identifier::new(0)),
+            StageStatus::Initial           => Some(SubStateId::new(0)),
             StageStatus::ContinueWith(val) => Some(*val),
             _                              => None
         }
     }
@@ -210,33 +1254,33 @@ impl Default for StageStatus {
 // ==============
 
 /// The result of executing the lexer on a given input.
-#[allow(missing_docs)]
 #[derive(Clone,Debug)]
-pub struct Result<T> {
-    pub kind:   ResultKind,
-    pub tokens: Vec<T>
+pub struct LexingResult<T> {
+    /// The kind of the result, representing _how_ the lexer completed.
+    pub kind:ResultKind,
+    /// The tokens that the lexer was able to process.
+    pub tokens:T
 }
 
-impl<T> Result<T> {
-
+impl<T> LexingResult<T> {
     /// Create a new lexer result using the provided `kind` and `tokens`.
-    pub fn new(kind:ResultKind,tokens:Vec<T>) -> Result<T> {
-        Result {kind,tokens}
+    pub fn new(kind:ResultKind,tokens:T) -> LexingResult<T> {
+        LexingResult {kind,tokens}
     }
 
     /// Create a new success result, with the provided `tokens`.
-    pub fn success(tokens:Vec<T>) -> Result<T> {
-        Result::new(ResultKind::Success,tokens)
+    pub fn success(tokens:T) -> LexingResult<T> {
+        LexingResult::new(ResultKind::Success, tokens)
     }
 
     /// Create a new partial lex result, with the provided `tokens`.
-    pub fn partial(tokens:Vec<T>) -> Result<T> {
-        Result::new(ResultKind::Partial,tokens)
+    pub fn partial(tokens:T) -> LexingResult<T> {
+        LexingResult::new(ResultKind::Partial, tokens)
    }
 
     /// Create a failure result, with the `tokens` it _did_ manage to consume.
-    pub fn failure(tokens:Vec<T>) -> Result<T> {
-        Result::new(ResultKind::Failure,tokens)
+    pub fn failure(tokens:T) -> LexingResult<T> {
+        LexingResult::new(ResultKind::Failure, tokens)
     }
 }
 
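Callers of a specialized lexer receive one of these values from the generated `run` function and typically branch on its `kind`. A small sketch, assuming a specialized lexer type `Lexer` whose output type is `Vec<Token>` (both names illustrative, not part of the flexer API):

```rust
// `Lexer`, `Token`, and the generated `run` are assumed to come from a
// specialized, user-defined lexer as described in the module docs above.
fn lex_and_report<R:LazyReader>(lexer:&mut Lexer, reader:R) {
    let result = lexer.run(reader);
    match result.kind {
        ResultKind::Success => println!("Lexed {} tokens.",result.tokens.len()),
        ResultKind::Partial => println!("Only a prefix of the input was lexed."),
        ResultKind::Failure => println!("Lexing failed part-way through the input."),
    }
}
```
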
@@ -257,33 +1301,42 @@ pub enum ResultKind {
 // === State ===
 // =============
 
-/// Contains the state needed by any given lexer implementation.
+/// Contains the state needed by the flexer from a lexer implementation.
+///
+/// The types for which this trait is implemented will normally also contain the user-defined state
+/// for that lexer.
 pub trait State {
     /// Create a new instance of the lexer's state.
-    fn new<Reader:LazyReader>(reader:&mut Reader) -> Self;
+    ///
+    /// This function is guaranteed to be called at most once per run of the lexer.
+    fn new() -> Self;
     /// Return the _initial_ lexing state.
     fn initial_state(&self) -> group::Identifier;
     /// Return a reference to the group registry for a given lexer.
     fn groups(&self) -> &group::Registry;
     /// Return a mutable reference to the group registry for a given lexer.
     fn groups_mut(&mut self) -> &mut group::Registry;
+    /// Get an immutable reference to the bookmark manager for this state.
+    fn bookmarks(&self) -> &BookmarkManager;
+    /// Get a mutable reference to the bookmark manager for this state.
+    fn bookmarks_mut(&mut self) -> &mut BookmarkManager;
+    /// Generate code to specialize the flexer for the user's particular lexer definition.
+    ///
+    /// This function should be implemented as a call to [`generate::specialize`], passing
+    /// the name of your lexer, and the name of your lexer's output type as a string.
+    fn specialize(&self) -> Result<String,GenError>;
 }
 
 
-// ===============
-// === Flexer ====
-// ===============
+// ==================
+// === Definition ===
+// ==================
 
-// TODO [AA] Remove this once code generation is ready.
-#[allow(missing_docs)]
-pub trait FlexerTemp {
-    /// Creates a new lexer.
-    fn new() -> Self;
-
-    /// Returns a code for a highly-optimised lexer implemented on top of a finite-state-automaton.
-    fn generate_specialized_code(&mut self) -> String {
-        String::from("#[derive(Debug)]\npub struct Lexer {}")
-    }
+/// Allows for the definition of flexer-based lexers.
+pub trait Definition {
+    /// Define the custom lexer.
+    fn define() -> Self;
+    /// Obtain the registry of groups for the lexer.
+    fn groups(&self) -> &group::Registry;
 }
-
diff --git a/lib/rust/flexer/tests/flexer_generated_api_test.rs b/lib/rust/flexer/tests/flexer_generated_api_test.rs
deleted file mode 100644
index 79fd0fe6c0d..00000000000
--- a/lib/rust/flexer/tests/flexer_generated_api_test.rs
+++ /dev/null
@@ -1,522 +0,0 @@
-//! This file contains tests for the intended generated code using the flexer, based on the
-//! following small language.
-//!
-//! The language here is being defined as follows:
-//!
-//!  a-word      = 'a'+;
-//!  b-word      = 'b'+;
-//!  word        = a-word | b-word;
-//!  space       = ' ';
-//!  spaced-word = space, word;
-//!  language    = word, spaced-word*;
-//!
-//! Please note that there is a fair amount of duplicated code between this test and the
-//! `flexer_lexer_definition_test` file. This is to present the full view of what each portion of
-//! the process looks like.
-
-use flexer::*;
-use flexer::prelude::*;
-use lazy_reader::decoder::DecoderUTF8;
-use lazy_reader::{BookmarkId,LazyReader,Reader};
-use flexer::group;
-
-
-// ===========
-// === AST ===
-// ===========
-
-/// A very simple AST, sufficient for the simple language being defined.
-#[derive(Clone,Debug,PartialEq)]
-pub enum AST {
-    /// A word from the input, consisting of a sequence of all `a` or all `b`.
-    Word(String),
-    /// A token that the lexer is unable to recognise.
-    Unrecognised(String)
-}
-
-
-
-// ==================
-// === Test Lexer ===
-// ==================
-
-/// The definition of a test lexer for the above-described language.
-#[derive(Debug)]
-pub struct TestLexer {
-    lexer: Flexer
-}
-
-impl TestLexer {
-    /// Creates a new instance of this lexer.
-    pub fn new(reader:Reader) -> Self {
-        let lexer = Flexer::new(reader);
-        TestLexer{lexer}
-    }
-}
-
-/// Implementations of functionality used by the lexer.
-///
-/// These functions are provided by the user, by hand.
-#[allow(missing_docs)] -impl TestLexer { - pub fn on_first_word(&mut self) { - let str = self.current_match.clone(); - let ast = AST::Word(str); - self.output.push(ast); - let id = self.seen_first_word_state; - self.push_state(id); - } - - pub fn on_spaced_word(&mut self) { - let str = self.current_match.clone(); - let ast = AST::Word(String::from(str.trim())); - self.output.push(ast); - } - - pub fn on_err_suffix_first_word(&mut self) { - let ast = AST::Unrecognised(self.current_match.clone()); - self.output.push(ast); - } - - pub fn on_err_suffix(&mut self) { - self.on_err_suffix_first_word(); - self.pop_state(); - } - - pub fn on_no_err_suffix_first_word(&mut self) {} - - pub fn on_no_err_suffix(&mut self) { - self.on_no_err_suffix_first_word(); - self.pop_state(); - } -} - -impl Deref for TestLexer { - type Target = Flexer; - fn deref(&self) -> &Self::Target { - &self.lexer - } -} - -impl DerefMut for TestLexer { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.lexer - } -} - -/// Generated functionality used at runtime by the lexer. -#[allow(missing_docs)] -impl TestLexer { - - /// Executes the lexer on the input provided by the reader, resulting in a - /// series of tokens. - pub fn run(&mut self) -> Result { - self.reader.advance_char(); - - while self.gen_run_current_state() == StageStatus::ExitSuccess {} - - match self.status { - StageStatus::ExitFinished => Result::success(mem::replace(&mut self.output,vec![])), - StageStatus::ExitFail => Result::failure(mem::replace(&mut self.output,vec![])), - _ => Result::partial(mem::replace(&mut self.output,vec![])) - } - } - - /// Executes the lexer in the current state. - fn gen_run_current_state(&mut self) -> StageStatus { - self.status = StageStatus::Initial; - - // Runs until reaching a state that no longer says to continue. - while let Some(next_state) = self.status.continue_as() { - self.status = self.gen_step(next_state); - - if self.reader.finished() { - self.status = StageStatus::ExitFinished - } - - if self.status.should_continue() { - if let Ok(char) = self.reader.character().char { - self.reader.append_result(char); - } - self.reader.advance_char(); - } - } - - self.status - } - - /// The step function for the generated lexer. 
- fn gen_step(&mut self, next_state:group::Identifier) -> StageStatus { - let current_state:usize = self.current_state().into(); - - // This match should be generated - match current_state { - 0 => self.gen_dispatch_in_state_0(next_state), - 1 => self.gen_dispatch_in_state_1(next_state), - _ => unreachable_panic!("Unreachable state reached in lexer.") - } - } - - // === DFA Steps === - - fn gen_dispatch_in_state_0(&mut self, new_state_index:group::Identifier) -> StageStatus { - match new_state_index.into() { - 0 => self.gen_state_0_to_0(), - 1 => self.gen_state_0_to_1(), - 2 => self.gen_state_0_to_2(), - 3 => self.gen_state_0_to_3(), - 4 => self.gen_state_0_to_4(), - 5 => self.gen_state_0_to_5(), - 6 => self.gen_state_0_to_6(), - _ => unreachable_panic!("Unreachable state reached in lexer.") - } - } - - fn gen_state_0_to_0(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 97 => StageStatus::ContinueWith(3.into()), - 98 => StageStatus::ContinueWith(4.into()), - _ => StageStatus::ContinueWith(2.into()) - } - } - - fn gen_state_0_to_1(&mut self) -> StageStatus { - self.current_match = self.reader.pop_result(); - self.gen_group_0_rule_2(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - - fn gen_state_0_to_2(&mut self) -> StageStatus { - self.current_match = self.reader.pop_result(); - self.gen_group_0_rule_3(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - - fn gen_state_0_to_3(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 97 => StageStatus::ContinueWith(5.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_0_rule_0(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_state_0_to_4(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 98 => StageStatus::ContinueWith(6.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_0_rule_1(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_state_0_to_5(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 97 => StageStatus::ContinueWith(5.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_0_rule_0(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_state_0_to_6(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 98 => StageStatus::ContinueWith(6.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_0_rule_1(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_group_0_rule_0(&mut self) -> () { - self.on_first_word() - } - - fn gen_group_0_rule_1(&mut self) -> () { - self.on_first_word() - } - - fn gen_group_0_rule_2(&mut self) -> () { - self.on_err_suffix_first_word() - } - - fn gen_group_0_rule_3(&mut self) { - self.on_err_suffix_first_word() - } - - fn gen_dispatch_in_state_1(&mut self, new_state_index:group::Identifier) -> StageStatus { - match new_state_index.into() { - 0 => self.gen_state_1_to_0(), - 1 => self.gen_state_1_to_1(), - 2 => self.gen_state_1_to_2(), - 3 => self.gen_state_1_to_3(), - 4 => self.gen_state_1_to_4(), - 5 => self.gen_state_1_to_5(), - 6 => self.gen_state_1_to_6(), - 7 => self.gen_state_1_to_7(), - _ => unreachable_panic!("Unreachable state reached in lexer.") 
- } - } - - fn gen_state_1_to_0(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 32 => StageStatus::ContinueWith(3.into()), - _ => StageStatus::ContinueWith(2.into()) - } - } - - fn gen_state_1_to_1(&mut self) -> StageStatus { - self.current_match = self.reader.pop_result(); - self.gen_group_1_rule_2(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - - fn gen_state_1_to_2(&mut self) -> StageStatus { - self.current_match = self.reader.pop_result(); - self.gen_group_1_rule_3(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - - fn gen_state_1_to_3(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 97 => StageStatus::ContinueWith(4.into()), - 98 => StageStatus::ContinueWith(5.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_1_rule_3(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_state_1_to_4(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 97 => StageStatus::ContinueWith(6.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_1_rule_0(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_state_1_to_5(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 98 => StageStatus::ContinueWith(7.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_1_rule_1(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_state_1_to_6(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 97 => StageStatus::ContinueWith(6.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_1_rule_0(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_state_1_to_7(&mut self) -> StageStatus { - match u32::from(self.reader.character()) { - 98 => StageStatus::ContinueWith(7.into()), - _ => { - self.current_match = self.reader.pop_result(); - self.gen_group_1_rule_1(); - let t = self.matched_bookmark; - self.reader.bookmark(t); - StageStatus::ExitSuccess - } - } - } - - fn gen_group_1_rule_0(&mut self) { - self.on_spaced_word(); - } - - fn gen_group_1_rule_1(&mut self) -> () { - self.on_spaced_word(); - } - - fn gen_group_1_rule_2(&mut self) { - self.on_no_err_suffix(); - } - - fn gen_group_1_rule_3(&mut self) { - self.on_err_suffix() - } -} - - - -// =================== -// === Lexer State === -// =================== - -/// The stateful components of the test lexer. -#[derive(Debug)] -pub struct TestState { - /// The registry for groups in the lexer. - lexer_states: group::Registry, - /// The initial state of the lexer. - initial_state: group::Identifier, - /// The state entered when the first word has been seen. - seen_first_word_state: group::Identifier, - /// A bookmark that is set when a match occurs, allowing for rewinding if necessary. 
- matched_bookmark: BookmarkId, -} - - -// === Trait Impls === - -impl flexer::State for TestState { - fn new(reader:&mut Reader) -> Self { - let mut lexer_states = group::Registry::default(); - let initial_state = lexer_states.define_group("ROOT",None); - let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); - let matched_bookmark = reader.add_bookmark(); - Self{lexer_states,initial_state,seen_first_word_state,matched_bookmark} - } - - fn initial_state(&self) -> group::Identifier { - self.initial_state - } - - fn groups(&self) -> &group::Registry { - &self.lexer_states - } - - fn groups_mut(&mut self) -> &mut group::Registry { - &mut self.lexer_states - } -} - - - -// ============= -// === Tests === -// ============= - -/// Executes the test on the provided input string slice. -fn run_test_on(str:impl AsRef) -> Vec { - // Hardcoded for ease of use here. - let reader = Reader::new(str.as_ref().as_bytes(),DecoderUTF8()); - let mut lexer = TestLexer::new(reader); - let run_result = lexer.run(); - - match run_result.kind { - flexer::ResultKind::Success => run_result.tokens, - _ => default() - } -} - -#[test] -fn test_single_a_word() { - let input = "aaaaa"; - let expected_output = vec![AST::Word(String::from(input))]; - let result = run_test_on(input); - assert_eq!(result,expected_output); -} - -#[test] -fn test_single_b_word() { - let input = "bbbbb"; - let expected_output = vec![AST::Word(String::from(input))]; - let result = run_test_on(input); - assert_eq!(result,expected_output); -} - -#[test] -fn test_two_word() { - let input = "aaaaa bbbbb"; - let expected_output = - vec![AST::Word(String::from("aaaaa")),AST::Word(String::from("bbbbb"))]; - let result = run_test_on(input); - assert_eq!(result,expected_output); -} - -#[test] -fn test_multi_word() { - let input = "bbb aa a b bbbbb aa"; - let expected_output = vec![ - AST::Word(String::from("bbb")), - AST::Word(String::from("aa")), - AST::Word(String::from("a")), - AST::Word(String::from("b")), - AST::Word(String::from("bbbbb")), - AST::Word(String::from("aa")) - ]; - let result = run_test_on(input); - assert_eq!(result,expected_output); -} - -#[test] -fn test_invalid_single_word() { - let input = "c"; - let expected_output = vec![AST::Unrecognised(String::from(input))]; - let result = run_test_on(input); - assert_eq!(result,expected_output); -} - -#[test] -fn test_multi_word_invalid() { - let input = "aaaaaa c bbbbbb"; - let expected_output = vec![ - AST::Word(String::from("aaaaaa")), - AST::Unrecognised(String::from(" ")), - AST::Unrecognised(String::from("c")), - AST::Unrecognised(String::from(" ")), - AST::Word(String::from("bbbbbb")), - ]; - let result = run_test_on(input); - assert_eq!(result,expected_output); -} - -#[test] -fn test_end_invalid() { - let input = "bbbbbb c"; - let expected_output = vec![ - AST::Word(String::from("bbbbbb")), - AST::Unrecognised(String::from(" ")), - AST::Unrecognised(String::from("c")), - ]; - let result = run_test_on(input); - assert_eq!(result,expected_output); -} diff --git a/lib/rust/flexer/tests/flexer_lexer_definition_test.rs b/lib/rust/flexer/tests/flexer_lexer_definition_test.rs deleted file mode 100644 index d9042c9c022..00000000000 --- a/lib/rust/flexer/tests/flexer_lexer_definition_test.rs +++ /dev/null @@ -1,188 +0,0 @@ -//! This file contains tests for the intended definition code of a lexer using the flexer, based on -//! the following small language. -//! -//! The language here is being defined as follows: -//! -//! a-word = 'a'+; -//! b-word = 'b'+; -//! 
word = a-word | b-word; -//! space = ' '; -//! spaced-word = space, word; -//! language = word, spaced-word*; -//! -//! Please note that there is a fair amount of duplicated code between this test and the -//! `lexer_generated_api_test` file. This is to present the full view of what each portion of the -//! process looks like. - -use flexer::prelude::*; -use flexer::group; -use lazy_reader::{BookmarkId, LazyReader, Reader}; -use flexer::{State, Flexer}; -use lazy_reader::decoder::DecoderUTF8; -use flexer::automata::pattern::Pattern; - - -// =========== -// === AST === -// =========== - -/// A very simple AST, sufficient for the simple lexer being defined. -#[derive(Clone,Debug,PartialEq)] -pub enum AST { - /// A word from the input, consisting of a sequence of all `a` or all `b`. - Word(String), - /// A token that the lexer is unable to recognise. - Unrecognised(String) -} - - - -// ================== -// === Test Lexer === -// ================== - -/// The definition of a test lexer for the above-described language. -#[derive(Debug)] -pub struct TestLexer { - lexer: Flexer -} - -impl Deref for TestLexer { - type Target = Flexer; - fn deref(&self) -> &Self::Target { - &self.lexer - } -} - -impl DerefMut for TestLexer { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.lexer - } -} - -impl TestLexer { - /// Creates a new instance of this lexer. - pub fn new(reader:Reader) -> Self { - let lexer = Flexer::new(reader); - TestLexer{lexer} - } -} - -/// Implementations of functionality used by the lexer. -/// -/// These functions are provided by the user, by hand. -#[allow(missing_docs)] -impl TestLexer { - pub fn on_first_word(&mut self) { - let str = self.current_match.clone(); - let ast = AST::Word(str); - self.output.push(ast); - let id = self.seen_first_word_state; - self.push_state(id); - } - - pub fn on_spaced_word(&mut self) { - let str = self.current_match.clone(); - let ast = AST::Word(String::from(str.trim())); - self.output.push(ast); - } - - pub fn on_err_suffix_first_word(&mut self) { - let ast = AST::Unrecognised(self.current_match.clone()); - self.output.push(ast); - } - - pub fn on_err_suffix(&mut self) { - self.on_err_suffix_first_word(); - self.pop_state(); - } - - pub fn on_no_err_suffix_first_word(&mut self) {} - - pub fn on_no_err_suffix(&mut self) { - self.on_no_err_suffix_first_word(); - self.pop_state(); - } -} - - - -// =================== -// === Lexer State === -// =================== - -/// The stateful components of the test lexer. -#[derive(Debug)] -pub struct TestState { - /// The registry for groups in the lexer. - lexer_states: group::Registry, - /// The initial state of the lexer. - initial_state: group::Identifier, - /// The state entered when the first word has been seen. - seen_first_word_state: group::Identifier, - /// A bookmark that is set when a match occurs, allowing for rewinding if necessary. 
- matched_bookmark: BookmarkId, -} - - -// === Trait Impls === - -impl flexer::State for TestState { - fn new(reader:&mut Reader) -> Self { - let mut lexer_states = group::Registry::default(); - let initial_state = lexer_states.define_group("ROOT",None); - let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); - let matched_bookmark = reader.add_bookmark(); - Self{lexer_states,initial_state,seen_first_word_state,matched_bookmark} - } - - fn initial_state(&self) -> group::Identifier { - self.initial_state - } - - fn groups(&self) -> &group::Registry { - &self.lexer_states - } - - fn groups_mut(&mut self) -> &mut group::Registry { - &mut self.lexer_states - } -} - - - -// ============= -// === Tests === -// ============= - -#[test] -fn test_lexer_definition() { - // FIXME [AA] Work out how to best-define the lexer. - // TODO [AA] Needing a dummy reader to define the lexer is awkward. - let str = "aaaaa".as_bytes(); - let reader = Reader::new(str,DecoderUTF8()); - let mut lexer = TestLexer::new(reader); - - let a_word = Pattern::char('a').many1(); - let b_word = Pattern::char('b').many1(); - let space = Pattern::char(' '); - let spaced_a_word = space.clone() >> a_word.clone(); - let spaced_b_word = space.clone() >> b_word.clone(); - let any = Pattern::any(); - let end = Pattern::eof(); - - let root_group_id = lexer.initial_state; - let root_group = lexer.groups_mut().group_mut(root_group_id).unwrap(); - root_group.create_rule(&a_word,"self.on_first_word()"); - root_group.create_rule(&b_word,"self.on_first_word()"); - root_group.create_rule(&end, "self.on_no_err_suffix_first_word()"); - root_group.create_rule(&any, "self.on_err_suffix_first_word()"); - - let seen_first_word_group_id = lexer.seen_first_word_state; - let seen_first_word_group = - lexer.groups_mut().group_mut(seen_first_word_group_id).unwrap(); - seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word()"); - seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word()"); - seen_first_word_group.create_rule(&end, "self.on_no_err_suffix()"); - seen_first_word_group.create_rule(&any, "self.on_err_suffix()"); -} diff --git a/lib/rust/flexer/tests/test_invalid_definitions.rs b/lib/rust/flexer/tests/test_invalid_definitions.rs new file mode 100644 index 00000000000..aa56678f091 --- /dev/null +++ b/lib/rust/flexer/tests/test_invalid_definitions.rs @@ -0,0 +1,414 @@ +//! This file contains tests for the user-facing error-handling logic in the flexer code generator. +//! +//! This file includes quite a bit of duplicated code, but this is known and intentional as it +//! allows for increased clarity in the testing. + +#![allow(missing_docs)] + +use crate::prelude::LazyReader; +use crate::prelude::logger::AnyLogger; +use crate::prelude::logger::Disabled; +use crate::prelude::reader::BookmarkManager; +use flexer::*; +use flexer::automata::pattern::Pattern; +use flexer::Flexer; +use flexer::generate; +use flexer::group::{Registry, Identifier}; +use flexer::group; +use flexer::prelude::*; +use flexer::State; +use flexer; + + + +// ==================== +// === Type Aliases === +// ==================== + +type Logger = Disabled; + + + +// ==================== +// === Shared Setup === +// ==================== + +/// A token type for these lexers. +#[derive(Copy,Clone,Debug,PartialEq)] +pub enum Token { + Foo, + Bar +} + +/// An output type for these lexers. +#[allow(missing_docs)] +#[derive(Clone,Debug,Default,PartialEq)] +pub struct Output { + tokens:Vec +} + +/// A testing lexer state. 
+pub struct LexerState { + lexer_states:group::Registry, + initial_state:group::Identifier, +} +impl flexer::State for LexerState { + fn new() -> Self { + let mut lexer_states = group::Registry::default(); + let initial_state = lexer_states.define_group("ROOT",None); + LexerState{lexer_states,initial_state} + } + + fn initial_state(&self) -> Identifier { + self.initial_state + } + + fn groups(&self) -> &Registry { + &self.lexer_states + } + + fn groups_mut(&mut self) -> &mut Registry { + &mut self.lexer_states + } + + fn bookmarks(&self) -> &BookmarkManager { + unimplemented!() + } + + fn bookmarks_mut(&mut self) -> &mut BookmarkManager { + unimplemented!() + } + + fn specialize(&self) -> Result { + // Note [Naming "Lexer"] + generate::specialize(self,"Lexer","Output") + } +} + +/* Note [Naming "Lexer"] + * ~~~~~~~~~~~~~~~~~~~~~ + * In general, the name passed to `specialize` should match that of your lexer definition. However + * here, as we never compile the code, we set it to a generic constant that is a valid rust + * identifier so as to reduce testing boilerplate. + */ + + + +// ==================== +// === Definition 1 === +// ==================== + +pub struct Lexer1 { + lexer:Flexer +} + +impl Deref for Lexer1 { + type Target = Flexer; + fn deref(&self) -> &Self::Target { + &self.lexer + } +} + +impl DerefMut for Lexer1 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.lexer + } +} + +impl Lexer1 { + pub fn new() -> Lexer1 { + let logger = Logger::new("Lexer1"); + let lexer = Flexer::new(logger); + Lexer1 {lexer} + } + + pub fn my_test_fun(&mut self, _reader:&mut R) { + unimplemented!() + } +} + +impl flexer::Definition for Lexer1 { + fn define() -> Self { + let mut lexer = Self::new(); + + let foo = Pattern::all_of("foo"); + + let root_group_id = lexer.initial_state(); + let root_group = lexer.groups_mut().group_mut(root_group_id); + root_group.create_rule(&foo, "ETERNAL SCREAMING"); + + lexer + } + + fn groups(&self) -> &Registry { + self.lexer.groups() + } +} + +#[test] +fn test_bad_rule_expression() { + let lexer = Lexer1::define(); + let result = lexer.specialize(); + assert!(result.is_err()); + let message = result.unwrap_err().to_string(); + assert_eq!(message,"`ETERNAL SCREAMING` is not a valid rust expression."); +} + + +// ==================== +// === Definition 2 === +// ==================== + +pub struct Lexer2 { + lexer:Flexer +} + +impl Deref for Lexer2 { + type Target = Flexer; + fn deref(&self) -> &Self::Target { + &self.lexer + } +} + +impl DerefMut for Lexer2 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.lexer + } +} + +impl Lexer2 { + pub fn new() -> Lexer2 { + let logger = Logger::new("Lexer2"); + let lexer = Flexer::new(logger); + Lexer2{lexer} + } + + pub fn my_test_fun(&mut self, _reader:&mut R) { + unimplemented!() + } +} + +impl flexer::Definition for Lexer2 { + fn define() -> Self { + let mut lexer = Self::new(); + + let foo = Pattern::all_of("foo"); + + let root_group_id = lexer.initial_state(); + let root_group = lexer.groups_mut().group_mut(root_group_id); + root_group.create_rule(&foo, "self.test_function_no_reader()"); + + lexer + } + + fn groups(&self) -> &Registry { + self.lexer.groups() + } +} + +#[test] +pub fn test_no_reader_arg() { + let lexer = Lexer2::define(); + let result = lexer.specialize(); + let expected_message = + "Bad argument to a callback function. 
It must take a single argument `reader`."; + assert!(result.is_err()); + let message = result.unwrap_err().to_string(); + assert_eq!(message,expected_message); +} + + + +// ==================== +// === Definition 3 === +// ==================== + +pub struct Lexer3 { + lexer:Flexer +} + +impl Deref for Lexer3 { + type Target = Flexer; + fn deref(&self) -> &Self::Target { + &self.lexer + } +} + +impl DerefMut for Lexer3 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.lexer + } +} + +impl Lexer3 { + pub fn new() -> Lexer3 { + let logger = Logger::new("Lexer3"); + let lexer = Flexer::new(logger); + Lexer3{lexer} + } + + pub fn my_test_fun(&mut self, _reader:&mut R) { + unimplemented!() + } +} + +impl flexer::Definition for Lexer3 { + fn define() -> Self { + let mut lexer = Self::new(); + + let foo = Pattern::all_of("foo"); + + let root_group_id = lexer.initial_state(); + let root_group = lexer.groups_mut().group_mut(root_group_id); + root_group.create_rule(&foo, "self.test_function_reader(reader)"); + + lexer + } + + fn groups(&self) -> &Registry { + self.lexer.groups() + } +} + +pub struct LexerState1 { + lexer_states:group::Registry, + initial_state:group::Identifier, +} +impl flexer::State for LexerState1 { + fn new() -> Self { + let mut lexer_states = group::Registry::default(); + let initial_state = lexer_states.define_group("ROOT",None); + LexerState1 {lexer_states,initial_state} + } + + fn initial_state(&self) -> Identifier { + self.initial_state + } + + fn groups(&self) -> &Registry { + &self.lexer_states + } + + fn groups_mut(&mut self) -> &mut Registry { + &mut self.lexer_states + } + + fn bookmarks(&self) -> &BookmarkManager { + unimplemented!() + } + + fn bookmarks_mut(&mut self) -> &mut BookmarkManager { + unimplemented!() + } + + fn specialize(&self) -> Result { + generate::specialize(self,"Bad Lexer Name","Output") + } +} + +#[test] +pub fn test_bad_state_name() { + let lexer = Lexer3::define(); + let result = lexer.specialize(); + assert!(result.is_err()); + let message = result.unwrap_err().to_string(); + assert_eq!(message,"`Bad Lexer Name` is not a valid rust identifier."); +} + + + +// ==================== +// === Definition 4 === +// ==================== + +pub struct Lexer4 { + lexer:Flexer +} + +impl Deref for Lexer4 { + type Target = Flexer; + fn deref(&self) -> &Self::Target { + &self.lexer + } +} + +impl DerefMut for Lexer4 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.lexer + } +} + +impl Lexer4 { + pub fn new() -> Lexer4 { + let logger = Logger::new("Lexer4"); + let lexer = Flexer::new(logger); + Lexer4{lexer} + } + + pub fn my_test_fun(&mut self, _reader:&mut R) { + unimplemented!() + } +} + +impl flexer::Definition for Lexer4 { + fn define() -> Self { + let mut lexer = Self::new(); + + let foo = Pattern::all_of("foo"); + + let root_group_id = lexer.initial_state(); + let root_group = lexer.groups_mut().group_mut(root_group_id); + root_group.create_rule(&foo, "self.test_function_reader(reader)"); + + lexer + } + + fn groups(&self) -> &Registry { + self.lexer.groups() + } +} + +pub struct LexerState2 { + lexer_states:group::Registry, + initial_state:group::Identifier, +} +impl flexer::State for LexerState2 { + fn new() -> Self { + let mut lexer_states = group::Registry::default(); + let initial_state = lexer_states.define_group("ROOT",None); + LexerState2 {lexer_states,initial_state} + } + + fn initial_state(&self) -> Identifier { + self.initial_state + } + + fn groups(&self) -> &Registry { + &self.lexer_states + } + + fn 
groups_mut(&mut self) -> &mut Registry { + &mut self.lexer_states + } + + fn bookmarks(&self) -> &BookmarkManager { + unimplemented!() + } + + fn bookmarks_mut(&mut self) -> &mut BookmarkManager { + unimplemented!() + } + + fn specialize(&self) -> Result { + generate::specialize(self,"Lexer4","Bad output name") + } +} + +#[test] +pub fn test_bad_output_name() { + let lexer = Lexer4::define(); + let result = lexer.specialize(); + assert!(result.is_err()); + let message = result.unwrap_err().to_string(); + assert_eq!(message,"`Bad output name` is not a valid rust identifier."); +} diff --git a/lib/rust/lazy-reader/Cargo.toml b/lib/rust/lazy-reader/Cargo.toml index e54e97266de..52ce2baad4f 100644 --- a/lib/rust/lazy-reader/Cargo.toml +++ b/lib/rust/lazy-reader/Cargo.toml @@ -21,4 +21,5 @@ test = true bench = true [dependencies] -itertools = "0.8" +itertools = "0.8" +enso-prelude = { path = "../enso-prelude", version = "0.1.0" } diff --git a/lib/rust/lazy-reader/src/lib.rs b/lib/rust/lazy-reader/src/lib.rs index c504fb0532b..5effa283515 100644 --- a/lib/rust/lazy-reader/src/lib.rs +++ b/lib/rust/lazy-reader/src/lib.rs @@ -12,6 +12,8 @@ pub mod decoder; +use enso_prelude::*; + use decoder::Decoder; use crate::decoder::{Char, InvalidChar}; use crate::Error::EOF; @@ -70,6 +72,42 @@ pub enum Error { EOF, /// Couldn't decode character. InvalidChar, + /// The lexer has found no matching rule in the current state. + EndOfGroup, +} + +impl Error { + /// The `u32` value that corresponds to EOF. + pub const END_OF_FILE:u32 = u32::max_value(); + /// The `u32` value that corresponds to an invalid character. + pub const INVALID_CHAR:u32 = u32::max_value() - 1; + /// The `u32` value corresponding to the end of group. + pub const END_OF_GROUP:u32 = u32::max_value() - 2; +} + + +// === Trait Impls === + +impl From> for decoder::Char { + fn from(char:Char) -> Self { + let size = char.size; + let char = match char.char { + Ok(char) => Ok(char), + Err(_) => Err(Error::InvalidChar), + }; + decoder::Char{char,size} + } +} + +impl From> for u32 { + fn from(char:decoder::Char) -> Self { + match char.char { + Ok (char) => char as u32, + Err(Error::EOF) => Error::END_OF_FILE, + Err(Error::InvalidChar) => Error::INVALID_CHAR, + Err(Error::EndOfGroup) => Error::END_OF_GROUP, + } + } } @@ -94,56 +132,42 @@ impl BookmarkId { -// ================ -// === Bookmark === -// ================ - -/// Bookmarks a specific character in buffer, so that `LazyReader` can return to it when needed. -#[derive(Debug,Clone,Copy,Default,PartialEq)] -pub struct Bookmark { - /// The position of bookmarked character in `reader.buffer`. - offset: usize, - /// The length of `reader.result` up to the bookmarked character. - length: usize, -} - - - // ================== // === LazyReader === // ================== /// The behaviour needed by the lazy reader interface. pub trait LazyReader { - /// Creates a new bookmark, providing a handle so it can be used later. - fn add_bookmark(&mut self) -> BookmarkId; - /// Bookmarks the current character using the provided `bookmark`, so that the reader can later - /// return to it using `rewind()`. - /// - /// Panics if `bookmark` refers to a nonexistent bookmark. - fn bookmark(&mut self, bookmark:BookmarkId); - /// Returns the reader to the character bookmarked using `bookmark`. - fn rewind(&mut self, bookmark:BookmarkId); - /// The maximum number of words that may be rewound in the buffer. - fn max_possible_rewind_len(&self) -> usize; - /// Decrease the offset for all bookmarks. 
- fn decrease_offset(&mut self, off:usize); - /// Fill the buffer with words from the input. - fn fill(&mut self); - /// Checks if the reader is empty. - fn empty(&self) -> bool; - /// Checks if the reader has finished reading. - fn finished(&self) -> bool; - /// Reads the next character from input. - fn next_char(&mut self) -> Result; - /// Gets the current character from the reader. + /// Read the next character from input. + fn next_char(&mut self, bookmarks:&mut BookmarkManager) -> Result; + /// Advance along the input without returning the character. + fn advance_char(&mut self, bookmarks:&mut BookmarkManager); + /// Get the current character from the reader. fn character(&self) -> decoder::Char; - /// Advances along the input without returning the character. - fn advance_char(&mut self); - /// Appends the provided character to the reader's result. + /// Check if the reader has finished reading. + fn finished(&self) -> bool; + /// Check if the reader is empty. + fn empty(&self) -> bool; + /// Fill the buffer with words from the input. + fn fill(&mut self, bookmarks:&mut BookmarkManager); + /// Get the maximum possible rewind for the reader. + fn max_possible_rewind_len(&self, bookmarks:&BookmarkManager) -> usize; + /// Append the provided character to the reader's result. fn append_result(&mut self, char:char); - /// Returns `self.result` and sets the internal result to empty. + /// Return `self.result` and sets the internal result to empty. fn pop_result(&mut self) -> String; + /// Get the reader's current offset in the buffer. + fn offset(&self) -> usize; + /// Get an immutable reference to the reader's result. + fn result(&self) -> &String; + /// Get a mutable reference to the reader's result. + fn result_mut(&mut self) -> &mut String; + /// Get the current length of the reader's buffer. + fn buffer_len(&self) -> usize; + /// Set the buffer offset to the specified value. + fn set_offset(&mut self, off:usize); + /// Truncate the current match to the provided length. + fn truncate_match(&mut self, len:usize); } /// The default size of the buffer. @@ -171,8 +195,6 @@ pub struct Reader { pub offset: usize, /// The number of elements stored in buffer. pub length: usize, - /// Flag that is true iff the reader was just rewinded and no new chars were read. - pub bookmark: Vec, /// The last character read. 
pub character: decoder::Char, } @@ -186,7 +208,6 @@ impl> Reader { result : String::from(""), offset : 0, length : 0, - bookmark : Vec::new(), character : decoder::Char{char:Err(Error::EOF), size:0}, }; reader.length = reader.reader.read(&mut reader.buffer[..]); @@ -198,64 +219,11 @@ impl> Reader { // === Trait Impls === impl> LazyReader for Reader { - fn add_bookmark(&mut self) -> BookmarkId { - self.bookmark.push(Bookmark::default()); - BookmarkId::new(self.bookmark.len() - 1) - } - - fn bookmark(&mut self, bookmark:BookmarkId) { - self.bookmark[bookmark.id].offset = self.offset - self.character.size; - self.bookmark[bookmark.id].length = self.result.len(); - } - - fn rewind(&mut self, bookmark:BookmarkId) { - self.offset = self.bookmark[bookmark.id].offset; - self.result.truncate(self.bookmark[bookmark.id].length); - let _ = self.next_char(); - } - - fn max_possible_rewind_len(&self) -> usize { - if let Some(offset) = self.bookmark.iter().map(|b| b.offset).min() { - return self.buffer.len() - offset - } - D::MAX_CODEPOINT_LEN - } - - fn decrease_offset(&mut self, off:usize) { - for bookmark in self.bookmark.iter_mut() { - bookmark.offset -= off - } - } - - fn fill(&mut self) { - let len = self.buffer.len(); - let words = len - self.offset; - self.offset = self.max_possible_rewind_len(); - if self.offset == len { - panic!("Rewind won't be possible. Buffer is too small.") - } - - self.decrease_offset(len - self.offset); - for i in 1..=self.offset { - self.buffer[self.offset - i] = self.buffer[len - i]; - } - self.length = self.offset + self.reader.read(&mut self.buffer[self.offset..]); - self.offset -= words; - } - - fn empty(&self) -> bool { - self.length < self.buffer.len() && self.length <= self.offset - } - - fn finished(&self) -> bool { - self.empty() && self.character.char == Err(EOF) - } - - fn next_char(&mut self) -> Result { + fn next_char(&mut self, bookmarks:&mut BookmarkManager) -> Result { if self.empty() { self.character.char = Err(Error::EOF); return Err(Error::EOF) } if self.offset >= self.buffer.len() - D::MAX_CODEPOINT_LEN { - self.fill(); + self.fill(bookmarks); } self.character = D::decode(&self.buffer[self.offset..]).into(); @@ -264,12 +232,43 @@ impl> LazyReader for Reader { self.character.char } + fn advance_char(&mut self, bookmarks:&mut BookmarkManager) { + let _ = self.next_char(bookmarks); + } + fn character(&self) -> Char { self.character } - fn advance_char(&mut self) { - let _ = self.next_char(); + fn finished(&self) -> bool { + self.empty() && self.character.char == Err(EOF) + } + + fn empty(&self) -> bool { + self.length < self.buffer.len() && self.length <= self.offset + } + + fn fill(&mut self, bookmarks:&mut BookmarkManager) { + let len = self.buffer.len(); + let words = len - self.offset; + self.offset = self.max_possible_rewind_len(bookmarks); + if self.offset == len { + panic!("Rewind won't be possible. 
Buffer is too small.") + } + + bookmarks.decrease_bookmark_offsets(len - self.offset); + for i in 1..=self.offset { + self.buffer[self.offset - i] = self.buffer[len - i]; + } + self.length = self.offset + self.reader.read(&mut self.buffer[self.offset..]); + self.offset -= words; + } + + fn max_possible_rewind_len(&self, bookmarks:&BookmarkManager) -> usize { + if let Some(offset) = bookmarks.min_offset() { + return self.buffer_len() - offset + } + D::MAX_CODEPOINT_LEN } fn append_result(&mut self,char:char) { @@ -281,27 +280,135 @@ impl> LazyReader for Reader { self.result.truncate(0); str } -} -impl From> for decoder::Char { - fn from(char:Char) -> Self { - let size = char.size; - let char = match char.char { - Ok(char) => Ok(char), - Err(_) => Err(Error::InvalidChar), - }; - decoder::Char{char,size} + fn offset(&self) -> usize { + self.offset + } + + fn result(&self) -> &String { + &self.result + } + + fn result_mut(&mut self) -> &mut String { + &mut self.result + } + + fn buffer_len(&self) -> usize { + self.buffer.len() + } + + fn set_offset(&mut self, off: usize) { + self.offset = off; + } + + fn truncate_match(&mut self, len: usize) { + self.result.truncate(len); } } -impl From> for u32 { - fn from(char:decoder::Char) -> Self { - match char.char { - Ok (char) => char as u32, - Err(Error::EOF) => u32::max_value(), - Err(Error::InvalidChar) => u32::max_value() - 1, + + +// ================ +// === Bookmark === +// ================ + +/// Bookmarks a specific character in buffer, so that `LazyReader` can return to it when needed. +#[derive(Debug,Clone,Copy,Default,PartialEq)] +pub struct Bookmark { + /// The position of the bookmarked character in the `reader.buffer`. + offset: usize, + /// The length of `reader.result` up to the bookmarked character. + length: usize, + /// Whether or not the bookmark has been set by the user. + set:bool +} + + + +// ======================= +// === BookmarkManager === +// ======================= + +/// Contains and manages bookmarks for a running lexer. +/// +/// Some of its operations operate on a specific [`Reader`]. It is undefined behaviour to not pass +/// the same reader to all calls for a given bookmark manager. +#[allow(missing_docs)] +#[derive(Clone,Debug,PartialEq)] +pub struct BookmarkManager { + bookmarks: Vec, + /// The bookmark used by the flexer to mark the end of the last matched segment of the input. + pub matched_bookmark: BookmarkId, + /// A bookmark used by the flexer to deal with overlapping rules that may fail later. + pub rule_bookmark: BookmarkId, +} + +#[allow(missing_docs)] +impl BookmarkManager { + /// Create a new bookmark manager, with no associated bookmarks. + pub fn new() -> BookmarkManager { + let mut bookmarks = Vec::new(); + let matched_bookmark = BookmarkManager::make_bookmark(&mut bookmarks); + let rule_bookmark = BookmarkManager::make_bookmark(&mut bookmarks); + BookmarkManager {bookmarks,matched_bookmark,rule_bookmark} + } + + /// Create a new bookmark in the manager, returning a handle to it. + fn make_bookmark(bookmarks:&mut Vec) -> BookmarkId { + bookmarks.push(Bookmark::default()); + BookmarkId::new(bookmarks.len() - 1) + } + + /// Add a bookmark to the manager, returning a handle to that bookmark. + pub fn add_bookmark(&mut self) -> BookmarkId { + BookmarkManager::make_bookmark(&mut self.bookmarks) + } + + /// Bookmarks the current position in `reader` using `bookmark`. 
+ pub fn bookmark(&mut self, bookmark:BookmarkId, reader:&mut R) { + self.bookmarks[bookmark.id].offset = reader.offset() - reader.character().size; + self.bookmarks[bookmark.id].length = reader.result().len(); + self.bookmarks[bookmark.id].set = true + } + + /// Unsets a bookmark. + pub fn unset(&mut self, bookmark:BookmarkId) { + self.bookmarks[bookmark.id].offset = 0; + self.bookmarks[bookmark.id].length = 0; + self.bookmarks[bookmark.id].set = false + } + + /// Decrease the offset for all bookmarks by the specified `amount` in preparation for + /// truncating the reader's buffer. + pub fn decrease_bookmark_offsets(&mut self, amount:usize) { + for bookmark in self.bookmarks.iter_mut() { + if bookmark.set { + bookmark.offset -= amount + } } } + + /// Rewind the reader to the position marked by `bookmark`. + pub fn rewind(&mut self, bookmark:BookmarkId, reader:&mut R) { + let bookmark = self.bookmarks.get(bookmark.id).expect("Bookmark must exist."); + reader.set_offset(bookmark.offset); + reader.truncate_match(bookmark.length); + reader.advance_char(self); + } + + /// Obtains the minimum offset from the start of the buffer for any bookmark. + pub fn min_offset(&self) -> Option { + self.bookmarks.iter().filter_map(|b| b.set.and_option(Some(b.offset))).min() + } +} + + +// === Trait Impls === + +impl Default for BookmarkManager { + fn default() -> Self { + BookmarkManager::new() + } } @@ -319,8 +426,6 @@ mod tests { use test::Bencher; - - // ================ // === Repeater === // ================ @@ -380,6 +485,17 @@ mod tests { + // ============= + // === Utils === + // ============= + + /// Constructs an _empty_ bookmark manager for testing purposes. + pub fn bookmark_manager() -> BookmarkManager { + BookmarkManager::new() + } + + + // ============= // === Tests === // ============= @@ -406,10 +522,11 @@ mod tests { #[test] fn test_reader_small_input() { + let mut mgr = bookmark_manager(); let str = "a.b^c! #𤭢界んにち𤭢#𤭢"; let mut reader = Reader::new(str.as_bytes(), DecoderUTF8()); let mut result = String::from(""); - while let Ok(char) = reader.next_char() { + while let Ok(char) = reader.next_char(&mut mgr) { result.push(char); } assert_eq!(&result, str); @@ -417,24 +534,26 @@ mod tests { #[test] fn test_reader_big_input() { + let mut mgr = bookmark_manager(); let str = "a.b^c! #𤭢界んにち𤭢#𤭢".repeat(10_000); let mut reader = Reader::new(str.as_bytes(), DecoderUTF8()); let mut result = String::from(""); - while let Ok(char) = reader.next_char() { + while let Ok(char) = reader.next_char(&mut mgr) { + mgr.bookmark(mgr.matched_bookmark,&mut reader); result.push(char); } assert_eq!(&result, &str); - assert_eq!(reader.bookmark.len(), 0); assert_eq!(reader.buffer.len(), BUFFER_SIZE); } #[bench] fn bench_reader(bencher:&mut Bencher) { let run = || { + let mut mgr = bookmark_manager(); let str = repeat("Hello, World!".as_bytes().to_vec(), 10_000_000); let mut reader = Reader::new(str, DecoderUTF8()); let mut count = 0; - while reader.next_char() != Err(Error::EOF) { + while reader.next_char(&mut mgr) != Err(Error::EOF) { count += 1; } count diff --git a/lib/rust/lexer/definition/Cargo.toml b/lib/rust/lexer/definition/Cargo.toml deleted file mode 100644 index 7a6d49323c7..00000000000 --- a/lib/rust/lexer/definition/Cargo.toml +++ /dev/null @@ -1,24 +0,0 @@ -[package] -name = "lexer-definition" -version = "0.1.0" -authors = ["Enso Team "] -edition = "2018" - -description = "Lexer for the enso language." 
-readme = "README.md" -homepage = "https://github.com/enso-org/enso/lib/rust/lexer/definition" -repository = "https://github.com/enso-org/enso" -license-file = "../../../../LICENSE" - -keywords = ["lexer", "finite-automata"] -categories = ["parsing"] - -publish = false - -[lib] -crate-type = ["cdylib", "rlib"] -test = true -bench = true - -[dependencies] -flexer = { path = "../../flexer", version = "0.1.0" } diff --git a/lib/rust/lexer/definition/src/lexer.rs b/lib/rust/lexer/definition/src/lexer.rs deleted file mode 100644 index 450aae4515b..00000000000 --- a/lib/rust/lexer/definition/src/lexer.rs +++ /dev/null @@ -1,22 +0,0 @@ -//! This module exposes the definition of the Enso lexer. - -use flexer::FlexerTemp; - - - -// ======================== -// === Lexer Definition === -// ======================== - -/// The definition of enso lexer that is responsible for lexing the enso source code. -/// -/// It chunks the character stream into a (structured) token stream in order to make later -/// processing faster, and to identify blocks. -#[derive(Debug,Clone,Copy)] -pub struct Lexer {} - -impl FlexerTemp for Lexer { - fn new() -> Self { - Lexer{} - } -} diff --git a/lib/rust/lexer/definition/src/lib.rs b/lib/rust/lexer/definition/src/lib.rs deleted file mode 100644 index e3b1bdc2cf4..00000000000 --- a/lib/rust/lexer/definition/src/lib.rs +++ /dev/null @@ -1,13 +0,0 @@ -#![feature(test)] -#![deny(unconditional_recursion)] -#![warn(missing_copy_implementations)] -#![warn(missing_debug_implementations)] -#![warn(missing_docs)] -#![warn(trivial_casts)] -#![warn(trivial_numeric_casts)] -#![warn(unsafe_code)] -#![warn(unused_import_braces)] - -//! This library contains the definition of parser for the enso language. - -pub mod lexer; diff --git a/lib/rust/lexer/generation/Cargo.toml b/lib/rust/lexer/generation/Cargo.toml deleted file mode 100644 index 2b05c449fe2..00000000000 --- a/lib/rust/lexer/generation/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -[package] -name = "lexer-generation" -version = "0.1.0" -authors = ["Enso Team "] -edition = "2018" - -description = "The generated code for the lexer of the enso language." -readme = "README.md" -homepage = "https://github.com/enso-org/enso/lib/rust/lexer/generation" -repository = "https://github.com/enso-org/enso" -license-file = "../../../../LICENSE" - -keywords = ["lexer", "finite-automata"] -categories = ["parsing"] - -publish = false - -[lib] -crate-type = ["cdylib", "rlib"] -test = true -bench = true - -[dependencies] -flexer = { path = "../../flexer" , version = "0.1.0" } -lexer-definition = { path = "../definition", version = "0.1.0" } - -[build-dependencies] -flexer = { path = "../../flexer" , version = "0.1.0" } -lexer-definition = { path = "../definition", version = "0.1.0" } diff --git a/lib/rust/lexer/generation/build.rs b/lib/rust/lexer/generation/build.rs deleted file mode 100644 index 68c7f5170dc..00000000000 --- a/lib/rust/lexer/generation/build.rs +++ /dev/null @@ -1,20 +0,0 @@ -use std::fs::File; -use std::io::prelude::*; -use lexer_definition::lexer::Lexer; -use flexer::FlexerTemp; - - - -/// Generates the lexer engine and saves the result into the file `src/lexer-engine.rs`. -/// -/// The content of the generated file can be used with the `include!` macro. 
-fn generate_engine() -> std::io::Result<()> { - let mut file = File::create("src/lexer-engine.rs")?; - let engine = Lexer::new().generate_specialized_code(); - file.write_all(engine.as_bytes())?; - Ok(()) -} - -fn main() -> std::io::Result<()> { - generate_engine() -} diff --git a/lib/rust/lexer/generation/src/lexer.rs b/lib/rust/lexer/generation/src/lexer.rs deleted file mode 100644 index ae32a67604c..00000000000 --- a/lib/rust/lexer/generation/src/lexer.rs +++ /dev/null @@ -1,3 +0,0 @@ -//! This file includes the source code generated by lexer specialization. - -include!("lexer-engine.rs"); diff --git a/lib/rust/lexer/generation/src/lib.rs b/lib/rust/lexer/generation/src/lib.rs deleted file mode 100644 index ee46fa54866..00000000000 --- a/lib/rust/lexer/generation/src/lib.rs +++ /dev/null @@ -1,9 +0,0 @@ -//! This library exposes the specialized version of the Enso lexer. -//! -//! Its sole purpose is to avoid the lexer definition getting out of sync with its implementation -//! (the generated engine), which requires the engine to live in a separate crate. -//! -//! This separation enables generation of the enso lexer source code with `build.rs` during -//! compilation. Its output is then stored in a new file `lexer-engine.rs`and exported by `lexer.rs`. - -pub mod lexer; diff --git a/lib/rust/lexer/tests/Cargo.toml b/lib/rust/lexer/tests/Cargo.toml deleted file mode 100644 index 599886f61c9..00000000000 --- a/lib/rust/lexer/tests/Cargo.toml +++ /dev/null @@ -1,20 +0,0 @@ -[package] -name = "lexer-tests" -version = "0.1.0" -authors = ["Enso Team "] -edition = "2018" - -description = "The tests for the lexer of the enso language." -readme = "README.md" -homepage = "https://github.com/enso-org/enso/lib/rust/lexer/tests" -repository = "https://github.com/enso-org/enso" -license-file = "../../../../LICENSE" - -keywords = ["lexer", "finite-automata"] -categories = ["parsing"] - -publish = false - -[dependencies] -flexer = { path = "../../flexer" , version = "0.1.0" } -lexer-generation = { path = "../generation", version = "0.1.0" } diff --git a/lib/rust/lexer/tests/src/main.rs b/lib/rust/lexer/tests/src/main.rs deleted file mode 100644 index 6463db2d5f6..00000000000 --- a/lib/rust/lexer/tests/src/main.rs +++ /dev/null @@ -1,33 +0,0 @@ -#![feature(test)] -#![deny(unconditional_recursion)] -#![warn(missing_copy_implementations)] -#![warn(missing_debug_implementations)] -#![warn(missing_docs)] -#![warn(trivial_casts)] -#![warn(trivial_numeric_casts)] -#![warn(unsafe_code)] -#![warn(unused_import_braces)] - -//! This file tests the generated source code of the enso lexer. 
- - - -// ============= -// === Tests === -// ============= - -fn main() { - println!("This needs to be here because the crate isn't a library.") -} - -#[cfg(test)] -mod tests { - use lexer_generation::lexer::Lexer; - - - - #[test] - fn test_lexer_generation() { - assert_eq!(format!("{:?}", Lexer{}), "Lexer"); - } -} diff --git a/lib/scala/flexer/src/main/scala/org/enso/flexer/Spec.scala b/lib/scala/flexer/src/main/scala/org/enso/flexer/Spec.scala index 85d9abdd916..19f397cebd7 100644 --- a/lib/scala/flexer/src/main/scala/org/enso/flexer/Spec.scala +++ b/lib/scala/flexer/src/main/scala/org/enso/flexer/Spec.scala @@ -95,7 +95,7 @@ case class Spec[C <: Context](c: C, dfa: DFA) { utf2 match { case b2 +: utf2 => - val b1UTF = Branch(b1.range.start to MIN_MATCH_CODE - 1, b1.body) + val b1UTF = Branch(b1.range.start until MIN_MATCH_CODE, b1.body) val b1ASC = Branch(MIN_MATCH_CODE to b1.range.end, b1.body) val b2ASC = Branch(b2.range.start to MAX_MATCH_CODE, b2.body) val b2UTF = Branch(MAX_MATCH_CODE + 1 to b2.range.end, b2.body) diff --git a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDefSmall.scala b/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDefSmall.scala deleted file mode 100644 index ae607247a27..00000000000 --- a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDefSmall.scala +++ /dev/null @@ -1,132 +0,0 @@ -package org.enso.syntax.text.spec - -import org.enso.flexer -import org.enso.flexer.automata.Pattern -import org.enso.flexer.automata.Pattern._ -import org.enso.flexer.{Reader, State} -import org.enso.syntax.text.AST - -case class ParserDefSmall() extends flexer.Parser[AST.Module] { - import ParserDef2._ - - final def unwrap[T](opt: Option[T]): T = - opt match { - case None => throw new Error("Internal Error") - case Some(t) => t - } - - ///////////// - //// API //// - ///////////// - - override def run(input: Reader): Result[AST.Module] = { - state.begin(ROOT) - super.run(input) - } - - // === Debug Lexer Definition === - - /* a-word = 'a'+; - * b-word = 'b'+; - * word = a-word | b-word; - * space = ' '; - * language = word, (space, word)*; - */ - - val aWord: Pattern = 'a'.many1 - val bWord: Pattern = 'b'.many1 - val space: Pattern = ' ' - val spacedAWord: Pattern = space >> aWord - val spacedBWord: Pattern = space >> bWord - val end: Pattern = eof - - final object Word { - def onFirstWord(word: String => AST.Ident): Unit = - logger.trace_ { - onFirstWord(word(currentMatch)) - } - - def onFirstWord(ast: AST.Ident): Unit = - logger.trace { - result.app(ast) - state.begin(SEEN_FIRST_WORD) - } - - def onSpacedWord(word: String => AST.Ident): Unit = - logger.trace_ { - onSpacedWord(word(currentMatch.stripLeading())) - } - - def onSpacedWord(ast: AST.Ident): Unit = - logger.trace { - result.app(ast) - } - - def onNoErrSuffixFirstWord(): Unit = - logger.trace { - submit() - } - - def onNoErrSuffix(): Unit = - logger.trace { - onNoErrSuffixFirstWord() - state.end() - } - - def onErrSuffixFirstWord(): Unit = - logger.trace { - val ast = AST.Invalid.Unrecognized(currentMatch) - result.app(ast) - } - - def onErrSuffix(): Unit = - logger.trace { - onNoErrSuffixFirstWord() - state.end() - } - - def submit(): Unit = logger.trace {} - - val SEEN_FIRST_WORD: State = state.define("Inside Word") - } - - ROOT || aWord || Word.onFirstWord(AST.Var(_)) - ROOT || bWord || Word.onFirstWord(AST.Var(_)) - ROOT || eof || Word.onNoErrSuffixFirstWord() - ROOT || any || Word.onErrSuffixFirstWord() - Word.SEEN_FIRST_WORD || 
spacedAWord || Word.onSpacedWord(AST.Var(_)) - Word.SEEN_FIRST_WORD || spacedBWord || Word.onSpacedWord(AST.Var(_)) - Word.SEEN_FIRST_WORD || eof || Word.onNoErrSuffix() - Word.SEEN_FIRST_WORD || any || Word.onErrSuffix() - - //////////////// - //// Result //// - //////////////// - - override def getResult() = - result.current.flatMap { - case AST.Module.any(mod) => Some(mod) - case a => - val line = AST.Block.OptLine(a) - Some(AST.Module(line, List())) - } - - final object result { - var current: Option[AST] = None - - def app(fn: String => AST): Unit = - app(fn(currentMatch)) - - def app(ast: AST): Unit = - logger.trace { - current = Some(current match { - case None => ast - case Some(r) => AST.App.Prefix(r, ast) - }) - } - } -} - -object ParserDef2Small { - type Result[T] = flexer.Parser.Result[T] -} diff --git a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/__Parser__.scala b/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/__Parser__.scala deleted file mode 100644 index 1c70c26caf4..00000000000 --- a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/__Parser__.scala +++ /dev/null @@ -1,553 +0,0 @@ -package org.enso.syntax.text.spec - -import org.enso.flexer -import org.enso.flexer.State -import org.enso.syntax.text.AST - -import scala.annotation.tailrec - -final class __Parser__ extends flexer.Parser[AST.Module] { - - final object Word { - def onFirstWord(word: String => AST.Ident): Unit = - logger.trace_ { - onFirstWord(word(currentMatch)) - } - - def onFirstWord(ast: AST.Ident): Unit = - logger.trace { - result.app(ast) - state.begin(SEEN_FIRST_WORD) - } - - def onSpacedWord(word: String => AST.Ident): Unit = - logger.trace_ { - onSpacedWord(word(currentMatch.stripLeading())) - } - - def onSpacedWord(ast: AST.Ident): Unit = - logger.trace { - result.app(ast) - } - - def onNoErrSuffixFirstWord(): Unit = - logger.trace { - submit() - } - - def onNoErrSuffix(): Unit = - logger.trace { - onNoErrSuffixFirstWord() - state.end() - } - - def onErrSuffixFirstWord(): Unit = - logger.trace { - val ast = AST.Invalid.Unrecognized(currentMatch) - result.app(ast) - } - - def onErrSuffix(): Unit = - logger.trace { - onNoErrSuffixFirstWord() - state.end() - } - - def submit(): Unit = logger.trace {} - - val SEEN_FIRST_WORD: State = state.define("Inside Word") - } - - final object result { - - var current: Option[AST] = None - var stack: List[Option[AST]] = Nil - - def push(): Unit = - logger.trace { - logger.log(s"Pushed: $current") - stack +:= current - current = None - } - - def pop(): Unit = - logger.trace { - current = stack.head - stack = stack.tail - logger.log(s"New result: ${current.map(_.show()).getOrElse("None")}") - } - - def app(fn: String => AST): Unit = - app(fn(currentMatch)) - - def app(ast: AST): Unit = - logger.trace { - current = Some(current match { - case None => ast - case Some(r) => AST.App.Prefix(r, ast) - }) - } - - def last(): Option[AST] = { - @tailrec - def go(ast: AST): AST = - ast match { - case AST.App.Prefix.any(t) => go(t.arg) - case t => t - } - current.map(go) - } - } - - override def getResult() = - result.current.flatMap { - case AST.Module.any(mod) => Some(mod) - case _ => None - } - - stateDefs.update(0, nextState0) - - def nextState0(state: Int): Int = - state match { - case 0 => state0_0 - case 1 => state0_1 - case 2 => state0_2 - case 3 => state0_3 - case 4 => state0_4 - case 5 => state0_5 - case 6 => state0_6 - } - - def state0_0 = - reader.charCode match { - case -1 => 1 - case (0 | 1 | 2 | 3 | 
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | - 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | - 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | - 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | - 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | - 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | - 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96) => - 2 - case 97 => 3 - case 98 => 4 - case (99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | - 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | - 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | - 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | - 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | - 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | - 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | - 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | - 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | - 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | - 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | - 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | - 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | - 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | - 253 | 254 | 255) => - 2 - case (charCode @ _) => - if (charCode.<=(-2)) - -2 - else - 2 - } - - def state0_1 = state.call(group0_rule2) - - def state0_2 = state.call(group0_rule3) - - def state0_3 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96) => - state.call(group0_rule0) - case 97 => 5 - case (98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | - 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | - 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | - 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | - 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | - 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | - 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | - 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | - 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | - 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | - 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | - 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | - 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | - 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | - 252 | 253 | 254 | 255) => - state.call(group0_rule0) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group0_rule0) - else - state.call(group0_rule0) - } - - def state0_4 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 
| 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | - 97) => - state.call(group0_rule1) - case 98 => 6 - case (99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | - 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | - 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | - 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | - 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | - 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | - 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | - 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | - 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | - 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | - 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | - 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | - 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | - 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | - 253 | 254 | 255) => - state.call(group0_rule1) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group0_rule1) - else - state.call(group0_rule1) - } - - def state0_5 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96) => - state.call(group0_rule0) - case 97 => 5 - case (98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | - 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | - 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | - 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | - 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | - 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | - 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | - 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | - 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | - 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | - 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | - 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | - 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | - 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | - 252 | 253 | 254 | 255) => - state.call(group0_rule0) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group0_rule0) - else - state.call(group0_rule0) - } - - def state0_6 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 
| 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | - 97) => - state.call(group0_rule1) - case 98 => 6 - case (99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | - 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | - 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | - 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | - 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | - 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | - 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | - 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | - 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | - 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | - 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | - 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | - 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | - 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | - 253 | 254 | 255) => - state.call(group0_rule1) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group0_rule1) - else - state.call(group0_rule1) - } - - def group0_rule0() = - Word.onFirstWord( - ((x$1: String) => org.enso.syntax.text.AST.Var.apply(x$1)) - ) - - def group0_rule1() = - Word.onFirstWord( - ((x$2: String) => org.enso.syntax.text.AST.Var.apply(x$2)) - ) - - def group0_rule2() = Word.onNoErrSuffixFirstWord() - - def group0_rule3() = Word.onErrSuffixFirstWord() - - stateDefs.update(1, nextState1) - - def nextState1(state: Int): Int = - state match { - case 0 => state1_0 - case 1 => state1_1 - case 2 => state1_2 - case 3 => state1_3 - case 4 => state1_4 - case 5 => state1_5 - case 6 => state1_6 - case 7 => state1_7 - } - - def state1_0 = - reader.charCode match { - case -1 => 1 - case (0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | - 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | - 29 | 30 | 31) => - 2 - case 32 => 3 - case (33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | - 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | - 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | - 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | - 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | - 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | - 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | - 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | - 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | - 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | - 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | - 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | - 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | - 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | - 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | - 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | - 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | - 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | - 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | - 255) => - 2 - 
case (charCode @ _) => - if (charCode.<=(-2)) - -2 - else - 2 - } - - def state1_1 = state.call(group1_rule2) - - def state1_2 = state.call(group1_rule3) - - def state1_3 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96) => - state.call(group1_rule3) - case 97 => 4 - case 98 => 5 - case (99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | - 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | - 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | - 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | - 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | - 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | - 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | - 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | - 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | - 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | - 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | - 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | - 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | - 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | - 253 | 254 | 255) => - state.call(group1_rule3) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group1_rule3) - else - state.call(group1_rule3) - } - - def state1_4 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96) => - state.call(group1_rule0) - case 97 => 6 - case (98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | - 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | - 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | - 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | - 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | - 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | - 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | - 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | - 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | - 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | - 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | - 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | - 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | - 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | - 252 | 253 | 254 | 255) => - state.call(group1_rule0) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group1_rule0) - 
else - state.call(group1_rule0) - } - - def state1_5 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | - 97) => - state.call(group1_rule1) - case 98 => 7 - case (99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | - 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | - 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | - 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | - 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | - 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | - 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | - 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | - 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | - 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | - 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | - 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | - 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | - 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | - 253 | 254 | 255) => - state.call(group1_rule1) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group1_rule1) - else - state.call(group1_rule1) - } - - def state1_6 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96) => - state.call(group1_rule0) - case 97 => 6 - case (98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | - 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | - 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | - 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | - 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | - 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | - 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | - 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | - 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | - 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | - 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | - 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | - 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | - 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | - 252 | 253 | 254 | 255) => - state.call(group1_rule0) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group1_rule0) - else - state.call(group1_rule0) - } - - def state1_7 = - reader.charCode match { - case (-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 
8 | 9 | 10 | 11 | 12 | 13 | - 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | - 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | - 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | - 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | - 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | - 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | - 97) => - state.call(group1_rule1) - case 98 => 7 - case (99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | - 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | - 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | - 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | - 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | - 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | - 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | - 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | - 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | - 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | - 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | - 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | - 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | - 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | - 253 | 254 | 255) => - state.call(group1_rule1) - case (charCode @ _) => - if (charCode.<=(-2)) - state.call(group1_rule1) - else - state.call(group1_rule1) - } - - def group1_rule0() = - Word.onSpacedWord( - ((x$3: String) => org.enso.syntax.text.AST.Var.apply(x$3)) - ) - - def group1_rule1() = - Word.onSpacedWord( - ((x$4: String) => org.enso.syntax.text.AST.Var.apply(x$4)) - ) - - def group1_rule2() = Word.onNoErrSuffix() - - def group1_rule3() = Word.onErrSuffix() -}
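
The deleted `__Parser__.scala` above is a concrete illustration of what flexer specialization produces: every DFA state becomes a function that matches on the next character code (`-1` marking end of input) and either transitions to a successor state or calls back into a rule such as `group0_rule0`. For the toy grammar in `ParserDefSmall` (`language = word, (space, word)*`), an input like `aa bb` drives the root group through states `0 -> 3 -> 5` for the first word before dispatching `Word.onFirstWord`, while an unrecognized byte lands in state 2 and triggers the error rule. A compressed sketch of the same shape, in Rust for comparison with the direction this patch takes (the state numbers follow `state0_0` above, but the function name is illustrative and this is not the code `generate.rs` actually emits):

// One step of the specialized DFA, mirroring `state0_0` above: dispatch on
// the next character code and return the successor state.
fn next_state_root(char_code: i32) -> i32 {
    match char_code {
        -1 => 1,            // end of input: state 1 invokes the EOF rule
        97 => 3,            // 'a' starts an a-word
        98 => 4,            // 'b' starts a b-word
        c if c <= -2 => -2, // reader sentinel codes are passed through
        _ => 2,             // any other byte: state 2 invokes the error rule
    }
}

fn main() {
    assert_eq!(next_state_root(97), 3); // "aa bb" begins in state 3
    assert_eq!(next_state_root(-1), 1); // empty input goes straight to EOF
}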
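Further up, the deleted `lexer/generation` crate documents the build-script pattern this codebase uses to keep a generated engine in sync with its definition: `build.rs` runs the generator at compile time and the library `include!`s the result, exactly as the deleted `lexer.rs` did with `include!("lexer-engine.rs")`. A minimal self-contained sketch of that pattern, writing to Cargo's `OUT_DIR` rather than into `src/` (the stub generator stands in for the deleted `Lexer::new().generate_specialized_code()`; only standard Cargo conventions are assumed):

// build.rs -- compile-time code generation, sketched.
use std::env;
use std::fs;
use std::path::PathBuf;

// Stand-in for the real generator (e.g. the flexer's specialization step).
fn generate_specialized_code() -> String {
    "pub fn run() { println!(\"generated lexer engine\"); }".to_string()
}

fn main() -> std::io::Result<()> {
    // Cargo sets OUT_DIR for build-script outputs; writing there keeps
    // generated sources out of the checked-in tree.
    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set by cargo"));
    fs::write(out_dir.join("engine.rs"), generate_specialized_code())
}

The consuming crate then exposes the engine with `include!(concat!(env!("OUT_DIR"), "/engine.rs"));`, playing the role that `lexer.rs` played above.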