From e64c0384b045d7193cfd7fb51a673d18188eaa80 Mon Sep 17 00:00:00 2001 From: Ara Adkins Date: Thu, 27 Aug 2020 13:27:22 +0100 Subject: [PATCH] Implement part of the Enso lexer in rust (#1109) --- Cargo.toml | 2 + docs/distribution/packaging.md | 2 +- docs/parser/flexer.md | 5 + docs/parser/lexer.md | 41 +- docs/parser/operator-resolution.md | 18 + docs/semantics/modules.md | 6 +- docs/syntax/naming.md | 27 +- .../test/resources/Cycle_Test/package.yaml | 6 +- lib/rust/ast/src/ast.rs | 8 +- lib/rust/enso-logger/src/enabled.rs | 2 +- lib/rust/enso-optics/Cargo.toml | 2 +- .../enso-prelude/src/data/non_empty_vec.rs | 5 +- lib/rust/enso-prelude/src/lib.rs | 6 +- lib/rust/flexer-testing/definition/src/lib.rs | 103 +- ...ted_lexer.rs => flexer_generated_lexer.rs} | 0 lib/rust/flexer/src/automata/nfa.rs | 3 +- lib/rust/flexer/src/automata/pattern.rs | 58 +- lib/rust/flexer/src/automata/symbol.rs | 9 +- lib/rust/flexer/src/generate.rs | 49 +- lib/rust/flexer/src/group.rs | 19 +- lib/rust/flexer/src/lib.rs | 76 +- ...tions.rs => flexer_invalid_definitions.rs} | 40 +- lib/rust/lazy-reader/src/decoder.rs | 11 + lib/rust/lazy-reader/src/lib.rs | 21 +- lib/rust/lexer/definition/Cargo.toml | 18 + lib/rust/lexer/definition/src/lexer.rs | 1159 +++++++++++++++++ lib/rust/lexer/definition/src/lib.rs | 29 + lib/rust/lexer/definition/src/token.rs | 570 ++++++++ lib/rust/lexer/generation/Cargo.toml | 21 + lib/rust/lexer/generation/build.rs | 32 + lib/rust/lexer/generation/src/generated.rs | 3 + lib/rust/lexer/generation/src/lib.rs | 25 + lib/rust/lexer/generation/tests/enso_lexer.rs | 759 +++++++++++ .../org/enso/syntax/text/spec/ParserDef.scala | 1 - 34 files changed, 3000 insertions(+), 136 deletions(-) rename lib/rust/flexer-testing/generation/tests/{test_generated_lexer.rs => flexer_generated_lexer.rs} (100%) rename lib/rust/flexer/tests/{test_invalid_definitions.rs => flexer_invalid_definitions.rs} (93%) create mode 100644 lib/rust/lexer/definition/Cargo.toml create mode 100644 lib/rust/lexer/definition/src/lexer.rs create mode 100644 lib/rust/lexer/definition/src/lib.rs create mode 100644 lib/rust/lexer/definition/src/token.rs create mode 100644 lib/rust/lexer/generation/Cargo.toml create mode 100644 lib/rust/lexer/generation/build.rs create mode 100644 lib/rust/lexer/generation/src/generated.rs create mode 100644 lib/rust/lexer/generation/src/lib.rs create mode 100644 lib/rust/lexer/generation/tests/enso_lexer.rs diff --git a/Cargo.toml b/Cargo.toml index 5f3c112fcc..8c2bdee358 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,8 @@ members = [ "lib/rust/flexer-testing/definition", "lib/rust/flexer-testing/generation", "lib/rust/lazy-reader", + "lib/rust/lexer/definition", + "lib/rust/lexer/generation", "lib/rust/parser", ] diff --git a/docs/distribution/packaging.md b/docs/distribution/packaging.md index 3de879592e..c6587eee56 100644 --- a/docs/distribution/packaging.md +++ b/docs/distribution/packaging.md @@ -42,7 +42,7 @@ My_Package │ ├── Helper.enso │ └── Util.enso └── visualization (optional) - └── + └── ``` ### The `src` Directory diff --git a/docs/parser/flexer.md b/docs/parser/flexer.md index 3d400fb510..75ecd5ec7d 100644 --- a/docs/parser/flexer.md +++ b/docs/parser/flexer.md @@ -86,6 +86,11 @@ deactivated by using `flexer::pop_state(state)` or from which they can inherit rules. This is fantastic for removing the need to repeat yourself when defining the lexer. 
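+
+As a sketch of how this looks (assuming the `Registry` API used by the lexer
+definitions elsewhere in this patch; `on_a` and `on_any` are hypothetical
+handlers on the lexer being defined), a group inherits from a parent by passing
+the parent's identifier when it is defined:
+
+```rust
+use flexer::automata::pattern::Pattern;
+use flexer::group::Registry;
+
+fn main() {
+    let mut registry = Registry::default();
+    // A parent group with no parent of its own.
+    let parent = registry.define_group("PARENT",None);
+    // A child group that inherits all of the parent's rules.
+    let child = registry.define_group("CHILD",Some(parent));
+    // Rules defined directly on the child take precedence over inherited ones.
+    registry.group_mut(child).create_rule(&Pattern::char('a'),"self.on_a(reader)");
+    registry.group_mut(parent).create_rule(&Pattern::any(),"self.on_any(reader)");
+}
+```
+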
+When inheriting rules from a parent group, the rules from the parent group are
+matched strictly _after_ the rules from the child group. This means that groups
+are able to selectively "override" the rules of their parents. Rules are still
+matched in order for each group's set of rules.
+
 ### Patterns
 
 Rules are defined to match _patterns_. Patterns are regular-grammar-like
diff --git a/docs/parser/lexer.md b/docs/parser/lexer.md
index 0c06bc4e67..c785e51643 100644
--- a/docs/parser/lexer.md
+++ b/docs/parser/lexer.md
@@ -15,11 +15,32 @@ identify blocks
 
 
 
+- [Lexer Architecture](#lexer-architecture)
+  - [Libraries in the Lexer Definition](#libraries-in-the-lexer-definition)
 - [Lexer Functionality](#lexer-functionality)
 - [The Lexer AST](#the-lexer-ast)
 
 
 
+## Lexer Architecture
+
+The structure of the flexer's code generation forces the lexer to be split into
+two parts: the definition, and the generation. As the latter is the point from
+which the lexer will be used, the second subproject is the one that is graced
+with the name `lexer`.
+
+### Libraries in the Lexer Definition
+
+The lexer generation subproject needs to be able to assume that all imports are
+in the same place (relative to the crate root). To this end, the definition
+subproject exports the public modules `library` and `prelude`. These are
+re-imported and used in the generation subproject to ensure that all components
+are found at the same paths relative to the crate root.
+
+This does mean, however, that anything the definition subproject uses from
+_within_ its own crate must be imported via the `library` module, rather than
+directly from the crate root.
+
 ## Lexer Functionality
 
 The lexer needs to provide the following functionality as part of the parser.
@@ -42,13 +63,21 @@ for use by the GUI.
 
 It contains the following constructs:
 
-- `Var`: Variable identifiers.
-- `Ref`: Referrent identifiers.
-- `Opr`: Operator identifiers.
-- `Number`: Numbers.
-- `Text`: Text.
-- `Invalid`: Invalid constructs that cannot be lexed.
+- `Referent`: Referent identifiers (e.g. `Some_Ref_Ident`).
+- `Variable`: Variable identifiers (e.g. `some_var_ident`).
+- `External`: External identifiers (e.g. `someJavaName`).
+- `Blank`: The blank name `_`.
+- `Operator`: Operator identifiers (e.g. `-->>`).
+- `Modifier`: Modifier operators (e.g. `+=`).
+- `Number`: Numbers (e.g. `16_FFFF`).
+- `DanglingBase`: An explicit base without an associated number (e.g. `16_`).
+- `Text`: Text (e.g. `"Some text goes here."`).
+- `Line`: A line in a block that contains tokens.
+- `BlankLine`: A line in a block that contains only whitespace.
 - `Block`: Syntactic blocks in the language.
+- `InvalidSuffix`: Tokens that would otherwise be valid, but are invalid in the
+  lexer's current state.
+- `Unrecognized`: Tokens that the lexer doesn't recognise.
 
 The distinction is made here between the various kinds of identifiers in order
 to keep lexing fast, but also in order to allow macros to switch on the kinds of
diff --git a/docs/parser/operator-resolution.md b/docs/parser/operator-resolution.md
index f7bcce5bae..e6d713cf1c 100644
--- a/docs/parser/operator-resolution.md
+++ b/docs/parser/operator-resolution.md
@@ -17,6 +17,24 @@ specific nodes on the AST.
 
 
 
+> The actionables for this section are:
+>
+> - Work out how to ensure that precedence and associativity aren't broken by
+>   the macro resolution phase.
+> - Work out how to handle the special case for `,`. We don't want comma to be
+>   subject to the variable precedence functionality, as the conventional
+>   spacing for lists is `[x, y, z]`, and that should be allowed without
+>   triggering variable precedence.
+> - Work out how to handle the special case for `-`. The expression `-n` should
+>   be treated as an application of the unary operator negate, while `- n`
+>   should be treated as part of a larger expression (e.g. a section,
+>   subtraction).
+> - As Enso has no syntactic marker for the introduction of a lambda, we need to
+>   have a special case for `->` so that it has appropriate precedence on its
+>   left and right sides. Ideally, `map.fold 0 $ y -> foo $ y` is resolved as
+>   `(map.fold 0) $ (y -> (foo $ y))`. This makes writing code much more
+>   natural.
+
 ## Resolution Algorithm
 
 The operator resolution process uses a version of the classic
diff --git a/docs/semantics/modules.md b/docs/semantics/modules.md
index 8f6447d40b..4cae6e1375 100644
--- a/docs/semantics/modules.md
+++ b/docs/semantics/modules.md
@@ -106,9 +106,9 @@ A qualified export statement only exports the name of the exported module
 In a `from` export, any mentioned items become available as though they were
 defined in the exporting module.
 
-Please note it is explicitly forbidden for export statements across modules
-to form a cycle. If export statements cycle is detected, a compile error will
-be reported.
+Please note it is explicitly forbidden for export statements across modules to
+form a cycle. If an export statement cycle is detected, a compile error will be
+reported.
 
 ## Project Main Module
 
diff --git a/docs/syntax/naming.md b/docs/syntax/naming.md
index a35727a3ba..0fc3fc1529 100644
--- a/docs/syntax/naming.md
+++ b/docs/syntax/naming.md
@@ -19,9 +19,11 @@ giving Enso code a uniform identity.
 
 
 - [Naming Constructs](#naming-constructs)
+  - [External Identifiers](#external-identifiers)
 - [Pattern Contexts](#pattern-contexts)
 - [Localised Naming](#localised-naming)
 - [Operator Naming](#operator-naming)
+  - [Modifier Operators](#modifier-operators)
 - [Reserved Names](#reserved-names)
 
 
@@ -69,6 +71,23 @@ Identifiers are introduced by:
 - Using them in a pattern matching context (free variables).
 - Using them in a type ascription (free variables).
 
+### External Identifiers
+
+As Enso has the ability to interface with many other programming languages in a
+highly-integrated fashion, it needs to be able to use naming styles from other
+languages natively. To do this, we have the concept of a _third_ kind of
+identifier, called the 'external' identifier.
+
+An external identifier is one that matches neither the variable nor the
+referent form described above, for example `someJavaName`. It is not an
+_exclusive_ category, however. Common styles of naming functions in Python, for
+example, will usually lex as variable identifiers.
+
+> The actionables for this section are:
+>
+> - Work out how and where to make a variable/referent distinction for external
+>   names.
+
 ## Pattern Contexts
 
 A pattern context is a span in the code where variable identifiers (as described
@@ -117,13 +136,19 @@ Operator names are those built solely from operator symbols (e.g. `+` or
 `<*>`). Operator symbols are defined as characters in the following set.
 
 ```
-!$%&*+-/<>?^~|:\,.()[]{}=
+;!$%&*+-/<>?^~|:\\=
 ```
 
 Please note that not every sequence that can be created from the above is a
 _valid_ operator name, as some may collide with built-in language constructs
 (e.g. `[` and `]`, which start and end a vector literal respectively).
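+
+As an illustration of how these symbols are consumed (a sketch mirroring the
+`operator_chars` pattern in the Rust lexer added by this patch), an operator
+name is simply one or more characters drawn from this set:
+
+```rust
+use flexer::automata::pattern::Pattern;
+
+fn main() {
+    // `=` is handled separately so that modifier operators can be detected.
+    let operator_char = Pattern::any_of(";!$%&*+-/<>?^~|:\\");
+    // An operator identifier is one or more operator characters.
+    let _operator_body = operator_char.many1();
+}
+```
+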
+### Modifier Operators + +Barring specially defined operators (`=`, `==`, `!=`, `#=`, `>=` and `<=`), any +operator that ends with an equals sign `=` is called a _modifier_ operator. +These will, in the future, have special treatment in the language. + ## Reserved Names Even though we do not intend to reserve any names at the level of the lexer or diff --git a/engine/runtime/src/test/resources/Cycle_Test/package.yaml b/engine/runtime/src/test/resources/Cycle_Test/package.yaml index 1239ba0c9a..3bb2b24a82 100644 --- a/engine/runtime/src/test/resources/Cycle_Test/package.yaml +++ b/engine/runtime/src/test/resources/Cycle_Test/package.yaml @@ -1,6 +1,6 @@ name: Cycle_Test version: 0.0.1 enso-version: 0.1.1-rc5 -license: '' -author: '' -maintainer: '' +license: "" +author: "" +maintainer: "" diff --git a/lib/rust/ast/src/ast.rs b/lib/rust/ast/src/ast.rs index a4b0d4d334..06b856d113 100644 --- a/lib/rust/ast/src/ast.rs +++ b/lib/rust/ast/src/ast.rs @@ -23,13 +23,13 @@ pub type AnyAst = Ast; #[derive(Debug,Clone)] pub struct Ast { /// A unique identifier. - uid: Option, + uid : Option, /// Length in number of chars of this ast node. - len: usize, + len : usize, /// The number of trailing spaces. - off: usize, + off : usize, /// The ast node itself. - ast: T, + ast : T, } // The set of all ast nodes. diff --git a/lib/rust/enso-logger/src/enabled.rs b/lib/rust/enso-logger/src/enabled.rs index cb85067ecb..095a62a211 100644 --- a/lib/rust/enso-logger/src/enabled.rs +++ b/lib/rust/enso-logger/src/enabled.rs @@ -40,7 +40,7 @@ impl Logger { } fn dec_indent(&self) { - self.indent.update(|t|t-1); + self.indent.update(|t|t.saturating_sub(1)); } } diff --git a/lib/rust/enso-optics/Cargo.toml b/lib/rust/enso-optics/Cargo.toml index 83d103ece9..b86d36685d 100644 --- a/lib/rust/enso-optics/Cargo.toml +++ b/lib/rust/enso-optics/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "optics" +name = "enso-optics" version = "0.1.0" authors = ["Enso Team "] edition = "2018" diff --git a/lib/rust/enso-prelude/src/data/non_empty_vec.rs b/lib/rust/enso-prelude/src/data/non_empty_vec.rs index 55319ad767..553f7946f5 100644 --- a/lib/rust/enso-prelude/src/data/non_empty_vec.rs +++ b/lib/rust/enso-prelude/src/data/non_empty_vec.rs @@ -131,6 +131,8 @@ impl NonEmptyVec { /// use enso_prelude::NonEmptyVec; /// let mut vec = NonEmptyVec::with_capacity(0, 10); /// assert_eq!(vec.capacity(),10); + /// vec.shrink_to_fit(); + /// assert!(vec.capacity() < 10); /// ``` pub fn shrink_to_fit(&mut self) { self.elems.shrink_to_fit(); @@ -165,9 +167,10 @@ impl NonEmptyVec { /// let mut vec = NonEmptyVec::new(0,vec![1]); /// assert!(vec.pop().is_some()); /// assert!(vec.pop().is_none()); + /// assert_eq!(vec.len(),1); /// ``` pub fn pop(&mut self) -> Option { - (self.len() != 1).and_option(self.elems.pop()) + (self.len() > 1).and_option_from(||self.elems.pop()) } /// Obtain a mutable reference to teh element in the vector at the specified `index`. diff --git a/lib/rust/enso-prelude/src/lib.rs b/lib/rust/enso-prelude/src/lib.rs index e6f8e7305d..f0f80f0411 100644 --- a/lib/rust/enso-prelude/src/lib.rs +++ b/lib/rust/enso-prelude/src/lib.rs @@ -3,12 +3,12 @@ //! defines several aliases and utils which may find their place in new //! libraries in the future. 
+#![feature(specialization)] #![feature(test)] -#![warn(unsafe_code)] +#![feature(trait_alias)] #![warn(missing_copy_implementations)] #![warn(missing_debug_implementations)] -#![feature(specialization)] -#![feature(trait_alias)] +#![warn(unsafe_code)] mod clone; mod collections; diff --git a/lib/rust/flexer-testing/definition/src/lib.rs b/lib/rust/flexer-testing/definition/src/lib.rs index d5dcbde041..cdb2acab04 100644 --- a/lib/rust/flexer-testing/definition/src/lib.rs +++ b/lib/rust/flexer-testing/definition/src/lib.rs @@ -6,8 +6,6 @@ #![warn(trivial_numeric_casts)] #![warn(unsafe_code)] #![warn(unused_import_braces)] -#![allow(unused_imports)] -#![allow(clippy::all)] //! This file contains the code defining a lexer for the following small language. Due to the way in //! which the code-generation from the flexer is used, it has to be defined in a separate crate from @@ -35,7 +33,6 @@ use flexer::automata::pattern::Pattern; use flexer::group::Registry; use flexer::prelude::logger::Disabled; use flexer::prelude::reader::BookmarkManager; -use flexer::prelude::reader::decoder::DecoderUTF8; @@ -128,12 +125,10 @@ impl TestLexer { } } -/// Implementations of functionality used by the lexer. -/// -/// These functions are provided by the user, by hand, and must all take a reader. -#[allow(missing_docs)] +/// Rules for the root state. +#[allow(dead_code,missing_docs)] impl TestLexer { - pub fn on_first_word(&mut self, _reader:&mut R) { + fn on_first_word(&mut self, _reader:&mut R) { let str = self.current_match.clone(); let ast = Token::Word(str); self.output.push(ast); @@ -141,28 +136,65 @@ impl TestLexer { self.push_state(id); } - pub fn on_spaced_word(&mut self, _reader:&mut R) { + fn on_err_suffix_first_word(&mut self, _reader:&mut R) { + let ast = Token::Unrecognized(self.current_match.clone()); + self.output.push(ast); + } + + fn on_no_err_suffix_first_word(&mut self, _reader:&mut R) {} + + fn rules_in_root(lexer:&mut TestLexer) { + let a_word = Pattern::char('a').many1(); + let b_word = Pattern::char('b').many1(); + let any = Pattern::any(); + let end = Pattern::eof(); + + let root_group_id = lexer.initial_state; + let root_group = lexer.groups_mut().group_mut(root_group_id); + + root_group.create_rule(&a_word,"self.on_first_word(reader)"); + root_group.create_rule(&b_word,"self.on_first_word(reader)"); + root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)"); + root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)"); + } +} + +/// Rules for the "seen first word" state. 
+#[allow(dead_code,missing_docs)] +impl TestLexer { + fn on_spaced_word(&mut self, _reader:&mut R) { let str = self.current_match.clone(); let ast = Token::Word(String::from(str.trim())); self.output.push(ast); } - pub fn on_err_suffix_first_word(&mut self, _reader:&mut R) { - let ast = Token::Unrecognized(self.current_match.clone()); - self.output.push(ast); - } - - pub fn on_err_suffix(&mut self, reader:&mut R) { + fn on_err_suffix(&mut self, reader:&mut R) { self.on_err_suffix_first_word(reader); self.pop_state(); } - pub fn on_no_err_suffix_first_word(&mut self, _reader:&mut R) {} - - pub fn on_no_err_suffix(&mut self, reader:&mut R) { + fn on_no_err_suffix(&mut self, reader:&mut R) { self.on_no_err_suffix_first_word(reader); self.pop_state(); } + + fn rules_in_seen_first_word(lexer:&mut TestLexer) { + let a_word = Pattern::char('a').many1(); + let b_word = Pattern::char('b').many1(); + let space = Pattern::char(' '); + let spaced_a_word = &space >> &a_word; + let spaced_b_word = &space >> &b_word; + let any = Pattern::any(); + let end = Pattern::eof(); + + let seen_first_word_group_id = lexer.seen_first_word_state; + let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id); + + seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)"); + seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)"); + seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)"); + seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)"); + } } @@ -172,27 +204,8 @@ impl flexer::Definition for TestLexer { fn define() -> Self { let mut lexer = TestLexer::new(); - let a_word = Pattern::char('a').many1(); - let b_word = Pattern::char('b').many1(); - let space = Pattern::char(' '); - let spaced_a_word = &space >> &a_word; - let spaced_b_word = &space >> &b_word; - let any = Pattern::any(); - let end = Pattern::eof(); - - let root_group_id = lexer.initial_state; - let root_group = lexer.groups_mut().group_mut(root_group_id); - root_group.create_rule(&a_word,"self.on_first_word(reader)"); - root_group.create_rule(&b_word,"self.on_first_word(reader)"); - root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)"); - root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)"); - - let seen_first_word_group_id = lexer.seen_first_word_state; - let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id); - seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)"); - seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)"); - seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)"); - seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)"); + TestLexer::rules_in_seen_first_word(&mut lexer); + TestLexer::rules_in_root(&mut lexer); lexer } @@ -200,6 +213,16 @@ impl flexer::Definition for TestLexer { fn groups(&self) -> &Registry { self.lexer.groups() } + + fn set_up(&mut self) {} + + fn tear_down(&mut self) {} +} + +impl Default for TestLexer { + fn default() -> Self { + TestLexer::new() + } } @@ -225,7 +248,7 @@ pub struct TestState { // === Trait Impls === impl flexer::State for TestState { - fn new() -> Self { + fn new(_logger:&impl AnyLogger) -> Self { let mut lexer_states = group::Registry::default(); let initial_state = lexer_states.define_group("ROOT",None); let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); diff --git 
a/lib/rust/flexer-testing/generation/tests/test_generated_lexer.rs b/lib/rust/flexer-testing/generation/tests/flexer_generated_lexer.rs similarity index 100% rename from lib/rust/flexer-testing/generation/tests/test_generated_lexer.rs rename to lib/rust/flexer-testing/generation/tests/flexer_generated_lexer.rs diff --git a/lib/rust/flexer/src/automata/nfa.rs b/lib/rust/flexer/src/automata/nfa.rs index bdf09c72e2..2c970b1441 100644 --- a/lib/rust/flexer/src/automata/nfa.rs +++ b/lib/rust/flexer/src/automata/nfa.rs @@ -112,7 +112,8 @@ impl NFA { self.connect(state,end); } end - } + }, + Pattern::Always => current, } } diff --git a/lib/rust/flexer/src/automata/pattern.rs b/lib/rust/flexer/src/automata/pattern.rs index 75380d7553..2ea3aafb6c 100644 --- a/lib/rust/flexer/src/automata/pattern.rs +++ b/lib/rust/flexer/src/automata/pattern.rs @@ -29,19 +29,21 @@ pub enum Pattern { /// The pattern that triggers when a sequence of patterns is encountered. Seq(Vec), /// The pattern that triggers on 0..N repetitions of given pattern. - Many(Box) + Many(Box), + /// The pattern that always triggers. + Always, } impl Pattern { /// A pattern that never triggers. pub fn never() -> Self { - Pattern::symbols(Symbol::from(1)..=Symbol::from(0)) + Pattern::symbol(Symbol::INVALID_SYMBOL) } /// A pattern that always triggers pub fn always() -> Self { - Pattern::symbols(Symbol::from(u32::min_value())..=Symbol::from(u32::max_value())) + Pattern::Always } /// A pattern that triggers on any character. @@ -50,18 +52,18 @@ impl Pattern { } /// A pattern that triggers on 0..N repetitions of the pattern described by `self`. - pub fn many(self) -> Self { - Many(Box::new(self)) + pub fn many(&self) -> Self { + Many(Box::new(self.clone())) } /// A pattern that triggers on 1..N repetitions of the pattern described by `self`. - pub fn many1(self) -> Self { + pub fn many1(&self) -> Self { self.clone() >> self.many() } /// A pattern that triggers on 0..=1 repetitions of the pattern described by `self`. - pub fn opt(self) -> Self { - self | Self::always() + pub fn opt(&self) -> Self { + self.clone() | Self::always() } /// A pattern that triggers on the given character. @@ -91,7 +93,12 @@ impl Pattern { /// Pattern that triggers when sequence of characters given by `chars` is encountered. pub fn all_of(chars:&str) -> Self { - chars.chars().fold(Self::never(),|pat,char| pat >> Self::char(char)) + let mut chars_iter = chars.chars(); + if let Some(first) = chars_iter.next() { + chars_iter.fold(Self::char(first),|pat, char| pat >> Self::char(char)) + } else { + Pattern::never() + } } /// The pattern that triggers on any characters contained in `chars`. @@ -105,11 +112,12 @@ impl Pattern { let char_iter = chars.chars().map(|char| char as u32); let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max)); let mut codes = char_iter2.collect_vec(); - codes.sort(); - codes.iter().tuple_windows().fold(Self::never(),|pat,(start,end)| { + codes.iter().tuple_windows().fold(Self::never(),|pat,(prev_code,next_code)| { + let start = prev_code + 1; + let end = next_code - 1; if end < start {pat} else { - pat | Pattern::symbols(Symbol::from(*start)..=Symbol::from(*end)) + pat | Pattern::symbols(Symbol::from(start)..=Symbol::from(end)) } }) } @@ -158,3 +166,29 @@ impl Shr for Pattern { } } gen_ref_versions!(Pattern,Shr,shr); + + + +// ================= +// === Utilities === +// ================= + +/// Quote a character as a character pattern. +/// +/// It is equivalent to `Pattern::char(...)`. +#[macro_export] +macro_rules! 
c { + ($char:literal) => { + Pattern::char($char) + } +} + +/// Quote a string as a literal pattern. +/// +/// It is equivalent to `Pattern::all_of(...)`. +#[macro_export] +macro_rules! l { + ($lit:literal) => { + Pattern::all_of($lit) + } +} diff --git a/lib/rust/flexer/src/automata/symbol.rs b/lib/rust/flexer/src/automata/symbol.rs index 295ee369cf..3b6f90a8dc 100644 --- a/lib/rust/flexer/src/automata/symbol.rs +++ b/lib/rust/flexer/src/automata/symbol.rs @@ -14,11 +14,14 @@ pub struct Symbol { } impl Symbol { - /// A representation of the end of the file. - pub const EOF_CODE:Symbol = Symbol{value:u32::max_value()}; - /// A representation of the null symbol. pub const NULL:Symbol = Symbol{value:0}; + /// A representation of the end of the file. + pub const EOF_CODE:Symbol = Symbol{value:u32::max_value()}; + /// A representation of an arbitrary invalid unicode symbol. + pub const INVALID_SYMBOL:Symbol = Symbol{value:0xFFFF}; + /// A representation of the group reaching its end without matching. + pub const INCOMPLETE_GROUP:Symbol = Symbol{value:u32::max_value() - 1}; } diff --git a/lib/rust/flexer/src/generate.rs b/lib/rust/flexer/src/generate.rs index 4abc2e79b6..e5e6fa6467 100644 --- a/lib/rust/flexer/src/generate.rs +++ b/lib/rust/flexer/src/generate.rs @@ -33,8 +33,8 @@ use crate as flexer; /// overhead. pub fn specialize ( definition : &impl flexer::State -, state_type_name : impl Into -, output_type_name : impl Into +, state_type_name : impl Str +, output_type_name : impl Str ) -> Result { let group_registry = definition.groups(); let mut body_items = Vec::new(); @@ -59,7 +59,7 @@ pub fn wrap_in_impl_for ) -> Result { let state_name:Ident = str_to_ident(state_name.into().as_str())?; let mut tree:ItemImpl = parse_quote! { - #[allow(missing_docs,dead_code)] + #[allow(missing_docs,dead_code,clippy::all)] impl #state_name {} }; tree.items.extend(body); @@ -68,14 +68,15 @@ pub fn wrap_in_impl_for /// Generate the `run` function for the specialized lexer. /// -/// This function is what the user of the lexer will call -pub fn run_function(output_type_name:impl Into) -> Result { - let output_type_name:Ident = str_to_ident(output_type_name)?; - let tree:ImplItem = parse_quote! { +/// This function is what the user of the lexer will call to begin execution. +pub fn run_function(output_type_name:impl Str) -> Result { + let output_type_name = str_to_path(output_type_name)?; + let tree:ImplItem = parse_quote! { pub fn run(&mut self, mut reader:R) -> LexingResult<#output_type_name> { + self.set_up(); reader.advance_char(&mut self.bookmarks); while self.run_current_state(&mut reader) == StageStatus::ExitSuccess {} - match self.status { + let result = match self.status { StageStatus::ExitFinished => LexingResult::success( mem::take(&mut self.output) ), @@ -83,7 +84,9 @@ pub fn run_function(output_type_name:impl Into) -> Result LexingResult::partial(mem::take(&mut self.output)) - } + }; + self.tear_down(); + result } }; Ok(tree) @@ -94,16 +97,19 @@ pub fn run_current_state_function() -> ImplItem { let tree:ImplItem = parse_quote! { fn run_current_state(&mut self, reader:&mut R) -> StageStatus { self.status = StageStatus::Initial; + let mut finished = false; // Runs until reaching a state that no longer says to continue. 
while let Some(next_state) = self.status.continue_as() { - self.logger.info(||format!("Current character is {:?}.",reader.character())); - self.logger.info(||format!("Continuing in {:?}.",next_state)); + self.logger.debug(||format!("Current character is {:?}.",reader.character().char)); + self.logger.debug(||format!("Continuing in {:?}.",next_state)); self.status = self.step(next_state,reader); - if reader.finished() { + if finished && reader.finished(self.bookmarks()) { + self.logger.info("Input finished."); self.status = StageStatus::ExitFinished } + finished = reader.character().is_eof(); if self.status.should_continue() { match reader.character().char { @@ -111,6 +117,9 @@ pub fn run_current_state_function() -> ImplItem { reader.append_result(char); self.logger.info(||format!("Result is {:?}.",reader.result())); }, + Err(flexer::prelude::reader::Error::EOF) => { + self.logger.info("Reached EOF."); + }, Err(flexer::prelude::reader::Error::EndOfGroup) => { let current_state = self.current_state(); let group_name = self.groups().group(current_state).name.as_str(); @@ -439,6 +448,8 @@ pub enum GenError { BadExpression(String), /// The provided string is not a valid rust literal. BadLiteral(String), + /// The provided string is not a valid rust path. + BadPath(String), } @@ -453,6 +464,7 @@ impl Display for GenError { GenError::BadIdentifier(str) => write!(f,"`{}` is not a valid rust identifier.",str), GenError::BadExpression(str) => write!(f,"`{}` is not a valid rust expression.",str), GenError::BadLiteral(str) => write!(f,"`{}` is not a valid rust literal.",str), + GenError::BadPath(str) => write!(f,"`{}` is not a valid rust path.",str), } } } @@ -512,12 +524,13 @@ impl Into for Branch { // ================= /// Convert a string to an identifier. -pub fn str_to_ident(str:impl Into) -> Result { - let string = str.into(); - match parse_str(string.as_ref()) { - Ok(literal) => Ok(literal), - Err(_) => Err(GenError::BadIdentifier(string)) - } +pub fn str_to_ident(str:impl Str) -> Result { + parse_str(str.as_ref()).map_err(|_| GenError::BadIdentifier(str.into())) +} + +/// Convert a string to a path. +pub fn str_to_path(str:impl Str) -> Result { + parse_str(str.as_ref()).map_err(|_| GenError::BadPath(str.into())) } /// Convert the syntax tree into a string. 
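As a quick illustration of the `str_to_ident`/`str_to_path` distinction above (a
sketch using `syn`, on which the generator already relies for `parse_str`), a
qualified output type such as `token::Stream` parses as a `Path` but not as an
`Ident`:

```rust
use syn::{Ident, Path};

fn main() {
    // A bare name is a valid identifier and also a valid path.
    assert!(syn::parse_str::<Ident>("Token").is_ok());
    // A qualified name is only a valid path.
    assert!(syn::parse_str::<Ident>("token::Stream").is_err());
    assert!(syn::parse_str::<Path>("token::Stream").is_ok());
    // Arbitrary prose is neither, which is what surfaces as `GenError::BadPath`.
    assert!(syn::parse_str::<Path>("Bad output name").is_err());
}
```

This is why the generated `run` function now accepts any type path for the
lexer's output type rather than a bare identifier.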
diff --git a/lib/rust/flexer/src/group.rs b/lib/rust/flexer/src/group.rs index 5b615cafc9..630bafb0af 100644 --- a/lib/rust/flexer/src/group.rs +++ b/lib/rust/flexer/src/group.rs @@ -5,6 +5,8 @@ use crate::automata::pattern::Pattern; use crate::group::rule::Rule; use itertools::Itertools; +use std::fmt::Display; +use wasm_bindgen::__rt::core::fmt::Formatter; pub mod rule; @@ -234,6 +236,12 @@ impl Into for Group { } } +impl Display for Group { + fn fmt(&self, f:&mut Formatter<'_>) -> std::fmt::Result { + write!(f,"Group {}",self.name) + } +} + // ============= @@ -289,11 +297,12 @@ pub mod tests { fn complex_rules(count:usize) -> Registry { let mut group = Group::default(); for ix in 0..count { - let string = ix.to_string(); - let all = Pattern::all_of(&string); - let any = Pattern::any_of(&string); - let none = Pattern::none_of(&string); - let pattern = Pattern::many(all >> any >> none); + let string = ix.to_string(); + let all = Pattern::all_of(&string); + let any = Pattern::any_of(&string); + let none = Pattern::none_of(&string); + let all_any_none = all >> any >> none; + let pattern = Pattern::many(&all_any_none); group.add_rule(Rule::new(pattern.clone(),"")); } group.into() diff --git a/lib/rust/flexer/src/lib.rs b/lib/rust/flexer/src/lib.rs index fe2b31b2cb..c66c1a850b 100644 --- a/lib/rust/flexer/src/lib.rs +++ b/lib/rust/flexer/src/lib.rs @@ -154,6 +154,7 @@ //! use flexer::generate; //! # use flexer::group; //! use flexer::generate::GenError; +//! use flexer::prelude::AnyLogger; //! # use flexer::prelude::reader::BookmarkManager; //! # use flexer::State; //! # @@ -195,7 +196,7 @@ //! # } //! //! impl flexer::State for LexerState { -//! fn new() -> Self { +//! fn new(_logger:&impl AnyLogger) -> Self { //! // Here we construct all of the elements needed for our lexer state. This function can //! // contain arbitrarily complex logic and is only called once at initialization time. //! let mut lexer_states = group::Registry::default(); @@ -251,6 +252,7 @@ //! # use flexer::generate; //! # use flexer::group; //! # use flexer::prelude::GenError; +//! # use flexer::prelude::AnyLogger; //! use flexer::prelude::logger::Disabled; //! # use flexer::prelude::reader::BookmarkManager; //! # use flexer::State; @@ -295,7 +297,7 @@ //! # } //! # //! # impl flexer::State for LexerState { -//! # fn new() -> Self { +//! # fn new(_logger:&impl AnyLogger) -> Self { //! # // Here we construct all of the elements needed for our lexer state. This function can //! # // contain arbitrarily complex logic and is only called once at initialization time. //! # let mut lexer_states = group::Registry::default(); @@ -351,7 +353,7 @@ //! # use flexer::Flexer; //! # use flexer::generate; //! # use flexer::group; -//! use flexer::prelude::AnyLogger; +//! # use flexer::prelude::AnyLogger; //! # use flexer::prelude::GenError; //! # use flexer::prelude::logger::Disabled; //! # use flexer::prelude::reader::BookmarkManager; @@ -397,7 +399,7 @@ //! # } //! # //! # impl flexer::State for LexerState { -//! # fn new() -> Self { +//! # fn new(_logger:&impl AnyLogger) -> Self { //! # // Here we construct all of the elements needed for our lexer state. This function can //! # // contain arbitrarily complex logic and is only called once at initialization time. //! # let mut lexer_states = group::Registry::default(); @@ -504,7 +506,7 @@ //! # } //! # //! # impl flexer::State for LexerState { -//! # fn new() -> Self { +//! # fn new(_logger:&impl AnyLogger) -> Self { //! 
# // Here we construct all of the elements needed for our lexer state. This function can //! # // contain arbitrarily complex logic and is only called once at initialization time. //! # let mut lexer_states = group::Registry::default(); @@ -638,7 +640,7 @@ //! # } //! # //! # impl flexer::State for LexerState { -//! # fn new() -> Self { +//! # fn new(_logger:&impl AnyLogger) -> Self { //! # // Here we construct all of the elements needed for our lexer state. This function can //! # // contain arbitrarily complex logic and is only called once at initialization time. //! # let mut lexer_states = group::Registry::default(); @@ -740,6 +742,12 @@ //! fn groups(&self) -> &Registry { //! self.lexer.groups() //! } +//! +//! /// Code you want to run before lexing begins. +//! fn set_up(&mut self) {} +//! +//! /// Code you want to run after lexing finishes. +//! fn tear_down(&mut self) {} //! } //! ``` //! @@ -820,7 +828,7 @@ //! # } //! # //! # impl flexer::State for LexerState { -//! # fn new() -> Self { +//! # fn new(_logger:&impl AnyLogger) -> Self { //! # // Here we construct all of the elements needed for our lexer state. This function can //! # // contain arbitrarily complex logic and is only called once at initialization time. //! # let mut lexer_states = group::Registry::default(); @@ -922,6 +930,12 @@ //! # fn groups(&self) -> &Registry { //! # self.lexer.groups() //! # } +//! # +//! # /// Code you want to run before lexing begins. +//! # fn set_up(&mut self) {} +//! # +//! # /// Code you want to run after lexing finishes. +//! # fn tear_down(&mut self) {} //! # } //! //! impl Lexer { @@ -1000,6 +1014,7 @@ //! of lexing languages of a high complexity. use crate::prelude::*; +use prelude::logger::*; use crate::generate::GenError; use prelude::logger::AnyLogger; @@ -1085,7 +1100,7 @@ where Definition : State, let logger = ::sub(&parent_logger,"Flexer"); let status = default(); let output = default(); - let definition = Definition::new(); + let definition = Definition::new(&logger); let initial_state_id = definition.initial_state(); let mut state_stack = NonEmptyVec::singleton(initial_state_id); let current_match = default(); @@ -1098,7 +1113,7 @@ where Definition : State, impl Flexer where Definition : State, Output : Clone, - Logger : AnyLogger { + Logger : AnyLogger { /// Get the lexer result. pub fn result(&mut self) -> &Output { &self.output @@ -1116,7 +1131,9 @@ where Definition : State, /// Tell the lexer to enter the state described by `state`. pub fn push_state(&mut self, state:group::Identifier) { - self.logger.info(||format!("Pushing state {:?}",state)); + self.logger.group_begin( + ||format!("Enter State: {}",self.groups().group(state).name.as_str()) + ); self.state_stack.push(state); } @@ -1125,21 +1142,36 @@ where Definition : State, /// It will never end the initial state of the lexer. pub fn pop_state(&mut self) -> Option { let result = self.state_stack.pop(); - self.logger.info(||format!("Popped state {:?}",result)); + match result { + None => (), + Some(ident) => debug!(self.logger,"Leave State: {self.groups().group(ident)}"), + }; + self.logger.group_end(); result } /// End states until the specified `state` is reached, leaving the lexer in `state`. /// /// If `state` does not exist on the lexer's stack, then the lexer will be left in the root - /// state. 
- pub fn pop_states_until(&mut self, state:group::Identifier) -> Vec { - let non_opt_root_state_position = - self.state_stack.iter().positions(|elem| *elem == state).last().unwrap_or(0); - let range = (non_opt_root_state_position + 1)..self.state_stack.len(); - let states = self.state_stack.drain(range).collect(); - self.logger.info(||format!("Popped states {:?}",states)); - states + /// state. Additionally, this function cannot pop the final occurrence of the root state. + pub fn pop_states_until(&mut self, state:group::Identifier) -> group::Identifier { + while self.current_state() != state && self.current_state() != self.initial_state() { + self.pop_state(); + } + *self.state_stack.last() + } + + /// End states up to and including the first instance of `state`, returning the identifier of + /// the new state the lexer is in. + /// + /// If `state` does not exist on the lexer's stack, the lexer will be left in the root state. + /// Additionally, this function cannot pop the final occurrence of the root state. + pub fn pop_states_including(&mut self, state:group::Identifier) -> group::Identifier { + while self.current_state() != state && self.current_state() != self.initial_state() { + self.pop_state(); + } + self.pop_state(); + *self.state_stack.last() } /// Check if the lexer is currently in the state described by `state`. @@ -1309,7 +1341,7 @@ pub trait State { /// Create a new instance of the lexer's state. /// /// This function is guaranteed to be called at most once per run of the lexer. - fn new() -> Self; + fn new(parent_logger:&impl AnyLogger) -> Self; /// Return the _initial_ lexing state. fn initial_state(&self) -> group::Identifier; /// Return a reference to the group registry for a given lexer. @@ -1339,4 +1371,8 @@ pub trait Definition { fn define() -> Self; /// Obtain the registry of groups for the lexer. fn groups(&self) -> &group::Registry; + /// Run before any lexing takes place. + fn set_up(&mut self); + /// Run after lexing has completed. 
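+    ///
+    /// This is the natural place to clean up any state that `set_up` created.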
+ fn tear_down(&mut self); } diff --git a/lib/rust/flexer/tests/test_invalid_definitions.rs b/lib/rust/flexer/tests/flexer_invalid_definitions.rs similarity index 93% rename from lib/rust/flexer/tests/test_invalid_definitions.rs rename to lib/rust/flexer/tests/flexer_invalid_definitions.rs index aa56678f09..ff432d3f39 100644 --- a/lib/rust/flexer/tests/test_invalid_definitions.rs +++ b/lib/rust/flexer/tests/flexer_invalid_definitions.rs @@ -53,7 +53,7 @@ pub struct LexerState { initial_state:group::Identifier, } impl flexer::State for LexerState { - fn new() -> Self { + fn new(_logger:&impl AnyLogger) -> Self { let mut lexer_states = group::Registry::default(); let initial_state = lexer_states.define_group("ROOT",None); LexerState{lexer_states,initial_state} @@ -143,6 +143,14 @@ impl flexer::Definition for Lexer1 { fn groups(&self) -> &Registry { self.lexer.groups() } + + fn set_up(&mut self) { + unimplemented!() + } + + fn tear_down(&mut self) { + unimplemented!() + } } #[test] @@ -204,6 +212,14 @@ impl flexer::Definition for Lexer2 { fn groups(&self) -> &Registry { self.lexer.groups() } + + fn set_up(&mut self) { + unimplemented!() + } + + fn tear_down(&mut self) { + unimplemented!() + } } #[test] @@ -268,6 +284,14 @@ impl flexer::Definition for Lexer3 { fn groups(&self) -> &Registry { self.lexer.groups() } + + fn set_up(&mut self) { + unimplemented!() + } + + fn tear_down(&mut self) { + unimplemented!() + } } pub struct LexerState1 { @@ -275,7 +299,7 @@ pub struct LexerState1 { initial_state:group::Identifier, } impl flexer::State for LexerState1 { - fn new() -> Self { + fn new(_logger:&impl AnyLogger) -> Self { let mut lexer_states = group::Registry::default(); let initial_state = lexer_states.define_group("ROOT",None); LexerState1 {lexer_states,initial_state} @@ -366,6 +390,14 @@ impl flexer::Definition for Lexer4 { fn groups(&self) -> &Registry { self.lexer.groups() } + + fn set_up(&mut self) { + unimplemented!() + } + + fn tear_down(&mut self) { + unimplemented!() + } } pub struct LexerState2 { @@ -373,7 +405,7 @@ pub struct LexerState2 { initial_state:group::Identifier, } impl flexer::State for LexerState2 { - fn new() -> Self { + fn new(_logger:&impl AnyLogger) -> Self { let mut lexer_states = group::Registry::default(); let initial_state = lexer_states.define_group("ROOT",None); LexerState2 {lexer_states,initial_state} @@ -410,5 +442,5 @@ pub fn test_bad_output_name() { let result = lexer.specialize(); assert!(result.is_err()); let message = result.unwrap_err().to_string(); - assert_eq!(message,"`Bad output name` is not a valid rust identifier."); + assert_eq!(message,"`Bad output name` is not a valid rust path."); } diff --git a/lib/rust/lazy-reader/src/decoder.rs b/lib/rust/lazy-reader/src/decoder.rs index 67dc9274b8..63171783f5 100644 --- a/lib/rust/lazy-reader/src/decoder.rs +++ b/lib/rust/lazy-reader/src/decoder.rs @@ -39,6 +39,17 @@ pub struct Char { pub size: usize, } +impl Char { + /// Check if the character represents the end of file. 
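+    ///
+    /// This is true only when the decoder has reported `Error::EOF` for the current
+    /// character.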
+    pub fn is_eof(&self) -> bool {
+        match self.char {
+            Ok(_)                  => false,
+            Err(crate::Error::EOF) => true,
+            Err(_)                 => false
+        }
+    }
+}
+
 
 
 // =====================
diff --git a/lib/rust/lazy-reader/src/lib.rs b/lib/rust/lazy-reader/src/lib.rs
index 5effa28351..7609fb7d7a 100644
--- a/lib/rust/lazy-reader/src/lib.rs
+++ b/lib/rust/lazy-reader/src/lib.rs
@@ -14,9 +14,10 @@ pub mod decoder;
 
 use enso_prelude::*;
 
+use crate::decoder::Char;
+use crate::decoder::InvalidChar;
 use decoder::Decoder;
-use crate::decoder::{Char, InvalidChar};
-use crate::Error::EOF;
+
 
 
 // ============
@@ -79,10 +80,10 @@ pub enum Error {
 impl Error {
     /// The `u32` value that corresponds to EOF.
     pub const END_OF_FILE:u32 = u32::max_value();
-    /// The `u32` value that corresponds to an invalid character.
-    pub const INVALID_CHAR:u32 = u32::max_value() - 1;
+    /// The `u32` value that corresponds to an invalid unicode character.
+    pub const INVALID_CHAR:u32 = 0xFFFF;
     /// The `u32` value corresponding to the end of group.
-    pub const END_OF_GROUP:u32 = u32::max_value() - 2;
+    pub const END_OF_GROUP:u32 = u32::max_value() - 1;
 }
 
 
@@ -145,7 +146,10 @@ pub trait LazyReader {
     /// Get the current character from the reader.
     fn character(&self) -> decoder::Char;
     /// Check if the reader has finished reading.
-    fn finished(&self) -> bool;
+    ///
+    /// A reader is finished when it has no further input left to read, and when it does not need to
+    /// rewind to any point.
+    fn finished(&self, bookmarks:&BookmarkManager) -> bool;
     /// Check if the reader is empty.
     fn empty(&self) -> bool;
     /// Fill the buffer with words from the input.
@@ -240,8 +244,9 @@ impl> LazyReader for Reader {
         self.character
     }
 
-    fn finished(&self) -> bool {
-        self.empty() && self.character.char == Err(EOF)
+    fn finished(&self, bookmarks:&BookmarkManager) -> bool {
+        let rewinded = self.max_possible_rewind_len(bookmarks) != 0;
+        self.empty() && rewinded
     }
 
     fn empty(&self) -> bool {
diff --git a/lib/rust/lexer/definition/Cargo.toml b/lib/rust/lexer/definition/Cargo.toml
new file mode 100644
index 0000000000..14b8bc653d
--- /dev/null
+++ b/lib/rust/lexer/definition/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "lexer-definition"
+version = "0.1.0"
+authors = ["Enso Team <contact@enso.org>"]
+edition = "2018"
+
+publish = false
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+test = true
+bench = true
+
+[dependencies]
+flexer = { path = "../../flexer", version = "0.1.0" }
+enso-prelude = { path = "../../enso-prelude", version = "0.1.0" }
+
+uuid = { version = "0.8.1" , features = ["serde","v4","wasm-bindgen"] }
diff --git a/lib/rust/lexer/definition/src/lexer.rs b/lib/rust/lexer/definition/src/lexer.rs
new file mode 100644
index 0000000000..f92c1c091a
--- /dev/null
+++ b/lib/rust/lexer/definition/src/lexer.rs
@@ -0,0 +1,1159 @@
+//! This module contains the definition of the lexer for the Enso programming language.
+
+use crate::prelude::*;
+use flexer::*;
+
+use crate::library::token::BlockType;
+use crate::library::token::Token;
+use crate::library::token;
+
+use flexer::automata::pattern::Pattern;
+use flexer::group::Group;
+use flexer::group::Registry;
+use flexer::prelude::logger::Disabled;
+use flexer::prelude::reader;
+use flexer::State as FlexerState;
+use flexer;
+use std::collections::VecDeque;
+use std::cmp::Ordering;
+
+
+
+// ====================
+// === Type Aliases ===
+// ====================
+
+type Logger = Disabled;
+type Flexer = flexer::Flexer<State<Logger>,token::Stream,Logger>;
+
+
+
+// ==================
+// === Enso Lexer ===
+// ==================
+
+/// The Enso lexer.
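+///
+/// This is a newtype over the generic `Flexer` engine, instantiated with the lexer's
+/// `State` and producing a `token::Stream` as its output.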
+#[derive(Debug)]
+pub struct EnsoLexer(Flexer);
+
+impl Deref for EnsoLexer {
+    type Target = Flexer;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl DerefMut for EnsoLexer {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+/// Functions for working with the lexer.
+impl EnsoLexer {
+    /// Construct a new instance of the Enso lexer.
+    pub fn new() -> Self {
+        let logger = Logger::new("EnsoLexer");
+        let lexer  = Flexer::new(logger);
+        EnsoLexer(lexer)
+    }
+}
+
+
+// === Result Functionality ===
+
+impl EnsoLexer {
+    /// Push the current token stream onto the stack.
+    pub fn push_tokens(&mut self) {
+        let current_stream = mem::take(&mut self.output);
+        debug!(self.logger,"Push Tokens: {&current_stream:?}");
+        self.tokens_stack.push(current_stream);
+    }
+
+    /// Pop the top token stream from the stack and make it current.
+    pub fn pop_tokens(&mut self) {
+        let popped = self.tokens_stack.pop().unwrap_or_default();
+        debug!(self.logger,"Pop Tokens: {&popped:?}");
+        self.output = popped;
+    }
+
+    /// Append the provided `token` to the lexer output.
+    pub fn append_token(&mut self, token:Token) {
+        debug!(self.logger,"Append: {&token:?}");
+        self.output.append(token);
+    }
+
+    /// Get a reference to the last token in the current lexer output.
+    pub fn last_token(&mut self) -> Option<&Token> {
+        self.output.last()
+    }
+
+    /// Consume the currently active stream of tokens.
+    pub fn consume_tokens(&mut self) -> token::Stream {
+        mem::take(&mut self.output)
+    }
+
+    /// Consume the current match and replace it with the empty string.
+    pub fn consume_current(&mut self) -> String {
+        debug!(self.logger,"Consume: {self.current_match:?}");
+        mem::take(&mut self.current_match)
+    }
+
+    /// Discard the current match and replace it with the empty string.
+    pub fn discard_current(&mut self) {
+        debug!(self.logger,"Discard: {self.current_match:?}");
+        self.current_match = default();
+    }
+}
+
+
+// === Common Patterns ===
+
+/// Basic character classification.
+#[allow(dead_code)]
+impl EnsoLexer {
+    /// Match lower-case ASCII letters.
+    fn lower_ascii_letter() -> Pattern {
+        Pattern::range('a'..='z')
+    }
+
+    /// Match upper-case ASCII letters.
+    fn upper_ascii_letter() -> Pattern {
+        Pattern::range('A'..='Z')
+    }
+
+    /// Match ASCII digits.
+    fn ascii_digit() -> Pattern {
+        Pattern::range('0'..='9')
+    }
+
+    /// Match ASCII letters.
+    fn ascii_letter() -> Pattern {
+        EnsoLexer::lower_ascii_letter() | EnsoLexer::upper_ascii_letter()
+    }
+
+    /// Match ASCII alphanumeric characters.
+    fn ascii_alpha_num() -> Pattern {
+        EnsoLexer::ascii_digit() | EnsoLexer::ascii_letter()
+    }
+
+    /// Match at least one ASCII space character.
+    fn spaces() -> Pattern {
+        Pattern::char(' ').many1()
+    }
+
+    /// Match a newline.
+    ///
+    /// This matches both Unix (LF) and Windows (CRLF) styles of newlines. This is particularly
+    /// important so as not to result in incorrect spans on Windows clients.
+    fn newline() -> Pattern {
+        Pattern::char('\n') | Pattern::all_of("\r\n")
+    }
+
+    /// The allowable group characters in Enso.
+    fn group_chars() -> String {
+        String::from("()[]{}")
+    }
+
+    /// The allowable operator characters in Enso.
+    fn operator_chars() -> String {
+        String::from(";!$%&*+-/<>?^~|:\\")
+    }
+
+    /// The characters that break tokens in Enso.
+    fn whitespace_break_chars() -> String {
+        String::from("\t\r\n")
+    }
+
+    /// The characters that break token lexing in Enso.
+    fn break_chars() -> String {
+        let mut break_chars = String::from("`@#,. ");
+        break_chars.push_str(&Self::operator_chars());
+        break_chars.push_str(&Self::whitespace_break_chars());
+        break_chars.push_str(&Self::group_chars());
+        break_chars
+    }
+}
+
+
+// === Operators ===
+
+/// The set of rules for lexing Enso operator identifiers.
+#[allow(dead_code)]
+impl EnsoLexer {
+
+    /// Create an arbitrary operator that requires no special handling.
+    fn on_operator<R:LazyReader>(&mut self, _reader:&mut R) {
+        let op_modifier_check = self.operator_modifier_check;
+        let operator          = self.consume_current();
+        let offset            = self.offset.consume();
+        let token             = Token::Operator(operator,offset);
+        self.append_token(token);
+        self.push_state(op_modifier_check);
+    }
+
+    /// Create an operator that cannot have an associated modifier.
+    fn on_operator_no_modifier<R:LazyReader>(&mut self, _reader:&mut R) {
+        let op_suffix_check = self.operator_suffix_check;
+        let operator        = self.consume_current();
+        let offset          = self.offset.consume();
+        let token           = Token::Operator(operator,offset);
+        self.append_token(token);
+        self.push_state(op_suffix_check);
+    }
+
+    /// Create a grouping operator.
+    fn on_group<R:LazyReader>(&mut self, reader:&mut R) {
+        let operator = self.consume_current();
+        let offset   = self.offset.consume();
+        let token    = Token::Operator(operator,offset);
+        self.append_token(token);
+        self.ident_on_no_error_suffix(reader);
+    }
+
+    /// Create an operator modifier.
+    fn on_modifier<R:LazyReader>(&mut self, _reader:&mut R) {
+        match self.output.pop() {
+            Some(token) => match token.shape {
+                token::Shape::Operator(name) => {
+                    let new_token = Token::Modifier(name,token.offset);
+                    self.discard_current();
+                    self.append_token(new_token);
+                },
+                _ => unreachable_panic!("The preceding token should always be an operator."),
+            },
+            None => unreachable_panic!("There should always be a preceding token."),
+        }
+    }
+
+    /// The rules for lexing Enso operators.
+    fn add_operator_rules(lexer:&mut EnsoLexer) {
+        let operator_char   = Pattern::any_of(Self::operator_chars().as_str());
+        let equals          = c!('=');
+        let comma           = c!(',');
+        let dot             = c!('.');
+        let error_char      = &operator_char | &equals | &comma | &dot;
+        let error_suffix    = &error_char.many1();
+        let operator_body   = &operator_char.many1();
+        let ops_eq          = &equals | l!("==") | l!(">=") | l!("<=") | l!("!=") | l!("#=");
+        let ops_in          = l!("in");
+        let ops_dot         = dot | comma | l!("..") | l!("...");
+        let ops_group       = Pattern::any_of(Self::group_chars().as_str());
+        let ops_comment     = c!('#') | l!("##");
+        let ops_no_modifier = &ops_eq | &ops_dot | &ops_comment | &ops_in;
+
+        let initial_state_id = lexer.initial_state;
+        let initial_state    = lexer.group_mut(initial_state_id);
+        initial_state.create_rule(&operator_body,  "self.on_operator(reader)");
+        initial_state.create_rule(&ops_no_modifier,"self.on_operator_no_modifier(reader)");
+        initial_state.create_rule(&ops_group,      "self.on_group(reader)");
+
+        let operator_mod_check_id = lexer.operator_modifier_check;
+        let operator_mod_check    = lexer.group_mut(operator_mod_check_id);
+        operator_mod_check.create_rule(&equals,"self.on_modifier(reader)");
+
+        let operator_sfx_check_id = lexer.operator_suffix_check;
+        let operator_sfx_check    = lexer.group_mut(operator_sfx_check_id);
+        operator_sfx_check.create_rule(&error_suffix,     "self.ident_on_error_suffix(reader)");
+        operator_sfx_check.create_rule(&Pattern::always(),"self.ident_on_no_error_suffix(reader)");
+    }
+}
+
+
+// === Identifiers ===
+
+/// Lexing rules for Enso identifiers.
+#[allow(dead_code)]
+impl EnsoLexer {
+
+    /// Create a variable identifier from the current match.
+    fn on_variable_ident<R:LazyReader>(&mut self, _reader:&mut R) {
+        let token        = Token::Variable(self.consume_current(),self.offset.consume());
+        let suffix_check = self.ident_suffix_check;
+        self.append_token(token);
+        self.push_state(suffix_check);
+    }
+
+    /// Create a referent identifier from the current match.
+    fn on_referent_ident<R:LazyReader>(&mut self, _reader:&mut R) {
+        let token        = Token::Referent(self.consume_current(),self.offset.consume());
+        let suffix_check = self.ident_suffix_check;
+        self.append_token(token);
+        self.push_state(suffix_check);
+    }
+
+    /// Create an external identifier from the current match.
+    fn on_external_ident<R:LazyReader>(&mut self, _reader:&mut R) {
+        let token        = Token::External(self.consume_current(),self.offset.consume());
+        let suffix_check = self.ident_suffix_check;
+        self.append_token(token);
+        self.push_state(suffix_check);
+    }
+
+    /// Create a blank identifier from the current match.
+    fn on_blank<R:LazyReader>(&mut self, _reader:&mut R) {
+        let token        = Token::Blank(self.offset.consume());
+        let suffix_check = self.ident_suffix_check;
+        self.discard_current();
+        self.append_token(token);
+        self.push_state(suffix_check);
+    }
+
+    /// Tokenize an unexpected error suffix.
+    fn ident_on_error_suffix<R:LazyReader>(&mut self, _reader:&mut R) {
+        let token = Token::InvalidSuffix(self.consume_current(),self.offset.consume());
+        self.append_token(token);
+        self.pop_state();
+    }
+
+    /// Submit a non-error identifier.
+    fn ident_on_no_error_suffix<R:LazyReader>(&mut self, _reader:&mut R) {
+        self.pop_state();
+    }
+
+    /// The set of rules for lexing Enso identifiers.
+    fn add_identifier_rules(lexer:&mut EnsoLexer) {
+        let body_char      = (EnsoLexer::lower_ascii_letter() | EnsoLexer::ascii_digit()).many();
+        let underscore     = c!('_');
+        let ticks          = c!('\'').many();
+        let init_var_seg   = EnsoLexer::lower_ascii_letter() >> &body_char;
+        let var_seg        = (EnsoLexer::lower_ascii_letter() | EnsoLexer::ascii_digit()) >> &body_char;
+        let init_ref_seg   = EnsoLexer::upper_ascii_letter() >> &body_char;
+        let ref_seg        = (EnsoLexer::upper_ascii_letter() | EnsoLexer::ascii_digit()) >> &body_char;
+        let external_start = EnsoLexer::ascii_letter() | &underscore;
+        let external_body  = EnsoLexer::ascii_alpha_num() | &underscore;
+        let variable_ident = &init_var_seg >> (&underscore >> &var_seg).many() >> &ticks;
+        let referent_ident = &init_ref_seg >> (&underscore >> &ref_seg).many() >> &ticks;
+        let external_ident = &external_start >> external_body.many() >> &ticks;
+        let error_suffix   = Pattern::none_of(EnsoLexer::break_chars().as_str()).many1();
+
+        let initial_state_id = lexer.initial_state;
+        let initial_state    = lexer.group_mut(initial_state_id);
+        initial_state.create_rule(&variable_ident,"self.on_variable_ident(reader)");
+        initial_state.create_rule(&referent_ident,"self.on_referent_ident(reader)");
+        initial_state.create_rule(&underscore,    "self.on_blank(reader)");
+        initial_state.create_rule(&external_ident,"self.on_external_ident(reader)");
+
+        let suffix_check_id = lexer.ident_suffix_check;
+        let suffix_check    = lexer.group_mut(suffix_check_id);
+        suffix_check.create_rule(&error_suffix,     "self.ident_on_error_suffix(reader)");
+        suffix_check.create_rule(&Pattern::always(),"self.ident_on_no_error_suffix(reader)");
+    }
+}
+
+
+// === Numbers ===
+
+/// The set of rules for lexing numbers in Enso.
+#[allow(dead_code)]
+impl EnsoLexer {
+
+    /// Finalize the lexer when it's done lexing a number with an explicit base.
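+    ///
+    /// This pops all states up to and including the second number phase, and resets the
+    /// number lexing state ready for the next literal.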
+    fn finalize_explicit_base(&mut self) {
+        let number_part_2 = self.number_phase_two;
+        self.pop_states_including(number_part_2);
+        self.number_state.reset();
+    }
+
+    /// Triggered when the lexer matches an integer with an implicit base.
+    fn on_integer<R:LazyReader>(&mut self, _reader:&mut R) {
+        let number_phase_2 = self.number_phase_two;
+        self.number_state.literal = self.consume_current();
+        self.push_state(number_phase_2)
+    }
+
+    /// Triggered when the lexer matches a number annotated with an explicit base.
+    fn on_explicit_base<R:LazyReader>(&mut self, _reader:&mut R) {
+        let literal = self.consume_current();
+        self.number_state.literal = literal;
+        let offset = self.offset.consume();
+        let token  = self.number_state.consume_token(offset);
+        self.append_token(token);
+        self.finalize_explicit_base();
+    }
+
+    /// Triggered when the lexer has seen an explicit base definition that isn't followed by an
+    /// actual number.
+    fn on_dangling_base<R:LazyReader>(&mut self, _reader:&mut R) {
+        let base   = self.number_state.consume_base();
+        let offset = self.offset.consume();
+        let token  = Token::DanglingBase(base,offset);
+        self.append_token(token);
+        self.discard_current();
+        self.finalize_explicit_base();
+    }
+
+    /// Triggered when an explicit decimal number has been seen by the lexer.
+    fn on_decimal<R:LazyReader>(&mut self, _reader:&mut R) {
+        let decimal_suffix_check = self.decimal_suffix_check;
+        self.number_state.literal = self.consume_current();
+        let offset = self.offset.consume();
+        let token  = self.number_state.consume_token(offset);
+        self.append_token(token);
+        self.push_state(decimal_suffix_check);
+    }
+
+    /// Triggered when an explicit base annotation has been seen by the lexer.
+    fn seen_base<R:LazyReader>(&mut self, _reader:&mut R) {
+        let seen_base_id = self.number_seen_base;
+        self.push_state(seen_base_id);
+        self.number_state.swap_members();
+    }
+
+    /// Submit an integer token into the lexer.
+    fn submit_integer<R:LazyReader>(&mut self, _reader:&mut R) {
+        let offset = self.offset.consume();
+        let token  = self.number_state.consume_token(offset);
+        self.append_token(token);
+        self.pop_state();
+    }
+
+    /// Triggered when a decimal number is followed by an erroneous suffix.
+    fn decimal_error_suffix<R:LazyReader>(&mut self, _reader:&mut R) {
+        let decimal_suffix_check = self.decimal_suffix_check;
+        let current_match        = self.consume_current();
+        let offset               = self.offset.consume();
+        let token                = Token::InvalidSuffix(current_match,offset);
+        self.append_token(token);
+        self.pop_states_including(decimal_suffix_check);
+    }
+
+    /// Triggered when a decimal number is followed by a valid suffix.
+    fn decimal_valid_suffix<R:LazyReader>(&mut self, _reader:&mut R) {
+        let seen_decimal_id = self.decimal_suffix_check;
+        self.pop_states_including(seen_decimal_id);
+    }
+
+    /// The rules for lexing numbers in Enso.
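+    ///
+    /// These rules are registered across four groups: the initial state, the second number
+    /// phase, the explicit-base state, and the decimal suffix check.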
+ fn add_number_rules(lexer:&mut EnsoLexer) { + let digits = EnsoLexer::ascii_digit().many1(); + let point = c!('.'); + let underscore = c!('_'); + let decimal = &digits >> &point >> &digits; + let arbitrary_digits = EnsoLexer::ascii_alpha_num().many1(); + let arbitrary_decimal = &arbitrary_digits >> (&point >> &arbitrary_digits).opt(); + let error_suffix = Pattern::none_of(EnsoLexer::break_chars().as_str()).many1(); + + let initial_state_id = lexer.initial_state; + let initial_state = lexer.group_mut(initial_state_id); + initial_state.create_rule(&digits,"self.on_integer(reader)"); + initial_state.create_rule(&decimal,"self.on_decimal(reader)"); + + let number_phase_2_id = lexer.number_phase_two; + let number_phase_2 = lexer.groups_mut().group_mut(number_phase_2_id); + number_phase_2.create_rule(&underscore, "self.seen_base(reader)"); + number_phase_2.create_rule(&Pattern::always(),"self.submit_integer(reader)"); + + let seen_base_id = lexer.number_seen_base; + let seen_base = lexer.groups_mut().group_mut(seen_base_id); + seen_base.create_rule(&arbitrary_decimal,"self.on_explicit_base(reader)"); + seen_base.create_rule(&Pattern::always(),"self.on_dangling_base(reader)"); + + let decimal_suffix_check_id = lexer.decimal_suffix_check; + let decimal_suffix_check = lexer.groups_mut().group_mut(decimal_suffix_check_id); + decimal_suffix_check.create_rule(&error_suffix,"self.decimal_error_suffix(reader)"); + decimal_suffix_check.create_rule(&Pattern::always(),"self.decimal_valid_suffix(reader)"); + } +} + + +// === Text Rules === + +/// The set of rules for lexing text literals in the Enso language. +#[allow(dead_code)] +impl EnsoLexer { + + /// Define the rules for lexing Enso text literals. + fn add_text_rules(_lexer:&mut EnsoLexer) { + // TODO [AA] Write the lexing rules for text literals. + } +} + + +// === Block Rules === + +/// The set of rules for lexing blocks in the Enso language. +#[allow(dead_code)] +impl EnsoLexer { + + /// Triggered when a unix-style line ending is seen. + fn block_on_lf(&mut self, reader:&mut R) { + self.block_state.push_line_ending(token::LineEnding::LF); + self.block_on_line_ending(reader); + } + + /// Triggered when a windows-style line ending is seen. + fn block_on_crlf(&mut self, reader:&mut R) { + self.block_state.push_line_ending(token::LineEnding::CRLF); + self.block_on_line_ending(reader); + } + + /// Common functionality for both styles of line ending. + fn block_on_line_ending(&mut self, _reader:&mut R) { + let block_newline = self.block_newline; + self.block_state.seen_newline = true; + self.offset.push(); + self.push_state(block_newline); + } + + /// Transitions the lexer into a state in which it knows it is lexing a block line. + fn block_in_line(&mut self, _reader:&mut R) { + let indent_len = self.current_match.chars().count(); + self.offset.increase(indent_len,0); + let in_block_line = self.in_block_line; + self.push_state(in_block_line); + } + + /// Triggered when lexing a non-blank line. 
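+    ///
+    /// The line's indentation is compared with that of the current block: equal
+    /// indentation submits the line to the block, greater indentation begins a nested
+    /// block, and lesser indentation ends the current block.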
+ fn block_on_non_empty_line(&mut self, reader:&mut R) { + let block_newline = self.block_newline; + self.pop_states_including(block_newline); + + match self.offset.current.cmp(&self.block_state.current().indent) { + Ordering::Equal => { + self.offset.consume(); + self.block_submit_line(reader); + }, + Ordering::Greater => { + let new_indent = self.offset.consume(); + self.begin_block(new_indent,reader); + }, + Ordering::Less => { + let new_indent = self.offset.consume(); + self.on_block_end(new_indent,reader); + } + } + } + + /// Triggered when lexing a block line that is empty and ends in a unix-style line ending. + fn block_on_empty_lf_line(&mut self, reader:&mut R) { + self.block_state.push_line_ending(token::LineEnding::LF); + self.block_in_empty_line(reader); + } + + /// Triggered when lexing a block line that is empty and ends in a windows-style line ending. + fn block_on_empty_crlf_line(&mut self, reader:&mut R) { + self.block_state.push_line_ending(token::LineEnding::CRLF); + self.block_in_empty_line(reader); + } + + /// Begin a new block. + fn begin_block(&mut self, block_indent:usize, _reader:&mut R) { + let is_orphan = self.output.is_empty(); + self.push_tokens(); + self.block_state.begin_block(block_indent,is_orphan); + } + + /// Triggered when lexing an empty line in a block. + fn block_in_empty_line(&mut self, reader:&mut R) { + self.block_submit_line(reader); + let offset = self.offset.consume(); + let block_newline = self.block_newline; + self.pop_states_until(block_newline); + self.block_state.push_empty_line(offset); + } + + /// Triggered when lexing a line in a block that ends a file. + fn block_in_eof_line(&mut self, reader:&mut R) { + let initial_state = self.initial_state; + self.pop_states_until(initial_state); + self.on_eof(reader); + } + + /// Triggered when beginning a top-level block. + fn block_begin_top_level(&mut self, reader:&mut R) { + let matched_bookmark = self.bookmarks.matched_bookmark; + let block_newline = self.block_newline; + let initial_state = self.initial_state; + self.bookmarks.rewind(matched_bookmark,reader); + self.offset.push(); + self.pop_states_until(initial_state); + self.push_state(block_newline); + } + + /// Triggered when a block is ended. + fn on_block_end(&mut self, new_indent:usize, reader:&mut R) { + if self.block_state.seen_newline { + while new_indent < self.block_state.current().indent { + self.block_submit(reader); + } + if new_indent > self.block_state.current().indent { + info!(self.logger,"Block with invalid indentation."); + self.begin_block(new_indent, reader); + self.block_state.current_mut().is_valid = false; + } else { + self.offset.push(); + self.block_submit_line(reader); + } + } + } + + /// Create a block token from the current block state. + fn build_block(&mut self, reader:&mut R) -> Token { + self.block_submit_line(reader); + let offset = self.offset.consume(); + let current_block = self.block_state.consume_current(); + current_block.into_token(offset) + } + + /// Submit a block to the token stream of the lexer. + fn block_submit(&mut self, reader:&mut R) { + let mut block = self.build_block(reader); + self.pop_tokens(); + self.offset.pop(); + self.block_state.end_block(); + + if let Some(Token{shape:token::Shape::Operator(_),..}) = self.last_token() { + if let token::Shape::Block {indent,lines,..} = block.shape { + block.shape = token::Shape::block(BlockType::Discontinuous,indent,lines); + } + } + + self.append_token(block); + self.offset.push(); + } + + /// Submit a line in a block. 
+ /// + /// It should be noted that lines that have content in blocks cannot have an offset. + fn block_submit_line(&mut self, _reader:&mut R) { + if self.block_state.seen_newline { + if !self.output.is_empty() { + let token_stream = self.consume_tokens(); + let offset = 0; + self.block_state.append_line_to_current(token_stream.into(),offset); + } + debug!(self.logger,"Clear Output Buffer: Old Length = {self.output.len()}"); + self.output.clear(); + } + } + + /// Triggered when the top-level block ends. + fn block_end_top_level(&mut self, _reader:&mut R) { + let current_block = self.block_state.consume_current(); + if self.block_state.seen_newline { + let offset = self.offset.consume(); + let top_level_block = current_block.into_token(offset); + self.append_token(top_level_block); + } else { + let additional_offset = current_block.indent; + if let Some(token) = self.output.first_mut() { token.offset += additional_offset } + } + } + + /// The rule definitions for lexing blocks in Enso. + fn add_block_rules(lexer:&mut EnsoLexer) { + let spaces = EnsoLexer::spaces(); + let lf = c!('\n'); + let crlf = l!("\r\n"); + let opt_spaces = spaces.opt(); + let eof_line = &opt_spaces >> Pattern::eof(); + + let root_state_id = lexer.initial_state; + let root_state = lexer.group_mut(root_state_id); + root_state.create_rule(&lf, "self.block_on_lf(reader)"); + root_state.create_rule(&crlf,"self.block_on_crlf(reader)"); + + let block_newline_id = lexer.block_newline; + let block_newline = lexer.group_mut(block_newline_id); + block_newline.create_rule(&opt_spaces,"self.block_in_line(reader)"); + block_newline.create_rule(&eof_line, "self.block_in_eof_line(reader)"); + + let in_block_line_id = lexer.in_block_line; + let in_block_line = lexer.group_mut(in_block_line_id); + in_block_line.create_rule(&lf, "self.block_on_empty_lf_line(reader)"); + in_block_line.create_rule(&crlf, "self.block_on_empty_crlf_line(reader)"); + in_block_line.create_rule(&Pattern::always(),"self.block_on_non_empty_line(reader)"); + + let block_module_id = lexer.block_top_level; + let block_module = lexer.group_mut(block_module_id); + block_module.create_rule(&opt_spaces,"self.block_begin_top_level(reader)"); + } +} + + +// === Default Rules === + +/// The set of rules that apply as defaults in the root state. +#[allow(dead_code)] +impl EnsoLexer { + + /// Triggered on an arbitrary space character. + fn on_space(&mut self, _reader:&mut R) { + let current_len = self.current_match.chars().count(); + self.offset.increase(current_len,0); + self.discard_current(); + } + + /// Triggered on an arbitrary eof character. + fn on_eof(&mut self, reader:&mut R) { + self.offset.push(); + self.block_submit_line(reader); + self.on_block_end(0,reader); + self.block_end_top_level(reader); + } + + /// Triggered on any unrecognized character. + fn on_unrecognized(&mut self, _reader:&mut R) { + let token = Token::Unrecognized(self.consume_current(),self.offset.consume()); + self.append_token(token); + } + + /// The default rules for the lexer. 
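+    ///
+    /// These match a single space, the end of input, and (as a catch-all) any other
+    /// character, with the catch-all emitting an `Unrecognized` token.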
+ fn add_default_rules(lexer:&mut EnsoLexer) { + let space = Pattern::char(' '); + let eof = Pattern::eof(); + let any = Pattern::any(); + + let initial_state_id = lexer.initial_state; + let initial_state = lexer.group_mut(initial_state_id); + initial_state.create_rule(&space,"self.on_space(reader)"); + initial_state.create_rule(&eof, "self.on_eof(reader)"); + initial_state.create_rule(&any, "self.on_unrecognized(reader)"); + } +} + + + +// === Trait Impls === + +impl flexer::Definition for EnsoLexer { + fn define() -> Self { + let mut lexer = EnsoLexer::new(); + + EnsoLexer::add_operator_rules(&mut lexer); + EnsoLexer::add_identifier_rules(&mut lexer); + EnsoLexer::add_number_rules(&mut lexer); + EnsoLexer::add_text_rules(&mut lexer); + EnsoLexer::add_block_rules(&mut lexer); + EnsoLexer::add_default_rules(&mut lexer); + + lexer + } + + fn groups(&self) -> &Registry { + &self.lexer_states + } + + fn set_up(&mut self) { + let module_state_id = self.block_top_level; + self.push_state(module_state_id); + } + + fn tear_down(&mut self) {} +} + +impl Default for EnsoLexer { + fn default() -> Self { + EnsoLexer::new() + } +} + + + +// =================== +// === Lexer State === +// =================== + +/// The state for the Enso lexer. +#[derive(Debug)] +pub struct State { + /// The logger for the lexing state. + logger : Logger, + /// The bookmarks used by the lexer. + bookmarks : reader::BookmarkManager, + /// The registry of states for the lexer. + lexer_states : group::Registry, + /// The initial state of the lexer. + initial_state : group::Identifier, + /// The state for checking the end of identifiers. + ident_suffix_check : group::Identifier, + /// The state for completing number lexing. + number_phase_two : group::Identifier, + /// The state where number lexing has seen an explicit base. + number_seen_base : group::Identifier, + /// The state where number lexing has seen a decimal. + decimal_suffix_check : group::Identifier, + /// The state for lexing operator suffixes. + operator_suffix_check : group::Identifier, + /// The state for lexing operator modifiers. + operator_modifier_check : group::Identifier, + /// The state for lexing the top-level block. + block_top_level : group::Identifier, + /// The state entered when a newline has been lexed. + block_newline : group::Identifier, + /// The state entered when within the line of a block. + in_block_line : group::Identifier, + /// A stack of token matches. + tokens_stack : Vec, + /// Tracking for the current offset. + offset : Offset, + /// State specifically for lexing Enso numbers. + number_state : NumberLexingState, + /// State specifically for lexing Enso blocks. + block_state : BlockLexingState +} + +impl> State { + /// Get a reference to the group for the provided identifier. + pub fn group(&self, group:group::Identifier) -> &Group { + self.groups().group(group) + } + + /// Get a mutable reference to the group for the provided identifier. 
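+    ///
+    /// A typical use when defining rules, mirroring the `add_*_rules` functions in the
+    /// lexer definition (here `pattern` and `some_callback` are placeholders):
+    ///
+    /// ```rust,ignore
+    /// let state_id = lexer.initial_state;
+    /// let group    = lexer.group_mut(state_id);
+    /// group.create_rule(&pattern,"self.some_callback(reader)");
+    /// ```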
+ pub fn group_mut(&mut self, group:group::Identifier) -> &mut Group { + self.groups_mut().group_mut(group) + } +} + + +// === Trait Impls === + +impl> flexer::State for State { + fn new(parent_logger:&impl AnyLogger) -> Self { + let logger = ::sub(parent_logger, "State"); + let bookmarks = default(); + let mut lexer_states = group::Registry::default(); + let initial_state = lexer_states.define_group("ROOT",None); + let ident_suffix_check = lexer_states.define_group("IDENT_SFX_CHECK",None); + let number_phase_two = lexer_states.define_group("NUMBER_PHASE_2",None); + let number_seen_base = lexer_states.define_group("NUMBER_SEEN_BASE",None); + let decimal_suffix_check = lexer_states.define_group("NUMBER_SEEN_POINT",None); + let operator_suffix_check = lexer_states.define_group("OPERATOR_SUFFIX_CHECK",None); + let operator_modifier_check = + lexer_states.define_group("OPERATOR_MODIFIER_CHECK",Some(operator_suffix_check)); + let block_top_level = lexer_states.define_group("BLOCK_MODULE", None); + let block_newline = lexer_states.define_group("BLOCK_NEWLINE",None); + let in_block_line = lexer_states.define_group("IN_BLOCK_LINE",None); + let tokens_stack = Vec::new(); + let offset_logger = ::sub(&logger,"Offset"); + let offset = Offset::new(offset_logger); + let number_state_logger = ::sub(&logger,"NumberState"); + let number_state = NumberLexingState::new(number_state_logger); + let block_state_logger = ::sub(&logger,"BlockLexingState"); + let block_state = BlockLexingState::new(block_state_logger); + + Self + { logger + , bookmarks + , lexer_states + , initial_state + , ident_suffix_check + , number_phase_two + , number_seen_base + , decimal_suffix_check + , operator_suffix_check + , operator_modifier_check + , block_top_level + , block_newline + , in_block_line + , tokens_stack + , offset + , number_state + , block_state + } + } + + fn initial_state(&self) -> group::Identifier { + self.initial_state + } + + fn groups(&self) -> &group::Registry { + &self.lexer_states + } + + fn groups_mut(&mut self) -> &mut group::Registry { + &mut self.lexer_states + } + + fn bookmarks(&self) -> &reader::BookmarkManager { + &self.bookmarks + } + + fn bookmarks_mut(&mut self) -> &mut reader::BookmarkManager { + &mut self.bookmarks + } + + fn specialize(&self) -> Result { + generate::specialize(self,"EnsoLexer","token::Stream") + } +} + + + +// ========================= +// === Offset Management === +// ========================= + +/// A manager for the current offset state of the lexer. +#[derive(Clone,Debug,Eq,PartialEq)] +pub struct Offset { + /// The current offset of the lexer. + /// + /// The offset is the number of leading spaces between the last-lexed token and the token that + /// is currently being lexed. + current : usize, + /// The stack of current offsets in the lexer. + stack : Vec, + /// The logger for the offset state. + logger : Logger +} + +impl Offset { + /// Create a new offset state. + pub fn new(logger:Logger) -> Self { + let current = default(); + let stack = default(); + Offset{current,stack,logger} + } + + /// Push the current offset onto the offset stack. + pub fn push(&mut self) { + debug!(self.logger,"Push Offset: {self.current}"); + self.stack.push(self.current); + self.current = 0; + } + + /// Pop the top offset from the offset stack. + pub fn pop(&mut self) { + self.current = self.stack.pop().unwrap_or(0); + debug!(self.logger,"Pop Offset: {self.current}"); + } + + /// Consume the current offset. 
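+    ///
+    /// This returns the accumulated offset and resets the counter to zero, so that each
+    /// offset is attached to exactly one token.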
+ pub fn consume(&mut self) -> usize { + let offset = self.current; + self.current = 0; + debug!(self.logger,"Consume Offset: {offset}"); + offset + } + + /// Increase the current offset by `match_length` + `shift`. + pub fn increase(&mut self, match_length:usize, shift:usize) { + let diff = match_length + shift; + debug!(self.logger,"Increase Offset By: {diff}"); + self.current += diff; + debug!(self.logger,"Offset Now: {self.current}"); + } +} + + + +// ========================= +// === NumberLexingState === +// ========================= + +/// The state for lexing an Enso number. +#[derive(Clone,Debug,Default,PartialEq,Eq)] +pub struct NumberLexingState { + /// The (optional) base for the number. + pub base : String, + /// The literal number, to be interpreted in `base`. + pub literal : String, + /// A logger for the number state. + logger : Logger, +} + +impl NumberLexingState { + /// Create a new number lexing state. + pub fn new(logger:Logger) -> Self { + let base = default(); + let literal = default(); + NumberLexingState{base,literal,logger} + } + + /// Reset the number lexing state. + pub fn reset(&mut self) { + self.base.truncate(0); + self.literal.truncate(0); + debug!(self.logger,"Reset Number State"); + } + + /// Swap the `base` and `literal` in place. + pub fn swap_members(&mut self) { + debug!(self.logger,"Swap Number Fields"); + mem::swap(&mut self.base,&mut self.literal); + } + + /// Convert `self` into a token, resetting the lexing state. + pub fn consume_token(&mut self, offset:usize) -> Token { + debug!(self.logger,"Consuming Number: Base = {self.base}, Number = {self.literal}"); + Token::Number(mem::take(&mut self.base),mem::take(&mut self.literal),offset) + } + + /// Take the `literal` portion of the number lexing state. + pub fn consume_literal(&mut self) -> String { + mem::take(&mut self.literal) + } + + /// Take the `base` portion of the number lexing state. + pub fn consume_base(&mut self) -> String { + mem::take(&mut self.base) + } +} + + + +// ======================== +// === BlockLexingState === +// ======================== + +/// The state for managing the lexing of blocks in Enso. +#[derive(Clone,Debug,PartialEq)] +pub struct BlockLexingState { + /// The stack of blocks being lexed. + stack : NonEmptyVec, + /// Whether or not the lexer has seen an explicit newline. + seen_newline : bool, + /// A logger for the lexing state. + logger : Logger, +} + +impl BlockLexingState { + /// Construct a new block lexing state. + pub fn new(logger:Logger) -> Self { + let stack = NonEmptyVec::singleton(default()); + let seen_newline = false; + BlockLexingState{stack,seen_newline,logger} + } + + /// Set the last seen line ending. + pub fn push_line_ending(&mut self, line_ending:token::LineEnding) { + self.current_mut().seen_line_endings.push_back(line_ending); + debug!(self.logger,"Push Line Ending: {line_ending:?}"); + } + + /// Consume the last seen line ending. + pub fn pop_line_ending(&mut self) -> token::LineEnding { + let popped = self.current_mut().seen_line_endings.pop_front(); + debug!(self.logger,"Pop Line Ending: {popped:?}"); + popped.unwrap_or(token::LineEnding::None) + } + + /// Appends a line to the current block. + pub fn append_line_to_current(&mut self, tokens:Vec, offset:usize) { + let trailing_line_ending = self.pop_line_ending(); + debug!( + self.logger, + "Append Line: Line Ending = {trailing_line_ending:?}, Tokens = {&tokens:?}" + ); + self.current_mut().push_line(tokens, offset, trailing_line_ending); + } + + /// Get a reference to the current block. 
+ pub fn current(&self) -> &BlockState { + self.stack.last() + } + + /// Get a mutable reference to the current block. + pub fn current_mut(&mut self) -> &mut BlockState { + self.stack.last_mut() + } + + /// Push a new block state onto the stack. + pub fn begin_block(&mut self, new_offset:usize, is_orphan:bool) { + debug!(self.logger,"Begin Block State: Indent = {new_offset}"); + self.stack.push(default()); + self.current_mut().is_orphan = is_orphan; + self.current_mut().indent = new_offset; + } + + /// Pop a block state from the stack. + pub fn end_block(&mut self) -> Option { + debug!(self.logger,"End Block State"); + self.stack.pop() + } + + /// Consume the state of the current block. + pub fn consume_current(&mut self) -> BlockState { + let block = mem::take(self.stack.last_mut()); + debug!(self.logger,"Consume Block: {&block:?}"); + block + } + + /// Push an empty line into the storage for them. + pub fn push_empty_line(&mut self, offset:usize) { + let trailing_line_ending = self.pop_line_ending(); + self.current_mut().push_empty_line(offset, trailing_line_ending); + debug!(self.logger,"Append Empty line: Line Ending = {trailing_line_ending:?}"); + } +} + + + +// ================== +// === BlockState === +// ================== + +/// The state for lexing a given block in Enso. +#[derive(Clone,Debug,PartialEq,Eq)] +pub struct BlockState { + /// Whether or not the block is orphaned. + /// + /// An orphaned block is one that has no block parent. + pub is_orphan : bool, + /// Whether or not the block is well-formed. + pub is_valid : bool, + /// The root indentation level of the block. + pub indent: usize, + /// The remaining lines of the block. + pub lines : Vec, + /// The line endings that have been seen in this block's context. + pub seen_line_endings : VecDeque +} + +impl BlockState { + /// Construct a new block state. + pub fn new() -> Self { + let is_orphan = false; + let is_valid = true; + let offset = 0; + let lines = default(); + let seen_line_endings = default(); + BlockState{is_orphan,is_valid, indent: offset,lines,seen_line_endings} + } + + /// Push a line into the block. + pub fn push_line + (&mut self + , tokens : Vec + , indent: usize + , trailing_line_ending : token::LineEnding + ) { + let line = Token::Line(tokens,indent,trailing_line_ending); + self.lines.push(line) + } + + /// Push a blank line into the block. + /// + /// The offset here should be the offset from the baseline, not from the block indent level. + pub fn push_empty_line(&mut self, offset:usize, trailing_line_ending:token::LineEnding) { + let line = Token::BlankLine(offset, trailing_line_ending); + self.lines.push(line); + } + + /// Convert the block state into a block token. + pub fn into_token(self, offset:usize) -> Token { + Token::Block( + BlockType::Continuous, + self.indent, + self.lines, + offset + ) + } + + /// Consume the lines in the block. 
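+    ///
+    /// The lines are taken via `mem::take`, leaving an empty vector in their place.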
+    pub fn consume_lines(&mut self) -> Vec<Token> {
+        mem::take(&mut self.lines)
+    }
+}
+
+
+// === Trait Impls ===
+
+impl Default for BlockState {
+    fn default() -> Self {
+        BlockState::new()
+    }
+}
diff --git a/lib/rust/lexer/definition/src/lib.rs b/lib/rust/lexer/definition/src/lib.rs
new file mode 100644
index 0000000000..57229735f9
--- /dev/null
+++ b/lib/rust/lexer/definition/src/lib.rs
@@ -0,0 +1,29 @@
+#![feature(test)]
+#![deny(unconditional_recursion)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(trivial_casts)]
+#![warn(trivial_numeric_casts)]
+#![warn(unsafe_code)]
+#![warn(unused_import_braces)]
+
+//! This library defines the lexer for the syntax of the Enso language.
+
+pub mod lexer;
+pub mod token;
+
+/// A module that can be re-exported under the same name in the generation crate.
+///
+/// This is necessary to avoid issues with paths getting wonky when the code is generated from the
+/// Enso lexer definition. In this project, imports should _not_ be made from the crate root
+/// _except_ through use of this `library` module.
+pub mod library {
+    pub use crate::token;
+}
+
+/// A collection of functionality for working with the lexer definition.
+pub mod prelude {
+    pub use flexer::prelude::*;
+    pub use flexer::prelude::logger::*;
+}
diff --git a/lib/rust/lexer/definition/src/token.rs b/lib/rust/lexer/definition/src/token.rs
new file mode 100644
index 0000000000..768c6ccd66
--- /dev/null
+++ b/lib/rust/lexer/definition/src/token.rs
@@ -0,0 +1,570 @@
+//! This file defines the various tokens required by the Enso lexer.
+//!
+//! This file makes heavy use of terminology from the Enso design documentation, particularly the
+//! [syntax](https://dev.enso.org/docs/enso/syntax) documentation. For the sake of brevity, many
+//! terms will _not_ be defined here.
+
+use crate::prelude::*;
+
+
+
+// =============
+// === Token ===
+// =============
+
+/// A lexer token.
+#[derive(Clone,Debug,Eq,PartialEq)]
+pub struct Token {
+    /// The shape of the token.
+    pub shape : Shape,
+    /// The length (in characters) of this token.
+    pub length : usize,
+    /// The number of trailing spaces after this token before the next.
+    pub offset : usize,
+}
+
+impl Token {
+    /// Get the length that the token takes up in the program source.
+    pub fn source_length(&self) -> usize {
+        self.length + self.offset
+    }
+}
+
+/// Constructors for the various forms of token.
+#[allow(non_snake_case)]
+impl Token {
+    /// Construct a token representing a referent identifier.
+    pub fn Referent(name:impl Str, offset:usize) -> Token {
+        let str = name.into();
+        let length = str.chars().count();
+        let shape = Shape::Referent(str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a variable identifier.
+    pub fn Variable(name:impl Str, offset:usize) -> Token {
+        let str = name.into();
+        let length = str.chars().count();
+        let shape = Shape::Variable(str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an external identifier.
+    pub fn External(name:impl Str, offset:usize) -> Token {
+        let str = name.into();
+        let length = str.chars().count();
+        let shape = Shape::External(str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a blank identifier.
+    pub fn Blank(offset:usize) -> Token {
+        let shape = Shape::Blank;
+        let length = 1;
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an operator.
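+    ///
+    /// A sketch of the expected output (mirroring the `construct_operator_token` test
+    /// below):
+    ///
+    /// ```rust,ignore
+    /// let token = Token::Operator("==>",0);
+    /// assert_eq!(token.length,3);
+    /// ```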
+    pub fn Operator(name:impl Str, offset:usize) -> Token {
+        let str = name.into();
+        let length = str.chars().count();
+        let shape = Shape::Operator(str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a modifier operator.
+    pub fn Modifier(name:impl Str, offset:usize) -> Token {
+        let str = name.into();
+        let length = str.chars().count() + 1;
+        let shape = Shape::Modifier(str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a number literal.
+    pub fn Number(base:impl Str, num:impl Into<String>, offset:usize) -> Token {
+        let str = num.into();
+        let base_str = base.into();
+        let length = if base_str.is_empty() {
+            str.chars().count()
+        } else {
+            base_str.chars().count() + 1 + str.chars().count()
+        };
+        let shape = Shape::Number{base:base_str,number:str};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a dangling number base.
+    pub fn DanglingBase(base:impl Str, offset:usize) -> Token {
+        let base_str = base.into();
+        let length = base_str.chars().count() + 1;
+        let shape = Shape::DanglingBase(base_str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a text literal.
+    pub fn Text(text:impl Str, offset:usize) -> Token {
+        let str = text.into();
+        let length = str.chars().count();
+        let shape = Shape::Text(str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a line of tokens.
+    pub fn Line(tokens:Vec<Token>, offset:usize, trailing_line_ending:LineEnding) -> Token {
+        let line_ending_len = trailing_line_ending.size();
+        let length = tokens.iter().fold(line_ending_len,|l,r| l + r.offset + r.length);
+        let shape = Shape::Line{tokens,trailing_line_ending};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a blank line.
+    ///
+    /// The `offset` for blank lines is from the leftmost column, not from the parent block's
+    /// indentation.
+    pub fn BlankLine(offset:usize, trailing_line_ending:LineEnding) -> Token {
+        let length = trailing_line_ending.size();
+        let shape = Shape::BlankLine(trailing_line_ending);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a block.
+    pub fn Block
+    ( block_type : BlockType
+    , indent : usize
+    , lines : Vec<Token>
+    , offset : usize
+    ) -> Token {
+        let length = lines.iter().map(|line| {
+            let line_length = line.length;
+            let line_offset = line.offset;
+            match line.shape {
+                Shape::Line{..} => indent + line_offset + line_length,
+                Shape::BlankLine(_) => line_offset + line_length,
+                _ => unreachable_panic!("Tokens in a block should always be lines."),
+            }
+        }).sum();
+        let shape = Shape::Block{block_type,indent,lines};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an invalid suffix.
+    pub fn InvalidSuffix(text:impl Str, offset:usize) -> Token {
+        let str = text.into();
+        let length = str.chars().count();
+        let shape = Shape::InvalidSuffix(str);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an unrecognised lexeme.
+    pub fn Unrecognized(text:impl Str, offset:usize) -> Token {
+        let str = text.into();
+        let length = str.chars().count();
+        let shape = Shape::Unrecognized(str);
+        Token{shape,length,offset}
+    }
+}
+
+
+
+// =================
+// === BlockType ===
+// =================
+
+/// The type for an Enso Block token.
+#[derive(Copy,Clone,Debug,PartialEq,Eq)]
+pub enum BlockType {
+    /// A block made up of arguments to a function.
+    Continuous,
+    /// A block made up of separate lines.
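+    ///
+    /// A block is marked as discontinuous when it immediately follows an operator token
+    /// (see `block_submit` in the lexer definition).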
+    Discontinuous,
+}
+
+
+
+// ==================
+// === LineEnding ===
+// ==================
+
+/// The type of newline associated with the line.
+#[derive(Copy,Clone,Debug,Display,PartialEq,Eq)]
+pub enum LineEnding {
+    /// There is no newline.
+    None,
+    /// The unix-style line-feed (`'\n'`).
+    LF,
+    /// The windows-style carriage-return, line-feed (`"\r\n"`).
+    CRLF
+}
+
+impl LineEnding {
+    /// Get the number of rust `char`s that the newline type takes up.
+    pub fn size(self) -> usize {
+        match self {
+            Self::None => 0,
+            Self::LF => 1,
+            Self::CRLF => 2,
+        }
+    }
+}
+
+
+// === Trait Impls ===
+
+impl Default for LineEnding {
+    fn default() -> Self {
+        LineEnding::None
+    }
+}
+
+
+
+// =============
+// === Shape ===
+// =============
+
+/// The shapes of tokens needed by the Enso lexer.
+///
+/// This is a very small set of shapes, because the [`Token`] type only deals with the tokens that
+/// the lexer works with, not the full complexity of Enso's syntax.
+#[allow(missing_docs)]
+#[derive(Clone,Debug,PartialEq,Eq)]
+pub enum Shape {
+    // === Identifiers ===
+
+    /// An identifier in referent form.
+    Referent(String),
+    /// An identifier in variable form.
+    Variable(String),
+    /// An identifier not conforming to the Enso identifier rules (e.g. a Java identifier).
+    External(String),
+    /// A blank identifier (`_`).
+    Blank,
+    /// An operator identifier.
+    Operator(String),
+    /// A modifier identifier.
+    Modifier(String),
+
+    // === Literals ===
+
+    /// A literal number.
+    Number{base:String, number:String},
+    /// A dangling base from a number literal.
+    DanglingBase(String),
+    /// A text literal.
+    ///
+    /// This is currently way too simplistic to actually represent text, but it is a good
+    /// placeholder.
+    Text(String),
+
+    // === Lines ===
+
+    /// A line containing tokens.
+    ///
+    /// The offset for a line is always zero, as it is contained in a block with a defined
+    /// indentation.
+    Line{
+        /// The tokens on the line.
+        tokens : Vec<Token>,
+        /// The line ending that _ends_ the line.
+        ///
+        /// Please note that the concept of 'ending' the line is a bit strange, as blocks are
+        /// treated as tokens in their own right, and hence are included in lines.
+        trailing_line_ending : LineEnding
+    },
+    /// A blank line.
+    ///
+    /// The offset for a blank line is from the leftmost column, as it may be negative from the
+    /// block's indentation level.
+    BlankLine(LineEnding),
+
+    // === Block ===
+
+    /// A block of tokens.
+    Block {
+        /// The type of the block.
+        block_type : BlockType,
+        /// The leading indentation of the block.
+        indent : usize,
+        /// The lines in the block.
+        lines : Vec<Token>,
+    },
+
+    // === Errors ===
+
+    /// An invalid suffix.
+    InvalidSuffix(String),
+    /// An unrecognized token.
+    Unrecognized(String),
+}
+
+impl Shape {
+
+    /// Construct an identifier in referent form.
+    pub fn referent(name:impl Into<String>) -> Shape {
+        Shape::Referent(name.into())
+    }
+
+    /// Construct an identifier in variable form.
+    pub fn variable(name:impl Into<String>) -> Shape {
+        Shape::Variable(name.into())
+    }
+
+    /// Construct an identifier in external form.
+    pub fn external(name:impl Into<String>) -> Shape {
+        Shape::External(name.into())
+    }
+
+    /// Construct a blank identifier.
+    ///
+    /// This is provided as a function for completeness.
+    pub fn blank() -> Shape {
+        Shape::Blank
+    }
+
+    /// Construct an operator identifier.
+    pub fn operator(opr:impl Into<String>) -> Shape {
+        Shape::Operator(opr.into())
+    }
+
+    /// Construct a modifier identifier.
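+    ///
+    /// Note that the shape stores only the base operator: lexing `+=` produces
+    /// `Shape::Modifier("+")` (see the modifier tests in the generation crate).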
+    pub fn modifier(opr:impl Into<String>) -> Shape {
+        Shape::Modifier(opr.into())
+    }
+
+    /// Construct a number literal.
+    pub fn number(base:impl Into<String>, num:impl Into<String>) -> Shape {
+        Shape::Number{base:base.into(),number:num.into()}
+    }
+
+    /// Construct a dangling base literal.
+    pub fn dangling_base(base:impl Into<String>) -> Shape {
+        Shape::DanglingBase(base.into())
+    }
+
+    /// Construct a text literal.
+    pub fn text(text:impl Into<String>) -> Shape {
+        Shape::Text(text.into())
+    }
+
+    /// Construct a line that contains tokens.
+    pub fn line(tokens:Vec<Token>, trailing_line_ending:LineEnding) -> Shape {
+        Shape::Line{tokens,trailing_line_ending}
+    }
+
+    /// Construct a line that is blank.
+    pub fn blank_line(trailing_line_ending:LineEnding) -> Shape {
+        Shape::BlankLine(trailing_line_ending)
+    }
+
+    /// Construct a block containing lines.
+    pub fn block(block_type:BlockType, indent:usize, lines:Vec<Token>) -> Shape {
+        Shape::Block{block_type,indent,lines}
+    }
+
+    /// Construct an invalid suffix.
+    pub fn invalid_suffix(text:impl Into<String>) -> Shape {
+        Shape::InvalidSuffix(text.into())
+    }
+
+    /// Construct an unrecognised token.
+    pub fn unrecognized(text:impl Into<String>) -> Shape {
+        Shape::Unrecognized(text.into())
+    }
+}
+
+
+
+// ==============
+// === Stream ===
+// ==============
+
+/// A representation of the Enso token stream.
+#[derive(Clone,Debug,Default,PartialEq)]
+pub struct Stream {
+    /// The tokens in the token stream.
+    tokens:Vec<Token>
+}
+
+impl Stream {
+    /// Append the provided `token` to the token stream.
+    pub fn append(&mut self, token:Token) {
+        self.tokens.push(token)
+    }
+
+    /// Get a reference to the tokens in the stream.
+    pub fn tokens(&self) -> &Vec<Token> {
+        &self.tokens
+    }
+
+    /// Get the length of the elements in the token stream.
+    pub fn tokens_len(&self) -> usize {
+        self.tokens.iter().map(|token|token.length + token.offset).sum()
+    }
+}
+
+impl Deref for Stream {
+    type Target = Vec<Token>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.tokens
+    }
+}
+
+impl DerefMut for Stream {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.tokens
+    }
+}
+
+
+// === Trait Impls ===
+
+impl From<Vec<Token>> for Stream {
+    fn from(tokens:Vec<Token>) -> Self {
+        Stream{tokens}
+    }
+}
+
+impl Into<Vec<Token>> for Stream {
+    fn into(self) -> Vec<Token> {
+        self.tokens
+    }
+}
+
+
+
+// =============
+// === Tests ===
+// =============
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::token::BlockType;
+
+
+    // === Testing Utilities ===
+
+    /// Asserts that the `token` has the provided `shape`.
+    pub fn assert_shape(token:&Token, shape:Shape) {
+        assert_eq!(token.shape,shape);
+    }
+
+    /// Asserts that the `token` has the provided `length`.
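+    ///
+    /// Note that `length` does not include the token's trailing `offset`; the two are
+    /// summed by `Token::source_length`.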
+ pub fn assert_length(token:&Token, length:usize) { + assert_eq!(token.length,length) + } + + + // === Tests for Token Construction === + + #[test] + fn construct_referent_token() { + let token = Token::Referent("Some_Ref_Name",0); + assert_shape(&token,Shape::referent("Some_Ref_Name")); + assert_length(&token,13); + } + + #[test] + fn construct_variable_token() { + let token = Token::Variable("some_variable_name",0); + assert_shape(&token,Shape::variable("some_variable_name")); + assert_length(&token,18); + } + + #[test] + fn construct_external_name_token() { + let token = Token::External("camelCase",0); + assert_shape(&token,Shape::external("camelCase")); + assert_length(&token,9); + } + + #[test] + fn construct_blank_token() { + let token = Token::Blank(0); + assert_shape(&token,Shape::blank()); + assert_length(&token,1); + } + + #[test] + fn construct_operator_token() { + let token = Token::Operator("==>",0); + assert_shape(&token,Shape::operator("==>")); + assert_length(&token,3); + } + + #[test] + fn construct_modifier_token() { + let token = Token::Modifier("+",0); + assert_shape(&token,Shape::modifier("+")); + assert_length(&token,2); + } + + #[test] + fn construct_number_token() { + let token = Token::Number("","1231",0); + assert_shape(&token,Shape::number("","1231")); + assert_length(&token,4); + } + + #[test] + fn construct_dangling_base_token() { + let token = Token::DanglingBase("15",0); + assert_shape(&token,Shape::dangling_base("15")); + assert_length(&token,3); + } + + #[test] + fn construct_text_token() { + let token = Token::Text("some prose goes here",0); + assert_shape(&token,Shape::text("some prose goes here")); + assert_length(&token,20); + // TODO [AA] Make this internally account for length of quotes. + } + + #[test] + fn construct_line_token() { + let tokens = vec![Token::Variable("aa",0),Token::Referent("Abc",1)]; + let token = Token::Line(tokens.clone(), 4, LineEnding::LF); + assert_shape(&token,Shape::line(tokens.clone(), LineEnding::LF)); + assert_length(&token,7); + } + + #[test] + fn construct_blank_line_token() { + let token = Token::BlankLine(13,LineEnding::LF); + assert_shape(&token, Shape::blank_line(LineEnding::LF)); + assert_length(&token,1); + } + + #[test] + fn construct_block_token_lf() { + let lines = vec![ + Token::Line(vec![],0,LineEnding::LF), + Token::Line(vec![],4,LineEnding::LF) + ]; + let token = Token::Block(BlockType::Continuous,4,lines.clone(),0); + assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone())); + assert_length(&token,14); + } + + #[test] + fn construct_block_token_crlf() { + let lines = vec![ + Token::Line(vec![],0,LineEnding::CRLF), + Token::Line(vec![],4,LineEnding::CRLF) + ]; + let token = Token::Block(BlockType::Continuous,4,lines.clone(),0); + assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone())); + assert_length(&token,16); + } + + #[test] + fn construct_invalid_suffix_token() { + let token = Token::InvalidSuffix("aaa",0); + assert_shape(&token,Shape::invalid_suffix("aaa")); + assert_length(&token,3); + } + + #[test] + fn construct_unrecognized_token() { + let token = Token::Unrecognized("a",0); + assert_shape(&token,Shape::unrecognized("a")); + assert_length(&token,1); + } +} diff --git a/lib/rust/lexer/generation/Cargo.toml b/lib/rust/lexer/generation/Cargo.toml new file mode 100644 index 0000000000..3ec4129086 --- /dev/null +++ b/lib/rust/lexer/generation/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "lexer" +version = "0.1.0" +authors = ["Enso Team "] +edition = "2018" + +publish 
= false
+
+[lib]
+crate-type = ["cdylib", "rlib"]
+test = true
+bench = true
+
+[dependencies]
+flexer = { path = "../../flexer", version = "0.1.0" }
+enso-prelude = { path = "../../enso-prelude", version = "0.1.0" }
+lexer-definition = { path = "../definition", version = "0.1.0" }
+
+[build-dependencies]
+flexer = { path = "../../flexer", version = "0.1.0" }
+lexer-definition = { path = "../definition", version = "0.1.0" }
diff --git a/lib/rust/lexer/generation/build.rs b/lib/rust/lexer/generation/build.rs
new file mode 100644
index 0000000000..f909095126
--- /dev/null
+++ b/lib/rust/lexer/generation/build.rs
@@ -0,0 +1,32 @@
+use std::fs::File;
+use lexer_definition::lexer::EnsoLexer;
+use std::io::prelude::*;
+use flexer::Definition;
+use flexer::State;
+
+
+
+/// Generates the lexer engine and saves the result into the file `src/generated/engine.rs`.
+///
+/// The content of the generated file can be used with the `include!` macro.
+fn generate_engine() -> std::io::Result<()> {
+    let definition_path = "../definition/src/lexer.rs";
+    let output_directory = "src/generated";
+    let _ = std::fs::create_dir(output_directory);
+    let output_path = "src/generated/engine.rs";
+    let definition_error = format!("The lexer definition should exist at {}.",definition_path);
+    let output_error = format!("Cannot open output file at {}.",output_path);
+    let mut lexer_def = File::open(definition_path).expect(definition_error.as_str());
+    let mut contents = String::new();
+    let mut file = File::create(output_path).expect(output_error.as_str());
+    let lexer = EnsoLexer::define();
+    let engine = lexer.specialize().unwrap();
+    lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition.");
+    file.write_all(contents.as_bytes()).expect("Unable to write lexer definition.");
+    file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization.");
+    Ok(())
+}
+
+fn main() -> std::io::Result<()> {
+    generate_engine()
+}
diff --git a/lib/rust/lexer/generation/src/generated.rs b/lib/rust/lexer/generation/src/generated.rs
new file mode 100644
index 0000000000..32d94831fe
--- /dev/null
+++ b/lib/rust/lexer/generation/src/generated.rs
@@ -0,0 +1,3 @@
+//! This module re-exports the generated lexer sources.
+
+pub mod engine;
diff --git a/lib/rust/lexer/generation/src/lib.rs b/lib/rust/lexer/generation/src/lib.rs
new file mode 100644
index 0000000000..fedbc12bae
--- /dev/null
+++ b/lib/rust/lexer/generation/src/lib.rs
@@ -0,0 +1,25 @@
+#![feature(test)]
+#![deny(unconditional_recursion)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(trivial_casts)]
+#![warn(trivial_numeric_casts)]
+#![warn(unsafe_code)]
+#![warn(unused_import_braces)]
+
+//! This module exports the interface to the generated Enso lexer.
+
+pub mod generated;
+
+/// Support libraries for the lexer definition.
+///
+/// This is an intentional re-export in this crate's namespace.
+mod library {
+    pub use lexer_definition::library::*;
+}
+
+/// A library of commonly useful functionality.
+mod prelude {
+    pub use lexer_definition::prelude::*;
+}
diff --git a/lib/rust/lexer/generation/tests/enso_lexer.rs b/lib/rust/lexer/generation/tests/enso_lexer.rs
new file mode 100644
index 0000000000..caf6eed33f
--- /dev/null
+++ b/lib/rust/lexer/generation/tests/enso_lexer.rs
@@ -0,0 +1,759 @@
+#![feature(test)]
+#![deny(unconditional_recursion)]
+#![warn(missing_copy_implementations)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![warn(trivial_casts)]
+#![warn(trivial_numeric_casts)]
+#![warn(unsafe_code)]
+#![warn(unused_import_braces)]
+
+//! This file contains tests for the Enso Lexer.
+
+// TODO [AA] Tests for error scenarios once it's done.
+
+use flexer::*;
+use lexer_definition::library::*;
+
+use flexer::prelude::reader::decoder::DecoderUTF8;
+use flexer::prelude::Reader;
+use lexer::generated::engine::EnsoLexer;
+use lexer_definition::library::token::Token;
+use lexer_definition::token::BlockType;
+use lexer_definition::token::LineEnding;
+
+
+
+// =================
+// === Utilities ===
+// =================
+
+/// Assert that `result` is a success with tokens `expected`.
+fn assert_succeeds_as(result:&LexingResult<token::Stream>, expected:token::Stream) {
+    match result.kind {
+        ResultKind::Success => assert_eq!(result.tokens,expected),
+        _ => panic!("Lexing failed.")
+    }
+}
+
+/// Assert that the provided input lexes as `expected`.
+fn assert_lexes(input:impl AsRef<str>, expected:token::Stream) {
+    let input_len = input.as_ref().chars().count();
+    let result = lex(input);
+    assert_succeeds_as(&result,expected);
+    let tokens_vec : Vec<_> = result.tokens.into();
+    let total_length : usize = tokens_vec.iter().map(|token| token.offset + token.length).sum();
+    assert_eq!(total_length,input_len);
+}
+
+/// Lex the provided string.
+fn lex(input:impl AsRef<str>) -> LexingResult<token::Stream> {
+    let mut lexer = EnsoLexer::new();
+    let reader = Reader::new(input.as_ref().as_bytes(),DecoderUTF8());
+    lexer.run(reader)
+}
+
+/// Asserts that the input is a block and has a length equal to `length`.
+fn assert_block_has_length(input:impl AsRef<str>, expected_length:usize) {
+    let result = lex(input);
+    match result.kind {
+        ResultKind::Success => {
+            let tokens = result.tokens.tokens();
+            match tokens.first().expect("Token should be present.") {
+                Token{shape:token::Shape::Block{..},length,..} =>
+                    assert_eq!(*length,expected_length),
+                _ => panic!("Token not a block."),
+            }
+        },
+        _ => panic!("Lexing failed."),
+    }
+}
+
+/// Makes the test text have unix line endings to ensure consistency regardless of git checkout
+/// style.
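+///
+/// It does this by filtering every `'\r'` character out of the input.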
+fn make_unix_line_endings(input:&str) -> String {
+    let string = String::from(input);
+    string.chars().filter(|c| *c != '\r').collect()
+}
+
+
+
+// =================
+// === Operators ===
+// =================
+
+#[test]
+fn function_operator() {
+    let input = "->";
+    let expected = token::Stream::from(vec![Token::Operator("->",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn bind_operator() {
+    let input = "<-";
+    let expected = token::Stream::from(vec![Token::Operator("<-",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn left_pipe_operator() {
+    let input = "<|";
+    let expected = token::Stream::from(vec![Token::Operator("<|",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn right_pipe_operator() {
+    let input = "|>";
+    let expected = token::Stream::from(vec![Token::Operator("|>",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn eq_operator() {
+    let input = "=";
+    let expected = token::Stream::from(vec![Token::Operator("=",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn eq_compare_operator() {
+    let input = "==";
+    let expected = token::Stream::from(vec![Token::Operator("==",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn geq_operator() {
+    let input = ">=";
+    let expected = token::Stream::from(vec![Token::Operator(">=",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn neq_operator() {
+    let input = "!=";
+    let expected = token::Stream::from(vec![Token::Operator("!=",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn dot_operator() {
+    let input = ".";
+    let expected = token::Stream::from(vec![Token::Operator(".",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn comma_operator() {
+    let input = ",";
+    let expected = token::Stream::from(vec![Token::Operator(",",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn double_dot_operator() {
+    let input = "..";
+    let expected = token::Stream::from(vec![Token::Operator("..",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn triple_dot_operator() {
+    let input = "...";
+    let expected = token::Stream::from(vec![Token::Operator("...",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn error_operator() {
+    let input = "!";
+    let expected = token::Stream::from(vec![Token::Operator("!",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn type_ascription_operator() {
+    let input = ":";
+    let expected = token::Stream::from(vec![Token::Operator(":",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn in_operator() {
+    let input = "in";
+    let expected = token::Stream::from(vec![Token::Operator("in",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn typeset_union_operator() {
+    let input = "|";
+    let expected = token::Stream::from(vec![Token::Operator("|",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn typeset_intersection_operator() {
+    let input = "&";
+    let expected = token::Stream::from(vec![Token::Operator("&",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn typeset_subtraction_operator() {
+    let input = "\\";
+    let expected = token::Stream::from(vec![Token::Operator("\\",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn disable_comment() {
+    let input = "#";
+    let expected = token::Stream::from(vec![Token::Operator("#",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn doc_comment() {
+    let input = "##";
+    let expected = token::Stream::from(vec![Token::Operator("##",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn arbitrary_left_operator() {
+    let input = "<!!-";
+    let expected = token::Stream::from(vec![Token::Operator("<!!-",0)]);
+    assert_lexes(input,expected);
+}
+
+#[test]
+fn arbitrary_right_operator() {
+    let input = "-->>";
+    let expected = token::Stream::from(vec![Token::Operator("-->>",0)]);
assert_lexes(input,expected); +} + +#[test] +fn modifier_plus() { + let input = "+="; + let expected = token::Stream::from(vec![Token::Modifier("+",0)]); + assert_lexes(input,expected); +} + +#[test] +fn modifier_minus() { + let input = "-="; + let expected = token::Stream::from(vec![Token::Modifier("-",0)]); + assert_lexes(input,expected); +} + +#[test] +fn arbitrary_modifier() { + let input = "<%="; + let expected = token::Stream::from(vec![Token::Modifier("<%",0)]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_eq_suffix() { + let input = "==="; + let expected = token::Stream::from(vec![Token::Operator("==",0),Token::InvalidSuffix("=",0)]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_dots_suffix() { + let input = "...."; + let expected = token::Stream::from(vec![Token::Operator("...",0),Token::InvalidSuffix(".",0)]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_modifier_suffix() { + let input = "+=="; + let expected = token::Stream::from(vec![Token::Operator("+",0),Token::InvalidSuffix("==",0)]); + assert_lexes(input,expected); +} + + + +// =================== +// === Identifiers === +// =================== + +#[test] +fn variable_ident() { + let input = "some_variable_name"; + let expected = token::Stream::from(vec![Token::Variable("some_variable_name",0)]); + assert_lexes(input,expected) +} + +#[test] +fn referent_ident() { + let input = "Some_Referent_Name"; + let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name",0)]); + assert_lexes(input,expected) +} + +#[test] +fn external_ident() { + let input = "__camelCaseIdentifier"; + let expected = token::Stream::from(vec![Token::External("__camelCaseIdentifier",0)]); + assert_lexes(input,expected) +} + +#[test] +fn blank_ident() { + let input = "_"; + let expected = token::Stream::from(vec![Token::Blank(0)]); + assert_lexes(input,expected) +} + +#[test] +fn ticked_variable_ident() { + let input = "some_variable_name'"; + let expected = token::Stream::from(vec![Token::Variable("some_variable_name'",0)]); + assert_lexes(input,expected) +} + +#[test] +fn ticked_referent_ident() { + let input = "Some_Referent_Name'"; + let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'",0)]); + assert_lexes(input,expected) +} + +#[test] +fn multi_ticked_variable_ident() { + let input = "some_variable_name'''"; + let expected = token::Stream::from(vec![Token::Variable("some_variable_name'''",0)]); + assert_lexes(input,expected) +} + +#[test] +fn multi_ticked_referent_ident() { + let input = "Some_Referent_Name'''"; + let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'''",0)]); + assert_lexes(input,expected) +} + +#[test] +fn variable_with_numbers() { + let input = "some0_1"; + let expected = token::Stream::from(vec![Token::Variable("some0_1",0)]); + assert_lexes(input,expected) +} + +#[test] +fn referent_with_numbers() { + let input = "Some_1821"; + let expected = token::Stream::from(vec![Token::Referent("Some_1821",0)]); + assert_lexes(input,expected) +} + +#[test] +fn tick_not_at_end_variable() { + let input = "some_var'iable"; + let expected = token::Stream::from(vec![ + Token::Variable("some_var'",0), + Token::InvalidSuffix("iable",0), + ]); + assert_lexes(input,expected) +} + +#[test] +fn trailing_underscore() { + let input = "some_var_"; + let expected = token::Stream::from(vec![Token::External("some_var_",0)]); + assert_lexes(input,expected) +} + +#[test] +fn trailing_underscore_with_tick() { + let input = "some_var_'"; + let expected = 
token::Stream::from(vec![Token::External("some_var_'",0)]); + assert_lexes(input,expected) +} + +#[test] +fn invalid_suffix() { + let input = "some_varД"; + let expected = token::Stream::from(vec![ + Token::Variable("some_var",0), + Token::InvalidSuffix("Д",0), + ]); + assert_lexes(input,expected) +} + +#[test] +fn unrecognized_token() { + let input = "some_var`"; + let expected = token::Stream::from(vec![ + Token::Variable("some_var",0), + Token::Unrecognized("`",0), + ]); + assert_lexes(input,expected) +} + +#[test] +fn chained_identifiers() { + let input = "my_func A' someJavaValue some_python_value"; + let expected = token::Stream::from(vec![ + Token::Variable("my_func",0), + Token::Referent("A'",1), + Token::External("someJavaValue",1), + Token::Variable("some_python_value",1), + ]); + assert_lexes(input,expected) +} + + + +// =============== +// === Numbers === +// =============== + +#[test] +fn integer() { + let input = "13831"; + let expected = token::Stream::from(vec![Token::Number("","13831",0)]); + assert_lexes(input,expected); +} + +#[test] +fn integer_with_explicit_base() { + let input = "10_13831"; + let expected = token::Stream::from(vec![Token::Number("10","13831",0)]); + assert_lexes(input,expected); +} + +#[test] +fn dangling_base() { + let input = "10_"; + let expected = token::Stream::from(vec![Token::DanglingBase("10",0)]); + assert_lexes(input,expected); +} + +#[test] +fn hex_number() { + let input = "16_ff"; + let expected = token::Stream::from(vec![Token::Number("16","ff",0)]); + assert_lexes(input,expected); +} + +#[test] +fn decimal() { + let input = "2.71828"; + let expected = token::Stream::from(vec![Token::Number("","2.71828",0)]); + assert_lexes(input,expected); +} + +#[test] +fn decimal_with_explicit_base() { + let input = "10_2.71828"; + let expected = token::Stream::from(vec![Token::Number("10","2.71828",0)]); + assert_lexes(input,expected); +} + +#[test] +fn error_base() { + let input = "10.2_2"; + let expected = token::Stream::from(vec![ + Token::Number("","10.2",0), + Token::InvalidSuffix("_2",0), + ]); + assert_lexes(input,expected); +} + +#[test] +fn offset_number() { + let input = " 10.2"; + let expected = token::Stream::from(vec![ + Token::Number("","10.2",4), + ]); + assert_lexes(input,expected); +} + + + +// ============ +// === Text === +// ============ + + + +// ============== +// === Blocks === +// ============== + +#[test] +fn block_function_call() { + let input = make_unix_line_endings( +r#"f + argument_1 + argument_2 + fn a1 a2 a3 + argument_4 + argument_5"#); + let block_fn_args = + Token::Block( + BlockType::Continuous, + 4, + vec![ + Token::Line( + vec![Token::Variable("argument_1",0)], + 0, + LineEnding::LF + ), + Token::Line( + vec![ + Token::Variable("argument_2",0), + ], + 0, + LineEnding::LF + ), + Token::Line( + vec![ + Token::Variable("fn",0), + Token::Variable("a1",1), + Token::Variable("a2",1), + Token::Variable("a3",1), + ], + 0, + LineEnding::LF + ), + Token::Line( + vec![ + Token::Variable("argument_4",0), + ], + 0, + LineEnding::LF + ), + Token::Line( + vec![ + Token::Variable("argument_5",0), + ], + 0, + LineEnding::None + ), + ], + 0 + ); + let top_level_first_line = Token::Line( + vec![ + Token::Variable("f",0), + block_fn_args + ], + 0, + LineEnding::LF + ); + let top_level_block = token::Stream::from(vec![ + Token::Block( + BlockType::Continuous, + 0, + vec![top_level_first_line], + 0 + ) + ]); + assert_lexes(input,top_level_block); +} + + +#[test] +fn block_empty_lines() { + let input = "f\r\n a\n\n b\n"; + let 
nested_block = Token::Block( + BlockType::Continuous, + 4, + vec![ + Token::Line(vec![Token::Variable("a",0)],0,LineEnding::LF), + Token::BlankLine(0,LineEnding::LF), + Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF), + ], + 0 + ); + let top_line = Token::Line( + vec![ + Token::Variable("f",0), + nested_block + ], + 0, + LineEnding::CRLF + ); + let expected = token::Stream::from(vec![ + Token::Block( + BlockType::Continuous, + 0, + vec![top_line], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn block_top_level() { + let input = make_unix_line_endings( +r#" + +foo +bar +baz +"#); + let expected = token::Stream::from(vec![ + Token::Block( + BlockType::Continuous, + 0, + vec![ + Token::BlankLine(0,LineEnding::LF), + Token::BlankLine(0,LineEnding::LF), + Token::Line(vec![Token::Variable("foo",0)],0,LineEnding::LF), + Token::Line(vec![Token::Variable("bar",0)],0,LineEnding::LF), + Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn block_with_operator() { + let input = make_unix_line_endings( +r#"x -> + foo x 1 +"#); + let nested_block = Token::Block( + BlockType::Discontinuous, + 4, + vec![ + Token::Line(vec![ + Token::Variable("foo",0), + Token::Variable("x",1), + Token::Number("","1",1), + ], 0, LineEnding::LF) + ], + 0 + ); + let expected = token::Stream::from(vec![ + Token::Block( + BlockType::Continuous, + 0, + vec![ + Token::Line(vec![ + Token::Variable("x",0), + Token::Operator("->",1), + nested_block + ], 0, LineEnding::LF) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn block_with_nesting() { + let input = make_unix_line_endings(r#" +some_long_thing + foo -> + Bar + baz + + quux +"#); + let function_block = Token::Block( + BlockType::Discontinuous, + 8, + vec![ + Token::Line(vec![Token::Referent("Bar",0)],0,LineEnding::LF), + Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF), + Token::BlankLine(0,LineEnding::LF), + ], + 0 + ); + let foo_block = Token::Block( + BlockType::Continuous, + 4, + vec![ + Token::Line(vec![ + Token::Variable("foo",0), + Token::Operator("->",1), + function_block, + ], 0, LineEnding::LF), + Token::Line(vec![Token::Variable("quux",0)],0,LineEnding::LF), + ], + 0 + ); + let expected = token::Stream::from(vec![ + Token::Block( + BlockType::Continuous, + 0, + vec![ + Token::BlankLine(0,LineEnding::LF), + Token::Line(vec![ + Token::Variable("some_long_thing",0), + foo_block + ], 0, LineEnding::LF), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn block_extra_indented_blank_lines() { + let input = "a\n b\n \n \n c"; + let indented_block = Token::Block( + BlockType::Continuous, + 4, + vec![ + Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF), + Token::BlankLine(8,LineEnding::LF), + Token::BlankLine(2,LineEnding::LF), + Token::Line(vec![Token::Variable("c",0)],0,LineEnding::None), + ], + 0 + ); + let top_level_line = Token::Line(vec![ + Token::Variable("a",0), + indented_block + ],0,LineEnding::LF); + let expected = token::Stream::from(vec![ + Token::Block( + BlockType::Continuous, + 0, + vec![top_level_line], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn block_length_unix() { + let input = "a\n b\n c"; + assert_block_has_length(input,13); +} + +#[test] +fn block_length_windows() { + let input = "a\r\n b\r\n c"; + assert_block_has_length(input,15); +} + +#[test] +fn block_length_mixed() { + let input = "a\r\n b\n c\n d"; + assert_block_has_length(input,20); +} + + + +// ================ 
+// === Combined === +// ================ diff --git a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala b/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala index c92c27b9c0..14aa242a4f 100644 --- a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala +++ b/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala @@ -33,7 +33,6 @@ case class ParserDef() extends flexer.Parser[AST.Module] { val lowerLetter: Pattern = range('a', 'z') val upperLetter: Pattern = range('A', 'Z') val digit: Pattern = range('0', '9') - val hex: Pattern = digit | range('a', 'f') | range('A', 'F') val alphaNum: Pattern = digit | lowerLetter | upperLetter val space: Pattern = ' '.many1 val newline: Pattern = '\n'