diff --git a/.gitignore b/.gitignore index 110d369e07..6e7b1f0d45 100644 --- a/.gitignore +++ b/.gitignore @@ -97,6 +97,7 @@ bench-report.xml .editorconfig .bloop/ .bsp/ +project/metals.sbt ################# ## Build Cache ## diff --git a/Cargo.toml b/Cargo.toml index 272ea8c8a1..256a3f9358 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,9 +2,6 @@ members = [ "lib/rust/ast", - "lib/rust/flexer", - "lib/rust/flexer-testing/definition", - "lib/rust/flexer-testing/generation", "lib/rust/launcher-shims", "lib/rust/lexer/definition", "lib/rust/lexer/generation", @@ -15,16 +12,17 @@ members = [ # assumes you have `rust-lib` in the same directory as `enso`. See: # https://github.com/enso-org/rust-lib/blob/main/docs/CONTRIBUTING.md#developing-in-conjunction-with-enso--ide [patch.crates-io] -# enso-automata = { path = '../rust-lib/src/automata' } -# enso-data = { path = '../rust-lib/src/data' } -# enso-generics = { path = '../rust-lib/src/generics' } -# enso-lazy-reader = { path = '../rust-lib/src/lazy-reader' } -# enso-logger = { path = '../rust-lib/src/logger' } -# enso-macro-utils = { path = '../rust-lib/src/macro-utils' } -# enso-optics = { path = '../rust-lib/src/optics' } -# enso-prelude = { path = '../rust-lib/src/prelude' } -# enso-shapely = { path = '../rust-lib/src/shapely/impl' } -# enso-shapely-macros = { path = '../rust-lib/src/shapely/macros' } +#enso-automata = { path = '../rust-lib/src/automata' } +#enso-data = { path = '../rust-lib/src/data' } +#enso-flexer = { path = '../rust-lib/src/flexer' } +#enso-generics = { path = '../rust-lib/src/generics' } +#enso-lazy-reader = { path = '../rust-lib/src/lazy-reader' } +#enso-logger = { path = '../rust-lib/src/logger' } +#enso-macro-utils = { path = '../rust-lib/src/macro-utils' } +#enso-optics = { path = '../rust-lib/src/optics' } +#enso-prelude = { path = '../rust-lib/src/prelude' } +#enso-shapely = { path = '../rust-lib/src/shapely/impl' } +#enso-shapely-macros = { path = '../rust-lib/src/shapely/macros' } [profile.dev] opt-level = 0 @@ -36,7 +34,6 @@ debug-assertions = true opt-level = 3 lto = true debug = false -panic = 'abort' debug-assertions = false [profile.bench] diff --git a/build.sbt b/build.sbt index b6b218859c..f5f19883b3 100644 --- a/build.sbt +++ b/build.sbt @@ -471,6 +471,45 @@ lazy val syntax = crossProject(JVMPlatform, JSPlatform) Compile / fullOptJS / artifactPath := file("target/scala-parser.js") ) +lazy val `lexer-bench` = + (project in file("lib/scala/syntax/specialization/lexer-bench")) + .settings( + commands += WithDebugCommand.withDebug, + inConfig(Compile)(truffleRunOptionsSettings), + inConfig(Benchmark)(Defaults.testSettings), + parallelExecution in Test := false, + logBuffered in Test := false, + Test / fork := true, + libraryDependencies ++= jmh + ) + .configs(Test) + .configs(Benchmark) + .dependsOn(syntax.jvm) + .dependsOn(flexer.jvm) + .settings( + javaOptions ++= Seq( + "-Xms4096m", + "-Xmx4096m", + "-XX:+FlightRecorder", + ), + mainClass in Benchmark := Some("org.openjdk.jmh.Main"), + bench := Def.task { + (run in Benchmark).toTask("").value + }, + benchOnly := Def.inputTaskDyn { + import complete.Parsers.spaceDelimited + val name = spaceDelimited("<name>").parsed match { + case List(name) => name + case _ => + throw new IllegalArgumentException("Expected one argument.") + } + Def.task { + (testOnly in Benchmark).toTask(" -- -z " + name).value + } + }.evaluated, + parallelExecution in Benchmark := false + ) + lazy val `parser-service` = (project in file("lib/scala/parser-service")) .dependsOn(syntax.jvm)
.settings( diff --git a/docs/parser/README.md b/docs/parser/README.md index b36dfd3b9d..15b13ebf93 100644 --- a/docs/parser/README.md +++ b/docs/parser/README.md @@ -26,8 +26,6 @@ below: the implementation technologies for the parser. - [**Parser Architecture:**](./architecture.md) An overview of the architecture of the parser as a whole. -- [**Flexer:**](./flexer.md) An overview of the design and architecture of the - flexer, a generic, DFA-based lexing engine. - [**Lexer:**](./lexer.md) The Enso lexer, responsible for tokenising the input stream of source code. - [**Macro Resolution:**](./macro-resolution.md) The system for defining and diff --git a/docs/parser/ast.md b/docs/parser/ast.md index a76d4bb84d..450789f5d2 100644 --- a/docs/parser/ast.md +++ b/docs/parser/ast.md @@ -3,7 +3,7 @@ layout: developer-doc title: AST category: parser tags: [parser, ast] -order: 9 +order: 8 --- # AST diff --git a/docs/parser/construct-resolution.md b/docs/parser/construct-resolution.md index 47f22c67d2..637be6bdad 100644 --- a/docs/parser/construct-resolution.md +++ b/docs/parser/construct-resolution.md @@ -3,7 +3,7 @@ layout: developer-doc title: Construct Resolution category: parser tags: [parser, construct, resolution] -order: 7 +order: 6 --- # Construct Resolution diff --git a/docs/parser/flexer.md b/docs/parser/flexer.md deleted file mode 100644 index 75ecd5ec7d..0000000000 --- a/docs/parser/flexer.md +++ /dev/null @@ -1,199 +0,0 @@ ---- -layout: developer-doc -title: Flexer -category: syntax -tags: [parser, flexer, lexer, dfa] -order: 3 ---- - -# Flexer - -The flexer is a finite-automata-based engine for the definition and generation -of lexers. Akin to `flex`, and other lexer generators, the user may use it to -define a series of rules for lexing their language, which are then used by the -flexer to generate a highly-efficient lexer implementation. - -Where the flexer differs from other programs in this space, however, is the -power that it gives users. When matching a rule, the flexer allows its users to -execute _arbitrary_ Rust code, which may even manipulate the lexer's state and -position. This means that the languages that can be lexed by the flexer extend -from the simplest regular grammars right up to unrestricted grammars (but please -don't write a programming language whose syntax falls into this category). It -also differs in that it chooses the first complete match for a rule, rather than -the longest one, which makes lexers much easier to define and maintain. - -For detailed library documentation, please see the -[crate documentation](../../lib/rust/flexer/src/lib.rs) itself. This includes a -comprehensive tutorial on how to define a lexer using the flexer. - - - -- [The Lexing Process](#the-lexing-process) -- [Lexing Rules](#lexing-rules) - - [Groups](#groups) - - [Patterns](#patterns) - - [Transition Functions](#transition-functions) -- [Code Generation](#code-generation) - - [Automated Code Generation](#automated-code-generation) -- [Structuring the Flexer Code](#structuring-the-flexer-code) - - [Supporting Code Generation](#supporting-code-generation) - - - -## The Lexing Process - -In the flexer, the lexing process proceeds from the top to the bottom of the -user-defined rules, and selects the first expression that _matches fully_. Once -a pattern has been matched against the input, the associated code is executed -and the process starts again until the input stream has been consumed. 
- -This point about _matching fully_ is particularly important to keep in mind, as -it differs from other lexer generators that tend to prefer the _longest_ match -instead. - -## Lexing Rules - -A lexing rule for the flexer is a combination of three things: - -1. A group. -2. A pattern. -3. A transition function. - -An example of defining a rule is as follows: - -```rust -fn define() -> Self { - let mut lexer = TestLexer::new(); - let a_word = Pattern::char('a').many1(); - let root_group_id = lexer.initial_state; - let root_group = lexer.groups_mut().group_mut(root_group_id); - // Here is the rule definition. - root_group.create_rule(&a_word,"self.on_first_word(reader)"); - lexer -} -``` - -### Groups - -A group is a mechanism that the flexer provides to allow grouping of rules -together. The flexer has a concept of a "state stack", which records the -currently active state at the current time, that can be manipulated by the -user-defined [transition functions](#transition-functions). - -A state can be made active by using `flexer::push_state(state)`, and can be -deactivated by using `flexer::pop_state(state)` or -`flexer::pop_states_until(state)`. In addition, states may also have _parents_, -from which they can inherit rules. This is fantastic for removing the need to -repeat yourself when defining the lexer. - -When inheriting rules from a parent group, the rules from the parent group are -matched strictly _after_ the rules from the child group. This means that groups -are able to selectively "override" the rules of their parents. Rules are still -matched in order for each group's set of rules. - -### Patterns - -Rules are defined to match _patterns_. Patterns are regular-grammar-like -descriptions of the textual content (as characters) that should be matched. For -a description of the various patterns provided by the flexer, see -[pattern.rs](../../lib/rust/flexer/src/automata/pattern.rs). - -When a pattern is matched, the associated -[transition function](#transition-functions) is executed. - -### Transition Functions - -The transition function is a piece of arbitrary rust code that is executed when -the pattern for a given rule is matched by the flexer. This code may perform -arbitrary manipulations of the lexer state, and is where the majority of the -power of the flexer stems from. - -## Code Generation - -While it would be possible to interpret the flexer definition directly at -runtime, this would involve far too much dynamism and non-cache-local lookup -to be at all fast. - -Instead, the flexer includes -[`generate.rs`](../../lib/rust/flexer/src/generate.rs), a library for generating -highly-specialized lexer implementations based on the definition provided by the -user. The transformation that it implements operates as follows for each group -of rules. - -1. The set of rules in a group is used to generate a - [Nondeterministic Finite Automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) - (NFA). -2. The NFA is transformed into a - [Deterministic Finite Automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) - (DFA), using a variant of the standard - [powerset construction](https://en.wikipedia.org/wiki/Powerset_construction) - algorithm. This variant has been modified to ensure that the following - additional properties hold: - - Patterns are matched in the order in which they are defined. - - The associated transition functions are maintained correctly through the - transformation.
- - The lexing process is `O(n)`, where `n` is the size of the input. -3. The DFA is then used to generate the rust code that implements that lexer. - -The generated lexer contains a main loop that consumes the input stream -character-by-character, evaluating what is effectively a big `match` expression -that processes the input to evaluate the user-provided transition functions as -appropriate. - -### Automated Code Generation - -In order to avoid the lexer definition getting out of sync with its -implementation (the generated engine), it is necessary to create a separate -crate for the generated engine that has the lexer definition as one of its -dependencies. - -This separation enables a call to `flexer::State::specialize()` in the crate's -`build.rs` (or a macro) during compilation. The output can be stored in a new -file i.e. `engine.rs` and exported from the library as needed. The project -structure would therefore appear as follows. - -``` -- lib/rust/lexer/ - - definition/ - - src/ - - lib.rs - - cargo.toml - - - generation/ - - src/ - - engine.rs <-- the generated file - - lib.rs <-- `pub mod engine` - - build.rs <-- calls `flexer::State::specialize()` and saves its output to - `src/engine.rs` - - cargo.toml <-- lexer-definition is in dependencies and build-dependencies -``` - -With this design, `flexer.generate_specialized_code()` is going to be executed -on each rebuild of `lexer/generation`. Therefore, `generation` should contain -only the minimum amount of logic, and should endeavor to minimize any -unnecessary dependencies to avoid recompiling too often. - -## Structuring the Flexer Code - -In order to unify the API between the definition and generated usages of the -flexer, the API is separated into the following components: - -- `Flexer`: The main flexer definition itself, providing functionality common to - the definition and implementation of all lexers. -- `flexer::State`: The stateful components of a lexer definition. This trait is - implemented for a particular lexer definition, allowing the user to store - arbitrary data in their lexer, as needed. -- **User-Defined Lexer:** The user can then define a lexer that _wraps_ the - flexer, specialised to the particular `flexer::State` that the user has - defined. It is recommended to implement `Deref` and `DerefMut` between the - defined lexer and the `Flexer`, to allow for ease of use. - -### Supporting Code Generation - -This architecture separates out the generated code (which can be defined purely -on the user-defined lexer), from the code that is defined as part of the lexer -definition. This means that the same underlying structures can be used to both -_define_ the lexer, and be used by the generated code from that definition. - -For an example of how these components are used in the generated lexer, please -see [`generated_api_test`](../../lib/rust/flexer/tests/generated_api_test.rs). diff --git a/docs/parser/jvm-object-generation.md b/docs/parser/jvm-object-generation.md index df32d53db3..16154b18aa 100644 --- a/docs/parser/jvm-object-generation.md +++ b/docs/parser/jvm-object-generation.md @@ -3,7 +3,7 @@ layout: developer-doc title: JVM Object Generation category: parser tags: [parser, jvm, object-generation] -order: 10 +order: 9 --- # JVM Object Generation @@ -14,8 +14,6 @@ the compiler and runtime to work with the AST. 
-- [Overall Architecture](#overall-architecture) - # Overall Architecture diff --git a/docs/parser/lexer.md b/docs/parser/lexer.md index c785e51643..076e01c679 100644 --- a/docs/parser/lexer.md +++ b/docs/parser/lexer.md @@ -3,7 +3,7 @@ layout: developer-doc title: Lexer category: syntax tags: [parser, lexer] -order: 4 +order: 3 --- # Lexer @@ -19,6 +19,9 @@ identify blocks - [Libraries in the Lexer Definition](#libraries-in-the-lexer-definition) - [Lexer Functionality](#lexer-functionality) - [The Lexer AST](#the-lexer-ast) +- [Benchmarking the Lexer](#benchmarking-the-lexer) + - [Running a Subset of the Benchmarks](#running-a-subset-of-the-benchmarks) + - [Changing the Lexer](#changing-the-lexer) @@ -43,12 +46,12 @@ paths directly from the crate root. ## Lexer Functionality -The lexer needs to provide the following functionality as part of the parser. +The lexer provides the following functionality as part of the parser. - It consumes the source lazily, character by character, and produces a structured token stream consisting of the lexer [ast](#the-lexer-ast). -- It must succeed on _any_ input, even if there are invalid constructs in the - token stream, represented by `Invalid` tokens. +- It succeeds on _any_ input, even if there are invalid constructs in the token + stream, represented by `Invalid` tokens. ## The Lexer AST @@ -69,15 +72,29 @@ It contains the following constructs: - `Blank`: The blank name `_`. - `Operator`: Operator identifiers (e.g. `-->>`). - `Modifier`: Modifier operators (e.g. `+=`). +- `Annotation`: An annotation (e.g. `@Tail_Call`). - `Number`: Numbers (`16_FFFF`). - `DanglingBase`: An explicit base without an associated number (e.g. `16_`). -- `Text`: Text (e.g. `"Some text goes here."`). +- `TextLine`: A single-line text literal. +- `TextInlineBlock`: An inline block text literal. +- `TextBlock`: A text block literal. +- `InvalidQuote`: An invalid set of quotes for a text literal. +- `TextSegmentRaw`: A raw text segment in which the contents should be + interpreted literally. +- `TextSegmentEscape`: A text segment containing an escape sequence. +- `TextSegmentInterpolate`: A text segment containing an arbitrary interpolated + expression. +- `TextSegmentUnclosedInterpolate`: An unclosed interpolation text segment. - `Line`: A line in a block that contains tokens. - `BlankLine`: A line in a block that contains only whitespace. - `Block`: Syntactic blocks in the language. - `InvalidSuffix`: Invalid tokens when in a given state that would otherwise be valid. - `Unrecognized`: Tokens that the lexer doesn't recognise. +- `DisableComment`: A standard comment that disables interpretation of the + commented code (i.e. `#`). +- `DocComment`: A documentation comment (e.g. `##`). Documentation syntax is + _not_ lexed by this lexer. The distinction is made here between the various kinds of identifiers in order to keep lexing fast, but also in order to allow macros to switch on the kinds of @@ -87,3 +104,61 @@ identifiers. > > - Determine if we want to have separate ASTs for the lexer and the parser, or > not. + +## Benchmarking the Lexer + +As the lexer is the first port of call when getting an Enso program to run, it +needs to be quick. To that end, we insist on comprehensive benchmarks for any +change made to the lexer. The lexer benchmarks are written using +[criterion.rs](https://github.com/bheisler/criterion.rs), and include both +examples of whole program definitions and more specific benchmark examples. + +**Baseline Commit:** TBC (use head of this branch for now).
+ +The benchmarking process for the lexer is as follows: + +1. Check out the current _baseline commit_, listed above. +2. In `lexer_bench_sources.rs`, change the line that reads `.retain_baseline` to + instead read `.save_baseline`. This will save the current baseline (taken on + your machine). +3. Run the benchmarks using `cargo bench`. Please note that running these + benchmarks takes approximately two hours, so sufficient time should be + allotted. +4. Once the baseline run has completed, change the above-mentioned line back to + `.retain_baseline`. This stops the saved baseline from being overwritten, so + subsequent runs will report regressions against it. +5. Make your changes. +6. Run the benchmark suite again. It will report any performance regressions in + the benchmark report, measured against your saved baseline. + +Unfortunately, the use of time-based benchmarks means that we can't commit the +baseline to the repository. There is far too much variance between machines for +this to be useful. + +### Running a Subset of the Benchmarks + +The benchmarks are very comprehensive, running a wide range of program text +through the lexer while replicating it out to various sizes (see +`lexer_bench_sources.rs` for the full list). However, in order to decrease +iteration time, it can be useful to run a subset of these. + +There are two main tuning points for this: + +1. The _sizes_ of inputs being executed on. +2. The benchmarks being executed. + +The sizes can be tuned by editing the `SIZES` array in the +`lexer_bench_sources.rs` file. The benchmarks themselves are best tuned by +changing the macro definitions in `lexer_time_bench.rs` to exclude certain +benchmarks or groups of benchmarks. + +While it is _possible_ to tune the benchmarking config (`bench_config` in +`lexer_bench_sources.rs`) to decrease benchmarking time, this is not +recommended. The current settings are tuned to provide reliable results. + +### Changing the Lexer + +When changing the lexer, the _full_ benchmark suite must be run against the +current baseline before the changes can be merged. This suite run must use the +provided settings for the benchmarking library, and should be performed using +the process described above.
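To make the baseline workflow above concrete, here is a minimal sketch of how a criterion.rs benchmark can be wired up with the `.retain_baseline`/`.save_baseline` toggle described in steps 2 and 4. It assumes criterion 0.3-style APIs; the names `bench_config`, `lex_input`, and the baseline string are hypothetical stand-ins for illustration, not the actual contents of `lexer_bench_sources.rs`.

```rust
use std::time::Duration;

use criterion::{black_box, criterion_group, criterion_main, Criterion};

/// Hypothetical stand-in for the real lexer entry point.
fn lex_input(input: &str) -> usize {
    input.len()
}

/// Shared benchmark configuration. Swap `retain_baseline` for `save_baseline`
/// when recording a new baseline (step 2), then change it back (step 4).
fn bench_config() -> Criterion {
    Criterion::default()
        .measurement_time(Duration::from_secs(10))
        .retain_baseline("lexer-baseline".into())
}

fn bench_literals(c: &mut Criterion) {
    // Replicate a small snippet out to a larger input, in the spirit of the
    // real suite's `SIZES` array.
    let input = "x = y + z\n".repeat(1024);
    c.bench_function("simple-assignments-1024", |b| {
        b.iter(|| lex_input(black_box(&input)))
    });
}

criterion_group! {
    name    = benches;
    config  = bench_config();
    targets = bench_literals
}
criterion_main!(benches);
```

With a configuration like this in place, `cargo bench` (step 3) compares each run against the retained baseline and flags regressions in its report.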
diff --git a/docs/parser/macro-resolution.md b/docs/parser/macro-resolution.md index 456362c797..319c60f1e1 100644 --- a/docs/parser/macro-resolution.md +++ b/docs/parser/macro-resolution.md @@ -3,7 +3,7 @@ layout: developer-doc title: Macro Resolution category: parser tags: [parser, macro, resolution] -order: 5 +order: 4 --- # Macro Resolution diff --git a/docs/parser/operator-resolution.md b/docs/parser/operator-resolution.md index e6d713cf1c..68452cabdf 100644 --- a/docs/parser/operator-resolution.md +++ b/docs/parser/operator-resolution.md @@ -3,7 +3,7 @@ layout: developer-doc title: Operator Resolution category: parser tags: [parser, operator, resolution] -order: 6 +order: 5 --- # Operator Resolution diff --git a/docs/parser/parser-driver.md b/docs/parser/parser-driver.md index 653a4a0331..3445cc8096 100644 --- a/docs/parser/parser-driver.md +++ b/docs/parser/parser-driver.md @@ -3,7 +3,7 @@ layout: developer-doc title: Parser Driver category: parser tags: [parser, driver] -order: 8 +order: 7 --- # Parser Driver diff --git a/docs/parser/reader.md b/docs/parser/reader.md index 6df31cc96c..1e3bf98f49 100644 --- a/docs/parser/reader.md +++ b/docs/parser/reader.md @@ -3,7 +3,7 @@ layout: developer-doc title: Reading Source Code category: parser tags: [parser, reader] -order: 11 +order: 10 --- # Reading Source Code @@ -15,9 +15,14 @@ project is going to use, as well as backing formats for the stream. - [Reader Functionality](#reader-functionality) -- [Provided Readers](#provided-readers) - - [UTF-8 Reader](#utf-8-reader) - - [UTF-16 Reader](#utf-16-reader) +- [Reader Structure](#reader-structure) + - [Read](#read) + - [Decoder](#decoder) +- [Provided Encodings](#provided-encodings) + - [UTF-8](#utf-8) + - [UTF-16](#utf-16) + - [UTF-32](#utf-32) + - [Benchmarks](#benchmarks) diff --git a/docs/syntax/comments.md b/docs/syntax/comments.md index 9ecbac24b9..51941f2597 100644 --- a/docs/syntax/comments.md +++ b/docs/syntax/comments.md @@ -23,6 +23,7 @@ Enso supports a variety of types of comments: - [Disable Comments](#disable-comments) +- [Freeze Comments](#freeze-comments) - [Documentation Comments](#documentation-comments) - [Tags](#tags) - [Sections](#sections) @@ -39,13 +40,35 @@ Disable comments are the standard form of comment seen in a programming language in that they prevent a given piece of code from executing. In Enso, they are created by prefixing the expression to disable with the `#` character. -These aren't _exactly_ like most language's disable comments however: +Disable comments in Enso do not have their contents validated, and continue from +the `#` character to the end of the line. -- When you disable a line in Enso, it is still run through the parser to see if - it is syntactically valid. -- No identifiers in it are resolved, however. -- This is important as it allows the disabled expression to still be displayed - in the visual syntax. +```ruby +x = y + z # here is some commented text +``` + +Disable comments are _not_ allowed inside textual interpolations. + +## Freeze Comments + +Freeze comments are a special type of comment used to enable the 'freezing' or +caching of expensive computations in Enso. When used, they cache the result of +an expression, reusing the value instead of recomputing it even if the +underlying data changes. + +A portion of code that is frozen has the following properties: + +- It is still lexed as if it were code, and validated by the parser to check for + validity. +- No identifier resolution takes place. 
+ +These are very important as they still allow the frozen expression to be +displayed properly in the visual syntax. + +> The actionables for this section are: +> +> - Work out what they should look like visually. +> - Work out how best to implement this. ## Documentation Comments @@ -66,6 +89,8 @@ for more information). By way of example: until I unindent again. ``` +Documentation comments are _not_ allowed inside textual interpolations. + The tool that generates this documentation aims to be fairly robust, and tries to produce sensible results even if the user makes a mistake. Such mistakes will be highlighted to the user. diff --git a/docs/syntax/literals.md b/docs/syntax/literals.md index 056a45fa3d..5c28f344b3 100644 --- a/docs/syntax/literals.md +++ b/docs/syntax/literals.md @@ -17,6 +17,8 @@ types in literal form in the source code. - [Text Literals](#text-literals) - [Inline Text Literals](#inline-text-literals) - [Text Block Literals](#text-block-literals) + - [Inline Block Literals](#inline-block-literals) + - [Escape Sequences](#escape-sequences) - [Vector Literals](#vector-literals) @@ -65,7 +67,7 @@ Enso provides rich support for textual literals in the language, supporting both raw and interpolated strings natively. - **Raw Strings:** Raw strings are delimited using the standard double-quote - character (`"`). Raw strings have support for escape sequences. + character (`"`). Raw strings don't support escape sequences except for `\"`. ```ruby raw_string = "Hello, world!" @@ -75,7 +77,8 @@ raw and interpolated strings natively. executable Enso expressions into the string. Such strings are delimited using the single-quote (`'`) character, and splices are delimited using the backtick (`` ` ``) character. Splices are run, and then the result is converted to a - string using `show`. These strings also have support for escape sequences. + string using `show`. These strings also have support for all kinds of + [escape sequences](#escape-sequences). ```ruby fmt_string = 'Hello, my age is `time.now.year - person.birthday.year`' @@ -104,7 +107,7 @@ following layout rules: - Any indentation further than this baseline will be retained as part of the text literal. - The literal is _closed_ by the first line with a _lower_ level of indentation - than the first child lineand will not contain the final blank line. + than the first child line and will not contain the final blank line. ``` block_raw = ''' @@ -116,6 +119,48 @@ block_raw = ''' not_string_expr = foo bar ``` +### Inline Block Literals + +In order to easily transition between using text blocks and single-line +literals, we allow for defining an inline block literal. This is a literal that +uses the same start delimiter as a block literal (see above), but rather than +ending the literal through de-indenting from the block's level of indentation, +the literal is ended upon the line ending. + +``` +inline_block = + """this is all part of the literal + but_this_is_not +``` + +### Escape Sequences + +Format literals in Enso support many kinds of escape sequence. These are +described below. + +| Name | Escape Sequence | Unicode | Notes | +| :----------- | :-------------: | :--------: | :---------------------------------------------------------------------------------------- | +| Byte Escape | `\x##` | `U+00##` | 8-bit character specification. | +| U16 Escape | `\u####` | `U+####` | 16-bit unicode character, where each `#` is a hex digit. |
+| U21 Escape | `\u{######}` | `U+######` | 21-bit unicode character, where `######` is 1-6 hex digits. | +| U32 Escape | `\U########` | `U+######` | 32-bit unicode character, where each `#` is a hex digit and the first two digits are `00`. | +| Null | `\0` | `U+0000` | The null character. | +| Alert | `\a` | `U+0007` | The bell/alert character. | +| Backspace | `\b` | `U+0008` | The backspace character. | +| Form Feed | `\f` | `U+000C` | The form-feed character. | +| LF | `\n` | `U+000A` | The line-feed character (newline on unix systems). | +| CR | `\r` | `U+000D` | The carriage return character (part of newline on windows systems). | +| Tab | `\t` | `U+0009` | The horizontal tab character. | +| Vertical Tab | `\v` | `U+000B` | The vertical tab character. | +| Backslash | `\\` | `U+005C` | A literal backslash character. | +| Double Quote | `\"` | `U+0022` | A literal double quote character. | +| Single Quote | `\'` | `U+0027` | A literal single quote character. | +| Backtick | `` \` `` | `U+0060` | A literal backtick character. | + +The only one of the above escape sequences that is supported in a raw text +literal is `\"`. All other occurrences of `\` in such literals are treated as a +literal backslash. + ## Vector Literals Enso also supports vector literals, which allow users to create literal vectors diff --git a/docs/syntax/macros.md b/docs/syntax/macros.md index acc1642b83..c8750f6ccd 100644 --- a/docs/syntax/macros.md +++ b/docs/syntax/macros.md @@ -28,6 +28,7 @@ provide their users with access to the compilation and type-checking phases - [Annotations](#annotations) + - [Annotation Naming](#annotation-naming) - [Automatic Deriving](#automatic-deriving) @@ -66,6 +67,13 @@ that we are able to reserve words such as `type` to ensure that users can always have a good sense of what the most common constructs in the language mean, rather than allowing them to be overridden outside of the stdlib. +### Annotation Naming + +The naming of annotations follows the standard rules that Enso uses for naming +its [identifiers](./naming.md#naming-constructs). This means that they can be in +either referent or variable form, as the annotation head is _not_ a +[pattern context](./naming.md#pattern-contexts). + ## Automatic Deriving In order to make the language easier to debug, we have all types automatically diff --git a/lib/rust/flexer-testing/definition/Cargo.toml b/lib/rust/flexer-testing/definition/Cargo.toml deleted file mode 100644 index d8140aac9a..0000000000 --- a/lib/rust/flexer-testing/definition/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -name = "flexer-test-definition" -version = "0.1.0" -authors = ["Enso Team "] -edition = "2018" - -publish = false - -[lib] -crate-type = ["cdylib", "rlib"] -test = true -bench = true - -[dependencies] -flexer = { path = "../../flexer", version = "0.1.0" } diff --git a/lib/rust/flexer-testing/definition/src/lib.rs b/lib/rust/flexer-testing/definition/src/lib.rs deleted file mode 100644 index cdb2acab04..0000000000 --- a/lib/rust/flexer-testing/definition/src/lib.rs +++ /dev/null @@ -1,282 +0,0 @@ -#![deny(unconditional_recursion)] -#![warn(missing_copy_implementations)] -#![warn(missing_debug_implementations)] -#![warn(missing_docs)] -#![warn(trivial_casts)] -#![warn(trivial_numeric_casts)] -#![warn(unsafe_code)] -#![warn(unused_import_braces)] - -//! This file contains the code defining a lexer for the following small language. Due to the way in -//!
which the code-generation from the flexer is used, it has to be defined in a separate crate from -//! the site at which it's used. For the actual tests of this code, please see -//! `flexer-testing/generation`. -//! -//! The language here is being defined as follows: -//! -//! a-word = 'a'+; -//! b-word = 'b'+; -//! word = a-word | b-word; -//! space = ' '; -//! spaced-word = space, word; -//! language = word, spaced-word*; -//! -//! Please note that there is a fair amount of duplicated code between this test and the -//! `lexer_generated_api_test` file. This is to present the full view of what each portion of the -//! process looks like. - -use flexer::prelude::*; - -use flexer::*; -use flexer; -use flexer::automata::pattern::Pattern; -use flexer::group::Registry; -use flexer::prelude::logger::Disabled; -use flexer::prelude::reader::BookmarkManager; - - - -// ==================== -// === Type Aliases === -// ==================== - -type Logger = Disabled; - - - -// =========== -// === AST === -// =========== - -/// A very simple AST, sufficient for the simple language being defined. -#[derive(Clone,Debug,PartialEq)] -pub enum Token { - /// A word from the input, consisting of a sequence of all `a` or all `b`. - Word(String), - /// A token that the lexer is unable to recognise. - Unrecognized(String), -} -impl Token { - /// Construct a new word token. - pub fn word(name:impl Into<String>) -> Token { - Token::Word(name.into()) - } - - /// Construct a new unrecognized token. - pub fn unrecognized(name:impl Into<String>) -> Token { - Token::Unrecognized(name.into()) - } -} - -/// A representation of a stream of tokens. -#[allow(missing_docs)] -#[derive(Clone,Debug,Default,PartialEq)] -pub struct TokenStream { - tokens:Vec<Token> -} - -impl TokenStream { - /// Append the provided token to the token stream. - pub fn push(&mut self,token:Token) { - self.tokens.push(token); - } -} - - -// === Trait Impls === - -impl From<Vec<Token>> for TokenStream { - fn from(tokens: Vec<Token>) -> Self { - TokenStream {tokens} - } -} - - - -// ================== -// === Test Lexer === -// ================== - -/// The definition of a test lexer for the above-described language. -#[derive(Debug)] -pub struct TestLexer { - lexer:Flexer<TestState,TokenStream,Logger> -} - -impl Deref for TestLexer { - type Target = Flexer<TestState,TokenStream,Logger>; - fn deref(&self) -> &Self::Target { - &self.lexer - } -} - -impl DerefMut for TestLexer { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.lexer - } -} - -impl TestLexer { - /// Creates a new instance of this lexer. - pub fn new() -> Self { - let logger = Logger::new("TestLexer"); - let lexer = Flexer::new(logger); - TestLexer{lexer} - } -} - -/// Rules for the root state.
-#[allow(dead_code,missing_docs)] -impl TestLexer { - fn on_first_word<R:LazyReader>(&mut self, _reader:&mut R) { - let str = self.current_match.clone(); - let ast = Token::Word(str); - self.output.push(ast); - let id = self.seen_first_word_state; - self.push_state(id); - } - - fn on_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) { - let ast = Token::Unrecognized(self.current_match.clone()); - self.output.push(ast); - } - - fn on_no_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) {} - - fn rules_in_root(lexer:&mut TestLexer) { - let a_word = Pattern::char('a').many1(); - let b_word = Pattern::char('b').many1(); - let any = Pattern::any(); - let end = Pattern::eof(); - - let root_group_id = lexer.initial_state; - let root_group = lexer.groups_mut().group_mut(root_group_id); - - root_group.create_rule(&a_word,"self.on_first_word(reader)"); - root_group.create_rule(&b_word,"self.on_first_word(reader)"); - root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)"); - root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)"); - } -} - -/// Rules for the "seen first word" state. -#[allow(dead_code,missing_docs)] -impl TestLexer { - fn on_spaced_word<R:LazyReader>(&mut self, _reader:&mut R) { - let str = self.current_match.clone(); - let ast = Token::Word(String::from(str.trim())); - self.output.push(ast); - } - - fn on_err_suffix<R:LazyReader>(&mut self, reader:&mut R) { - self.on_err_suffix_first_word(reader); - self.pop_state(); - } - - fn on_no_err_suffix<R:LazyReader>(&mut self, reader:&mut R) { - self.on_no_err_suffix_first_word(reader); - self.pop_state(); - } - - fn rules_in_seen_first_word(lexer:&mut TestLexer) { - let a_word = Pattern::char('a').many1(); - let b_word = Pattern::char('b').many1(); - let space = Pattern::char(' '); - let spaced_a_word = &space >> &a_word; - let spaced_b_word = &space >> &b_word; - let any = Pattern::any(); - let end = Pattern::eof(); - - let seen_first_word_group_id = lexer.seen_first_word_state; - let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id); - - seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)"); - seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)"); - seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)"); - seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)"); - } -} - - -// === Trait Impls === - -impl flexer::Definition for TestLexer { - fn define() -> Self { - let mut lexer = TestLexer::new(); - - TestLexer::rules_in_seen_first_word(&mut lexer); - TestLexer::rules_in_root(&mut lexer); - - lexer - } - - fn groups(&self) -> &Registry { - self.lexer.groups() - } - - fn set_up(&mut self) {} - - fn tear_down(&mut self) {} -} - -impl Default for TestLexer { - fn default() -> Self { - TestLexer::new() - } -} - - - -// =================== -// === Lexer State === -// =================== - -/// The stateful components of the test lexer. -#[derive(Debug)] -pub struct TestState { - /// The registry for groups in the lexer. - lexer_states:group::Registry, - /// The initial state of the lexer. - initial_state:group::Identifier, - /// The state entered when the first word has been seen. - seen_first_word_state:group::Identifier, - /// The bookmarks for this lexer.
- bookmarks:BookmarkManager -} - - -// === Trait Impls === - -impl flexer::State for TestState { - fn new(_logger:&impl AnyLogger) -> Self { - let mut lexer_states = group::Registry::default(); - let initial_state = lexer_states.define_group("ROOT",None); - let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); - let bookmarks = BookmarkManager::new(); - Self{lexer_states,initial_state,seen_first_word_state,bookmarks} - } - - fn initial_state(&self) -> group::Identifier { - self.initial_state - } - - fn groups(&self) -> &group::Registry { - &self.lexer_states - } - - fn groups_mut(&mut self) -> &mut group::Registry { - &mut self.lexer_states - } - - fn bookmarks(&self) -> &BookmarkManager { - &self.bookmarks - } - - fn bookmarks_mut(&mut self) -> &mut BookmarkManager { - &mut self.bookmarks - } - - fn specialize(&self) -> Result<String,GenError> { - generate::specialize(self,"TestLexer","TokenStream") - } -} diff --git a/lib/rust/flexer-testing/generation/Cargo.toml b/lib/rust/flexer-testing/generation/Cargo.toml deleted file mode 100644 index 2b1ca75a9b..0000000000 --- a/lib/rust/flexer-testing/generation/Cargo.toml +++ /dev/null @@ -1,20 +0,0 @@ -[package] -name = "flexer-test-generation" -version = "0.1.0" -authors = ["Enso Team "] -edition = "2018" - -publish = false - -[lib] -crate-type = ["cdylib", "rlib"] -test = true -bench = true - -[dependencies] -flexer = { path = "../../flexer" , version = "0.1.0" } -flexer-test-definition = { path = "../definition", version = "0.1.0" } - -[build-dependencies] -flexer = { path = "../../flexer" , version = "0.1.0" } -flexer-test-definition = { path = "../definition", version = "0.1.0" } diff --git a/lib/rust/flexer-testing/generation/build.rs b/lib/rust/flexer-testing/generation/build.rs deleted file mode 100644 index 154a49db7b..0000000000 --- a/lib/rust/flexer-testing/generation/build.rs +++ /dev/null @@ -1,32 +0,0 @@ -use std::fs::File; -use std::io::prelude::*; -use flexer_test_definition::TestLexer; -use flexer::Definition; -use flexer::State; - - - -/// Generates the lexer engine and saves the result into the file `src/engine.rs`. -/// -/// The content of the generated file can be used with the `include!` macro. -fn generate_engine() -> std::io::Result<()> { - let definition_path = "../definition/src/lib.rs"; - let output_directory = "src/generated"; - let _ = std::fs::create_dir(output_directory); - let output_path = "src/generated/engine.rs"; - let definition_error = format!("The lexer definition should exist at {}.",definition_path); - let output_error = format!("Cannot open output file at {}.",output_path); - let mut lexer_def = File::open(definition_path).expect(definition_error.as_str()); - let mut contents = String::new(); - let mut file = File::create(output_path).expect(output_error.as_str()); - let lexer = TestLexer::define(); - let engine = lexer.specialize().unwrap(); - lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition."); - file.write_all(contents.as_bytes()).expect("Unable to write lexer definition."); - file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization."); - Ok(()) -} - -fn main() -> std::io::Result<()> { - generate_engine() } diff --git a/lib/rust/flexer-testing/generation/src/generated.rs b/lib/rust/flexer-testing/generation/src/generated.rs deleted file mode 100644 index 99ac31885e..0000000000 --- a/lib/rust/flexer-testing/generation/src/generated.rs +++ /dev/null @@ -1,3 +0,0 @@ -//! This module serves to re-export the generated lexer.
- -pub mod engine; diff --git a/lib/rust/flexer-testing/generation/src/lib.rs b/lib/rust/flexer-testing/generation/src/lib.rs deleted file mode 100644 index 02eb98737a..0000000000 --- a/lib/rust/flexer-testing/generation/src/lib.rs +++ /dev/null @@ -1,19 +0,0 @@ -//! This library exposes the specialized version of the Enso lexer. -//! -//! Its sole purpose is to avoid the lexer definition getting out of sync with its implementation -//! (the generated engine), which requires the engine to live in a separate crate. -//! -//! This separation enables generation of the enso lexer source code with `build.rs` during -//! compilation. Its output is then stored in a new file `engine.rs` and exported by `lexer.rs`. - -#![feature(test)] -#![deny(unconditional_recursion)] -#![warn(missing_copy_implementations)] -#![warn(missing_debug_implementations)] -#![warn(missing_docs)] -#![warn(trivial_casts)] -#![warn(trivial_numeric_casts)] -#![warn(unsafe_code)] -#![warn(unused_import_braces)] - -pub mod generated; diff --git a/lib/rust/flexer-testing/generation/tests/flexer_generated_lexer.rs b/lib/rust/flexer-testing/generation/tests/flexer_generated_lexer.rs deleted file mode 100644 index d1dace86a9..0000000000 --- a/lib/rust/flexer-testing/generation/tests/flexer_generated_lexer.rs +++ /dev/null @@ -1,110 +0,0 @@ -#![feature(test)] -#![deny(unconditional_recursion)] -#![warn(missing_copy_implementations)] -#![warn(missing_debug_implementations)] -#![warn(missing_docs)] -#![warn(trivial_casts)] -#![warn(trivial_numeric_casts)] -#![warn(unsafe_code)] -#![warn(unused_import_braces)] - -//! This file contains tests for the generated lexer. - -use flexer::prelude::*; - -use flexer::prelude::reader::decoder::DecoderUTF8; -use flexer_test_generation::generated::engine::TestLexer; -use flexer_test_generation::generated::engine::Token; -use flexer_test_generation::generated::engine::TokenStream; - - - -// ============= -// === Tests === -// ============= - -/// Executes the test on the provided input string slice. -fn run_test_on(str:impl AsRef<str>) -> TokenStream { - // Hardcoded for ease of use here.
- let reader = Reader::new(str.as_ref().as_bytes(), DecoderUTF8()); - let mut lexer = TestLexer::new(); - let run_result = lexer.run(reader); - - match run_result.kind { - flexer::ResultKind::Success => run_result.tokens, - _ => default() - } -} - -#[test] -fn test_single_a_word() { - let input = "aaaaa"; - let expected_output = TokenStream::from(vec![Token::word(input)]); - let result = run_test_on(input); - assert_eq!(result, expected_output); -} - -#[test] -fn test_single_b_word() { - let input = "bbbbb"; - let expected_output = TokenStream::from(vec![Token::word(input)]); - let result = run_test_on(input); - assert_eq!(result, expected_output); -} - -#[test] -fn test_two_word() { - let input = "aaaaa bbbbb"; - let expected_output = TokenStream::from(vec![Token::word("aaaaa"), Token::word("bbbbb")]); - let result = run_test_on(input); - assert_eq!(result, expected_output); -} - -#[test] -fn test_multi_word() { - let input = "bbb aa a b bbbbb aa"; - let expected_output = TokenStream::from(vec![ - Token::word("bbb"), - Token::word("aa"), - Token::word("a"), - Token::word("b"), - Token::word("bbbbb"), - Token::word("aa") - ]); - let result = run_test_on(input); - assert_eq!(result, expected_output); -} - -#[test] -fn test_invalid_single_word() { - let input = "c"; - let expected_output = TokenStream::from(vec![Token::unrecognized(input)]); - let result = run_test_on(input); - assert_eq!(result, expected_output); -} - -#[test] -fn test_multi_word_invalid() { - let input = "aaaaaa c bbbbbb"; - let expected_output = TokenStream::from(vec![ - Token::word("aaaaaa"), - Token::unrecognized(" "), - Token::unrecognized("c"), - Token::unrecognized(" "), - Token::word("bbbbbb"), - ]); - let result = run_test_on(input); - assert_eq!(result, expected_output); -} - -#[test] -fn test_end_invalid() { - let input = "bbbbbb c"; - let expected_output = TokenStream::from(vec![ - Token::word("bbbbbb"), - Token::unrecognized(" "), - Token::unrecognized("c"), - ]); - let result = run_test_on(input); - assert_eq!(result, expected_output); -} diff --git a/lib/rust/flexer/Cargo.toml b/lib/rust/flexer/Cargo.toml deleted file mode 100644 index 29ed210f21..0000000000 --- a/lib/rust/flexer/Cargo.toml +++ /dev/null @@ -1,39 +0,0 @@ -[package] -name = "flexer" -version = "0.1.0" -authors = ["Enso Team "] -edition = "2018" - -description = "A finite-automata-based lexing engine." -readme = "README.md" -homepage = "https://github.com/enso-org/enso/lib/rust/flexer" -repository = "https://github.com/enso-org/enso" -license-file = "../../../LICENSE" - -keywords = ["lexer", "finite-automata"] -categories = ["parsing"] - -publish = false - -[lib] -name = "flexer" -crate-type = ["cdylib", "rlib"] -test = true -bench = true - -[dependencies] -enso-logger = { version = "0.1.1" } -enso-prelude = { version = "0.1.3" } -enso-lazy-reader = { version = "= 0.1.0" } -enso-macro-utils = { version = "0.1.1" } - -itertools = "0.8" -proc-macro2 = "1.0.19" -nonempty = "0.1.5" -quote = "1.0" -syn = { version = "1.0.12", features = ["full", "extra-traits", "visit-mut", "visit", "parsing", "printing"] } -unicode-segmentation = "1.6.0" -wasm-bindgen = "0.2" - -[dev-dependencies] -wasm-bindgen-test = "0.2" diff --git a/lib/rust/flexer/README.md b/lib/rust/flexer/README.md deleted file mode 100644 index fbc983251e..0000000000 --- a/lib/rust/flexer/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# Flexer - -This library provides a finite-automata-based lexing engine that can flexibly -tokenize an input stream. 
diff --git a/lib/rust/flexer/src/automata.rs b/lib/rust/flexer/src/automata.rs deleted file mode 100644 index 66b70e3f1c..0000000000 --- a/lib/rust/flexer/src/automata.rs +++ /dev/null @@ -1,9 +0,0 @@ -//! Provides an API for the construction of finite state automata, in both their deterministic and -//! non-deterministic forms. - -pub mod alphabet; -pub mod dfa; -pub mod nfa; -pub mod pattern; -pub mod state; -pub mod symbol; diff --git a/lib/rust/flexer/src/automata/alphabet.rs b/lib/rust/flexer/src/automata/alphabet.rs deleted file mode 100644 index decd8d0ce7..0000000000 --- a/lib/rust/flexer/src/automata/alphabet.rs +++ /dev/null @@ -1,130 +0,0 @@ -//! Exports an alphabet for an arbitrary finite state automaton. - -use crate::prelude::*; - -use crate::automata::symbol::Symbol; - -use std::collections::BTreeSet; -use std::ops::RangeInclusive; - - - -// ==================== -// === Segmentation === -// ==================== - -/// A representation of the distinct intervals over the input alphabet for a given finite state -/// automaton. -/// -/// These intervals are defined by a set of _divisions_ of the input alphabet, where each division -/// is represented as a point in that alphabet. This is necessary to allow for efficient encoding of -/// state transitions that trigger not just on _one_, but potentially on _many_ of the input -/// symbols in the automaton's alphabet. -/// -/// This is best explained by way of example. Consider the original unbounded alphabet: -/// -/// ```text -/// ... a b c d e f g h ... z ... -/// ``` -/// -/// We want to add a rule that matches on the interval `[b, d]`. This results in there being three -/// intervals on the alphabet, as there are two divisions (annotated below): -/// -/// ```text -/// ... a | b c d | e f g h ... z ... -/// div: 1 2 -/// seg: 1 2 3 -/// ``` -/// -/// If we then add a rule that matches on the interval `[d, f]`, we end up with five intervals on -/// the alphabet, with four divisions (annotated below): -/// -/// ```text -/// ... a | b c | d | e f | g h ... z ... -/// div: 1 2 3 4 -/// seg: 1 2 3 4 5 -/// ``` -/// -/// This type tracks these divisions explicitly for an input alphabet defined for all automata in -/// this library as `0u32..=u32::max_value()`. -#[derive(Clone,Debug,PartialEq,Eq)] -#[allow(missing_docs)] -pub struct Segmentation { - pub divisions:BTreeSet<Symbol> -} - -impl Segmentation { - /// Inserts a range of symbols into the alphabet. - pub fn insert(&mut self, range:RangeInclusive<Symbol>) { - self.divisions.insert(Symbol::from(range.start())); - if range.end().value != Symbol::EOF_CODE.value { - self.divisions.insert(Symbol{value:range.end().value + 1}); - } - } - - /// Creates a [`Segmentation`] from an input set of divisions. - pub fn from_divisions(divisions:&[u32]) -> Self { - let mut dict = Self::default(); - for val in divisions { - dict.divisions.insert(Symbol::from(*val)); - } - dict - } - - /// Obtains the divisions in the alphabet segmentation as a vector. - pub fn divisions_as_vec(&self) -> Vec<Division> { - self.divisions.iter().copied().enumerate().map(From::from).collect() - } -} - - -// === Trait Impls === - -impl Default for Segmentation { - fn default() -> Self { - let mut divisions: BTreeSet<Symbol> = default(); - // The existence of the default (0) member in the set is assumed by the implementation of - // the NFA -> DFA conversion. - divisions.insert(default()); - Segmentation{divisions} - } } - - - -// ================ -// === Division === -// ================ - -/// A division of the alphabet used by the lexer.
-#[derive(Copy,Clone,Debug,PartialEq,Eq)] -pub struct Division { - /// The position of the division. - pub position : usize, - /// The symbol at which it divides the alphabet. - pub symbol : Symbol, -} - -impl Division { - /// Create a new division. - pub fn new(position:usize, symbol:Symbol) -> Division { - Division{position,symbol} - } -} - - -// === Trait Impls === - -impl Into<(usize,Symbol)> for Division { - fn into(self) -> (usize, Symbol) { - (self.position,self.symbol) - } -} - -impl From<(usize,Symbol)> for Division { - fn from((position, symbol): (usize, Symbol)) -> Self { - Division::new(position,symbol) - } -} - - diff --git a/lib/rust/flexer/src/automata/dfa.rs b/lib/rust/flexer/src/automata/dfa.rs deleted file mode 100644 index ccaec8489e..0000000000 --- a/lib/rust/flexer/src/automata/dfa.rs +++ /dev/null @@ -1,178 +0,0 @@ -//! The structure for defining deterministic finite automata. - -use crate::automata::alphabet; -use crate::automata::state; -use crate::data::matrix::Matrix; - - - -// ===================================== -// === Deterministic Finite Automata === -// ===================================== - -/// The definition of a [DFA](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) for a -/// given set of symbols, states, and transitions. -/// -/// A DFA is a finite state automaton that accepts or rejects a given sequence of symbols by -/// executing on a sequence of states _uniquely_ determined by the sequence of input symbols. -/// -/// ```text -/// ┌───┐ 'D' ┌───┐ 'F' ┌───┐ 'A' ┌───┐ -/// │ 0 │ ----> │ 1 │ ----> │ 2 │ ----> │ 3 │ -/// └───┘ └───┘ └───┘ └───┘ -/// ``` -#[derive(Clone,Debug,Default,Eq,PartialEq)] -pub struct DFA { - /// A set of disjoint intervals over the allowable input alphabet. - pub alphabet_segmentation:alphabet::Segmentation, - /// The transition matrix for the DFA. - /// - /// It represents a function of type `(state, symbol) -> state`, returning the identifier for - /// the new state. - /// - /// For example, the transition matrix for an automaton that accepts the language - /// `{"A" | "B"}*` would appear as follows, with `-` denoting - /// [the invalid state](state::Identifier::INVALID). The leftmost column encodes the input - /// state, while the topmost row encodes the input symbols. - /// - /// | | A | B | - /// |:-:|:-:|:-:| - /// | 0 | 1 | - | - /// | 1 | - | 0 | - /// - pub links:Matrix<state::Identifier>, - /// A collection of callbacks for each state (indexable in order) - pub callbacks:Vec<Option<RuleExecutable>>, -} - -impl DFA { - /// Check whether the DFA has a rule for the target state. - /// - /// This method should only be used in generated code, where its invariants are already checked. - /// - /// # Panics - /// - /// If no callback exists for `target_state`. - pub fn has_rule_for(&self, target_state:state::Identifier) -> bool { - self.callbacks.get(target_state.id).unwrap().is_some() - } -} - - -// === Trait Impls === - -impl From<Vec<Vec<usize>>> for Matrix<state::Identifier> { - fn from(input:Vec<Vec<usize>>) -> Self { - let rows = input.len(); - let columns = if rows == 0 {0} else {input[0].len()}; - let mut matrix = Self::new(rows,columns); - for row in 0..rows { - for column in 0..columns { - matrix[(row,column)] = state::Identifier::from(input[row][column]); - } - } - matrix - } -} - - - -// ================ -// === Callback === -// ================ - -/// The callback associated with an arbitrary state of a finite automaton.
-/// -/// It contains the rust code that is intended to be executed after encountering a -/// [`pattern`](super::pattern::Pattern) that causes the associated state transition. This pattern -/// is declared in [`Rule.pattern`](crate::group::rule::Rule::pattern). -#[derive(Clone,Debug,PartialEq,Eq)] -pub struct RuleExecutable { - /// A description of the priority with which the callback is constructed during codegen. - pub priority:usize, - /// The rust code that will be executed when running this callback. - pub code:String, -} - -impl RuleExecutable { - /// Creates a new rule executable with the provided `priority` and `code`. - pub fn new(priority:usize, code_str:impl Into<String>) -> RuleExecutable { - let code = code_str.into(); - RuleExecutable{priority,code} - } -} - - - -// ============= -// === Tests === -// ============= - -#[cfg(test)] -pub mod tests { - use crate::automata::state; - - use super::*; - - const INVALID:usize = state::Identifier::INVALID.id; - - /// DFA automata that accepts newline '\n'. - pub fn newline() -> DFA { - DFA { - alphabet_segmentation:alphabet::Segmentation::from_divisions(&[10,11]), - links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]), - callbacks:vec![ - None, - Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}), - ], - } - } - - /// DFA automata that accepts any letter a..=z. - pub fn letter() -> DFA { - DFA { - alphabet_segmentation:alphabet::Segmentation::from_divisions(&[97,123]), - links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]), - callbacks:vec![ - None, - Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}), - ], - } - } - - /// DFA automata that accepts any number of spaces ' '. - pub fn spaces() -> DFA { - DFA { - alphabet_segmentation:alphabet::Segmentation::from_divisions(&[0,32,33]), - links:Matrix::from(vec![ - vec![INVALID,1,INVALID], - vec![INVALID,2,INVALID], - vec![INVALID,2,INVALID], - ]), - callbacks:vec![ - None, - Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}), - Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}), - ], - } - } - - /// DFA automata that accepts one letter a..=z or many spaces ' '. - pub fn letter_and_spaces() -> DFA { - DFA { - alphabet_segmentation:alphabet::Segmentation::from_divisions(&[32,33,97,123]), - links:Matrix::from(vec![ - vec![INVALID, 1,INVALID, 2,INVALID], - vec![INVALID, 3,INVALID,INVALID,INVALID], - vec![INVALID,INVALID,INVALID,INVALID,INVALID], - vec![INVALID, 3,INVALID,INVALID,INVALID], - ]), - callbacks:vec![ - None, - Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}), - Some(RuleExecutable {priority:4, code:"group_0_rule_0".into()}), - Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}), - ], - } - } } diff --git a/lib/rust/flexer/src/automata/nfa.rs b/lib/rust/flexer/src/automata/nfa.rs deleted file mode 100644 index 8c4d3db608..0000000000 --- a/lib/rust/flexer/src/automata/nfa.rs +++ /dev/null @@ -1,345 +0,0 @@ -//! The structure for defining non-deterministic finite automata.
- -use crate::automata::alphabet; -use crate::automata::dfa::DFA; -use crate::automata::dfa::RuleExecutable; -use crate::automata::pattern::Pattern; -use crate::automata::state::State; -use crate::automata::state::Transition; -use crate::automata::state; -use crate::automata::symbol::Symbol; -use crate::data::matrix::Matrix; - -use itertools::Itertools; -use std::collections::BTreeSet; -use std::collections::HashMap; -use std::ops::RangeInclusive; - -use crate::prelude::*; - - - -// ========================================= -// === Non-Deterministic Finite Automata === -// ========================================= - -/// A state identifier based on a set of states. -/// -/// This is used during the NFA -> DFA transformation, where multiple states can merge together due -/// to the collapsing of epsilon transitions. -type StateSetId = BTreeSet<state::Identifier>; - -/// The definition of a [NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) for a -/// given set of symbols, states, and transitions (specifically a NFA with ε-moves). -/// -/// A NFA is a finite state automaton that accepts or rejects a given sequence of symbols. In -/// contrast with a DFA, the NFA may transition between states _without_ reading any new symbol -/// through use of -/// [epsilon links](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton#NFA_with_%CE%B5-moves). -/// -/// ```text -/// ┌───┐ 'N' ┌───┐ ┌───┐ 'F' ┌───┐ ┌───┐ 'A' ┌───┐ -/// │ 0 │ ----> │ 1 │ -> │ 2 │ ----> │ 3 │ -> │ 4 │ ----> │ 5 │ -/// └───┘ └───┘ ε └───┘ └───┘ ε └───┘ └───┘ -/// ``` -#[derive(Clone,Debug,Default,PartialEq,Eq)] -pub struct NFA { - /// A set of disjoint intervals over the input alphabet. - pub alphabet_segmentation:alphabet::Segmentation, - /// A set of named NFA states, with (epsilon) transitions. - pub states:Vec<State>, -} - -impl NFA { - /// Adds a new state to the NFA and returns its identifier. - pub fn new_state(&mut self) -> state::Identifier { - let id = self.states.len(); - self.states.push(State::default()); - state::Identifier{id} - } - - /// Creates an epsilon transition between two states. - /// - /// Whenever the automaton happens to be in `source` state it can immediately transition to the - /// `target` state. It is, however, not _required_ to do so. - pub fn connect(&mut self, source:state::Identifier, target:state::Identifier) { - self.states[source.id].epsilon_links.push(target); - } - - /// Creates an ordinary transition for a range of symbols. - /// - /// If any symbol from such range happens to be the input when the automaton is in the `source` - /// state, it will immediately transition to the `target` state. - pub fn connect_via - ( &mut self - , source : state::Identifier - , target_state : state::Identifier - , symbols : &RangeInclusive<Symbol> - ) { - self.alphabet_segmentation.insert(symbols.clone()); - self.states[source.id].links.push(Transition{symbols:symbols.clone(),target_state}); - } - - /// Transforms a pattern to an NFA using the algorithm described - /// [here](https://www.youtube.com/watch?v=RYNN-tb9WxI). - /// The asymptotic complexity is linear in number of symbols.
- pub fn new_pattern(&mut self, source:state::Identifier, pattern:&Pattern) -> state::Identifier { - let current = self.new_state(); - self.connect(source,current); - match pattern { - Pattern::Range(range) => { - let state = self.new_state(); - self.connect_via(current,state,range); - state - }, - Pattern::Many(body) => { - let s1 = self.new_state(); - let s2 = self.new_pattern(s1,body); - let s3 = self.new_state(); - self.connect(current,s1); - self.connect(current,s3); - self.connect(s2,s3); - self.connect(s3,s1); - s3 - }, - Pattern::Seq(patterns) => { - patterns.iter().fold(current,|s,pat| self.new_pattern(s,pat)) - }, - Pattern::Or(patterns) => { - let states = patterns.iter().map(|pat| self.new_pattern(current,pat)).collect_vec(); - let end = self.new_state(); - for state in states { - self.connect(state,end); - } - end - }, - Pattern::Always => current, - } - } - - /// Merges states that are connected by epsilon links, using an algorithm based on the one shown - /// [here](https://www.youtube.com/watch?v=taClnxU-nao). - fn eps_matrix(&self) -> Vec { - fn fill_eps_matrix - ( nfa : &NFA - , states : &mut Vec - , visited : &mut Vec - , state : state::Identifier - ) { - let mut state_set = StateSetId::new(); - visited[state.id] = true; - state_set.insert(state); - for &target in &nfa.states[state.id].epsilon_links { - if !visited[target.id] { - fill_eps_matrix(nfa,states,visited,target); - } - state_set.insert(target); - state_set.extend(states[target.id].iter()); - } - states[state.id] = state_set; - } - - let mut states = vec![StateSetId::new(); self.states.len()]; - for id in 0..self.states.len() { - let mut visited = vec![false; states.len()]; - fill_eps_matrix(self,&mut states,&mut visited,state::Identifier{id}); - } - states - } - - /// Computes a transition matrix `(state, symbol) => state` for the NFA, ignoring epsilon links. - fn nfa_matrix(&self) -> Matrix { - let mut matrix = Matrix::new(self.states.len(),self.alphabet_segmentation.divisions.len()); - - for (state_ix, source) in self.states.iter().enumerate() { - let targets = source.targets(&self.alphabet_segmentation); - for (voc_ix, &target) in targets.iter().enumerate() { - matrix[(state_ix,voc_ix)] = target; - } - } - matrix - } -} - - -// === Trait Impls === - -impl From<&NFA> for DFA { - - /// Transforms an NFA into a DFA, based on the algorithm described - /// [here](https://www.youtube.com/watch?v=taClnxU-nao). - /// The asymptotic complexity is quadratic in number of states. 
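The ε-closure computation above (`eps_matrix`) is the heart of the NFA → DFA conversion that follows. Below is a self-contained sketch of the same algorithm over plain adjacency lists, stripped of the crate's types; note that `visited` is reset once per start state, which is where the quadratic bound mentioned in the doc comment comes from.

```rust
// Standalone sketch of the ε-closure step (`eps_matrix` above).
use std::collections::BTreeSet;

fn eps_closure(eps_links:&[Vec<usize>]) -> Vec<BTreeSet<usize>> {
    fn fill(eps:&[Vec<usize>], out:&mut Vec<BTreeSet<usize>>, visited:&mut Vec<bool>, s:usize) {
        visited[s] = true;
        let mut set = BTreeSet::new();
        set.insert(s);
        for &t in &eps[s] {
            if !visited[t] { fill(eps,out,visited,t); }
            set.insert(t);
            set.extend(out[t].iter().copied());
        }
        out[s] = set;
    }
    let mut out = vec![BTreeSet::new(); eps_links.len()];
    for s in 0..eps_links.len() {
        let mut visited = vec![false; eps_links.len()];
        fill(eps_links,&mut out,&mut visited,s);
    }
    out
}

fn main() {
    // 0 —ε→ 1 —ε→ 2, so the closure of state 0 is {0,1,2}.
    let closure = eps_closure(&[vec![1],vec![2],vec![]]);
    assert_eq!(closure[0], vec![0,1,2].into_iter().collect::<BTreeSet<usize>>());
    println!("{:?}",closure);
}
```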
- fn from(nfa:&NFA) -> Self { - let nfa_mat = nfa.nfa_matrix(); - let eps_mat = nfa.eps_matrix(); - let mut dfa_mat = Matrix::new(0,nfa.alphabet_segmentation.divisions.len()); - let mut dfa_eps_ixs = Vec::::new(); - let mut dfa_eps_map = HashMap::::new(); - - dfa_eps_ixs.push(eps_mat[0].clone()); - dfa_eps_map.insert(eps_mat[0].clone(),state::Identifier::from(0)); - - let mut i = 0; - while i < dfa_eps_ixs.len() { - dfa_mat.new_row(); - for voc_ix in 0..nfa.alphabet_segmentation.divisions.len() { - let mut eps_set = StateSetId::new(); - for &eps_ix in &dfa_eps_ixs[i] { - let tgt = nfa_mat[(eps_ix.id,voc_ix)]; - if tgt != state::Identifier::INVALID { - eps_set.extend(eps_mat[tgt.id].iter()); - } - } - if !eps_set.is_empty() { - dfa_mat[(i,voc_ix)] = match dfa_eps_map.get(&eps_set) { - Some(&id) => id, - None => { - let id = state::Identifier::new(dfa_eps_ixs.len()); - dfa_eps_ixs.push(eps_set.clone()); - dfa_eps_map.insert(eps_set,id); - id - }, - }; - } - } - i += 1; - } - - let mut callbacks = vec![None; dfa_eps_ixs.len()]; - let priority = dfa_eps_ixs.len(); - for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() { - let has_name = |&key:&state::Identifier| nfa.states[key.id].name.is_some(); - if let Some(eps) = epss.into_iter().find(has_name) { - let code = nfa.states[eps.id].name.as_ref().cloned().unwrap(); - callbacks[dfa_ix] = Some(RuleExecutable {code,priority}); - } - } - - let alphabet_segmentation = nfa.alphabet_segmentation.clone(); - let links = dfa_mat; - - DFA{alphabet_segmentation,links,callbacks} - } -} - - - -// =========== -// == Tests == -// =========== - -#[cfg(test)] -pub mod tests { - extern crate test; - - use crate::automata::dfa; - - use super::*; - use test::Bencher; - - /// NFA that accepts a newline '\n'. - pub fn newline() -> NFA { - NFA { - states:vec![ - State::from(vec![1]), - State::from(vec![(10..=10,2)]), - State::from(vec![3]).named("group_0_rule_0"), - State::default(), - ], - alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![10, 11].as_slice()), - } - } - - /// NFA that accepts any letter in the range a..=z. - pub fn letter() -> NFA { - NFA { - states:vec![ - State::from(vec![1]), - State::from(vec![(97..=122,2)]), - State::from(vec![3]).named("group_0_rule_0"), - State::default(), - ], - alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![97, 123].as_slice()), - } - } - - /// NFA that accepts any number of spaces ' '. - pub fn spaces() -> NFA { - NFA { - states:vec![ - State::from(vec![1]), - State::from(vec![2]), - State::from(vec![(32..=32,3)]), - State::from(vec![4]), - State::from(vec![5,8]), - State::from(vec![6]), - State::from(vec![(32..=32,7)]), - State::from(vec![8]), - State::from(vec![5,9]).named("group_0_rule_0"), - State::default(), - ], - alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![0, 32, 33].as_slice()), - } - } - - /// NFA that accepts one letter a..=z or many spaces ' '. 
- pub fn letter_and_spaces() -> NFA { - NFA { - states:vec![ - State::from(vec![1,3]), - State::from(vec![(97..=122,2)]), - State::from(vec![11]).named("group_0_rule_0"), - State::from(vec![4]), - State::from(vec![(32..=32,5)]), - State::from(vec![6]), - State::from(vec![7,10]), - State::from(vec![8]), - State::from(vec![(32..=32,9)]), - State::from(vec![10]), - State::from(vec![7,11]).named("group_0_rule_1"), - State::default(), - ], - alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![32, 33, 97, 123].as_slice()), - } - } - - #[test] - fn test_to_dfa_newline() { - assert_eq!(DFA::from(&newline()),dfa::tests::newline()); - } - - #[test] - fn test_to_dfa_letter() { - assert_eq!(DFA::from(&letter()),dfa::tests::letter()); - } - - #[test] - fn test_to_dfa_spaces() { - assert_eq!(DFA::from(&spaces()),dfa::tests::spaces()); - } - - #[test] - fn test_to_dfa_letter_and_spaces() { - assert_eq!(DFA::from(&letter_and_spaces()),dfa::tests::letter_and_spaces()); - } - - #[bench] - fn bench_to_dfa_newline(bencher:&mut Bencher) { - bencher.iter(|| DFA::from(&newline())) - } - - #[bench] - fn bench_to_dfa_letter(bencher:&mut Bencher) { - bencher.iter(|| DFA::from(&letter())) - } - - #[bench] - fn bench_to_dfa_spaces(bencher:&mut Bencher) { - bencher.iter(|| DFA::from(&spaces())) - } - - #[bench] - fn bench_to_dfa_letter_and_spaces(bencher:&mut Bencher) { - bencher.iter(|| DFA::from(&letter_and_spaces())) - } -} diff --git a/lib/rust/flexer/src/automata/pattern.rs b/lib/rust/flexer/src/automata/pattern.rs deleted file mode 100644 index 2ea3aafb6c..0000000000 --- a/lib/rust/flexer/src/automata/pattern.rs +++ /dev/null @@ -1,194 +0,0 @@ -//! Simple API for constructing regex patterns that are used in parser implementation. - -#[macro_use] -mod macros; - -use crate::automata::symbol::Symbol; - -use core::iter; -use itertools::Itertools; -use std::ops::BitOr; -use std::ops::RangeInclusive; -use std::ops::Shr; - -use Pattern::*; - - - -// ============= -// == Pattern == -// ============= - -/// A representation of a simple regular pattern. -#[derive(Clone,Debug)] -pub enum Pattern { - /// The pattern that triggers on any symbol from the given range. - Range(RangeInclusive), - /// The pattern that triggers on any given pattern from a sequence. - Or(Vec), - /// The pattern that triggers when a sequence of patterns is encountered. - Seq(Vec), - /// The pattern that triggers on 0..N repetitions of given pattern. - Many(Box), - /// The pattern that always triggers. - Always, -} - -impl Pattern { - - /// A pattern that never triggers. - pub fn never() -> Self { - Pattern::symbol(Symbol::INVALID_SYMBOL) - } - - /// A pattern that always triggers - pub fn always() -> Self { - Pattern::Always - } - - /// A pattern that triggers on any character. - pub fn any() -> Self { - Pattern::symbols(Symbol::from(0)..=Symbol::from(u32::max_value())) - } - - /// A pattern that triggers on 0..N repetitions of the pattern described by `self`. - pub fn many(&self) -> Self { - Many(Box::new(self.clone())) - } - - /// A pattern that triggers on 1..N repetitions of the pattern described by `self`. - pub fn many1(&self) -> Self { - self.clone() >> self.many() - } - - /// A pattern that triggers on 0..=1 repetitions of the pattern described by `self`. - pub fn opt(&self) -> Self { - self.clone() | Self::always() - } - - /// A pattern that triggers on the given character. - pub fn char(character:char) -> Self { - Self::symbol(Symbol::from(character)) - } - - /// A pattern that triggers on the given symbol. 
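Taken together, these constructors compose via the `Shr` (`>>`, sequence) and `BitOr` (`|`, alternative) implementations defined further below. A hedged usage sketch follows; the identifier rule is invented for illustration, while the word patterns restate the tutorial grammar from the crate docs.

```rust
use crate::automata::pattern::Pattern;

/// Hypothetical example: patterns composed with the operators defined below.
fn example_patterns() -> Pattern {
    let letter = Pattern::range('a'..='z');
    let digit  = Pattern::range('0'..='9');
    // One letter, then any run of letters or digits (an invented identifier rule).
    let ident  = &letter >> (&letter | &digit).many();

    // The tutorial grammar's words: word = 'a'+ | 'b'+.
    let a_word = Pattern::char('a').many1();
    let b_word = Pattern::char('b').many1();
    let word   = a_word | b_word;

    ident | word
}
```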
- pub fn symbol(symbol:Symbol) -> Self { - Pattern::symbols(symbol..=symbol) - } - - /// A pattern that triggers on any of the provided `symbols`. - pub fn symbols(symbols:RangeInclusive) -> Self { - Pattern::Range(symbols) - } - - /// A pattern that triggers at the end of the file. - pub fn eof() -> Self { - Self::symbol(Symbol::EOF_CODE) - } - - /// A pattern that triggers on any character in the provided `range`. - pub fn range(range:RangeInclusive) -> Self { - Pattern::symbols(Symbol::from(*range.start())..=Symbol::from(*range.end())) - } - - /// Pattern that triggers when sequence of characters given by `chars` is encountered. - pub fn all_of(chars:&str) -> Self { - let mut chars_iter = chars.chars(); - if let Some(first) = chars_iter.next() { - chars_iter.fold(Self::char(first),|pat, char| pat >> Self::char(char)) - } else { - Pattern::never() - } - } - - /// The pattern that triggers on any characters contained in `chars`. - pub fn any_of(chars:&str) -> Self { - chars.chars().fold(Self::never(),|pat,char| pat | Self::char(char)) - } - - /// The pattern that doesn't trigger on any character contained in `chars`. - pub fn none_of(chars:&str) -> Self { - let max = u32::max_value(); - let char_iter = chars.chars().map(|char| char as u32); - let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max)); - let mut codes = char_iter2.collect_vec(); - codes.sort(); - codes.iter().tuple_windows().fold(Self::never(),|pat,(prev_code,next_code)| { - let start = prev_code + 1; - let end = next_code - 1; - if end < start {pat} else { - pat | Pattern::symbols(Symbol::from(start)..=Symbol::from(end)) - } - }) - } - - /// The pattern that triggers on any character but `char`. - pub fn not(char:char) -> Self { - Self::none_of(&char.to_string()) - } - - /// The pattern that triggers on `num` repetitions of `pat`. - pub fn repeat(pat:Pattern, num:usize) -> Self { - (0..num).fold(Self::always(),|p,_| p >> pat.clone()) - } - - /// Pattern that triggers on `min`..`max` repetitions of `pat`. - pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self { - (min..max).fold(Self::never(),|p,n| p | Self::repeat(pat.clone(),n)) - } -} - - -// === Trait Impls ==== - -impl BitOr for Pattern { - type Output = Pattern; - fn bitor(self, rhs:Pattern) -> Self::Output { - match (self, rhs) { - (Or(mut lhs), Or( rhs)) => {lhs.extend(rhs) ; Or(lhs)}, - (Or(mut lhs), rhs ) => {lhs.push(rhs) ; Or(lhs)}, - (lhs , Or(mut rhs)) => {rhs.insert(0,lhs) ; Or(rhs)}, - (lhs , rhs ) => Or(vec![lhs,rhs]), - } - } -} -gen_ref_versions!(Pattern,BitOr,bitor); - -impl Shr for Pattern { - type Output = Pattern; - fn shr(self, rhs:Pattern) -> Self::Output { - match (self, rhs) { - (Seq(mut lhs), Seq(rhs) ) => {lhs.extend(rhs) ; Seq(lhs)}, - (Seq(mut lhs), rhs ) => {lhs.push(rhs) ; Seq(lhs)}, - (lhs , Seq(mut rhs)) => {rhs.insert(0,lhs) ; Seq(rhs)}, - (lhs , rhs ) => Seq(vec![lhs, rhs]), - } - } -} -gen_ref_versions!(Pattern,Shr,shr); - - - -// ================= -// === Utilities === -// ================= - -/// Quote a character as a character pattern. -/// -/// It is equivalent to `Pattern::char(...)`. -#[macro_export] -macro_rules! c { - ($char:literal) => { - Pattern::char($char) - } -} - -/// Quote a string as a literal pattern. -/// -/// It is equivalent to `Pattern::all_of(...)`. -#[macro_export] -macro_rules! 
l { - ($lit:literal) => { - Pattern::all_of($lit) - } -} diff --git a/lib/rust/flexer/src/automata/pattern/macros.rs b/lib/rust/flexer/src/automata/pattern/macros.rs deleted file mode 100644 index 5e43b948d6..0000000000 --- a/lib/rust/flexer/src/automata/pattern/macros.rs +++ /dev/null @@ -1,28 +0,0 @@ -//! Useful macros for defining operators over patterns. - -/// Generates versions of an operator taking various combinations of by-reference and by-value. -#[macro_export] -macro_rules! gen_ref_versions { - ($ty_name:ty,$opr_name:ident,$fn_name:ident) => ( - impl $opr_name<&$ty_name> for &$ty_name { - type Output = $ty_name; - fn $fn_name(self, rhs:&$ty_name) -> Self::Output { - self.clone().$fn_name(rhs.clone()) - } - } - - impl $opr_name<&$ty_name> for $ty_name { - type Output = $ty_name; - fn $fn_name(self, rhs:&$ty_name) -> Self::Output { - self.$fn_name(rhs.clone()) - } - } - - impl $opr_name<$ty_name> for &$ty_name { - type Output = $ty_name; - fn $fn_name(self, rhs:$ty_name) -> Self::Output { - self.clone().$fn_name(rhs) - } - } - ) -} diff --git a/lib/rust/flexer/src/automata/state.rs b/lib/rust/flexer/src/automata/state.rs deleted file mode 100644 index 16c7588998..0000000000 --- a/lib/rust/flexer/src/automata/state.rs +++ /dev/null @@ -1,136 +0,0 @@ -//! This module exports State implementation for Nondeterministic Finite Automata. - -use crate::automata::alphabet; -use crate::automata::symbol::Symbol; - -use crate::prelude::*; - - - -// =========== -// == State == -// =========== - -/// A named state for a [`super::nfa::NFA`]. -#[derive(Clone,Debug,Default,PartialEq,Eq)] -pub struct State { - /// A set of transitions that can trigger without consuming a symbol (ε-transitions). - pub epsilon_links:Vec, - /// The set of transitions that trigger while consuming a specific symbol. - /// - /// When triggered, the automaton will transition to the [`Transition::target_state`]. - pub links:Vec, - /// The name of the state. - /// - /// This is used to auto-generate a call to the rust method of the same name. - pub name:Option, - /// The function to call when evaluating the state. - pub callback:String -} - -impl State { - /// Updater for field `name`. Returns updated state. - pub fn named(mut self, name:&str) -> Self { - self.name = Some(name.to_owned()); - self - } - - /// Returns transition (next state) for each symbol in alphabet. - pub fn targets(&self, alphabet:&alphabet::Segmentation) -> Vec { - let mut targets = vec![]; - let mut index = 0; - let mut links = self.links.clone(); - links.sort_by_key(|link| *link.symbols.start()); - for &symbol in &alphabet.divisions { - while links.len() > index && *links[index].symbols.end() < symbol { - index += 1; - } - if links.len() <= index || *links[index].symbols.start() > symbol { - targets.push(Identifier::INVALID); - } else { - targets.push(links[index].target_state); - } - } - targets - } -} - - -// === Trait Impls ==== - -impl From> for State { - /// Creates a state with epsilon links. - fn from(vec:Vec) -> Self { - let epsilon_links = vec.iter().cloned().map(|id| Identifier{id}).collect(); - State{epsilon_links,..Default::default()} - } -} - -impl From, usize)>> for State { - /// Creates a state with ordinary links. 
- fn from(vec:Vec<(RangeInclusive, usize)>) -> Self { - let link = |(range, id): (RangeInclusive, usize)| { - let start = Symbol{value:*range.start()}; - let end = Symbol{value:*range.end()}; - Transition{symbols:start..=end,target_state:Identifier{id}} - }; - let links = vec.iter().cloned().map(link).collect(); - State{links,..Default::default()} - } -} - - - -// ================ -// == Identifier == -// ================ - -/// A state identifier for an arbitrary finite automaton. -#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)] -#[allow(missing_docs)] -pub struct Identifier { - pub id: usize -} - -impl Identifier { - /// An identifier representing the invalid state. - /// - /// When in an invalid state, a finite automaton will reject the sequence of input symbols. - pub const INVALID:Identifier = Identifier{id:usize::max_value()}; - - /// Constructs a new state identifier. - pub fn new(id:usize) -> Identifier { - Identifier{id} - } -} - -// === Trait Impls === - -impl Default for Identifier { - /// Returns state::INVALID. This is because every finite automata has an invalid state - /// and because all transitions in automata transition matrix lead to invalid state by default. - fn default() -> Self { - Identifier::INVALID - } -} - -impl From for Identifier { - fn from(id: usize) -> Self { - Identifier{id} - } -} - - - -// ============ -// === Link === -// ============ - -/// A transition between states in a finite automaton that must consume a symbol to trigger. -#[derive(Clone,Debug,PartialEq,Eq)] -pub struct Transition { - /// The range of symbols on which this transition will trigger. - pub symbols:RangeInclusive, - /// The state that is entered after the transition has triggered. - pub target_state:Identifier, -} diff --git a/lib/rust/flexer/src/automata/symbol.rs b/lib/rust/flexer/src/automata/symbol.rs deleted file mode 100644 index 3b6f90a8dc..0000000000 --- a/lib/rust/flexer/src/automata/symbol.rs +++ /dev/null @@ -1,53 +0,0 @@ -//! Defines a Symbol that is operated on by the finite automata. - - - -// ============== -// === Symbol === -// ============== - -/// An input symbol to a finite automaton. -#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)] -pub struct Symbol { - /// The 4-byte representation of the symbol. - pub value:u32 -} - -impl Symbol { - /// A representation of the null symbol. - pub const NULL:Symbol = Symbol{value:0}; - /// A representation of the end of the file. - pub const EOF_CODE:Symbol = Symbol{value:u32::max_value()}; - /// A representation of an arbitrary invalid unicode symbol. - pub const INVALID_SYMBOL:Symbol = Symbol{value:0xFFFF}; - /// A representation of the group reaching its end without matching. - pub const INCOMPLETE_GROUP:Symbol = Symbol{value:u32::max_value() - 1}; -} - - -// === Trait Impls === - -impl Default for Symbol { - fn default() -> Self { - Symbol::NULL - } -} - -impl From for Symbol { - fn from(value:u32) -> Symbol { - Symbol{value} - } -} - -impl From for Symbol { - fn from(value:char) -> Symbol { - Symbol{value:value as u32} - } -} - -impl From<&Symbol> for Symbol { - fn from(symbol:&Symbol) -> Self { - let value = symbol.value; - Symbol{value} - } -} diff --git a/lib/rust/flexer/src/data.rs b/lib/rust/flexer/src/data.rs deleted file mode 100644 index f0646b3a50..0000000000 --- a/lib/rust/flexer/src/data.rs +++ /dev/null @@ -1,3 +0,0 @@ -//! Generic data-structures to support multiple use-cases. 
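The `Matrix` type defined in `matrix.rs` below stores its elements in row-major order, so element `(row, col)` lives at offset `row * columns + col`. A quick standalone check of that layout (plain `Vec`, not the crate type):

```rust
// Standalone check of the row-major layout used by `Matrix` below.
fn main() {
    let (rows, columns) = (2, 3);
    let matrix:Vec<u32> = (0..(rows * columns) as u32).collect();
    let index = |row:usize, col:usize| row * columns + col;
    assert_eq!(matrix[index(1,2)], 5);  // last cell of a 2×3 matrix
    assert_eq!(matrix[index(0,1)], 1);
    println!("ok");
}
```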
- -pub mod matrix; diff --git a/lib/rust/flexer/src/data/matrix.rs b/lib/rust/flexer/src/data/matrix.rs deleted file mode 100644 index f7590e591b..0000000000 --- a/lib/rust/flexer/src/data/matrix.rs +++ /dev/null @@ -1,75 +0,0 @@ -//! An efficient representation of a 2D matrix. - -use crate::prelude::*; - -use std::ops::Index; -use std::ops::IndexMut; - - - -// ============ -// == Matrix == -// ============ - -/// An efficient 2D matrix implemented on top of [`std::vec::Vec`]. -#[derive(Clone,Debug,Default,PartialEq,Eq)] -pub struct Matrix { - /// The number of rows in the matrix. - rows:usize, - /// The number of columns in the matrix. - columns:usize, - /// The matrix. - matrix:Vec, -} - -impl Matrix { - /// Get the number of rows in the matrix. - pub fn rows(&self) -> usize { - self.rows - } - - /// Get the number of columns in the matrix. - pub fn columns(&self) -> usize { - self.columns - } - - /// Obtain the indices for the rows in this matrix. - pub fn row_indices(&self) -> Range { - 0..self.rows() - } -} - -impl Matrix { - /// Constructs a matrix with the dimensions given by `rows` and `columns`. - pub fn new(rows:usize, columns:usize) -> Self { - let mut matrix = Vec::with_capacity(rows*columns); - for _ in 0..matrix.capacity() { - matrix.push(default()) - } - Self{rows,columns,matrix} - } - - /// Adds a new row to the matrix `self`, filled with default values. - pub fn new_row(&mut self) { - for _ in 0..self.columns { - self.matrix.push(default()); - } - self.rows += 1; - } -} - - -// === Trait Impls === - -impl Index<(usize,usize)> for Matrix { - type Output = T; - fn index(&self, index:(usize,usize)) -> &T { - &self.matrix[index.0*self.columns+index.1] - } -} - -impl IndexMut<(usize,usize)> for Matrix { - fn index_mut(&mut self, index:(usize,usize)) -> &mut T { - &mut self.matrix[index.0*self.columns+index.1] - } -} diff --git a/lib/rust/flexer/src/generate.rs b/lib/rust/flexer/src/generate.rs deleted file mode 100644 index e5e6fa6467..0000000000 --- a/lib/rust/flexer/src/generate.rs +++ /dev/null @@ -1,541 +0,0 @@ -//! This file contains utilities for generating rust code from lexer definitions, allowing the -//! flexer to be specialised for a specific language. - -use crate::prelude::*; -use quote::*; -use syn::*; - -use crate::automata::dfa::DFA; -use crate::automata::dfa::RuleExecutable; -use crate::automata::state::Identifier; -use crate::automata::state::State; -use crate::group::Group; -use crate::group; - -use enso_macro_utils::repr; -use proc_macro2::Literal; -use std::hash::BuildHasher; -use std::result::Result; -use std::fmt; - -use crate as flexer; - - - -// ======================= -// === Code Generation === -// ======================= - -/// Generate specialized code for the provided lexer `definition`. -/// -/// This specialized code is a highly-optimised and tailored lexer that dispatches based on simple -/// code-point switches, with no dynamic lookup. This means that it is very fast, and very low -/// overhead. -pub fn specialize -( definition : &impl flexer::State -, state_type_name : impl Str -, output_type_name : impl Str -) -> Result { - let group_registry = definition.groups(); - let mut body_items = Vec::new(); - body_items.push(run_function(output_type_name)?); - body_items.push(run_current_state_function()); - body_items.push(step(group_registry)); - for group in group_registry.all().iter() { - body_items.extend(automaton_for_group(group,group_registry)?) 
- } - let result = wrap_in_impl_for(state_type_name,body_items)?; - let code = show_code(&result); - Ok(code) -} - - -// === Whole-Lexer Codegen Utilities === - -/// Wrap the provided implementation items into an `impl` block for the provided `state_name` type. -pub fn wrap_in_impl_for -( state_name : impl Into -, body : Vec -) -> Result { - let state_name:Ident = str_to_ident(state_name.into().as_str())?; - let mut tree:ItemImpl = parse_quote! { - #[allow(missing_docs,dead_code,clippy::all)] - impl #state_name {} - }; - tree.items.extend(body); - Ok(tree) -} - -/// Generate the `run` function for the specialized lexer. -/// -/// This function is what the user of the lexer will call to begin execution. -pub fn run_function(output_type_name:impl Str) -> Result { - let output_type_name = str_to_path(output_type_name)?; - let tree:ImplItem = parse_quote! { - pub fn run(&mut self, mut reader:R) -> LexingResult<#output_type_name> { - self.set_up(); - reader.advance_char(&mut self.bookmarks); - while self.run_current_state(&mut reader) == StageStatus::ExitSuccess {} - let result = match self.status { - StageStatus::ExitFinished => LexingResult::success( - mem::take(&mut self.output) - ), - StageStatus::ExitFail => LexingResult::failure( - mem::take(&mut self.output) - ), - _ => LexingResult::partial(mem::take(&mut self.output)) - }; - self.tear_down(); - result - } - }; - Ok(tree) -} - -/// Generate the function responsible for executing the lexer in its current state. -pub fn run_current_state_function() -> ImplItem { - let tree:ImplItem = parse_quote! { - fn run_current_state(&mut self, reader:&mut R) -> StageStatus { - self.status = StageStatus::Initial; - let mut finished = false; - - // Runs until reaching a state that no longer says to continue. - while let Some(next_state) = self.status.continue_as() { - self.logger.debug(||format!("Current character is {:?}.",reader.character().char)); - self.logger.debug(||format!("Continuing in {:?}.",next_state)); - self.status = self.step(next_state,reader); - - if finished && reader.finished(self.bookmarks()) { - self.logger.info("Input finished."); - self.status = StageStatus::ExitFinished - } - finished = reader.character().is_eof(); - - if self.status.should_continue() { - match reader.character().char { - Ok(char) => { - reader.append_result(char); - self.logger.info(||format!("Result is {:?}.",reader.result())); - }, - Err(flexer::prelude::reader::Error::EOF) => { - self.logger.info("Reached EOF."); - }, - Err(flexer::prelude::reader::Error::EndOfGroup) => { - let current_state = self.current_state(); - let group_name = self.groups().group(current_state).name.as_str(); - let err = format!("Missing rules for state {}.", group_name); - self.logger.error(err.as_str()); - panic!(err) - } - Err(_) => { - self.logger.error("Unexpected error!"); - panic!("Unexpected error!") - } - } - reader.advance_char(&mut self.bookmarks); - } - } - - self.status - } - }; - tree -} - -/// Generate the `step` function for the lexer. -/// -/// This function is responsible for dispatching based on the current state, consuming a character, -/// and returning the state to transition to. -pub fn step(groups:&group::Registry) -> ImplItem { - let arms = groups.all().iter().map(|g| step_match_arm(g.id.into())).collect_vec(); - parse_quote! 
{ - fn step<R:LazyReader>(&mut self, next_state:SubStateId, reader:&mut R) -> StageStatus { - let current_state:usize = self.current_state().into(); - match current_state { - #(#arms)* - _ => unreachable_panic!("Unreachable state reached in lexer."), - } - } - } -} - -/// Generate a match arm for the step function. -/// -/// There is one match arm per lexer state. -pub fn step_match_arm(number:usize) -> Arm { - let literal = Literal::usize_unsuffixed(number); - let function_name_str = format!("dispatch_in_state_{}",number); - let func_name:Ident = parse_str(function_name_str.as_str()).unwrap(); - let arm:Arm = parse_quote! { - #literal => self.#func_name(next_state,reader), - }; - arm -} - - -// === Generation for a Specific Lexer State === - -/// Generate the functions that implement the lexer automaton for a given lexer state. -pub fn automaton_for_group -( group : &Group -, registry : &group::Registry -) -> Result<Vec<ImplItem>,GenError> { - let nfa = registry.to_nfa_from(group.id); - let mut rules = Vec::with_capacity(nfa.states.len()); - for state in nfa.states.iter() { - if state.name.is_some() { - rules.push(rule_for_state(state)?); - } - } - let mut dfa = DFA::from(&nfa); - let dispatch_for_dfa = dispatch_in_state(&dfa,group.id.into())?; - let mut dfa_transitions = transitions_for_dfa(&mut dfa,group.id.into())?; - dfa_transitions.push(dispatch_for_dfa); - dfa_transitions.extend(rules); - Ok(dfa_transitions) -} - -/// Generate a set of transition functions for the provided `dfa`, with identifier `id`. -pub fn transitions_for_dfa(dfa:&mut DFA, id:usize) -> Result<Vec<ImplItem>,GenError> { - let mut state_has_overlapping_rules:HashMap<usize,bool> = HashMap::new(); - state_has_overlapping_rules.insert(0,false); - let state_names:Vec<_> = dfa.links.row_indices().map(|ix| (ix, name_for_step(id, ix))).collect(); - let mut transitions = Vec::with_capacity(state_names.len()); - for (ix,name) in state_names.into_iter() { - transitions.push(transition_for_dfa(dfa,name,ix,&mut state_has_overlapping_rules)?) - } - Ok(transitions) -} - -/// Generate a specific transition function for the state identified by `state_ix` in the provided -/// `dfa`. -#[allow(clippy::implicit_hasher)] -pub fn transition_for_dfa -( dfa : &mut DFA -, transition_name : Ident -, state_ix : usize -, has_overlaps : &mut HashMap<usize,bool> -) -> Result<ImplItem,GenError> { - let match_expr:Expr = match_for_transition(dfa,state_ix,has_overlaps)?; - let function:ImplItem = parse_quote! { - fn #transition_name<R:LazyReader>(&mut self, reader:&mut R) -> StageStatus { - #match_expr - } - }; - Ok(function) -} - -/// Generate the pattern match for a given transition function.
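The functions in this file all lean on the same `syn`/`quote` idiom: splice runtime values into a `parse_quote!` template and get a typed syntax-tree node back. Below is a standalone, runnable sketch of that idiom mirroring `step_match_arm` above; the crate versions in the comment are assumptions, not taken from this repository's manifests.

```rust
// Assumed Cargo deps: syn = { version = "1", features = ["full"] },
// quote = "1", proc-macro2 = "1".
use proc_macro2::Literal;
use quote::quote;
use syn::{parse_quote, parse_str, Arm, Ident};

// Builds one match arm as a syntax-tree node, as `step_match_arm` does above.
fn step_match_arm(number:usize) -> Arm {
    let literal = Literal::usize_unsuffixed(number);
    let func_name:Ident = parse_str(&format!("dispatch_in_state_{}",number)).unwrap();
    parse_quote! {
        #literal => self.#func_name(next_state,reader),
    }
}

fn main() {
    let arm = step_match_arm(2);
    // Prints roughly: 2 => self . dispatch_in_state_2 (next_state , reader) ,
    println!("{}", quote!(#arm));
}
```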
-pub fn match_for_transition -( dfa : &mut DFA -, state_ix : usize -, has_overlaps : &mut HashMap -) -> Result { - let overlaps = *has_overlaps.get(&state_ix).unwrap_or(&false); - let state = dfa.callbacks.get(state_ix).expect("Internal error.").clone(); - let mut trigger_state = dfa.links[(state_ix,0)]; - let mut range_start = u32::min_value(); - let divisions:Vec<_> = dfa.alphabet_segmentation.divisions_as_vec(); - let mut branches = Vec::with_capacity(divisions.len()); - for division in divisions.into_iter() { - let ix = division.position; - let sym = division.symbol; - let new_trigger_state = dfa.links[(state_ix,ix)]; - if new_trigger_state != trigger_state { - let range_end = if sym.value != 0 { sym.value - 1 } else { sym.value }; - let current_trigger_state = trigger_state; - let current_range_start = range_start; - trigger_state = new_trigger_state; - range_start = sym.value; - let body = - branch_body(dfa,current_trigger_state,&state,has_overlaps,overlaps)?; - branches.push(Branch::new(Some(current_range_start..=range_end),body)) - } else {} - } - let catch_all_branch_body = branch_body(dfa,trigger_state,&state,has_overlaps,overlaps)?; - let catch_all_branch = Branch::new(None,catch_all_branch_body); - branches.push(catch_all_branch); - let arms:Vec = branches.into_iter().map(Into::into).collect(); - let mut match_expr:ExprMatch = parse_quote! { - match u32::from(reader.character()) { - #(#arms)* - } - }; - match_expr.arms = arms; - Ok(Expr::Match(match_expr)) -} - -/// Generate the branch body for a transition in the DFA. -pub fn branch_body -( dfa : &mut DFA -, target_state : Identifier -, maybe_state : &Option -, has_overlaps : &mut HashMap -, rules_overlap : bool -) -> Result { - if target_state == Identifier::INVALID { - match maybe_state { - None => { - Ok(parse_quote! {{ - StageStatus::ExitFail - }}) - }, - Some(rule_exec) => { - let rule:Expr = match parse_str(rule_exec.code.as_str()) { - Ok(rule) => rule, - Err(_) => return Err(GenError::BadExpression(rule_exec.code.clone())) - }; - if rules_overlap { - Ok(parse_quote! {{ - let rule_bookmark = self.bookmarks.rule_bookmark; - let matched_bookmark = self.bookmarks.matched_bookmark; - self.bookmarks.rewind(rule_bookmark,reader); - self.current_match = reader.pop_result(); - self.#rule(reader); - self.bookmarks.bookmark(matched_bookmark,reader); - StageStatus::ExitSuccess - }}) - } else { - Ok(parse_quote! {{ - let matched_bookmark = self.bookmarks.matched_bookmark; - self.current_match = reader.pop_result(); - self.#rule(reader); - self.bookmarks.bookmark(matched_bookmark,reader); - StageStatus::ExitSuccess - }}) - } - } - } - } else { - let target_state_has_no_rule = match maybe_state { - Some(state) => if !dfa.has_rule_for(target_state) { - dfa.callbacks[target_state.id] = Some(state.clone()); - has_overlaps.insert(target_state.id,true); - true - } else { - false - }, - None => false - }; - - let state_id = Literal::usize_unsuffixed(target_state.id); - let ret:Expr = parse_quote! { - StageStatus::ContinueWith(#state_id.into()) - }; - - if target_state_has_no_rule && !rules_overlap { - Ok(parse_quote! {{ - let rule_bookmark = self.bookmarks.rule_bookmark; - self.bookmarks.bookmark(rule_bookmark,reader); - #ret - }}) - } else { - Ok(parse_quote! {{ - #ret - }}) - } - } -} - -/// Generate the dispatch function for a given lexer state. -/// -/// This dispatch function is responsible for dispatching based on the sub-state of any given lexer -/// state, and is the main part of implementing the actual lexer transitions. 
-pub fn dispatch_in_state(dfa:&DFA, id:usize) -> Result { - let dispatch_name:Ident = str_to_ident(format!("dispatch_in_state_{}",id))?; - let state_names = dfa.links.row_indices().map(|ix| (ix, name_for_step(id,ix))).collect_vec(); - let mut branches = Vec::with_capacity(state_names.len()); - for (ix,name) in state_names.into_iter() { - let literal = Literal::usize_unsuffixed(ix); - let arm:Arm = parse_quote! { - #literal => self.#name(reader), - }; - branches.push(arm); - } - - let pattern_match:ExprMatch = parse_quote! { - match new_state_index.into() { - #(#branches)* - _ => unreachable_panic!("Unreachable state reached in lexer.") - } - }; - let func:ImplItem = parse_quote! { - fn #dispatch_name - ( &mut self - , new_state_index:SubStateId - , reader:&mut R - ) -> StageStatus { - #pattern_match - } - }; - - Ok(func) -} - -/// Generate a name for a given step function. -pub fn name_for_step(in_state:usize, to_state:usize) -> Ident { - let name_str = format!("state_{}_to_{}",in_state,to_state); - parse_str(name_str.as_str()).expect("Impossible to not be a valid identifier.") -} - -/// Generate an executable rule function for a given lexer state. -pub fn rule_for_state(state:&State) -> Result { - match &state.name { - None => unreachable_panic!("Rule for state requested, but state has none."), - Some(name) => { - let rule_name = str_to_ident(name)?; - let code:Expr = match parse_str(state.callback.as_str()) { - Ok(expr) => expr, - Err(_) => return Err(GenError::BadExpression(state.callback.clone())) - }; - if !has_reader_arg(&code) { - return Err(GenError::BadCallbackArgument) - } - - let tree:ImplItem = parse_quote! { - fn #rule_name(&mut self, reader:&mut R) { - #code - } - }; - Ok(tree) - } - } -} - -/// Checks if the given `expr` is a call with a single argument "reader" being passed. -#[allow(clippy::cmp_owned)] -pub fn has_reader_arg(expr:&Expr) -> bool { - match expr { - Expr::MethodCall(expr) => match expr.args.first() { - Some(Expr::Path(path)) => { - match path.path.segments.first() { - Some(segment) => { - segment.ident.to_string() == "reader" - } - _ => false - } - } - _ => false - }, - Expr::Call(expr) => match expr.args.first() { - Some(Expr::Path(path)) => { - match path.path.segments.first() { - Some(segment) => { - segment.ident.to_string() == "reader" - } - _ => false - } - } - _ => false - } - _ => false - } -} - - - -// ================ -// === GenError === -// ================ - -/// Errors that arise during code generation. -#[derive(Clone,Debug,PartialEq)] -pub enum GenError { - /// The callback function does not take a single argument `reader`. - BadCallbackArgument, - /// The provided string is not a valid rust identifier. - BadIdentifier(String), - /// The provided expression isn't a valid rust expression. - BadExpression(String), - /// The provided string is not a valid rust literal. - BadLiteral(String), - /// The provided string is not a valid rust path. - BadPath(String), -} - - -// === Trait Impls === - -impl Display for GenError { - fn fmt(&self, f:&mut fmt::Formatter<'_>) -> fmt::Result { - match self { - GenError::BadCallbackArgument => write!(f, - "Bad argument to a callback function. It must take a single argument `reader`." 
- ), - GenError::BadIdentifier(str) => write!(f,"`{}` is not a valid rust identifier.",str), - GenError::BadExpression(str) => write!(f,"`{}` is not a valid rust expression.",str), - GenError::BadLiteral(str) => write!(f,"`{}` is not a valid rust literal.",str), - GenError::BadPath(str) => write!(f,"`{}` is not a valid rust path.",str), - } - } -} - - - -// ============== -// === Branch === -// ============== - -/// A representation of a dispatch branch for helping to generate pattern arms. -#[allow(missing_docs)] -#[derive(Clone,Debug,PartialEq)] -struct Branch { - pub range:Option>, - pub body:Block -} - -impl Branch { - /// Create a new branch, from the provided `range` and with `body` as the code it executes. - pub fn new(range:Option>, body:Block) -> Branch { - Branch {range,body} - } -} - - -// === Trait Impls === - -impl Into for Branch { - fn into(self) -> Arm { - let body = self.body; - match self.range { - Some(range) => { - let range_start = Literal::u32_unsuffixed(*range.start()); - let range_end = Literal::u32_unsuffixed(*range.end()); - if range.start() == range.end() { - parse_quote! { - #range_start => #body, - } - } else { - parse_quote! { - #range_start..=#range_end => #body, - } - } - } - None => parse_quote! { - _ => #body, - } - } - } -} - - - -// ================= -// === Utilities === -// ================= - -/// Convert a string to an identifier. -pub fn str_to_ident(str:impl Str) -> Result { - parse_str(str.as_ref()).map_err(|_| GenError::BadIdentifier(str.into())) -} - -/// Convert a string to a path. -pub fn str_to_path(str:impl Str) -> Result { - parse_str(str.as_ref()).map_err(|_| GenError::BadPath(str.into())) -} - -/// Convert the syntax tree into a string. -pub fn show_code(tokens:&impl ToTokens) -> String { - repr(tokens) -} - - diff --git a/lib/rust/flexer/src/group.rs b/lib/rust/flexer/src/group.rs deleted file mode 100644 index 630bafb0af..0000000000 --- a/lib/rust/flexer/src/group.rs +++ /dev/null @@ -1,366 +0,0 @@ -//! This module provides an API for grouping multiple flexer rules. - -use crate::automata::nfa::NFA; -use crate::automata::pattern::Pattern; -use crate::group::rule::Rule; - -use itertools::Itertools; -use std::fmt::Display; -use wasm_bindgen::__rt::core::fmt::Formatter; - -pub mod rule; - - - -// ================ -// === Registry === -// ================ - -/// The group Registry is a container for [`Group`]s in the flexer implementation. -/// -/// It allows groups to contain associations between themselves, and also implements useful -/// conversions for groups. -#[derive(Clone,Debug,Default)] -pub struct Registry { - /// The groups defined for the lexer. - groups:Vec -} - -impl Registry { - /// Defines a new group of rules for the lexer with the specified `name` and `parent`. - /// - /// It returns the identifier of the newly-created group. - pub fn define_group - ( &mut self - , name : impl Into - , parent_index : Option - ) -> Identifier { - let id = self.next_id(); - let group = Group::new(id,name.into(),parent_index); - self.groups.push(group); - id - } - - /// Adds an existing `group` to the registry, updating and returning its identifier. - pub fn add_group(&mut self, mut group:Group) -> Identifier { - let new_id = self.next_id(); - group.id = new_id; - self.groups.push(group); - new_id - } - - /// Creates a rule that matches `pattern` for the group identified by `group_id`. - /// - /// Panics if `group_id` refers to a nonexistent group. 
- pub fn create_rule(&mut self, group:Identifier, pattern:&Pattern, callback:impl AsRef) { - let group = self.group_mut(group); - group.create_rule(pattern,callback.as_ref()); - } - - /// Associates the provided `rule` with the group identified by `group_id`. - /// - /// Panics if `group_id` refers to a nonexistent group. - pub fn add_rule(&mut self, group:Identifier, rule:Rule) { - let group = self.group_mut(group); - group.add_rule(rule); - } - - /// Collates the entire set of rules that are matchable when the lexer has the group identified - /// by `group_id` as active. - /// - /// This set of rules includes the rules inherited from any parent groups. - pub fn rules_for(&self, group:Identifier) -> Vec<&Rule> { - let group_handle = self.group(group); - let mut parent = group_handle.parent_index.map(|p| self.group(p)); - let mut rules = (&group_handle.rules).iter().collect_vec(); - while let Some(parent_group) = parent { - if parent_group.id == group_handle.id { - panic!("There should not be cycles in parent links for lexer groups.") - } - rules.extend((&parent_group.rules).iter()); - parent = parent_group.parent_index.map(|p| self.group(p)); - } - rules - } - - /// Obtains a reference to the group for the given `group_id`. - /// - /// As group identifiers can only be created by use of this `Registry`, this will always - /// succeed. - pub fn group(&self, group:Identifier) -> &Group { - self.groups.get(group.0).expect("The group must exist.") - } - - /// Obtains a mutable reference to the group for the given `group_id`. - /// - /// As group identifiers can only be created by use of this `Registry`, this will always - /// succeed. - pub fn group_mut(&mut self, group:Identifier) -> &mut Group { - self.groups.get_mut(group.0).expect("The group should exist.") - } - - /// Converts the group identified by `group_id` into an NFA. - /// - /// Returns `None` if the group does not exist, or if the conversion fails. - pub fn to_nfa_from(&self, group:Identifier) -> NFA { - let group = self.group(group); - let mut nfa = NFA::default(); - let start = nfa.new_state(); - let build = |rule:&Rule| nfa.new_pattern(start,&rule.pattern); - let rules = self.rules_for(group.id); - let callbacks = rules.iter().map(|r| r.callback.clone()).collect_vec(); - let states = rules.into_iter().map(build).collect_vec(); - let end = nfa.new_state(); - for (ix,state) in states.into_iter().enumerate() { - nfa.states[state.id].name = Some(group.callback_name(ix)); - nfa.states[state.id].callback = callbacks.get(ix).unwrap().clone(); - nfa.connect(state,end); - } - nfa - } - - /// Generates the next group identifier for this registry. - fn next_id(&self) -> Identifier { - let val = self.groups.len(); - Identifier(val) - } - - /// Get an immutable reference to the groups contained within the registry. - pub fn all(&self) -> &Vec { - &self.groups - } -} - - - -// ================== -// === Identifier === -// ================== - -/// An identifier for a group. -#[allow(missing_docs)] -#[derive(Copy,Clone,Debug,Default,Eq,PartialEq)] -pub struct Identifier(usize); - - -// === Trait Impls === - -impl From for Identifier { - fn from(id:usize) -> Self { - Identifier(id) - } -} - -impl From<&usize> for Identifier { - fn from(id:&usize) -> Self { - Identifier(*id) - } -} - -impl Into for Identifier { - fn into(self) -> usize { - self.0 - } -} - - - -// =========== -// == Group == -// =========== - -/// A group is a structure for associating multiple rules with each other, and is the basic building -/// block of the flexer. 
-/// -/// A group consists of the following: -/// -/// - A set of [`Rule`s](Rule), each containing a regex pattern and associated callback. -/// - Inherited rules from a parent group, if such a group exists. -/// -/// Internally, the flexer maintains a stack of groups, where only one group can be active at any -/// given time. Rules are matched _in order_, and hence overlaps are handled by the order in which -/// the rules are matched, with the first callback being triggered. -/// -/// Whenever a [`rule.pattern`](Rule::pattern) from the active group is matched against part of the -/// input, the associated [`rule.callback`](Rule::callback) is executed. This callback may exit the -/// current group or even enter a new one. As a result, groups allow us to elegantly model a -/// situation where certain parts of a program (e.g. within a string literal) have very different -/// lexing rules than other portions of a program (e.g. the body of a function). -#[derive(Clone,Debug,Default)] -pub struct Group { - /// A unique identifier for the group. - pub id:Identifier, - /// A name for the group (useful in debugging). - pub name:String, - /// The parent group from which rules are inherited. - /// - /// It is ensured that the group is held mutably. - pub parent_index:Option<Identifier>, - /// A set of flexer rules. - pub rules:Vec<Rule>, -} - -impl Group { - - /// Creates a new group. - pub fn new(id:Identifier, name:impl Into<String>, parent_index:Option<Identifier>) -> Self { - let rules = Vec::new(); - Group{id,name:name.into(),parent_index,rules} - } - - /// Adds a new rule to the current group. - pub fn add_rule(&mut self, rule:Rule) { - self.rules.push(rule) - } - - /// Creates a new rule. - pub fn create_rule(&mut self, pattern:&Pattern, code:&str) { - let pattern_clone = pattern.clone(); - let rule = Rule::new(pattern_clone,code); - self.rules.push(rule) - } - - /// The canonical name for a given rule.
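A hedged usage sketch of the `Registry`/`Group` API above; the group names, patterns, and callback strings are invented for illustration.

```rust
use crate::automata::nfa::NFA;
use crate::automata::pattern::Pattern;
use crate::group::Registry;

/// Hypothetical example: two groups, where `IN_STRING` inherits `ROOT`'s rules.
fn example_registry() -> NFA {
    let mut registry = Registry::default();
    let root      = registry.define_group("ROOT",None);
    let in_string = registry.define_group("IN_STRING",Some(root));

    registry.create_rule(root,&Pattern::char('"'),"self.on_quote(reader)");
    registry.create_rule(in_string,&Pattern::not('"').many1(),"self.on_text(reader)");

    // Each rule becomes an NFA state named via `callback_name` below
    // ("group_0_rule_0", ...); codegen later emits a method of that name
    // which runs the callback string.
    registry.to_nfa_from(root)
}
```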
- pub fn callback_name(&self, rule_ix:usize) -> String { - format!("group_{}_rule_{}",self.id.0,rule_ix) - } -} - -// === Trait Impls === - -impl Into for Group { - fn into(self) -> Registry { - let mut registry = Registry::default(); - registry.add_group(self); - registry - } -} - -impl Display for Group { - fn fmt(&self, f:&mut Formatter<'_>) -> std::fmt::Result { - write!(f,"Group {}",self.name) - } -} - - - -// ============= -// === Tests === -// ============= - -#[cfg(test)] -pub mod tests { - extern crate test; - - use crate::automata::nfa; - use crate::automata::pattern::Pattern; - use crate::group::Group; - use crate::group::Registry; - use crate::group::rule::Rule; - - use std::default::Default; - use test::Bencher; - use enso_prelude::default; - - fn newline() -> Registry { - let pattern = Pattern::char('\n'); - let mut group = Group::default(); - group.add_rule(Rule::new(pattern,"")); - let mut registry = Registry::default(); - registry.add_group(group); - registry - } - - fn letter() -> Registry { - let pattern = Pattern::range('a'..='z'); - let mut group = Group::default(); - group.add_rule(Rule::new(pattern,"")); - group.into() - } - - fn spaces() -> Registry { - let pattern = Pattern::char(' ').many1(); - let mut group = Group::default(); - group.add_rule(Rule::new(pattern,"")); - group.into() - } - - fn letter_and_spaces() -> Registry { - let letter = Pattern::range('a'..='z'); - let spaces = Pattern::char(' ').many1(); - let mut group = Group::default(); - group.add_rule(Rule::new(letter,"")); - group.add_rule(Rule::new(spaces,"")); - group.into() - } - - fn complex_rules(count:usize) -> Registry { - let mut group = Group::default(); - for ix in 0..count { - let string = ix.to_string(); - let all = Pattern::all_of(&string); - let any = Pattern::any_of(&string); - let none = Pattern::none_of(&string); - let all_any_none = all >> any >> none; - let pattern = Pattern::many(&all_any_none); - group.add_rule(Rule::new(pattern.clone(),"")); - } - group.into() - } - - #[test] - fn test_to_nfa_newline() { - assert_eq!(newline().to_nfa_from(default()),nfa::tests::newline()); - } - - #[test] - fn test_to_nfa_letter() { - assert_eq!(letter().to_nfa_from(default()),nfa::tests::letter()); - } - - #[test] - fn test_to_nfa_spaces() { - assert_eq!(spaces().to_nfa_from(default()),nfa::tests::spaces()); - } - - #[test] - fn test_to_nfa_letter_and_spaces() { - let expected = nfa::tests::letter_and_spaces(); - assert_eq!(letter_and_spaces().to_nfa_from(default()),expected); - } - - #[bench] - fn bench_to_nfa_newline(bencher:&mut Bencher) { - bencher.iter(|| newline().to_nfa_from(default())) - } - - #[bench] - fn bench_to_nfa_letter(bencher:&mut Bencher) { - bencher.iter(|| letter().to_nfa_from(default())) - } - - #[bench] - fn bench_to_nfa_spaces(bencher:&mut Bencher) { - bencher.iter(|| spaces().to_nfa_from(default())) - } - - #[bench] - fn bench_to_nfa_letter_and_spaces(bencher:&mut Bencher) { - bencher.iter(|| letter_and_spaces().to_nfa_from(default())) - } - - #[bench] - fn bench_ten_rules(bencher:&mut Bencher) { - bencher.iter(|| complex_rules(10).to_nfa_from(default())) - } - - #[bench] - fn bench_hundred_rules(bencher:&mut Bencher) { - bencher.iter(|| complex_rules(100).to_nfa_from(default())) - } - - #[bench] - fn bench_thousand_rules(bencher:&mut Bencher) { - bencher.iter(|| complex_rules(1000).to_nfa_from(default())) - } -} diff --git a/lib/rust/flexer/src/group/rule.rs b/lib/rust/flexer/src/group/rule.rs deleted file mode 100644 index daa1b0e56c..0000000000 --- 
a/lib/rust/flexer/src/group/rule.rs +++ /dev/null @@ -1,34 +0,0 @@ -//! An API for declaring rust-code callbacks to be executed when a given pattern is matched. -//! -//! A flexer rule is a [`crate::automata::pattern`] associated with rust code to be executed as a -//! callback. - -use crate::automata::pattern::Pattern; - - - -// ========== -// == Rule == -// ========== - -/// A flexer rule. -#[derive(Clone,Debug)] -pub struct Rule { - /// The pattern that triggers the callback. - pub pattern:Pattern, - - /// The code to execute when [`Rule::pattern`] matches, containing rust code as a - /// [`std::string::String`]. - /// - /// This code will be called directly from a method defined on your Lexer (the one that contains - /// a [`crate::Flexer`] instance. To this end, the code you provide as a string must be valid in - /// that context. - pub callback:String, -} - -impl Rule { - /// Creates a new rule. - pub fn new(pattern:Pattern, callback:impl Into) -> Self { - Rule{pattern,callback:callback.into()} - } -} diff --git a/lib/rust/flexer/src/lib.rs b/lib/rust/flexer/src/lib.rs deleted file mode 100644 index c66c1a850b..0000000000 --- a/lib/rust/flexer/src/lib.rs +++ /dev/null @@ -1,1378 +0,0 @@ -#![deny(unconditional_recursion)] -#![feature(test)] -#![warn(missing_copy_implementations)] -#![warn(missing_debug_implementations)] -#![warn(missing_docs)] -#![warn(trivial_casts)] -#![warn(trivial_numeric_casts)] -#![warn(unsafe_code)] -#![warn(unused_import_braces)] - -//! This module exports the API for defining a simple lexer based on a deterministic finite state -//! automaton. -//! -//! Lexers defined using the Flexer are capable of lexing languages of significant complexity, and -//! while they're defined in a simple way (akin to regular grammars), they can work even with -//! context-sensitive languages. -//! -//! The process of defining a lexer involves the user doing the following: -//! -//! 1. Creating a `Lexer` type that wraps the [`Flexer`]. -//! 2. Creating a `State` type, to hold the user-defined lexing state. -//! 3. Implementing [`State`] for the `State` type. -//! 4. Implementing [`Definition`] for the `Lexer`, along with lexing transition rules to lex the -//! language. -//! -//! The result of defining a lexer using the flexer is a hybrid of the code written using this -//! library, and also the code that this library generates to specialize your lexer. -//! -//! # Writing a Lexer -//! -//! As the Flexer is a library for writing lexers, it would be remiss of us not to include a worked -//! example for how to define a lexer. The following example defines a lexer for a small language, -//! and shows you how to integrate the flexer code generation step with your project's build. -//! -//! ## The Language -//! -//! We're going to define a lexer for a very simple language, represented by the following -//! [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) grammar. -//! -//! ```plain -//! a-word = 'a'+; -//! b-word = 'b'+; -//! word = a-word | b-word; -//! space = ' '; -//! spaced-word = space, word; -//! language = word, spaced-word*; -//! ``` -//! -//! ## The Lexer's Output -//! -//! Every lexer needs the ability to write a stream of tokens as its output. A flexer-based lexer -//! can use any type that it wants as its output type, but this language is going to use a very -//! simple `Token` type, wrapped into a `TokenStream`. -//! -//! ``` -//! #[derive(Clone)] -//! pub enum Token { -//! /// A word from the input, consisting of a sequence of all `a` or all `b`. 
-//! Word(String), -//! /// A token that the lexer is unable to recognise. -//! Unrecognized(String) -//! } -//! -//! #[derive(Clone,Default)] -//! pub struct TokenStream { -//! tokens:Vec -//! } -//! -//! impl TokenStream { -//! pub fn push(&mut self,token:Token) { -//! self.tokens.push(token) -//! } -//! } -//! ``` -//! -//! These tokens will be inserted into the token stream by our lexer as it recognises valid portions -//! of our language. -//! -//! Whatever you choose as the `Output` type of your lexer, it will need to implement both -//! [`std::clone::Clone`] and [`std::default::Default`]. -//! -//! ## The Lexer's State -//! -//! Every Flexer-based lexer operates over a state that holds all of the user-defined state -//! information required to define the particular lexer. This state type must conform to the -//! [`State`] trait, which defines important functionality that it must provide to the flexer. -//! -//! In our language, we want to only be able to match words with a preceding space character once -//! we've seen an initial word that doesn't have one. To this end, we need a state in our lexer to -//! record that we've 'seen' the first word. As required by the [`State`] trait, we also need to -//! provide the flexer with an initial state, the state registry, and the bookmarks we use. -//! -//! ``` -//! use flexer::group; -//! use flexer::prelude::reader::BookmarkManager; -//! use flexer::State; -//! # -//! # -//! # // === Token === -//! # -//! # #[derive(Clone)] -//! # pub enum Token { -//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. -//! # Word(String), -//! # /// A token that the lexer is unable to recognise. -//! # Unrecognized(String) -//! # } -//! # -//! # #[derive(Clone,Default)] -//! # pub struct TokenStream { -//! # tokens:Vec -//! # } -//! # -//! # impl TokenStream { -//! # pub fn push(&mut self,token:Token) { -//! # self.tokens.push(token) -//! # } -//! # } -//! -//! -//! // === LexerState === -//! -//! #[derive(Debug)] -//! pub struct LexerState { -//! /// The registry for groups in the lexer. -//! lexer_states:group::Registry, -//! /// The initial state of the lexer. -//! initial_state:group::Identifier, -//! /// The state entered when the first word has been seen. -//! seen_first_word_state:group::Identifier, -//! /// The bookmarks for this lexer. -//! bookmarks:BookmarkManager -//! } -//! ``` -//! -//! The flexer library provides useful functionality to help with defining your lexer state, such as -//! [`group::Registry`] for containing the various states through which your lexer may transition, -//! amd [`prelude::reader::BookmarkManager`] for storing bookmarks. -//! -//! > ### Bookmarks -//! > In order to enable arbitrary lookahead, the flexer provides a system for "bookmarking" a point -//! > in the input stream so that the lexer may return to it later. In fact, this mechanism is used -//! > _by default_ in the implementation to deal with overlapping rules, and so the -//! > [`prelude::reader::BookmarkManager`] provides some bookmarks for you by default. -//! > -//! > As a user, however, you can define additional bookmarks as part of your state, and mark or -//! > return to them as part of your lexer's transition functions (more on this below). -//! -//! Now that we have our state type, we need to define an implementation of [`State`] for it. This -//! is a mostly trivial exercise, but two functions ([`State::new()`] and [`State::specialize`]) -//! require special attention. We'll look at both below. -//! -//! ``` -//! 
use flexer::generate; -//! # use flexer::group; -//! use flexer::generate::GenError; -//! use flexer::prelude::AnyLogger; -//! # use flexer::prelude::reader::BookmarkManager; -//! # use flexer::State; -//! # -//! # -//! # // === Token === -//! # -//! # #[derive(Clone)] -//! # pub enum Token { -//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. -//! # Word(String), -//! # /// A token that the lexer is unable to recognise. -//! # Unrecognized(String) -//! # } -//! # -//! # #[derive(Clone,Default)] -//! # pub struct TokenStream { -//! # tokens:Vec -//! # } -//! # -//! # impl TokenStream { -//! # pub fn push(&mut self,token:Token) { -//! # self.tokens.push(token) -//! # } -//! # } -//! # -//! # -//! # // === LexerState === -//! # -//! # #[derive(Debug)] -//! # pub struct LexerState { -//! # /// The registry for groups in the lexer. -//! # lexer_states:group::Registry, -//! # /// The initial state of the lexer. -//! # initial_state:group::Identifier, -//! # /// The state entered when the first word has been seen. -//! # seen_first_word_state:group::Identifier, -//! # /// The bookmarks for this lexer. -//! # bookmarks:BookmarkManager -//! # } -//! -//! impl flexer::State for LexerState { -//! fn new(_logger:&impl AnyLogger) -> Self { -//! // Here we construct all of the elements needed for our lexer state. This function can -//! // contain arbitrarily complex logic and is only called once at initialization time. -//! let mut lexer_states = group::Registry::default(); -//! let initial_state = lexer_states.define_group("ROOT",None); -//! let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); -//! let bookmarks = BookmarkManager::new(); -//! Self{lexer_states,initial_state,seen_first_word_state,bookmarks} -//! } -//! -//! fn initial_state(&self) -> group::Identifier { -//! self.initial_state -//! } -//! -//! fn groups(&self) -> &group::Registry { -//! &self.lexer_states -//! } -//! -//! fn groups_mut(&mut self) -> &mut group::Registry { -//! &mut self.lexer_states -//! } -//! -//! fn bookmarks(&self) -> &BookmarkManager { -//! &self.bookmarks -//! } -//! -//! fn bookmarks_mut(&mut self) -> &mut BookmarkManager { -//! &mut self.bookmarks -//! } -//! -//! fn specialize(&self) -> Result { -//! // It is very important to pass both the type name of your lexer and your output -//! // correctly here. This function should always be implemented as a call to the -//! // below-used function. -//! generate::specialize(self,"TestLexer","Token") -//! } -//! } -//! ``` -//! -//! ## Defining the Lexer Type -//! -//! With our state type defined, we now have the prerequisites for defining the lexer itself! -//! -//! The notion behind the way we define lexers in the flexer is to use a chain of -//! [`std::ops::Deref`] implementations to make the disparate parts feel like a cohesive whole. -//! The [`Flexer`] itself already implements deref to your state type, so all that remains is to do -//! the following: -//! -//! 1. Define your lexer struct itself, containing an instance of the [`Flexer`], parametrised by -//! your state and output types. -//! -//! ``` -//! use flexer::Flexer; -//! # use flexer::generate; -//! # use flexer::group; -//! # use flexer::prelude::GenError; -//! # use flexer::prelude::AnyLogger; -//! use flexer::prelude::logger::Disabled; -//! # use flexer::prelude::reader::BookmarkManager; -//! # use flexer::State; -//! -//! type Logger = Disabled; -//! # -//! # -//! # // === Token === -//! # -//! # #[derive(Clone)] -//! # pub enum Token { -//! 
# /// A word from the input, consisting of a sequence of all `a` or all `b`. -//! # Word(String), -//! # /// A token that the lexer is unable to recognise. -//! # Unrecognized(String) -//! # } -//! # -//! # #[derive(Clone,Default)] -//! # pub struct TokenStream { -//! # tokens:Vec -//! # } -//! # -//! # impl TokenStream { -//! # pub fn push(&mut self,token:Token) { -//! # self.tokens.push(token) -//! # } -//! # } -//! # -//! # -//! # // === LexerState === -//! # -//! # #[derive(Debug)] -//! # pub struct LexerState { -//! # /// The registry for groups in the lexer. -//! # lexer_states:group::Registry, -//! # /// The initial state of the lexer. -//! # initial_state:group::Identifier, -//! # /// The state entered when the first word has been seen. -//! # seen_first_word_state:group::Identifier, -//! # /// The bookmarks for this lexer. -//! # bookmarks:BookmarkManager -//! # } -//! # -//! # impl flexer::State for LexerState { -//! # fn new(_logger:&impl AnyLogger) -> Self { -//! # // Here we construct all of the elements needed for our lexer state. This function can -//! # // contain arbitrarily complex logic and is only called once at initialization time. -//! # let mut lexer_states = group::Registry::default(); -//! # let initial_state = lexer_states.define_group("ROOT",None); -//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); -//! # let bookmarks = BookmarkManager::new(); -//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} -//! # } -//! # -//! # fn initial_state(&self) -> group::Identifier { -//! # self.initial_state -//! # } -//! # -//! # fn groups(&self) -> &group::Registry { -//! # &self.lexer_states -//! # } -//! # -//! # fn groups_mut(&mut self) -> &mut group::Registry { -//! # &mut self.lexer_states -//! # } -//! # -//! # fn bookmarks(&self) -> &BookmarkManager { -//! # &self.bookmarks -//! # } -//! # -//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { -//! # &mut self.bookmarks -//! # } -//! # -//! # fn specialize(&self) -> Result { -//! # // It is very important to pass both the type name of your lexer and your output -//! # // correctly here. This function should always be implemented as a call to the -//! # // below-used function. -//! # generate::specialize(self,"TestLexer","Token") -//! # } -//! # } -//! -//! -//! // === Lexer === -//! -//! pub struct Lexer { -//! lexer:Flexer -//! } -//! ``` -//! -//! You'll note that the `Flexer` also takes a logging implementation from the Enso logging library -//! as a type parameter. This lets the client of the library configure the behaviour of logging in -//! their lexer. We recommend aliasing the current logger type (as shown above) for ease of use. -//! -//! 2. Implement a `new()` function for your lexer. -//! -//! ``` -//! # use flexer::Flexer; -//! # use flexer::generate; -//! # use flexer::group; -//! # use flexer::prelude::AnyLogger; -//! # use flexer::prelude::GenError; -//! # use flexer::prelude::logger::Disabled; -//! # use flexer::prelude::reader::BookmarkManager; -//! # use flexer::State; -//! # -//! # type Logger = Disabled; -//! # -//! # -//! # // === Token === -//! # -//! # #[derive(Clone)] -//! # pub enum Token { -//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. -//! # Word(String), -//! # /// A token that the lexer is unable to recognise. -//! # Unrecognized(String) -//! # } -//! # -//! # #[derive(Clone,Default)] -//! # pub struct TokenStream { -//! # tokens:Vec -//! # } -//! # -//! # impl TokenStream { -//! 
# pub fn push(&mut self,token:Token) { -//! # self.tokens.push(token) -//! # } -//! # } -//! # -//! # -//! # // === LexerState === -//! # -//! # #[derive(Debug)] -//! # pub struct LexerState { -//! # /// The registry for groups in the lexer. -//! # lexer_states:group::Registry, -//! # /// The initial state of the lexer. -//! # initial_state:group::Identifier, -//! # /// The state entered when the first word has been seen. -//! # seen_first_word_state:group::Identifier, -//! # /// The bookmarks for this lexer. -//! # bookmarks:BookmarkManager -//! # } -//! # -//! # impl flexer::State for LexerState { -//! # fn new(_logger:&impl AnyLogger) -> Self { -//! # // Here we construct all of the elements needed for our lexer state. This function can -//! # // contain arbitrarily complex logic and is only called once at initialization time. -//! # let mut lexer_states = group::Registry::default(); -//! # let initial_state = lexer_states.define_group("ROOT",None); -//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); -//! # let bookmarks = BookmarkManager::new(); -//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} -//! # } -//! # -//! # fn initial_state(&self) -> group::Identifier { -//! # self.initial_state -//! # } -//! # -//! # fn groups(&self) -> &group::Registry { -//! # &self.lexer_states -//! # } -//! # -//! # fn groups_mut(&mut self) -> &mut group::Registry { -//! # &mut self.lexer_states -//! # } -//! # -//! # fn bookmarks(&self) -> &BookmarkManager { -//! # &self.bookmarks -//! # } -//! # -//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { -//! # &mut self.bookmarks -//! # } -//! # -//! # fn specialize(&self) -> Result { -//! # // It is very important to pass both the type name of your lexer and your output -//! # // correctly here. This function should always be implemented as a call to the -//! # // below-used function. -//! # generate::specialize(self,"TestLexer","Token") -//! # } -//! # } -//! # -//! # -//! # // === Lexer === -//! # -//! # pub struct Lexer { -//! # lexer:Flexer -//! # } -//! -//! impl Lexer { -//! pub fn new() -> Self { -//! let lexer = Flexer::new(Logger::new("Lexer")); -//! Lexer{lexer} -//! } -//! } -//! ``` -//! -//! 3. Define [`std::ops::Deref`] and [`std::ops::DerefMut`] for your lexer. -//! -//! ``` -//! # use flexer::Flexer; -//! # use flexer::generate; -//! # use flexer::group; -//! # use flexer::prelude::AnyLogger; -//! # use flexer::prelude::GenError; -//! # use flexer::prelude::logger::Disabled; -//! # use flexer::prelude::reader::BookmarkManager; -//! # use flexer::State; -//! use std::ops::Deref; -//! use std::ops::DerefMut; -//! # -//! # type Logger = Disabled; -//! # -//! # -//! # // === Token === -//! # -//! # #[derive(Clone)] -//! # pub enum Token { -//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. -//! # Word(String), -//! # /// A token that the lexer is unable to recognise. -//! # Unrecognized(String) -//! # } -//! # -//! # #[derive(Clone,Default)] -//! # pub struct TokenStream { -//! # tokens:Vec -//! # } -//! # -//! # impl TokenStream { -//! # pub fn push(&mut self,token:Token) { -//! # self.tokens.push(token) -//! # } -//! # } -//! # -//! # -//! # // === LexerState === -//! # -//! # #[derive(Debug)] -//! # pub struct LexerState { -//! # /// The registry for groups in the lexer. -//! # lexer_states:group::Registry, -//! # /// The initial state of the lexer. -//! # initial_state:group::Identifier, -//! 
# /// The state entered when the first word has been seen. -//! # seen_first_word_state:group::Identifier, -//! # /// The bookmarks for this lexer. -//! # bookmarks:BookmarkManager -//! # } -//! # -//! # impl flexer::State for LexerState { -//! # fn new(_logger:&impl AnyLogger) -> Self { -//! # // Here we construct all of the elements needed for our lexer state. This function can -//! # // contain arbitrarily complex logic and is only called once at initialization time. -//! # let mut lexer_states = group::Registry::default(); -//! # let initial_state = lexer_states.define_group("ROOT",None); -//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); -//! # let bookmarks = BookmarkManager::new(); -//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} -//! # } -//! # -//! # fn initial_state(&self) -> group::Identifier { -//! # self.initial_state -//! # } -//! # -//! # fn groups(&self) -> &group::Registry { -//! # &self.lexer_states -//! # } -//! # -//! # fn groups_mut(&mut self) -> &mut group::Registry { -//! # &mut self.lexer_states -//! # } -//! # -//! # fn bookmarks(&self) -> &BookmarkManager { -//! # &self.bookmarks -//! # } -//! # -//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { -//! # &mut self.bookmarks -//! # } -//! # -//! # fn specialize(&self) -> Result { -//! # // It is very important to pass both the type name of your lexer and your output -//! # // correctly here. This function should always be implemented as a call to the -//! # // below-used function. -//! # generate::specialize(self,"TestLexer","Token") -//! # } -//! # } -//! # -//! # -//! # // === Lexer === -//! # -//! # pub struct Lexer { -//! # lexer:Flexer -//! # } -//! # -//! # impl Lexer { -//! # pub fn new() -> Self { -//! # let lexer = Flexer::new(Logger::new("Lexer")); -//! # Lexer{lexer} -//! # } -//! # } -//! -//! impl Deref for Lexer { -//! type Target = Flexer ; -//! fn deref(&self) -> &Self::Target { -//! &self.lexer -//! } -//! } -//! impl DerefMut for Lexer { -//! fn deref_mut(&mut self) -> &mut Self::Target { -//! &mut self.lexer -//! } -//! } -//! ``` -//! -//! You'll note that here we've instantiated the flexer with a `Logger`. This is used for providing -//! debug information during development, and can be accessed from all scopes of your lexer. In -//! release mode, however, logging calls at the "trace", "debug", and "info" levels are optimised -//! away. -//! -//! ## Defining the Lexing Rules -//! -//! Flexer-based lexers operate by matching on a series of [`automata::pattern::Pattern`]s that -//! describe the language that it is trying to lex. It combines these patterns with "transition -//! functions" that may execute arbitrary code when a pattern matches on the lexer's input. -//! -//! In order to define the lexing rules, we need to implement [`Definition`] for our lexer, -//! particularly the [`Definition::define()`] function. -//! -//! ``` -//! use flexer::automata::pattern::Pattern; -//! # use flexer::Flexer; -//! # use flexer::generate; -//! use flexer::group::Registry; -//! # use flexer::group; -//! # use flexer::prelude::AnyLogger; -//! # use flexer::prelude::GenError; -//! # use flexer::prelude::logger::Disabled; -//! # use flexer::prelude::reader::BookmarkManager; -//! # use flexer::State; -//! use flexer; -//! # use std::ops::Deref; -//! # use std::ops::DerefMut; -//! # -//! # type Logger = Disabled; -//! # -//! # -//! # // === Token === -//! # -//! # #[derive(Clone)] -//! # pub enum Token { -//! 
# /// A word from the input, consisting of a sequence of all `a` or all `b`. -//! # Word(String), -//! # /// A token that the lexer is unable to recognise. -//! # Unrecognized(String) -//! # } -//! # -//! # #[derive(Clone,Default)] -//! # pub struct TokenStream { -//! # tokens:Vec -//! # } -//! # -//! # impl TokenStream { -//! # pub fn push(&mut self,token:Token) { -//! # self.tokens.push(token) -//! # } -//! # } -//! # -//! # -//! # // === LexerState === -//! # -//! # #[derive(Debug)] -//! # pub struct LexerState { -//! # /// The registry for groups in the lexer. -//! # lexer_states:group::Registry, -//! # /// The initial state of the lexer. -//! # initial_state:group::Identifier, -//! # /// The state entered when the first word has been seen. -//! # seen_first_word_state:group::Identifier, -//! # /// The bookmarks for this lexer. -//! # bookmarks:BookmarkManager -//! # } -//! # -//! # impl flexer::State for LexerState { -//! # fn new(_logger:&impl AnyLogger) -> Self { -//! # // Here we construct all of the elements needed for our lexer state. This function can -//! # // contain arbitrarily complex logic and is only called once at initialization time. -//! # let mut lexer_states = group::Registry::default(); -//! # let initial_state = lexer_states.define_group("ROOT",None); -//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); -//! # let bookmarks = BookmarkManager::new(); -//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} -//! # } -//! # -//! # fn initial_state(&self) -> group::Identifier { -//! # self.initial_state -//! # } -//! # -//! # fn groups(&self) -> &group::Registry { -//! # &self.lexer_states -//! # } -//! # -//! # fn groups_mut(&mut self) -> &mut group::Registry { -//! # &mut self.lexer_states -//! # } -//! # -//! # fn bookmarks(&self) -> &BookmarkManager { -//! # &self.bookmarks -//! # } -//! # -//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { -//! # &mut self.bookmarks -//! # } -//! # -//! # fn specialize(&self) -> Result { -//! # // It is very important to pass both the type name of your lexer and your output -//! # // correctly here. This function should always be implemented as a call to the -//! # // below-used function. -//! # generate::specialize(self,"TestLexer","Token") -//! # } -//! # } -//! # -//! # -//! # // === Lexer === -//! # -//! # pub struct Lexer { -//! # lexer:Flexer -//! # } -//! # -//! # impl Lexer { -//! # pub fn new() -> Self { -//! # let lexer = Flexer::new(Logger::new("Lexer")); -//! # Lexer{lexer} -//! # } -//! # } -//! # -//! # impl Deref for Lexer { -//! # type Target = Flexer ; -//! # fn deref(&self) -> &Self::Target { -//! # &self.lexer -//! # } -//! # } -//! # impl DerefMut for Lexer { -//! # fn deref_mut(&mut self) -> &mut Self::Target { -//! # &mut self.lexer -//! # } -//! # } -//! -//! impl flexer::Definition for Lexer { -//! fn define() -> Self { -//! // First we instantiate our lexer. Definitions take place _directly_ on the lexer, and -//! // manipulate runtime state. -//! let mut lexer = Self::new(); -//! -//! // Then, we define the patterns that we're going to use. For an overview of the p -//! let a_word = Pattern::char('a').many1(); -//! let b_word = Pattern::char('b').many1(); -//! let space = Pattern::char(' '); -//! let spaced_a_word = &space >> &a_word; -//! let spaced_b_word = &space >> &b_word; -//! let any = Pattern::any(); -//! let end = Pattern::eof(); -//! -//! // Next, we define groups of lexer rules. This uses the groups that we've defined in our -//! 
// lexer's state, and the patterns we've defined above. -//! let root_group_id = lexer.initial_state; -//! let root_group = lexer.groups_mut().group_mut(root_group_id); -//! root_group.create_rule(&a_word,"self.on_first_word(reader)"); -//! root_group.create_rule(&b_word,"self.on_first_word(reader)"); -//! root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)"); -//! root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)"); -//! -//! let seen_first_word_group_id = lexer.seen_first_word_state; -//! let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id); -//! seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)"); -//! seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)"); -//! seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)"); -//! seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)"); -//! -//! lexer -//! } -//! -//! /// This function just returns the lexer's groups. -//! fn groups(&self) -> &Registry { -//! self.lexer.groups() -//! } -//! -//! /// Code you want to run before lexing begins. -//! fn set_up(&mut self) {} -//! -//! /// Code you want to run after lexing finishes. -//! fn tear_down(&mut self) {} -//! } -//! ``` -//! -//! > ### Transition Functions -//! > You may be wondering why the transition functions are specified as strings. This allows us to -//! > generate highly-efficient, specialized code for your lexer once you define it. More on this -//! > later. -//! -//! A [`group::Group`] in the lexer is like a state that operates on a stack. A transition function -//! can arbitrarily activate or deactivate a group on the flexer's stack, allowing you to perform -//! context-sensitive lexing behaviour. For more information (including on how to use parent groups -//! to inherit rules), see the relevant module. -//! -//! For more information on the [`automata::pattern::Pattern`] APIs used above, please see the -//! relevant module in this crate. -//! -//! ## Defining the Transition Functions -//! -//! You'll have noticed that, up above, we told the rules to use a bunch of transition functions -//! that we've not yet talked about. These functions can be defined anywhere you like, as long as -//! they are in scope in the file in which you are defining your lexer. We do, however, recommend -//! defining them on your lexer itself, so they can access and manipulate lexer state, so that's -//! what we're going to do here. -//! -//! ``` -//! # use flexer::automata::pattern::Pattern; -//! # use flexer::Flexer; -//! # use flexer::generate; -//! # use flexer::group::Registry; -//! # use flexer::group; -//! # use flexer::prelude::AnyLogger; -//! use flexer::prelude::LazyReader; -//! # use flexer::prelude::GenError; -//! # use flexer::prelude::logger::Disabled; -//! # use flexer::prelude::reader::BookmarkManager; -//! # use flexer::State; -//! # use flexer; -//! # use std::ops::Deref; -//! # use std::ops::DerefMut; -//! # -//! # type Logger = Disabled; -//! # -//! # -//! # // === Token === -//! # -//! # #[derive(Clone)] -//! # pub enum Token { -//! # /// A word from the input, consisting of a sequence of all `a` or all `b`. -//! # Word(String), -//! # /// A token that the lexer is unable to recognise. -//! # Unrecognized(String) -//! # } -//! # -//! # #[derive(Clone,Default)] -//! # pub struct TokenStream { -//! # tokens:Vec -//! # } -//! # -//! # impl TokenStream { -//! # pub fn push(&mut self,token:Token) { -//! # self.tokens.push(token) -//! 
# } -//! # } -//! # -//! # -//! # // === LexerState === -//! # -//! # #[derive(Debug)] -//! # pub struct LexerState { -//! # /// The registry for groups in the lexer. -//! # lexer_states:group::Registry, -//! # /// The initial state of the lexer. -//! # initial_state:group::Identifier, -//! # /// The state entered when the first word has been seen. -//! # seen_first_word_state:group::Identifier, -//! # /// The bookmarks for this lexer. -//! # bookmarks:BookmarkManager -//! # } -//! # -//! # impl flexer::State for LexerState { -//! # fn new(_logger:&impl AnyLogger) -> Self { -//! # // Here we construct all of the elements needed for our lexer state. This function can -//! # // contain arbitrarily complex logic and is only called once at initialization time. -//! # let mut lexer_states = group::Registry::default(); -//! # let initial_state = lexer_states.define_group("ROOT",None); -//! # let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None); -//! # let bookmarks = BookmarkManager::new(); -//! # Self{lexer_states,initial_state,seen_first_word_state,bookmarks} -//! # } -//! # -//! # fn initial_state(&self) -> group::Identifier { -//! # self.initial_state -//! # } -//! # -//! # fn groups(&self) -> &group::Registry { -//! # &self.lexer_states -//! # } -//! # -//! # fn groups_mut(&mut self) -> &mut group::Registry { -//! # &mut self.lexer_states -//! # } -//! # -//! # fn bookmarks(&self) -> &BookmarkManager { -//! # &self.bookmarks -//! # } -//! # -//! # fn bookmarks_mut(&mut self) -> &mut BookmarkManager { -//! # &mut self.bookmarks -//! # } -//! # -//! # fn specialize(&self) -> Result { -//! # // It is very important to pass both the type name of your lexer and your output -//! # // correctly here. This function should always be implemented as a call to the -//! # // below-used function. -//! # generate::specialize(self,"TestLexer","Token") -//! # } -//! # } -//! # -//! # -//! # // === Lexer === -//! # -//! # pub struct Lexer { -//! # lexer:Flexer -//! # } -//! # -//! # impl Lexer { -//! # pub fn new() -> Self { -//! # let lexer = Flexer::new(Logger::new("Lexer")); -//! # Lexer{lexer} -//! # } -//! # } -//! # -//! # impl Deref for Lexer { -//! # type Target = Flexer ; -//! # fn deref(&self) -> &Self::Target { -//! # &self.lexer -//! # } -//! # } -//! # impl DerefMut for Lexer { -//! # fn deref_mut(&mut self) -> &mut Self::Target { -//! # &mut self.lexer -//! # } -//! # } -//! # -//! # impl flexer::Definition for Lexer { -//! # fn define() -> Self { -//! # // First we instantiate our lexer. Definitions take place _directly_ on the lexer, and -//! # // manipulate runtime state. -//! # let mut lexer = Self::new(); -//! # -//! # // Then, we define the patterns that we're going to use. For an overview of the p -//! # let a_word = Pattern::char('a').many1(); -//! # let b_word = Pattern::char('b').many1(); -//! # let space = Pattern::char(' '); -//! # let spaced_a_word = &space >> &a_word; -//! # let spaced_b_word = &space >> &b_word; -//! # let any = Pattern::any(); -//! # let end = Pattern::eof(); -//! # -//! # // Next, we define groups of lexer rules. This uses the groups that we've defined in our -//! # // lexer's state, and the patterns we've defined above. -//! # let root_group_id = lexer.initial_state; -//! # let root_group = lexer.groups_mut().group_mut(root_group_id); -//! # root_group.create_rule(&a_word,"self.on_first_word(reader)"); -//! # root_group.create_rule(&b_word,"self.on_first_word(reader)"); -//! 
# root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)"); -//! # root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)"); -//! # -//! # let seen_first_word_group_id = lexer.seen_first_word_state; -//! # let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id); -//! # seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)"); -//! # seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)"); -//! # seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)"); -//! # seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)"); -//! # -//! # lexer -//! # } -//! # -//! # /// This function just returns the lexer's groups. -//! # fn groups(&self) -> &Registry { -//! # self.lexer.groups() -//! # } -//! # -//! # /// Code you want to run before lexing begins. -//! # fn set_up(&mut self) {} -//! # -//! # /// Code you want to run after lexing finishes. -//! # fn tear_down(&mut self) {} -//! # } -//! -//! impl Lexer { -//! pub fn on_first_word(&mut self, _reader:&mut R) { -//! let str = self.current_match.clone(); -//! let ast = Token::Word(str); -//! self.output.push(ast); -//! let id = self.seen_first_word_state; -//! self.push_state(id); -//! } -//! -//! pub fn on_spaced_word(&mut self, _reader:&mut R) { -//! let str = self.current_match.clone(); -//! let ast = Token::Word(String::from(str.trim())); -//! self.output.push(ast); -//! } -//! -//! pub fn on_err_suffix_first_word(&mut self, _reader:&mut R) { -//! let ast = Token::Unrecognized(self.current_match.clone()); -//! self.output.push(ast); -//! } -//! -//! pub fn on_err_suffix(&mut self, reader:&mut R) { -//! self.on_err_suffix_first_word(reader); -//! self.pop_state(); -//! } -//! -//! pub fn on_no_err_suffix_first_word(&mut self, _reader:&mut R) {} -//! -//! pub fn on_no_err_suffix(&mut self, reader:&mut R) { -//! self.on_no_err_suffix_first_word(reader); -//! self.pop_state(); -//! } -//! } -//! ``` -//! -//! > ### Magic Transition Functions -//! > The transition functions are the 'secret sauce', so to speak, of the Flexer. They are called -//! > when a rule matches, and allow arbitrary code to manipulate the lexer. This means that the -//! > flexer can be used to define very complex grammars while still keeping a simple interface and -//! > ensuring performant execution. -//! -//! You'll note that all of these functions have a couple of things in common: -//! -//! 1. They have a type parameter `R` that conforms to the [`prelude::LazyReader`] trait. -//! 2. They take an argument of type `R`, that is the reader over which the lexer is running. -//! -//! Both of these, combined, allow the transition functions to manipulate the text being read by the -//! lexer. -//! -//! ## Specializing the Lexer -//! -//! In order to actually _use_ the lexer that you've defined, you need to specialize it to the rules -//! that you define. Unfortunately, `cargo` doesn't have support for post-build hooks, and so this -//! is a little more involved than we'd like it to be. -//! -//! 1. Create a file that performs the definition of the lexer as above. It can use multiple files -//! in its crate as long as they are publicly exposed. -//! 2. Create a separate cargo project that has a prebuild hook in its `build.rs`. -//! 3. In that build.rs, you need to: -//! 1. Import the lexer definition and instantiate it using `::define()`. -//! 2. Call [`State::specialize()`] on the resultant lexer. This will generate a string that -//! 
contains the optimised lexer implementation.
-//!    3. Write both the generated code and the code from the original lexer definition into an
-//!       output file.
-//! 4. Re-export this output file from your cargo project's `lib.rs`.
-//!
-//! The process of specialization will generate quite a bit of code, but most importantly it will
-//! generate `pub fn run<R:LazyReader>(&mut self, mut reader:R) -> LexingResult<Output>`, where
-//! `Output` is your lexer's token type. All of these functions are defined on your lexer type
-//! (the one whose name is provided to `specialize()`).
-//!
-//! ## In Summary
-//!
-//! The flexer allows its clients to define highly optimised lexer implementations that are capable
-//! of lexing languages of a high complexity.
-
-use crate::prelude::*;
-use prelude::logger::*;
-
-use crate::generate::GenError;
-use prelude::logger::AnyLogger;
-use prelude::reader::BookmarkManager;
-
-pub mod automata;
-pub mod data;
-pub mod generate;
-pub mod group;
-
-/// Useful libraries for working with the flexer.
-pub mod prelude {
-    pub use crate::generate::GenError;
-    pub use enso_prelude::*;
-    pub use lazy_reader::LazyReader;
-    pub use lazy_reader::Reader;
-    pub use logger::AnyLogger;
-
-    /// The lazy reader library.
-    pub mod reader {
-        pub use lazy_reader::*;
-    }
-
-    /// The Enso logging library.
-    pub mod logger {
-        pub use enso_logger::*;
-        pub use enso_logger::disabled::Logger as Disabled;
-        pub use enso_logger::enabled::Logger as Enabled;
-    }
-}
-
-
-
-// =================
-// === Constants ===
-// =================
-
-mod constants {
-    /// The number of 'frames' to reserve in the state stack, aiming to avoid re-allocation in hot
-    /// code paths.
-    pub const STATE_STACK_RESERVATION:usize = 1024;
-}
-
-
-
-// ==============
-// === Flexer ===
-// ==============
-
-/// The flexer is an engine for generating lexers.
-///
-/// Akin to flex and other lexer generators, it is given a definition as a series of rules from
-/// which it then generates code for a highly optimised lexer implemented on top of a
-/// [DFA](https://en.wikipedia.org/wiki/Deterministic_finite_automaton).
-///
-/// Lexers defined using the flexer work on a stack of _states_, where a state is represented by a
-/// [`crate::group::Group`]. Being in a given state (represented below by the top of the
-/// `state_stack`) means that the flexer can match a certain set of rules associated with that
-/// state. The user may cause the lexer to transition between states by pushing and popping states
-/// on the stack, thus allowing a much more flexible lexing engine than pure regular grammars.
-#[derive(Clone,Debug)]
-pub struct Flexer<Definition,Output,Logger> {
-    /// The stack of states that are active during lexer execution.
-    pub state_stack:NonEmptyVec<group::Identifier>,
-    /// The result of the current stage of the DFA.
-    pub status:StageStatus,
-    /// The tokens that have been lexed.
-    pub output:Output,
-    /// The text of the current match of the lexer.
-    pub current_match:String,
-    /// A logger for the flexer, accessible in user definitions.
-    pub logger:Logger,
-    /// The definition of the user-provided state for the lexer.
-    definition:Definition,
-}
-
-impl<Definition,Output,Logger> Flexer<Definition,Output,Logger>
-where Definition : State,
-      Logger     : AnyLogger,
-      Output     : Default {
-    /// Create a new lexer instance.
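-    ///
-    /// As a brief illustrative sketch (assuming the `LexerState` and `TokenStream` types from the
-    /// tutorial above, with a disabled logger), construction looks like this:
-    ///
-    /// ```ignore
-    /// use flexer::prelude::AnyLogger;
-    /// use flexer::prelude::logger::Disabled as Logger;
-    ///
-    /// // The lexer starts in the initial state declared by its `State` implementation.
-    /// let lexer:Flexer<LexerState,TokenStream,Logger> = Flexer::new(Logger::new("Lexer"));
-    /// assert_eq!(lexer.current_state(), lexer.initial_state());
-    /// ```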
-    pub fn new(parent_logger:impl AnyLogger) -> Flexer<Definition,Output,Logger> {
-        let logger           = <Logger>::sub(&parent_logger,"Flexer");
-        let status           = default();
-        let output           = default();
-        let definition       = Definition::new(&logger);
-        let initial_state_id = definition.initial_state();
-        let mut state_stack  = NonEmptyVec::singleton(initial_state_id);
-        let current_match    = default();
-
-        state_stack.reserve(constants::STATE_STACK_RESERVATION);
-        Flexer{state_stack,status,output,definition,current_match,logger}
-    }
-}
-
-impl<Definition,Output,Logger> Flexer<Definition,Output,Logger>
-where Definition : State,
-      Output     : Clone,
-      Logger     : AnyLogger {
-    /// Get the lexer result.
-    pub fn result(&mut self) -> &Output {
-        &self.output
-    }
-
-    /// Get the lexer's initial state.
-    pub fn initial_state(&self) -> group::Identifier {
-        self.definition.initial_state()
-    }
-
-    /// Get the state that the lexer is currently in.
-    pub fn current_state(&self) -> group::Identifier {
-        *self.state_stack.last()
-    }
-
-    /// Tell the lexer to enter the state described by `state`.
-    pub fn push_state(&mut self, state:group::Identifier) {
-        self.logger.group_begin(
-            ||format!("Enter State: {}",self.groups().group(state).name.as_str())
-        );
-        self.state_stack.push(state);
-    }
-
-    /// End the current state, returning the popped state identifier if one was ended.
-    ///
-    /// It will never end the initial state of the lexer.
-    pub fn pop_state(&mut self) -> Option<group::Identifier> {
-        let result = self.state_stack.pop();
-        match result {
-            None        => (),
-            Some(ident) => debug!(self.logger,"Leave State: {self.groups().group(ident)}"),
-        };
-        self.logger.group_end();
-        result
-    }
-
-    /// End states until the specified `state` is reached, leaving the lexer in `state`.
-    ///
-    /// If `state` does not exist on the lexer's stack, then the lexer will be left in the root
-    /// state. Additionally, this function cannot pop the final occurrence of the root state.
-    pub fn pop_states_until(&mut self, state:group::Identifier) -> group::Identifier {
-        while self.current_state() != state && self.current_state() != self.initial_state() {
-            self.pop_state();
-        }
-        *self.state_stack.last()
-    }
-
-    /// End states up to and including the first instance of `state`, returning the identifier of
-    /// the new state the lexer is in.
-    ///
-    /// If `state` does not exist on the lexer's stack, the lexer will be left in the root state.
-    /// Additionally, this function cannot pop the final occurrence of the root state.
-    pub fn pop_states_including(&mut self, state:group::Identifier) -> group::Identifier {
-        while self.current_state() != state && self.current_state() != self.initial_state() {
-            self.pop_state();
-        }
-        self.pop_state();
-        *self.state_stack.last()
-    }
-
-    /// Check if the lexer is currently in the state described by `state`.
-    pub fn is_in_state(&self, state:group::Identifier) -> bool {
-        self.current_state() == state
-    }
-}
-
-// === Trait Impls ===
-
-impl<Definition,Output,Logger> Deref for Flexer<Definition,Output,Logger> {
-    type Target = Definition;
-    fn deref(&self) -> &Self::Target {
-        &self.definition
-    }
-}
-
-impl<Definition,Output,Logger> DerefMut for Flexer<Definition,Output,Logger> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.definition
-    }
-}
-
-
-
-// ==================
-// === SubStateId ===
-// ==================
-
-/// An identifier for a sub-state of the lexer to transition to.
-#[derive(Copy,Clone,Debug,Default,PartialEq)]
-pub struct SubStateId(usize);
-
-impl SubStateId {
-    /// Create a new `SubStateId` with the specified value.
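-    ///
-    /// A short illustrative sketch of the conversions defined below:
-    ///
-    /// ```ignore
-    /// let state = SubStateId::new(3);
-    /// assert_eq!(state, SubStateId::from(3));
-    /// // `Into<usize>` recovers the raw value.
-    /// let raw:usize = state.into();
-    /// assert_eq!(raw, 3);
-    /// ```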
-    pub fn new(val:usize) -> SubStateId {
-        SubStateId(val)
-    }
-}
-
-
-// === Trait Impls ===
-
-impl From<usize> for SubStateId {
-    fn from(val:usize) -> Self {
-        SubStateId::new(val)
-    }
-}
-
-impl From<&usize> for SubStateId {
-    fn from(val:&usize) -> Self {
-        SubStateId::new(*val)
-    }
-}
-
-impl Into<usize> for SubStateId {
-    fn into(self) -> usize {
-        self.0
-    }
-}
-
-
-
-// ===================
-// === StageStatus ===
-// ===================
-
-/// The result of executing a single step of the DFA.
-#[derive(Clone,Copy,Debug,PartialEq)]
-pub enum StageStatus {
-    /// The initial state of a lexer stage.
-    Initial,
-    /// The stage exits successfully, having consumed a complete token.
-    ExitSuccess,
-    /// The stage exits unsuccessfully.
-    ExitFail,
-    /// A single step of the DFA has executed successfully.
-    ExitFinished,
-    /// The lexer should continue, transitioning to the included state.
-    ContinueWith(SubStateId)
-}
-
-impl StageStatus {
-    /// Check if the lexer stage should continue.
-    pub fn should_continue(&self) -> bool {
-        self.continue_as().is_some()
-    }
-
-    /// Obtain the state to which the lexer should transition, iff the lexer should continue.
-    pub fn continue_as(&self) -> Option<SubStateId> {
-        match self {
-            StageStatus::Initial           => Some(SubStateId::new(0)),
-            StageStatus::ContinueWith(val) => Some(*val),
-            _                              => None
-        }
-    }
-}
-
-
-// === Trait Impls ===
-
-impl Default for StageStatus {
-    fn default() -> Self {
-        StageStatus::Initial
-    }
-}
-
-
-
-// ==============
-// === Result ===
-// ==============
-
-/// The result of executing the lexer on a given input.
-#[derive(Clone,Debug)]
-pub struct LexingResult<T> {
-    /// The kind of the result, representing _how_ the lexer completed.
-    pub kind:ResultKind,
-    /// The tokens that the lexer was able to process.
-    pub tokens:T
-}
-
-impl<T> LexingResult<T> {
-    /// Create a new lexer result using the provided `kind` and `tokens`.
-    pub fn new(kind:ResultKind, tokens:T) -> LexingResult<T> {
-        LexingResult {kind,tokens}
-    }
-
-    /// Create a new success result, with the provided `tokens`.
-    pub fn success(tokens:T) -> LexingResult<T> {
-        LexingResult::new(ResultKind::Success, tokens)
-    }
-
-    /// Create a new partial lex result, with the provided `tokens`.
-    pub fn partial(tokens:T) -> LexingResult<T> {
-        LexingResult::new(ResultKind::Partial, tokens)
-    }
-
-    /// Create a failure result, with the `tokens` it _did_ manage to consume.
-    pub fn failure(tokens:T) -> LexingResult<T> {
-        LexingResult::new(ResultKind::Failure, tokens)
-    }
-}
-
-/// The kind of lexer result.
-#[derive(Copy,Clone,Debug)]
-pub enum ResultKind {
-    /// The lexer succeeded, returning the contained token stream.
-    Success,
-    /// The lexer succeeded on part of the input, returning the contained token stream.
-    Partial,
-    /// The lexer failed on the input, returning any tokens it _did_ manage to consume.
-    Failure
-}
-
-
-
-// =============
-// === State ===
-// =============
-
-/// Contains the state needed by the flexer from a lexer implementation.
-///
-/// The types for which this trait is implemented will normally also contain the user-defined state
-/// for that lexer.
-pub trait State {
-    /// Create a new instance of the lexer's state.
-    ///
-    /// This function is guaranteed to be called at most once per run of the lexer.
-    fn new(parent_logger:&impl AnyLogger) -> Self;
-    /// Return the _initial_ lexing state.
-    fn initial_state(&self) -> group::Identifier;
-    /// Return a reference to the group registry for a given lexer.
- fn groups(&self) -> &group::Registry; - /// Return a mutable reference to the group registry for a given lexer. - fn groups_mut(&mut self) -> &mut group::Registry; - /// Get an immutable reference to the bookmark manager for this state. - fn bookmarks(&self) -> &BookmarkManager; - /// Get a mutable reference to the bookmark manager for this state. - fn bookmarks_mut(&mut self) -> &mut BookmarkManager; - /// Generate code to specialize the flexer for the user's particular lexer definition. - /// - /// This function should be implemented as a call to [`generate::specialize`], passing - /// the name of your lexer, and the name of your lexer's output type as a string. - fn specialize(&self) -> Result; -} - - - -// ================== -// === Definition === -// ================== - -/// Allows for the definition of flexer-based lexers. -pub trait Definition { - /// Define the custom lexer. - fn define() -> Self; - /// Obtain the registry of groups for the lexer. - fn groups(&self) -> &group::Registry; - /// Run before any lexing takes place. - fn set_up(&mut self); - /// Run after lexing has completed. - fn tear_down(&mut self); -} diff --git a/lib/rust/flexer/tests/flexer_invalid_definitions.rs b/lib/rust/flexer/tests/flexer_invalid_definitions.rs deleted file mode 100644 index ff432d3f39..0000000000 --- a/lib/rust/flexer/tests/flexer_invalid_definitions.rs +++ /dev/null @@ -1,446 +0,0 @@ -//! This file contains tests for the user-facing error-handling logic in the flexer code generator. -//! -//! This file includes quite a bit of duplicated code, but this is known and intentional as it -//! allows for increased clarity in the testing. - -#![allow(missing_docs)] - -use crate::prelude::LazyReader; -use crate::prelude::logger::AnyLogger; -use crate::prelude::logger::Disabled; -use crate::prelude::reader::BookmarkManager; -use flexer::*; -use flexer::automata::pattern::Pattern; -use flexer::Flexer; -use flexer::generate; -use flexer::group::{Registry, Identifier}; -use flexer::group; -use flexer::prelude::*; -use flexer::State; -use flexer; - - - -// ==================== -// === Type Aliases === -// ==================== - -type Logger = Disabled; - - - -// ==================== -// === Shared Setup === -// ==================== - -/// A token type for these lexers. -#[derive(Copy,Clone,Debug,PartialEq)] -pub enum Token { - Foo, - Bar -} - -/// An output type for these lexers. -#[allow(missing_docs)] -#[derive(Clone,Debug,Default,PartialEq)] -pub struct Output { - tokens:Vec -} - -/// A testing lexer state. -pub struct LexerState { - lexer_states:group::Registry, - initial_state:group::Identifier, -} -impl flexer::State for LexerState { - fn new(_logger:&impl AnyLogger) -> Self { - let mut lexer_states = group::Registry::default(); - let initial_state = lexer_states.define_group("ROOT",None); - LexerState{lexer_states,initial_state} - } - - fn initial_state(&self) -> Identifier { - self.initial_state - } - - fn groups(&self) -> &Registry { - &self.lexer_states - } - - fn groups_mut(&mut self) -> &mut Registry { - &mut self.lexer_states - } - - fn bookmarks(&self) -> &BookmarkManager { - unimplemented!() - } - - fn bookmarks_mut(&mut self) -> &mut BookmarkManager { - unimplemented!() - } - - fn specialize(&self) -> Result { - // Note [Naming "Lexer"] - generate::specialize(self,"Lexer","Output") - } -} - -/* Note [Naming "Lexer"] - * ~~~~~~~~~~~~~~~~~~~~~ - * In general, the name passed to `specialize` should match that of your lexer definition. 
However - * here, as we never compile the code, we set it to a generic constant that is a valid rust - * identifier so as to reduce testing boilerplate. - */ - - - -// ==================== -// === Definition 1 === -// ==================== - -pub struct Lexer1 { - lexer:Flexer -} - -impl Deref for Lexer1 { - type Target = Flexer; - fn deref(&self) -> &Self::Target { - &self.lexer - } -} - -impl DerefMut for Lexer1 { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.lexer - } -} - -impl Lexer1 { - pub fn new() -> Lexer1 { - let logger = Logger::new("Lexer1"); - let lexer = Flexer::new(logger); - Lexer1 {lexer} - } - - pub fn my_test_fun(&mut self, _reader:&mut R) { - unimplemented!() - } -} - -impl flexer::Definition for Lexer1 { - fn define() -> Self { - let mut lexer = Self::new(); - - let foo = Pattern::all_of("foo"); - - let root_group_id = lexer.initial_state(); - let root_group = lexer.groups_mut().group_mut(root_group_id); - root_group.create_rule(&foo, "ETERNAL SCREAMING"); - - lexer - } - - fn groups(&self) -> &Registry { - self.lexer.groups() - } - - fn set_up(&mut self) { - unimplemented!() - } - - fn tear_down(&mut self) { - unimplemented!() - } -} - -#[test] -fn test_bad_rule_expression() { - let lexer = Lexer1::define(); - let result = lexer.specialize(); - assert!(result.is_err()); - let message = result.unwrap_err().to_string(); - assert_eq!(message,"`ETERNAL SCREAMING` is not a valid rust expression."); -} - - -// ==================== -// === Definition 2 === -// ==================== - -pub struct Lexer2 { - lexer:Flexer -} - -impl Deref for Lexer2 { - type Target = Flexer; - fn deref(&self) -> &Self::Target { - &self.lexer - } -} - -impl DerefMut for Lexer2 { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.lexer - } -} - -impl Lexer2 { - pub fn new() -> Lexer2 { - let logger = Logger::new("Lexer2"); - let lexer = Flexer::new(logger); - Lexer2{lexer} - } - - pub fn my_test_fun(&mut self, _reader:&mut R) { - unimplemented!() - } -} - -impl flexer::Definition for Lexer2 { - fn define() -> Self { - let mut lexer = Self::new(); - - let foo = Pattern::all_of("foo"); - - let root_group_id = lexer.initial_state(); - let root_group = lexer.groups_mut().group_mut(root_group_id); - root_group.create_rule(&foo, "self.test_function_no_reader()"); - - lexer - } - - fn groups(&self) -> &Registry { - self.lexer.groups() - } - - fn set_up(&mut self) { - unimplemented!() - } - - fn tear_down(&mut self) { - unimplemented!() - } -} - -#[test] -pub fn test_no_reader_arg() { - let lexer = Lexer2::define(); - let result = lexer.specialize(); - let expected_message = - "Bad argument to a callback function. 
It must take a single argument `reader`."; - assert!(result.is_err()); - let message = result.unwrap_err().to_string(); - assert_eq!(message,expected_message); -} - - - -// ==================== -// === Definition 3 === -// ==================== - -pub struct Lexer3 { - lexer:Flexer -} - -impl Deref for Lexer3 { - type Target = Flexer; - fn deref(&self) -> &Self::Target { - &self.lexer - } -} - -impl DerefMut for Lexer3 { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.lexer - } -} - -impl Lexer3 { - pub fn new() -> Lexer3 { - let logger = Logger::new("Lexer3"); - let lexer = Flexer::new(logger); - Lexer3{lexer} - } - - pub fn my_test_fun(&mut self, _reader:&mut R) { - unimplemented!() - } -} - -impl flexer::Definition for Lexer3 { - fn define() -> Self { - let mut lexer = Self::new(); - - let foo = Pattern::all_of("foo"); - - let root_group_id = lexer.initial_state(); - let root_group = lexer.groups_mut().group_mut(root_group_id); - root_group.create_rule(&foo, "self.test_function_reader(reader)"); - - lexer - } - - fn groups(&self) -> &Registry { - self.lexer.groups() - } - - fn set_up(&mut self) { - unimplemented!() - } - - fn tear_down(&mut self) { - unimplemented!() - } -} - -pub struct LexerState1 { - lexer_states:group::Registry, - initial_state:group::Identifier, -} -impl flexer::State for LexerState1 { - fn new(_logger:&impl AnyLogger) -> Self { - let mut lexer_states = group::Registry::default(); - let initial_state = lexer_states.define_group("ROOT",None); - LexerState1 {lexer_states,initial_state} - } - - fn initial_state(&self) -> Identifier { - self.initial_state - } - - fn groups(&self) -> &Registry { - &self.lexer_states - } - - fn groups_mut(&mut self) -> &mut Registry { - &mut self.lexer_states - } - - fn bookmarks(&self) -> &BookmarkManager { - unimplemented!() - } - - fn bookmarks_mut(&mut self) -> &mut BookmarkManager { - unimplemented!() - } - - fn specialize(&self) -> Result { - generate::specialize(self,"Bad Lexer Name","Output") - } -} - -#[test] -pub fn test_bad_state_name() { - let lexer = Lexer3::define(); - let result = lexer.specialize(); - assert!(result.is_err()); - let message = result.unwrap_err().to_string(); - assert_eq!(message,"`Bad Lexer Name` is not a valid rust identifier."); -} - - - -// ==================== -// === Definition 4 === -// ==================== - -pub struct Lexer4 { - lexer:Flexer -} - -impl Deref for Lexer4 { - type Target = Flexer; - fn deref(&self) -> &Self::Target { - &self.lexer - } -} - -impl DerefMut for Lexer4 { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.lexer - } -} - -impl Lexer4 { - pub fn new() -> Lexer4 { - let logger = Logger::new("Lexer4"); - let lexer = Flexer::new(logger); - Lexer4{lexer} - } - - pub fn my_test_fun(&mut self, _reader:&mut R) { - unimplemented!() - } -} - -impl flexer::Definition for Lexer4 { - fn define() -> Self { - let mut lexer = Self::new(); - - let foo = Pattern::all_of("foo"); - - let root_group_id = lexer.initial_state(); - let root_group = lexer.groups_mut().group_mut(root_group_id); - root_group.create_rule(&foo, "self.test_function_reader(reader)"); - - lexer - } - - fn groups(&self) -> &Registry { - self.lexer.groups() - } - - fn set_up(&mut self) { - unimplemented!() - } - - fn tear_down(&mut self) { - unimplemented!() - } -} - -pub struct LexerState2 { - lexer_states:group::Registry, - initial_state:group::Identifier, -} -impl flexer::State for LexerState2 { - fn new(_logger:&impl AnyLogger) -> Self { - let mut lexer_states = group::Registry::default(); - let 
initial_state = lexer_states.define_group("ROOT",None); - LexerState2 {lexer_states,initial_state} - } - - fn initial_state(&self) -> Identifier { - self.initial_state - } - - fn groups(&self) -> &Registry { - &self.lexer_states - } - - fn groups_mut(&mut self) -> &mut Registry { - &mut self.lexer_states - } - - fn bookmarks(&self) -> &BookmarkManager { - unimplemented!() - } - - fn bookmarks_mut(&mut self) -> &mut BookmarkManager { - unimplemented!() - } - - fn specialize(&self) -> Result { - generate::specialize(self,"Lexer4","Bad output name") - } -} - -#[test] -pub fn test_bad_output_name() { - let lexer = Lexer4::define(); - let result = lexer.specialize(); - assert!(result.is_err()); - let message = result.unwrap_err().to_string(); - assert_eq!(message,"`Bad output name` is not a valid rust path."); -} diff --git a/lib/rust/lexer/definition/Cargo.toml b/lib/rust/lexer/definition/Cargo.toml index 3a37663861..6f4a2beca6 100644 --- a/lib/rust/lexer/definition/Cargo.toml +++ b/lib/rust/lexer/definition/Cargo.toml @@ -12,7 +12,7 @@ test = true bench = true [dependencies] -flexer = { path = "../../flexer", version = "0.1.0" } -enso-prelude = { version = "0.1.3" } +enso-flexer = { version = "0.1.3" } +enso-prelude = { version = "0.1.7" } uuid = { version = "0.8.1" , features = ["serde","v4","wasm-bindgen"] } diff --git a/lib/rust/lexer/definition/src/escape.rs b/lib/rust/lexer/definition/src/escape.rs new file mode 100644 index 0000000000..1ca86537ec --- /dev/null +++ b/lib/rust/lexer/definition/src/escape.rs @@ -0,0 +1,344 @@ +//! This crate describes valid escape sequences inside Enso text literals. + +use crate::prelude::*; + +use crate::lexeme; +use crate::library::token; +use crate::token::Shape; +use crate::token::EscapeStyle; + + + +// ======================= +// === EscapeCharacter === +// ======================= + +/// A representation of an escape character. +#[derive(Clone,Debug,Default,Eq,PartialEq)] +pub struct EscapeCharacter { + /// The lexing representation of the escape. + /// + /// This is the literal string that must occur in the Enso source code to be interpreted as this + /// escape code. + pub pattern : String, + /// The literal representation of the escape. + /// + /// This is the character-level encoding of this escape character in Rust, as the Rust escape + /// representation and the Enso escape representation may differ, or Rust may not support the + /// same literal escape code as Enso. + pub repr : String, + +} +impl EscapeCharacter { + fn new(pattern:impl Str, repr:impl Str) -> EscapeCharacter { + let pattern = pattern.into(); + let repr = repr.into(); + Self{pattern,repr} + } + + /// The set of character escape codes that Enso supports. + pub fn codes() -> Vec { + vec![ + // === Null === + Self::new(r"\0","\0"), + + // === Newlines === + Self::new(r"\n","\n"), + Self::new(r"\r","\r"), + Self::new(r"\f","\x0C"), + + // === Tabs === + Self::new(r"\t","\t"), + Self::new(r"\v","\x0B"), + + // === Backspace === + Self::new(r"\b","\x08"), + + // === Misc === + Self::new(r"\a","\x07"), + ] + } +} + + + +// ================= +// === Utilities === +// ================= + +/// Check if `c` is a hexadecimal digit. +fn is_hex_digit(c:char) -> bool { + let small_letters = 'a'..='f'; + let large_letters = 'A'..='F'; + let digits = '0'..='9'; + small_letters.contains(&c) || large_letters.contains(&c) || digits.contains(&c) +} + + + +// ====================== +// === EscapeSequence === +// ====================== + +/// A trait representing various kinds of escape sequence. 
+///
+/// An escape sequence built using this trait will have its digits calculated by stripping the
+/// [`Self::prefix_length()`] and [`Self::suffix_length()`] from the input string, and then
+/// validated using [`Self::digits_min_length()`], [`Self::digits_max_length()`], and
+/// [`Self::validator()`]. All digits must be valid hexadecimal digits as defined by
+/// [`is_hex_digit`] above.
+///
+/// In addition, the implementation must define [`Self::style_on_success()`] and
+/// [`Self::style_on_failure()`] to determine the type of escape output on success and failure.
+pub trait EscapeSequence {
+    /// Create a token of the relevant escape sequence type.
+    ///
+    /// This function should be passed the _full_ match for the escape sequence as `repr`,
+    /// including the delimiters. For example, if we have the escape sequence `\uAFAF`, we want to
+    /// pass the whole string `"\uAFAF"`, not just `"AFAF"`, to this function.
+    fn build(repr:impl Str) -> Shape {
+        if let Some(digits) = Self::validate(repr.as_ref()) {
+            Shape::text_segment_escape(Self::style_on_success(),digits)
+        } else {
+            Shape::text_segment_escape(Self::style_on_failure(),repr)
+        }
+    }
+
+    /// Obtain the digits portion of the escape sequence.
+    fn get_digits(repr:&str) -> &str {
+        let start = Self::prefix_length();
+        let end   = repr.len().saturating_sub(Self::suffix_length());
+        &repr[start..end]
+    }
+
+    /// Validate the provided unicode string for this type of escape sequence.
+    fn validate(repr:&str) -> Option<String> {
+        let digits       = Self::get_digits(repr);
+        let ge_min       = digits.len() >= Self::digits_min_length();
+        let le_max       = digits.len() <= Self::digits_max_length();
+        let valid_length = ge_min && le_max;
+        let valid_escape = Self::validator(digits);
+        let valid_digits = digits.chars().all(is_hex_digit);
+        let is_valid     = valid_length && valid_escape && valid_digits;
+        is_valid.as_some(digits.into())
+    }
+
+    /// Return the length of the escape prefix.
+    ///
+    /// The prefix is the characters that need to be stripped from the front of the escape
+    /// sequence to get, in conjunction with [`EscapeSequence::suffix_length()`], the escape value
+    /// itself.
+    fn prefix_length() -> usize;
+
+    /// Return the length of the escape suffix.
+    ///
+    /// The suffix is the characters that need to be stripped from the end of the escape sequence
+    /// to get, in conjunction with [`EscapeSequence::prefix_length()`], the escape value itself.
+    ///
+    /// This defaults to `0`.
+    fn suffix_length() -> usize { 0 }
+
+    /// Return the minimum number of digits accepted by the escape sequence type.
+    fn digits_min_length() -> usize;
+
+    /// Return the maximum number of digits accepted by the escape sequence type.
+    ///
+    /// This defaults to `digits_min_length()`.
+    fn digits_max_length() -> usize { Self::digits_min_length() }
+
+    /// A validator for any additional properties of the escape sequence.
+    ///
+    /// It will be passed the _digits_ of the escape sequence, as defined by
+    /// [`EscapeSequence::get_digits()`], and has a default implementation that always succeeds.
+    /// Please implement this validator yourself if you would like to assert _additional_
+    /// properties on your escape sequence.
+    fn validator(_digits:&str) -> bool { true }
+
+    /// The style of escape after successful validation.
+    fn style_on_success() -> token::EscapeStyle;
+
+    /// The style of escape after unsuccessful validation.
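+    ///
+    /// As an illustrative sketch of how the two styles interact (mirroring the tests at the
+    /// bottom of this file), a valid byte escape keeps only its digits, while an invalid one
+    /// keeps the full representation:
+    ///
+    /// ```ignore
+    /// assert_eq!(Byte::build(r"\x05"), Shape::text_segment_escape(EscapeStyle::Byte,"05"));
+    /// assert_eq!(Byte::build(r"\xz2"), Shape::text_segment_escape(EscapeStyle::Invalid,r"\xz2"));
+    /// ```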
+    fn style_on_failure() -> token::EscapeStyle;
+}
+
+
+
+// ==================
+// === ByteEscape ===
+// ==================
+
+/// A validator for ASCII escapes.
+///
+/// An ASCII escape begins with the sequence `\x` and is followed by two hexadecimal digits (e.g.
+/// `\x0F`).
+#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
+pub struct Byte;
+impl EscapeSequence for Byte {
+    fn prefix_length()     -> usize       { lexeme::len(lexeme::literal::BYTE_ESCAPE_START) }
+    fn digits_min_length() -> usize       { 2 }
+    fn style_on_success()  -> EscapeStyle { token::EscapeStyle::Byte }
+    fn style_on_failure()  -> EscapeStyle { token::EscapeStyle::Invalid }
+}
+
+
+
+// ===========
+// === U16 ===
+// ===========
+
+/// A validator for U16 unicode escapes.
+///
+/// A U16 unicode escape begins with the sequence `\u` and is followed by four hexadecimal digits,
+/// e.g. `\u0F0F`.
+#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
+pub struct U16;
+impl EscapeSequence for U16 {
+    fn prefix_length()     -> usize       { lexeme::len(lexeme::literal::U16_ESCAPE_START) }
+    fn digits_min_length() -> usize       { 4 }
+    fn style_on_success()  -> EscapeStyle { token::EscapeStyle::U16 }
+    fn style_on_failure()  -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
+}
+
+
+
+// ===========
+// === U21 ===
+// ===========
+
+/// A validator for U21 unicode escapes.
+///
+/// A U21 unicode escape begins with the sequence `\u`, followed by a sequence of 1-6 hexadecimal
+/// digits enclosed in braces (`{}`). Both `\u{F}` and `\u{AFAFAF}` are valid U21 escapes.
+#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
+pub struct U21;
+impl EscapeSequence for U21 {
+    fn prefix_length()     -> usize       { lexeme::len(lexeme::literal::U21_ESCAPE_START) }
+    fn suffix_length()     -> usize       { lexeme::len(lexeme::literal::U21_ESCAPE_END) }
+    fn digits_min_length() -> usize       { 1 }
+    fn digits_max_length() -> usize       { 6 }
+    fn style_on_success()  -> EscapeStyle { token::EscapeStyle::U21 }
+    fn style_on_failure()  -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
+}
+
+
+
+// ===========
+// === U32 ===
+// ===========
+
+/// A validator for U32 unicode escapes.
+///
+/// A U32 unicode escape begins with the sequence `\U`, followed by 8 hexadecimal digits. Due to
+/// the restrictions of unicode, the first two digits _must_ be zero (e.g. `\U00AFAFAF`).
+#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
+pub struct U32;
+impl EscapeSequence for U32 {
+    fn prefix_length()        -> usize       { lexeme::len(lexeme::literal::U32_ESCAPE_START) }
+    fn digits_min_length()    -> usize       { 8 }
+    fn validator(digits:&str) -> bool        { digits.starts_with("00") }
+    fn style_on_success()     -> EscapeStyle { token::EscapeStyle::U32 }
+    fn style_on_failure()     -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
+}
+
+
+
+// =============
+// === Tests ===
+// =============
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+
+    // === Utilities ===
+
+    /// Tests a valid input to ensure that it succeeds.
+    fn test_valid<Esc:EscapeSequence>(escape:&str, out:&str, out_style:token::EscapeStyle) {
+        let shape = Shape::text_segment_escape(out_style,out);
+        assert_eq!(Esc::build(escape),shape);
+    }
+
+    /// Tests invalid inputs to ensure they fail for the provided escape type `Esc`.
+    fn test_invalid<Esc:EscapeSequence>(invalid_cases:Vec<&str>, fail_with:token::EscapeStyle) {
+        for escape in invalid_cases {
+            let shape = Shape::text_segment_escape(fail_with,escape);
+            assert_eq!(Esc::build(escape),shape)
+        }
+    }
+
+
+    // === Is Hex Digit ===
+
+    #[test]
+    fn test_is_hex_digit() {
+        for val in u8::min_value()..=u8::max_value() {
+            let char            = char::from(val);
+            let is_in_small     = ('a'..='f').contains(&char);
+            let is_in_large     = ('A'..='F').contains(&char);
+            let is_in_dec_digit = ('0'..='9').contains(&char);
+            let expected_result = is_in_small || is_in_large || is_in_dec_digit;
+            assert_eq!(is_hex_digit(char),expected_result);
+        }
+    }
+
+
+    // === Build ===
+
+    #[test]
+    fn test_byte_build_valid() {
+        test_valid::<Byte>(r"\x05","05",token::EscapeStyle::Byte);
+    }
+
+    #[test]
+    fn test_byte_build_invalid() {
+        test_invalid::<Byte>(vec![
+            r"\x5",
+            r"\x",
+            r"\x033",
+            r"\xz2",
+        ],token::EscapeStyle::Invalid);
+    }
+
+    #[test]
+    fn test_u16_build_valid() {
+        test_valid::<U16>(r"\u4fe3","4fe3",token::EscapeStyle::U16);
+    }
+
+    #[test]
+    fn test_u16_build_invalid() {
+        test_invalid::<U16>(vec![
+            r"\u123",
+            r"\u",
+            r"\u123aff",
+            r"\uazaz",
+        ],token::EscapeStyle::InvalidUnicode);
+    }
+
+    #[test]
+    fn test_u21_build_valid() {
+        test_valid::<U21>(r"\u{fa4e}","fa4e",token::EscapeStyle::U21);
+    }
+
+    #[test]
+    fn test_u21_build_invalid() {
+        test_invalid::<U21>(vec![
+            r"\u{1234567}",
+            r"\u{}",
+        ],token::EscapeStyle::InvalidUnicode);
+    }
+
+    #[test]
+    fn test_u32_build_valid() {
+        test_valid::<U32>(r"\U0014A890","0014A890",token::EscapeStyle::U32);
+    }
+
+    #[test]
+    fn test_u32_build_invalid() {
+        test_invalid::<U32>(vec![
+            r"\U12121212",
+            r"\U",
+            r"\U001234",
+            r"\U001234567"
+        ],token::EscapeStyle::InvalidUnicode);
+    }
+}
diff --git a/lib/rust/lexer/definition/src/lexeme.rs b/lib/rust/lexer/definition/src/lexeme.rs
new file mode 100644
index 0000000000..853ce8865b
--- /dev/null
+++ b/lib/rust/lexer/definition/src/lexeme.rs
@@ -0,0 +1,301 @@
+//! This module defines the base lexemes for the Enso language.
+
+use crate::prelude::*;
+
+use enso_flexer::automata::pattern::Pattern;
+
+
+
+// =================================
+// === Basic Pattern Definitions ===
+// =================================
+
+/// Basic lexemes as patterns.
+///
+/// These must _only_ be used as part of the lexer definition, not used at runtime as they are not
+/// performant at all.
+pub mod definition_pattern {
+    use super::*;
+
+    /// Match lower-case ASCII letters.
+    pub fn lower_ascii_letter() -> Pattern {
+        Pattern::range('a'..='z')
+    }
+
+    /// Match upper-case ASCII letters.
+    pub fn upper_ascii_letter() -> Pattern {
+        Pattern::range('A'..='Z')
+    }
+
+    /// Match ASCII digits.
+    pub fn ascii_digit() -> Pattern {
+        Pattern::range('0'..='9')
+    }
+
+    /// Match ASCII letters.
+    pub fn ascii_letter() -> Pattern {
+        lower_ascii_letter() | upper_ascii_letter()
+    }
+
+    /// Match ASCII alphanumeric characters.
+    pub fn ascii_alpha_num() -> Pattern {
+        ascii_digit() | ascii_letter()
+    }
+
+    /// Match at least one ASCII space character.
+    pub fn spaces() -> Pattern {
+        into_pattern(literal::SPACE).many1()
+    }
+
+    /// Match the end-of-file character.
+    pub fn eof() -> Pattern {
+        Pattern::eof()
+    }
+
+    /// Match a newline.
+    ///
+    /// This matches both Unix (LF) and Windows (CRLF) styles of newlines. This is particularly
+    /// important so as not to result in incorrect spans on Windows clients.
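+    ///
+    /// As an illustrative sketch, the result composes like any other [`Pattern`]; one or more
+    /// consecutive newlines, for example, could be written as:
+    ///
+    /// ```ignore
+    /// let newlines = newline().many1();
+    /// ```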
+    /// Match a newline.
+    ///
+    /// This matches both Unix (LF) and Windows (CRLF) styles of newlines. This is particularly
+    /// important so as not to result in incorrect spans on Windows clients.
+    pub fn newline() -> Pattern {
+        let lf = into_pattern(literal::LF);
+        let crlf = into_pattern(literal::CRLF);
+        lf | crlf
+    }
+
+    /// The characters that break tokens in Enso.
+    pub fn whitespace_break_chars() -> String {
+        [literal::TAB,literal::LF,literal::CR].concat()
+    }
+
+    /// The characters that break token lexing in Enso.
+    pub fn break_chars() -> String {
+        [
+            literal::INTERPOLATE_QUOTE,
+            literal::COMMENT,
+            literal::ANNOTATION_SYMBOL,
+            literal::SPACE,
+            literal::COMMA,
+            literal::DOT,
+            literal::OPERATOR_CHARS,
+            literal::GROUP_CHARS,
+            &whitespace_break_chars()
+        ].concat()
+    }
+
+    /// Adds the basic characters not allowed in a raw segment in a format text literal.
+    fn add_base_format_disallows(chars:&mut String) {
+        chars.push_str(literal::INTERPOLATE_QUOTE);
+        chars.push_str(literal::SLASH);
+        chars.push_str(literal::LF);
+        chars.push_str(literal::CR);
+    }
+
+    /// Characters allowable inside a raw segment in a format line.
+    pub fn format_line_raw_char() -> Pattern {
+        let mut chars = String::new();
+        chars.push_str(literal::FORMAT_QUOTE);
+        add_base_format_disallows(&mut chars);
+        Pattern::none_of(&chars)
+    }
+
+    /// Characters allowable inside a raw segment in a format block.
+    pub fn format_block_raw_char() -> Pattern {
+        let mut chars = String::new();
+        add_base_format_disallows(&mut chars);
+        Pattern::none_of(&chars)
+    }
+
+    /// Adds the basic characters not allowed in a raw segment in a raw text literal.
+    fn add_base_raw_disallows(chars:&mut String) {
+        chars.push_str(literal::SLASH);
+        chars.push_str(literal::LF);
+        chars.push_str(literal::CR);
+    }
+
+    /// Characters allowable inside a raw segment in a raw line.
+    pub fn raw_line_raw_char() -> Pattern {
+        let mut chars = String::new();
+        chars.push_str(literal::RAW_QUOTE);
+        add_base_raw_disallows(&mut chars);
+        Pattern::none_of(&chars)
+    }
+
+    /// Characters allowable inside a raw segment in a raw block.
+    pub fn raw_block_raw_char() -> Pattern {
+        let mut chars = String::new();
+        add_base_raw_disallows(&mut chars);
+        Pattern::none_of(&chars)
+    }
+
+    /// The characters allowed as digits in a unicode escape.
+    pub fn unicode_escape_digit() -> Pattern {
+        let chars = &[
+            literal::FORMAT_QUOTE,
+            literal::RAW_QUOTE,
+            literal::INTERPOLATE_QUOTE,
+            literal::SLASH,
+            literal::LF,
+            literal::CR,
+            "{}"
+        ].concat();
+        Pattern::none_of(chars)
+    }
+}
+
+
+
+// ===============================
+// === Enso Lexeme Definitions ===
+// ===============================
+
+/// The literal lexemes that make up the Enso language.
+pub mod literal {
+
+    /// The type of a literal lexeme.
+    pub type Literal = &'static str;
+
+    // === The Lexemes ===
+
+    /// The space character.
+    pub const SPACE:Literal = " ";
+
+    /// The line-feed character.
+    pub const LF:Literal = "\n";
+
+    /// The carriage-return character.
+    pub const CR:Literal = "\r";
+
+    /// The CRLF Windows-style line ending.
+    pub const CRLF:Literal = "\r\n";
+
+    /// The tab character.
+    pub const TAB:Literal = "\t";
+
+    /// The comment character.
+    pub const COMMENT:Literal = "#";
+
+    /// The doc comment character.
+    pub const DOC_COMMENT:Literal = "##";
+
+    /// The symbol for beginning an annotation.
+    pub const ANNOTATION_SYMBOL:Literal = "@";
+
+    /// The dot symbol.
+    pub const DOT:Literal = ".";
+
+    /// Two dots.
+    pub const TWO_DOTS:Literal = "..";
+
+    /// Three dots.
+    pub const THREE_DOTS:Literal = "...";
+
+    /// The comma symbol.
+    pub const COMMA:Literal = ",";
+
+    /// The `in` operator.
+    pub const OPERATOR_IN:Literal = "in";
+
+    /// The tick allowable at the end of an identifier.
+    pub const IDENTIFIER_TICK:Literal = "'";
+
+    /// The quote used to delimit interpolations in format text literals.
+    pub const INTERPOLATE_QUOTE:Literal = "`";
+
+    /// The quote used to delimit format text literals.
+    pub const FORMAT_QUOTE:Literal = "'";
+
+    /// The quote used to delimit format block literals.
+    pub const FORMAT_BLOCK_QUOTE:Literal = "'''";
+
+    /// The quote used to delimit raw text literals.
+    pub const RAW_QUOTE:Literal = "\"";
+
+    /// The quote used to delimit raw block literals.
+    pub const RAW_BLOCK_QUOTE:Literal = "\"\"\"";
+
+    /// The equals operator.
+    pub const EQUALS:Literal = "=";
+
+    /// The equality comparison operator.
+    pub const EQUALS_COMP:Literal = "==";
+
+    /// Greater-than or equal.
+    pub const GE_OPERATOR:Literal = ">=";
+
+    /// Less-than or equal.
+    pub const LE_OPERATOR:Literal = "<=";
+
+    /// Inequality comparison operator.
+    pub const NOT_EQUAL:Literal = "!=";
+
+    /// The hash eq operator.
+    pub const HASH_EQ:Literal = "#=";
+
+    /// The wide arrow operator.
+    pub const WIDE_ARROW:Literal = "=>";
+
+    /// The blank identifier.
+    pub const BLANK_IDENT:Literal = "_";
+
+    /// The identifier segment separator.
+    pub const IDENT_SEGMENT_SEPARATOR:Literal = "_";
+
+    /// The separator between a number literal's explicit base and the number itself.
+    pub const NUMBER_BASE_SEPARATOR:Literal = "_";
+
+    /// The separator between the integer and fractional parts of the number literal.
+    pub const DECIMAL_SEPARATOR:Literal = ".";
+
+    /// The backslash character.
+    pub const SLASH:Literal = r"\";
+
+    /// An escaped [`SLASH`].
+    pub const ESCAPED_SLASH:Literal = r"\\";
+
+    /// The beginning of a byte escape.
+    pub const BYTE_ESCAPE_START:Literal = r"\x";
+
+    /// The beginning of a u16 escape.
+    pub const U16_ESCAPE_START:Literal = r"\u";
+
+    /// The beginning of a u21 escape.
+    pub const U21_ESCAPE_START:Literal = r"\u{";
+
+    /// The end of a u21 escape.
+    pub const U21_ESCAPE_END:Literal = "}";
+
+    /// The beginning of a u32 escape.
+    pub const U32_ESCAPE_START:Literal = r"\U";
+
+    /// The allowable group characters in Enso.
+    pub const GROUP_CHARS:Literal = "()[]{}";
+
+    /// The allowable operator characters in Enso.
+    pub const OPERATOR_CHARS:Literal = ";!$%&*+-/<>?^~|:\\";
+}
+
+
+
+// =========================
+// === Utility Functions ===
+// =========================
+
+/// Get the first character of the lexeme, if it exists.
+pub fn char(literal:&'static str) -> Option<char> {
+    literal.chars().nth(0)
+}
+
+/// Get the first character of the lexeme, assuming that it exists.
+pub fn unsafe_char(literal:&'static str) -> char {
+    char(literal).expect("The first character of the literal exists.")
+}
+
+/// Convert the lexeme into a pattern.
+pub fn into_pattern(literal:&'static str) -> Pattern {
+    literal.into()
+}
+
+/// The proper length of the `literal`.
+pub fn len(literal:&'static str) -> usize {
+    literal.chars().count()
+}
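`len` deliberately counts `char`s rather than bytes, since token offsets and spans are measured in characters. A standalone illustration (not part of the patch):

    fn main() {
        let arrow = "→";                      // one character, three UTF-8 bytes
        assert_eq!(arrow.len(), 3);           // str::len counts bytes
        assert_eq!(arrow.chars().count(), 1); // the length a span should use
    }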
diff --git a/lib/rust/lexer/definition/src/lexer.rs b/lib/rust/lexer/definition/src/lexer.rs
index f92c1c091a..7a1354b00a 100644
--- a/lib/rust/lexer/definition/src/lexer.rs
+++ b/lib/rust/lexer/definition/src/lexer.rs
@@ -1,19 +1,29 @@
 //! This module contains the definition of the lexer for the Enso programming language.
+//!
+//! Due to the architecture of the flexer, the lexer functions can't be separated into modules by
+//! their responsibility, and therefore can't be namespaced. Despite this, the convention for this
+//! lexer is that their `trace!` messages _should be_ namespaced. This is simply because it makes
+//! it easier to spot functions dealing with a certain class of lexemes in the logging output.
 
 use crate::prelude::*;
 
-use flexer::*;
+use enso_flexer::*;
+use crate::library::lexeme;
 use crate::library::token::BlockType;
 use crate::library::token::Token;
 use crate::library::token;
+use crate::library::escape;
+use crate::library::escape::EscapeSequence;
+use crate::library::rules;
 
-use flexer::automata::pattern::Pattern;
-use flexer::group::Group;
-use flexer::group::Registry;
-use flexer::prelude::logger::Disabled;
-use flexer::prelude::reader;
-use flexer::State as FlexerState;
-use flexer;
+use enso_flexer::automata::pattern::Pattern;
+use enso_flexer::automata::symbol::Symbol;
+use enso_flexer::group::Group;
+use enso_flexer::group::Registry;
+use enso_flexer::prelude::logger::Disabled;
+use enso_flexer::prelude::reader;
+use enso_flexer::State as FlexerState;
+use enso_flexer;
 
 use std::collections::VecDeque;
 use std::cmp::Ordering;
@@ -24,7 +34,7 @@
 // ====================
 
 type Logger = Disabled;
-type Flexer = flexer::Flexer<State<Logger>,token::Stream,Logger>;
+type Flexer = enso_flexer::Flexer<State<Logger>,token::Stream,Logger>;
@@ -66,21 +76,25 @@ impl EnsoLexer {
 
     /// Push the current token stream onto the stack.
     pub fn push_tokens(&mut self) {
         let current_stream = mem::take(&mut self.output);
-        debug!(self.logger,"Push Tokens: {&current_stream:?}");
+        debug!(self.logger,"Push Tokens: {&current_stream:?}.");
         self.tokens_stack.push(current_stream);
     }
 
     /// Pop the top token stream from the stack and make it current.
     pub fn pop_tokens(&mut self) {
         let popped = self.tokens_stack.pop().unwrap_or_default();
-        debug!(self.logger,"Pop Tokens: {&popped:?}");
+        debug!(self.logger,"Pop Tokens: {&popped:?}.");
         self.output = popped;
     }
 
     /// Append the provided `token` to the lexer output.
     pub fn append_token(&mut self, token:Token) {
-        debug!(self.logger,"Append: {&token:?}");
+        debug!(self.logger,"Append: {&token:?}.");
         self.output.append(token);
+        if self.block_state.has_delayed_lines() {
+            let tokens = self.consume_tokens();
+            self.block_state.append_line_to_current(tokens.into());
+        }
     }
 
     /// Get a reference to the last token in the current lexer output.
@@ -90,89 +104,220 @@ impl EnsoLexer {
 
     /// Consume the currently active stream of tokens.
     pub fn consume_tokens(&mut self) -> token::Stream {
+        debug!(self.logger,"Consume Tokens: {&self.output:?}.");
         mem::take(&mut self.output)
     }
 
     /// Consume the current match and replace it with the empty string.
     pub fn consume_current(&mut self) -> String {
-        debug!(self.logger,"Consume: {self.current_match:?}");
+        debug!(self.logger,"Consume: {self.current_match:?}.");
         mem::take(&mut self.current_match)
     }
 
     /// Discard the current match and replace it with the empty string.
     pub fn discard_current(&mut self) {
-        debug!(self.logger,"Discard: {self.current_match:?}");
+        debug!(self.logger,"Discard: {self.current_match:?}.");
         self.current_match = default();
     }
 }
 
 
+// === Comments ===
 
-// === Common Patterns ===
-
-/// Basic character classification
+/// The set of rules for lexing Enso comments.
 #[allow(dead_code)]
 impl EnsoLexer {
-
-    /// Match lower-case ASCII letters.
-    fn lower_ascii_letter() -> Pattern {
-        Pattern::range('a'..='z')
+
+    // === Disable Comments ===
+
+    /// Triggered when a disable comment is encountered.
+ fn on_begin_disable_comment(&mut self, _reader:&mut R){ + trace!(self.logger,"Comment::on_begin_disable_comment"); + let offset = self.offset.consume(); + let disable_comment = self.disable_comment; + self.comment_state.set_offset(offset); + self.push_state(disable_comment); } - /// Match upper-case ASCII letters. - fn upper_ascii_letter() -> Pattern { - Pattern::range('A'..='Z') + /// Accumulate the disable comment contents. + fn on_build_disable_comment(&mut self, _reader:&mut R){ + trace!(self.logger,"Comment::on_build_disable_comment"); + let current = self.consume_current(); + self.comment_state.append_to_line(current); } - /// Match ASCII digits. - fn ascii_digit() -> Pattern { - Pattern::range('0'..='9') + /// Triggered when a disable comment is ended. + fn on_end_disable_comment(&mut self, reader:&mut R, line_end:token::LineEnding) { + trace!(self.logger,"Comment::on_end_disable_comment"); + self.discard_current(); + let disable_comment = self.disable_comment; + let comment = self.comment_state.consume_current(); + let token = comment.into(); + self.append_token(token); + self.pop_states_including(disable_comment); + match line_end { + token::LineEnding::None => self.on_eof(reader), + _ => { + let block_seen_newline = self.block_newline; + self.block_state.push_line_ending(line_end); + self.block_submit_line(reader); + self.push_state(block_seen_newline); + }, + } } - /// Match ASCII letters. - fn ascii_letter() -> Pattern { - EnsoLexer::lower_ascii_letter() | EnsoLexer::upper_ascii_letter() + + // === Doc Comments === + + /// Triggered when starting a doc comment. + fn on_begin_doc_comment(&mut self, _reader:&mut R) { + trace!(self.logger,"Comment::on_begin_doc_comment"); + let offset = self.offset.consume(); + let indent = self.consume_current().chars().count(); + let doc_comment = self.doc_comment; + self.comment_state.set_indent(indent); + self.comment_state.set_offset(offset); + self.push_state(doc_comment); } - /// Match ASCII alphanumeric characters. - fn ascii_alpha_num() -> Pattern { - EnsoLexer::ascii_digit() | EnsoLexer::ascii_letter() + /// Accumulate the contents of the current doc comment line. + fn on_build_doc_comment_line(&mut self, _reader:&mut R) { + trace!(self.logger,"Comment::on_build_doc_comment_line"); + let current = self.consume_current(); + self.comment_state.append_to_line(current); } - /// Match at least one ASCII space character. - fn spaces() -> Pattern { - Pattern::char(' ').many1() + /// Triggered when a line is ended in a doc comment. + fn on_doc_comment_end_of_line + ( &mut self + , reader : &mut R + , line_ending : token::LineEnding + ) { + trace!(self.logger,"Comment::on_doc_comment_end_of_line"); + self.comment_state.submit_line(line_ending); + match line_ending { + token::LineEnding::None => { + let matched_bookmark = self.bookmarks.matched_bookmark; + self.bookmarks.rewind(matched_bookmark,reader); + self.on_end_doc_comment(reader); + self.on_eof(reader); + }, + _ => { + let comment_newline = self.doc_comment_newline; + self.push_state(comment_newline); + } + } } - /// Match a newline. - /// - /// This matches both Unix (LF) and Windows (CRLF) styles of newlines. This is particularly - /// important so as not to result in incorrect spans on windows clients. - fn newline() -> Pattern { - Pattern::char('\n') | Pattern::all_of("\r\n") + /// Triggered when ending a doc comment. 
+ fn on_end_doc_comment(&mut self, _reader:&mut R) { + trace!(self.logger,"Comment::on_end_doc_comment"); + let doc_comment = self.doc_comment; + let mut comment = self.comment_state.consume_current(); + let blank_lines = comment.consume_blank_lines(); + if !blank_lines.is_empty() { + self.block_state.seen_newline = true; + } + self.block_state.delayed_append_lines.extend(blank_lines); + let comment_token = comment.into(); + self.append_token(comment_token); + self.pop_states_including(doc_comment); } - /// The allowable group characters in Enso. - fn group_chars() -> String { - String::from("()[]{}") + /// Triggered when a new line is discovered in a doc comment. + fn doc_comment_on_new_line(&mut self, reader:&mut R) { + trace!(self.logger,"Comment::doc_comment_on_new_line"); + self.pop_state(); + let indent = self.consume_current().chars().count(); + let comment_indent = self.comment_state.current_comment.indent; + if indent < comment_indent { + self.on_end_doc_comment(reader); + self.block_on_newline(reader,token::LineEnding::None); + } else if (indent - comment_indent) > 0 { + let remaining_indent = lexeme::literal::SPACE.repeat(indent - comment_indent); + self.comment_state.append_to_line(remaining_indent); + } } - /// The allowable operator characters in Enso. - fn operator_chars() -> String { - String::from(";!$%&*+-/<>?^~|:\\") + /// Triggered when an empty line is discovered in a doc comment. + fn doc_comment_on_empty_line + ( &mut self + , reader : &mut R + , line_ending : token::LineEnding + ) { + trace!(self.logger,"Comment::doc_comment_on_empty_line"); + let current = self.consume_current(); + let indent = current.chars().count() - line_ending.size(); + self.comment_state.submit_blank_line(indent,line_ending); + if let token::LineEnding::None = line_ending { + let block_newline = self.block_newline; + self.on_end_doc_comment(reader); + self.push_state(block_newline); + self.on_eof(reader); + } } - /// The characters that break tokens in Enso. - fn whitespace_break_chars() -> String { - String::from("\t\r\n") - } - /// The characters that break token lexing in Enso. - fn break_chars() -> String { - let mut break_chars = String::from("`@#,. "); - break_chars.push_str(&Self::operator_chars()); - break_chars.push_str(&Self::whitespace_break_chars()); - break_chars.push_str(&Self::group_chars()); - break_chars + // === Comment Rules === + + /// The rules for lexing Enso comments. 
+ fn add_comment_rules(lexer:&mut EnsoLexer) { + let comment_char = lexeme::into_pattern(lexeme::literal::COMMENT); + let lf = lexeme::into_pattern(lexeme::literal::LF); + let crlf = lexeme::into_pattern(lexeme::literal::CRLF); + let eof = lexeme::definition_pattern::eof(); + let spaces = lexeme::definition_pattern::spaces(); + let doc_comment_start = lexeme::into_pattern(lexeme::literal::DOC_COMMENT) >> &spaces.opt(); + let eof_newline = &spaces.opt() >> &eof; + let empty_line_lf = &spaces.opt() >> &lf; + let empty_line_crlf = &spaces.opt() >> &crlf; + let any = Pattern::any(); + + + // === Initial State Rules === + + let initial_state_id = lexer.initial_state; + let initial_state = lexer.group_mut(initial_state_id); + rules!(initial_state with + doc_comment_start => self.on_begin_doc_comment(), + comment_char => self.on_begin_disable_comment(), + ); + + + // === Disable Comment Rules === + + let disable_comment_id = lexer.disable_comment; + let disable_comment = lexer.group_mut(disable_comment_id); + rules!(disable_comment with + lf => self.on_end_disable_comment(token::LineEnding::LF), + crlf => self.on_end_disable_comment(token::LineEnding::CRLF), + eof => self.on_end_disable_comment(token::LineEnding::None), + any => self.on_build_disable_comment(), + ); + + + // === Doc Comment Rules === + + let doc_comment_id = lexer.doc_comment; + let doc_comment = lexer.group_mut(doc_comment_id); + rules!(doc_comment with + lf => self.on_doc_comment_end_of_line(token::LineEnding::LF), + crlf => self.on_doc_comment_end_of_line(token::LineEnding::CRLF), + eof => self.on_doc_comment_end_of_line(token::LineEnding::None), + any => self.on_build_doc_comment_line(), + ); + + + // === Newline Handling in Doc Comments === + + let doc_comment_newline_id = lexer.doc_comment_newline; + let doc_comment_newline = lexer.group_mut(doc_comment_newline_id); + rules!(doc_comment_newline with + spaces.opt() => self.doc_comment_on_new_line(), + empty_line_lf => self.doc_comment_on_empty_line(token::LineEnding::LF), + empty_line_crlf => self.doc_comment_on_empty_line(token::LineEnding::CRLF), + eof_newline => self.doc_comment_on_empty_line(token::LineEnding::None), + ); } } @@ -184,40 +329,46 @@ impl EnsoLexer { impl EnsoLexer { /// Create an arbitrary operator that requires no special handling. - fn on_operator(&mut self, _reader:&mut R) { + fn on_operator(&mut self, _reader:&mut R) { + trace!(self.logger,"Operator::on_operator"); let op_modifier_check = self.operator_modifier_check; let operator = self.consume_current(); let offset = self.offset.consume(); - let token = Token::Operator(operator,offset); + let token = Token::operator(operator,offset); self.append_token(token); self.push_state(op_modifier_check); } /// Create an operator that cannot have an associated modifier. - fn on_operator_no_modifier(&mut self, _reader:&mut R) { + fn on_operator_no_modifier(&mut self, _reader:&mut R) { + trace!(self.logger,"Operator::on_operator_no_modifier"); let op_suffix_check = self.operator_suffix_check; let operator = self.consume_current(); let offset = self.offset.consume(); - let token = Token::Operator(operator,offset); + let token = Token::operator(operator,offset); self.append_token(token); self.push_state(op_suffix_check); } /// Create a grouping operator. 
-    fn on_group(&mut self, reader:&mut R) {
-        let operator = self.consume_current();
-        let offset = self.offset.consume();
-        let token = Token::Operator(operator,offset);
+    fn on_group(&mut self, reader:&mut R) {
+        trace!(self.logger,"Operator::on_group");
+        let suffix_check = self.ident_suffix_check;
+        let operator = self.consume_current();
+        let offset = self.offset.consume();
+        let token = Token::operator(operator,offset);
         self.append_token(token);
+        self.push_state(suffix_check);
         self.ident_on_no_error_suffix(reader);
     }
 
     /// Create an operator modifier.
-    fn on_modifier(&mut self, _reader:&mut R) {
+    fn on_modifier(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Operator::on_modifier");
         match self.output.pop() {
             Some(token) => match token.shape {
                 token::Shape::Operator(name) => {
-                    let new_token = Token::Modifier(name,token.offset);
+                    let new_token = Token::modifier(name,token.offset);
                     self.discard_current();
                     self.append_token(new_token);
                 },
@@ -227,36 +378,78 @@ impl EnsoLexer {
         }
     }
 
+    /// Triggered when a dot operator is immediately followed by another (e.g. `.+` or `. ==`).
+    fn on_dotted_operator(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Operator::on_dotted_operator");
+        let suffix_check = self.operator_suffix_check;
+        let offset = self.offset.consume();
+        let dotted_op = Token::operator(lexeme::literal::DOT,offset);
+        let current_match = self.consume_current();
+        let second_op_str = current_match.trim_start_matches(|c| {
+            let space = lexeme::unsafe_char(lexeme::literal::SPACE);
+            let dot = lexeme::unsafe_char(lexeme::literal::DOT);
+            c == space || c == dot
+        });
+        let second_offset = current_match.chars().count() - second_op_str.chars().count() - 1;
+        let second_op = Token::operator(second_op_str,second_offset);
+        self.append_token(dotted_op);
+        self.append_token(second_op);
+        self.push_state(suffix_check);
+    }
+
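The offset arithmetic in `on_dotted_operator` above can be checked in isolation. A minimal standalone sketch of the same split, using a hypothetical `split_dotted` helper that is not part of the patch:

    // The second operator's offset is the number of characters stripped from
    // the front of the match, minus one for the dot token itself.
    fn split_dotted(op_match: &str) -> (String, usize) {
        let second = op_match.trim_start_matches(|c| c == '.' || c == ' ');
        let offset = op_match.chars().count() - second.chars().count() - 1;
        (second.to_string(), offset)
    }

    fn main() {
        assert_eq!(split_dotted(".+"), ("+".to_string(), 0));
        assert_eq!(split_dotted(". =="), ("==".to_string(), 1));
    }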
     /// The rules for lexing Enso operators.
     fn add_operator_rules(lexer:&mut EnsoLexer) {
-        let operator_char = Pattern::any_of(Self::operator_chars().as_str());
-        let equals = c!('=');
-        let comma = c!(',');
-        let dot = c!('.');
+        let operator_char = Pattern::any_of(lexeme::literal::OPERATOR_CHARS);
+        let equals = lexeme::into_pattern(lexeme::literal::EQUALS);
+        let comma = lexeme::into_pattern(lexeme::literal::COMMA);
+        let dot = lexeme::into_pattern(lexeme::literal::DOT);
+        let spaces = lexeme::definition_pattern::spaces();
         let error_char = &operator_char | &equals | &comma | &dot;
         let error_suffix = &error_char.many1();
         let operator_body = &operator_char.many1();
-        let ops_eq = &equals | l!("==") | l!(">=") | l!("<=") | l!("!=") | l!("#=");
-        let ops_in = l!("in");
-        let ops_dot = dot | comma | l!("..") | l!("...");
-        let ops_group = Pattern::any_of(Self::group_chars().as_str());
-        let ops_comment = c!('#') | l!("##");
-        let ops_no_modifier = &ops_eq | &ops_dot | &ops_comment | &ops_in;
+        let equals_comp = lexeme::into_pattern(lexeme::literal::EQUALS_COMP);
+        let ge_op = lexeme::into_pattern(lexeme::literal::GE_OPERATOR);
+        let le_op = lexeme::into_pattern(lexeme::literal::LE_OPERATOR);
+        let not_equal = lexeme::into_pattern(lexeme::literal::NOT_EQUAL);
+        let hash_eq = lexeme::into_pattern(lexeme::literal::HASH_EQ);
+        let fat_arrow = lexeme::into_pattern(lexeme::literal::WIDE_ARROW);
+        let ops_eq = &equals | &equals_comp | &ge_op | &le_op | &not_equal | &hash_eq
+            | &fat_arrow;
+        let ops_in = lexeme::into_pattern(lexeme::literal::OPERATOR_IN);
+        let ops_dot = &dot | comma | lexeme::into_pattern(lexeme::literal::TWO_DOTS)
+            | lexeme::into_pattern(lexeme::literal::THREE_DOTS);
+        let dotted_op = &dot >> &spaces.opt() >> (operator_body | &ops_eq);
+        let ops_group = Pattern::any_of(lexeme::literal::GROUP_CHARS);
+        let ops_no_modifier = &ops_eq | &ops_dot | &ops_in;
+
+
+        // === Initial State Rules for Operators ===
 
         let initial_state_id = lexer.initial_state;
         let initial_state = lexer.group_mut(initial_state_id);
-        initial_state.create_rule(&operator_body, "self.on_operator(reader)");
-        initial_state.create_rule(&ops_no_modifier,"self.on_operator_no_modifier(reader)");
-        initial_state.create_rule(&ops_group, "self.on_group(reader)");
+        rules!(initial_state with
+            operator_body => self.on_operator(),
+            dotted_op => self.on_dotted_operator(),
+            ops_no_modifier => self.on_operator_no_modifier(),
+            ops_group => self.on_group(),
+        );
+
+
+        // === Modifier Checking for Operators ===
 
         let operator_mod_check_id = lexer.operator_modifier_check;
         let operator_mod_check = lexer.group_mut(operator_mod_check_id);
-        operator_mod_check.create_rule(&equals,"self.on_modifier(reader)");
+        rules!(operator_mod_check with equals => self.on_modifier());
+
+
+        // === Suffix Checking for Operators ===
 
         let operator_sfx_check_id = lexer.operator_suffix_check;
         let operator_sfx_check = lexer.group_mut(operator_sfx_check_id);
-        operator_sfx_check.create_rule(&error_suffix,"self.ident_on_error_suffix(reader)");
-        operator_sfx_check.create_rule(&Pattern::always(),"self.ident_on_no_error_suffix(reader)");
+        rules!(operator_sfx_check with
+            error_suffix => self.ident_on_error_suffix(),
+            Pattern::always() => self.ident_on_no_error_suffix(),
+        );
     }
 }
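The `rules!` macro used throughout these definitions (imported from `crate::library::rules`) replaces the older string-based `create_rule` calls visible on the removed lines. A minimal sketch of a macro with that shape, assuming it simply renders each zero-argument handler call back into the code string `create_rule` expects; the `Group` type here is a toy stand-in, not the flexer's:

    struct Group { rules: Vec<(String,String)> }
    impl Group {
        fn create_rule(&mut self, pattern: &str, callback: &str) {
            self.rules.push((pattern.into(), callback.into()));
        }
    }

    macro_rules! rules {
        ($group:ident with $($pattern:expr => self.$handler:ident()),+ $(,)?) => {
            $($group.create_rule(&$pattern,
                concat!("self.", stringify!($handler), "(reader)"));)+
        };
    }

    fn main() {
        let mut initial_state = Group { rules: vec![] };
        let operator_body = String::from("<operator body pattern>");
        rules!(initial_state with operator_body => self.on_operator());
        assert_eq!(initial_state.rules[0].1, "self.on_operator(reader)");
    }

The real macro also accepts handler arguments, as the text-literal rules later in this patch show.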
@@ -268,77 +461,114 @@ impl EnsoLexer {
 
     /// Create a variable identifier from the current match.
-    fn on_variable_ident(&mut self, _reader:&mut R) {
-        let token = Token::Variable(self.consume_current(),self.offset.consume());
+    fn on_variable_ident(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Identifier::on_variable_ident");
+        let token = Token::variable(self.consume_current(),self.offset.consume());
         let suffix_check = self.ident_suffix_check;
         self.append_token(token);
         self.push_state(suffix_check);
     }
 
     /// Create a referent identifier from the current match.
-    fn on_referent_ident(&mut self, _reader:&mut R) {
-        let token = Token::Referent(self.consume_current(),self.offset.consume());
+    fn on_referent_ident(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Identifier::on_referent_ident");
+        let token = Token::referent(self.consume_current(),self.offset.consume());
         let suffix_check = self.ident_suffix_check;
         self.append_token(token);
         self.push_state(suffix_check);
     }
 
     /// Create an external identifier from the current match.
-    fn on_external_ident(&mut self, _reader:&mut R) {
-        let token = Token::External(self.consume_current(),self.offset.consume());
+    fn on_external_ident(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Identifier::on_external_ident");
+        let token = Token::external(self.consume_current(),self.offset.consume());
         let suffix_check = self.ident_suffix_check;
         self.append_token(token);
         self.push_state(suffix_check);
     }
 
     /// Create a blank identifier from the current match.
-    fn on_blank(&mut self, _reader:&mut R) {
-        let token = Token::Blank(self.offset.consume());
+    fn on_blank(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Identifier::on_blank");
+        let token = Token::blank(self.offset.consume());
         let suffix_check = self.ident_suffix_check;
         self.discard_current();
         self.append_token(token);
         self.push_state(suffix_check);
     }
 
+    /// Create an annotation from the current match.
+    fn on_annotation(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Identifier::on_annotation");
+        let current = self.consume_current();
+        let offset = self.offset.consume();
+        let length_to_drop = lexeme::len(lexeme::literal::ANNOTATION_SYMBOL);
+        let token = Token::annotation(&current[length_to_drop..],offset);
+        let suffix_check = self.ident_suffix_check;
+        self.append_token(token);
+        self.push_state(suffix_check);
+    }
+
     /// Tokenize an unexpected error suffix.
-    fn ident_on_error_suffix(&mut self, _reader:&mut R) {
-        let token = Token::InvalidSuffix(self.consume_current(),self.offset.consume());
+    fn ident_on_error_suffix(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Identifier::ident_on_error_suffix");
+        let token = Token::invalid_suffix(self.consume_current(),self.offset.consume());
         self.append_token(token);
         self.pop_state();
     }
 
     /// Submit a non-error identifier.
-    fn ident_on_no_error_suffix(&mut self, _reader:&mut R) {
+    fn ident_on_no_error_suffix(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Identifier::ident_on_no_error_suffix");
         self.pop_state();
     }
 
     /// The set of rules for lexing Enso identifiers.
fn add_identifier_rules(lexer:&mut EnsoLexer) { - let body_char = (EnsoLexer::lower_ascii_letter() | EnsoLexer::ascii_digit()).many(); - let underscore = c!('_'); - let ticks = c!('\'').many(); - let init_var_seg = EnsoLexer::lower_ascii_letter() >> &body_char; - let var_seg = (EnsoLexer::lower_ascii_letter() | EnsoLexer::ascii_digit()) >> &body_char; - let init_ref_seg = EnsoLexer::upper_ascii_letter() >> &body_char; - let ref_seg = (EnsoLexer::upper_ascii_letter() | EnsoLexer::ascii_digit()) >> &body_char; - let external_start = EnsoLexer::ascii_letter() | &underscore; - let external_body = EnsoLexer::ascii_alpha_num() | &underscore; - let variable_ident = &init_var_seg >> (&underscore >> &var_seg).many() >> &ticks; - let referent_ident = &init_ref_seg >> (&underscore >> &ref_seg).many() >> &ticks; - let external_ident = &external_start >> external_body.many() >> &ticks; - let error_suffix = Pattern::none_of(EnsoLexer::break_chars().as_str()).many1(); + let lower_ascii = lexeme::definition_pattern::lower_ascii_letter(); + let upper_ascii = lexeme::definition_pattern::upper_ascii_letter(); + let body_char = (&lower_ascii | lexeme::definition_pattern::ascii_digit()).many(); + let blank = lexeme::into_pattern(lexeme::literal::BLANK_IDENT); + let ident_seg_sep = lexeme::into_pattern(lexeme::literal::IDENT_SEGMENT_SEPARATOR); + let ticks = lexeme::into_pattern(lexeme::literal::IDENTIFIER_TICK).many(); + let init_var_seg = &lower_ascii >> &body_char; + let lower_ascii_alnum = &lower_ascii | lexeme::definition_pattern::ascii_digit(); + let var_seg = &lower_ascii_alnum >> &body_char; + let init_ref_seg = &upper_ascii >> &body_char; + let upper_ascii_alnum = &upper_ascii | lexeme::definition_pattern::ascii_digit(); + let ref_seg = &upper_ascii_alnum >> &body_char; + let external_start = lexeme::definition_pattern::ascii_letter() | &ident_seg_sep; + let external_body = lexeme::definition_pattern::ascii_alpha_num() | &ident_seg_sep; + let variable_ident = &init_var_seg >> (&ident_seg_sep >> &var_seg).many() >> &ticks; + let referent_ident = &init_ref_seg >> (&ident_seg_sep >> &ref_seg).many() >> &ticks; + let external_ident = &external_start >> external_body.many() >> &ticks; + let break_chars = lexeme::definition_pattern::break_chars(); + let error_suffix = Pattern::none_of(break_chars.as_str()).many1(); + let annotation_symbol = lexeme::into_pattern(lexeme::literal::ANNOTATION_SYMBOL); + let annotation = annotation_symbol >> (&variable_ident | &referent_ident); + + + // === Initial State Rules for Identifiers === let initial_state_id = lexer.initial_state; let initial_state = lexer.group_mut(initial_state_id); - initial_state.create_rule(&variable_ident,"self.on_variable_ident(reader)"); - initial_state.create_rule(&referent_ident,"self.on_referent_ident(reader)"); - initial_state.create_rule(&underscore, "self.on_blank(reader)"); - initial_state.create_rule(&external_ident,"self.on_external_ident(reader)"); + rules!(initial_state with + variable_ident => self.on_variable_ident(), + referent_ident => self.on_referent_ident(), + blank => self.on_blank(), + external_ident => self.on_external_ident(), + annotation => self.on_annotation(), + ); + + + // === Identifier Suffix Checking Rules === let suffix_check_id = lexer.ident_suffix_check; let suffix_check = lexer.group_mut(suffix_check_id); - suffix_check.create_rule(&error_suffix, "self.ident_on_error_suffix(reader)"); - suffix_check.create_rule(&Pattern::always(),"self.ident_on_no_error_suffix(reader)"); + rules!(suffix_check with + error_suffix 
=> self.ident_on_error_suffix(), + Pattern::always() => self.ident_on_no_error_suffix(), + ); } } @@ -351,20 +581,23 @@ impl EnsoLexer { /// Finalize the lexer when it's done lexing a number with an explicit base. fn finalize_explicit_base(&mut self) { + trace!(self.logger,"Number::finalize_explicit_base"); let number_part_2 = self.number_phase_two; self.pop_states_including(number_part_2); self.number_state.reset(); } /// Triggered when the lexer matches an integer with an implicit base. - fn on_integer(&mut self, _reader:&mut R) { - let number_phase_2 = self.number_phase_two; + fn on_integer(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::on_integer"); + let number_phase_2 = self.number_phase_two; self.number_state.literal = self.consume_current(); self.push_state(number_phase_2) } /// Triggered when the lexer matches a number annotated with an explicit base. - fn on_explicit_base(&mut self, _reader:&mut R) { + fn on_explicit_base(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::on_explicit_base"); let literal = self.consume_current(); self.number_state.literal = literal; let offset = self.offset.consume(); @@ -375,17 +608,19 @@ impl EnsoLexer { /// Triggered when the lexer has seen an explicit base definition that isn't followed by an /// actual number. - fn on_dangling_base(&mut self, _reader:&mut R) { + fn on_dangling_base(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::on_dangling_base"); let base = self.number_state.consume_base(); let offset = self.offset.consume(); - let token = Token::DanglingBase(base,offset); + let token = Token::dangling_base(base,offset); self.append_token(token); self.discard_current(); self.finalize_explicit_base(); } /// Triggered when an explicit decimal number has been seen by the lexer. - fn on_decimal(&mut self, _reader:&mut R) { + fn on_decimal(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::on_decimal"); let decimal_suffix_check = self.decimal_suffix_check; self.number_state.literal = self.consume_current(); let offset = self.offset.consume(); @@ -395,14 +630,16 @@ impl EnsoLexer { } /// Triggered when an explicit base annotation has been seen by the lexer. - fn seen_base(&mut self, _reader:&mut R) { + fn seen_base(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::seen_base"); let seen_base_id = self.number_seen_base; self.push_state(seen_base_id); self.number_state.swap_members(); } /// Submit an integer token into the lexer. - fn submit_integer(&mut self, _reader:&mut R) { + fn submit_integer(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::submit_integer"); let offset = self.offset.consume(); let token = self.number_state.consume_token(offset); self.append_token(token); @@ -410,50 +647,73 @@ impl EnsoLexer { } /// Triggered when a decimal number is followed by an erroneous suffix. - fn decimal_error_suffix(&mut self, _reader:&mut R) { + fn decimal_error_suffix(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::decimal_error_suffix"); let decimal_suffix_check = self.decimal_suffix_check; let current_match = self.consume_current(); let offset = self.offset.consume(); - let token = Token::InvalidSuffix(current_match,offset); + let token = Token::invalid_suffix(current_match,offset); self.append_token(token); self.pop_states_including(decimal_suffix_check); } /// Triggered when a decimal number is followed by a valid suffix. 
- fn decimal_valid_suffix(&mut self, _reader:&mut R) { + fn decimal_valid_suffix(&mut self, _reader:&mut R) { + trace!(self.logger,"Number::decimal_valid_suffix"); let seen_decimal_id = self.decimal_suffix_check; self.pop_states_including(seen_decimal_id); } /// The rules for lexing numbers in Enso. fn add_number_rules(lexer:&mut EnsoLexer) { - let digits = EnsoLexer::ascii_digit().many1(); - let point = c!('.'); - let underscore = c!('_'); + let digits = lexeme::definition_pattern::ascii_digit().many1(); + let point = lexeme::into_pattern(lexeme::literal::DECIMAL_SEPARATOR); + let base_separator = lexeme::into_pattern(lexeme::literal::NUMBER_BASE_SEPARATOR); let decimal = &digits >> &point >> &digits; - let arbitrary_digits = EnsoLexer::ascii_alpha_num().many1(); + let arbitrary_digits = lexeme::definition_pattern::ascii_alpha_num().many1(); let arbitrary_decimal = &arbitrary_digits >> (&point >> &arbitrary_digits).opt(); - let error_suffix = Pattern::none_of(EnsoLexer::break_chars().as_str()).many1(); + let break_chars = lexeme::definition_pattern::break_chars(); + let error_suffix = Pattern::none_of(break_chars.as_str()).many1(); + + + // === Initial State Rules for Number Literals === let initial_state_id = lexer.initial_state; let initial_state = lexer.group_mut(initial_state_id); - initial_state.create_rule(&digits,"self.on_integer(reader)"); - initial_state.create_rule(&decimal,"self.on_decimal(reader)"); + rules!(initial_state with + digits => self.on_integer(), + decimal => self.on_decimal(), + ); + + + // === Rules in "Phase 2" of Number Lexing (Checks for Bases) === let number_phase_2_id = lexer.number_phase_two; let number_phase_2 = lexer.groups_mut().group_mut(number_phase_2_id); - number_phase_2.create_rule(&underscore, "self.seen_base(reader)"); - number_phase_2.create_rule(&Pattern::always(),"self.submit_integer(reader)"); + rules!(number_phase_2 with + base_separator => self.seen_base(), + Pattern::always() => self.submit_integer(), + ); + + + // === Rules for Seeing an Explicit Base in a Number Literal === let seen_base_id = lexer.number_seen_base; let seen_base = lexer.groups_mut().group_mut(seen_base_id); - seen_base.create_rule(&arbitrary_decimal,"self.on_explicit_base(reader)"); - seen_base.create_rule(&Pattern::always(),"self.on_dangling_base(reader)"); + rules!(seen_base with + arbitrary_decimal => self.on_explicit_base(), + Pattern::always() => self.on_dangling_base(), + ); + + + // === Rules for Seeing an Explicit Decimal Number === let decimal_suffix_check_id = lexer.decimal_suffix_check; let decimal_suffix_check = lexer.groups_mut().group_mut(decimal_suffix_check_id); - decimal_suffix_check.create_rule(&error_suffix,"self.decimal_error_suffix(reader)"); - decimal_suffix_check.create_rule(&Pattern::always(),"self.decimal_valid_suffix(reader)"); + rules!(decimal_suffix_check with + error_suffix => self.decimal_error_suffix(), + Pattern::always() => self.decimal_valid_suffix(), + ); } } @@ -464,9 +724,787 @@ impl EnsoLexer { #[allow(dead_code)] impl EnsoLexer { + // === Error === + + /// Triggered when encountering an invalid set of quotes. + fn on_invalid_quote(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_invalid_quote"); + let offset = self.offset.consume(); + let bad_quotes = self.consume_current(); + let token = Token::invalid_quote(bad_quotes,offset); + self.append_token(token); + } + + /// Submit a missing quote in a nested text line literal. 
+    fn submit_missing_quote_nested
+    ( &mut self
+    , reader      : &mut R
+    , line_ending : token::LineEnding
+    ) {
+        trace!(self.logger,"Text::submit_missing_quote_nested");
+        self.on_missing_quote(reader);
+        let interpolate_is_closed = false;
+        self.on_interpolate_end(reader,interpolate_is_closed);
+        let top_level = self.text_state.unsafe_current_mut();
+        let top_level_style = top_level.unsafe_get_style();
+        top_level.style = Some(match top_level_style {
+            token::TextStyle::FormatLine => token::TextStyle::UnclosedLine,
+            _ => top_level_style,
+        });
+        let text = self.text_state.unsafe_end_literal();
+        let literal = text.into();
+        self.append_token(literal);
+        self.pop_state();
+        self.on_missing_quote_cleanup(reader,line_ending);
+    }
+
+    /// Submit a missing closing quote in a text line literal.
+    fn submit_missing_quote(&mut self, reader:&mut R, line_ending:token::LineEnding) {
+        trace!(self.logger,"Text::submit_missing_quote");
+        self.on_missing_quote(reader);
+        self.on_missing_quote_cleanup(reader,line_ending);
+    }
+
+    /// The common logic for dealing with a missing quote in a text line literal.
+    fn on_missing_quote(&mut self, reader:&mut R) {
+        trace!(self.logger,"Text::on_missing_quote");
+        let matched_bookmark = self.bookmarks.matched_bookmark;
+        self.bookmarks.rewind(matched_bookmark,reader);
+        let current = self.text_state.unsafe_current_mut();
+        current.style = Some(token::TextStyle::UnclosedLine);
+        let text = self.text_state.unsafe_end_literal();
+        let literal = text.into();
+        self.append_token(literal);
+        self.pop_state();
+    }
+
+    /// The common logic that must run after dealing with a missing quote in a text line literal.
+    fn on_missing_quote_cleanup
+    ( &mut self
+    , reader:&mut R
+    , line_ending:token::LineEnding
+    ) {
+        trace!(self.logger,"Text::on_missing_quote_cleanup");
+        match line_ending {
+            token::LineEnding::None => self.on_eof(reader),
+            _ => self.block_state.push_line_ending(line_ending),
+        }
+    }
+
+    /// Triggered when encountering the opening of a text block in a nested context.
+    fn on_text_block_nested
+    ( &mut self
+    , reader      : &mut R
+    , line_ending : token::LineEnding
+    , quote       : &'static str
+    ) {
+        trace!(self.logger,"Text::on_text_block_nested");
+        let current = self.consume_current();
+        let offset = self.offset.consume();
+        let text = &current[0..lexeme::len(quote)];
+        let unrecognized = Token::unrecognized(text,offset);
+        self.append_token(unrecognized);
+        self.on_newline_in_interpolate(reader,line_ending);
+    }
+
+
+    // === Lines ===
+
+    /// Triggered when beginning a format line literal.
+    fn on_begin_format_line(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Text::on_begin_format_line");
+        let interpolate = self.text_interpolate;
+        let literal = self.text_state.begin_literal();
+        literal.style = Some(token::TextStyle::FormatLine);
+        let state = if self.is_inside_state(interpolate) {
+            self.text_format_line_nested
+        } else {
+            self.text_format_line
+        };
+        self.push_state(state);
+    }
+
+    /// Submits a text line literal.
+    fn submit_text_line(&mut self, _reader:&mut R, end_including:group::Identifier) {
+        trace!(self.logger,"Text::submit_text_line");
+        let text = self.text_state.unsafe_end_literal();
+        let token = text.into();
+        self.append_token(token);
+        self.pop_states_including(end_including);
+    }
+
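Several of the handlers above downgrade a literal's style when its closing quote never arrives. Collapsed into one standalone function for illustration, with a reduced enum standing in for `token::TextStyle` (the real call sites match only the styles relevant to them):

    #[derive(Clone,Copy,Debug,PartialEq)]
    enum TextStyle { FormatLine, RawLine, UnclosedLine, FormatBlock }

    // Line styles become `UnclosedLine`; block styles are left untouched.
    fn downgrade_unclosed(style: TextStyle) -> TextStyle {
        match style {
            TextStyle::FormatLine | TextStyle::RawLine => TextStyle::UnclosedLine,
            other => other,
        }
    }

    fn main() {
        assert_eq!(downgrade_unclosed(TextStyle::FormatLine), TextStyle::UnclosedLine);
        assert_eq!(downgrade_unclosed(TextStyle::FormatBlock), TextStyle::FormatBlock);
    }

+    /// Triggered when beginning a raw line literal.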
+ fn on_begin_raw_line(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_begin_raw_line"); + let interpolate = self.text_interpolate; + let literal = self.text_state.begin_literal(); + literal.style = Some(token::TextStyle::RawLine); + let state = if self.is_inside_state(interpolate) { + self.text_raw_line_nested + } else { + self.text_raw_line + }; + self.push_state(state); + } + + + // === Inline Blocks === + + /// Triggered when beginning an inline text block. + fn text_on_inline_block + ( &mut self + , _reader : &mut R + , style : token::TextStyle + , enter_state : group::Identifier + ) { + trace!(self.logger,"Text::text_on_inline_block"); + let offset = self.offset.consume(); + let text = self.text_state.begin_literal(); + text.style = Some(style); + text.offset = offset; + self.push_state(enter_state); + } + + /// Triggered when ending an inline text block. + fn text_end_inline_block(&mut self) { + trace!(self.logger,"Text::text_end_inline_block"); + let text = self.text_state.unsafe_end_literal(); + let literal = text.into(); + self.append_token(literal); + self.pop_state(); + } + + /// Triggered when encountering EOF inside an inline block. + fn text_on_eof_in_inline_block(&mut self, reader:&mut R) { + trace!(self.logger,"Text::text_on_eof_in_inline_block"); + self.text_end_inline_block(); + self.on_eof(reader); + } + + /// Triggered when encountering a newline in an inline block. + fn text_inline_block_on_newline + ( &mut self + , _reader : &mut R + , line_ending : token::LineEnding + ) { + trace!(self.logger,"Text::text_inline_block_on_newline"); + let block_newline = self.block_newline; + self.block_state.push_line_ending(line_ending); + self.text_end_inline_block(); + self.block_submit_line(_reader); + self.push_state(block_newline); + } + + /// Triggered on completion of a format inline block. + fn submit_format_inline_block(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::submit_format_inline_block"); + let format_inline_block = self.text_format_inline_block; + let text = self.text_state.unsafe_end_literal(); + let token = text.into(); + self.append_token(token); + self.pop_states_including(format_inline_block); + } + + + // === Blocks === + + /// Triggered when beginning a text block. + fn on_begin_text_block + ( &mut self + , _reader : &mut R + , block_style : token::TextStyle + , enter_state : group::Identifier + , line_ending : token::LineEnding + ) { + trace!(self.logger,"Text::on_begin_text_block"); + let block_seen_newline = self.block_newline; + let text_seen_newline = self.text_seen_newline; + let current_state = self.current_state(); + let offset = if current_state == block_seen_newline { + self.pop_state(); + self.block_state.current().indent + } else { self.offset.consume() }; + let text = self.text_state.begin_literal(); + text.starting_line_ending = line_ending; + text.style = Some(block_style); + text.offset = offset; + self.offset.push(); + self.push_state(enter_state); + self.push_state(text_seen_newline); + } + + /// Triggered when ending a text block. 
+ fn text_on_end_of_block(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::text_on_end_of_block"); + self.pop_state(); + let mut text_state = self.text_state.unsafe_end_literal(); + text_state.take_empty_lines().into_iter().for_each(|line| { + self.block_state.append_delayed_line(line) + }); + let text_literal = text_state.into(); + self.append_token(text_literal); + self.offset.pop(); + let matched_bookmark = self.bookmarks.matched_bookmark; + self.bookmarks.rewind(matched_bookmark,_reader); + } + + /// Triggered when encountering a newline followed by an EOF in a text block. + fn text_on_eof_new_line(&mut self, reader:&mut R) { + trace!(self.logger,"Text::text_on_eof_new_line"); + let block_newline = self.block_newline; + self.pop_state(); + self.text_on_end_of_block(reader); + self.push_state(block_newline); + } + + /// Triggered at the end of a line in a text block. + fn text_on_end_of_line(&mut self, _reader:&mut R, line_ending:token::LineEnding) { + trace!(self.logger,"Text::text_on_end_of_line"); + let text_newline = self.text_seen_newline; + self.text_state.push_line_ending(line_ending); + self.text_state.submit_current_line(); + self.push_state(text_newline); + } + + /// Triggered when lexing a new line in a text block. + fn text_on_new_line(&mut self, reader:&mut R) { + trace!(self.logger,"Text::text_on_new_line"); + self.pop_state(); + let indent = self.consume_current().chars().count(); + let text = self.text_state.unsafe_current_mut(); + if text.indent == 0 { text.indent = indent; } + if indent < text.indent { + self.block_state.seen_newline = true; + self.text_on_end_of_block(reader); + self.block_on_newline(reader,token::LineEnding::None); + } else if (indent - text.indent) > 0 { + let segment_offset = 0; + let literal_spaces = " ".repeat(indent - text.indent); + let segment = Token::text_segment_raw(literal_spaces,segment_offset); + text.append_segment_to_line(segment); + } + } + + /// Triggered when lexing an empty line in a text block. + fn text_on_empty_line(&mut self, _reader:&mut R, line_ending:token::LineEnding) { + trace!(self.logger,"Text::text_on_empty_line"); + let current = self.consume_current(); + let indent = current.chars().count() - line_ending.size(); + self.text_state.push_line_ending(line_ending); + self.text_state.append_empty_line(indent); + } + + + // === Segments === + + /// Triggered when beginning an interpolation in a format literal. + fn on_interpolate_begin(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_interpolate_begin"); + let text_interpolate = self.text_interpolate; + self.push_tokens(); + self.offset.push(); + self.push_state(text_interpolate); + } + + /// Triggered when an interpolate quote is seen in a _nested_ text literal. + fn on_nested_interpolate_quote(&mut self, reader:&mut R) { + trace!(self.logger,"Text::on_nested_interpolate_quote"); + let mut nested_text = self.text_state.unsafe_end_literal(); + let new_style = match nested_text.style { + Some(token::TextStyle::RawLine) => Some(token::TextStyle::UnclosedLine), + Some(token::TextStyle::FormatLine) => Some(token::TextStyle::UnclosedLine), + _ => nested_text.style + }; + nested_text.style = new_style; + let literal = nested_text.into(); + let interpolate_is_closed = true; + self.append_token(literal); + self.on_interpolate_end(reader,interpolate_is_closed); + } + + /// Triggered when ending an interpolation in a format text literal. 
+ fn on_interpolate_end(&mut self, reader:&mut R, is_closed:bool) { + trace!(self.logger,"Text::on_interpolate_end"); + let text_interpolate = self.text_interpolate; + let nested_format_line = self.text_format_line_nested; + if self.is_inside_state(text_interpolate) { + if self.is_inside_state(nested_format_line) { + self.pop_states_until(nested_format_line); + let current = self.text_state.unsafe_current_mut(); + current.style = Some(token::TextStyle::UnclosedLine); + let text = self.text_state.unsafe_end_literal(); + let literal = text.into(); + self.append_token(literal); + } + self.pop_states_until(text_interpolate); + let expr_tokens = self.consume_tokens(); + let offset = self.offset.consume(); + let expr_segment = if is_closed { + Token::text_segment_interpolate(expr_tokens.into(),offset) + } else { + Token::text_segment_unclosed_interpolate(expr_tokens.into(),offset) + }; + self.text_state.append_segment(expr_segment); + self.pop_tokens(); + self.offset.pop(); + self.pop_state(); + } else { + self.on_unrecognized(reader); + } + } + + /// Triggered when encountering an EOF inside an interpolation. + fn on_eof_in_interpolate(&mut self, reader:&mut R) { + trace!(self.logger,"Text::on_eof_in_interpolate"); + let interpolate_is_closed = false; + self.on_interpolate_end(reader,interpolate_is_closed); + let mut text = self.text_state.unsafe_end_literal(); + text.style = match text.style { + Some(token::TextStyle::FormatLine) => Some(token::TextStyle::UnclosedLine), + _ => text.style, + }; + let token = text.into(); + self.append_token(token); + self.on_eof(reader); + } + + /// Triggered when encountering a newline inside an interpolation. + fn on_newline_in_interpolate + ( &mut self + , reader : &mut R + , line_ending : token::LineEnding + ) { + trace!(self.logger,"Text::on_newline_in_interpolate"); + let interpolate_is_closed = false; + self.on_interpolate_end(reader,interpolate_is_closed); + let current_style = self.text_state.unsafe_current().unsafe_get_style(); + match current_style { + token::TextStyle::FormatLine => { + let mut text = self.text_state.unsafe_end_literal(); + text.style = match text.style { + Some(token::TextStyle::FormatLine) => Some(token::TextStyle::UnclosedLine), + _ => text.style, + }; + let token = text.into(); + self.append_token(token); + self.pop_state(); + self.block_on_newline(reader,line_ending); + }, + token::TextStyle::FormatInlineBlock => { + self.text_inline_block_on_newline(reader,line_ending); + }, + token::TextStyle::FormatBlock => { + self.text_on_end_of_line(reader,line_ending); + }, + _ => unreachable_panic!("To reach here the lexer must be inside a format literal."), + } + } + + /// Triggered when encountering a doc comment inside an interpolate. + fn on_doc_comment_in_interpolate(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_doc_comment_in_interpolate"); + let offset = self.offset.consume(); + let token = Token::unrecognized(lexeme::literal::DOC_COMMENT,offset); + let doc_len = lexeme::len(lexeme::literal::DOC_COMMENT); + let new_offset = self.consume_current().chars().count() - doc_len; + self.append_token(token); + self.offset.increase(new_offset,0); + } + + /// Triggered when submitting a raw segment of text in a literal. 
+    fn submit_plain_segment(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Text::submit_plain_segment");
+        let last_segment = self.text_state.consume_segment();
+        let current_match = self.consume_current();
+        let token = match last_segment {
+            Some(token) => match token {
+                Token{shape:token::Shape::TextSegmentRaw(lit),offset,..} => {
+                    Token::text_segment_raw(lit + &current_match,offset)
+                },
+                _ => {
+                    let offset = self.offset.consume();
+                    self.text_state.append_segment(token);
+                    Token::text_segment_raw(current_match,offset)
+                },
+            },
+            _ => {
+                let offset = self.offset.consume();
+                Token::text_segment_raw(current_match,offset)
+            },
+        };
+        self.text_state.append_segment(token);
+    }
+
+
+    // === Escape Sequences ===
+
+    /// Triggered when encountering a literal escape sequence.
+    fn on_escape_literal(&mut self, _reader:&mut R, escape_code:&str) {
+        trace!(self.logger,"Text::on_escape_literal");
+        let offset = self.offset.consume();
+        let escape = Token::text_segment_escape(token::EscapeStyle::Literal,escape_code,offset);
+        self.discard_current();
+        self.text_state.append_segment(escape);
+    }
+
+    /// Triggered when encountering a byte escape sequence.
+    fn on_escape_byte(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Text::on_escape_byte");
+        let current_match = self.consume_current();
+        let offset = self.offset.consume();
+        let escape = escape::Byte::build(current_match);
+        let token = Token::text_segment_escape_from_shape(escape,offset);
+        self.text_state.append_segment(token);
+    }
+
+    /// Triggered when encountering a U16 unicode escape sequence.
+    fn on_escape_u16(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Text::on_escape_u16");
+        let current_match = self.consume_current();
+        let offset = self.offset.consume();
+        let escape = escape::U16::build(current_match);
+        let token = Token::text_segment_escape_from_shape(escape,offset);
+        self.text_state.append_segment(token);
+    }
+
+    /// Triggered when encountering a U21 unicode escape sequence.
+    fn on_escape_u21(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Text::on_escape_u21");
+        let current_match = self.consume_current();
+        let offset = self.offset.consume();
+        let escape = escape::U21::build(current_match);
+        let token = Token::text_segment_escape_from_shape(escape,offset);
+        self.text_state.append_segment(token);
+    }
+
+    /// Triggered when encountering a U32 unicode escape sequence.
+    fn on_escape_u32(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Text::on_escape_u32");
+        let current_match = self.consume_current();
+        let offset = self.offset.consume();
+        let escape = escape::U32::build(current_match);
+        let token = Token::text_segment_escape_from_shape(escape,offset);
+        self.text_state.append_segment(token);
+    }
+
+    /// Triggered when encountering an escaped raw quote.
+    fn on_escape_raw_quote(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Text::on_escape_raw_quote");
+        let offset = self.offset.consume();
+        let escape = Token::text_segment_escape
+            (token::EscapeStyle::Literal,lexeme::literal::RAW_QUOTE,offset);
+        self.discard_current();
+        self.text_state.append_segment(escape);
+    }
+
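`submit_plain_segment` above merges adjacent raw segments so a run of plain characters yields a single token. The merge rule in isolation, with a raw segment reduced to a (text, offset) pair for illustration:

    // If the previous segment is raw text, extend it and keep its offset;
    // otherwise start a new segment at the new offset.
    fn merge_raw(last: Option<(String,usize)>, next: String, next_offset: usize)
    -> (String,usize) {
        match last {
            Some((lit,offset)) => (lit + &next, offset),
            None => (next, next_offset),
        }
    }

    fn main() {
        let first = merge_raw(None, "hello ".into(), 1);
        let both = merge_raw(Some(first), "world".into(), 0);
        assert_eq!(both, ("hello world".to_string(), 1));
    }

+    /// Triggered when encountering an escaped format quote.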
+ fn on_escape_format_quote(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_escape_format_quote"); + let offset = self.offset.consume(); + let escape = Token::text_segment_escape + (token::EscapeStyle::Literal,lexeme::literal::FORMAT_QUOTE,offset); + self.discard_current(); + self.text_state.append_segment(escape); + } + + /// Triggered when encountering an escaped interpolate quote. + fn on_escape_interpolate_quote(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_escape_interpolate_quote"); + let offset = self.offset.consume(); + let escape = Token::text_segment_escape + (token::EscapeStyle::Literal,lexeme::literal::INTERPOLATE_QUOTE,offset); + self.discard_current(); + self.text_state.append_segment(escape); + } + + /// Triggered when encountering an escaped backslash. + fn on_escape_slash(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_escape_slash"); + let offset = self.offset.consume(); + let escape = Token::text_segment_escape + (token::EscapeStyle::Literal,lexeme::literal::SLASH,offset); + self.discard_current(); + self.text_state.append_segment(escape); + } + + /// Triggered when encountering an unrecognized escape sequence. + fn on_escape_invalid(&mut self, _reader:&mut R) { + trace!(self.logger,"Text::on_escape_invalid"); + let offset = self.offset.consume(); + let current = self.consume_current(); + let escape = Token::text_segment_escape(token::EscapeStyle::Invalid,current,offset); + self.text_state.append_segment(escape); + } + + + // === Rule Definitions === + /// Define the rules for lexing Enso text literals. - fn add_text_rules(_lexer:&mut EnsoLexer) { - // TODO [AA] Write the lexing rules for text literals. + fn add_text_rules(lexer:&mut EnsoLexer) { + let format_quote = lexeme::into_pattern(lexeme::literal::FORMAT_QUOTE); + let format_block_quote = lexeme::into_pattern(lexeme::literal::FORMAT_BLOCK_QUOTE); + let raw_quote = lexeme::into_pattern(lexeme::literal::RAW_QUOTE); + let raw_block_quote = lexeme::into_pattern(lexeme::literal::RAW_BLOCK_QUOTE); + let backslash = lexeme::into_pattern(lexeme::literal::SLASH); + let spaces = lexeme::definition_pattern::spaces(); + let opt_spaces = spaces.opt(); + let eof = lexeme::definition_pattern::eof(); + let interpolate_quote = lexeme::into_pattern(lexeme::literal::INTERPOLATE_QUOTE); + let lf = lexeme::into_pattern(lexeme::literal::LF); + let crlf = lexeme::into_pattern(lexeme::literal::CRLF); + let format_block_lf = &format_block_quote >> spaces.opt() >> &lf; + let format_block_crlf = &format_block_quote >> spaces.opt() >> &crlf; + let raw_block_lf = &raw_block_quote >> spaces.opt() >> &lf; + let raw_block_crlf = &raw_block_quote >> spaces.opt() >> &crlf; + let format_raw_segment = lexeme::definition_pattern::format_line_raw_char().many1(); + let format_block_segment = lexeme::definition_pattern::format_block_raw_char().many1(); + let raw_segment = lexeme::definition_pattern::raw_line_raw_char().many1(); + let raw_block_segment = lexeme::definition_pattern::raw_block_raw_char().many1(); + let unicode_escape_char = lexeme::definition_pattern::unicode_escape_digit(); + let byte_escape_start = lexeme::into_pattern(lexeme::literal::BYTE_ESCAPE_START); + let u16_escape_start = lexeme::into_pattern(lexeme::literal::U16_ESCAPE_START); + let u21_escape_start = lexeme::into_pattern(lexeme::literal::U21_ESCAPE_START); + let u21_escape_end = lexeme::into_pattern(lexeme::literal::U21_ESCAPE_END); + let u32_escape_start = lexeme::into_pattern(lexeme::literal::U32_ESCAPE_START); + let escape_byte_digits = 
Pattern::repeat_between(&unicode_escape_char,0,3); + let escape_u16_digits = Pattern::repeat_between(&unicode_escape_char,0,5); + let escape_u32_digits = Pattern::repeat_between(&unicode_escape_char,0,9); + let escape_byte = &byte_escape_start >> &escape_byte_digits; + let escape_u16 = &u16_escape_start >> &escape_u16_digits; + let escape_u21 = u21_escape_start >> &unicode_escape_char.many() >> u21_escape_end; + let escape_u32 = u32_escape_start >> &escape_u32_digits; + let escape_slash = lexeme::into_pattern(lexeme::literal::ESCAPED_SLASH); + let escape_invalid = &backslash >> Pattern::not_symbol(Symbol::eof()).opt(); + let escape_format_quote = &backslash >> &format_quote; + let escape_raw_quote = &backslash >> &raw_quote; + let escape_backtick = &backslash >> &interpolate_quote; + let invalid_format_quote = &format_block_quote >> format_quote.many1(); + let invalid_raw_quote = &raw_block_quote >> raw_quote.many1(); + let eof_new_line = &spaces.opt() >> &eof; + let empty_line_lf = &spaces.opt() >> &lf; + let empty_line_crlf = &spaces.opt() >> &crlf; + let comment_char = lexeme::into_pattern(lexeme::literal::COMMENT).many1(); + let doc_comment_start = lexeme::into_pattern(lexeme::literal::DOC_COMMENT) >> &opt_spaces; + + + // === Initial State Rules for Text Literals === + + let root_state_id = lexer.initial_state; + let root_state = lexer.group_mut(root_state_id); + rules!(root_state with + interpolate_quote => self.on_interpolate_end(true), + invalid_format_quote => self.on_invalid_quote(), + format_quote => self.on_begin_format_line(), + format_block_lf => self.on_begin_text_block + (token::TextStyle::FormatBlock,self.text_format_block,token::LineEnding::LF), + format_block_crlf => self.on_begin_text_block + (token::TextStyle::FormatBlock,self.text_format_block,token::LineEnding::CRLF), + format_block_quote => self.text_on_inline_block + (token::TextStyle::FormatInlineBlock,self.text_format_inline_block), + invalid_raw_quote => self.on_invalid_quote(), + raw_quote => self.on_begin_raw_line(), + raw_block_lf => self.on_begin_text_block + (token::TextStyle::RawBlock,self.text_raw_block,token::LineEnding::LF), + raw_block_crlf => self.on_begin_text_block + (token::TextStyle::RawBlock,self.text_raw_block,token::LineEnding::CRLF), + raw_block_quote => self.text_on_inline_block + (token::TextStyle::RawInlineBlock,self.text_raw_inline_block), + ); + + + // === Rules for Handling Lines in Text Blocks === + + let in_block_line_id = lexer.in_block_line; + let in_block_line = lexer.group_mut(in_block_line_id); + rules!(in_block_line with + format_block_lf => self.on_begin_text_block + (token::TextStyle::FormatBlock,self.text_format_block,token::LineEnding::LF), + format_block_crlf => self.on_begin_text_block + (token::TextStyle::FormatBlock,self.text_format_block,token::LineEnding::CRLF), + raw_block_lf => self.on_begin_text_block + (token::TextStyle::RawBlock,self.text_raw_block,token::LineEnding::LF), + raw_block_crlf => self.on_begin_text_block + (token::TextStyle::RawBlock,self.text_raw_block,token::LineEnding::CRLF), + ); + + + // === Escape Sequences === + + let text_escape_id = lexer.text_escape; + let text_escape = lexer.group_mut(text_escape_id); + for char_escape in escape::EscapeCharacter::codes() { + let pattern = Pattern::all_of(&char_escape.pattern); + let call = format!("self.on_escape_literal(reader,{:?})",&char_escape.repr.as_str()); + text_escape.create_rule(&pattern,call.as_str()); + } + rules!(text_escape with + escape_byte => self.on_escape_byte(), + escape_u21 => 
self.on_escape_u21(), + escape_u16 => self.on_escape_u16(), + escape_u32 => self.on_escape_u32(), + escape_slash => self.on_escape_slash(), + escape_invalid => self.on_escape_invalid(), + ); + + + // === Interpolation Rules === + + let text_interpolate_id = lexer.text_interpolate; + let text_interpolate = lexer.group_mut(text_interpolate_id); + rules!(text_interpolate with + doc_comment_start => self.on_doc_comment_in_interpolate(), + comment_char => self.on_unrecognized(), + format_block_quote => self.on_unrecognized(), + format_block_lf => self.on_text_block_nested + (token::LineEnding::LF,lexeme::literal::FORMAT_BLOCK_QUOTE), + format_block_crlf => self.on_text_block_nested + (token::LineEnding::CRLF,lexeme::literal::FORMAT_BLOCK_QUOTE), + invalid_format_quote => self.on_invalid_quote(), + raw_block_quote => self.on_unrecognized(), + raw_block_lf => self.on_text_block_nested + (token::LineEnding::LF,lexeme::literal::RAW_BLOCK_QUOTE), + raw_block_crlf => self.on_text_block_nested + (token::LineEnding::CRLF,lexeme::literal::RAW_BLOCK_QUOTE), + invalid_raw_quote => self.on_invalid_quote(), + eof => self.on_eof_in_interpolate(), + lf => self.on_newline_in_interpolate(token::LineEnding::LF), + crlf => self.on_newline_in_interpolate(token::LineEnding::CRLF), + ); + + + // === Format Text Common Rules === + + let text_format_id = lexer.text_format; + let text_format = lexer.group_mut(text_format_id); + rules!(text_format with + interpolate_quote => self.on_interpolate_begin(), + escape_backtick => self.on_escape_interpolate_quote(), + escape_format_quote => self.on_escape_format_quote() + ); + + + // === Format Text Line Rules === + + let text_format_line_id = lexer.text_format_line; + let text_format_line = lexer.group_mut(text_format_line_id); + rules!(text_format_line with + format_quote => self.submit_text_line(self.text_format_line), + format_raw_segment => self.submit_plain_segment(), + eof => self.submit_missing_quote(token::LineEnding::None), + lf => self.submit_missing_quote(token::LineEnding::LF), + crlf => self.submit_missing_quote(token::LineEnding::CRLF), + ); + + + // === Format Text Inline Block Rules === + + let format_inline_block_id = lexer.text_format_inline_block; + let format_inline_block = lexer.group_mut(format_inline_block_id); + rules!(format_inline_block with + format_block_segment => self.submit_plain_segment(), + eof => self.text_on_eof_in_inline_block(), + lf => self.text_inline_block_on_newline(token::LineEnding::LF), + crlf => self.text_inline_block_on_newline(token::LineEnding::CRLF), + ); + + + // === Format Text Block Rules === + + let format_block_id = lexer.text_format_block; + let format_block = lexer.group_mut(format_block_id); + rules!(format_block with + format_block_segment => self.submit_plain_segment(), + eof => self.text_on_end_of_block(), + lf => self.text_on_end_of_line(token::LineEnding::LF), + crlf => self.text_on_end_of_line(token::LineEnding::CRLF), + ); + + + // === Rules for Format Lines Nested Inside Interpolations === + + let format_line_nested_id = lexer.text_format_line_nested; + let format_line_nested = lexer.group_mut(format_line_nested_id); + rules!(format_line_nested with + interpolate_quote => self.on_interpolate_end(true), + format_quote => self.submit_text_line(self.text_format_line_nested), + format_raw_segment => self.submit_plain_segment(), + eof => self.submit_missing_quote_nested(token::LineEnding::None), + lf => self.submit_missing_quote_nested(token::LineEnding::LF), + crlf => 
self.submit_missing_quote_nested(token::LineEnding::CRLF), + ); + + + // === Raw Text Common Rules === + + let text_raw_id = lexer.text_raw; + let text_raw = lexer.group_mut(text_raw_id); + rules!(text_raw with + escape_raw_quote => self.on_escape_raw_quote(), + escape_slash => self.on_escape_slash(), + escape_invalid => self.submit_plain_segment(), + ); + + + // === Raw Text Line Rules === + + let text_raw_line_id = lexer.text_raw_line; + let text_raw_line = lexer.group_mut(text_raw_line_id); + rules!(text_raw_line with + raw_quote => self.submit_text_line(self.text_raw_line), + raw_segment => self.submit_plain_segment(), + eof => self.submit_missing_quote(token::LineEnding::None), + lf => self.submit_missing_quote(token::LineEnding::LF), + crlf => self.submit_missing_quote(token::LineEnding::CRLF), + ); + + + // === Raw Text Inline Block Rules === + + let raw_inline_block_id = lexer.text_raw_inline_block; + let raw_inline_block = lexer.group_mut(raw_inline_block_id); + rules!(raw_inline_block with + raw_block_segment => self.submit_plain_segment(), + eof => self.text_on_eof_in_inline_block(), + lf => self.text_inline_block_on_newline(token::LineEnding::LF), + crlf => self.text_inline_block_on_newline(token::LineEnding::CRLF), + ); + + + // === Raw Text Block Rules === + + let raw_block_id = lexer.text_raw_block; + let raw_block = lexer.group_mut(raw_block_id); + rules!(raw_block with + raw_block_segment => self.submit_plain_segment(), + eof => self.text_on_end_of_block(), + lf => self.text_on_end_of_line(token::LineEnding::LF), + crlf => self.text_on_end_of_line(token::LineEnding::CRLF), + ); + + + // === Rules for Raw Lines Nested Inside Interpolations === + + let raw_line_nested_id = lexer.text_raw_line_nested; + let raw_line_nested = lexer.group_mut(raw_line_nested_id); + rules!(raw_line_nested with + raw_quote => self.submit_text_line(self.text_raw_line_nested), + raw_segment => self.submit_plain_segment(), + eof => self.submit_missing_quote_nested(token::LineEnding::None), + lf => self.submit_missing_quote_nested(token::LineEnding::LF), + crlf => self.submit_missing_quote_nested(token::LineEnding::CRLF), + ); + + + // === Text Block Line Handling === + + let text_newline_id = lexer.text_seen_newline; + let text_newline = lexer.group_mut(text_newline_id); + rules!(text_newline with + spaces.opt() => self.text_on_new_line(), + empty_line_lf => self.text_on_empty_line(token::LineEnding::LF), + empty_line_crlf => self.text_on_empty_line(token::LineEnding::CRLF), + eof_new_line => self.text_on_eof_new_line(), + ); } } @@ -477,20 +1515,10 @@ impl EnsoLexer { #[allow(dead_code)] impl EnsoLexer { - /// Triggered when a unix-style line ending is seen. - fn block_on_lf(&mut self, reader:&mut R) { - self.block_state.push_line_ending(token::LineEnding::LF); - self.block_on_line_ending(reader); - } - - /// Triggered when a windows-style line ending is seen. - fn block_on_crlf(&mut self, reader:&mut R) { - self.block_state.push_line_ending(token::LineEnding::CRLF); - self.block_on_line_ending(reader); - } - /// Common functionality for both styles of line ending. 
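// --- Illustrative aside -----------------------------------------------------
// The two per-ending callbacks removed above (`block_on_lf`/`block_on_crlf`)
// are replaced just below by a single `block_on_newline` that takes the line
// ending as an argument. A minimal, self-contained sketch of that refactor
// pattern; all names here are illustrative, not the lexer's real API.

#[derive(Clone, Copy, Debug)]
enum LineEnding { LF, CRLF }

#[derive(Default)]
struct BlockLexer { endings: Vec<LineEnding> }

impl BlockLexer {
    // One handler serves both endings; the rule table binds the variant.
    fn on_newline(&mut self, line_ending: LineEnding) {
        self.endings.push(line_ending);
        // ...shared bookkeeping (record the ending, push the offset, enter
        // the block-newline state) now lives in one place instead of two...
    }
}

fn main() {
    let mut lexer = BlockLexer::default();
    lexer.on_newline(LineEnding::LF);   // rule for "\n"
    lexer.on_newline(LineEnding::CRLF); // rule for "\r\n"
    println!("{:?}", lexer.endings);
}
// ----------------------------------------------------------------------------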
- fn block_on_line_ending(&mut self, _reader:&mut R) { + fn block_on_newline(&mut self, _reader:&mut R, line_ending:token::LineEnding) { + trace!(self.logger,"Block::block_on_newline"); + self.block_state.push_line_ending(line_ending); let block_newline = self.block_newline; self.block_state.seen_newline = true; self.offset.push(); @@ -498,7 +1526,8 @@ impl EnsoLexer { } /// Transitions the lexer into a state in which it knows it is lexing a block line. - fn block_in_line(&mut self, _reader:&mut R) { + fn block_in_line(&mut self, _reader:&mut R) { + trace!(self.logger,"Block::block_in_line"); let indent_len = self.current_match.chars().count(); self.offset.increase(indent_len,0); let in_block_line = self.in_block_line; @@ -506,10 +1535,10 @@ impl EnsoLexer { } /// Triggered when lexing a non-blank line. - fn block_on_non_empty_line(&mut self, reader:&mut R) { + fn block_on_non_empty_line(&mut self, reader:&mut R) { + trace!(self.logger,"Block::block_on_non_empty_line"); let block_newline = self.block_newline; self.pop_states_including(block_newline); - match self.offset.current.cmp(&self.block_state.current().indent) { Ordering::Equal => { self.offset.consume(); @@ -517,36 +1546,27 @@ impl EnsoLexer { }, Ordering::Greater => { let new_indent = self.offset.consume(); - self.begin_block(new_indent,reader); + self.begin_block(reader,new_indent); }, Ordering::Less => { let new_indent = self.offset.consume(); - self.on_block_end(new_indent,reader); + self.on_block_end(reader,new_indent); } } } - /// Triggered when lexing a block line that is empty and ends in a unix-style line ending. - fn block_on_empty_lf_line(&mut self, reader:&mut R) { - self.block_state.push_line_ending(token::LineEnding::LF); - self.block_in_empty_line(reader); - } - - /// Triggered when lexing a block line that is empty and ends in a windows-style line ending. - fn block_on_empty_crlf_line(&mut self, reader:&mut R) { - self.block_state.push_line_ending(token::LineEnding::CRLF); - self.block_in_empty_line(reader); - } - /// Begin a new block. - fn begin_block(&mut self, block_indent:usize, _reader:&mut R) { + fn begin_block(&mut self, _reader:&mut R, block_indent:usize) { + trace!(self.logger,"Block::begin_block"); let is_orphan = self.output.is_empty(); self.push_tokens(); self.block_state.begin_block(block_indent,is_orphan); } /// Triggered when lexing an empty line in a block. - fn block_in_empty_line(&mut self, reader:&mut R) { + fn block_on_empty_line(&mut self, reader:&mut R, line_ending:token::LineEnding) { + trace!(self.logger,"Block::block_on_empty_line"); + self.block_state.push_line_ending(line_ending); self.block_submit_line(reader); let offset = self.offset.consume(); let block_newline = self.block_newline; @@ -555,17 +1575,19 @@ impl EnsoLexer { } /// Triggered when lexing a line in a block that ends a file. - fn block_in_eof_line(&mut self, reader:&mut R) { + fn block_in_eof_line(&mut self, reader:&mut R) { + trace!(self.logger,"Block::block_in_eof_line"); let initial_state = self.initial_state; self.pop_states_until(initial_state); self.on_eof(reader); } /// Triggered when beginning a top-level block. 
- fn block_begin_top_level(&mut self, reader:&mut R) { + fn block_begin_top_level(&mut self, reader:&mut R) { + trace!(self.logger,"Block::block_begin_top_level"); let matched_bookmark = self.bookmarks.matched_bookmark; let block_newline = self.block_newline; - let initial_state = self.initial_state; + let initial_state = self.initial_state; self.bookmarks.rewind(matched_bookmark,reader); self.offset.push(); self.pop_states_until(initial_state); @@ -573,15 +1595,16 @@ impl EnsoLexer { } /// Triggered when a block is ended. - fn on_block_end(&mut self, new_indent:usize, reader:&mut R) { + fn on_block_end(&mut self, reader:&mut R, new_indent:usize) { + trace!(self.logger,"Block::on_block_end"); if self.block_state.seen_newline { while new_indent < self.block_state.current().indent { self.block_submit(reader); } if new_indent > self.block_state.current().indent { info!(self.logger,"Block with invalid indentation."); - self.begin_block(new_indent, reader); - self.block_state.current_mut().is_valid = false; + self.begin_block(reader,new_indent); + self.block_state.current_mut().set_invalid(); } else { self.offset.push(); self.block_submit_line(reader); @@ -590,7 +1613,8 @@ impl EnsoLexer { } /// Create a block token from the current block state. - fn build_block(&mut self, reader:&mut R) -> Token { + fn build_block(&mut self, reader:&mut R) -> Token { + trace!(self.logger,"Block::build_block"); self.block_submit_line(reader); let offset = self.offset.consume(); let current_block = self.block_state.consume_current(); @@ -598,18 +1622,17 @@ impl EnsoLexer { } /// Submit a block to the token stream of the lexer. - fn block_submit(&mut self, reader:&mut R) { + fn block_submit(&mut self, reader:&mut R) { + trace!(self.logger,"Block::block_submit"); let mut block = self.build_block(reader); self.pop_tokens(); self.offset.pop(); self.block_state.end_block(); - if let Some(Token{shape:token::Shape::Operator(_),..}) = self.last_token() { - if let token::Shape::Block {indent,lines,..} = block.shape { + if let token::Shape::Block{indent,lines,..} = block.shape { block.shape = token::Shape::block(BlockType::Discontinuous,indent,lines); } } - self.append_token(block); self.offset.push(); } @@ -617,20 +1640,21 @@ impl EnsoLexer { /// Submit a line in a block. /// /// It should be noted that lines that have content in blocks cannot have an offset. - fn block_submit_line(&mut self, _reader:&mut R) { + fn block_submit_line(&mut self, _reader:&mut R) { + trace!(self.logger,"Block::block_submit_line"); if self.block_state.seen_newline { if !self.output.is_empty() { let token_stream = self.consume_tokens(); - let offset = 0; - self.block_state.append_line_to_current(token_stream.into(),offset); + self.block_state.append_line_to_current(token_stream.into()); } - debug!(self.logger,"Clear Output Buffer: Old Length = {self.output.len()}"); + debug!(self.logger,"Clear Output Buffer: Old Length = {self.output.len()}."); self.output.clear(); } } /// Triggered when the top-level block ends. 
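// --- Illustrative aside -----------------------------------------------------
// A sketch of the indentation logic in `block_on_non_empty_line` and
// `on_block_end` above: the new line's indent is compared against the top of
// a stack of open block indents; greater opens a block, smaller closes blocks
// until the levels agree. Simplified model only; the names are made up.

use std::cmp::Ordering;

fn handle_line(indent_stack: &mut Vec<usize>, new_indent: usize) {
    let current = *indent_stack.last().unwrap_or(&0);
    match new_indent.cmp(&current) {
        Ordering::Equal   => { /* submit a line in the current block */ }
        Ordering::Greater => indent_stack.push(new_indent), // begin_block
        Ordering::Less    => {
            // on_block_end: close (submit) blocks until we are back at an
            // enclosing indentation level.
            while indent_stack.last().map_or(false, |i| new_indent < *i) {
                indent_stack.pop(); // block_submit
            }
        }
    }
}

fn main() {
    let mut stack = vec![0];
    handle_line(&mut stack, 4); // opens a nested block
    handle_line(&mut stack, 0); // closes it again
    assert_eq!(stack, vec![0]);
}
// ----------------------------------------------------------------------------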
- fn block_end_top_level(&mut self, _reader:&mut R) { + fn block_end_top_level(&mut self, _reader:&mut R) { + trace!(self.logger,"Block::block_end_top_level"); let current_block = self.block_state.consume_current(); if self.block_state.seen_newline { let offset = self.offset.consume(); @@ -638,37 +1662,56 @@ impl EnsoLexer { self.append_token(top_level_block); } else { let additional_offset = current_block.indent; - if let Some(token) = self.output.first_mut() { token.offset += additional_offset } + if let Some(token) = self.output.first_mut() { token.offset += additional_offset } } } /// The rule definitions for lexing blocks in Enso. fn add_block_rules(lexer:&mut EnsoLexer) { - let spaces = EnsoLexer::spaces(); - let lf = c!('\n'); - let crlf = l!("\r\n"); + let spaces = lexeme::definition_pattern::spaces(); + let lf = lexeme::into_pattern(lexeme::literal::LF); + let crlf = lexeme::into_pattern(lexeme::literal::CRLF); let opt_spaces = spaces.opt(); let eof_line = &opt_spaces >> Pattern::eof(); + let always = Pattern::always(); + + + // === Initial State Rules for Blocks === let root_state_id = lexer.initial_state; let root_state = lexer.group_mut(root_state_id); - root_state.create_rule(&lf, "self.block_on_lf(reader)"); - root_state.create_rule(&crlf,"self.block_on_crlf(reader)"); + rules!(root_state with + lf => self.block_on_newline(token::LineEnding::LF), + crlf => self.block_on_newline(token::LineEnding::CRLF), + ); + + + // === Rules for Blocks Having Seen a Newline === let block_newline_id = lexer.block_newline; let block_newline = lexer.group_mut(block_newline_id); - block_newline.create_rule(&opt_spaces,"self.block_in_line(reader)"); - block_newline.create_rule(&eof_line, "self.block_in_eof_line(reader)"); + rules!(block_newline with + opt_spaces => self.block_in_line(), + eof_line => self.block_in_eof_line(), + ); + + + // === Rules for Lines in Blocks === let in_block_line_id = lexer.in_block_line; let in_block_line = lexer.group_mut(in_block_line_id); - in_block_line.create_rule(&lf, "self.block_on_empty_lf_line(reader)"); - in_block_line.create_rule(&crlf, "self.block_on_empty_crlf_line(reader)"); - in_block_line.create_rule(&Pattern::always(),"self.block_on_non_empty_line(reader)"); + rules!(in_block_line with + lf => self.block_on_empty_line(token::LineEnding::LF), + crlf => self.block_on_empty_line(token::LineEnding::CRLF), + always => self.block_on_non_empty_line(), + ); + + + // === Rules for Top-Level Blocks === let block_module_id = lexer.block_top_level; let block_module = lexer.group_mut(block_module_id); - block_module.create_rule(&opt_spaces,"self.block_begin_top_level(reader)"); + rules!(block_module with opt_spaces => self.block_begin_top_level()); } } @@ -680,48 +1723,54 @@ impl EnsoLexer { impl EnsoLexer { /// Triggered on an arbitrary space character. - fn on_space(&mut self, _reader:&mut R) { + fn on_space(&mut self, _reader:&mut R) { + trace!(self.logger,"Root::on_space"); let current_len = self.current_match.chars().count(); self.offset.increase(current_len,0); self.discard_current(); } /// Triggered on an arbitrary eof character. - fn on_eof(&mut self, reader:&mut R) { + fn on_eof(&mut self, reader:&mut R) { + trace!(self.logger,"Root::on_eof"); + let base_block_indent = 0; self.offset.push(); self.block_submit_line(reader); - self.on_block_end(0,reader); + self.on_block_end(reader,base_block_indent); self.block_end_top_level(reader); } /// Triggered on any unrecognized character. 
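// --- Illustrative aside -----------------------------------------------------
// The catch-all patterns in these rule tables (`Pattern::always()` above and
// `Pattern::any()` just below) rely on branches being tried top-to-bottom, so
// the specific LF/CRLF/space/eof rules must precede them. A tiny sketch of
// that ordering contract, with made-up names:

fn classify(line: &str) -> &'static str {
    // Order matters: specific patterns first, catch-all last.
    if line == "\n"   { return "empty line (LF)";   } // lf rule
    if line == "\r\n" { return "empty line (CRLF)"; } // crlf rule
    "non-empty line"                                  // `always` fallback
}

fn main() {
    assert_eq!(classify("\n"),    "empty line (LF)");
    assert_eq!(classify("x = 1"), "non-empty line");
}
// ----------------------------------------------------------------------------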
-    fn on_unrecognized(&mut self, _reader:&mut R) {
-        let token = Token::Unrecognized(self.consume_current(),self.offset.consume());
+    fn on_unrecognized(&mut self, _reader:&mut R) {
+        trace!(self.logger,"Root::on_unrecognized");
+        let token = Token::unrecognized(self.consume_current(),self.offset.consume());
         self.append_token(token);
     }
 
     /// The default rules for the lexer.
     fn add_default_rules(lexer:&mut EnsoLexer) {
-        let space = Pattern::char(' ');
+        let space = lexeme::into_pattern(lexeme::literal::SPACE);
         let eof   = Pattern::eof();
         let any   = Pattern::any();
 
         let initial_state_id = lexer.initial_state;
         let initial_state    = lexer.group_mut(initial_state_id);
-        initial_state.create_rule(&space,"self.on_space(reader)");
-        initial_state.create_rule(&eof,  "self.on_eof(reader)");
-        initial_state.create_rule(&any,  "self.on_unrecognized(reader)");
+        rules!(initial_state with
+            space => self.on_space(),
+            eof   => self.on_eof(),
+            any   => self.on_unrecognized(),
+        );
     }
 }
 
-
 // === Trait Impls ===
 
-impl flexer::Definition for EnsoLexer {
+impl enso_flexer::Definition for EnsoLexer {
     fn define() -> Self {
         let mut lexer = EnsoLexer::new();
+        EnsoLexer::add_comment_rules(&mut lexer);
         EnsoLexer::add_operator_rules(&mut lexer);
         EnsoLexer::add_identifier_rules(&mut lexer);
         EnsoLexer::add_number_rules(&mut lexer);
@@ -757,6 +1806,9 @@ impl Default for EnsoLexer {
 // ===================
 
 /// The state for the Enso lexer.
+///
+/// This state contains all of the stateful information required to lex the Enso language, as well
+/// as the functionality for manipulating the reader of the lexer.
 #[derive(Debug)]
 pub struct State {
     /// The logger for the lexing state.
@@ -785,6 +1837,40 @@ pub struct State {
     block_newline : group::Identifier,
     /// The state entered when within the line of a block.
     in_block_line : group::Identifier,
+    /// The state containing rules common to escape sequences in text literals.
+    text_escape : group::Identifier,
+    /// The state containing rules common to format text literals.
+    text_format : group::Identifier,
+    /// The state entered when lexing a format line text literal.
+    text_format_line : group::Identifier,
+    /// The state entered when lexing a format inline block text literal.
+    text_format_inline_block : group::Identifier,
+    /// The state entered when lexing a format block text literal.
+    text_format_block : group::Identifier,
+    /// The state entered when lexing a format line nested inside an interpolation.
+    text_format_line_nested : group::Identifier,
+    /// The state containing rules common to raw text literals.
+    text_raw : group::Identifier,
+    /// The state entered when lexing a raw line text literal.
+    text_raw_line : group::Identifier,
+    /// The state entered when lexing a raw inline block text literal.
+    text_raw_inline_block : group::Identifier,
+    /// The state entered when lexing a raw block text literal.
+    text_raw_block : group::Identifier,
+    /// The state entered when lexing a raw line literal nested inside an interpolation.
+    text_raw_line_nested : group::Identifier,
+    /// The state entered when a literal newline is seen inside a text literal.
+    text_seen_newline : group::Identifier,
+    /// The state entered when lexing an interpolated segment inside a format text literal.
+    text_interpolate : group::Identifier,
+    /// A parent group for all comments.
+    comment : group::Identifier,
+    /// A state for lexing disable comments.
+    disable_comment : group::Identifier,
+    /// A state for lexing doc comments.
+    doc_comment : group::Identifier,
+    /// A state for seeing a newline in a doc comment.
+ doc_comment_newline : group::Identifier, /// A stack of token matches. tokens_stack : Vec, /// Tracking for the current offset. @@ -792,7 +1878,11 @@ pub struct State { /// State specifically for lexing Enso numbers. number_state : NumberLexingState, /// State specifically for lexing Enso blocks. - block_state : BlockLexingState + block_state : BlockLexingState, + /// State specifically for lexing Enso text literals. + text_state : TextLexingState, + /// State specifically for lexing Enso comments. + comment_state : CommentLexingState, } impl> State { @@ -810,7 +1900,7 @@ impl> State { // === Trait Impls === -impl> flexer::State for State { +impl> enso_flexer::State for State { fn new(parent_logger:&impl AnyLogger) -> Self { let logger = ::sub(parent_logger, "State"); let bookmarks = default(); @@ -823,16 +1913,41 @@ impl> flexer::State for State { let operator_suffix_check = lexer_states.define_group("OPERATOR_SUFFIX_CHECK",None); let operator_modifier_check = lexer_states.define_group("OPERATOR_MODIFIER_CHECK",Some(operator_suffix_check)); - let block_top_level = lexer_states.define_group("BLOCK_MODULE", None); - let block_newline = lexer_states.define_group("BLOCK_NEWLINE",None); - let in_block_line = lexer_states.define_group("IN_BLOCK_LINE",None); - let tokens_stack = Vec::new(); - let offset_logger = ::sub(&logger,"Offset"); - let offset = Offset::new(offset_logger); - let number_state_logger = ::sub(&logger,"NumberState"); - let number_state = NumberLexingState::new(number_state_logger); - let block_state_logger = ::sub(&logger,"BlockLexingState"); - let block_state = BlockLexingState::new(block_state_logger); + let block_top_level = lexer_states.define_group("BLOCK_MODULE", None); + let block_newline = lexer_states.define_group("BLOCK_NEWLINE",None); + let in_block_line = lexer_states.define_group("IN_BLOCK_LINE",None); + let text_escape = lexer_states.define_group("TEXT_ESCAPE",None); + let text_format = lexer_states.define_group("TEXT_FORMAT",Some(text_escape)); + let text_format_line = lexer_states.define_group("TEXT_FORMAT_LINE",Some(text_format)); + let text_format_inline_block = + lexer_states.define_group("TEXT_FORMAT_INLINE_BLOCK",Some(text_format)); + let text_format_block = lexer_states.define_group("TEXT_FORMAT_BLOCK",Some(text_format)); + let text_format_line_nested = + lexer_states.define_group("TEXT_FORMAT_LINE_NESTED",Some(text_format_line)); + let text_raw = lexer_states.define_group("TEXT_RAW",None); + let text_raw_line = lexer_states.define_group("TEXT_RAW_LINE",Some(text_raw)); + let text_raw_inline_block = + lexer_states.define_group("TEXT_RAW_INLINE_BLOCK",Some(text_raw)); + let text_raw_block = lexer_states.define_group("TEXT_RAW_BLOCK",Some(text_raw)); + let text_raw_line_nested = + lexer_states.define_group("TEXT_RAW_LINE_NESTED",Some(text_raw_line)); + let text_seen_newline = lexer_states.define_group("TEXT_SEEN_NEWLINE",None); + let text_interpolate = lexer_states.define_group("TEXT_INTERPOLATE",Some(initial_state)); + let comment = lexer_states.define_group("COMMENT",None); + let disable_comment = lexer_states.define_group("DISABLE_COMMENT",Some(comment)); + let doc_comment = lexer_states.define_group("DOC_COMMENT",Some(comment)); + let doc_comment_newline = lexer_states.define_group("DOC_COMMENT_NEWLINE",None); + let tokens_stack = Vec::new(); + let offset_logger = ::sub(&logger,"Offset"); + let offset = Offset::new(offset_logger); + let number_state_logger = ::sub(&logger,"NumberLexingState"); + let number_state = 
NumberLexingState::new(number_state_logger); + let block_state_logger = ::sub(&logger,"BlockLexingState"); + let block_state = BlockLexingState::new(block_state_logger); + let text_state_logger = ::sub(&logger,"TextLexingState"); + let text_state = TextLexingState::new(text_state_logger); + let comment_state_logger = ::sub(&logger,"CommentLexingState"); + let comment_state = CommentLexingState::new(comment_state_logger); Self { logger @@ -848,10 +1963,29 @@ impl> flexer::State for State { , block_top_level , block_newline , in_block_line + , text_escape + , text_format + , text_format_line + , text_format_inline_block + , text_format_block + , text_format_line_nested + , text_raw + , text_raw_line + , text_raw_inline_block + , text_raw_block + , text_raw_line_nested + , text_seen_newline + , text_interpolate + , comment + , disable_comment + , doc_comment + , doc_comment_newline , tokens_stack , offset , number_state , block_state + , text_state + , comment_state } } @@ -887,6 +2021,12 @@ impl> flexer::State for State { // ========================= /// A manager for the current offset state of the lexer. +/// +/// The offset is the number of leading spaces between the last-lexed token and the token that is +/// currently being lexed. +/// +/// In addition to containing the _current_ offset, it also provides facilities for manipulating a +/// _stack_ of offsets as the lexer changes state. #[derive(Clone,Debug,Eq,PartialEq)] pub struct Offset { /// The current offset of the lexer. @@ -910,7 +2050,7 @@ impl Offset { /// Push the current offset onto the offset stack. pub fn push(&mut self) { - debug!(self.logger,"Push Offset: {self.current}"); + debug!(self.logger,"Push Offset: {self.current}."); self.stack.push(self.current); self.current = 0; } @@ -918,23 +2058,23 @@ impl Offset { /// Pop the top offset from the offset stack. pub fn pop(&mut self) { self.current = self.stack.pop().unwrap_or(0); - debug!(self.logger,"Pop Offset: {self.current}"); + debug!(self.logger,"Pop Offset: {self.current}."); } /// Consume the current offset. pub fn consume(&mut self) -> usize { let offset = self.current; self.current = 0; - debug!(self.logger,"Consume Offset: {offset}"); + debug!(self.logger,"Consume Offset: {offset}."); offset } /// Increase the current offset by `match_length` + `shift`. pub fn increase(&mut self, match_length:usize, shift:usize) { let diff = match_length + shift; - debug!(self.logger,"Increase Offset By: {diff}"); + debug!(self.logger,"Increase Offset By: {diff}."); self.current += diff; - debug!(self.logger,"Offset Now: {self.current}"); + debug!(self.logger,"Offset Now: {self.current}."); } } @@ -945,6 +2085,9 @@ impl Offset { // ========================= /// The state for lexing an Enso number. +/// +/// It contains the various portions of a number seen so far, and provides useful operations for the +/// manipulation of this state and subsequent conversion into a number token. #[derive(Clone,Debug,Default,PartialEq,Eq)] pub struct NumberLexingState { /// The (optional) base for the number. @@ -958,7 +2101,7 @@ pub struct NumberLexingState { impl NumberLexingState { /// Create a new number lexing state. 
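// --- Illustrative aside -----------------------------------------------------
// A self-contained sketch of the `Offset` discipline documented above: `push`
// saves the pending run of leading spaces when the lexer enters a nested
// context, `pop` restores it, and `consume` hands the pending offset to the
// token being built. Names and shapes are illustrative only.

#[derive(Default)]
struct Offset { current: usize, stack: Vec<usize> }

impl Offset {
    fn push(&mut self) { self.stack.push(self.current); self.current = 0; }
    fn pop(&mut self) { self.current = self.stack.pop().unwrap_or(0); }
    fn consume(&mut self) -> usize { std::mem::take(&mut self.current) }
    fn increase(&mut self, n: usize) { self.current += n; }
}

fn main() {
    let mut offset = Offset::default();
    offset.increase(2); // two spaces seen before the next token
    offset.push();      // enter a nested lexing state
    offset.increase(4);
    assert_eq!(offset.consume(), 4); // nested token takes its own offset
    offset.pop();
    assert_eq!(offset.consume(), 2); // the outer offset is restored intact
}
// ----------------------------------------------------------------------------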
pub fn new(logger:Logger) -> Self { - let base = default(); + let base = default(); let literal = default(); NumberLexingState{base,literal,logger} } @@ -967,19 +2110,19 @@ impl NumberLexingState { pub fn reset(&mut self) { self.base.truncate(0); self.literal.truncate(0); - debug!(self.logger,"Reset Number State"); + debug!(self.logger,"Reset Number State."); } /// Swap the `base` and `literal` in place. pub fn swap_members(&mut self) { - debug!(self.logger,"Swap Number Fields"); + debug!(self.logger,"Swap Number Fields."); mem::swap(&mut self.base,&mut self.literal); } /// Convert `self` into a token, resetting the lexing state. pub fn consume_token(&mut self, offset:usize) -> Token { - debug!(self.logger,"Consuming Number: Base = {self.base}, Number = {self.literal}"); - Token::Number(mem::take(&mut self.base),mem::take(&mut self.literal),offset) + debug!(self.logger,"Consuming Number: Base = {self.base}, Number = {self.literal}."); + Token::number(mem::take(&mut self.base),mem::take(&mut self.literal),offset) } /// Take the `literal` portion of the number lexing state. @@ -1000,10 +2143,19 @@ impl NumberLexingState { // ======================== /// The state for managing the lexing of blocks in Enso. +/// +/// It contains structures for tracking the nesting of block literals as lexing proceeds, as well +/// as tracking the overall block state of the lexer. #[derive(Clone,Debug,PartialEq)] pub struct BlockLexingState { /// The stack of blocks being lexed. stack : NonEmptyVec, + /// Lines that will be included into the block upon encountering the _next_ valid line. + /// + /// This is necessary due to the language's grouping of trailing blank lines in block-like + /// constructs, as it ensures that they get attributed to the parent block instead of being + /// trailing blank lines in the current block. + delayed_append_lines : VecDeque, /// Whether or not the lexer has seen an explicit newline. seen_newline : bool, /// A logger for the lexing state. @@ -1013,32 +2165,57 @@ pub struct BlockLexingState { impl BlockLexingState { /// Construct a new block lexing state. pub fn new(logger:Logger) -> Self { - let stack = NonEmptyVec::singleton(default()); - let seen_newline = false; - BlockLexingState{stack,seen_newline,logger} + let stack = default(); + let delayed_append_lines = default(); + let seen_newline = false; + BlockLexingState{stack,delayed_append_lines,seen_newline,logger} } /// Set the last seen line ending. pub fn push_line_ending(&mut self, line_ending:token::LineEnding) { - self.current_mut().seen_line_endings.push_back(line_ending); - debug!(self.logger,"Push Line Ending: {line_ending:?}"); + self.seen_newline = true; + self.current_mut().record_line_ending(line_ending); + debug!(self.logger,"Push Line Ending: {line_ending:?}."); } /// Consume the last seen line ending. pub fn pop_line_ending(&mut self) -> token::LineEnding { - let popped = self.current_mut().seen_line_endings.pop_front(); - debug!(self.logger,"Pop Line Ending: {popped:?}"); - popped.unwrap_or(token::LineEnding::None) + let popped = self.current_mut().consume_line_ending(); + debug!(self.logger,"Pop Line Ending: {popped:?}."); + popped } /// Appends a line to the current block. 
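// --- Illustrative aside -----------------------------------------------------
// A sketch of the `delayed_append_lines` buffering documented above: blank
// lines are parked in a queue and only become part of a block once a valid
// line is appended; anything still buffered can then be handed to the parent
// block instead. Simplified model; these names are not the lexer's API.

use std::collections::VecDeque;

#[derive(Default)]
struct Block { lines: Vec<String>, delayed: VecDeque<String> }

impl Block {
    fn append_delayed_line(&mut self, line: &str) {
        self.delayed.push_back(line.to_string());
    }
    fn append_line(&mut self, line: &str) {
        self.lines.push(line.to_string());
        // A valid line arrived, so the buffered lines are flushed into the
        // block (mirroring `process_delayed_lines`).
        self.lines.extend(self.delayed.drain(..));
    }
    // Lines still buffered at block end stay available to the parent block.
    fn remaining_delayed(&mut self) -> Vec<String> {
        self.delayed.drain(..).collect()
    }
}

fn main() {
    let mut block = Block::default();
    block.append_delayed_line("");
    block.append_line("x = 1"); // flushes the buffered blank line too
    assert_eq!(block.lines.len(), 2);
    assert!(block.remaining_delayed().is_empty());
}
// ----------------------------------------------------------------------------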
- pub fn append_line_to_current(&mut self, tokens:Vec, offset:usize) { + pub fn append_line_to_current(&mut self, tokens:Vec) { let trailing_line_ending = self.pop_line_ending(); debug!( self.logger, - "Append Line: Line Ending = {trailing_line_ending:?}, Tokens = {&tokens:?}" + "Append Line: Line Ending = {trailing_line_ending:?}, Tokens = {&tokens:?}." ); - self.current_mut().push_line(tokens, offset, trailing_line_ending); + let offset = 0; + self.current_mut().push_line(tokens,offset,trailing_line_ending); + self.process_delayed_lines(); + } + + /// Process the delayed lines in the block, turning them into real lines. + pub fn process_delayed_lines(&mut self) { + if self.has_delayed_lines() { + let delayed = mem::take(&mut self.delayed_append_lines); + debug!(self.logger,"Appending Delayed Lines: {&delayed:?}."); + self.current_mut().lines.extend(delayed); + } + } + + /// Check if the block has delayed lines. + pub fn has_delayed_lines(&self) -> bool { + !self.delayed_append_lines.is_empty() + } + + /// Delay appending a line to the current block until after `Self::append_line_to_current()` + /// is called. + pub fn append_delayed_line(&mut self, line:Token) { + debug!(self.logger,"Delay Appending: {&line:?}."); + self.delayed_append_lines.push_back(line) } /// Get a reference to the current block. @@ -1053,30 +2230,30 @@ impl BlockLexingState { /// Push a new block state onto the stack. pub fn begin_block(&mut self, new_offset:usize, is_orphan:bool) { - debug!(self.logger,"Begin Block State: Indent = {new_offset}"); + debug!(self.logger,"Begin Block State: Indent = {new_offset}."); self.stack.push(default()); self.current_mut().is_orphan = is_orphan; - self.current_mut().indent = new_offset; + self.current_mut().indent = new_offset; } /// Pop a block state from the stack. pub fn end_block(&mut self) -> Option { - debug!(self.logger,"End Block State"); + debug!(self.logger,"End Block State."); self.stack.pop() } /// Consume the state of the current block. pub fn consume_current(&mut self) -> BlockState { let block = mem::take(self.stack.last_mut()); - debug!(self.logger,"Consume Block: {&block:?}"); + debug!(self.logger,"Consume Block: {&block:?}."); block } - /// Push an empty line into the storage for them. + /// Push an empty line into the current block. pub fn push_empty_line(&mut self, offset:usize) { let trailing_line_ending = self.pop_line_ending(); - self.current_mut().push_empty_line(offset, trailing_line_ending); - debug!(self.logger,"Append Empty line: Line Ending = {trailing_line_ending:?}"); + self.current_mut().push_empty_line(offset,trailing_line_ending); + debug!(self.logger,"Append Empty Line: Line Ending = {trailing_line_ending:?}."); } } @@ -1087,41 +2264,35 @@ impl BlockLexingState { // ================== /// The state for lexing a given block in Enso. -#[derive(Clone,Debug,PartialEq,Eq)] +/// +/// It tracks the particulars about a certain block in the program source code, including its +/// validity, whether or not it is orphaned, the root indentation of the block, as well as the lines +/// that make it up. +#[derive(Clone,Debug,Default,PartialEq,Eq)] pub struct BlockState { /// Whether or not the block is orphaned. /// /// An orphaned block is one that has no block parent. - pub is_orphan : bool, + is_orphan : bool, /// Whether or not the block is well-formed. - pub is_valid : bool, + is_invalid: bool, /// The root indentation level of the block. - pub indent: usize, + indent: usize, /// The remaining lines of the block. 
- pub lines : Vec, + lines : Vec, /// The line endings that have been seen in this block's context. - pub seen_line_endings : VecDeque + seen_line_endings : VecDeque } impl BlockState { - /// Construct a new block state. - pub fn new() -> Self { - let is_orphan = false; - let is_valid = true; - let offset = 0; - let lines = default(); - let seen_line_endings = default(); - BlockState{is_orphan,is_valid, indent: offset,lines,seen_line_endings} - } - /// Push a line into the block. pub fn push_line - (&mut self - , tokens : Vec - , indent: usize - , trailing_line_ending : token::LineEnding + ( &mut self + , tokens : Vec + , indent : usize + , trailing_line_ending : token::LineEnding ) { - let line = Token::Line(tokens,indent,trailing_line_ending); + let line = Token::line(tokens,indent,trailing_line_ending); self.lines.push(line) } @@ -1129,13 +2300,23 @@ impl BlockState { /// /// The offset here should be the offset from the baseline, not from the block indent level. pub fn push_empty_line(&mut self, offset:usize, trailing_line_ending:token::LineEnding) { - let line = Token::BlankLine(offset, trailing_line_ending); + let line = Token::blank_line(offset,trailing_line_ending); self.lines.push(line); } + /// Record seeing the `line_ending`. + pub fn record_line_ending(&mut self, line_ending:token::LineEnding) { + self.seen_line_endings.push_back(line_ending); + } + + /// Consume a `line_ending`. + pub fn consume_line_ending(&mut self) -> token::LineEnding { + self.seen_line_endings.pop_front().unwrap_or(token::LineEnding::None) + } + /// Convert the block state into a block token. pub fn into_token(self, offset:usize) -> Token { - Token::Block( + Token::block( BlockType::Continuous, self.indent, self.lines, @@ -1147,13 +2328,428 @@ impl BlockState { pub fn consume_lines(&mut self) -> Vec { mem::take(&mut self.lines) } + + /// Set the block as invalid. + pub fn set_invalid(&mut self) { + self.is_invalid = true; + } +} + + + +// ======================= +// === TextLexingState === +// ======================= + +/// The required state for managing the lexing of text literals in Enso. +/// +/// This maintains a stack of text literals as it is possible to nest text literals via the use +/// of interpolated segments in format text literals. +#[derive(Clone,Debug,PartialEq)] +pub struct TextLexingState { + /// The stack of text lexing states. + text_stack : Vec, + /// The logger. + logger : Logger +} + +impl TextLexingState { + /// Construct a new text lexing state. + pub fn new(logger:Logger) -> TextLexingState { + let text_stack = default(); + Self{text_stack,logger} + } + + /// Get an immutable reference to the text literal currently being lexed. + pub fn current(&self) -> Option<&TextState> { + self.text_stack.last() + } + + /// Get a mutable reference to the text literal currently being lexed. + pub fn current_mut(&mut self) -> Option<&mut TextState> { + self.text_stack.last_mut() + } + + /// Unsafely get an immutable reference to the current text literal. + /// + /// # Panics + /// If there is no text literal currently being lexed. + pub fn unsafe_current(&self) -> &TextState { + self.current().expect("Text state is present.") + } + + /// Unsafely get a mutable reference to the current text literal. + /// + /// # Panics + /// If there is no text literal currently being lexed. + pub fn unsafe_current_mut(&mut self) -> &mut TextState { + self.current_mut().expect("Text state is present.") + } + + /// Set the last seen line ending. 
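// --- Illustrative aside -----------------------------------------------------
// Interpolated segments may open a fresh literal inside the one being lexed,
// which is why `TextLexingState` above keeps a stack rather than a single
// literal. A minimal sketch of the begin/end nesting, with made-up types:

#[derive(Default, Debug)]
struct Literal { segments: Vec<String> }

#[derive(Default)]
struct TextStack { stack: Vec<Literal> }

impl TextStack {
    fn begin_literal(&mut self) -> &mut Literal {
        self.stack.push(Literal::default());
        self.stack.last_mut().unwrap()
    }
    fn end_literal(&mut self) -> Option<Literal> { self.stack.pop() }
    fn is_in_nested_text(&self) -> bool { self.stack.len() > 1 }
}

fn main() {
    let mut texts = TextStack::default();
    texts.begin_literal();  // the outer format literal
    texts.begin_literal();  // a literal opened inside an interpolation
    assert!(texts.is_in_nested_text());
    texts.end_literal();
    assert!(!texts.is_in_nested_text());
}
// ----------------------------------------------------------------------------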
+    pub fn push_line_ending(&mut self, line_ending:token::LineEnding) {
+        if let Some(current_mut) = self.current_mut() {
+            current_mut.record_line_ending(line_ending);
+            debug!(self.logger,"Push Line Ending: {line_ending:?}.");
+        }
+    }
+
+    /// Consume the last seen line ending.
+    pub fn pop_line_ending(&mut self) -> token::LineEnding {
+        if let Some(current_mut) = self.current_mut() {
+            let ending = current_mut.consume_line_ending();
+            debug!(self.logger,"Pop Line Ending: {ending:?}.");
+            ending
+        } else { token::LineEnding::None }
+    }
+
+    /// Append `token` to the currently-active line in the current literal.
+    pub fn append_segment(&mut self, token:Token) {
+        debug!(self.logger,"Append Token to Current Line: {&token:?}.");
+        if let Some(current_mut) = self.current_mut() {
+            current_mut.append_segment_to_line(token);
+        }
+    }
+
+    /// Consume the most-recently added token from the line.
+    pub fn consume_segment(&mut self) -> Option<Token> {
+        let result = self.current_mut().map(|t| t.consume_segment_from_line()).flatten();
+        debug!(self.logger,"Consume Segment: {result:?}.");
+        result
+    }
+
+    /// Append a line to the current text literal.
+    pub fn append_line(&mut self, line:Token) {
+        debug!(self.logger,"Append Line to Current Literal: {&line:?}.");
+        self.current_mut().for_each(|t| t.append_line(line));
+    }
+
+    /// Append an empty line to the current text literal with `offset` leading spaces.
+    pub fn append_empty_line(&mut self, offset:usize) {
+        let line_ending = self.pop_line_ending();
+        self.current_mut().iter_mut().for_each(|t| t.append_empty_line(offset,line_ending));
+        debug!(self.logger,"Append Empty Line: Line Ending = {line_ending:?}.");
+    }
+
+    /// Submit the current line into the current literal.
+    pub fn submit_current_line(&mut self) {
+        self.current_mut().for_each(|line| line.submit_current_line());
+    }
+
+    /// Begin a new text literal.
+    pub fn begin_literal(&mut self) -> &mut TextState {
+        debug!(self.logger,"Begin Text Literal.");
+        self.text_stack.push_and_get_mut(default())
+    }
+
+    /// End the current text literal.
+    pub fn end_literal(&mut self) -> Option<TextState> {
+        debug!(self.logger,"End Text Literal.");
+        self.text_stack.pop()
+    }
+
+    /// End the current text literal.
+    ///
+    /// # Panics
+    /// Panics if there is no literal to end.
+    pub fn unsafe_end_literal(&mut self) -> TextState {
+        self.end_literal().unwrap()
+    }
+
+    /// Check if the lexer is currently in a nested text literal.
+    pub fn is_in_nested_text(&self) -> bool {
+        self.text_stack.len() > 1
+    }
+}
+
+
+
+// =================
+// === TextState ===
+// =================
+
+/// The state for lexing a single text literal.
+///
+/// This type is responsible for tracking the particulars of a given text literal. This includes its
+/// positioning information (offset and indent), as well as the _type_ of literal it is, and any
+/// lines and/or segments that make up the literal.
+#[derive(Clone,Debug,Default,PartialEq)]
+pub struct TextState {
+    /// The offset of the literal from the token preceding it.
+    offset : usize,
+    /// The number of spaces used for the literal's indent in a block.
+    indent : usize,
+    /// The style of text literal being lexed.
+    style : Option<token::TextStyle>,
+    /// The line ending used to open a block literal.
+    starting_line_ending : token::LineEnding,
+    /// The lines that make up the current text token.
+    lines : Vec<Token>,
+    /// The unused empty lines that make up the current text token.
+    empty_lines : Vec<Token>,
+    /// The segments that make up the current line in the text literal.
+    segments : Vec<Token>,
+    /// A stack of line endings in the text literal.
+    explicit_line_endings : VecDeque<token::LineEnding>,
+}
+
+impl TextState {
+
+    /// Set the starting line ending of the literal to `line_ending`.
+    pub fn set_starting_line_ending(&mut self, line_ending:token::LineEnding) {
+        self.starting_line_ending = line_ending;
+    }
+
+    /// Append a line of text to the current literal.
+    pub fn append_line(&mut self, line:Token) {
+        if self.has_empty_lines() {
+            let lines = self.take_empty_lines();
+            lines.into_iter().for_each(|l| self.append_line(l));
+        }
+        self.lines.push(line);
+    }
+
+    /// Append an empty line with `offset` leading spaces to the current text literal.
+    pub fn append_empty_line(&mut self, offset:usize, line_ending:token::LineEnding) {
+        let line = Token::blank_line(offset,line_ending);
+        self.empty_lines.push(line);
+    }
+
+    /// Check if the current text state has unprocessed empty lines remaining.
+    pub fn has_empty_lines(&self) -> bool {
+        !self.empty_lines.is_empty()
+    }
+
+    /// Take the empty lines from the literal.
+    pub fn take_empty_lines(&mut self) -> Vec<Token> {
+        mem::take(&mut self.empty_lines)
+    }
+
+    /// Append a token to the current line of the text literal.
+    pub fn append_segment_to_line(&mut self, token:Token) {
+        if self.has_empty_lines() {
+            let lines = self.take_empty_lines();
+            lines.into_iter().for_each(|l| self.append_line(l));
+        }
+        self.segments.push(token);
+    }
+
+    /// Consume the last token from the currently active line.
+    pub fn consume_segment_from_line(&mut self) -> Option<Token> {
+        self.segments.pop()
+    }
+
+    /// Push a line ending onto the line ending stack.
+    pub fn record_line_ending(&mut self, line_ending:token::LineEnding) {
+        self.explicit_line_endings.push_back(line_ending);
+    }
+
+    /// Consume a line ending from the line ending stack.
+    pub fn consume_line_ending(&mut self) -> token::LineEnding {
+        self.explicit_line_endings.pop_front().unwrap_or(token::LineEnding::None)
+    }
+
+    /// Consume the current line of the text literal as a line.
+    pub fn consume_current_line(&mut self) -> Token {
+        let line_ending = self.consume_line_ending();
+        let tokens      = mem::take(&mut self.segments);
+        Token::line(tokens,0,line_ending)
+    }
+
+    /// Submit the current line in the literal.
+    pub fn submit_current_line(&mut self) {
+        let line = self.consume_current_line();
+        self.append_line(line);
+    }
+
+    /// Set the style of text literal being lexed to `style`.
+    pub fn set_style(&mut self, style:token::TextStyle) {
+        self.style = Some(style);
+    }
+
+    /// Reset the text lexing state.
+    pub fn reset(&mut self) {
+        *self = default();
+    }
+
+    /// Get the text style of the current literal, with the assumption that it's set.
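// --- Illustrative aside -----------------------------------------------------
// A sketch of the `empty_lines` buffering in `TextState` above: blank lines
// inside a text block are held back and only committed once more content
// follows, so a literal never ends in trailing blank lines. Model only.

#[derive(Default)]
struct TextLines { lines: Vec<String>, pending_blanks: Vec<String> }

impl TextLines {
    fn append_empty_line(&mut self) {
        self.pending_blanks.push(String::new());
    }
    fn append_line(&mut self, line: &str) {
        // Flush buffered blanks first: they precede the new content.
        self.lines.append(&mut self.pending_blanks);
        self.lines.push(line.to_string());
    }
}

fn main() {
    let mut text = TextLines::default();
    text.append_line("first");
    text.append_empty_line();   // held back for now
    text.append_line("second"); // commits the blank, then this line
    assert_eq!(text.lines.len(), 3);

    let mut tail = TextLines::default();
    tail.append_line("only");
    tail.append_empty_line();   // never followed by content: not committed
    assert_eq!(tail.lines.len(), 1);
}
// ----------------------------------------------------------------------------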
+    pub fn unsafe_get_style(&self) -> token::TextStyle {
+        self.style.expect("The text style must be set.")
+    }
 }
 
 
 // === Trait Impls ===
 
-impl Default for BlockState {
-    fn default() -> Self {
-        BlockState::new()
+impl From<TextState> for Token {
+    fn from(mut text:TextState) -> Self {
+        let style = text.style.expect("The literal style must be set when consuming the literal.");
+        if style.is_line_literal() {
+            let tokens = mem::take(&mut text.segments);
+            Token::text_line(style,tokens,text.offset)
+        } else if style.is_block_literal() {
+            if !text.segments.is_empty() {
+                let last_line = text.consume_current_line();
+                text.append_line(last_line);
+            }
+            let lines = mem::take(&mut text.lines);
+            Token::text_block(text.starting_line_ending,style,lines,text.indent,text.offset)
+        } else if style.is_inline_block_literal() {
+            let tokens = mem::take(&mut text.segments);
+            Token::text_inline_block(style,tokens,text.offset)
+        } else {
+            unreachable_panic!("The above cases should cover all styles.");
+        }
+    }
+}
+
+
+
+// ==========================
+// === CommentLexingState ===
+// ==========================
+
+/// The state for lexing comments in Enso.
+///
+/// As it is impossible to nest comments, this serves as a non-consumable state for lexing them. It
+/// tracks the information about the current comment being lexed.
+#[derive(Clone,Debug,PartialEq)]
+pub struct CommentLexingState {
+    /// The comment currently being lexed.
+    current_comment : CommentState,
+    /// A logger for the comment state.
+    logger : Logger
+}
+
+impl CommentLexingState {
+    /// Construct a new comment state with the provided `logger`.
+    pub fn new(logger:Logger) -> Self {
+        let current_comment = default();
+        Self{current_comment,logger}
+    }
+
+    /// Append `text` to the current comment line.
+    pub fn append_to_line(&mut self, text:String) {
+        debug!(self.logger,"Append to Line: {&text:?}.");
+        self.current_comment.append_to_line(text);
+    }
+
+    /// Submit the current line in the comment.
+    pub fn submit_line(&mut self, line_ending:token::LineEnding) {
+        debug!(self.logger,"Submit Line: Ending = {line_ending:?}.");
+        self.current_comment.submit_line(line_ending);
+    }
+
+    /// Submit a blank line in the comment.
+    pub fn submit_blank_line(&mut self, indent:usize, line_ending:token::LineEnding) {
+        debug!(self.logger,"Submit Blank Line: Ending = {line_ending:?}.");
+        self.current_comment.submit_blank_line(indent,line_ending);
+    }
+
+    /// Get a reference to the current comment line.
+    pub fn current_line(&self) -> &String {
+        &self.current_comment.current_line
+    }
+
+    /// Get a mutable reference to the current comment line.
+    pub fn current_line_mut(&mut self) -> &mut String {
+        &mut self.current_comment.current_line
+    }
+
+    /// Consume the current comment.
+    pub fn consume_current(&mut self) -> CommentState {
+        debug!(self.logger,"Consume Current Comment.");
+        mem::take(&mut self.current_comment)
+    }
+
+    /// Set the indent of the current comment.
+    pub fn set_indent(&mut self, indent:usize) {
+        debug!(self.logger,"Set Indent = {indent}.");
+        self.current_comment.indent = indent;
+    }
+
+    /// Set the offset of the current comment.
+    pub fn set_offset(&mut self, offset:usize) {
+        debug!(self.logger,"Set Offset = {offset}.");
+        self.current_comment.offset = offset;
+    }
+}
+
+
+
+// ====================
+// === CommentState ===
+// ====================
+
+/// The state for lexing a single comment.
+///
+/// This type tracks the particulars of any given comment, including the lines that make up the
+/// comment, as well as the indent and offset of it (the positioning information).
+#[derive(Clone,Default,Debug,PartialEq)]
+pub struct CommentState {
+    /// The lines that make up the comment.
+    lines : Vec<Token>,
+    /// A buffer of blank lines not yet appended.
+    blank_lines : Vec<Token>,
+    /// The current line being lexed in the comment.
+    current_line : String,
+    /// The indent of the comment.
+    indent : usize,
+    /// The offset of the comment.
+    offset : usize,
+}
+
+impl CommentState {
+
+    /// Append `text` to the current comment line.
+    pub fn append_to_line(&mut self, text:String) {
+        self.current_line.push_str(text.as_ref());
+    }
+
+    /// Submit the current line into the comment.
+    pub fn submit_line(&mut self, line_ending:token::LineEnding) {
+        if self.has_blank_lines() {
+            let blanks = mem::take(&mut self.blank_lines);
+            self.lines.extend(blanks);
+        }
+        let text       = self.consume_current_line();
+        let text_token = Token::text_segment_raw(text,0);
+        let line_token = Token::line(vec![text_token],0,line_ending);
+        self.lines.push(line_token);
+    }
+
+    /// Submit a blank line.
+    pub fn submit_blank_line(&mut self, offset:usize, line_ending:token::LineEnding) {
+        let line = Token::blank_line(offset,line_ending);
+        self.blank_lines.push(line);
+    }
+
+    /// Consume the current line.
+    fn consume_current_line(&mut self) -> String {
+        mem::take(&mut self.current_line)
+    }
+
+    /// Check if the comment has blank lines available.
+    fn has_blank_lines(&self) -> bool {
+        !self.blank_lines.is_empty()
+    }
+
+    /// Consume the blank lines from the comment.
+    pub fn consume_blank_lines(&mut self) -> Vec<Token> {
+        mem::take(&mut self.blank_lines)
+    }
+}
+
+
+// === Trait Impls ===
+
+impl From<CommentState> for Token {
+    fn from(mut comment:CommentState) -> Self {
+        if comment.lines.is_empty() {
+            Token::disable_comment(comment.current_line,comment.offset)
+        } else {
+            if !comment.current_line.is_empty() { comment.submit_line(token::LineEnding::None); }
+            Token::doc_comment(comment.lines,comment.indent,comment.offset)
+        }
+    }
+}
diff --git a/lib/rust/lexer/definition/src/lib.rs b/lib/rust/lexer/definition/src/lib.rs
index 57229735f9..303fc087ef 100644
--- a/lib/rust/lexer/definition/src/lib.rs
+++ b/lib/rust/lexer/definition/src/lib.rs
@@ -10,7 +10,10 @@
 
 //! This library defines the lexer for the syntax of the Enso language.
 
+pub mod escape;
+pub mod lexeme;
 pub mod lexer;
+pub mod rule;
 pub mod token;
 
 /// A module that can be re-exported under the same name in the generation crate.
@@ -19,11 +22,14 @@ pub mod token;
 /// Enso lexer definition. In this project, imports should _not_ be made from the crate root
 /// _except_ through use of this `library` module.
 pub mod library {
+    pub use crate::escape;
+    pub use crate::lexeme;
     pub use crate::token;
+    pub use crate::rules;
 }
 
 /// A collection of functionality for working with the lexer definition.
 pub mod prelude {
-    pub use flexer::prelude::*;
-    pub use flexer::prelude::logger::*;
+    pub use enso_flexer::prelude::*;
+    pub use enso_flexer::prelude::logger::*;
 }
diff --git a/lib/rust/lexer/definition/src/rule.rs b/lib/rust/lexer/definition/src/rule.rs
new file mode 100644
index 0000000000..a7968908d8
--- /dev/null
+++ b/lib/rust/lexer/definition/src/rule.rs
@@ -0,0 +1,26 @@
+//! This file contains a macro to simplify writing the lexer rules.
+
+
+
+// ===================
+// === Rules Macro ===
+// ===================
+
+/// Define a group of rules for the lexer.
+///
+/// All of the rules must be defined for the same `state_name`, which must be the in-scope name of
+/// the state for which the rules are being defined. Each `pattern` is a non-reference pattern that
+/// the rule is being defined to match, and `code` is the code that will be executed when the rule
+/// matches, omitting the (first) `reader` argument.
+///
+/// Branches are matched _in order_, from top-to-bottom, much like a standard `match` statement.
+///
+/// Please see `lexer.rs` for myriad examples of this macro's use.
+#[macro_export]
+macro_rules! rules {
+    ($state_name:ident with $($pattern:expr => $path_root:ident $(.$path:ident)* ($($arg:tt)*)),+ $(,)?) => {
+        $($state_name.create_rule(&$pattern,stringify!{
+            $path_root $(.$path)* (reader,$($arg)*)
+        });)*
+    };
+}
diff --git a/lib/rust/lexer/definition/src/token.rs b/lib/rust/lexer/definition/src/token.rs
index 768c6ccd66..eaf4568b73 100644
--- a/lib/rust/lexer/definition/src/token.rs
+++ b/lib/rust/lexer/definition/src/token.rs
@@ -6,6 +6,8 @@
 
 use crate::prelude::*;
 
+use crate::lexeme;
+
 
 
 // =============
@@ -24,6 +26,11 @@ pub struct Token {
 }
 
 impl Token {
+    /// Constructor.
+    pub fn new(shape:Shape, length:usize, offset:usize) -> Token {
+        Token{shape,length,offset}
+    }
+
     /// Get the length that the token takes up in the program source.
     pub fn source_length(&self) -> usize {
         self.length + self.offset
@@ -31,10 +38,9 @@ impl Token {
 }
 
 /// Constructors for the various forms of token.
-#[allow(non_snake_case)]
 impl Token {
     /// Construct a token representing a referent identifier.
-    pub fn Referent(name:impl Str, offset:usize) -> Token {
+    pub fn referent(name:impl Str, offset:usize) -> Token {
         let str    = name.into();
         let length = str.chars().count();
         let shape  = Shape::Referent(str);
@@ -42,7 +48,7 @@ impl Token {
     }
 
     /// Construct a token representing a variable identifier.
-    pub fn Variable(name:impl Str, offset:usize) -> Token {
+    pub fn variable(name:impl Str, offset:usize) -> Token {
         let str    = name.into();
         let length = str.chars().count();
         let shape  = Shape::Variable(str);
@@ -50,7 +56,7 @@ impl Token {
     }
 
     /// Construct a token representing an external identifier.
-    pub fn External(name:impl Str, offset:usize) -> Token {
+    pub fn external(name:impl Str, offset:usize) -> Token {
         let str    = name.into();
         let length = str.chars().count();
         let shape  = Shape::External(str);
@@ -58,61 +64,157 @@ impl Token {
     }
 
     /// Construct a token representing a blank identifier.
-    pub fn Blank(offset:usize) -> Token {
+    pub fn blank(offset:usize) -> Token {
         let shape  = Shape::Blank;
-        let length = 1;
+        let length = lexeme::len(lexeme::literal::BLANK_IDENT);
         Token{shape,length,offset}
     }
 
     /// Construct a token representing an operator.
-    pub fn Operator(name:impl Str, offset:usize) -> Token {
-        let str    = name.into();
-        let length = str.chars().count();
-        let shape  = Shape::Operator(str);
+    pub fn operator(name:impl Str, offset:usize) -> Token {
+        let name   = name.into();
+        let length = name.chars().count();
+        let shape  = Shape::Operator(name);
         Token{shape,length,offset}
     }
 
     /// Construct a token representing a modifier operator.
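// --- Illustrative aside -----------------------------------------------------
// What a `rules!` branch (the macro introduced in rule.rs above) expands to:
// each `pattern => self.handler(args)` pair becomes a `create_rule` call with
// the callback stringified and the implicit `reader` argument spliced in.
// The mock `Group` below stands in for the flexer's group API.

struct Group { rules: Vec<(String, String)> }

impl Group {
    fn create_rule<P: std::fmt::Display>(&mut self, pattern: &P, code: &str) {
        self.rules.push((pattern.to_string(), code.to_string()));
    }
}

macro_rules! rules {
    ($state:ident with $($pattern:expr => $root:ident $(.$path:ident)* ($($arg:tt)*)),+ $(,)?) => {
        $($state.create_rule(&$pattern, stringify!{
            $root $(.$path)* (reader, $($arg)*)
        });)*
    };
}

fn main() {
    let mut state = Group { rules: Vec::new() };
    let space = " "; // stands in for a real `Pattern`
    rules!(state with
        space => self.on_space(),
    );
    // The generated callback source names the implicit `reader` argument.
    assert!(state.rules[0].1.contains("reader"));
}
// ----------------------------------------------------------------------------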
-    pub fn Modifier(name:impl Str, offset:usize) -> Token {
-        let str    = name.into();
-        let length = str.chars().count() + 1;
-        let shape  = Shape::Modifier(str);
+    pub fn modifier(name:impl Str, offset:usize) -> Token {
+        let name         = name.into();
+        let modifier_len = lexeme::len(lexeme::literal::EQUALS);
+        let length       = name.chars().count() + modifier_len;
+        let shape        = Shape::Modifier(name);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an annotation.
+    pub fn annotation(name_str:impl Str, offset:usize) -> Token {
+        let name           = name_str.into();
+        let annotation_len = lexeme::len(lexeme::literal::ANNOTATION_SYMBOL);
+        let length         = name.chars().count() + annotation_len;
+        let shape          = Shape::Annotation(name);
         Token{shape,length,offset}
     }
 
     /// Construct a token representing a number literal.
-    pub fn Number(base:impl Str, num:impl Into<String>, offset:usize) -> Token {
-        let str      = num.into();
-        let base_str = base.into();
-        let length   = if base_str.is_empty() {
-            str.chars().count()
+    pub fn number(base:impl Str, num:impl Into<String>, offset:usize) -> Token {
+        let number = num.into();
+        let base   = base.into();
+        let length = if base.is_empty() {
+            number.chars().count()
         } else {
-            base_str.chars().count() + 1 + str.chars().count()
+            let base_sep_len = lexeme::len(lexeme::literal::NUMBER_BASE_SEPARATOR);
+            base.chars().count() + base_sep_len + number.chars().count()
         };
-        let shape = Shape::Number{base:base_str,number:str};
+        let shape = Shape::Number{base,number};
         Token{shape,length,offset}
     }
 
     /// Construct a token representing a dangling number base.
-    pub fn DanglingBase(base:impl Str, offset:usize) -> Token {
-        let base_str = base.into();
-        let length   = base_str.chars().count() + 1;
-        let shape    = Shape::DanglingBase(base_str);
+    pub fn dangling_base(base:impl Str, offset:usize) -> Token {
+        let base_str     = base.into();
+        let base_sep_len = lexeme::len(lexeme::literal::NUMBER_BASE_SEPARATOR);
+        let length       = base_str.chars().count() + base_sep_len;
+        let shape        = Shape::DanglingBase(base_str);
         Token{shape,length,offset}
     }
 
-    /// Construct a token representing a text literal.
-    pub fn Text(text:impl Str, offset:usize) -> Token {
-        let str    = text.into();
-        let length = str.chars().count();
-        let shape  = Shape::Text(str);
+    /// Construct a token representing a line of text.
+    pub fn text_line(style:TextStyle, segments:Vec<Token>, offset:usize) -> Token {
+        let segments_len:usize = segments.iter().map(|s| s.source_length()).sum();
+        let length             = style.length() + segments_len;
+        let shape              = Shape::TextLine{style,segments};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an inline block text literal.
+    pub fn text_inline_block
+    ( style    : TextStyle
+    , segments : Vec<Token>
+    , offset   : usize
+    ) -> Token {
+        let segments_length:usize = segments.iter().map(|s| s.source_length()).sum();
+        let length                = style.length() + segments_length;
+        let shape                 = Shape::TextInlineBlock{style,segments};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a block of text.
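// --- Illustrative aside -----------------------------------------------------
// The constructors above derive token lengths from the lexeme definitions
// (`lexeme::len(...)`) instead of hard-coded `+ 1` fudges, so `length` always
// tracks the characters actually present in the source. A sketch of the idea;
// the constant value below is an assumption for illustration.

mod lexeme {
    pub mod literal {
        pub const EQUALS: &str = "="; // assumed value of the modifier suffix
    }
    // What a helper like the real `lexeme::len` is for: the length of a
    // lexeme in `char`s.
    pub fn len(lexeme: &str) -> usize { lexeme.chars().count() }
}

fn modifier_length(name: &str) -> usize {
    // A modifier such as `+=` pays for its name plus the trailing `=` lexeme.
    name.chars().count() + lexeme::len(lexeme::literal::EQUALS)
}

fn main() {
    assert_eq!(modifier_length("+"), 2); // `+=` occupies two characters
}
// ----------------------------------------------------------------------------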
+
+    /// Construct a token representing a block of text.
+    pub fn text_block
+    ( start_line_ending : LineEnding
+    , style : TextStyle
+    , lines : Vec<Token>
+    , indent : usize
+    , offset : usize
+    ) -> Token {
+        let length = style.length() + start_line_ending.size() + lines.iter().fold(0, |l,r|
+            l + match r.shape {
+                Shape::Line {..} => indent + r.source_length(),
+                Shape::BlankLine(_) => r.source_length(),
+                _ => unreachable_panic!("Text blocks should only contain lines."),
+            }
+        );
+        let shape = Shape::TextBlock{start_line_ending,style,lines};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an invalid quote.
+    pub fn invalid_quote(bad_quotes:impl Str, offset:usize) -> Token {
+        let bad_string = bad_quotes.into();
+        let length = bad_string.chars().count();
+        let shape = Shape::InvalidQuote(bad_string);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a raw text segment.
+    pub fn text_segment_raw(str:impl Str, offset:usize) -> Token {
+        let string = str.into();
+        let length = string.chars().count();
+        let shape = Shape::TextSegmentRaw(string);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an escape sequence.
+    pub fn text_segment_escape(style:EscapeStyle, repr_str:impl Str, offset:usize) -> Token {
+        let repr = repr_str.into();
+        let length = style.size() + repr.chars().count();
+        let shape = Shape::TextSegmentEscape{style,repr};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an escape sequence using a literal `shape`.
+    pub fn text_segment_escape_from_shape(shape:Shape, offset:usize) -> Token {
+        match &shape {
+            Shape::TextSegmentEscape{style,repr} => {
+                let length = style.size() + repr.chars().count();
+                Token{shape,length,offset}
+            },
+            _ => unreachable_panic!("Shape must be a TextSegmentEscape.")
+        }
+    }
+
+    /// Construct a token representing an interpolated text segment.
+    pub fn text_segment_interpolate(tokens:Vec<Token>, offset:usize) -> Token {
+        let length_of_interpolation_ticks = 2;
+        let length =
+            length_of_interpolation_ticks + tokens.iter().fold(0,|l,r| l + r.source_length());
+        let shape = Shape::TextSegmentInterpolate{tokens};
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing an unclosed interpolated text segment.
+    pub fn text_segment_unclosed_interpolate(tokens:Vec<Token>, offset:usize) -> Token {
+        let length_of_interpolation_tick = 1;
+        let length =
+            length_of_interpolation_tick + tokens.iter().fold(0,|l,r| l + r.source_length());
+        let shape = Shape::TextSegmentUnclosedInterpolate{tokens};
         Token{shape,length,offset}
     }
 
     /// Construct a token representing a line of tokens.
-    pub fn Line(tokens:Vec<Token>, offset:usize, trailing_line_ending:LineEnding) -> Token {
+    pub fn line(tokens:Vec<Token>, offset:usize, trailing_line_ending:LineEnding) -> Token {
         let line_ending_len = trailing_line_ending.size();
-        let length = tokens.iter().fold(line_ending_len,|l,r| l + r.offset + r.length);
+        let length = tokens.iter().fold(line_ending_len,|l,r| l + r.source_length());
         let shape = Shape::Line{tokens,trailing_line_ending};
         Token{shape,length,offset}
     }
@@ -121,26 +223,25 @@ impl Token {
     ///
     /// The `offset` for blank lines is from the leftmost column, not from the parent block's
     /// indentation.
-    pub fn BlankLine(offset:usize, trailing_line_ending:LineEnding) -> Token {
+    pub fn blank_line(offset:usize, trailing_line_ending:LineEnding) -> Token {
         let length = trailing_line_ending.size();
         let shape = Shape::BlankLine(trailing_line_ending);
         Token{shape,length,offset}
     }
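A line's length likewise folds in the source length of every token plus the trailing line ending, so offsets round-trip exactly. A small sketch using the constructors above:

    // "a\r\n" as a line: one token of length 1, plus the two-character CRLF ending.
    let line = Token::line(vec![Token::variable("a", 0)], 0, LineEnding::CRLF);
    assert_eq!(line.source_length(), 3);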
 
     /// Construct a token representing a block.
-    pub fn Block
+    pub fn block
     ( block_type : BlockType
     , indent : usize
     , lines : Vec<Token>
     , offset : usize
     ) -> Token {
         let length = lines.iter().map(|line| {
-            let line_length = line.length;
-            let line_offset = line.offset;
             match line.shape {
-                Shape::Line{..} => indent + line_offset + line_length,
-                Shape::BlankLine(_) => line_offset + line_length,
-                _ => unreachable_panic!("Tokens in a blocks should always be lines."),
+                Shape::Line{..} => indent + line.source_length(),
+                Shape::BlankLine(_) => line.source_length(),
+                _ =>
+                    unreachable_panic!("Tokens in a block should always be lines."),
             }
         }).sum();
         let shape = Shape::Block{block_type,indent,lines};
@@ -148,18 +249,40 @@ impl Token {
     }
 
     /// Construct a token representing an invalid suffix.
-    pub fn InvalidSuffix(text:impl Str, offset:usize) -> Token {
-        let str = text.into();
-        let length = str.chars().count();
-        let shape = Shape::InvalidSuffix(str);
+    pub fn invalid_suffix(text:impl Str, offset:usize) -> Token {
+        let text = text.into();
+        let length = text.chars().count();
+        let shape = Shape::InvalidSuffix(text);
         Token{shape,length,offset}
     }
 
     /// Construct a token representing an unrecognised lexeme.
-    pub fn Unrecognized(text:impl Str, offset:usize) -> Token {
-        let str = text.into();
-        let length = str.chars().count();
-        let shape = Shape::Unrecognized(str);
+    pub fn unrecognized(text:impl Str, offset:usize) -> Token {
+        let text = text.into();
+        let length = text.chars().count();
+        let shape = Shape::Unrecognized(text);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a disable comment.
+    pub fn disable_comment(text:impl Str, offset:usize) -> Token {
+        let text = text.into();
+        let comment_len = lexeme::len(lexeme::literal::COMMENT);
+        let length = text.chars().count() + comment_len;
+        let shape = Shape::DisableComment(text);
+        Token{shape,length,offset}
+    }
+
+    /// Construct a token representing a documentation comment.
+    pub fn doc_comment(lines:Vec<Token>, indent:usize, offset:usize) -> Token {
+        let length = lines.iter().map(|line| {
+            match line.shape {
+                Shape::Line{..} => indent + line.source_length(),
+                Shape::BlankLine(_) => line.source_length(),
+                _ => unreachable_panic!("Tokens in a doc comment should always be lines."),
+            }
+        }).sum();
+        let shape = Shape::DocComment{lines,indent};
         Token{shape,length,offset}
     }
 }
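Blocks add their `indent` once per non-blank line on top of each line's own source length. A sketch with a four-space indent, mirroring the `block_length_*` tests further down:

    let lines = vec![
        Token::line(vec![Token::variable("b", 0)], 0, LineEnding::LF),   // 4 + 1 + 1
        Token::line(vec![Token::variable("c", 0)], 0, LineEnding::None), // 4 + 1
    ];
    let block = Token::block(BlockType::Continuous, 4, lines, 0);
    assert_eq!(block.source_length(), 11);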
 
@@ -179,9 +302,11 @@ pub enum BlockType {
     Discontinuous,
 }
 
-// ===================
-// === NewlineType ===
-// ===================
+
+
+// ==================
+// === LineEnding ===
+// ==================
 
 /// The type of newline associated with the line.
 #[derive(Copy,Clone,Debug,Display,PartialEq,Eq)]
@@ -195,12 +320,14 @@ pub enum LineEnding {
 }
 
 impl LineEnding {
+    const NO_LENGTH:usize = 0;
+
     /// Get the number of rust `char`s that the newline type takes up.
     pub fn size(self) -> usize {
         match self {
-            Self::None => 0,
-            Self::LF => 1,
-            Self::CRLF => 2,
+            Self::None => Self::NO_LENGTH,
+            Self::LF => lexeme::len(lexeme::literal::LF),
+            Self::CRLF => lexeme::len(lexeme::literal::CRLF),
         }
     }
 }
@@ -216,6 +343,128 @@ impl Default for LineEnding {
 
 
 
+// =================
+// === TextStyle ===
+// =================
+
+/// The style of the text literal.
+#[derive(Copy,Clone,Debug,Eq,PartialEq)]
+pub enum TextStyle {
+    // === Line ===
+
+    /// An interpolated text line literal.
+    FormatLine,
+    /// A raw text line literal.
+    RawLine,
+    /// An unclosed text line literal.
+    UnclosedLine,
+
+    // === Inline Block ===
+
+    /// A format inline block text literal.
+    FormatInlineBlock,
+    /// A raw inline block text literal.
+    RawInlineBlock,
+
+    // === Block ===
+
+    /// An interpolated text block literal.
+    FormatBlock,
+    /// A raw text block literal.
+    RawBlock,
+}
+
+impl TextStyle {
+    /// Calculate the length of the delimiters for a particular style of text literal.
+    pub fn length(self) -> usize {
+        match self {
+            TextStyle::FormatLine => lexeme::len(lexeme::literal::FORMAT_QUOTE) * 2,
+            TextStyle::RawLine => lexeme::len(lexeme::literal::RAW_QUOTE) * 2,
+            TextStyle::FormatInlineBlock => lexeme::len(lexeme::literal::FORMAT_BLOCK_QUOTE),
+            TextStyle::RawInlineBlock => lexeme::len(lexeme::literal::RAW_BLOCK_QUOTE),
+            TextStyle::UnclosedLine => lexeme::len(lexeme::literal::FORMAT_QUOTE),
+            TextStyle::FormatBlock => lexeme::len(lexeme::literal::FORMAT_BLOCK_QUOTE),
+            TextStyle::RawBlock => lexeme::len(lexeme::literal::RAW_BLOCK_QUOTE),
+        }
+    }
+
+    /// Check if the text literal is a line literal.
+    pub fn is_line_literal(self) -> bool {
+        match self {
+            TextStyle::RawLine => true,
+            TextStyle::FormatLine => true,
+            TextStyle::UnclosedLine => true,
+            _ => false,
+        }
+    }
+
+    /// Check if the text literal is an inline block literal.
+    pub fn is_inline_block_literal(self) -> bool {
+        match self {
+            TextStyle::FormatInlineBlock => true,
+            TextStyle::RawInlineBlock => true,
+            _ => false,
+        }
+    }
+
+    /// Check if the text literal is a block literal.
+    pub fn is_block_literal(self) -> bool {
+        match self {
+            TextStyle::FormatBlock => true,
+            TextStyle::RawBlock => true,
+            _ => false,
+        }
+    }
+}
+
+
+
+// ===================
+// === EscapeStyle ===
+// ===================
+
+/// A description of the style of escape sequence seen.
+#[derive(Clone,Copy,Debug,Eq,PartialEq)]
+pub enum EscapeStyle {
+    /// A \xNN-style byte escape.
+    Byte,
+    /// Unicode 16-bit escape sequence.
+    U16,
+    /// Unicode 21-bit escape sequence.
+    U21,
+    /// Unicode 32-bit escape sequence.
+    U32,
+    /// A literal escape character.
+    Literal,
+    /// An invalid unicode escape.
+    InvalidUnicode,
+    /// An invalid escape.
+    Invalid,
+    /// An escape slash without any following escape.
+    Unfinished,
+}
+impl EscapeStyle {
+    const NO_ADDITIONAL_LENGTH:usize = 0;
+
+    /// Get the length taken up in source by the delimiters to an escape type.
+    pub fn size(self) -> usize {
+        match self {
+            EscapeStyle::Byte => lexeme::len(lexeme::literal::BYTE_ESCAPE_START),
+            EscapeStyle::Literal => lexeme::len(lexeme::literal::SLASH),
+            EscapeStyle::U16 => lexeme::len(lexeme::literal::U16_ESCAPE_START),
+            EscapeStyle::U32 => lexeme::len(lexeme::literal::U32_ESCAPE_START),
+            EscapeStyle::U21 => {
+                let start_len = lexeme::len(lexeme::literal::U21_ESCAPE_START);
+                let end_len = lexeme::len(lexeme::literal::U21_ESCAPE_END);
+                start_len + end_len
+            }
+            _ => Self::NO_ADDITIONAL_LENGTH,
+        }
+    }
+}
+
+
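Both of these helpers delegate the delimiter widths to the `lexeme` module. Assuming the usual Enso quoting, with `FORMAT_QUOTE` being `'` and `FORMAT_BLOCK_QUOTE` being `'''` (the constants themselves are outside this diff), the arithmetic works out as:

    assert_eq!(TextStyle::FormatLine.length(), 2);   // opening and closing quote
    assert_eq!(TextStyle::UnclosedLine.length(), 1); // opening quote only
    assert_eq!(TextStyle::FormatBlock.length(), 3);  // the ''' introducer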
 // =============
 // === Shape ===
 // =============
 
@@ -241,25 +490,73 @@ pub enum Shape {
     Operator(String),
     /// A modifier identifier.
     Modifier(String),
+    /// An annotation.
+    Annotation(String),
 
     // === Literals ===
 
     /// A literal number.
-    Number{base:String, number:String},
+    Number {
+        /// The (optional) base for the number to be interpreted in.
+        base:String,
+        /// The number itself, possibly with a decimal point.
+        number:String
+    },
     /// A dangling base from a number literal.
     DanglingBase(String),
-    /// A text literal.
-    ///
-    /// This is currently way too simplistic to actually represent text, but it is a good
-    /// placeholder.
-    Text(String),
+    /// A text line literal.
+    TextLine {
+        /// The type of literal being encoded.
+        style : TextStyle,
+        /// The segments that make up the line of text.
+        segments : Vec<Token>,
+    },
+    /// An inline block text literal.
+    TextInlineBlock {
+        /// The type of literal being encoded.
+        style : TextStyle,
+        /// The segments that make up the line of text.
+        segments : Vec<Token>,
+    },
+    /// A text block literal.
+    TextBlock {
+        /// The line ending that occurs directly after the opening quote marks.
+        start_line_ending : LineEnding,
+        /// The type of literal being encoded.
+        style : TextStyle,
+        /// The lines in the text block literal.
+        lines : Vec<Token>
+    },
+    /// An invalid quote for a text literal.
+    InvalidQuote(String),
+    /// A segment of a line of text containing only literal text.
+    TextSegmentRaw(String),
+    /// A segment of a line of text that represents an escape sequence.
+    TextSegmentEscape {
+        /// The type of escape being represented.
+        style : EscapeStyle,
+        /// The literal escape sequence.
+        repr : String,
+    },
+    /// A segment of a line of text that contains an interpolated expression.
+    TextSegmentInterpolate {
+        /// The tokens making up the interpolated expression.
+        tokens : Vec<Token>,
+    },
+    /// An interpolated expression that hasn't been closed.
+    TextSegmentUnclosedInterpolate {
+        /// The tokens making up the interpolated expression.
+        tokens : Vec<Token>
+    },
+    /// An invalid text segment (e.g. unclosed interpolate segment).
+    TextSegmentInvalid(String),
 
 
     // === Lines ===
 
     /// A line containing tokens.
     ///
     /// The offset for a line is always zero, as it is contained in a block with a defined
     /// indentation.
-    Line{
+    Line {
         /// The tokens on the line.
         tokens : Vec<Token>,
         /// The line ending that _ends_ the line.
@@ -290,6 +587,17 @@ pub enum Shape {
     InvalidSuffix(String),
     /// An unrecognized token.
     Unrecognized(String),
+
+    // === Comments ===
+    /// A disable comment (`# ...`).
+    DisableComment(String),
+    /// An Enso documentation comment (`## ...`).
+    DocComment {
+        /// The lines in the doc comment body. Each line must contain raw text segments only.
+        lines : Vec<Token>,
+        /// The indentation of the doc comment's body from the baseline.
+        indent : usize
+    }
 }
 
 impl Shape {
@@ -326,9 +634,16 @@ impl Shape {
         Shape::Modifier(opr.into())
     }
 
+    /// Construct an annotation identifier.
+    pub fn annotation(name:impl Into<String>) -> Shape {
+        Shape::Annotation(name.into())
+    }
+
     /// Construct a number literal.
     pub fn number(base:impl Into<String>, num:impl Into<String>) -> Shape {
-        Shape::Number{base:base.into(),number:num.into()}
+        let base = base.into();
+        let number = num.into();
+        Shape::Number{base,number}
     }
 
     /// Construct a dangling base literal.
@@ -336,9 +651,50 @@ impl Shape {
         Shape::DanglingBase(base.into())
     }
 
-    /// Construct a text literal.
-    pub fn text(text:impl Into<String>) -> Shape {
-        Shape::Text(text.into())
+    /// Construct a text line literal.
+    pub fn text_line(style:TextStyle, segments:Vec<Token>) -> Shape {
+        Shape::TextLine{style,segments}
+    }
+
+    /// Construct an inline block text literal.
+    pub fn text_inline_block(style:TextStyle, segments:Vec<Token>) -> Shape {
+        Shape::TextInlineBlock{style,segments}
+    }
+
+    /// Construct a text block literal.
+    pub fn text_block(start_line_ending:LineEnding, style:TextStyle, lines:Vec<Token>) -> Shape {
+        Shape::TextBlock{start_line_ending,style,lines}
+    }
+
+    /// Construct an invalid quote literal.
+    pub fn invalid_quote(bad_quotes:impl Str) -> Shape {
+        Shape::InvalidQuote(bad_quotes.into())
+    }
+
+    /// Construct a raw text segment.
+    pub fn text_segment_raw(str:impl Str) -> Shape {
+        Shape::TextSegmentRaw(str.into())
+    }
+
+    /// Construct a text segment containing an escape sequence.
+    pub fn text_segment_escape(style:EscapeStyle, repr_str:impl Str) -> Shape {
+        let repr = repr_str.into();
+        Shape::TextSegmentEscape{style,repr}
+    }
+
+    /// Construct a text segment containing an interpolated expression.
+    pub fn text_segment_interpolate(tokens:Vec<Token>) -> Shape {
+        Shape::TextSegmentInterpolate{tokens}
+    }
+
+    /// Construct a text segment containing an unclosed interpolated expression.
+    pub fn text_segment_unclosed_interpolate(tokens:Vec<Token>) -> Shape {
+        Shape::TextSegmentUnclosedInterpolate{tokens}
+    }
+
+    /// Construct an invalid text segment.
+    pub fn text_segment_invalid(str:impl Str) -> Shape {
+        Shape::TextSegmentInvalid(str.into())
     }
 
     /// Construct a line that contains tokens.
@@ -365,6 +721,16 @@ impl Shape {
     pub fn unrecognized(text:impl Into<String>) -> Shape {
         Shape::Unrecognized(text.into())
     }
+
+    /// Construct a disable comment shape.
+    pub fn disable_comment(text:impl Str) -> Shape {
+        Shape::DisableComment(text.into())
+    }
+
+    /// Construct a doc comment shape.
+    pub fn doc_comment(lines:Vec<Token>, indent:usize) -> Shape {
+        Shape::DocComment{lines,indent}
+    }
 }
@@ -425,146 +791,3 @@ impl Into<Vec<Token>> for Stream {
         self.tokens
     }
 }
-
-
-
-// =============
-// === Tests ===
-// =============
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::token::BlockType;
-
-
-    // === Testing Utilities ===
-
-    /// Asserts that the `token` has the provided `shape`.
-    pub fn assert_shape(token:&Token, shape:Shape) {
-        assert_eq!(token.shape,shape);
-    }
-
-    /// Asserts that the `token` has the provided `length`.
-    pub fn assert_length(token:&Token, length:usize) {
-        assert_eq!(token.length,length)
-    }
-
-
-    // === Tests for Token Construction ===
-
-    #[test]
-    fn construct_referent_token() {
-        let token = Token::Referent("Some_Ref_Name",0);
-        assert_shape(&token,Shape::referent("Some_Ref_Name"));
-        assert_length(&token,13);
-    }
-
-    #[test]
-    fn construct_variable_token() {
-        let token = Token::Variable("some_variable_name",0);
-        assert_shape(&token,Shape::variable("some_variable_name"));
-        assert_length(&token,18);
-    }
-
-    #[test]
-    fn construct_external_name_token() {
-        let token = Token::External("camelCase",0);
-        assert_shape(&token,Shape::external("camelCase"));
-        assert_length(&token,9);
-    }
-
-    #[test]
-    fn construct_blank_token() {
-        let token = Token::Blank(0);
-        assert_shape(&token,Shape::blank());
-        assert_length(&token,1);
-    }
-
-    #[test]
-    fn construct_operator_token() {
-        let token = Token::Operator("==>",0);
-        assert_shape(&token,Shape::operator("==>"));
-        assert_length(&token,3);
-    }
-
-    #[test]
-    fn construct_modifier_token() {
-        let token = Token::Modifier("+",0);
-        assert_shape(&token,Shape::modifier("+"));
-        assert_length(&token,2);
-    }
-
-    #[test]
-    fn construct_number_token() {
-        let token = Token::Number("","1231",0);
-        assert_shape(&token,Shape::number("","1231"));
-        assert_length(&token,4);
-    }
-
-    #[test]
-    fn construct_dangling_base_token() {
-        let token = Token::DanglingBase("15",0);
-        assert_shape(&token,Shape::dangling_base("15"));
-        assert_length(&token,3);
-    }
-
-    #[test]
-    fn construct_text_token() {
-        let token = Token::Text("some prose goes here",0);
-        assert_shape(&token,Shape::text("some prose goes here"));
-        assert_length(&token,20);
-        // TODO [AA] Make this internally account for length of quotes.
- } - - #[test] - fn construct_line_token() { - let tokens = vec![Token::Variable("aa",0),Token::Referent("Abc",1)]; - let token = Token::Line(tokens.clone(), 4, LineEnding::LF); - assert_shape(&token,Shape::line(tokens.clone(), LineEnding::LF)); - assert_length(&token,7); - } - - #[test] - fn construct_blank_line_token() { - let token = Token::BlankLine(13,LineEnding::LF); - assert_shape(&token, Shape::blank_line(LineEnding::LF)); - assert_length(&token,1); - } - - #[test] - fn construct_block_token_lf() { - let lines = vec![ - Token::Line(vec![],0,LineEnding::LF), - Token::Line(vec![],4,LineEnding::LF) - ]; - let token = Token::Block(BlockType::Continuous,4,lines.clone(),0); - assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone())); - assert_length(&token,14); - } - - #[test] - fn construct_block_token_crlf() { - let lines = vec![ - Token::Line(vec![],0,LineEnding::CRLF), - Token::Line(vec![],4,LineEnding::CRLF) - ]; - let token = Token::Block(BlockType::Continuous,4,lines.clone(),0); - assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone())); - assert_length(&token,16); - } - - #[test] - fn construct_invalid_suffix_token() { - let token = Token::InvalidSuffix("aaa",0); - assert_shape(&token,Shape::invalid_suffix("aaa")); - assert_length(&token,3); - } - - #[test] - fn construct_unrecognized_token() { - let token = Token::Unrecognized("a",0); - assert_shape(&token,Shape::unrecognized("a")); - assert_length(&token,1); - } -} diff --git a/lib/rust/lexer/generation/Cargo.toml b/lib/rust/lexer/generation/Cargo.toml index 2dddfa86d2..e5a509c678 100644 --- a/lib/rust/lexer/generation/Cargo.toml +++ b/lib/rust/lexer/generation/Cargo.toml @@ -12,10 +12,17 @@ test = true bench = true [dependencies] -flexer = { path = "../../flexer", version = "0.1.0" } -enso-prelude = { version = "0.1.3" } +enso-flexer = { version = "0.1.3" } +enso-prelude = { version = "0.1.7" } lexer-definition = { path = "../definition", version = "0.1.0" } [build-dependencies] -flexer = { path = "../../flexer", version = "0.1.0" } +enso-flexer = { version = "0.1.3" } lexer-definition = { path = "../definition", version = "0.1.0" } + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "lexer_time_bench" +harness = false diff --git a/lib/rust/lexer/generation/benches/lexer_bench_sources.rs b/lib/rust/lexer/generation/benches/lexer_bench_sources.rs new file mode 100644 index 0000000000..411d829486 --- /dev/null +++ b/lib/rust/lexer/generation/benches/lexer_bench_sources.rs @@ -0,0 +1,337 @@ +//! This file contains the sources that are replicated many times over for the purposes of +//! benchmarking the Enso lexer. + +use criterion::{black_box, Criterion, Throughput}; +use enso_flexer::prelude::Reader; +use enso_flexer::prelude::reader::decoder::DecoderUTF8; +use lexer::generated::engine::EnsoLexer; +use std::time::Duration; + + + +// =============================== +// === Benchmark Configuration === +// =============================== + +/// Configures the benchmarking process. +pub fn bench_config() -> Criterion { + Criterion::default() + .measurement_time(Duration::from_secs(60)) + .warm_up_time(Duration::from_secs(3)) + .sample_size(25) + .retain_baseline("EnsoLexer".to_string()) +} + + + +// ======================= +// === Benchmark Setup === +// ======================= + +/// The sizes of text to run the benchmarks over. 
+pub const SIZES:[(usize,&str);4] = [
+    (1024 , "1KB" ),
+    (1024*100 , "100KB"),
+    (1024*1024 , "1MB" ),
+    (1024*1024*10 , "10MB" ),
+];
+
+
+
+// ==============================
+// === Benchmarking Utilities ===
+// ==============================
+
+/// Execute the provided benchmark for each of the [`SIZES`] above.
+pub fn run_bench_sizes(name:&str, input:&str, add_newline:bool, c:&mut Criterion) {
+    let mut group = c.benchmark_group(name);
+    SIZES.iter().for_each(|(size,size_name)| {
+        group.throughput(Throughput::Bytes(*size as u64));
+        let input = replicate_to_size(input,*size,add_newline);
+        group.bench_function(
+            *size_name,
+            |b| b.iter(|| {
+                let mut lexer = EnsoLexer::new();
+                let reader = Reader::new(input.as_str().as_bytes(),DecoderUTF8());
+                lexer.run(black_box(reader));
+            })
+        );
+    })
+}
+
+/// This function replicates `input` until it reaches `size` (in bytes).
+///
+/// If this cannot be done exactly, it will err on the side of over-replication,
+/// meaning that the output will be _larger_ than `size` bytes: the input, plus a
+/// trailing separator, is repeated `1 + size / input.len()` times (e.g. a 3-byte
+/// input grown to 10 bytes is repeated 4 times). If the input already exceeds
+/// `size`, it is still emitted once, with the separator appended.
+pub fn replicate_to_size(input:&str, size:usize, add_newline:bool) -> String {
+    let input_size = input.len();
+    let times = 1 + (size / input_size);
+    let mut input_newline = input.to_string();
+    let to_add = if add_newline { '\n' } else { ' ' };
+    input_newline.push(to_add);
+    input_newline.repeat(times)
+}
+
+/// Replace any windows-style line-endings in `input` with unix-style line-endings.
+fn preprocess(input:&str) -> String {
+    input.replace("\r\n","\n")
+}
+
+
+
+// ==============
+// === Macros ===
+// ==============
+
+#[macro_export]
+macro_rules! bench {
+    (bench_name = $bench_name:literal; fun_name = $fun_name:ident; bench_input = $bench_input:expr;) => {
+        pub fn $fun_name(c:&mut Criterion) {
+            src::run_bench_sizes(
+                $bench_name,
+                $bench_input.as_str(),
+                true,
+                c
+            )
+        }
+    }
+}
+
+
+
+// =================================
+// === Literal Benchmark Sources ===
+// =================================
+
+#[allow(missing_docs)]
+pub mod literal {
+    use super::*;
+
+    pub mod number {
+        use super::*;
+
+        pub fn integer() -> String {
+            preprocess("12345")
+        }
+
+        pub fn integer_explicit_base() -> String {
+            preprocess("16_a4fd31")
+        }
+
+        pub fn decimal() -> String {
+            preprocess("1.3141")
+        }
+
+        pub fn decimal_explicit_base() -> String {
+            preprocess("10_1.000999")
+        }
+
+        pub fn error_base() -> String {
+            preprocess("10.2_2")
+        }
+    }
+
+    pub mod text {
+        use super::*;
+
+        pub fn format_line() -> String {
+            preprocess(r"'dearest creature in \n creation studying english pronunciation'")
+        }
+
+        pub fn format_inline_block() -> String {
+            preprocess(r"''' An inline block. It's a very good inline block carl \u{AB}")
+        }
+
+        pub fn format_block() -> String {
+            preprocess(
+r#"''' Here is my block of format text. I can `interpolate + things` like that.
+    It goes on and on and on for `times` times because I feel like it.
+
+    Complex interpolated expression `x -> y ~> x | y` woo!
+"#)
+        }
+
+        pub fn raw_line() -> String {
+            preprocess(r#""dearest creature in '''' creation studying english pronunciation""#)
+        }
+
+        pub fn raw_inline_block() -> String {
+            preprocess(r#"""" An inline block. It's a very good inline block carl ""#)
+        }
+
+        pub fn raw_block() -> String {
+            preprocess(
+r#"""" Here is my block of raw text. `Interpolations` are nothing special here.
+    It goes on and on and on for I can escape \" though.
+
+    It also supports blank lines!
+"#) + } + } +} + + + +// ============================== +// === Name Benchmark Sources === +// ============================== + +#[allow(missing_docs)] +pub mod name { + use super::*; + + pub fn line_of() -> String { + preprocess( + "Referent_Ident var_ident JavaType _ @annotation ticked_ident' number_1" + ) + } + + pub fn invalid_suffix() -> String { + preprocess("some_var'iable some_varД") + } +} + + + +// =================================== +// === Operator Benchmarks Sources === +// =================================== + +#[allow(missing_docs)] +pub mod operator { + use super::*; + + pub fn line_of() -> String { + preprocess("+ - * -> ~> <~ <- ! & | /") + } + + pub fn dot_call() -> String { + preprocess(".== . != .<*> .*> .|>") + } + + pub fn invalid_suffix() -> String { + preprocess(".... +==") + } +} + + + +// ================================ +// === Block Benchmarks Sources === +// ================================ + +#[allow(missing_docs)] +pub mod block { + use super::*; + + pub fn top_level() -> String { + preprocess("foo\nbar\nbaz") + } + + pub fn nested() -> String { + preprocess("foo\nbar\n baz\n quux") + } + + pub fn deeply_nested() -> String { + preprocess( +r#"foo +bar + baz + quux + bim + bam + oh +no +"#) + } +} + + + +// =================================== +// === Comments Benchmarks Sources === +// =================================== + +#[allow(missing_docs)] +pub mod comment { + use super::*; + + pub fn line() -> String { + preprocess("# foo bar baz I have a really long line comment here that goes on and on") + } + + pub fn in_line() -> String { + preprocess("a + b # A useless comment: add a to b") + } + + pub fn doc() -> String { + preprocess( +r#"## I have a really big doc comment here + That just keeps prattling on and on and on. + + With blank lines + + Forever + + and + ever + + and + + + + + ever +documented +"#) + } +} + + + +// =========================== +// === Combined Benchmarks === +// =========================== + +pub mod combined { + use super::*; + + pub fn simple() -> String { + preprocess( +r#" +import Base.Meta + +## Decompose the value using runtime reflection and print its decomposition. +Main.print_decomp a b = + y = a + b + decomp = Meta.decompose y + Io.println decomp +"#) + } + + pub fn complex() -> String { + preprocess( +r#" +import Base.Meta + +## Frobnicate the doodads by constructing a new type operator through runtime reflection such that + it can be passed to another language. + + ! WARNING + Type-checking code like this is virtually impossible, and it is treated as `Dynamic` inside + Enso code. +Main.foo a b = + y = x -> z -> + ty = a.gen_type (~>) (<-) b + ty (z x) + decomp = Meta.decompose (y a b) + Io.println decomp + +## Execute the main function of this project. +main = + func = Meta.reify (here.foo "My_Name" "my_field") + Io.println(func) +"#) + } +} diff --git a/lib/rust/lexer/generation/benches/lexer_time_bench.rs b/lib/rust/lexer/generation/benches/lexer_time_bench.rs new file mode 100644 index 0000000000..ac1605c991 --- /dev/null +++ b/lib/rust/lexer/generation/benches/lexer_time_bench.rs @@ -0,0 +1,295 @@ +//! This file contains the time-based benchmarks for the Enso lexer. + +mod lexer_bench_sources; + +use criterion::{criterion_group, criterion_main, Criterion, Throughput, black_box}; +use lexer_bench_sources as src; + + + +// ========================== +// === Literal Benchmarks === +// ========================== + +bench! 
{ + bench_name = "Integer"; + fun_name = bench_literal_number_integer; + bench_input = src::literal::number::integer(); +} + +bench! { + bench_name = "Integer Explicit Base"; + fun_name = bench_literal_number_integer_explicit_base; + bench_input = src::literal::number::integer_explicit_base(); +} + +bench! { + bench_name = "Decimal"; + fun_name = bench_literal_number_decimal; + bench_input = src::literal::number::decimal(); +} + +bench! { + bench_name = "Decimal Explicit Base"; + fun_name = bench_literal_number_decimal_explicit_base; + bench_input = src::literal::number::decimal_explicit_base(); +} + +bench! { + bench_name = "Number Error Base"; + fun_name = bench_literal_number_error_base; + bench_input = src::literal::number::error_base(); +} + +bench! { + bench_name = "Text Format Line"; + fun_name = bench_literal_text_format_line; + bench_input = src::literal::text::format_line(); +} + +bench! { + bench_name = "Text Format Inline Block"; + fun_name = bench_literal_text_format_inline_block; + bench_input = src::literal::text::format_inline_block(); +} + +bench! { + bench_name = "Text Format Block"; + fun_name = bench_literal_text_format_block; + bench_input = src::literal::text::format_block(); +} + +bench! { + bench_name = "Text Raw Line"; + fun_name = bench_literal_text_raw_line; + bench_input = src::literal::text::raw_line(); +} + +bench! { + bench_name = "Text Raw Inline Block"; + fun_name = bench_literal_text_raw_inline_block; + bench_input = src::literal::text::raw_inline_block(); +} + +bench! { + bench_name = "Text Raw Block"; + fun_name = bench_literal_text_raw_block; + bench_input = src::literal::text::raw_block(); +} + +criterion_group!{ + name = literal_benchmarks; + config = src::bench_config(); + targets = + bench_literal_number_integer, + bench_literal_number_integer_explicit_base, + bench_literal_number_decimal, + bench_literal_number_decimal_explicit_base, + bench_literal_number_error_base, + bench_literal_text_format_line, + bench_literal_text_format_inline_block, + bench_literal_text_format_block, + bench_literal_text_raw_line, + bench_literal_text_raw_inline_block, + bench_literal_text_raw_block, +} + + + +// ======================== +// === Names Benchmarks === +// ======================== + +bench! { + bench_name = "Line of Names"; + fun_name = bench_names_line_of; + bench_input = src::name::line_of(); +} + +bench! { + bench_name = "Names with invalid Suffixes"; + fun_name = bench_names_invalid_suffix; + bench_input = src::name::invalid_suffix(); +} + +criterion_group! { + name = name_benchmarks; + config = src::bench_config(); + targets = + bench_names_line_of, + bench_names_invalid_suffix, +} + + + +// =========================== +// === Operator Benchmarks === +// =========================== + +bench! { + bench_name = "Line of Operators"; + fun_name = bench_operator_line_of; + bench_input = src::operator::line_of(); +} + +bench! { + bench_name = "Dot Call Operators"; + fun_name = bench_operator_dot_call; + bench_input = src::operator::dot_call(); +} + +bench! { + bench_name = "Operators with Invalid Suffixes"; + fun_name = bench_operator_invalid_suffix; + bench_input = src::operator::invalid_suffix(); +} + +criterion_group! { + name = operator_benchmarks; + config = src::bench_config(); + targets = + bench_operator_line_of, + bench_operator_dot_call, + bench_operator_invalid_suffix +} + + + +// ======================== +// === Block Benchmarks === +// ======================== + +bench! 
{ + bench_name = "Top Level Block"; + fun_name = bench_block_top_level; + bench_input = src::block::top_level(); +} + +bench! { + bench_name = "Nested Block"; + fun_name = bench_block_nested; + bench_input = src::block::nested(); +} + +bench! { + bench_name = "Deeply Nested Blocks"; + fun_name = bench_block_deeply_nested; + bench_input = src::block::deeply_nested(); +} + +criterion_group! { + name = block_benchmarks; + config = src::bench_config(); + targets = + bench_block_top_level, + bench_block_nested, + bench_block_deeply_nested, +} + + + +// ========================== +// === Comment Benchmarks === +// ========================== + +bench! { + bench_name = "Line Comment"; + fun_name = bench_comment_line; + bench_input = src::comment::line(); +} + +bench! { + bench_name = "Comment in Line"; + fun_name = bench_comment_in_line; + bench_input = src::comment::in_line(); +} + +bench! { + bench_name = "Doc Comment"; + fun_name = bench_comment_doc; + bench_input = src::comment::doc(); +} + +criterion_group! { + name = comment_benchmarks; + config = src::bench_config(); + targets = + bench_comment_line, + bench_comment_in_line, + bench_comment_doc, +} + + + +// =========================== +// === Combined Benchmarks === +// =========================== + +bench! { + bench_name = "Simple Combined Example"; + fun_name = bench_combined_simple; + bench_input = src::combined::simple(); +} + +bench! { + bench_name = "Complex Combined Example"; + fun_name = bench_combined_complex; + bench_input = src::combined::complex(); +} + +criterion_group! { + name = combined_benchmarks; + config = src::bench_config(); + targets = + bench_combined_simple, + bench_combined_complex, +} + + + +// =================== +// === Comparisons === +// =================== + +fn bench_rust_reader(c:&mut Criterion) { + let mut group = c.benchmark_group("Rust Vector"); + src::SIZES.iter().for_each(|(size,name)| { + group.throughput(Throughput::Bytes(*size as u64)); + let input = "abcdefghijklmnopqrstuvwxyz".repeat(1 + size / 26); + group.bench_function( + *name, + |b| b.iter(|| { + let mut counter = 0usize; + for c in black_box(input.as_str()).chars() { + if c == 'f' { + counter += 1; + } + } + counter + }) + ); + }) +} + +criterion_group! 
{
+    name = rust_comparison,
+    config = src::bench_config(),
+    targets =
+        bench_rust_reader,
+}
+
+
+
+// ===================
+// === The Harness ===
+// ===================
+
+criterion_main!(
+    literal_benchmarks,
+    name_benchmarks,
+    operator_benchmarks,
+    block_benchmarks,
+    comment_benchmarks,
+    combined_benchmarks,
+    rust_comparison,
+);
diff --git a/lib/rust/lexer/generation/build.rs b/lib/rust/lexer/generation/build.rs
index f909095126..aea553c69d 100644
--- a/lib/rust/lexer/generation/build.rs
+++ b/lib/rust/lexer/generation/build.rs
@@ -1,8 +1,8 @@
 use std::fs::File;
 use lexer_definition::lexer::EnsoLexer;
 use std::io::prelude::*;
-use flexer::Definition;
-use flexer::State;
+use enso_flexer::Definition;
+use enso_flexer::State;
 
 
 
@@ -23,6 +23,7 @@ fn generate_engine() -> std::io::Result<()> {
     let engine = lexer.specialize().unwrap();
     lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition.");
     file.write_all(contents.as_bytes()).expect("Unable to write lexer definition.");
+    file.write_all("\n".as_bytes())?;
     file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization.");
     Ok(())
 }
diff --git a/lib/rust/lexer/generation/src/lib.rs b/lib/rust/lexer/generation/src/lib.rs
index fedbc12bae..157cc805fb 100644
--- a/lib/rust/lexer/generation/src/lib.rs
+++ b/lib/rust/lexer/generation/src/lib.rs
@@ -19,6 +19,7 @@ mod library {
     pub use lexer_definition::library::*;
 }
 
+
 /// A library of commonly useful functionality.
 mod prelude {
     pub use lexer_definition::prelude::*;
diff --git a/lib/rust/lexer/generation/tests/enso_lexer.rs b/lib/rust/lexer/generation/tests/enso_lexer.rs
deleted file mode 100644
index caf6eed33f..0000000000
--- a/lib/rust/lexer/generation/tests/enso_lexer.rs
+++ /dev/null
@@ -1,759 +0,0 @@
-#![feature(test)]
-#![deny(unconditional_recursion)]
-#![warn(missing_copy_implementations)]
-#![warn(missing_debug_implementations)]
-#![warn(missing_docs)]
-#![warn(trivial_casts)]
-#![warn(trivial_numeric_casts)]
-#![warn(unsafe_code)]
-#![warn(unused_import_braces)]
-
-//! This file contains tests for the Enso Lexer.
-
-// TODO [AA] Tests for error scenarios once it's done.
-
-use flexer::*;
-use lexer_definition::library::*;
-
-use flexer::prelude::reader::decoder::DecoderUTF8;
-use flexer::prelude::Reader;
-use lexer::generated::engine::EnsoLexer;
-use lexer_definition::library::token::Token;
-use lexer_definition::token::BlockType;
-use lexer_definition::token::LineEnding;
-
-
-
-// =================
-// === Utilities ===
-// =================
-
-/// Assert that `result` is a success with tokens `expected`.
-fn assert_succeeds_as(result:&LexingResult<token::Stream>, expected:token::Stream) {
-    match result.kind {
-        ResultKind::Success => assert_eq!(result.tokens,expected),
-        _ => panic!("Lexing failed.")
-    }
-}
-
-/// Assert that the provided input lexes as `expected`.
-fn assert_lexes(input:impl AsRef<str>, expected:token::Stream) {
-    let input_len = input.as_ref().chars().count();
-    let result = lex(input);
-    assert_succeeds_as(&result,expected);
-    let tokens_vec : Vec<_> = result.tokens.into();
-    let total_length : usize = tokens_vec.iter().map(|token| token.offset + token.length).sum();
-    assert_eq!(total_length,input_len);
-}
-
-/// Lex the provided string.
-fn lex(input:impl AsRef<str>) -> LexingResult<token::Stream> {
-    let mut lexer = EnsoLexer::new();
-    let reader = Reader::new(input.as_ref().as_bytes(),DecoderUTF8());
-    lexer.run(reader)
-}
-
-/// Asserts that the input is a block and has a length equal to `length`.
-fn assert_block_has_length(input:impl AsRef<str>, expected_length:usize) {
-    let result = lex(input);
-    match result.kind {
-        ResultKind::Success => {
-            let tokens = result.tokens.tokens();
-            match tokens.first().expect("Token should be present.") {
-                Token{shape:token::Shape::Block{..},length,..} =>
-                    assert_eq!(*length,expected_length),
-                _ => panic!("Token not a block."),
-            }
-        },
-        _ => panic!("Lexing failed"),
-    }
-}
-
-/// Makes the test text have unix line endings to ensure consistency regardless of git checkout
-/// style.
-fn make_unix_line_endings(input:&str) -> String {
-    let string = String::from(input);
-    string.chars().filter(|c| *c != '\r').collect()
-}
-
-
-
-// =================
-// === Operators ===
-// =================
-
-#[test]
-fn function_operator() {
-    let input = "->";
-    let expected = token::Stream::from(vec![Token::Operator("->",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn bind_operator() {
-    let input = "<-";
-    let expected = token::Stream::from(vec![Token::Operator("<-",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn left_pipe_operator() {
-    let input = "<|";
-    let expected = token::Stream::from(vec![Token::Operator("<|",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn right_pipe_operator() {
-    let input = "|>";
-    let expected = token::Stream::from(vec![Token::Operator("|>",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn eq_operator() {
-    let input = "=";
-    let expected = token::Stream::from(vec![Token::Operator("=",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn eq_compare_operator() {
-    let input = "==";
-    let expected = token::Stream::from(vec![Token::Operator("==",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn geq_operator() {
-    let input = ">=";
-    let expected = token::Stream::from(vec![Token::Operator(">=",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn neq_operator() {
-    let input = "!=";
-    let expected = token::Stream::from(vec![Token::Operator("!=",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn dot_operator() {
-    let input = ".";
-    let expected = token::Stream::from(vec![Token::Operator(".",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn comma_operator() {
-    let input = ",";
-    let expected = token::Stream::from(vec![Token::Operator(",",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn double_dot_operator() {
-    let input = "..";
-    let expected = token::Stream::from(vec![Token::Operator("..",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn triple_dot_operator() {
-    let input = "...";
-    let expected = token::Stream::from(vec![Token::Operator("...",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn error_operator() {
-    let input = "!";
-    let expected = token::Stream::from(vec![Token::Operator("!",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn type_ascription_operator() {
-    let input = ":";
-    let expected = token::Stream::from(vec![Token::Operator(":",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn in_operator() {
-    let input = "in";
-    let expected = token::Stream::from(vec![Token::Operator("in",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn typeset_union_operator() {
-    let input = "|";
-    let expected = token::Stream::from(vec![Token::Operator("|",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn typeset_intersection_operator() {
-    let input = "&";
-    let expected = token::Stream::from(vec![Token::Operator("&",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn typeset_subtraction_operator() {
-    let input = "\\";
-    let expected = token::Stream::from(vec![Token::Operator("\\",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn disable_comment() {
-    let input = "#";
-    let expected = token::Stream::from(vec![Token::Operator("#",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn doc_comment() {
-    let input = "##";
-    let expected = token::Stream::from(vec![Token::Operator("##",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn arbitrary_left_operator() {
-    let input = "<!!-";
-    let expected = token::Stream::from(vec![Token::Operator("<!!-",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn arbitrary_right_operator() {
-    let input = "-->>";
-    let expected = token::Stream::from(vec![Token::Operator("-->>",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn modifier_plus() {
-    let input = "+=";
-    let expected = token::Stream::from(vec![Token::Modifier("+",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn modifier_minus() {
-    let input = "-=";
-    let expected = token::Stream::from(vec![Token::Modifier("-",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn arbitrary_modifier() {
-    let input = "<%=";
-    let expected = token::Stream::from(vec![Token::Modifier("<%",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn invalid_eq_suffix() {
-    let input = "===";
-    let expected = token::Stream::from(vec![Token::Operator("==",0),Token::InvalidSuffix("=",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn invalid_dots_suffix() {
-    let input = "....";
-    let expected = token::Stream::from(vec![Token::Operator("...",0),Token::InvalidSuffix(".",0)]);
-    assert_lexes(input,expected);
-}
-
-#[test]
-fn invalid_modifier_suffix() {
-    let input = "+==";
-    let expected = token::Stream::from(vec![Token::Operator("+",0),Token::InvalidSuffix("==",0)]);
-    assert_lexes(input,expected);
-}
-
-
-
-// ===================
-// === Identifiers ===
-// ===================
-
-#[test]
-fn variable_ident() {
-    let input = "some_variable_name";
-    let expected = token::Stream::from(vec![Token::Variable("some_variable_name",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn referent_ident() {
-    let input = "Some_Referent_Name";
-    let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn external_ident() {
-    let input = "__camelCaseIdentifier";
-    let expected = token::Stream::from(vec![Token::External("__camelCaseIdentifier",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn blank_ident() {
-    let input = "_";
-    let expected = token::Stream::from(vec![Token::Blank(0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn ticked_variable_ident() {
-    let input = "some_variable_name'";
-    let expected = token::Stream::from(vec![Token::Variable("some_variable_name'",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn ticked_referent_ident() {
-    let input = "Some_Referent_Name'";
-    let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn multi_ticked_variable_ident() {
-    let input = "some_variable_name'''";
-    let expected = token::Stream::from(vec![Token::Variable("some_variable_name'''",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn multi_ticked_referent_ident() {
-    let input = "Some_Referent_Name'''";
-    let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'''",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn variable_with_numbers() {
-    let input = "some0_1";
-    let expected = token::Stream::from(vec![Token::Variable("some0_1",0)]);
-    assert_lexes(input,expected)
-}
-
-#[test]
-fn referent_with_numbers() {
-    let input = "Some_1821";
-    let expected = token::Stream::from(vec![Token::Referent("Some_1821",0)]);
-
assert_lexes(input,expected) -} - -#[test] -fn tick_not_at_end_variable() { - let input = "some_var'iable"; - let expected = token::Stream::from(vec![ - Token::Variable("some_var'",0), - Token::InvalidSuffix("iable",0), - ]); - assert_lexes(input,expected) -} - -#[test] -fn trailing_underscore() { - let input = "some_var_"; - let expected = token::Stream::from(vec![Token::External("some_var_",0)]); - assert_lexes(input,expected) -} - -#[test] -fn trailing_underscore_with_tick() { - let input = "some_var_'"; - let expected = token::Stream::from(vec![Token::External("some_var_'",0)]); - assert_lexes(input,expected) -} - -#[test] -fn invalid_suffix() { - let input = "some_varД"; - let expected = token::Stream::from(vec![ - Token::Variable("some_var",0), - Token::InvalidSuffix("Д",0), - ]); - assert_lexes(input,expected) -} - -#[test] -fn unrecognized_token() { - let input = "some_var`"; - let expected = token::Stream::from(vec![ - Token::Variable("some_var",0), - Token::Unrecognized("`",0), - ]); - assert_lexes(input,expected) -} - -#[test] -fn chained_identifiers() { - let input = "my_func A' someJavaValue some_python_value"; - let expected = token::Stream::from(vec![ - Token::Variable("my_func",0), - Token::Referent("A'",1), - Token::External("someJavaValue",1), - Token::Variable("some_python_value",1), - ]); - assert_lexes(input,expected) -} - - - -// =============== -// === Numbers === -// =============== - -#[test] -fn integer() { - let input = "13831"; - let expected = token::Stream::from(vec![Token::Number("","13831",0)]); - assert_lexes(input,expected); -} - -#[test] -fn integer_with_explicit_base() { - let input = "10_13831"; - let expected = token::Stream::from(vec![Token::Number("10","13831",0)]); - assert_lexes(input,expected); -} - -#[test] -fn dangling_base() { - let input = "10_"; - let expected = token::Stream::from(vec![Token::DanglingBase("10",0)]); - assert_lexes(input,expected); -} - -#[test] -fn hex_number() { - let input = "16_ff"; - let expected = token::Stream::from(vec![Token::Number("16","ff",0)]); - assert_lexes(input,expected); -} - -#[test] -fn decimal() { - let input = "2.71828"; - let expected = token::Stream::from(vec![Token::Number("","2.71828",0)]); - assert_lexes(input,expected); -} - -#[test] -fn decimal_with_explicit_base() { - let input = "10_2.71828"; - let expected = token::Stream::from(vec![Token::Number("10","2.71828",0)]); - assert_lexes(input,expected); -} - -#[test] -fn error_base() { - let input = "10.2_2"; - let expected = token::Stream::from(vec![ - Token::Number("","10.2",0), - Token::InvalidSuffix("_2",0), - ]); - assert_lexes(input,expected); -} - -#[test] -fn offset_number() { - let input = " 10.2"; - let expected = token::Stream::from(vec![ - Token::Number("","10.2",4), - ]); - assert_lexes(input,expected); -} - - - -// ============ -// === Text === -// ============ - - - -// ============== -// === Blocks === -// ============== - -#[test] -fn block_function_call() { - let input = make_unix_line_endings( -r#"f - argument_1 - argument_2 - fn a1 a2 a3 - argument_4 - argument_5"#); - let block_fn_args = - Token::Block( - BlockType::Continuous, - 4, - vec![ - Token::Line( - vec![Token::Variable("argument_1",0)], - 0, - LineEnding::LF - ), - Token::Line( - vec![ - Token::Variable("argument_2",0), - ], - 0, - LineEnding::LF - ), - Token::Line( - vec![ - Token::Variable("fn",0), - Token::Variable("a1",1), - Token::Variable("a2",1), - Token::Variable("a3",1), - ], - 0, - LineEnding::LF - ), - Token::Line( - vec![ - Token::Variable("argument_4",0), - 
], - 0, - LineEnding::LF - ), - Token::Line( - vec![ - Token::Variable("argument_5",0), - ], - 0, - LineEnding::None - ), - ], - 0 - ); - let top_level_first_line = Token::Line( - vec![ - Token::Variable("f",0), - block_fn_args - ], - 0, - LineEnding::LF - ); - let top_level_block = token::Stream::from(vec![ - Token::Block( - BlockType::Continuous, - 0, - vec![top_level_first_line], - 0 - ) - ]); - assert_lexes(input,top_level_block); -} - - -#[test] -fn block_empty_lines() { - let input = "f\r\n a\n\n b\n"; - let nested_block = Token::Block( - BlockType::Continuous, - 4, - vec![ - Token::Line(vec![Token::Variable("a",0)],0,LineEnding::LF), - Token::BlankLine(0,LineEnding::LF), - Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF), - ], - 0 - ); - let top_line = Token::Line( - vec![ - Token::Variable("f",0), - nested_block - ], - 0, - LineEnding::CRLF - ); - let expected = token::Stream::from(vec![ - Token::Block( - BlockType::Continuous, - 0, - vec![top_line], - 0 - ) - ]); - assert_lexes(input,expected); -} - -#[test] -fn block_top_level() { - let input = make_unix_line_endings( -r#" - -foo -bar -baz -"#); - let expected = token::Stream::from(vec![ - Token::Block( - BlockType::Continuous, - 0, - vec![ - Token::BlankLine(0,LineEnding::LF), - Token::BlankLine(0,LineEnding::LF), - Token::Line(vec![Token::Variable("foo",0)],0,LineEnding::LF), - Token::Line(vec![Token::Variable("bar",0)],0,LineEnding::LF), - Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF), - ], - 0 - ) - ]); - assert_lexes(input,expected); -} - -#[test] -fn block_with_operator() { - let input = make_unix_line_endings( -r#"x -> - foo x 1 -"#); - let nested_block = Token::Block( - BlockType::Discontinuous, - 4, - vec![ - Token::Line(vec![ - Token::Variable("foo",0), - Token::Variable("x",1), - Token::Number("","1",1), - ], 0, LineEnding::LF) - ], - 0 - ); - let expected = token::Stream::from(vec![ - Token::Block( - BlockType::Continuous, - 0, - vec![ - Token::Line(vec![ - Token::Variable("x",0), - Token::Operator("->",1), - nested_block - ], 0, LineEnding::LF) - ], - 0 - ) - ]); - assert_lexes(input,expected); -} - -#[test] -fn block_with_nesting() { - let input = make_unix_line_endings(r#" -some_long_thing - foo -> - Bar - baz - - quux -"#); - let function_block = Token::Block( - BlockType::Discontinuous, - 8, - vec![ - Token::Line(vec![Token::Referent("Bar",0)],0,LineEnding::LF), - Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF), - Token::BlankLine(0,LineEnding::LF), - ], - 0 - ); - let foo_block = Token::Block( - BlockType::Continuous, - 4, - vec![ - Token::Line(vec![ - Token::Variable("foo",0), - Token::Operator("->",1), - function_block, - ], 0, LineEnding::LF), - Token::Line(vec![Token::Variable("quux",0)],0,LineEnding::LF), - ], - 0 - ); - let expected = token::Stream::from(vec![ - Token::Block( - BlockType::Continuous, - 0, - vec![ - Token::BlankLine(0,LineEnding::LF), - Token::Line(vec![ - Token::Variable("some_long_thing",0), - foo_block - ], 0, LineEnding::LF), - ], - 0 - ) - ]); - assert_lexes(input,expected); -} - -#[test] -fn block_extra_indented_blank_lines() { - let input = "a\n b\n \n \n c"; - let indented_block = Token::Block( - BlockType::Continuous, - 4, - vec![ - Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF), - Token::BlankLine(8,LineEnding::LF), - Token::BlankLine(2,LineEnding::LF), - Token::Line(vec![Token::Variable("c",0)],0,LineEnding::None), - ], - 0 - ); - let top_level_line = Token::Line(vec![ - Token::Variable("a",0), - indented_block - 
],0,LineEnding::LF); - let expected = token::Stream::from(vec![ - Token::Block( - BlockType::Continuous, - 0, - vec![top_level_line], - 0 - ) - ]); - assert_lexes(input,expected); -} - -#[test] -fn block_length_unix() { - let input = "a\n b\n c"; - assert_block_has_length(input,13); -} - -#[test] -fn block_length_windows() { - let input = "a\r\n b\r\n c"; - assert_block_has_length(input,15); -} - -#[test] -fn block_length_mixed() { - let input = "a\r\n b\n c\n d"; - assert_block_has_length(input,20); -} - - - -// ================ -// === Combined === -// ================ diff --git a/lib/rust/lexer/generation/tests/enso_lexer_blocks.rs b/lib/rust/lexer/generation/tests/enso_lexer_blocks.rs new file mode 100644 index 0000000000..e1a2f514e0 --- /dev/null +++ b/lib/rust/lexer/generation/tests/enso_lexer_blocks.rs @@ -0,0 +1,337 @@ +#![feature(test)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] + +//! This file contains tests for lexing blocks in the Enso lexer. + +mod test_utils; + +use lexer_definition::library::*; +use test_utils::*; + +use lexer_definition::library::token::Token; +use lexer_definition::token::BlockType; +use lexer_definition::token::LineEnding; + + + +// ============== +// === Blocks === +// ============== + +#[test] +fn function_call() { + let input = make_unix_line_endings( +r#"f + argument_1 + argument_2 + fn a1 a2 a3 + argument_4 + argument_5"#); + let block_fn_args = + Token::block( + BlockType::Continuous, + 4, + vec![ + Token::line( + vec![Token::variable("argument_1", 0)], + 0, + LineEnding::LF + ), + Token::line( + vec![ + Token::variable("argument_2", 0), + ], + 0, + LineEnding::LF + ), + Token::line( + vec![ + Token::variable("fn", 0), + Token::variable("a1", 1), + Token::variable("a2", 1), + Token::variable("a3", 1), + ], + 0, + LineEnding::LF + ), + Token::line( + vec![ + Token::variable("argument_4", 0), + ], + 0, + LineEnding::LF + ), + Token::line( + vec![ + Token::variable("argument_5", 0), + ], + 0, + LineEnding::None + ), + ], + 0 + ); + let top_level_first_line = Token::line( + vec![ + Token::variable("f", 0), + block_fn_args + ], + 0, + LineEnding::LF + ); + let top_level_block = token::Stream::from(vec![ + Token::block( + BlockType::Continuous, + 0, + vec![top_level_first_line], + 0 + ) + ]); + assert_lexes(input,top_level_block); +} + + +#[test] +fn empty_lines() { + let input = "f\r\n a\n\n b\n"; + let nested_block = Token::block( + BlockType::Continuous, + 4, + vec![ + Token::line(vec![Token::variable("a", 0)], 0, LineEnding::LF), + Token::blank_line(0, LineEnding::LF), + Token::line(vec![Token::variable("b", 0)], 0, LineEnding::LF), + ], + 0 + ); + let top_line = Token::line( + vec![ + Token::variable("f", 0), + nested_block + ], + 0, + LineEnding::CRLF + ); + let expected = token::Stream::from(vec![ + Token::block( + BlockType::Continuous, + 0, + vec![top_line], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn top_level() { + let input = make_unix_line_endings( +r#" + +foo +bar +baz +"#); + let expected = token::Stream::from(vec![ + Token::block( + BlockType::Continuous, + 0, + vec![ + Token::blank_line(0, LineEnding::LF), + Token::blank_line(0, LineEnding::LF), + Token::line(vec![Token::variable("foo", 0)], 0, LineEnding::LF), + Token::line(vec![Token::variable("bar", 0)], 0, LineEnding::LF), + 
Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn with_operator() { + let input = make_unix_line_endings( +r#"x -> + foo x 1 +"#); + let nested_block = Token::block( + BlockType::Discontinuous, + 4, + vec![ + Token::line(vec![ + Token::variable("foo", 0), + Token::variable("x", 1), + Token::number("", "1", 1), + ], 0, LineEnding::LF) + ], + 0 + ); + let expected = token::Stream::from(vec![ + Token::block( + BlockType::Continuous, + 0, + vec![ + Token::line(vec![ + Token::variable("x", 0), + Token::operator("->", 1), + nested_block + ], 0, LineEnding::LF) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn with_nesting() { + let input = make_unix_line_endings(r#" +some_long_thing + foo -> + Bar + baz + + quux +"#); + let function_block = Token::block( + BlockType::Discontinuous, + 8, + vec![ + Token::line(vec![Token::referent("Bar", 0)], 0, LineEnding::LF), + Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF), + Token::blank_line(0, LineEnding::LF), + ], + 0 + ); + let foo_block = Token::block( + BlockType::Continuous, + 4, + vec![ + Token::line(vec![ + Token::variable("foo", 0), + Token::operator("->", 1), + function_block, + ], 0, LineEnding::LF), + Token::line(vec![Token::variable("quux", 0)], 0, LineEnding::LF), + ], + 0 + ); + let expected = token::Stream::from(vec![ + Token::block( + BlockType::Continuous, + 0, + vec![ + Token::blank_line(0, LineEnding::LF), + Token::line(vec![ + Token::variable("some_long_thing", 0), + foo_block + ], 0, LineEnding::LF), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn multiple_dedent() { + let input = make_unix_line_endings(r#" +some_long_thing + foo -> + Bar + baz +quux +"#); + let function_block = Token::block( + BlockType::Discontinuous, + 8, + vec![ + Token::line(vec![Token::referent("Bar", 0)], 0, LineEnding::LF), + Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF), + ], + 0 + ); + let foo_block = Token::block( + BlockType::Continuous, + 4, + vec![ + Token::line(vec![ + Token::variable("foo", 0), + Token::operator("->", 1), + function_block, + ], 0, LineEnding::LF), + ], + 0 + ); + let expected = token::Stream::from(vec![ + Token::block( + BlockType::Continuous, + 0, + vec![ + Token::blank_line(0, LineEnding::LF), + Token::line(vec![ + Token::variable("some_long_thing", 0), + foo_block + ], 0, LineEnding::LF), + Token::line(vec![Token::variable("quux", 0)], 0, LineEnding::LF), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn extra_indented_blank_lines() { + let input = "a\n b\n \n \n c"; + let indented_block = Token::block( + BlockType::Continuous, + 4, + vec![ + Token::line(vec![Token::variable("b", 0)], 0, LineEnding::LF), + Token::blank_line(8, LineEnding::LF), + Token::blank_line(2, LineEnding::LF), + Token::line(vec![Token::variable("c", 0)], 0, LineEnding::None), + ], + 0 + ); + let top_level_line = Token::line(vec![ + Token::variable("a", 0), + indented_block + ], 0, LineEnding::LF); + let expected = token::Stream::from(vec![ + Token::block( + BlockType::Continuous, + 0, + vec![top_level_line], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn length_unix() { + let input = "a\n b\n c"; + assert_block_has_length(input,13); +} + +#[test] +fn length_windows() { + let input = "a\r\n b\r\n c"; + assert_block_has_length(input,15); +} + +#[test] +fn length_mixed() { + let input = "a\r\n b\n c\n d"; + assert_block_has_length(input,20); +} diff --git 
a/lib/rust/lexer/generation/tests/enso_lexer_combined.rs b/lib/rust/lexer/generation/tests/enso_lexer_combined.rs new file mode 100644 index 0000000000..f1b626842a --- /dev/null +++ b/lib/rust/lexer/generation/tests/enso_lexer_combined.rs @@ -0,0 +1,660 @@ +#![feature(test)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] + +//! This file contains tests for lexing full-on Enso with the lexer. + +mod test_utils; + +use lexer_definition::library::*; +use test_utils::*; + +use lexer_definition::library::token::Token; + + + +// ================ +// === Combined === +// ================ + +#[test] +fn method_definition() { + let input = make_unix_line_endings( +r#"## Traverse the heterogeneous list, applying the provided polymorphic function + wherever it matches. +@Tail_Call +map : forall ts ts' => (this : H_List ts) -> (exists a b . a ~> b) -> H_List ts' +map this fn -> case this.types of + Cons x xs -> + x' = fn x + x.Cons (map xs) + x -> fn x +"#); + let doc_comment = Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![ + Token::text_segment_raw( + "Traverse the heterogeneous list, applying the provided polymorphic \ + function", + 0 + ) + ], + 0, + token::LineEnding::LF, + ), + Token::line( + vec![Token::text_segment_raw("wherever it matches.", 0)], + 0, + token::LineEnding::LF + ) + ], + 4, + 0 + ), + ], + 0, + token::LineEnding::None, + ); + let annotation = Token::line( + vec![Token::annotation("Tail_Call", 0)], + 0, + token::LineEnding::LF, + ); + let signature = Token::line( + vec![ + Token::variable("map", 0), + Token::operator(":", 1), + Token::variable("forall", 1), + Token::variable("ts", 1), + Token::variable("ts'", 1), + Token::operator("=>", 1), + Token::operator("(", 1), + Token::variable("this", 0), + Token::operator(":", 1), + Token::referent("H_List", 1), + Token::variable("ts", 1), + Token::operator(")", 0), + Token::operator("->", 1), + Token::operator("(", 1), + Token::variable("exists", 0), + Token::variable("a", 1), + Token::variable("b", 1), + Token::operator(".", 1), + Token::variable("a", 1), + Token::operator("~>", 1), + Token::variable("b", 1), + Token::operator(")", 0), + Token::operator("->", 1), + Token::referent("H_List", 1), + Token::variable("ts'", 1), + ], + 0, + token::LineEnding::LF + ); + let cons_branch_body = Token::block( + token::BlockType::Discontinuous, + 8, + vec![ + Token::line( + vec![ + Token::variable("x'", 0), + Token::operator("=", 1), + Token::variable("fn", 1), + Token::variable("x", 1), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::variable("x", 0), + Token::operator(".", 0), + Token::referent("Cons", 0), + Token::operator("(", 1), + Token::variable("map", 0), + Token::variable("xs", 1), + Token::operator(")", 0), + ], + 0, + token::LineEnding::LF + ), + ], + 0 + ); + let case_body = Token::block( + token::BlockType::Continuous, + 4, + vec![ + Token::line( + vec![ + Token::referent("Cons", 0), + Token::variable("x", 1), + Token::variable("xs", 1), + Token::operator("->", 1), + cons_branch_body, + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::variable("x", 0), + Token::operator("->", 1), + Token::variable("fn", 1), + Token::variable("x", 1) + ], + 0, + token::LineEnding::LF, + ) + ], + 0 + ); + let function = Token::line( + vec![ + Token::variable("map", 0), + 
Token::variable("this", 1), + Token::variable("fn", 1), + Token::operator("->", 1), + Token::variable("case", 1), + Token::variable("this", 1), + Token::operator(".", 0), + Token::variable("types", 0), + Token::variable("of", 1), + case_body, + ], + 0, + token::LineEnding::LF + ); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![doc_comment,annotation,signature,function], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn complex_type() { + let input = make_unix_line_endings( +r#" +type Maybe a + type Just item:a + Nothing + + is_just = case this of + Just _ -> True + Nothing -> False +"#); + let case_block = Token::block( + token::BlockType::Continuous, + 8, + vec![ + Token::line( + vec![ + Token::referent("Just", 0), + Token::blank(1), + Token::operator("->", 2), + Token::referent("True", 1), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::referent("Nothing", 0), + Token::operator("->", 1), + Token::referent("False", 1) + ], + 0, + token::LineEnding::LF + ), + ], + 0 + ); + let type_body = Token::block( + token::BlockType::Continuous, + 4, + vec![ + Token::line( + vec![ + Token::variable("type", 0), + Token::referent("Just", 1), + Token::variable("item", 1), + Token::operator(":", 0), + Token::variable("a", 0), + ], + 0, + token::LineEnding::LF + ), + Token::line(vec![Token::referent("Nothing", 0)], 0, token::LineEnding::LF), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::variable("is_just", 0), + Token::operator("=", 1), + Token::variable("case", 1), + Token::variable("this", 1), + Token::variable("of", 1), + case_block, + ], + 0, + token::LineEnding::LF + ) + ], + 0 + ); + let complex_type = Token::line( + vec![ + Token::variable("type", 0), + Token::referent("Maybe", 1), + Token::variable("a", 1), + type_body, + ], + 0, + token::LineEnding::LF + ); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::blank_line(0, token::LineEnding::LF), + complex_type + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn imports_exports() { + let input = make_unix_line_endings( +r#"import Base.List +import Base.Number.Extensions +from Builtins import Unit, Number, Integer, Any, True, False + +from Builtins export all + +from Base.List export Nil, Cons +from Base.Number.Extensions export all hiding Math + +polyglot java import com.ibm.icu.text.BreakIterator +polyglot java import org.enso.base.Text_Utils +"#); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::variable("import", 0), + Token::referent("Base", 1), + Token::operator(".", 0), + Token::referent("List", 0), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::variable("import", 0), + Token::referent("Base", 1), + Token::operator(".", 0), + Token::referent("Number", 0), + Token::operator(".", 0), + Token::referent("Extensions", 0), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::variable("from", 0), + Token::referent("Builtins", 1), + Token::variable("import", 1), + Token::referent("Unit", 1), + Token::operator(",", 0), + Token::referent("Number", 1), + Token::operator(",", 0), + Token::referent("Integer", 1), + Token::operator(",", 0), + Token::referent("Any", 1), + Token::operator(",", 0), + Token::referent("True", 1), + Token::operator(",", 0), + Token::referent("False", 1), + ], + 0, + token::LineEnding::LF + ), + 
Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::variable("from", 0), + Token::referent("Builtins", 1), + Token::variable("export", 1), + Token::variable("all", 1), + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::variable("from", 0), + Token::referent("Base", 1), + Token::operator(".", 0), + Token::referent("List", 0), + Token::variable("export", 1), + Token::referent("Nil", 1), + Token::operator(",", 0), + Token::referent("Cons", 1), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::variable("from", 0), + Token::referent("Base", 1), + Token::operator(".", 0), + Token::referent("Number", 0), + Token::operator(".", 0), + Token::referent("Extensions", 0), + Token::variable("export", 1), + Token::variable("all", 1), + Token::variable("hiding", 1), + Token::referent("Math", 1), + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::variable("polyglot", 0), + Token::variable("java", 1), + Token::variable("import", 1), + Token::variable("com", 1), + Token::operator(".", 0), + Token::variable("ibm", 0), + Token::operator(".", 0), + Token::variable("icu", 0), + Token::operator(".", 0), + Token::variable("text", 0), + Token::operator(".", 0), + Token::external("BreakIterator", 0), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::variable("polyglot", 0), + Token::variable("java", 1), + Token::variable("import", 1), + Token::variable("org", 1), + Token::operator(".", 0), + Token::variable("enso", 0), + Token::operator(".", 0), + Token::variable("base", 0), + Token::operator(".", 0), + Token::referent("Text_Utils", 0), + ], + 0, + token::LineEnding::LF + ), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn some_stdlib() { + let input = make_unix_line_endings( +r#"from Base import all + +## The top-level entry point for a test suite. 
+type Suite specs + +## PRIVATE +type Spec name behaviors + +## PRIVATE +type Behavior name result + +## PRIVATE +Behavior.is_fail = this.result.is_fail + +## PRIVATE +Spec.is_fail = this.behaviors.any is_fail + +## PRIVATE +Suite.is_fail = this.specs.any is_fail +"#); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::variable("from", 0), + Token::referent("Base", 1), + Token::variable("import", 1), + Token::variable("all", 1), + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![ + Token::text_segment_raw( + "The top-level entry point for a test suite.", + 0 + ), + ], + 0, + token::LineEnding::LF, + ) + ], + 3, + 0 + ) + ], + 0, + token::LineEnding::None + ), + Token::line( + vec![ + Token::variable("type", 0), + Token::referent("Suite", 1), + Token::variable("specs", 1), + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("PRIVATE", 0),], + 0, + token::LineEnding::LF, + ) + ], + 3, + 0 + ) + ], + 0, + token::LineEnding::None + ), + Token::line( + vec![ + Token::variable("type", 0), + Token::referent("Spec", 1), + Token::variable("name", 1), + Token::variable("behaviors", 1) + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("PRIVATE", 0),], + 0, + token::LineEnding::LF, + ) + ], + 3, + 0 + ) + ], + 0, + token::LineEnding::None + ), + Token::line( + vec![ + Token::variable("type", 0), + Token::referent("Behavior", 1), + Token::variable("name", 1), + Token::variable("result", 1) + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("PRIVATE", 0),], + 0, + token::LineEnding::LF, + ) + ], + 3, + 0 + ) + ], + 0, + token::LineEnding::None + ), + Token::line( + vec![ + Token::referent("Behavior", 0), + Token::operator(".", 0), + Token::variable("is_fail", 0), + Token::operator("=", 1), + Token::variable("this", 1), + Token::operator(".", 0), + Token::variable("result", 0), + Token::operator(".", 0), + Token::variable("is_fail", 0), + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("PRIVATE", 0),], + 0, + token::LineEnding::LF, + ) + ], + 3, + 0 + ) + ], + 0, + token::LineEnding::None + ), + Token::line( + vec![ + Token::referent("Spec", 0), + Token::operator(".", 0), + Token::variable("is_fail", 0), + Token::operator("=", 1), + Token::variable("this", 1), + Token::operator(".", 0), + Token::variable("behaviors", 0), + Token::operator(".", 0), + Token::variable("any", 0), + Token::variable("is_fail", 1) + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("PRIVATE", 0),], + 0, + token::LineEnding::LF, + ) + ], + 3, + 0 + ) + ], + 0, + token::LineEnding::None + ), + Token::line( + vec![ + Token::referent("Suite", 0), + Token::operator(".", 0), + Token::variable("is_fail", 0), + Token::operator("=", 1), + Token::variable("this", 1), + Token::operator(".", 
0), + Token::variable("specs", 0), + Token::operator(".", 0), + Token::variable("any", 0), + Token::variable("is_fail", 1) + ], + 0, + token::LineEnding::LF + ), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} diff --git a/lib/rust/lexer/generation/tests/enso_lexer_comments.rs b/lib/rust/lexer/generation/tests/enso_lexer_comments.rs new file mode 100644 index 0000000000..4f99ffd49f --- /dev/null +++ b/lib/rust/lexer/generation/tests/enso_lexer_comments.rs @@ -0,0 +1,318 @@ +#![feature(test)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] + +//! This file contains tests for lexing comments in the Enso lexer. + +mod test_utils; + +use lexer_definition::library::*; +use test_utils::*; + +use lexer_definition::library::token::Token; + + + +// ================ +// === Comments === +// ================ + +#[test] +fn disable_eof() { + let input = "# Here is a nice long comment string."; + let expected = token::Stream::from(vec![ + Token::disable_comment(" Here is a nice long comment string.", 0) + ]); + assert_lexes(input,expected); +} + +#[test] +fn disable_lf() { + let input = "# Here is a nice long comment string.\n"; + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![Token::disable_comment(" Here is a nice long comment string.", 0)], + 0, + token::LineEnding::LF + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn disable_crlf() { + let input = "# Here is a nice long comment string.\r\n"; + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![Token::disable_comment(" Here is a nice long comment string.", 0)], + 0, + token::LineEnding::CRLF + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn disable_in_line() { + let input = "a + b <*> N # Compare the frobnicators."; + let expected = token::Stream::from(vec![ + Token::variable("a", 0), + Token::operator("+", 1), + Token::variable("b", 1), + Token::operator("<*>", 1), + Token::referent("N", 1), + Token::disable_comment(" Compare the frobnicators.", 1), + ]); + assert_lexes(input,expected) +} + +#[test] +fn disable_in_interpolate() { + let input = "'String `1 + 1 # add` stuff.'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("String ", 0), + Token::text_segment_interpolate( + vec![ + Token::number("", "1", 0), + Token::operator("+", 1), + Token::number("", "1", 1), + Token::unrecognized("#", 1), + Token::variable("add", 1) + ], + 0 + ), + Token::text_segment_raw(" stuff.", 0), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn doc_single_line_eof() { + let input = "## Foo bar baz"; + let expected = token::Stream::from(vec![ + Token::doc_comment( + vec![ + Token::line(vec![Token::text_segment_raw("Foo bar baz", 0)], 0, token::LineEnding::None) + ], + 3, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn doc_single_line_lf() { + let input = "## Foo bar baz\n"; + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("Foo bar baz", 0)], + 0, + token::LineEnding::LF + ) + ], + 3, + 0 + ) + ], + 0, + 
token::LineEnding::None + ), + Token::blank_line(0, token::LineEnding::None), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn doc_single_line_crlf() { + let input = "## Foo bar baz\r\n"; + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("Foo bar baz", 0)], + 0, + token::LineEnding::CRLF + ) + ], + 3, + 0 + ) + ], + 0, + token::LineEnding::None + ), + Token::blank_line(0, token::LineEnding::None), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn doc_in_interpolate() { + let input = "'String `1 + 1 ## add` stuff.'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("String ", 0), + Token::text_segment_interpolate( + vec![ + Token::number("", "1", 0), + Token::operator("+", 1), + Token::number("", "1", 1), + Token::unrecognized("##", 1), + Token::variable("add", 1) + ], + 0 + ), + Token::text_segment_raw(" stuff.", 0), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn doc_multi_line() { + let input = make_unix_line_endings( +r#"## Here is a doc comment. + It spans multiple lines. + Some are indented much further. + And this is okay. + + It keeps going, even with blank lines. + Until the indentation decreases back. + +trailing_blanks_not_part_of_comment"#); + let doc_comment = Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("Here is a doc comment.", 0)], + 0, + token::LineEnding::LF + ), + Token::line( + vec![Token::text_segment_raw("It spans multiple lines.", 0)], + 0, + token::LineEnding::LF + ), + Token::line( + vec![Token::text_segment_raw(" Some are indented much further.", 0)], + 0, + token::LineEnding::LF + ), + Token::line( + vec![Token::text_segment_raw(" And this is okay.", 0)], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![Token::text_segment_raw("It keeps going, even with blank lines.", 0)], + 0, + token::LineEnding::LF + ), + Token::line( + vec![Token::text_segment_raw("Until the indentation decreases back.", 0)], + 0, + token::LineEnding::LF + ), + ], + 4, + 0 + ); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line(vec![doc_comment], 0, token::LineEnding::None), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![Token::variable("trailing_blanks_not_part_of_comment", 0)], + 0, + token::LineEnding::None + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn doc_mixed_line_endings() { + let input = "## Start a doc comment\n It has indent 3.\r\n \n An indented blank too."; + let expected = token::Stream::from(vec![ + Token::doc_comment( + vec![ + Token::line( + vec![Token::text_segment_raw("Start a doc comment", 0)], + 0, + token::LineEnding::LF + ), + Token::line( + vec![Token::text_segment_raw("It has indent 3.", 0)], + 0, + token::LineEnding::CRLF + ), + Token::blank_line(4, token::LineEnding::LF), + Token::line( + vec![Token::text_segment_raw(" An indented blank too.", 0)], + 0, + token::LineEnding::None + ) + ], + 3, + 0 + ) + ]); + assert_lexes(input,expected); +} diff --git a/lib/rust/lexer/generation/tests/enso_lexer_identifiers.rs b/lib/rust/lexer/generation/tests/enso_lexer_identifiers.rs new file mode 100644 index 0000000000..c7ef127017 --- /dev/null +++ 
b/lib/rust/lexer/generation/tests/enso_lexer_identifiers.rs @@ -0,0 +1,178 @@ +#![feature(test)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] + +//! This file contains tests for lexing identifiers in the Enso lexer. + +mod test_utils; + +use lexer_definition::library::*; +use test_utils::*; + +use lexer_definition::library::token::Token; + + + +// =================== +// === Identifiers === +// =================== + +#[test] +fn variable_ident() { + let input = "some_variable_name"; + let expected = token::Stream::from(vec![Token::variable("some_variable_name", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn referent_ident() { + let input = "Some_Referent_Name"; + let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn external_ident() { + let input = "__camelCaseIdentifier"; + let expected = token::Stream::from(vec![Token::external("__camelCaseIdentifier", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn blank_ident() { + let input = "_"; + let expected = token::Stream::from(vec![Token::blank(0)]); + assert_lexes(input,expected) +} + +#[test] +fn annotation() { + let input = "@debug"; + let expected = token::Stream::from(vec![Token::annotation("debug", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn ticked_variable_ident() { + let input = "some_variable_name'"; + let expected = token::Stream::from(vec![Token::variable("some_variable_name'", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn ticked_referent_ident() { + let input = "Some_Referent_Name'"; + let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name'", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn ticked_annotation() { + let input = "@debug'"; + let expected = token::Stream::from(vec![Token::annotation("debug'", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn multi_ticked_variable_ident() { + let input = "some_variable_name'''"; + let expected = token::Stream::from(vec![Token::variable("some_variable_name'''", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn multi_ticked_referent_ident() { + let input = "Some_Referent_Name'''"; + let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name'''", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn multi_ticked_annotation() { + let input = "@debug''"; + let expected = token::Stream::from(vec![Token::annotation("debug''", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn variable_with_numbers() { + let input = "some0_1"; + let expected = token::Stream::from(vec![Token::variable("some0_1", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn referent_with_numbers() { + let input = "Some_1821"; + let expected = token::Stream::from(vec![Token::referent("Some_1821", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn annotation_with_numbers() { + let input = "@debug_1"; + let expected = token::Stream::from(vec![Token::annotation("debug_1", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn tick_not_at_end_variable() { + let input = "some_var'iable"; + let expected = token::Stream::from(vec![ + Token::variable("some_var'", 0), + Token::invalid_suffix("iable", 0), + ]); + assert_lexes(input,expected) +} + +#[test] +fn trailing_underscore() { + let input = "some_var_"; + let expected = 
token::Stream::from(vec![Token::external("some_var_", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn trailing_underscore_with_tick() { + let input = "some_var_'"; + let expected = token::Stream::from(vec![Token::external("some_var_'", 0)]); + assert_lexes(input,expected) +} + +#[test] +fn invalid_suffix() { + let input = "some_varД"; + let expected = token::Stream::from(vec![ + Token::variable("some_var", 0), + Token::invalid_suffix("Д", 0), + ]); + assert_lexes(input,expected) +} + +#[test] +fn unrecognized_token() { + let input = "some_var@"; + let expected = token::Stream::from(vec![ + Token::variable("some_var", 0), + Token::unrecognized("@", 0), + ]); + assert_lexes(input,expected) +} + +#[test] +fn chained_identifiers() { + let input = "my_func A' someJavaValue some_python_value"; + let expected = token::Stream::from(vec![ + Token::variable("my_func", 0), + Token::referent("A'", 1), + Token::external("someJavaValue", 1), + Token::variable("some_python_value", 1), + ]); + assert_lexes(input,expected) +} diff --git a/lib/rust/lexer/generation/tests/enso_lexer_number_literals.rs b/lib/rust/lexer/generation/tests/enso_lexer_number_literals.rs new file mode 100644 index 0000000000..20e47e51c9 --- /dev/null +++ b/lib/rust/lexer/generation/tests/enso_lexer_number_literals.rs @@ -0,0 +1,85 @@ +#![feature(test)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] + +//! This file contains tests for lexing number literals in the Enso lexer. + +mod test_utils; + +use lexer_definition::library::*; +use test_utils::*; + +use lexer_definition::library::token::Token; + + + +// =============== +// === Numbers === +// =============== + +#[test] +fn integer() { + let input = "13831"; + let expected = token::Stream::from(vec![Token::number("", "13831", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn integer_with_explicit_base() { + let input = "10_13831"; + let expected = token::Stream::from(vec![Token::number("10", "13831", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn dangling_base() { + let input = "10_"; + let expected = token::Stream::from(vec![Token::dangling_base("10", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn hex_number() { + let input = "16_ff"; + let expected = token::Stream::from(vec![Token::number("16", "ff", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn decimal() { + let input = "2.71828"; + let expected = token::Stream::from(vec![Token::number("", "2.71828", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn decimal_with_explicit_base() { + let input = "10_2.71828"; + let expected = token::Stream::from(vec![Token::number("10", "2.71828", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn error_base() { + let input = "10.2_2"; + let expected = token::Stream::from(vec![ + Token::number("", "10.2", 0), + Token::invalid_suffix("_2", 0), + ]); + assert_lexes(input,expected); +} + +#[test] +fn offset_number() { + let input = " 10.2"; + let expected = token::Stream::from(vec![ + Token::number("", "10.2", 4), + ]); + assert_lexes(input,expected); +} diff --git a/lib/rust/lexer/generation/tests/enso_lexer_operators.rs b/lib/rust/lexer/generation/tests/enso_lexer_operators.rs new file mode 100644 index 0000000000..00cfadadb1 --- /dev/null +++ b/lib/rust/lexer/generation/tests/enso_lexer_operators.rs @@ -0,0 +1,230 @@ +#![feature(test)] 
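+// A convention that can be read off the test cases below (inferred from the tests themselves, not restated from the lexer_definition docs): the final integer passed to each token constructor, e.g. Token::operator("->", 0), is the token's left offset, i.e. the number of spaces separating it from the preceding token. Under that reading, lexing "a != b" would be expected as vec![Token::variable("a", 0), Token::operator("!=", 1), Token::variable("b", 1)].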
+#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] + +//! This file contains tests for lexing operators in the Enso lexer. + +mod test_utils; + +use lexer_definition::library::*; +use test_utils::*; + +use lexer_definition::library::token::Token; + + + +// ================= +// === Operators === +// ================= + +#[test] +fn function_operator() { + let input = "->"; + let expected = token::Stream::from(vec![Token::operator("->", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn bind_operator() { + let input = "<-"; + let expected = token::Stream::from(vec![Token::operator("<-", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn left_pipe_operator() { + let input = "<|"; + let expected = token::Stream::from(vec![Token::operator("<|", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn right_pipe_operator() { + let input = "|>"; + let expected = token::Stream::from(vec![Token::operator("|>", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn eq_operator() { + let input = "="; + let expected = token::Stream::from(vec![Token::operator("=", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn eq_compare_operator() { + let input = "=="; + let expected = token::Stream::from(vec![Token::operator("==", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn geq_operator() { + let input = ">="; + let expected = token::Stream::from(vec![Token::operator(">=", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn neq_operator() { + let input = "!="; + let expected = token::Stream::from(vec![Token::operator("!=", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn dot_operator() { + let input = "."; + let expected = token::Stream::from(vec![Token::operator(".", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn comma_operator() { + let input = ","; + let expected = token::Stream::from(vec![Token::operator(",", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn double_dot_operator() { + let input = ".."; + let expected = token::Stream::from(vec![Token::operator("..", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn triple_dot_operator() { + let input = "..."; + let expected = token::Stream::from(vec![Token::operator("...", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn error_operator() { + let input = "!"; + let expected = token::Stream::from(vec![Token::operator("!", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn type_ascription_operator() { + let input = ":"; + let expected = token::Stream::from(vec![Token::operator(":", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn in_operator() { + let input = "in"; + let expected = token::Stream::from(vec![Token::operator("in", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn typeset_union_operator() { + let input = "|"; + let expected = token::Stream::from(vec![Token::operator("|", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn typeset_intersection_operator() { + let input = "&"; + let expected = token::Stream::from(vec![Token::operator("&", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn typeset_subtraction_operator() { + let input = "\\"; + let expected = token::Stream::from(vec![Token::operator("\\", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn arbitrary_right_operator() { + let input = "-->>"; + let expected =
token::Stream::from(vec![Token::operator("-->>", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn modifier_plus() { + let input = "+="; + let expected = token::Stream::from(vec![Token::modifier("+", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn modifier_minus() { + let input = "-="; + let expected = token::Stream::from(vec![Token::modifier("-", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn arbitrary_modifier() { + let input = "<%="; + let expected = token::Stream::from(vec![Token::modifier("<%", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_eq_suffix() { + let input = "==="; + let expected = token::Stream::from(vec![Token::operator("==", 0), Token::invalid_suffix("=", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_dots_suffix() { + let input = "...."; + let expected = token::Stream::from(vec![Token::operator("...", 0), Token::invalid_suffix(".", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_modifier_suffix() { + let input = "+=="; + let expected = token::Stream::from(vec![Token::operator("+", 0), Token::invalid_suffix("==", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn dot_call_operator() { + let input = ".+ .<*>"; + let expected = token::Stream::from(vec![ + Token::operator(".", 0), + Token::operator("+", 0), + Token::operator(".", 1), + Token::operator("<*>", 0), + ]); + assert_lexes(input,expected) +} + +#[test] +fn dot_eq_operator() { + let input = ".== . !="; + let expected = token::Stream::from(vec![ + Token::operator(".", 0), + Token::operator("==", 0), + Token::operator(".", 1), + Token::operator("!=", 2), + ]); + assert_lexes(input,expected); +} diff --git a/lib/rust/lexer/generation/tests/enso_lexer_text_literals.rs b/lib/rust/lexer/generation/tests/enso_lexer_text_literals.rs new file mode 100644 index 0000000000..f095468282 --- /dev/null +++ b/lib/rust/lexer/generation/tests/enso_lexer_text_literals.rs @@ -0,0 +1,1838 @@ +#![feature(test)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unsafe_code)] +#![warn(unused_import_braces)] + +//! This file contains tests for lexing text literals in the Enso lexer. 
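+//! +//! The expectations here are built from the text-literal constructors as they are used below: Token::text_line(style, segments, offset) for single-line literals, Token::text_inline_block(style, segments, offset) for '''- and """-style inline blocks, and Token::text_block(line_ending, style, lines, indent, offset) for indented blocks, with segments produced by text_segment_raw, text_segment_interpolate, and text_segment_escape. As an illustration of the shape involved, lexing 'hi `name`' should produce a FormatLine text line whose segments are text_segment_raw("hi ", 0) followed by text_segment_interpolate(vec![Token::variable("name", 0)], 0).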
+ +mod test_utils; + +use lexer_definition::library::*; +use test_utils::*; + +use lexer_definition::library::token::Token; + + + +// ============ +// === Text === +// ============ + +#[test] +fn invalid_interpolate_quote() { + let input = "`"; + let expected = token::Stream::from(vec![Token::unrecognized("`", 0)]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_format_quote() { + let input = r#"''''"#; + let expected = token::Stream::from(vec![Token::invalid_quote(r#"''''"#, 0)]); + assert_lexes(input,expected); +} + +#[test] +fn single_line_format_text() { + let input = r#"'dearest creature in creation, studying english pronunciation'"#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw( + "dearest creature in creation, studying english pronunciation" + ,0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn single_line_format_with_one_interpolation() { + let input = "'The result is `result.pretty`!'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("The result is ", 0), + Token::text_segment_interpolate( + vec![ + Token::variable("result", 0), + Token::operator(".", 0), + Token::variable("pretty", 0), + ], + 0 + ), + Token::text_segment_raw("!", 0), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn single_line_format_with_multiple_interpolations() { + let input = "'My_Type: name=`self.name`, suspended=`self.suspended`'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("My_Type: name=", 0), + Token::text_segment_interpolate( + vec![ + Token::variable("self", 0), + Token::operator(".", 0), + Token::variable("name", 0), + ], + 0 + ), + Token::text_segment_raw(", suspended=", 0), + Token::text_segment_interpolate( + vec![ + Token::variable("self", 0), + Token::operator(".", 0), + Token::variable("suspended", 0), + ], + 0 + ), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn block_format_text() { + let input = make_unix_line_endings( +r#"''' + I have a format text block literal. + It may optionally contain `interpolations` interspersed with the text like `this`. + + And it ends when the indent of `indent` goes back. 
+"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("I have a format text block literal.", 0) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::text_segment_raw("It may optionally contain ", 0), + Token::text_segment_interpolate( + vec![Token::variable("interpolations", 0)], + 0 + ), + Token::text_segment_raw(" interspersed with the text like ", 0), + Token::text_segment_interpolate( + vec![Token::variable("this", 0)], + 0 + ), + Token::text_segment_raw(".", 0), + ], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![ + Token::text_segment_raw("And it ends when the indent of ", 0), + Token::text_segment_interpolate(vec![Token::variable("indent", 0)], 0), + Token::text_segment_raw(" goes back.", 0) + ], + 0, + token::LineEnding::LF + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block() { + let input = "'''foo bar `interp` 'baz"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("foo bar ", 0), + Token::text_segment_interpolate(vec![Token::variable("interp", 0)], 0), + Token::text_segment_raw(" 'baz", 0), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_line_escape_test() { + let input = "'\\'\\U00131313 = \\u{z2}\\u{AFD3} `a + b` \\U23232323\\uAA\\uAAAA a b c \\xAF'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_escape(token::EscapeStyle::Literal, "'", 0), + Token::text_segment_escape(token::EscapeStyle::U32, "00131313", 0), + Token::text_segment_raw(" = ", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\u{z2}", 0), + Token::text_segment_escape(token::EscapeStyle::U21, "AFD3", 0), + Token::text_segment_raw(" ", 0), + Token::text_segment_interpolate( + vec![ + Token::variable("a", 0), + Token::operator("+", 1), + Token::variable("b", 1) + ], + 0 + ), + Token::text_segment_raw(" ", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\U23232323", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\uAA", 0), + Token::text_segment_escape(token::EscapeStyle::U16, "AAAA", 0), + Token::text_segment_raw(" a b c ", 0), + Token::text_segment_escape(token::EscapeStyle::Byte, "AF", 0), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_line_unfinished_escape() { + let input = "'\\"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_escape(token::EscapeStyle::Invalid, "\\", 0)], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_escape_sequences() { + let input = "'''\\U00131313 =' \\u{z2}\\u{AFD3} `a + b` \\U23232323\\uAA\\uAAAA a b c \\xAF"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_escape(token::EscapeStyle::U32, "00131313", 0), + Token::text_segment_raw(" =' ", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\u{z2}", 0), + Token::text_segment_escape(token::EscapeStyle::U21, "AFD3", 0), + Token::text_segment_raw(" ", 0), + Token::text_segment_interpolate( + vec![ + Token::variable("a", 0), + Token::operator("+", 1), + Token::variable("b", 1) + ], + 
0 + ), + Token::text_segment_raw(" ", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\U23232323", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\uAA", 0), + Token::text_segment_escape(token::EscapeStyle::U16, "AAAA", 0), + Token::text_segment_raw(" a b c ", 0), + Token::text_segment_escape(token::EscapeStyle::Byte, "AF", 0), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_escape_sequences() { + let input = make_unix_line_endings( +r#"''' + \U00131313 =' \u{z2}\u{AFD3} `a + b`\` + \U23232323\uAA\uAAAA a b c \xAF\n + \' +"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_escape(token::EscapeStyle::U32, "00131313", 0), + Token::text_segment_raw(" =' ", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\u{z2}", 0), + Token::text_segment_escape(token::EscapeStyle::U21, "AFD3", 0), + Token::text_segment_raw(" ", 0), + Token::text_segment_interpolate( + vec![ + Token::variable("a", 0), + Token::operator("+", 1), + Token::variable("b", 1) + ], + 0 + ), + Token::text_segment_escape(token::EscapeStyle::Literal, "`", 0) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\U23232323", 0), + Token::text_segment_escape(token::EscapeStyle::InvalidUnicode, "\\uAA", 0), + Token::text_segment_escape(token::EscapeStyle::U16, "AAAA", 0), + Token::text_segment_raw(" a b c ", 0), + Token::text_segment_escape(token::EscapeStyle::Byte, "AF", 0), + Token::text_segment_escape(token::EscapeStyle::Literal, "\n", 0), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::text_segment_escape(token::EscapeStyle::Literal, "'", 0), + ], + 0, + token::LineEnding::LF + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_line_unclosed_interpolate() { + let input = "'Foo bar `baz'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![Token::variable("baz'", 0)], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_unclosed_interpolate() { + let input = "'''Foo bar ` baz"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::variable("baz", 1), + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_unclosed_interpolate() { + let input = make_unix_line_endings( +r#"''' + Here's an interpolated block. 
+ And here's an `unfinished interpolation + And `another"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![Token::text_segment_raw("Here's an interpolated block.", 0)], + 0, + token::LineEnding::LF, + ), + Token::line( + vec![ + Token::text_segment_raw("And here's an ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::variable("unfinished", 0), + Token::variable("interpolation", 1) + ], + 0 + ), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::text_segment_raw("And ", 0), + Token::text_segment_unclosed_interpolate( + vec![Token::variable("another", 0)], + 0 + ), + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_raw_line() { + let input = r#"'Foo bar `"baz"`'"#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::RawLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_raw_line_unclosed() { + let input = r#"'Foo bar `"baz`'"#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz`'", 0)], + 0 + ) + ], + 0 + ), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_format_line() { + let input = "'Foo bar `'baz'`'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_format_line_unclosed() { + let input = "'Foo bar `'baz`'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_format_line_interpolate() { + let input = "'Foo bar `'baz``'`'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + Token::text_segment_interpolate( + vec![Token::text_line( + token::TextStyle::UnclosedLine, + vec![], + 0 + )], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_format_line_unclosed_interpolate() { + let input = "'Foo bar `'baz"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 
0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_raw_line_unclosed_interpolate() { + let input = r#"'Foo bar `"baz"'"#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::RawLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ), + Token::text_line(token::TextStyle::UnclosedLine, vec![], 0) + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_format_inline_block() { + let input = "'Foo bar `''' a b c d`'"; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::unrecognized("'''", 0), + Token::variable("a", 1), + Token::variable("b", 1), + Token::variable("c", 1), + Token::variable("d", 1), + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_raw_inline_block() { + let input = r#"'Foo bar `""" a b c d`'"#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::unrecognized(r#"""""#, 0), + Token::variable("a", 1), + Token::variable("b", 1), + Token::variable("c", 1), + Token::variable("d", 1), + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_invalid_format_quote() { + let input = r#"'Foo bar `'''''`'"#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate(vec![Token::invalid_quote("'''''", 0)], 0) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_invalid_raw_quote() { + let input = r#"'Foo bar `"""""`'"#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate(vec![Token::invalid_quote(r#"""""""#, 0)], 0) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_format_block() { + let input = make_unix_line_endings( +r#"'Foo bar `''' +Foo`' +"#); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::unrecognized(r#"'''"#, 0) + ], + 0 + ), + ], + 0 + ) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::referent("Foo", 0), + Token::unrecognized("`", 0), + Token::text_line(token::TextStyle::UnclosedLine, vec![], 0) + ], + 0, + token::LineEnding::LF + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn nested_raw_block() { + let input = make_unix_line_endings( +r#"'Foo bar `""" +Foo`' +"#); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::unrecognized(r#"""""#, 0) + ], + 0 + ), + ], + 0 + ) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::referent("Foo", 0), + 
Token::unrecognized("`", 0), + Token::text_line(token::TextStyle::UnclosedLine, vec![], 0) + ], + 0, + token::LineEnding::LF + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_raw_line() { + let input = r#"'''Foo bar `"baz"`"#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::RawLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_raw_line_unclosed() { + let input = r#"'''Foo bar `"baz`"#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz`", 0)], + 0 + ) + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_format_line() { + let input = "'''Foo bar `'baz'`"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_format_line_unclosed() { + let input = "'''Foo bar `'baz`"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_format_line_interpolate() { + let input = "'''Foo bar `'baz``'`"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + Token::text_segment_interpolate( + vec![Token::text_line( + token::TextStyle::UnclosedLine, + vec![], + 0 + )], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_format_line_unclosed_interpolate() { + let input = "'''Foo bar `'baz"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_raw_line_unclosed_interpolate() { + let input = r#"'''Foo bar `"baz""#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + 
token::TextStyle::RawLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ), + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_format_inline_block() { + let input = "'''Foo bar `''' a b c d`"; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::unrecognized("'''", 0), + Token::variable("a", 1), + Token::variable("b", 1), + Token::variable("c", 1), + Token::variable("d", 1), + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_raw_inline_block() { + let input = r#"'''Foo bar `""" a b c d`"#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::unrecognized(r#"""""#, 0), + Token::variable("a", 1), + Token::variable("b", 1), + Token::variable("c", 1), + Token::variable("d", 1), + ], + 0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_invalid_format_quote() { + let input = r#"'''Foo bar `'''''`"#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate(vec![Token::invalid_quote("'''''", 0)], 0) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_invalid_raw_quote() { + let input = r#"'''Foo bar `"""""`"#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate(vec![Token::invalid_quote(r#"""""""#, 0)], 0) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_format_block() { + let input = make_unix_line_endings( +r#"'''Foo bar `''' +Foo` +"#); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::unrecognized(r#"'''"#, 0) + ], + 0 + ), + ], + 0 + ) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::referent("Foo", 0), + Token::unrecognized("`", 0), + ], + 0, + token::LineEnding::LF + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_inline_block_nested_raw_block() { + let input = make_unix_line_endings( +r#"'''Foo bar `""" +Foo` +"#); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line( + vec![ + Token::text_inline_block( + token::TextStyle::FormatInlineBlock, + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::unrecognized(r#"""""#, 0) + ], + 0 + ), + ], + 0 + ) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::referent("Foo", 0), + Token::unrecognized("`", 0), + ], + 0, + token::LineEnding::LF + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_raw_line() { + let input = make_unix_line_endings( +r#"''' + Foo bar `"baz"`"#); + let expected = token::Stream::from(vec![ + 
Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::RawLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_raw_line_unclosed() { + let input = make_unix_line_endings( +r#"''' + Foo bar `"baz`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz`", 0)], + 0 + ) + ], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_format_line() { + let input = make_unix_line_endings( +r#"''' + Foo bar `'baz'`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::FormatLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_format_line_unclosed() { + let input = make_unix_line_endings( +r#"''' + Foo bar `'baz`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_format_line_interpolate() { + let input = make_unix_line_endings( +r#"''' + Foo bar `'baz``'`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + Token::text_segment_interpolate( + vec![Token::text_line( + token::TextStyle::UnclosedLine, + vec![], + 0 + )], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_format_line_unclosed_interpolate() { + let input = make_unix_line_endings( +r#"''' + Foo bar `'baz"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_raw_line_unclosed_interpolate() { + let input = 
make_unix_line_endings( +r#"''' + Foo bar `"baz"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![ + Token::text_line( + token::TextStyle::UnclosedLine, + vec![Token::text_segment_raw("baz", 0)], + 0 + ) + ], + 0 + ), + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_format_inline_block() { + let input = make_unix_line_endings( +r#"''' + Foo bar `''' a b c d`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::unrecognized(r#"'''"#, 0), + Token::variable("a", 1), + Token::variable("b", 1), + Token::variable("c", 1), + Token::variable("d", 1), + ], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_raw_inline_block() { + let input = make_unix_line_endings( +r#"''' + Foo bar `""" a b c d`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![ + Token::unrecognized(r#"""""#, 0), + Token::variable("a", 1), + Token::variable("b", 1), + Token::variable("c", 1), + Token::variable("d", 1), + ], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_invalid_format_quote() { + let input = make_unix_line_endings( +r#"''' + Foo bar `'''''`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![Token::invalid_quote(r#"'''''"#, 0)], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_invalid_raw_quote() { + let input = make_unix_line_endings( +r#"''' + Foo bar `"""""`"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_interpolate( + vec![Token::invalid_quote(r#"""""""#, 0)], + 0 + ) + ], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_format_block() { + let input = make_unix_line_endings( +r#"''' + Foo bar `''' + Foo` +"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![Token::unrecognized(r#"'''"#, 0)], + 0 + ) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::text_segment_raw("Foo", 0), + Token::text_segment_unclosed_interpolate(vec![], 0), + ], + 0, + token::LineEnding::LF + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_nested_raw_block() { + let input = make_unix_line_endings( +r#"''' + Foo 
bar `""" + Foo` +"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line( + vec![ + Token::text_segment_raw("Foo bar ", 0), + Token::text_segment_unclosed_interpolate( + vec![Token::unrecognized(r#"""""#, 0)], + 0 + ) + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::text_segment_raw("Foo", 0), + Token::text_segment_unclosed_interpolate(vec![], 0), + ], + 0, + token::LineEnding::LF + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn invalid_raw_quote() { + let input = r#"""""""#; + let expected = token::Stream::from(vec![Token::invalid_quote(r#"""""""#, 0)]); + assert_lexes(input,expected); +} + +#[test] +fn single_line_raw_text() { + let input = r#""dearest creature in creation, studying english pronunciation""#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::RawLine, + vec![ + Token::text_segment_raw( + "dearest creature in creation, studying english pronunciation" + ,0 + ) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn block_raw_text() { + let input = make_unix_line_endings( +r#"""" + I have a raw text block literal. + + `Interpolations` are not anything special in these literals. + + And it ends when the prevailing indentation is reduced. +"#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::RawBlock, + vec![ + Token::line( + vec![Token::text_segment_raw("I have a raw text block literal.", 0)], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![Token::text_segment_raw( + "`Interpolations` are not anything special in these literals.", + 0 + )], + 0, + token::LineEnding::LF + ), + Token::blank_line(0, token::LineEnding::LF), + Token::line( + vec![Token::text_segment_raw( + "And it ends when the prevailing indentation is reduced.", + 0 + )], + 0, + token::LineEnding::LF + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn raw_inline_block() { + let input = r#""""foo bar `interp` 'baz"#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::RawInlineBlock, + vec![Token::text_segment_raw("foo bar `interp` 'baz", 0), ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn raw_line_escapes() { + let input = r#""I must escape \" in raw lines. Other escapes \n\u{EA}\r don't work.""#; + let expected = token::Stream::from(vec![ + Token::text_line( + token::TextStyle::RawLine, + vec![ + Token::text_segment_raw("I must escape ", 0), + Token::text_segment_escape(token::EscapeStyle::Literal, r#"""#, 0), + Token::text_segment_raw(r" in raw lines. Other escapes \n\u{EA}\r don't work.", 0) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn raw_inline_block_escapes() { + let input = r#""""I don't have to escape " here but I can \"."#; + let expected = token::Stream::from(vec![ + Token::text_inline_block( + token::TextStyle::RawInlineBlock, + vec![ + Token::text_segment_raw(r#"I don't have to escape " here but I can "#, 0), + Token::text_segment_escape(token::EscapeStyle::Literal, r#"""#, 0), + Token::text_segment_raw(".", 0) + ], + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn raw_block_escapes() { + let input = make_unix_line_endings( +r#"""" + I'm in a raw block now. + I don't have to escape " but I can \". 
+ Other escapes \xFF\uAAAAA don't work."#); + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::RawBlock, + vec![ + Token::line( + vec![Token::text_segment_raw("I'm in a raw block now.", 0)], + 0, + token::LineEnding::LF + ), + Token::line( + vec![ + Token::text_segment_raw(r#"I don't have to escape " but I can "#, 0), + Token::text_segment_escape(token::EscapeStyle::Literal, r#"""#, 0), + Token::text_segment_raw(".", 0), + ], + 0, + token::LineEnding::LF + ), + Token::line( + vec![Token::text_segment_raw(r#"Other escapes \xFF\uAAAAA don't work."#, 0)], + 0, + token::LineEnding::None + ) + ], + 4, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn raw_block_mixed_line_endings() { + let input = "\"\"\"\n Line one.\r\n Line two.\n \r\n Line three.\n"; + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::RawBlock, + vec![ + Token::line(vec![Token::text_segment_raw("Line one.", 0)], 0, token::LineEnding::CRLF), + Token::line(vec![Token::text_segment_raw("Line two.", 0)], 0, token::LineEnding::LF), + Token::blank_line(4, token::LineEnding::CRLF), + Token::line(vec![Token::text_segment_raw("Line three.", 0)], 0, token::LineEnding::LF), + ], + 2, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn format_block_mixed_line_endings() { + let input = "'''\n Line one.\r\n Line two.\n \r\n Line three.\n"; + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line(vec![Token::text_segment_raw("Line one.", 0)], 0, token::LineEnding::CRLF), + Token::line(vec![Token::text_segment_raw("Line two.", 0)], 0, token::LineEnding::LF), + Token::blank_line(4, token::LineEnding::CRLF), + Token::line(vec![Token::text_segment_raw("Line three.", 0)], 0, token::LineEnding::LF), + ], + 2, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn text_block_first_line_blank() { + let input = +r#"''' + + a"#; + let expected = token::Stream::from(vec![ + Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::blank_line(0, token::LineEnding::LF), + Token::line(vec![Token::text_segment_raw("a", 0)], 0, token::LineEnding::None) + ], + 2, + 0 + ) + ]); + assert_lexes(input,expected); +} + +#[test] +fn top_level_text_block() { + let input = make_unix_line_endings( +r#"''' + a + + b +c"#); + let text_block = Token::text_block( + token::LineEnding::LF, + token::TextStyle::FormatBlock, + vec![ + Token::line(vec![Token::text_segment_raw("a", 0)], 0, token::LineEnding::LF), + Token::blank_line(0, token::LineEnding::LF), + Token::line(vec![Token::text_segment_raw("b", 0)], 0, token::LineEnding::LF), + ], + 2, + 0 + ); + let expected = token::Stream::from(vec![ + Token::block( + token::BlockType::Continuous, + 0, + vec![ + Token::line(vec![text_block], 0, token::LineEnding::None), + Token::line(vec![Token::variable("c", 0)], 0, token::LineEnding::None), + ], + 0 + ) + ]); + assert_lexes(input,expected); +} diff --git a/lib/rust/lexer/generation/tests/test_utils.rs b/lib/rust/lexer/generation/tests/test_utils.rs new file mode 100644 index 0000000000..ca21cdb14d --- /dev/null +++ b/lib/rust/lexer/generation/tests/test_utils.rs @@ -0,0 +1,65 @@ +//! Utilities for testing the Enso lexer. 
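+//! +//! Tests build an expected `token::Stream` by hand and compare it against the lexer's output via +//! `assert_lexes`, which also checks that the sum of token offsets and lengths covers the input.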
+ +#![allow(dead_code)] + +use enso_flexer::*; +use lexer_definition::library::*; + +use enso_flexer::prelude::reader::decoder::DecoderUTF8; +use enso_flexer::prelude::Reader; +use lexer::generated::engine::EnsoLexer; +use lexer_definition::library::token::Token; + + + +// ================= +// === Utilities === +// ================= + +/// Assert that `result` is a success with tokens `expected`. +pub fn assert_succeeds_as(result:&LexingResult<token::Stream>, expected:token::Stream) { + match result.kind { + ResultKind::Success => assert_eq!(result.tokens,expected), + _ => panic!("Lexing failed.") + } +} + +/// Assert that the provided input lexes as `expected`. +pub fn assert_lexes(input:impl AsRef<str>, expected:token::Stream) { + let input_len = input.as_ref().chars().count(); + let result = lex(input); + assert_succeeds_as(&result,expected); + let tokens_vec : Vec<_> = result.tokens.into(); + let total_length : usize = tokens_vec.iter().map(|token| token.offset + token.length).sum(); + assert_eq!(total_length,input_len); +} + +/// Lex the provided string. +pub fn lex(input:impl AsRef<str>) -> LexingResult<token::Stream> { + let mut lexer = EnsoLexer::new(); + let reader = Reader::new(input.as_ref().as_bytes(),DecoderUTF8()); + lexer.run(reader) +} + +/// Asserts that the input is a block and has a length equal to `expected_length`. +pub fn assert_block_has_length(input:impl AsRef<str>, expected_length:usize) { + let result = lex(input); + match result.kind { + ResultKind::Success => { + let tokens = result.tokens.tokens(); + match tokens.first().expect("Token should be present.") { + Token{shape:token::Shape::Block{..},length,..} => + assert_eq!(*length,expected_length), + _ => panic!("Token not a block."), + } + }, + _ => panic!("Lexing failed"), + } +} + +/// Makes the test text have unix line endings to ensure consistency regardless of git checkout +/// style.
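+/// +/// For example, `make_unix_line_endings("a\r\nb\r\n")` yields `"a\nb\n"`, as every `\r` is dropped.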
+pub fn make_unix_line_endings(input:&str) -> String { + let string = String::from(input); + string.chars().filter(|c| *c != '\r').collect() +} diff --git a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala b/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala index 8753aefbc1..462ff3cb10 100644 --- a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala +++ b/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/ParserDef.scala @@ -516,6 +516,9 @@ case class ParserDef() extends flexer.Parser[AST.Module] { logger.trace { onEscape(Segment.Escape.Slash) } + def onEscapeFormatQuote(): Unit = logger.trace { + onEscape(Segment.Escape.Quote) + } def onEscapeQuote(): Unit = logger.trace { @@ -618,47 +621,24 @@ case class ParserDef() extends flexer.Parser[AST.Module] { FMT_BLCK.parent = FMT } - ROOT || '`' || text.onInterpolateEnd() - text.FMT || '`' || text.onInterpolateBegin() - + ROOT || '`' || text.onInterpolateEnd() ROOT || "'''" >> "'".many1 || text.onInvalidQuote() + ROOT || "'" || text.onBegin(text.FMT_LINE) + ROOT || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK) + ROOT || "'''" || text.onInlineBlock() ROOT || "\"\"\"" >> "\"".many1 || text.onInvalidQuote() + ROOT || '"' || text.onBegin(text.RAW_LINE) + ROOT || text.rawBlock || text.onBeginBlock(text.RAW_BLCK) + ROOT || "\"\"\"" || text.onInlineBlock() - ROOT || "'" || text.onBegin(text.FMT_LINE) - text.FMT_LINE || "'" || text.submit() - text.FMT_LINE || "'".many1 || text.submitInvalidQuote() - text.FMT_LINE || text.fmtSeg || text.submitPlainSegment() - text.FMT_LINE || eof || text.submitMissingQuote() - text.FMT_LINE || newline || text.submitMissingQuote() block.FIRSTCHAR || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK) - ROOT || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK) - ROOT || "'''" || text.onInlineBlock() - text.FMT_BLCK || text.fmtBSeg || text.submitPlainSegment() - text.FMT_BLCK || eof || text.onEndOfBlock() - text.FMT_BLCK || newline || text.onEndOfLine() - - ROOT || '"' || text.onBegin(text.RAW_LINE) - text.RAW_LINE || '"' || text.submit() - text.RAW_LINE || '"'.many1 || text.submitInvalidQuote() - text.RAW_LINE || text.rawSeg || text.submitPlainSegment() - text.RAW_LINE || eof || text.submitMissingQuote() - text.RAW_LINE || newline || text.submitMissingQuote() block.FIRSTCHAR || text.rawBlock || text.onBeginBlock(text.RAW_BLCK) - ROOT || text.rawBlock || text.onBeginBlock(text.RAW_BLCK) - ROOT || "\"\"\"" || text.onInlineBlock() - text.RAW_BLCK || text.rawBSeg || text.submitPlainSegment() - text.RAW_BLCK || eof || text.onEndOfBlock() - text.RAW_BLCK || newline || text.onEndOfLine() - - text.NEWLINE || space.opt || text.onNewLine() - text.NEWLINE || space.opt >> newline || text.onEmptyLine() - text.NEWLINE || space.opt >> eof || text.onEOFNewLine() + text.FMT || '`' || text.onInterpolateBegin() AST.Text.Segment.Escape.Character.codes.foreach { code => val char = s"text.Segment.Escape.Character.$code" text.FMT || s"\\$code" run s"text.onEscape($char)" } - AST.Text.Segment.Escape.Control.codes.foreach { code => val ctrl = s"text.Segment.Escape.Control.$code" text.FMT || s"\\$code" run s"text.onEscape($ctrl)" @@ -668,16 +648,39 @@ case class ParserDef() extends flexer.Parser[AST.Module] { text.FMT || text.escape_u16 || text.onEscapeU16() text.FMT || text.escape_u32 || text.onEscapeU32() text.FMT || text.escape_int || text.onEscapeInt() + text.FMT || "\\'" || text.onEscapeFormatQuote() text.FMT || "\\\\" 
|| text.onEscapeSlash() - text.FMT || "\\'" || text.onEscapeQuote() text.FMT || "\\" >> any || text.onEscapeInvalid() text.FMT || "\\" || text.onEscapeUnfinished() + text.FMT_LINE || "'" || text.submit() + text.FMT_LINE || "'".many1 || text.submitInvalidQuote() + text.FMT_LINE || text.fmtSeg || text.submitPlainSegment() + text.FMT_LINE || eof || text.submitMissingQuote() + text.FMT_LINE || newline || text.submitMissingQuote() + + text.FMT_BLCK || text.fmtBSeg || text.submitPlainSegment() + text.FMT_BLCK || eof || text.onEndOfBlock() + text.FMT_BLCK || newline || text.onEndOfLine() + + text.RAW_LINE || '"' || text.submit() + text.RAW_LINE || '"'.many1 || text.submitInvalidQuote() + text.RAW_LINE || text.rawSeg || text.submitPlainSegment() + text.RAW_LINE || eof || text.submitMissingQuote() + text.RAW_LINE || newline || text.submitMissingQuote() text.RAW_LINE || "\\\"" || text.onEscapeRawQuote() text.RAW_LINE || "\\\\" || text.onEscapeSlash() text.RAW_LINE || "\\" >> any || text.onEscapeInvalid() text.RAW_LINE || "\\" || text.onEscapeUnfinished() + text.RAW_BLCK || text.rawBSeg || text.submitPlainSegment() + text.RAW_BLCK || eof || text.onEndOfBlock() + text.RAW_BLCK || newline || text.onEndOfLine() + + text.NEWLINE || space.opt || text.onNewLine() + text.NEWLINE || space.opt >> newline || text.onEmptyLine() + text.NEWLINE || space.opt >> eof || text.onEOFNewLine() + ////////////// /// Blocks /// ////////////// diff --git a/lib/scala/syntax/specialization/lexer-bench/src/bench/java/org/enso/syntax/LexerBench.java b/lib/scala/syntax/specialization/lexer-bench/src/bench/java/org/enso/syntax/LexerBench.java new file mode 100644 index 0000000000..2153da6afd --- /dev/null +++ b/lib/scala/syntax/specialization/lexer-bench/src/bench/java/org/enso/syntax/LexerBench.java @@ -0,0 +1,169 @@ +package org.enso.syntax; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.BenchmarkParams; +import org.openjdk.jmh.infra.Blackhole; + +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.AverageTime) +@Fork(1) +@Warmup(iterations = 5) +@Measurement(iterations = 10) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +public class LexerBench { + + @State(Scope.Thread) + public static class BenchState { + @Param({"1024" /* 1KB */, "102400" /* 100KB */, "1048576" /* 1MB */, "10485760" /* 10MB */}) + public int bytesSize; + + public String myInput; + + @Setup(Level.Trial) + public void doSetup(BenchmarkParams params) { + var benchNameSegments = params.getBenchmark().split("\\."); + var benchName = benchNameSegments[benchNameSegments.length - 1]; + var benchInput = LexerBenchFixtures.benchmarks().get(benchName).get(); + this.myInput = LexerBenchFixtures.replicate(benchInput, bytesSize, false); + } + } + + + // === Literals === + + @Benchmark + public void literalNumberInteger(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalNumberIntegerExplicitBase(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalNumberDecimal(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalNumberDecimalExplicitBase(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalNumberErrorBase(Blackhole blackhole, BenchState state) { + 
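// Lexes the `errorBase` fixture ("10.2_2" in LexerBenchFixtures), replicated to `bytesSize` bytes in setup. +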
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalTextFormatLine(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalTextFormatInlineBlock(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalTextFormatBlock(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalTextRawLine(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalTextRawInlineBlock(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void literalTextRawBlock(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + + // === Names === + + @Benchmark + public void nameLineOf(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void nameInvalidSuffix(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + + // === Operators === + + @Benchmark + public void operatorLineOf(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void operatorDotCall(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void operatorInvalidSuffix(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + + // === Blocks === + + @Benchmark + public void blockTopLevel(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void blockNested(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void blockDeeplyNested(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + + // === Comments === + + @Benchmark + public void commentLine(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void commentInLine(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void commentDoc(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + + // === Combined === + + @Benchmark + public void combinedSimple(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } + + @Benchmark + public void combinedComplex(Blackhole blackhole, BenchState state) { + blackhole.consume(LexerBenchFixtures.runLexer(state.myInput)); + } +} diff --git a/lib/scala/syntax/specialization/lexer-bench/src/bench/scala/org/enso/syntax/LexerBenchFixtures.scala b/lib/scala/syntax/specialization/lexer-bench/src/bench/scala/org/enso/syntax/LexerBenchFixtures.scala new file mode 100644 index 0000000000..00a6f96a5b --- /dev/null +++ b/lib/scala/syntax/specialization/lexer-bench/src/bench/scala/org/enso/syntax/LexerBenchFixtures.scala @@
-0,0 +1,223 @@ +package org.enso.syntax + +import java.nio.charset.StandardCharsets + +import org.enso.flexer +import org.enso.flexer.Reader +import org.enso.syntax.text.AST +import org.enso.syntax.text.spec.{ParserDef, ParserDef2} + +object LexerBenchFixtures { + + private val newEngine = flexer.Parser.compile(ParserDef()) + + // === Lexer Runner === + + /** Execute the lexer on the provided `input`. + * + * @param input the source code + * @return the result of lexing `input` + */ + def runLexer(input: String): ParserDef2.Result[AST.Module] = { + val engine = newEngine() + val reader = new Reader(input) + engine.run(reader) + } + + // === Utilities === + + /** Replicate the provided `input` out to the provided `size` in bytes + * (according to UTF-8). + * + * @param input the text to replicate + * @param size the size to replicate `input` to + * @param addNewline whether or not a newline should be added after each + * repetition of `input` + * @return `input`, repeated enough times to make the size >= `size` + */ + def replicate( + input: String, + size: Int, + addNewline: Boolean = false + ): String = { + val inputSize = input.getBytes(StandardCharsets.UTF_8).length + val times = 1 + size / inputSize + val inputNewline = if (addNewline) input + "\n" else input + " " + inputNewline.repeat(times) + } + + /** Replace all CRLF line endings in the input by LF. + * + * @param input the input text + * @return `input` with all `\r\n` replaced by `\n` + */ + def preprocess(input: String): String = { + input.replace("\r\n", "\n") + } + + // === Benchmarks === + + val benchmarks: Map[String, String] = Map( + // Literals + ("literalNumberInteger", Literal.Number.integer), + ("literalNumberIntegerExplicitBase", Literal.Number.integerExplicitBase), + ("literalNumberDecimal", Literal.Number.decimal), + ("literalNumberDecimalExplicitBase", Literal.Number.decimalExplicitBase), + ("literalNumberErrorBase", Literal.Number.errorBase), + ("literalTextFormatLine", Literal.Text.formatLine), + ("literalTextFormatInlineBlock", Literal.Text.formatInlineBlock), + ("literalTextFormatBlock", Literal.Text.formatBlock), + ("literalTextRawLine", Literal.Text.rawLine), + ("literalTextRawInlineBlock", Literal.Text.rawInlineBlock), + ("literalTextRawBlock", Literal.Text.rawBlock), + // Names + ("nameLineOf", Name.lineOf), + ("nameInvalidSuffix", Name.invalidSuffix), + // Operators + ("operatorLineOf", Operator.lineOf), + ("operatorDotCall", Operator.dotCall), + ("operatorInvalidSuffix", Operator.invalidSuffix), + // Blocks + ("blockTopLevel", Block.topLevel), + ("blockNested", Block.nested), + ("blockDeeplyNested", Block.deeplyNested), + // Comments + ("commentLine", Comment.line), + ("commentInLine", Comment.inLine), + ("commentDoc", Comment.doc), + // Combined + ("combinedSimple", Combined.simple), + ("combinedComplex", Combined.complex) + ) + + // === Inputs === + + object Literal { + object Number { + val integer: String = preprocess("12345") + val integerExplicitBase: String = preprocess("16_a4fd31") + val decimal: String = preprocess("1.3141") + val decimalExplicitBase: String = preprocess("10_1.000999") + val errorBase: String = preprocess("10.2_2") + } + + object Text { + val formatLine: String = + "'dearest creature in \\n creation studying english pronunciation'" + + val formatInlineBlock: String = + "''' An inline block. It's a very good inline block carl \\u{AB}" + + val formatBlock: String = + """''' Here is my block of format text. I can `interpolate
+ things` like that. + | It goes on and on and on for `times` times because I feel like it. + | + | Complex interpolated expression `x -> y ~> x | y` woo! + |""".stripMargin + + val rawLine: String = + "\"dearest creature in '''' creation studying english pronunciation\"" + + val rawInlineBlock: String = + "\"\"\" An inline block. It's a very good inline block carl " + + val tQuote = "\"\"\"" + val rawBlock: String = + s"""$tQuote Here is my block of raw text. `Interpolations` are nothing special here. + | It goes on and on and on for I can escape \" though. + | + | It also supports blank lines! + |""".stripMargin + } + } + + object Name { + val lineOf: String = + "Referent_Ident var_ident JavaType _ @annotation ticked_ident' number_1" + + val invalidSuffix: String = "some_var'iable some_varД" + } + + object Operator { + val lineOf: String = "+ - * -> ~> <~ <- ! & | /" + val dotCall: String = ".== .!= .<*> .*> .|>" + val invalidSuffix: String = ".... +==" + } + + object Block { + val topLevel: String = "foo\nbar\nbaz" + val nested: String = "foo\nbar\n baz\n quux" + val deeplyNested: String = + """foo + |bar + | baz + | quux + | bim + | bam + | oh + |no + |""".stripMargin + } + + object Comment { + val line: String = + "# foo bar baz I have a really long line comment here that goes on and on" + + val inLine: String = "a + b # A useless comment: add a to b" + + val doc: String = + """## I have a really big doc comment here + | That just keeps prattling on and on and on. + | + | With blank lines + | + | Forever + | + | and + | ever + | + | and + | + | + | + | + | ever + |documented + |""".stripMargin + } + + object Combined { + val simple: String = + """ + |import Base.Meta + | + |## Decompose the value using runtime reflection and print its decomposition. + |Main.print_decomp a b = + | y = a + b + | decomp = Meta.decompose y + | Io.println decomp + |""".stripMargin + + val complex: String = + """import Base.Meta + | + |## Frobnicate the doodads by constructing a new type operator through runtime reflection such that + | it can be passed to another language. + | + | ! WARNING + | Type-checking code like this is virtually impossible, and it is treated as `Dynamic` inside + | Enso code. + |Main.foo a b = + | y = x -> z -> + | ty = a.gen_type (~>) (<-) b + | ty (z x) + | decomp = Meta.decompose (y a b) + | Io.println decomp + | + |## Execute the main function of this project. + |main = + | func = Meta.reify (here.foo "My_Name" "my_field") + | Io.println(func) + |""".stripMargin + } +}