Complete the implementation of the Enso lexer (#1177)

This commit is contained in:
Ara Adkins 2020-10-30 14:06:24 +00:00 committed by GitHub
parent 35efd8ea55
commit e5695e6f5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
64 changed files with 8001 additions and 5965 deletions

1
.gitignore vendored
View File

@ -97,6 +97,7 @@ bench-report.xml
.editorconfig
.bloop/
.bsp/
project/metals.sbt
#################
## Build Cache ##

View File

@ -2,9 +2,6 @@
members = [
"lib/rust/ast",
"lib/rust/flexer",
"lib/rust/flexer-testing/definition",
"lib/rust/flexer-testing/generation",
"lib/rust/launcher-shims",
"lib/rust/lexer/definition",
"lib/rust/lexer/generation",
@ -15,16 +12,17 @@ members = [
# assumes you have `rust-lib` in the same directory as `enso`. See:
# https://github.com/enso-org/rust-lib/blob/main/docs/CONTRIBUTING.md#developing-in-conjunction-with-enso--ide
[patch.crates-io]
# enso-automata = { path = '../rust-lib/src/automata' }
# enso-data = { path = '../rust-lib/src/data' }
# enso-generics = { path = '../rust-lib/src/generics' }
# enso-lazy-reader = { path = '../rust-lib/src/lazy-reader' }
# enso-logger = { path = '../rust-lib/src/logger' }
# enso-macro-utils = { path = '../rust-lib/src/macro-utils' }
# enso-optics = { path = '../rust-lib/src/optics' }
# enso-prelude = { path = '../rust-lib/src/prelude' }
# enso-shapely = { path = '../rust-lib/src/shapely/impl' }
# enso-shapely-macros = { path = '../rust-lib/src/shapely/macros' }
#enso-automata = { path = '../rust-lib/src/automata' }
#enso-data = { path = '../rust-lib/src/data' }
#enso-flexer = { path = '../rust-lib/src/flexer' }
#enso-generics = { path = '../rust-lib/src/generics' }
#enso-lazy-reader = { path = '../rust-lib/src/lazy-reader' }
#enso-logger = { path = '../rust-lib/src/logger' }
#enso-macro-utils = { path = '../rust-lib/src/macro-utils' }
#enso-optics = { path = '../rust-lib/src/optics' }
#enso-prelude = { path = '../rust-lib/src/prelude' }
#enso-shapely = { path = '../rust-lib/src/shapely/impl' }
#enso-shapely-macros = { path = '../rust-lib/src/shapely/macros' }
[profile.dev]
opt-level = 0
@ -36,7 +34,6 @@ debug-assertions = true
opt-level = 3
lto = true
debug = false
panic = 'abort'
debug-assertions = false
[profile.bench]

View File

@ -471,6 +471,45 @@ lazy val syntax = crossProject(JVMPlatform, JSPlatform)
Compile / fullOptJS / artifactPath := file("target/scala-parser.js")
)
lazy val `lexer-bench` =
(project in file("lib/scala/syntax/specialization/lexer-bench"))
.settings(
commands += WithDebugCommand.withDebug,
inConfig(Compile)(truffleRunOptionsSettings),
inConfig(Benchmark)(Defaults.testSettings),
parallelExecution in Test := false,
logBuffered in Test := false,
Test / fork := true,
libraryDependencies ++= jmh
)
.configs(Test)
.configs(Benchmark)
.dependsOn(syntax.jvm)
.dependsOn(flexer.jvm)
.settings(
javaOptions ++= Seq(
"-Xms4096m",
"-Xmx4096m",
"-XX:+FlightRecorder",
),
mainClass in Benchmark := Some("org.openjdk.jmh.Main"),
bench := Def.task {
(run in Benchmark).toTask("").value
},
benchOnly := Def.inputTaskDyn {
import complete.Parsers.spaceDelimited
val name = spaceDelimited("<name>").parsed match {
case List(name) => name
case _ =>
throw new IllegalArgumentException("Expected one argument.")
}
Def.task {
(testOnly in Benchmark).toTask(" -- -z " + name).value
}
}.evaluated,
parallelExecution in Benchmark := false
)
lazy val `parser-service` = (project in file("lib/scala/parser-service"))
.dependsOn(syntax.jvm)
.settings(

View File

@ -26,8 +26,6 @@ below:
the implementation technologies for the parser.
- [**Parser Architecture:**](./architecture.md) An overview of the architecture
of the parser as a whole.
- [**Flexer:**](./flexer.md) An overview of the design and architecture of the
flexer, a generic, DFA-based lexing engine.
- [**Lexer:**](./lexer.md) The Enso lexer, responsible for tokenising the input
stream of source code.
- [**Macro Resolution:**](./macro-resolution.md) The system for defining and

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: AST
category: parser
tags: [parser, ast]
order: 9
order: 8
---
# AST

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: Construct Resolution
category: parser
tags: [parser, construct, resolution]
order: 7
order: 6
---
# Construct Resolution

View File

@ -1,199 +0,0 @@
---
layout: developer-doc
title: Flexer
category: syntax
tags: [parser, flexer, lexer, dfa]
order: 3
---
# Flexer
The flexer is a finite-automata-based engine for the definition and generation
of lexers. Akin to `flex`, and other lexer generators, the user may use it to
define a series of rules for lexing their language, which are then used by the
flexer to generate a highly-efficient lexer implementation.
Where the flexer differs from other programs in this space, however, is the
power that it gives users. When matching a rule, the flexer allows its users to
execute _arbitrary_ Rust code, which may even manipulate the lexer's state and
position. This means that the languages that can be lexed by the flexer extend
from the simplest regular grammars right up to unrestricted grammars (but please
don't write a programming language whose syntax falls into this category). It
also differs in that it chooses the first complete match for a rule, rather than
the longest one, which makes lexers much easier to define and maintain.
For detailed library documentation, please see the
[crate documentation](../../lib/rust/flexer/src/lib.rs) itself. This includes a
comprehensive tutorial on how to define a lexer using the flexer.
<!-- MarkdownTOC levels="2,3" autolink="true" -->
- [The Lexing Process](#the-lexing-process)
- [Lexing Rules](#lexing-rules)
- [Groups](#groups)
- [Patterns](#patterns)
- [Transition Functions](#transition-functions)
- [Code Generation](#code-generation)
- [Automated Code Generation](#automated-code-generation)
- [Structuring the Flexer Code](#structuring-the-flexer-code)
- [Supporting Code Generation](#supporting-code-generation)
<!-- /MarkdownTOC -->
## The Lexing Process
In the flexer, the lexing process proceeds from the top to the bottom of the
user-defined rules, and selects the first expression that _matches fully_. Once
a pattern has been matched against the input, the associated code is executed
and the process starts again until the input stream has been consumed.
This point about _matching fully_ is particularly important to keep in mind, as
it differs from other lexer generators that tend to prefer the _longest_ match
instead.
## Lexing Rules
A lexing rule for the flexer is a combination of three things:
1. A group.
2. A pattern.
3. A transition function.
An example of defining a rule is as follows:
```rust
fn define() -> Self {
let mut lexer = TestLexer::new();
let a_word = Pattern::char('a').many1();
let root_group_id = lexer.initial_state;
let root_group = lexer.groups_mut().group_mut(root_group_id);
// Here is the rule definition.
root_group.create_rule(&a_word,"self.on_first_word(reader)");
lexer
}
```
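
To make the earlier point about first-full-match behaviour concrete, here is a
minimal, illustrative sketch with two overlapping rules. The callback names
`on_short` and `on_long` are hypothetical, and would need to exist on the lexer
for the generated code to compile.

```rust
fn define() -> Self {
    let mut lexer = TestLexer::new();
    let short = Pattern::char('a');         // Rule 1: exactly one 'a'.
    let long  = Pattern::char('a').many1(); // Rule 2: one or more 'a'.
    let root_group_id = lexer.initial_state;
    let root_group    = lexer.groups_mut().group_mut(root_group_id);
    // With first-full-match semantics, the input "aaa" triggers `short` three
    // times, as it completes a match before `long` is ever considered. A
    // longest-match lexer would instead match `long` once over the whole input.
    root_group.create_rule(&short,"self.on_short(reader)");
    root_group.create_rule(&long,"self.on_long(reader)");
    lexer
}
```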
### Groups
A group is a mechanism that the flexer provides to allow grouping of rules
together. The flexer has a concept of a "state stack", which records the
currently active state at the current time, that can be manipulated by the
user-defined [transition functions](#transition-functions).
A state can be made active by using `flexer::push_state(state)`, and can be
deactivated by using `flexer::pop_state(state)` or
`flexer::pop_states_until(state)`. In addition, states may also have _parents_,
from which they can inherit rules. This is fantastic for removing the need to
repeat yourself when defining the lexer.
When inheriting rules from a parent group, the rules from the parent group are
matched strictly _after_ the rules from the child group. This means that groups
are able to selectively "override" the rules of their parents. Rules are still
matched in order for each group's set of rules.
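
The following sketch, condensed from the test lexer in
`flexer-testing/definition`, shows transition functions manipulating the state
stack; `Token`, `seen_first_word_state`, and the surrounding `TestLexer` type
are as defined in that crate.

```rust
impl TestLexer {
    /// After the first word, activate the "seen first word" group so that its
    /// rules take precedence over those of the root group.
    fn on_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
        let token = Token::word(self.current_match.clone());
        self.output.push(token);
        let id = self.seen_first_word_state;
        self.push_state(id);
    }

    /// On an error suffix in the spaced-word state, deactivate the group again.
    fn on_err_suffix<R:LazyReader>(&mut self, reader:&mut R) {
        self.on_err_suffix_first_word(reader);
        self.pop_state();
    }
}
```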
### Patterns
Rules are defined to match _patterns_. Patterns are regular-grammar-like
descriptions of the textual content (as characters) that should be matched. For
a description of the various patterns provided by the flexer, see
[pattern.rs](../../lib/rust/flexer/src/automata/pattern.rs).
When a pattern is matched, the associated
[transition function](#transition-functions) is executed.
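
For a flavour of how patterns compose, the following is a small illustrative
sketch using the provided combinators, where `>>` sequences two patterns and
`|` expresses an alternative:

```rust
use flexer::automata::pattern::Pattern;

/// A sketch of building a compound pattern.
fn spaced_word() -> Pattern {
    let a_word = Pattern::char('a').many1(); // One or more 'a'.
    let b_word = Pattern::char('b').many1(); // One or more 'b'.
    let space  = Pattern::char(' ');
    let word   = a_word | b_word;            // Either kind of word.
    &space >> &word                          // A space followed by a word.
}
```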
### Transition Functions
The transition function is a piece of arbitrary rust code that is executed when
the pattern for a given rule is matched by the flexer. This code may perform
arbitrary manipulations of the lexer state, and is where the majority of the
power of the flexer stems from.
## Code Generation
While it would be possible to interpret the flexer definition directly at
runtime, this would involve far too much dynamicism and non-cache-local lookup
to be at all fast.
Instead, the flexer includes
[`generate.rs`](../../lib/rust/flexer/src/generate.rs), a library for generating
highly-specialized lexer implementations based on the definition provided by the
user. The transformation that it implements operates as follows for each group
of rules.
1. The set of rules in a group is used to generate a
[Nondeterministic Finite Automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton),
(NFA).
2. The NFA is transformed into a
[Deterministic Finite Automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton)
(DFA), using a variant of the standard
[powerset construction](https://en.wikipedia.org/wiki/Powerset_construction)
algorithm. This variant has been modified to ensure that the following
additional properties hold:
- Patterns are matched in the order in which they are defined.
- The associated transition functions are maintained correctly through the
transformation.
- The lexing process is `O(n)`, where `n` is the size of the input.
3. The DFA is then used to generate the rust code that implements that lexer.
The generated lexer contains a main loop that consumes the input stream
character-by-character, evaluating what is effectively a big `match` expression
that processes the input to evaluate the user-provided transition functions as
appropriate.
### Automated Code Generation
In order to avoid the lexer definition getting out of sync with its
implementation (the generated engine), it is necessary to create a separate
crate for the generated engine that has the lexer definition as one of its
dependencies.
This separation enables a call to `flexer::State::specialize()` in the crate's
`build.rs` (or a macro) during compilation. The output can be stored in a new
file, i.e. `engine.rs`, and exported from the library as needed. The project
structure would therefore appear as follows.
```
- lib/rust/lexer/
- definition/
- src/
- lib.rs
- cargo.toml
- generation/
- src/
- engine.rs <-- the generated file
- lib.rs <-- `pub mod engine`
- build.rs <-- calls `flexer::State::specialize()` and saves its output to
`src/engine.rs`
- cargo.toml <-- lexer-definition is in dependencies and build-dependencies
```
With this design, `flexer.generate_specialized_code()` is going to be executed
on each rebuild of `lexer/generation`. Therefore, `generation` should contain
only the minimum amount of logic, and should endeavor to minimize any
unnecessary dependencies to avoid recompiling too often.
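
The following is a condensed, illustrative sketch of such a `build.rs`; the
full working version, which also embeds the definition source into the
generated file, can be seen in the `build.rs` of `flexer-testing/generation`.
The `lexer_definition::Lexer` import is a placeholder name for the lexer type
exported by the definition crate.

```rust
// build.rs -- condensed sketch of the specialization step.
use std::fs::File;
use std::io::prelude::*;

use flexer::Definition;
use flexer::State;
use lexer_definition::Lexer; // Placeholder for the definition crate's lexer type.

fn main() -> std::io::Result<()> {
    let lexer  = Lexer::define();
    let engine = lexer.specialize().expect("Failed to specialize the lexer.");
    std::fs::create_dir_all("src/generated")?;
    let mut file = File::create("src/generated/engine.rs")?;
    file.write_all(engine.as_bytes())?;
    Ok(())
}
```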
## Structuring the Flexer Code
In order to unify the API between the definition and generated usages of the
flexer, the API is separated into the following components:
- `Flexer`: The main flexer definition itself, providing functionality common to
the definition and implementation of all lexers.
- `flexer::State`: The stateful components of a lexer definition. This trait is
implemented for a particular lexer definition, allowing the user to store
arbitrary data in their lexer, as needed.
- **User-Defined Lexer:** The user can then define a lexer that _wraps_ the
flexer, specialised to the particular `flexer::State` that the user has
defined. It is recommended to implement `Deref` and `DerefMut` between the
defined lexer and the `Flexer`, to allow for ease of use, as sketched below.
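
A minimal sketch of that wrapping is as follows, where `MyLexer`, `MyState`,
`TokenStream` and `Logger` are illustrative names standing in for the user's
own lexer, its `flexer::State` implementation, its output type, and its logger.

```rust
use flexer::prelude::*;
use flexer::Flexer;

/// Illustrative only: a user-defined lexer wrapping the `Flexer`.
pub struct MyLexer {
    lexer:Flexer<MyState,TokenStream,Logger>
}

impl Deref for MyLexer {
    type Target = Flexer<MyState,TokenStream,Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}

impl DerefMut for MyLexer {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.lexer
    }
}
```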
### Supporting Code Generation
This architecture separates out the generated code (which can be defined purely
on the user-defined lexer), from the code that is defined as part of the lexer
definition. This means that the same underlying structures can be used to both
_define_ the lexer, and be used by the generated code from that definition.
For an example of how these components are used in the generated lexer, please
see [`generated_api_test`](../../lib/rust/flexer/tests/generated_api_test.rs).

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: JVM Object Generation
category: parser
tags: [parser, jvm, object-generation]
order: 10
order: 9
---
# JVM Object Generation
@ -14,8 +14,6 @@ the compiler and runtime to work with the AST.
<!-- MarkdownTOC levels="2,3" autolink="true" -->
- [Overall Architecture](#overall-architecture)
<!-- /MarkdownTOC -->
# Overall Architecture

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: Lexer
category: syntax
tags: [parser, lexer]
order: 4
order: 3
---
# Lexer
@ -19,6 +19,9 @@ identify blocks
- [Libraries in the Lexer Definition](#libraries-in-the-lexer-definition)
- [Lexer Functionality](#lexer-functionality)
- [The Lexer AST](#the-lexer-ast)
- [Benchmarking the Lexer](#benchmarking-the-lexer)
- [Running a Subset of the Benchmarks](#running-a-subset-of-the-benchmarks)
- [Changing the Lexer](#changing-the-lexer)
<!-- /MarkdownTOC -->
@ -43,12 +46,12 @@ paths directly from the crate root.
## Lexer Functionality
The lexer needs to provide the following functionality as part of the parser.
The lexer provides the following functionality as part of the parser.
- It consumes the source lazily, character by character, and produces a
structured token stream consisting of the lexer [ast](#the-lexer-ast).
- It must succeed on _any_ input, even if there are invalid constructs in the
token stream, represented by `Invalid` tokens.
- It succeeds on _any_ input, even if there are invalid constructs in the token
stream, represented by `Invalid` tokens.
## The Lexer AST
@ -69,15 +72,29 @@ It contains the following constructs:
- `Blank`: The blank name `_`.
- `Operator`: Operator identifiers (e.g. `-->>`).
- `Modifier`: Modifier operators (e.g. `+=`).
- `Annotation`: An annotation (e.g. `@Tail_Call`).
- `Number`: Numbers (`16_FFFF`).
- `DanglingBase`: An explicit base without an associated number (e.g. `16_`).
- `Text`: Text (e.g. `"Some text goes here."`).
- `TextLine`: A single-line text literal.
- `TextInlineBlock`: An inline block text literal.
- `TextBlock`: A text block literal.
- `InvalidQuote`: An invalid set of quotes for a text literal.
- `TextSegmentRaw`: A raw text segment in which the contents should be
interpreted literally.
- `TextSegmentEscape`: A text segment containing an escape sequence.
- `TextSegmentInterpolate`: A text segment containing an arbitrary interpolated
expression.
- `TextSegmentUnclosedInterpolate`: An unclosed interpolation text segment.
- `Line`: A line in a block that contains tokens.
- `BlankLine`: A line in a block that contains only whitespace.
- `Block`: Syntactic blocks in the language.
- `InvalidSuffix`: Invalid tokens when in a given state that would otherwise be
valid.
- `Unrecognized`: Tokens that the lexer doesn't recognise.
- `DisableComment`: A standard comment that disables interpretation of the
commented code (i.e. `#`).
- `DocComment`: A documentation comment (e.g. `##`). Documentation syntax is
_not_ lexed by this lexer.
The distinction is made here between the various kinds of identifiers in order
to keep lexing fast, but also in order to allow macros to switch on the kinds of
@ -87,3 +104,61 @@ identifiers.
>
> - Determine if we want to have separate ASTs for the lexer and the parser, or
> not.
## Benchmarking the Lexer
As the lexer is the first port of call when getting an Enso program to run, it
needs to be quick. To that end, we insist on comprehensive benchmarks for any
change made to the lexer. The lexer benchmarks are written using
[criterion.rs](https://github.com/bheisler/criterion.rs), and include both
examples of whole program definitions and more specific benchmark examples.
**Baseline Commit:** TBC (use head of this branch for now).
The benchmarking process for the lexer is as follows:
1. Check out the current _baseline commit_, listed above.
2. In `lexer_bench_sources.rs` change the line that reads `.retain_baseline` to
instead read `.save_baseline`. This will save the current baseline (taken on
your machine).
3. Run the benchmarks using `cargo bench`. Please note that running these
benchmarks takes approximately two hours, so sufficient time should be
allotted.
4. Once the baseline run has completed, change the above-mentioned line back to
`.retain_baseline`. This prevents the saved baseline from being overwritten,
and regression reporting will be performed against it.
5. Make your changes.
6. Run the benchmark suite again. It will report any performance regressions in
the benchmark report, measured against your saved baseline.
Unfortunately, the use of time-based benchmarks means that we can't commit the
baseline to the repository. There is far too much variance between machines for
this to be useful.
### Running a Subset of the Benchmarks
The benchmarks are very comprehensive, running a wide range of program text
through the lexer while replicating it out to various sizes (see
`lexer_bench_sources.rs` for the full list). However, in order to decrease
iteration time it can be useful to run a subset of these.
There are two main tuning points for this:
1. The _sizes_ of inputs being executed on.
2. The benchmarks being executed.
The sizes can be tuned by editing the `SIZES` array in the
`lexer_bench_sources.rs` file. The benchmarks themselves are best tuned by
changing the macro definitions in `lexer_time_bench.rs` to exclude certain
benchmarks or groups of benchmarks.
While it is _possible_ to tune the benchmarking config (`bench_config` in
`lexer_bench_sources.rs`) to decrease benchmarking time, this is not
recommended. The current settings are tuned to provide reliable results.
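
By way of illustration only, a quick local iteration might temporarily shrink
the replication sizes as in the sketch below; the element type and values shown
here are hypothetical and are not taken from the actual contents of
`lexer_bench_sources.rs`.

```rust
// Hypothetical illustration: shrink the set of replication sizes while
// iterating locally, and restore the full list before the official run.
pub const SIZES:&[(usize,&str)] = &[
    (1_024, "1KB"),
    // (1_048_576,  "1MB"),  // Temporarily disabled to shorten the run.
    // (10_485_760, "10MB"),
];
```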
### Changing the Lexer
When changing the lexer the _full_ benchmark suite must be run against the
current baseline before the changes can be merged. This suite run must use the
provided settings for the benchmarking library, and should be performed using
the process described above.

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: Macro Resolution
category: parser
tags: [parser, macro, resolution]
order: 5
order: 4
---
# Macro Resolution

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: Operator Resolution
category: parser
tags: [parser, operator, resolution]
order: 6
order: 5
---
# Operator Resolution

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: Parser Driver
category: parser
tags: [parser, driver]
order: 8
order: 7
---
# Parser Driver

View File

@ -3,7 +3,7 @@ layout: developer-doc
title: Reading Source Code
category: parser
tags: [parser, reader]
order: 11
order: 10
---
# Reading Source Code
@ -15,9 +15,14 @@ project is going to use, as well as backing formats for the stream.
<!-- MarkdownTOC levels="2,3" autolink="true" -->
- [Reader Functionality](#reader-functionality)
- [Provided Readers](#provided-readers)
- [UTF-8 Reader](#utf-8-reader)
- [UTF-16 Reader](#utf-16-reader)
- [Reader Structure](#reader-structure)
- [Read](#read)
- [Decoder](#decoder)
- [Provided Encodings](#provided-encodings)
- [UTF-8](#utf-8)
- [UTF-16](#utf-16)
- [UTF-32](#utf-32)
- [Benchmarks](#benchmarks)
<!-- /MarkdownTOC -->

View File

@ -23,6 +23,7 @@ Enso supports a variety of types of comments:
<!-- MarkdownTOC levels="2,3" autolink="true" -->
- [Disable Comments](#disable-comments)
- [Freeze Comments](#freeze-comments)
- [Documentation Comments](#documentation-comments)
- [Tags](#tags)
- [Sections](#sections)
@ -39,13 +40,35 @@ Disable comments are the standard form of comment seen in a programming language
in that they prevent a given piece of code from executing. In Enso, they are
created by prefixing the expression to disable with the `#` character.
These aren't _exactly_ like most languages' disable comments, however:
Disable comments in Enso do not have their contents validated, and continue from
the `#` character to the end of the line.
- When you disable a line in Enso, it is still run through the parser to see if
it is syntactically valid.
- No identifiers in it are resolved, however.
- This is important as it allows the disabled expression to still be displayed
in the visual syntax.
```ruby
x = y + z # here is some commented text
```
Disable comments are _not_ allowed inside textual interpolations.
## Freeze Comments
Freeze comments are a special type of comment used to enable the 'freezing' or
caching of expensive computations in Enso. When used, they cache the result of
an expression, reusing the value instead of recomputing it even if the
underlying data changes.
A portion of code that is frozen has the following properties:
- It is still lexed as if it were code, and validated by the parser to check for
validity.
- No identifier resolution takes place.
These are very important as they still allow the frozen expression to be
displayed properly in the visual syntax.
> The actionables for this section are:
>
> - Work out what they should look like visually.
> - Work out how best to implement this.
## Documentation Comments
@ -66,6 +89,8 @@ for more information). By way of example:
until I unindent again.
```
Documentation comments are _not_ allowed inside textual interpolations.
The tool that generates this documentation aims to be fairly robust, and tries
to produce sensible results even if the user makes a mistake. Such
mistakes will be highlighted to the user.

View File

@ -17,6 +17,8 @@ types in literal form in the source code.
- [Text Literals](#text-literals)
- [Inline Text Literals](#inline-text-literals)
- [Text Block Literals](#text-block-literals)
- [Inline Block Literals](#inline-block-literals)
- [Escape Sequences](#escape-sequences)
- [Vector Literals](#vector-literals)
<!-- /MarkdownTOC -->
@ -65,7 +67,7 @@ Enso provides rich support for textual literals in the language, supporting both
raw and interpolated strings natively.
- **Raw Strings:** Raw strings are delimited using the standard double-quote
character (`"`). Raw strings have support for escape sequences.
character (`"`). Raw strings don't support escape sequences except for `\"`.
```ruby
raw_string = "Hello, world!"
@ -75,7 +77,8 @@ raw and interpolated strings natively.
executable Enso expressions into the string. Such strings are delimited using
the single-quote (`'`) character, and splices are delimited using the backtick
(`` ` ``) character. Splices are run, and then the result is converted to a
string using `show`. These strings also have support for escape sequences.
string using `show`. These strings also have support for all kinds of
[escape sequences](#escape-sequences).
```ruby
fmt_string = 'Hello, my age is `time.now.year - person.birthday.year`'
@ -104,7 +107,7 @@ following layout rules:
- Any indentation further than this baseline will be retained as part of the
text literal.
- The literal is _closed_ by the first line with a _lower_ level of indentation
than the first child lineand will not contain the final blank line.
than the first child line and will not contain the final blank line.
```
block_raw = '''
@ -116,6 +119,48 @@ block_raw = '''
not_string_expr = foo bar
```
### Inline Block Literals
In order to easily transition between using text blocks and single-line
literals, we allow for defining an inline block literal. This is a literal that
uses the same start delimiter as a block literal (see above), but rather than
ending the literal through de-indenting from the block's level of indentation,
the literal ends at the end of the line.
```
inline_block =
"""this is all part of the literal
but_this_is_not
```
### Escape Sequences
Format literals in Enso support many kinds of escape sequence. These are
described below.
| Name | Escape Sequence | Unicode | Notes |
| :----------- | :-------------: | :--------: | :---------------------------------------------------------------------------------------- |
| Byte Escape | `\x##` | `U+00##` | 8-bit character specification. |
| U16 Escape | `\u####` | `U+####` | 16-bit unicode character, where each `#` is a hex digit. |
| U21 Escape | `\u{######}` | `U+######` | 21-bit unicode character, where `######` is 1-6 hex digits. |
| U32 Escape | `\U########` | `U+######` | 32-bit unicode character, where each `#` is a hex digit and the first two bytes are `00`. |
| Null | `\0` | `U+0000` | The null character. |
| Alert | `\a` | `U+0007` | The bell/alert character. |
| Backspace | `\b` | `U+0008` | The backspace character. |
| Form Feed | `\f` | `U+000C` | The form-feed character. |
| LF | `\n` | `U+000A` | The line-feed character (newline on unix systems). |
| CR | `\r` | `U+000D` | The carriage return character (part of newline on windows systems). |
| Tab | `\t` | `U+0009` | The horizontal tab character. |
| Vertical Tab | `\v` | `U+000B` | The vertical tab character. |
| Backslash | `\\` | `U+005C` | A literal backslash character. |
| Double Quote | `\"` | `U+0022` | A literal double quote character. |
| Single Quote | `\'` | `U+0027` | A literal single quote character. |
| Backtick | `` \` `` | `U+0060` | A literal backtick character. |
The only one of the above escape sequences that is supported in a raw text
literal is `\"`. All other occurrences of `\` in such literals are treated as a
literal backslash.
## Vector Literals
Enso also supports vector literals, which allow users to create literal vectors

View File

@ -28,6 +28,7 @@ provide their users with access to the compilation and type-checking phases
<!-- MarkdownTOC levels="2,3" autolink="true" -->
- [Annotations](#annotations)
- [Annotation Naming](#annotation-naming)
- [Automatic Deriving](#automatic-deriving)
<!-- /MarkdownTOC -->
@ -66,6 +67,13 @@ that we are able to reserve words such as `type` to ensure that users can always
have a good sense of what the most common constructs in the language mean,
rather than allowing them to be overridden outside of the stdlib.
### Annotation Naming
The naming of annotations follows the standard rules that Enso uses for naming
its [identifiers](./naming.md#naming-constructs). This means that they can be in
either referent or variable form, as the annotation head is _not_ a
[pattern context](./naming.md#pattern-contexts).
## Automatic Deriving
In order to make the language easier to debug, we have all types automatically

View File

@ -1,15 +0,0 @@
[package]
name = "flexer-test-definition"
version = "0.1.0"
authors = ["Enso Team <enso-dev@enso.org>"]
edition = "2018"
publish = false
[lib]
crate-type = ["cdylib", "rlib"]
test = true
bench = true
[dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }

View File

@ -1,282 +0,0 @@
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains the code defining a lexer for the following small language. Due to the way in
//! which the code-generation from the flexer is used, it has to be defined in a separate crate from
//! the site at which it's used. For the actual tests of this code, please see
//! `flexer-testing/generation`.
//!
//! The language here is being defined as follows:
//!
//! a-word = 'a'+;
//! b-word = 'b'+;
//! word = a-word | b-word;
//! space = ' ';
//! spaced-word = space, word;
//! language = word, spaced-word*;
//!
//! Please note that there is a fair amount of duplicated code between this test and the
//! `lexer_generated_api_test` file. This is to present the full view of what each portion of the
//! process looks like.
use flexer::prelude::*;
use flexer::*;
use flexer;
use flexer::automata::pattern::Pattern;
use flexer::group::Registry;
use flexer::prelude::logger::Disabled;
use flexer::prelude::reader::BookmarkManager;
// ====================
// === Type Aliases ===
// ====================
type Logger = Disabled;
// ===========
// === AST ===
// ===========
/// A very simple AST, sufficient for the simple language being defined.
#[derive(Clone,Debug,PartialEq)]
pub enum Token {
/// A word from the input, consisting of a sequence of all `a` or all `b`.
Word(String),
/// A token that the lexer is unable to recognise.
Unrecognized(String),
}
impl Token {
/// Construct a new word token.
pub fn word(name:impl Into<String>) -> Token {
Token::Word(name.into())
}
/// Construct a new unrecognized token.
pub fn unrecognized(name:impl Into<String>) -> Token {
Token::Unrecognized(name.into())
}
}
/// A representation of a stream of tokens.
#[allow(missing_docs)]
#[derive(Clone,Debug,Default,PartialEq)]
pub struct TokenStream {
tokens:Vec<Token>
}
impl TokenStream {
/// Append the provided token to the token stream.
pub fn push(&mut self,token:Token) {
self.tokens.push(token);
}
}
// === Trait Impls ===
impl From<Vec<Token>> for TokenStream {
fn from(tokens: Vec<Token>) -> Self {
TokenStream {tokens}
}
}
// ==================
// === Test Lexer ===
// ==================
/// The definition of a test lexer for the above-described language.
#[derive(Debug)]
pub struct TestLexer {
lexer:Flexer<TestState,TokenStream,Logger>
}
impl Deref for TestLexer {
type Target = Flexer<TestState,TokenStream,Logger>;
fn deref(&self) -> &Self::Target {
&self.lexer
}
}
impl DerefMut for TestLexer {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.lexer
}
}
impl TestLexer {
/// Creates a new instance of this lexer.
pub fn new() -> Self {
let logger = Logger::new("TestLexer");
let lexer = Flexer::new(logger);
TestLexer{lexer}
}
}
/// Rules for the root state.
#[allow(dead_code,missing_docs)]
impl TestLexer {
fn on_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
let str = self.current_match.clone();
let ast = Token::Word(str);
self.output.push(ast);
let id = self.seen_first_word_state;
self.push_state(id);
}
fn on_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
let ast = Token::Unrecognized(self.current_match.clone());
self.output.push(ast);
}
fn on_no_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) {}
fn rules_in_root(lexer:&mut TestLexer) {
let a_word = Pattern::char('a').many1();
let b_word = Pattern::char('b').many1();
let any = Pattern::any();
let end = Pattern::eof();
let root_group_id = lexer.initial_state;
let root_group = lexer.groups_mut().group_mut(root_group_id);
root_group.create_rule(&a_word,"self.on_first_word(reader)");
root_group.create_rule(&b_word,"self.on_first_word(reader)");
root_group.create_rule(&end, "self.on_no_err_suffix_first_word(reader)");
root_group.create_rule(&any, "self.on_err_suffix_first_word(reader)");
}
}
/// Rules for the "seen first word" state.
#[allow(dead_code,missing_docs)]
impl TestLexer {
fn on_spaced_word<R:LazyReader>(&mut self, _reader:&mut R) {
let str = self.current_match.clone();
let ast = Token::Word(String::from(str.trim()));
self.output.push(ast);
}
fn on_err_suffix<R:LazyReader>(&mut self, reader:&mut R) {
self.on_err_suffix_first_word(reader);
self.pop_state();
}
fn on_no_err_suffix<R:LazyReader>(&mut self, reader:&mut R) {
self.on_no_err_suffix_first_word(reader);
self.pop_state();
}
fn rules_in_seen_first_word(lexer:&mut TestLexer) {
let a_word = Pattern::char('a').many1();
let b_word = Pattern::char('b').many1();
let space = Pattern::char(' ');
let spaced_a_word = &space >> &a_word;
let spaced_b_word = &space >> &b_word;
let any = Pattern::any();
let end = Pattern::eof();
let seen_first_word_group_id = lexer.seen_first_word_state;
let seen_first_word_group = lexer.groups_mut().group_mut(seen_first_word_group_id);
seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)");
seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)");
seen_first_word_group.create_rule(&end, "self.on_no_err_suffix(reader)");
seen_first_word_group.create_rule(&any, "self.on_err_suffix(reader)");
}
}
// === Trait Impls ===
impl flexer::Definition for TestLexer {
fn define() -> Self {
let mut lexer = TestLexer::new();
TestLexer::rules_in_seen_first_word(&mut lexer);
TestLexer::rules_in_root(&mut lexer);
lexer
}
fn groups(&self) -> &Registry {
self.lexer.groups()
}
fn set_up(&mut self) {}
fn tear_down(&mut self) {}
}
impl Default for TestLexer {
fn default() -> Self {
TestLexer::new()
}
}
// ===================
// === Lexer State ===
// ===================
/// The stateful components of the test lexer.
#[derive(Debug)]
pub struct TestState {
/// The registry for groups in the lexer.
lexer_states:group::Registry,
/// The initial state of the lexer.
initial_state:group::Identifier,
/// The state entered when the first word has been seen.
seen_first_word_state:group::Identifier,
/// The bookmarks for this lexer.
bookmarks:BookmarkManager
}
// === Trait Impls ===
impl flexer::State for TestState {
fn new(_logger:&impl AnyLogger) -> Self {
let mut lexer_states = group::Registry::default();
let initial_state = lexer_states.define_group("ROOT",None);
let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None);
let bookmarks = BookmarkManager::new();
Self{lexer_states,initial_state,seen_first_word_state,bookmarks}
}
fn initial_state(&self) -> group::Identifier {
self.initial_state
}
fn groups(&self) -> &group::Registry {
&self.lexer_states
}
fn groups_mut(&mut self) -> &mut group::Registry {
&mut self.lexer_states
}
fn bookmarks(&self) -> &BookmarkManager {
&self.bookmarks
}
fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
&mut self.bookmarks
}
fn specialize(&self) -> Result<String,GenError> {
generate::specialize(self,"TestLexer","TokenStream")
}
}

View File

@ -1,20 +0,0 @@
[package]
name = "flexer-test-generation"
version = "0.1.0"
authors = ["Enso Team <enso-dev@enso.org>"]
edition = "2018"
publish = false
[lib]
crate-type = ["cdylib", "rlib"]
test = true
bench = true
[dependencies]
flexer = { path = "../../flexer" , version = "0.1.0" }
flexer-test-definition = { path = "../definition", version = "0.1.0" }
[build-dependencies]
flexer = { path = "../../flexer" , version = "0.1.0" }
flexer-test-definition = { path = "../definition", version = "0.1.0" }

View File

@ -1,32 +0,0 @@
use std::fs::File;
use std::io::prelude::*;
use flexer_test_definition::TestLexer;
use flexer::Definition;
use flexer::State;
/// Generates the lexer engine and saves the result into the file `src/generated/engine.rs`.
///
/// The content of the generated file can be used with the `include!` macro.
fn generate_engine() -> std::io::Result<()> {
let definition_path = "../definition/src/lib.rs";
let output_directory = "src/generated";
let _ = std::fs::create_dir(output_directory);
let output_path = "src/generated/engine.rs";
let definition_error = format!("The lexer definition should exist at {}.",definition_path);
let output_error = format!("Cannot open output file at {}.",output_path);
let mut lexer_def = File::open(definition_path).expect(definition_error.as_str());
let mut contents = String::new();
let mut file = File::create(output_path).expect(output_error.as_str());
let lexer = TestLexer::define();
let engine = lexer.specialize().unwrap();
lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition.");
file.write_all(contents.as_bytes()).expect("Unable to write lexer definition.");
file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization.");
Ok(())
}
fn main() -> std::io::Result<()> {
generate_engine()
}

View File

@ -1,3 +0,0 @@
//! This module serves to re-export the generated lexer.
pub mod engine;

View File

@ -1,19 +0,0 @@
//! This library exposes the specialized version of the Enso lexer.
//!
//! Its sole purpose is to avoid the lexer definition getting out of sync with its implementation
//! (the generated engine), which requires the engine to live in a separate crate.
//!
//! This separation enables generation of the Enso lexer source code with `build.rs` during
//! compilation. Its output is then stored in a new file `engine.rs` and exported by `lexer.rs`.
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
pub mod generated;

View File

@ -1,110 +0,0 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for the generated lexer.
use flexer::prelude::*;
use flexer::prelude::reader::decoder::DecoderUTF8;
use flexer_test_generation::generated::engine::TestLexer;
use flexer_test_generation::generated::engine::Token;
use flexer_test_generation::generated::engine::TokenStream;
// =============
// === Tests ===
// =============
/// Executes the test on the provided input string slice.
fn run_test_on(str:impl AsRef<str>) -> TokenStream {
// Hardcoded for ease of use here.
let reader = Reader::new(str.as_ref().as_bytes(), DecoderUTF8());
let mut lexer = TestLexer::new();
let run_result = lexer.run(reader);
match run_result.kind {
flexer::ResultKind::Success => run_result.tokens,
_ => default()
}
}
#[test]
fn test_single_a_word() {
let input = "aaaaa";
let expected_output = TokenStream::from(vec![Token::word(input)]);
let result = run_test_on(input);
assert_eq!(result, expected_output);
}
#[test]
fn test_single_b_word() {
let input = "bbbbb";
let expected_output = TokenStream::from(vec![Token::word(input)]);
let result = run_test_on(input);
assert_eq!(result, expected_output);
}
#[test]
fn test_two_word() {
let input = "aaaaa bbbbb";
let expected_output = TokenStream::from(vec![Token::word("aaaaa"), Token::word("bbbbb")]);
let result = run_test_on(input);
assert_eq!(result, expected_output);
}
#[test]
fn test_multi_word() {
let input = "bbb aa a b bbbbb aa";
let expected_output = TokenStream::from(vec![
Token::word("bbb"),
Token::word("aa"),
Token::word("a"),
Token::word("b"),
Token::word("bbbbb"),
Token::word("aa")
]);
let result = run_test_on(input);
assert_eq!(result, expected_output);
}
#[test]
fn test_invalid_single_word() {
let input = "c";
let expected_output = TokenStream::from(vec![Token::unrecognized(input)]);
let result = run_test_on(input);
assert_eq!(result, expected_output);
}
#[test]
fn test_multi_word_invalid() {
let input = "aaaaaa c bbbbbb";
let expected_output = TokenStream::from(vec![
Token::word("aaaaaa"),
Token::unrecognized(" "),
Token::unrecognized("c"),
Token::unrecognized(" "),
Token::word("bbbbbb"),
]);
let result = run_test_on(input);
assert_eq!(result, expected_output);
}
#[test]
fn test_end_invalid() {
let input = "bbbbbb c";
let expected_output = TokenStream::from(vec![
Token::word("bbbbbb"),
Token::unrecognized(" "),
Token::unrecognized("c"),
]);
let result = run_test_on(input);
assert_eq!(result, expected_output);
}

View File

@ -1,39 +0,0 @@
[package]
name = "flexer"
version = "0.1.0"
authors = ["Enso Team <enso-dev@enso.org>"]
edition = "2018"
description = "A finite-automata-based lexing engine."
readme = "README.md"
homepage = "https://github.com/enso-org/enso/lib/rust/flexer"
repository = "https://github.com/enso-org/enso"
license-file = "../../../LICENSE"
keywords = ["lexer", "finite-automata"]
categories = ["parsing"]
publish = false
[lib]
name = "flexer"
crate-type = ["cdylib", "rlib"]
test = true
bench = true
[dependencies]
enso-logger = { version = "0.1.1" }
enso-prelude = { version = "0.1.3" }
enso-lazy-reader = { version = "= 0.1.0" }
enso-macro-utils = { version = "0.1.1" }
itertools = "0.8"
proc-macro2 = "1.0.19"
nonempty = "0.1.5"
quote = "1.0"
syn = { version = "1.0.12", features = ["full", "extra-traits", "visit-mut", "visit", "parsing", "printing"] }
unicode-segmentation = "1.6.0"
wasm-bindgen = "0.2"
[dev-dependencies]
wasm-bindgen-test = "0.2"

View File

@ -1,4 +0,0 @@
# Flexer
This library provides a finite-automata-based lexing engine that can flexibly
tokenize an input stream.

View File

@ -1,9 +0,0 @@
//! Provides an API for the construction of finite state automata, in both their deterministic and
//! non-deterministic forms.
pub mod alphabet;
pub mod dfa;
pub mod nfa;
pub mod pattern;
pub mod state;
pub mod symbol;

View File

@ -1,130 +0,0 @@
//! Exports an alphabet for an arbitrary finite state automaton.
use crate::prelude::*;
use crate::automata::symbol::Symbol;
use std::collections::BTreeSet;
use std::ops::RangeInclusive;
// ====================
// === Segmentation ===
// ====================
/// A representation of the distinct intervals over the input alphabet for a given finite state
/// automaton.
///
/// These intervals are defined by a set of _divisions_ of the input alphabet, where each division
/// is represented as a point in that alphabet. This is necessary to allow for efficient encoding of
/// state transitions that trigger not just on _one_, but potentially on _many_ of the input
/// symbols in the automaton's alphabet.
///
/// This is best explained by way of example. Consider the original unbounded alphabet:
///
/// ```text
/// ... a b c d e f g h ... z ...
/// ```
///
/// We want to add a rule that matches on the interval `[b, d]`. This results in there being three
/// intervals on the alphabet, as there are two divisions (annotated below):
///
/// ```text
/// ... a | b c d | e f g h ... z ...
/// div: 1 2
/// seg: 1 2 3
/// ```
///
/// If we then add a rule that matches on the interval `[d, f]`, we end up with five intervals on
/// the alphabet, with four divisions (annotated below):
///
/// ```text
/// ... a | b c | d | e f | g h ... z ...
/// div: 1 2 3 4
/// seg: 1 2 3 4 5
/// ```
///
/// This type tracks these divisions explicitly for an input alphabet defined for all automata in
/// this library as `0u32..=u32::max_value()`.
#[derive(Clone,Debug,PartialEq,Eq)]
#[allow(missing_docs)]
pub struct Segmentation {
pub divisions:BTreeSet<Symbol>
}
impl Segmentation {
/// Inserts a range of symbols into the alphabet.
pub fn insert(&mut self, range:RangeInclusive<Symbol>) {
self.divisions.insert(Symbol::from(range.start()));
if range.end().value != Symbol::EOF_CODE.value {
self.divisions.insert(Symbol{value:range.end().value + 1});
}
}
/// Creates a [`Segmentation`] from an input set of divisions.
pub fn from_divisions(divisions:&[u32]) -> Self {
let mut dict = Self::default();
for val in divisions {
dict.divisions.insert(Symbol::from(*val));
}
dict
}
/// Obtains the divisions in the alphabet segmentation as a vector.
pub fn divisions_as_vec(&self) -> Vec<Division> {
self.divisions.iter().copied().enumerate().map(From::from).collect()
}
}
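// === Usage Sketch ===

// An illustrative usage sketch mirroring the `[b,d]` / `[d,f]` example in the
// doc comment above: inserting a range records the start of the range and the
// symbol one past its end, in addition to the default division at 0.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn divisions_from_two_ranges() {
        let mut seg = Segmentation::default();
        seg.insert(Symbol::from(98u32)..=Symbol::from(100u32));  // The range [b,d].
        seg.insert(Symbol::from(100u32)..=Symbol::from(102u32)); // The range [d,f].
        // The divisions are now {0, 98, 100, 101, 103}.
        assert_eq!(seg.divisions.len(),5);
    }
}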
// === Trait Impls ===
impl Default for Segmentation {
fn default() -> Self {
let mut divisions: BTreeSet<Symbol> = default();
// The existence of the default (0) member in the set is assumed by the implementation of
// the NFA -> DFA conversion.
divisions.insert(default());
Segmentation{divisions}
}
}
// ================
// === Division ===
// ================
/// A division of the alphabet used by the lexer.
#[derive(Copy,Clone,Debug,PartialEq,Eq)]
pub struct Division {
/// The position of the division.
pub position : usize,
/// The symbol at which it divides the alphabet.
pub symbol : Symbol,
}
impl Division {
/// Create a new division.
pub fn new(position:usize, symbol:Symbol) -> Division {
Division{position,symbol}
}
}
// === Trait Impls ===
impl Into<(usize,Symbol)> for Division {
fn into(self) -> (usize, Symbol) {
(self.position,self.symbol)
}
}
impl From<(usize,Symbol)> for Division {
fn from((position, symbol): (usize, Symbol)) -> Self {
Division::new(position,symbol)
}
}

View File

@ -1,178 +0,0 @@
//! The structure for defining deterministic finite automata.
use crate::automata::alphabet;
use crate::automata::state;
use crate::data::matrix::Matrix;
// =====================================
// === Deterministic Finite Automata ===
// =====================================
/// The definition of a [DFA](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) for a
/// given set of symbols, states, and transitions.
///
/// A DFA is a finite state automaton that accepts or rejects a given sequence of symbols by
/// executing on a sequence of states _uniquely_ determined by the sequence of input symbols.
///
/// ```text
/// ┌───┐ 'D' ┌───┐ 'F' ┌───┐ 'A' ┌───┐
/// │ 0 │ ----> │ 1 │ ----> │ 2 │ ----> │ 3 │
/// └───┘ └───┘ └───┘ └───┘
/// ```
#[derive(Clone,Debug,Default,Eq,PartialEq)]
pub struct DFA {
/// A set of disjoint intervals over the allowable input alphabet.
pub alphabet_segmentation:alphabet::Segmentation,
/// The transition matrix for the DFA.
///
/// It represents a function of type `(state, symbol) -> state`, returning the identifier for
/// the new state.
///
/// For example, the transition matrix for an automaton that accepts the language
/// `{"A" | "B"}*"` would appear as follows, with `-` denoting
/// [the invalid state](state::Identifier::INVALID). The leftmost column encodes the input
/// state, while the topmost row encodes the input symbols.
///
/// | | A | B |
/// |:-:|:-:|:-:|
/// | 0 | 1 | - |
/// | 1 | - | 0 |
///
pub links:Matrix<state::Identifier>,
/// A collection of callbacks for each state (indexable in order)
pub callbacks:Vec<Option<RuleExecutable>>,
}
impl DFA {
/// Check whether the DFA has a rule for the target state.
///
/// This method should only be used in generated code, where its invariants are already checked.
///
/// # Panics
///
/// If no callback exists for `target_state`.
pub fn has_rule_for(&self, target_state:state::Identifier) -> bool {
self.callbacks.get(target_state.id).unwrap().is_some()
}
}
// === Trait Impls ===
impl From<Vec<Vec<usize>>> for Matrix<state::Identifier> {
fn from(input:Vec<Vec<usize>>) -> Self {
let rows = input.len();
let columns = if rows == 0 {0} else {input[0].len()};
let mut matrix = Self::new(rows,columns);
for row in 0..rows {
for column in 0..columns {
matrix[(row,column)] = state::Identifier::from(input[row][column]);
}
}
matrix
}
}
// ================
// === Callback ===
// ================
/// The callback associated with an arbitrary state of a finite automaton.
///
/// It contains the rust code that is intended to be executed after encountering a
/// [`pattern`](super::pattern::Pattern) that causes the associated state transition. This pattern
/// is declared in [`Rule.pattern`](crate::group::rule::Rule::pattern).
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct RuleExecutable {
/// A description of the priority with which the callback is constructed during codegen.
pub priority:usize,
/// The rust code that will be executed when running this callback.
pub code:String,
}
impl RuleExecutable {
/// Creates a new rule executable with the provided `priority` and `code`.
pub fn new(priority:usize, code_str:impl Into<String>) -> RuleExecutable {
let code = code_str.into();
RuleExecutable{priority,code}
}
}
// =============
// === Tests ===
// =============
#[cfg(test)]
pub mod tests {
use crate::automata::state;
use super::*;
const INVALID:usize = state::Identifier::INVALID.id;
/// DFA automaton that accepts newline '\n'.
pub fn newline() -> DFA {
DFA {
alphabet_segmentation:alphabet::Segmentation::from_divisions(&[10,11]),
links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
callbacks:vec![
None,
Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}),
],
}
}
/// DFA automaton that accepts any letter a..=z.
pub fn letter() -> DFA {
DFA {
alphabet_segmentation:alphabet::Segmentation::from_divisions(&[97,123]),
links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
callbacks:vec![
None,
Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}),
],
}
}
/// DFA automaton that accepts any number of spaces ' '.
pub fn spaces() -> DFA {
DFA {
alphabet_segmentation:alphabet::Segmentation::from_divisions(&[0,32,33]),
links:Matrix::from(vec![
vec![INVALID,1,INVALID],
vec![INVALID,2,INVALID],
vec![INVALID,2,INVALID],
]),
callbacks:vec![
None,
Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}),
Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}),
],
}
}
/// DFA automaton that accepts one letter a..=z or many spaces ' '.
pub fn letter_and_spaces() -> DFA {
DFA {
alphabet_segmentation:alphabet::Segmentation::from_divisions(&[32,33,97,123]),
links:Matrix::from(vec![
vec![INVALID, 1,INVALID, 2,INVALID],
vec![INVALID, 3,INVALID,INVALID,INVALID],
vec![INVALID,INVALID,INVALID,INVALID,INVALID],
vec![INVALID, 3,INVALID,INVALID,INVALID],
]),
callbacks:vec![
None,
Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}),
Some(RuleExecutable {priority:4, code:"group_0_rule_0".into()}),
Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}),
],
}
}
}

View File

@ -1,345 +0,0 @@
//! The structure for defining non-deterministic finite automata.
use crate::automata::alphabet;
use crate::automata::dfa::DFA;
use crate::automata::dfa::RuleExecutable;
use crate::automata::pattern::Pattern;
use crate::automata::state::State;
use crate::automata::state::Transition;
use crate::automata::state;
use crate::automata::symbol::Symbol;
use crate::data::matrix::Matrix;
use itertools::Itertools;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::ops::RangeInclusive;
use crate::prelude::*;
// =========================================
// === Non-Deterministic Finite Automata ===
// =========================================
/// A state identifier based on a set of states.
///
/// This is used during the NFA -> DFA transformation, where multiple states can merge together due
/// to the collapsing of epsilon transitions.
type StateSetId = BTreeSet<state::Identifier>;
/// The definition of an [NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) for a
/// given set of symbols, states, and transitions (specifically an NFA with ε-moves).
///
/// An NFA is a finite state automaton that accepts or rejects a given sequence of symbols. In
/// contrast with a DFA, the NFA may transition between states _without_ reading any new symbol
/// through use of
/// [epsilon links](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton#NFA_with_%CE%B5-moves).
///
/// ```text
/// ┌───┐ 'N' ┌───┐ ┌───┐ 'F' ┌───┐ ┌───┐ 'A' ┌───┐
/// │ 0 │ ----> │ 1 │ -> │ 2 │ ----> │ 3 │ -> │ 4 │ ----> │ 5 │
/// └───┘ └───┘ ε └───┘ └───┘ ε └───┘ └───┘
/// ```
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct NFA {
/// A set of disjoint intervals over the input alphabet.
pub alphabet_segmentation:alphabet::Segmentation,
/// A set of named NFA states, with (epsilon) transitions.
pub states:Vec<State>,
}
impl NFA {
/// Adds a new state to the NFA and returns its identifier.
pub fn new_state(&mut self) -> state::Identifier {
let id = self.states.len();
self.states.push(State::default());
state::Identifier{id}
}
/// Creates an epsilon transition between two states.
///
/// Whenever the automaton happens to be in `source` state it can immediately transition to the
/// `target` state. It is, however, not _required_ to do so.
pub fn connect(&mut self, source:state::Identifier, target:state::Identifier) {
self.states[source.id].epsilon_links.push(target);
}
/// Creates an ordinary transition for a range of symbols.
///
/// If any symbol from this range happens to be the input when the automaton is in the `source`
/// state, it will immediately transition to the `target` state.
pub fn connect_via
( &mut self
, source : state::Identifier
, target_state : state::Identifier
, symbols : &RangeInclusive<Symbol>
) {
self.alphabet_segmentation.insert(symbols.clone());
self.states[source.id].links.push(Transition{symbols:symbols.clone(),target_state});
}
/// Transforms a pattern to an NFA using the algorithm described
/// [here](https://www.youtube.com/watch?v=RYNN-tb9WxI).
/// The asymptotic complexity is linear in number of symbols.
pub fn new_pattern(&mut self, source:state::Identifier, pattern:&Pattern) -> state::Identifier {
let current = self.new_state();
self.connect(source,current);
match pattern {
Pattern::Range(range) => {
let state = self.new_state();
self.connect_via(current,state,range);
state
},
Pattern::Many(body) => {
let s1 = self.new_state();
let s2 = self.new_pattern(s1,body);
let s3 = self.new_state();
self.connect(current,s1);
self.connect(current,s3);
self.connect(s2,s3);
self.connect(s3,s1);
s3
},
Pattern::Seq(patterns) => {
patterns.iter().fold(current,|s,pat| self.new_pattern(s,pat))
},
Pattern::Or(patterns) => {
let states = patterns.iter().map(|pat| self.new_pattern(current,pat)).collect_vec();
let end = self.new_state();
for state in states {
self.connect(state,end);
}
end
},
Pattern::Always => current,
}
}
/// Merges states that are connected by epsilon links, using an algorithm based on the one shown
/// [here](https://www.youtube.com/watch?v=taClnxU-nao).
fn eps_matrix(&self) -> Vec<StateSetId> {
fn fill_eps_matrix
( nfa : &NFA
, states : &mut Vec<StateSetId>
, visited : &mut Vec<bool>
, state : state::Identifier
) {
let mut state_set = StateSetId::new();
visited[state.id] = true;
state_set.insert(state);
for &target in &nfa.states[state.id].epsilon_links {
if !visited[target.id] {
fill_eps_matrix(nfa,states,visited,target);
}
state_set.insert(target);
state_set.extend(states[target.id].iter());
}
states[state.id] = state_set;
}
let mut states = vec![StateSetId::new(); self.states.len()];
for id in 0..self.states.len() {
let mut visited = vec![false; states.len()];
fill_eps_matrix(self,&mut states,&mut visited,state::Identifier{id});
}
states
}
/// Computes a transition matrix `(state, symbol) => state` for the NFA, ignoring epsilon links.
fn nfa_matrix(&self) -> Matrix<state::Identifier> {
let mut matrix = Matrix::new(self.states.len(),self.alphabet_segmentation.divisions.len());
for (state_ix, source) in self.states.iter().enumerate() {
let targets = source.targets(&self.alphabet_segmentation);
for (voc_ix, &target) in targets.iter().enumerate() {
matrix[(state_ix,voc_ix)] = target;
}
}
matrix
}
}
// === Trait Impls ===
impl From<&NFA> for DFA {
/// Transforms an NFA into a DFA, based on the algorithm described
/// [here](https://www.youtube.com/watch?v=taClnxU-nao).
/// The asymptotic complexity is quadratic in number of states.
fn from(nfa:&NFA) -> Self {
let nfa_mat = nfa.nfa_matrix();
let eps_mat = nfa.eps_matrix();
let mut dfa_mat = Matrix::new(0,nfa.alphabet_segmentation.divisions.len());
let mut dfa_eps_ixs = Vec::<StateSetId>::new();
let mut dfa_eps_map = HashMap::<StateSetId,state::Identifier>::new();
dfa_eps_ixs.push(eps_mat[0].clone());
dfa_eps_map.insert(eps_mat[0].clone(),state::Identifier::from(0));
let mut i = 0;
while i < dfa_eps_ixs.len() {
dfa_mat.new_row();
for voc_ix in 0..nfa.alphabet_segmentation.divisions.len() {
let mut eps_set = StateSetId::new();
for &eps_ix in &dfa_eps_ixs[i] {
let tgt = nfa_mat[(eps_ix.id,voc_ix)];
if tgt != state::Identifier::INVALID {
eps_set.extend(eps_mat[tgt.id].iter());
}
}
if !eps_set.is_empty() {
dfa_mat[(i,voc_ix)] = match dfa_eps_map.get(&eps_set) {
Some(&id) => id,
None => {
let id = state::Identifier::new(dfa_eps_ixs.len());
dfa_eps_ixs.push(eps_set.clone());
dfa_eps_map.insert(eps_set,id);
id
},
};
}
}
i += 1;
}
let mut callbacks = vec![None; dfa_eps_ixs.len()];
let priority = dfa_eps_ixs.len();
for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() {
let has_name = |&key:&state::Identifier| nfa.states[key.id].name.is_some();
if let Some(eps) = epss.into_iter().find(has_name) {
let code = nfa.states[eps.id].name.as_ref().cloned().unwrap();
callbacks[dfa_ix] = Some(RuleExecutable {code,priority});
}
}
let alphabet_segmentation = nfa.alphabet_segmentation.clone();
let links = dfa_mat;
DFA{alphabet_segmentation,links,callbacks}
}
}
// ===========
// == Tests ==
// ===========
#[cfg(test)]
pub mod tests {
extern crate test;
use crate::automata::dfa;
use super::*;
use test::Bencher;
/// NFA that accepts a newline '\n'.
pub fn newline() -> NFA {
NFA {
states:vec![
State::from(vec![1]),
State::from(vec![(10..=10,2)]),
State::from(vec![3]).named("group_0_rule_0"),
State::default(),
],
alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![10, 11].as_slice()),
}
}
/// NFA that accepts any letter in the range a..=z.
pub fn letter() -> NFA {
NFA {
states:vec![
State::from(vec![1]),
State::from(vec![(97..=122,2)]),
State::from(vec![3]).named("group_0_rule_0"),
State::default(),
],
alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![97, 123].as_slice()),
}
}
/// NFA that accepts any number of spaces ' '.
pub fn spaces() -> NFA {
NFA {
states:vec![
State::from(vec![1]),
State::from(vec![2]),
State::from(vec![(32..=32,3)]),
State::from(vec![4]),
State::from(vec![5,8]),
State::from(vec![6]),
State::from(vec![(32..=32,7)]),
State::from(vec![8]),
State::from(vec![5,9]).named("group_0_rule_0"),
State::default(),
],
alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![0, 32, 33].as_slice()),
}
}
/// NFA that accepts one letter a..=z or many spaces ' '.
pub fn letter_and_spaces() -> NFA {
NFA {
states:vec![
State::from(vec![1,3]),
State::from(vec![(97..=122,2)]),
State::from(vec![11]).named("group_0_rule_0"),
State::from(vec![4]),
State::from(vec![(32..=32,5)]),
State::from(vec![6]),
State::from(vec![7,10]),
State::from(vec![8]),
State::from(vec![(32..=32,9)]),
State::from(vec![10]),
State::from(vec![7,11]).named("group_0_rule_1"),
State::default(),
],
alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![32, 33, 97, 123].as_slice()),
}
}
#[test]
fn test_to_dfa_newline() {
assert_eq!(DFA::from(&newline()),dfa::tests::newline());
}
#[test]
fn test_to_dfa_letter() {
assert_eq!(DFA::from(&letter()),dfa::tests::letter());
}
#[test]
fn test_to_dfa_spaces() {
assert_eq!(DFA::from(&spaces()),dfa::tests::spaces());
}
#[test]
fn test_to_dfa_letter_and_spaces() {
assert_eq!(DFA::from(&letter_and_spaces()),dfa::tests::letter_and_spaces());
}
#[bench]
fn bench_to_dfa_newline(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&newline()))
}
#[bench]
fn bench_to_dfa_letter(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&letter()))
}
#[bench]
fn bench_to_dfa_spaces(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&spaces()))
}
#[bench]
fn bench_to_dfa_letter_and_spaces(bencher:&mut Bencher) {
bencher.iter(|| DFA::from(&letter_and_spaces()))
}
}

View File

@ -1,194 +0,0 @@
//! Simple API for constructing regex patterns that are used in parser implementation.
#[macro_use]
mod macros;
use crate::automata::symbol::Symbol;
use core::iter;
use itertools::Itertools;
use std::ops::BitOr;
use std::ops::RangeInclusive;
use std::ops::Shr;
use Pattern::*;
// =============
// == Pattern ==
// =============
/// A representation of a simple regular pattern.
#[derive(Clone,Debug)]
pub enum Pattern {
/// The pattern that triggers on any symbol from the given range.
Range(RangeInclusive<Symbol>),
/// The pattern that triggers when any of the given patterns matches.
Or(Vec<Pattern>),
/// The pattern that triggers when a sequence of patterns is encountered.
Seq(Vec<Pattern>),
/// The pattern that triggers on 0..N repetitions of the given pattern.
Many(Box<Pattern>),
/// The pattern that always triggers.
Always,
}
impl Pattern {
/// A pattern that never triggers.
pub fn never() -> Self {
Pattern::symbol(Symbol::INVALID_SYMBOL)
}
/// A pattern that always triggers.
pub fn always() -> Self {
Pattern::Always
}
/// A pattern that triggers on any character.
pub fn any() -> Self {
Pattern::symbols(Symbol::from(0)..=Symbol::from(u32::max_value()))
}
/// A pattern that triggers on 0..N repetitions of the pattern described by `self`.
pub fn many(&self) -> Self {
Many(Box::new(self.clone()))
}
/// A pattern that triggers on 1..N repetitions of the pattern described by `self`.
pub fn many1(&self) -> Self {
self.clone() >> self.many()
}
/// A pattern that triggers on 0..=1 repetitions of the pattern described by `self`.
pub fn opt(&self) -> Self {
self.clone() | Self::always()
}
/// A pattern that triggers on the given character.
pub fn char(character:char) -> Self {
Self::symbol(Symbol::from(character))
}
/// A pattern that triggers on the given symbol.
pub fn symbol(symbol:Symbol) -> Self {
Pattern::symbols(symbol..=symbol)
}
/// A pattern that triggers on any of the provided `symbols`.
pub fn symbols(symbols:RangeInclusive<Symbol>) -> Self {
Pattern::Range(symbols)
}
/// A pattern that triggers at the end of the file.
pub fn eof() -> Self {
Self::symbol(Symbol::EOF_CODE)
}
/// A pattern that triggers on any character in the provided `range`.
pub fn range(range:RangeInclusive<char>) -> Self {
Pattern::symbols(Symbol::from(*range.start())..=Symbol::from(*range.end()))
}
/// The pattern that triggers when the sequence of characters given by `chars` is encountered.
pub fn all_of(chars:&str) -> Self {
let mut chars_iter = chars.chars();
if let Some(first) = chars_iter.next() {
chars_iter.fold(Self::char(first),|pat, char| pat >> Self::char(char))
} else {
Pattern::never()
}
}
/// The pattern that triggers on any character contained in `chars`.
pub fn any_of(chars:&str) -> Self {
chars.chars().fold(Self::never(),|pat,char| pat | Self::char(char))
}
/// The pattern that doesn't trigger on any character contained in `chars`.
pub fn none_of(chars:&str) -> Self {
let max = u32::max_value();
let char_iter = chars.chars().map(|char| char as u32);
let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
let mut codes = char_iter2.collect_vec();
codes.sort();
codes.iter().tuple_windows().fold(Self::never(),|pat,(prev_code,next_code)| {
let start = prev_code + 1;
let end = next_code - 1;
if end < start {pat} else {
pat | Pattern::symbols(Symbol::from(start)..=Symbol::from(end))
}
})
}
/// The pattern that triggers on any character but `char`.
pub fn not(char:char) -> Self {
Self::none_of(&char.to_string())
}
/// The pattern that triggers on `num` repetitions of `pat`.
pub fn repeat(pat:Pattern, num:usize) -> Self {
(0..num).fold(Self::always(),|p,_| p >> pat.clone())
}
/// The pattern that triggers on `min..max` repetitions of `pat`.
pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
(min..max).fold(Self::never(),|p,n| p | Self::repeat(pat.clone(),n))
}
}
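// A brief usage sketch, assuming only the combinators defined above: a pattern matching a signed
// decimal number such as "-42" or "7". The helper is illustrative and not used elsewhere.
#[cfg(test)]
#[allow(dead_code)]
fn example_signed_number_pattern() -> Pattern {
    let digit  = Pattern::range('0'..='9');
    let digits = digit.many1();            // one or more digits
    let sign   = Pattern::char('-').opt(); // an optional leading minus
    sign >> digits                         // sequence the two patterns
}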
// === Trait Impls ===
impl BitOr<Pattern> for Pattern {
type Output = Pattern;
fn bitor(self, rhs:Pattern) -> Self::Output {
match (self, rhs) {
(Or(mut lhs), Or( rhs)) => {lhs.extend(rhs) ; Or(lhs)},
(Or(mut lhs), rhs ) => {lhs.push(rhs) ; Or(lhs)},
(lhs , Or(mut rhs)) => {rhs.insert(0,lhs) ; Or(rhs)},
(lhs , rhs ) => Or(vec![lhs,rhs]),
}
}
}
gen_ref_versions!(Pattern,BitOr,bitor);
impl Shr<Pattern> for Pattern {
type Output = Pattern;
fn shr(self, rhs:Pattern) -> Self::Output {
match (self, rhs) {
(Seq(mut lhs), Seq(rhs) ) => {lhs.extend(rhs) ; Seq(lhs)},
(Seq(mut lhs), rhs ) => {lhs.push(rhs) ; Seq(lhs)},
(lhs , Seq(mut rhs)) => {rhs.insert(0,lhs) ; Seq(rhs)},
(lhs , rhs ) => Seq(vec![lhs, rhs]),
}
}
}
gen_ref_versions!(Pattern,Shr,shr);
// =================
// === Utilities ===
// =================
/// Quote a character as a character pattern.
///
/// It is equivalent to `Pattern::char(...)`.
#[macro_export]
macro_rules! c {
($char:literal) => {
Pattern::char($char)
}
}
/// Quote a string as a literal pattern.
///
/// It is equivalent to `Pattern::all_of(...)`.
#[macro_export]
macro_rules! l {
($lit:literal) => {
Pattern::all_of($lit)
}
}
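// A usage sketch for the macros above; the bindings are illustrative only.
#[cfg(test)]
#[allow(dead_code)]
fn example_pattern_macros() {
    let _newline = c!('\n');  // equivalent to Pattern::char('\n')
    let _keyword = l!("def"); // equivalent to Pattern::all_of("def")
}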

View File

@ -1,28 +0,0 @@
//! Useful macros for defining operators over patterns.
/// Generates versions of an operator taking the various combinations of by-reference and by-value arguments.
#[macro_export]
macro_rules! gen_ref_versions {
($ty_name:ty,$opr_name:ident,$fn_name:ident) => (
impl $opr_name<&$ty_name> for &$ty_name {
type Output = $ty_name;
fn $fn_name(self, rhs:&$ty_name) -> Self::Output {
self.clone().$fn_name(rhs.clone())
}
}
impl $opr_name<&$ty_name> for $ty_name {
type Output = $ty_name;
fn $fn_name(self, rhs:&$ty_name) -> Self::Output {
self.$fn_name(rhs.clone())
}
}
impl $opr_name<$ty_name> for &$ty_name {
type Output = $ty_name;
fn $fn_name(self, rhs:$ty_name) -> Self::Output {
self.clone().$fn_name(rhs)
}
}
)
}
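// For reference, an invocation such as `gen_ref_versions!(Pattern,BitOr,bitor)` expands to the
// three reference-taking impls sketched below, each forwarding (via `clone`) to the by-value
// implementation of `BitOr<Pattern> for Pattern`:
//
//   impl BitOr<&Pattern> for &Pattern { ... self.clone().bitor(rhs.clone()) ... }
//   impl BitOr<&Pattern> for Pattern  { ... self.bitor(rhs.clone())         ... }
//   impl BitOr<Pattern>  for &Pattern { ... self.clone().bitor(rhs)         ... }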

View File

@ -1,136 +0,0 @@
//! This module exports the State implementation for nondeterministic finite automata.
use crate::automata::alphabet;
use crate::automata::symbol::Symbol;
use crate::prelude::*;
// ===========
// == State ==
// ===========
/// A named state for a [`super::nfa::NFA`].
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct State {
/// A set of transitions that can trigger without consuming a symbol (ε-transitions).
pub epsilon_links:Vec<Identifier>,
/// The set of transitions that trigger while consuming a specific symbol.
///
/// When triggered, the automaton will transition to the [`Transition::target_state`].
pub links:Vec<Transition>,
/// The name of the state.
///
/// This is used to auto-generate a call to the rust method of the same name.
pub name:Option<String>,
/// The function to call when evaluating the state.
pub callback:String
}
impl State {
/// Sets the `name` field and returns the updated state.
pub fn named(mut self, name:&str) -> Self {
self.name = Some(name.to_owned());
self
}
/// Returns the transition (next state) for each symbol in the alphabet.
pub fn targets(&self, alphabet:&alphabet::Segmentation) -> Vec<Identifier> {
let mut targets = vec![];
let mut index = 0;
let mut links = self.links.clone();
links.sort_by_key(|link| *link.symbols.start());
for &symbol in &alphabet.divisions {
while links.len() > index && *links[index].symbols.end() < symbol {
index += 1;
}
if links.len() <= index || *links[index].symbols.start() > symbol {
targets.push(Identifier::INVALID);
} else {
targets.push(links[index].target_state);
}
}
targets
}
}
// === Trait Impls ===
impl From<Vec<usize>> for State {
/// Creates a state with epsilon links.
fn from(vec:Vec<usize>) -> Self {
let epsilon_links = vec.iter().cloned().map(|id| Identifier{id}).collect();
State{epsilon_links,..Default::default()}
}
}
impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
/// Creates a state with ordinary links.
fn from(vec:Vec<(RangeInclusive<u32>, usize)>) -> Self {
let link = |(range, id): (RangeInclusive<u32>, usize)| {
let start = Symbol{value:*range.start()};
let end = Symbol{value:*range.end()};
Transition{symbols:start..=end,target_state:Identifier{id}}
};
let links = vec.iter().cloned().map(link).collect();
State{links,..Default::default()}
}
}
// ================
// == Identifier ==
// ================
/// A state identifier for an arbitrary finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
#[allow(missing_docs)]
pub struct Identifier {
pub id: usize
}
impl Identifier {
/// An identifier representing the invalid state.
///
/// When in an invalid state, a finite automaton will reject the sequence of input symbols.
pub const INVALID:Identifier = Identifier{id:usize::max_value()};
/// Constructs a new state identifier.
pub fn new(id:usize) -> Identifier {
Identifier{id}
}
}
// === Trait Impls ===
impl Default for Identifier {
/// Returns `Identifier::INVALID`. This is because every finite automaton has an invalid state,
/// and because all transitions in the automaton's transition matrix lead to the invalid state by default.
fn default() -> Self {
Identifier::INVALID
}
}
impl From<usize> for Identifier {
fn from(id: usize) -> Self {
Identifier{id}
}
}
// ============
// === Link ===
// ============
/// A transition between states in a finite automaton that must consume a symbol to trigger.
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct Transition {
/// The range of symbols on which this transition will trigger.
pub symbols:RangeInclusive<Symbol>,
/// The state that is entered after the transition has triggered.
pub target_state:Identifier,
}

View File

@ -1,53 +0,0 @@
//! Defines a Symbol that is operated on by the finite automata.
// ==============
// === Symbol ===
// ==============
/// An input symbol to a finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
pub struct Symbol {
/// The 4-byte representation of the symbol.
pub value:u32
}
impl Symbol {
/// A representation of the null symbol.
pub const NULL:Symbol = Symbol{value:0};
/// A representation of the end of the file.
pub const EOF_CODE:Symbol = Symbol{value:u32::max_value()};
/// A representation of an arbitrary invalid unicode symbol.
pub const INVALID_SYMBOL:Symbol = Symbol{value:0xFFFF};
/// A representation of the group reaching its end without matching.
pub const INCOMPLETE_GROUP:Symbol = Symbol{value:u32::max_value() - 1};
}
// === Trait Impls ===
impl Default for Symbol {
fn default() -> Self {
Symbol::NULL
}
}
impl From<u32> for Symbol {
fn from(value:u32) -> Symbol {
Symbol{value}
}
}
impl From<char> for Symbol {
fn from(value:char) -> Symbol {
Symbol{value:value as u32}
}
}
impl From<&Symbol> for Symbol {
fn from(symbol:&Symbol) -> Self {
let value = symbol.value;
Symbol{value}
}
}

View File

@ -1,3 +0,0 @@
//! Generic data-structures to support multiple use-cases.
pub mod matrix;

View File

@ -1,75 +0,0 @@
//! An efficient representation of a 2D matrix.
use crate::prelude::*;
use std::ops::Index;
use std::ops::IndexMut;
// ============
// == Matrix ==
// ============
/// An efficient 2D matrix implemented on top of [`std::vec::Vec`].
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct Matrix<T> {
/// The number of rows in the matrix.
rows:usize,
/// The number of columns in the matrix.
columns:usize,
/// The matrix.
matrix:Vec<T>,
}
impl<T> Matrix<T> {
/// Get the number of rows in the matrix.
pub fn rows(&self) -> usize {
self.rows
}
/// Get the number of columns in the matrix.
pub fn columns(&self) -> usize {
self.columns
}
/// Obtain the indices for the rows in this matrix.
pub fn row_indices(&self) -> Range<usize> {
0..self.rows()
}
}
impl<T:Default> Matrix<T> {
/// Constructs a matrix with the dimensions given by `rows` and `columns`.
pub fn new(rows:usize, columns:usize) -> Self {
let mut matrix = Vec::with_capacity(rows*columns);
for _ in 0..matrix.capacity() {
matrix.push(default())
}
Self{rows,columns,matrix}
}
/// Adds a new row to the matrix `self`, filled with default values.
pub fn new_row(&mut self) {
for _ in 0..self.columns {
self.matrix.push(default());
}
self.rows += 1;
}
}
// === Trait Impls ===
impl<T> Index<(usize,usize)> for Matrix<T> {
type Output = T;
fn index(&self, index:(usize,usize)) -> &T {
&self.matrix[index.0*self.columns+index.1]
}
}
impl<T> IndexMut<(usize,usize)> for Matrix<T> {
fn index_mut(&mut self, index:(usize,usize)) -> &mut T {
&mut self.matrix[index.0*self.columns+index.1]
}
}
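// A brief usage sketch, assuming only the API above: the matrix is indexed by `(row, column)`
// pairs and grows by whole rows.
#[cfg(test)]
#[allow(dead_code)]
fn example_matrix_usage() {
    let mut matrix = Matrix::<usize>::new(2,3); // 2 rows and 3 columns, default-filled
    matrix[(1,2)] = 7;                          // write the last cell of the second row
    matrix.new_row();                           // append a default-filled third row
    assert_eq!(matrix.rows(),3);
    assert_eq!(matrix[(1,2)],7);
}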

View File

@ -1,541 +0,0 @@
//! This file contains utilities for generating rust code from lexer definitions, allowing the
//! flexer to be specialised for a specific language.
use crate::prelude::*;
use quote::*;
use syn::*;
use crate::automata::dfa::DFA;
use crate::automata::dfa::RuleExecutable;
use crate::automata::state::Identifier;
use crate::automata::state::State;
use crate::group::Group;
use crate::group;
use enso_macro_utils::repr;
use proc_macro2::Literal;
use std::hash::BuildHasher;
use std::result::Result;
use std::fmt;
use crate as flexer;
// =======================
// === Code Generation ===
// =======================
/// Generate specialized code for the provided lexer `definition`.
///
/// This specialized code is a highly-optimised and tailored lexer that dispatches based on simple
/// code-point switches, with no dynamic lookup. This means that it is very fast and has very low
/// overhead.
pub fn specialize
( definition : &impl flexer::State
, state_type_name : impl Str
, output_type_name : impl Str
) -> Result<String,GenError> {
let group_registry = definition.groups();
let mut body_items = Vec::new();
body_items.push(run_function(output_type_name)?);
body_items.push(run_current_state_function());
body_items.push(step(group_registry));
for group in group_registry.all().iter() {
body_items.extend(automaton_for_group(group,group_registry)?)
}
let result = wrap_in_impl_for(state_type_name,body_items)?;
let code = show_code(&result);
Ok(code)
}
// === Whole-Lexer Codegen Utilities ===
/// Wrap the provided implementation items into an `impl` block for the provided `state_name` type.
pub fn wrap_in_impl_for
( state_name : impl Into<String>
, body : Vec<ImplItem>
) -> Result<ItemImpl,GenError> {
let state_name:Ident = str_to_ident(state_name.into().as_str())?;
let mut tree:ItemImpl = parse_quote! {
#[allow(missing_docs,dead_code,clippy::all)]
impl #state_name {}
};
tree.items.extend(body);
Ok(tree)
}
/// Generate the `run` function for the specialized lexer.
///
/// This function is what the user of the lexer will call to begin execution.
pub fn run_function(output_type_name:impl Str) -> Result<ImplItem,GenError> {
let output_type_name = str_to_path(output_type_name)?;
let tree:ImplItem = parse_quote! {
pub fn run<R:LazyReader>(&mut self, mut reader:R) -> LexingResult<#output_type_name> {
self.set_up();
reader.advance_char(&mut self.bookmarks);
while self.run_current_state(&mut reader) == StageStatus::ExitSuccess {}
let result = match self.status {
StageStatus::ExitFinished => LexingResult::success(
mem::take(&mut self.output)
),
StageStatus::ExitFail => LexingResult::failure(
mem::take(&mut self.output)
),
_ => LexingResult::partial(mem::take(&mut self.output))
};
self.tear_down();
result
}
};
Ok(tree)
}
/// Generate the function responsible for executing the lexer in its current state.
pub fn run_current_state_function() -> ImplItem {
let tree:ImplItem = parse_quote! {
fn run_current_state<R:LazyReader>(&mut self, reader:&mut R) -> StageStatus {
self.status = StageStatus::Initial;
let mut finished = false;
// Runs until reaching a state that no longer says to continue.
while let Some(next_state) = self.status.continue_as() {
self.logger.debug(||format!("Current character is {:?}.",reader.character().char));
self.logger.debug(||format!("Continuing in {:?}.",next_state));
self.status = self.step(next_state,reader);
if finished && reader.finished(self.bookmarks()) {
self.logger.info("Input finished.");
self.status = StageStatus::ExitFinished
}
finished = reader.character().is_eof();
if self.status.should_continue() {
match reader.character().char {
Ok(char) => {
reader.append_result(char);
self.logger.info(||format!("Result is {:?}.",reader.result()));
},
Err(flexer::prelude::reader::Error::EOF) => {
self.logger.info("Reached EOF.");
},
Err(flexer::prelude::reader::Error::EndOfGroup) => {
let current_state = self.current_state();
let group_name = self.groups().group(current_state).name.as_str();
let err = format!("Missing rules for state {}.", group_name);
self.logger.error(err.as_str());
panic!(err)
}
Err(_) => {
self.logger.error("Unexpected error!");
panic!("Unexpected error!")
}
}
reader.advance_char(&mut self.bookmarks);
}
}
self.status
}
};
tree
}
/// Generate the `step` function for the lexer.
///
/// This function is responsible for dispatching based on the current state, consuming a character,
/// and returning the state to transition to.
pub fn step(groups:&group::Registry) -> ImplItem {
let arms = groups.all().iter().map(|g| step_match_arm(g.id.into())).collect_vec();
parse_quote! {
fn step<R:LazyReader>(&mut self, next_state:SubStateId, reader:&mut R) -> StageStatus {
let current_state:usize = self.current_state().into();
match current_state {
#(#arms)*
_ => unreachable_panic!("Unreachable state reached in lexer."),
}
}
}
}
/// Generate a match arm for the step function.
///
/// There is one match arm per lexer state.
pub fn step_match_arm(number:usize) -> Arm {
let literal = Literal::usize_unsuffixed(number);
let function_name_str = format!("dispatch_in_state_{}",number);
let func_name:Ident = parse_str(function_name_str.as_str()).unwrap();
let arm:Arm = parse_quote! {
#literal => self.#func_name(next_state,reader),
};
arm
}
// === Generation for a Specific Lexer State ===
/// Generate the functions that implement the lexer automaton for a given lexer state.
pub fn automaton_for_group
( group : &Group
, registry : &group::Registry
) -> Result<Vec<ImplItem>,GenError> {
let nfa = registry.to_nfa_from(group.id);
let mut rules = Vec::with_capacity(nfa.states.len());
for state in nfa.states.iter() {
if state.name.is_some() {
rules.push(rule_for_state(state)?);
}
}
let mut dfa = DFA::from(&nfa);
let dispatch_for_dfa = dispatch_in_state(&dfa,group.id.into())?;
let mut dfa_transitions = transitions_for_dfa(&mut dfa,group.id.into())?;
dfa_transitions.push(dispatch_for_dfa);
dfa_transitions.extend(rules);
Ok(dfa_transitions)
}
/// Generate a set of transition functions for the provided `dfa`, with identifier `id`.
pub fn transitions_for_dfa(dfa:&mut DFA, id:usize) -> Result<Vec<ImplItem>,GenError> {
let mut state_has_overlapping_rules:HashMap<usize,bool> = HashMap::new();
state_has_overlapping_rules.insert(0,false);
let state_names:Vec<_> = dfa.links.row_indices().map(|ix| (ix, name_for_step(id, ix))).collect();
let mut transitions = Vec::with_capacity(state_names.len());
for (ix,name) in state_names.into_iter() {
transitions.push(transition_for_dfa(dfa,name,ix,&mut state_has_overlapping_rules)?)
}
Ok(transitions)
}
/// Generate the transition function for the state at index `state_ix` in the provided `dfa`.
#[allow(clippy::implicit_hasher)]
pub fn transition_for_dfa<S:BuildHasher>
( dfa : &mut DFA
, transition_name : Ident
, state_ix : usize
, has_overlaps : &mut HashMap<usize,bool,S>
) -> Result<ImplItem,GenError> {
let match_expr:Expr = match_for_transition(dfa,state_ix,has_overlaps)?;
let function:ImplItem = parse_quote! {
fn #transition_name<R:LazyReader>(&mut self, reader:&mut R) -> StageStatus {
#match_expr
}
};
Ok(function)
}
/// Generate the pattern match for a given transition function.
pub fn match_for_transition<S:BuildHasher>
( dfa : &mut DFA
, state_ix : usize
, has_overlaps : &mut HashMap<usize,bool,S>
) -> Result<Expr,GenError> {
let overlaps = *has_overlaps.get(&state_ix).unwrap_or(&false);
let state = dfa.callbacks.get(state_ix).expect("Internal error.").clone();
let mut trigger_state = dfa.links[(state_ix,0)];
let mut range_start = u32::min_value();
let divisions:Vec<_> = dfa.alphabet_segmentation.divisions_as_vec();
let mut branches = Vec::with_capacity(divisions.len());
for division in divisions.into_iter() {
let ix = division.position;
let sym = division.symbol;
let new_trigger_state = dfa.links[(state_ix,ix)];
if new_trigger_state != trigger_state {
let range_end = if sym.value != 0 { sym.value - 1 } else { sym.value };
let current_trigger_state = trigger_state;
let current_range_start = range_start;
trigger_state = new_trigger_state;
range_start = sym.value;
let body =
branch_body(dfa,current_trigger_state,&state,has_overlaps,overlaps)?;
branches.push(Branch::new(Some(current_range_start..=range_end),body))
} else {}
}
let catch_all_branch_body = branch_body(dfa,trigger_state,&state,has_overlaps,overlaps)?;
let catch_all_branch = Branch::new(None,catch_all_branch_body);
branches.push(catch_all_branch);
let arms:Vec<Arm> = branches.into_iter().map(Into::into).collect();
let mut match_expr:ExprMatch = parse_quote! {
match u32::from(reader.character()) {
#(#arms)*
}
};
match_expr.arms = arms;
Ok(Expr::Match(match_expr))
}
/// Generate the branch body for a transition in the DFA.
pub fn branch_body<S:BuildHasher>
( dfa : &mut DFA
, target_state : Identifier
, maybe_state : &Option<RuleExecutable>
, has_overlaps : &mut HashMap<usize,bool,S>
, rules_overlap : bool
) -> Result<Block,GenError> {
if target_state == Identifier::INVALID {
match maybe_state {
None => {
Ok(parse_quote! {{
StageStatus::ExitFail
}})
},
Some(rule_exec) => {
let rule:Expr = match parse_str(rule_exec.code.as_str()) {
Ok(rule) => rule,
Err(_) => return Err(GenError::BadExpression(rule_exec.code.clone()))
};
if rules_overlap {
Ok(parse_quote! {{
let rule_bookmark = self.bookmarks.rule_bookmark;
let matched_bookmark = self.bookmarks.matched_bookmark;
self.bookmarks.rewind(rule_bookmark,reader);
self.current_match = reader.pop_result();
self.#rule(reader);
self.bookmarks.bookmark(matched_bookmark,reader);
StageStatus::ExitSuccess
}})
} else {
Ok(parse_quote! {{
let matched_bookmark = self.bookmarks.matched_bookmark;
self.current_match = reader.pop_result();
self.#rule(reader);
self.bookmarks.bookmark(matched_bookmark,reader);
StageStatus::ExitSuccess
}})
}
}
}
} else {
let target_state_has_no_rule = match maybe_state {
Some(state) => if !dfa.has_rule_for(target_state) {
dfa.callbacks[target_state.id] = Some(state.clone());
has_overlaps.insert(target_state.id,true);
true
} else {
false
},
None => false
};
let state_id = Literal::usize_unsuffixed(target_state.id);
let ret:Expr = parse_quote! {
StageStatus::ContinueWith(#state_id.into())
};
if target_state_has_no_rule && !rules_overlap {
Ok(parse_quote! {{
let rule_bookmark = self.bookmarks.rule_bookmark;
self.bookmarks.bookmark(rule_bookmark,reader);
#ret
}})
} else {
Ok(parse_quote! {{
#ret
}})
}
}
}
/// Generate the dispatch function for a given lexer state.
///
/// This dispatch function is responsible for dispatching based on the sub-state of any given lexer
/// state, and is the main part of implementing the actual lexer transitions.
pub fn dispatch_in_state(dfa:&DFA, id:usize) -> Result<ImplItem,GenError> {
let dispatch_name:Ident = str_to_ident(format!("dispatch_in_state_{}",id))?;
let state_names = dfa.links.row_indices().map(|ix| (ix, name_for_step(id,ix))).collect_vec();
let mut branches = Vec::with_capacity(state_names.len());
for (ix,name) in state_names.into_iter() {
let literal = Literal::usize_unsuffixed(ix);
let arm:Arm = parse_quote! {
#literal => self.#name(reader),
};
branches.push(arm);
}
let pattern_match:ExprMatch = parse_quote! {
match new_state_index.into() {
#(#branches)*
_ => unreachable_panic!("Unreachable state reached in lexer.")
}
};
let func:ImplItem = parse_quote! {
fn #dispatch_name<R:LazyReader>
( &mut self
, new_state_index:SubStateId
, reader:&mut R
) -> StageStatus {
#pattern_match
}
};
Ok(func)
}
/// Generate a name for a given step function.
pub fn name_for_step(in_state:usize, to_state:usize) -> Ident {
let name_str = format!("state_{}_to_{}",in_state,to_state);
parse_str(name_str.as_str()).expect("Impossible to not be a valid identifier.")
}
/// Generate an executable rule function for a given lexer state.
pub fn rule_for_state(state:&State) -> Result<ImplItem,GenError> {
match &state.name {
None => unreachable_panic!("Rule for state requested, but state has none."),
Some(name) => {
let rule_name = str_to_ident(name)?;
let code:Expr = match parse_str(state.callback.as_str()) {
Ok(expr) => expr,
Err(_) => return Err(GenError::BadExpression(state.callback.clone()))
};
if !has_reader_arg(&code) {
return Err(GenError::BadCallbackArgument)
}
let tree:ImplItem = parse_quote! {
fn #rule_name<R:LazyReader>(&mut self, reader:&mut R) {
#code
}
};
Ok(tree)
}
}
}
/// Checks if the given `expr` is a call that passes `reader` as its single argument.
#[allow(clippy::cmp_owned)]
pub fn has_reader_arg(expr:&Expr) -> bool {
match expr {
Expr::MethodCall(expr) => match expr.args.first() {
Some(Expr::Path(path)) => {
match path.path.segments.first() {
Some(segment) => {
segment.ident.to_string() == "reader"
}
_ => false
}
}
_ => false
},
Expr::Call(expr) => match expr.args.first() {
Some(Expr::Path(path)) => {
match path.path.segments.first() {
Some(segment) => {
segment.ident.to_string() == "reader"
}
_ => false
}
}
_ => false
}
_ => false
}
}
// ================
// === GenError ===
// ================
/// Errors that arise during code generation.
#[derive(Clone,Debug,PartialEq)]
pub enum GenError {
/// The callback function does not take a single argument `reader`.
BadCallbackArgument,
/// The provided string is not a valid rust identifier.
BadIdentifier(String),
/// The provided expression isn't a valid rust expression.
BadExpression(String),
/// The provided string is not a valid rust literal.
BadLiteral(String),
/// The provided string is not a valid rust path.
BadPath(String),
}
// === Trait Impls ===
impl Display for GenError {
fn fmt(&self, f:&mut fmt::Formatter<'_>) -> fmt::Result {
match self {
GenError::BadCallbackArgument => write!(f,
"Bad argument to a callback function. It must take a single argument `reader`."
),
GenError::BadIdentifier(str) => write!(f,"`{}` is not a valid rust identifier.",str),
GenError::BadExpression(str) => write!(f,"`{}` is not a valid rust expression.",str),
GenError::BadLiteral(str) => write!(f,"`{}` is not a valid rust literal.",str),
GenError::BadPath(str) => write!(f,"`{}` is not a valid rust path.",str),
}
}
}
// ==============
// === Branch ===
// ==============
/// A representation of a dispatch branch for helping to generate pattern arms.
#[allow(missing_docs)]
#[derive(Clone,Debug,PartialEq)]
struct Branch {
pub range:Option<RangeInclusive<u32>>,
pub body:Block
}
impl Branch {
/// Create a new branch, from the provided `range` and with `body` as the code it executes.
pub fn new(range:Option<RangeInclusive<u32>>, body:Block) -> Branch {
Branch {range,body}
}
}
// === Trait Impls ===
impl Into<Arm> for Branch {
fn into(self) -> Arm {
let body = self.body;
match self.range {
Some(range) => {
let range_start = Literal::u32_unsuffixed(*range.start());
let range_end = Literal::u32_unsuffixed(*range.end());
if range.start() == range.end() {
parse_quote! {
#range_start => #body,
}
} else {
parse_quote! {
#range_start..=#range_end => #body,
}
}
}
None => parse_quote! {
_ => #body,
}
}
}
}
// =================
// === Utilities ===
// =================
/// Convert a string to an identifier.
pub fn str_to_ident(str:impl Str) -> Result<Ident,GenError> {
parse_str(str.as_ref()).map_err(|_| GenError::BadIdentifier(str.into()))
}
/// Convert a string to a path.
pub fn str_to_path(str:impl Str) -> Result<Path,GenError> {
parse_str(str.as_ref()).map_err(|_| GenError::BadPath(str.into()))
}
/// Convert the syntax tree into a string.
pub fn show_code(tokens:&impl ToTokens) -> String {
repr(tokens)
}

View File

@ -1,366 +0,0 @@
//! This module provides an API for grouping multiple flexer rules.
use crate::automata::nfa::NFA;
use crate::automata::pattern::Pattern;
use crate::group::rule::Rule;
use itertools::Itertools;
use std::fmt::Display;
use wasm_bindgen::__rt::core::fmt::Formatter;
pub mod rule;
// ================
// === Registry ===
// ================
/// The group Registry is a container for [`Group`]s in the flexer implementation.
///
/// It allows groups to be associated with one another, and also implements useful conversions
/// for groups.
#[derive(Clone,Debug,Default)]
pub struct Registry {
/// The groups defined for the lexer.
groups:Vec<Group>
}
impl Registry {
/// Defines a new group of rules for the lexer with the specified `name` and `parent`.
///
/// It returns the identifier of the newly-created group.
pub fn define_group
( &mut self
, name : impl Into<String>
, parent_index : Option<Identifier>
) -> Identifier {
let id = self.next_id();
let group = Group::new(id,name.into(),parent_index);
self.groups.push(group);
id
}
/// Adds an existing `group` to the registry, updating and returning its identifier.
pub fn add_group(&mut self, mut group:Group) -> Identifier {
let new_id = self.next_id();
group.id = new_id;
self.groups.push(group);
new_id
}
/// Creates a rule that matches `pattern` for the group identified by `group_id`.
///
/// Panics if `group_id` refers to a nonexistent group.
pub fn create_rule(&mut self, group:Identifier, pattern:&Pattern, callback:impl AsRef<str>) {
let group = self.group_mut(group);
group.create_rule(pattern,callback.as_ref());
}
/// Associates the provided `rule` with the group identified by `group_id`.
///
/// Panics if `group_id` refers to a nonexistent group.
pub fn add_rule(&mut self, group:Identifier, rule:Rule) {
let group = self.group_mut(group);
group.add_rule(rule);
}
/// Collates the entire set of rules that are matchable when the lexer has the group identified
/// by `group_id` as active.
///
/// This set of rules includes the rules inherited from any parent groups.
pub fn rules_for(&self, group:Identifier) -> Vec<&Rule> {
let group_handle = self.group(group);
let mut parent = group_handle.parent_index.map(|p| self.group(p));
let mut rules = (&group_handle.rules).iter().collect_vec();
while let Some(parent_group) = parent {
if parent_group.id == group_handle.id {
panic!("There should not be cycles in parent links for lexer groups.")
}
rules.extend((&parent_group.rules).iter());
parent = parent_group.parent_index.map(|p| self.group(p));
}
rules
}
/// Obtains a reference to the group for the given `group_id`.
///
/// As group identifiers can only be created by use of this `Registry`, this will always
/// succeed.
pub fn group(&self, group:Identifier) -> &Group {
self.groups.get(group.0).expect("The group must exist.")
}
/// Obtains a mutable reference to the group for the given `group_id`.
///
/// As group identifiers can only be created by use of this `Registry`, this will always
/// succeed.
pub fn group_mut(&mut self, group:Identifier) -> &mut Group {
self.groups.get_mut(group.0).expect("The group should exist.")
}
/// Converts the group identified by `group_id` into an NFA.
///
/// Panics if the group identified by `group_id` does not exist.
pub fn to_nfa_from(&self, group:Identifier) -> NFA {
let group = self.group(group);
let mut nfa = NFA::default();
let start = nfa.new_state();
let build = |rule:&Rule| nfa.new_pattern(start,&rule.pattern);
let rules = self.rules_for(group.id);
let callbacks = rules.iter().map(|r| r.callback.clone()).collect_vec();
let states = rules.into_iter().map(build).collect_vec();
let end = nfa.new_state();
for (ix,state) in states.into_iter().enumerate() {
nfa.states[state.id].name = Some(group.callback_name(ix));
nfa.states[state.id].callback = callbacks.get(ix).unwrap().clone();
nfa.connect(state,end);
}
nfa
}
/// Generates the next group identifier for this registry.
fn next_id(&self) -> Identifier {
let val = self.groups.len();
Identifier(val)
}
/// Get an immutable reference to the groups contained within the registry.
pub fn all(&self) -> &Vec<Group> {
&self.groups
}
}
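// An illustrative sketch of rule inheritance, assuming only the registry API above. The group
// names and callback strings are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
fn example_group_inheritance() {
    let mut registry = Registry::default();
    let root  = registry.define_group("ROOT",None);
    let child = registry.define_group("IN_TEXT",Some(root));
    registry.create_rule(root,&Pattern::char('x'),"self.on_x(reader)");
    registry.create_rule(child,&Pattern::char('"'),"self.on_quote(reader)");
    // The child group sees its own rule plus the one inherited from its parent.
    assert_eq!(registry.rules_for(child).len(),2);
}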
// ==================
// === Identifier ===
// ==================
/// An identifier for a group.
#[allow(missing_docs)]
#[derive(Copy,Clone,Debug,Default,Eq,PartialEq)]
pub struct Identifier(usize);
// === Trait Impls ===
impl From<usize> for Identifier {
fn from(id:usize) -> Self {
Identifier(id)
}
}
impl From<&usize> for Identifier {
fn from(id:&usize) -> Self {
Identifier(*id)
}
}
impl Into<usize> for Identifier {
fn into(self) -> usize {
self.0
}
}
// ===========
// == Group ==
// ===========
/// A group is a structure for associating multiple rules with each other, and is the basic building
/// block of the flexer.
///
/// A group consists of the following:
///
/// - A set of [`Rule`s](Rule), each containing a regex pattern and associated callback.
/// - Inherited rules from a parent group, if such a group exists.
///
/// Internally, the flexer maintains a stack of groups, where only one group can be active at any
/// given time. Rules are matched _in order_, and hence overlaps are handled by the order in which
/// the rules are matched, with the first callback being triggered.
///
/// Whenever a [`rule.pattern`](Rule::pattern) from the active group is matched against part of the
/// input, the associated [`rule.callback`](Rule::callback) is executed. This callback may exit the
/// current group or even enter a new one. As a result, groups allow us to elegantly model a
/// situation where certain parts of a program (e.g. within a string literal) have very different
/// lexing rules than other portions of a program (e.g. the body of a function).
#[derive(Clone,Debug,Default)]
pub struct Group {
/// A unique identifier for the group.
pub id:Identifier,
/// A name for the group (useful in debugging).
pub name:String,
/// The parent group from which rules are inherited.
///
/// It is ensured that the group is held mutably.
pub parent_index:Option<Identifier>,
/// A set of flexer rules.
pub rules:Vec<Rule>,
}
impl Group {
/// Creates a new group.
pub fn new(id:Identifier, name:impl Into<String>, parent_index:Option<Identifier>) -> Self {
let rules = Vec::new();
Group{id,name:name.into(),parent_index,rules}
}
/// Adds a new rule to the current group.
pub fn add_rule(&mut self, rule:Rule) {
self.rules.push(rule)
}
/// Creates a new rule.
pub fn create_rule(&mut self, pattern:&Pattern, code:&str) {
let pattern_clone = pattern.clone();
let rule = Rule::new(pattern_clone,code);
self.rules.push(rule)
}
/// The canonical name for a given rule.
pub fn callback_name(&self, rule_ix:usize) -> String {
format!("group_{}_rule_{}",self.id.0,rule_ix)
}
}
// === Trait Impls ===
impl Into<Registry> for Group {
fn into(self) -> Registry {
let mut registry = Registry::default();
registry.add_group(self);
registry
}
}
impl Display for Group {
fn fmt(&self, f:&mut Formatter<'_>) -> std::fmt::Result {
write!(f,"Group {}",self.name)
}
}
// =============
// === Tests ===
// =============
#[cfg(test)]
pub mod tests {
extern crate test;
use crate::automata::nfa;
use crate::automata::pattern::Pattern;
use crate::group::Group;
use crate::group::Registry;
use crate::group::rule::Rule;
use std::default::Default;
use test::Bencher;
use enso_prelude::default;
fn newline() -> Registry {
let pattern = Pattern::char('\n');
let mut group = Group::default();
group.add_rule(Rule::new(pattern,""));
let mut registry = Registry::default();
registry.add_group(group);
registry
}
fn letter() -> Registry {
let pattern = Pattern::range('a'..='z');
let mut group = Group::default();
group.add_rule(Rule::new(pattern,""));
group.into()
}
fn spaces() -> Registry {
let pattern = Pattern::char(' ').many1();
let mut group = Group::default();
group.add_rule(Rule::new(pattern,""));
group.into()
}
fn letter_and_spaces() -> Registry {
let letter = Pattern::range('a'..='z');
let spaces = Pattern::char(' ').many1();
let mut group = Group::default();
group.add_rule(Rule::new(letter,""));
group.add_rule(Rule::new(spaces,""));
group.into()
}
fn complex_rules(count:usize) -> Registry {
let mut group = Group::default();
for ix in 0..count {
let string = ix.to_string();
let all = Pattern::all_of(&string);
let any = Pattern::any_of(&string);
let none = Pattern::none_of(&string);
let all_any_none = all >> any >> none;
let pattern = Pattern::many(&all_any_none);
group.add_rule(Rule::new(pattern.clone(),""));
}
group.into()
}
#[test]
fn test_to_nfa_newline() {
assert_eq!(newline().to_nfa_from(default()),nfa::tests::newline());
}
#[test]
fn test_to_nfa_letter() {
assert_eq!(letter().to_nfa_from(default()),nfa::tests::letter());
}
#[test]
fn test_to_nfa_spaces() {
assert_eq!(spaces().to_nfa_from(default()),nfa::tests::spaces());
}
#[test]
fn test_to_nfa_letter_and_spaces() {
let expected = nfa::tests::letter_and_spaces();
assert_eq!(letter_and_spaces().to_nfa_from(default()),expected);
}
#[bench]
fn bench_to_nfa_newline(bencher:&mut Bencher) {
bencher.iter(|| newline().to_nfa_from(default()))
}
#[bench]
fn bench_to_nfa_letter(bencher:&mut Bencher) {
bencher.iter(|| letter().to_nfa_from(default()))
}
#[bench]
fn bench_to_nfa_spaces(bencher:&mut Bencher) {
bencher.iter(|| spaces().to_nfa_from(default()))
}
#[bench]
fn bench_to_nfa_letter_and_spaces(bencher:&mut Bencher) {
bencher.iter(|| letter_and_spaces().to_nfa_from(default()))
}
#[bench]
fn bench_ten_rules(bencher:&mut Bencher) {
bencher.iter(|| complex_rules(10).to_nfa_from(default()))
}
#[bench]
fn bench_hundred_rules(bencher:&mut Bencher) {
bencher.iter(|| complex_rules(100).to_nfa_from(default()))
}
#[bench]
fn bench_thousand_rules(bencher:&mut Bencher) {
bencher.iter(|| complex_rules(1000).to_nfa_from(default()))
}
}

View File

@ -1,34 +0,0 @@
//! An API for declaring rust-code callbacks to be executed when a given pattern is matched.
//!
//! A flexer rule is a [`crate::automata::pattern`] associated with rust code to be executed as a
//! callback.
use crate::automata::pattern::Pattern;
// ==========
// == Rule ==
// ==========
/// A flexer rule.
#[derive(Clone,Debug)]
pub struct Rule {
/// The pattern that triggers the callback.
pub pattern:Pattern,
/// The code to execute when [`Rule::pattern`] matches, containing rust code as a
/// [`std::string::String`].
///
/// This code will be called directly from a method defined on your Lexer (the one that contains
/// a [`crate::Flexer`] instance). To this end, the code you provide as a string must be valid in
/// that context.
pub callback:String,
}
impl Rule {
/// Creates a new rule.
pub fn new(pattern:Pattern, callback:impl Into<String>) -> Self {
Rule{pattern,callback:callback.into()}
}
}

File diff suppressed because it is too large

View File

@ -1,446 +0,0 @@
//! This file contains tests for the user-facing error-handling logic in the flexer code generator.
//!
//! This file includes quite a bit of duplicated code, but this is known and intentional as it
//! allows for increased clarity in the testing.
#![allow(missing_docs)]
use crate::prelude::LazyReader;
use crate::prelude::logger::AnyLogger;
use crate::prelude::logger::Disabled;
use crate::prelude::reader::BookmarkManager;
use flexer::*;
use flexer::automata::pattern::Pattern;
use flexer::Flexer;
use flexer::generate;
use flexer::group::{Registry, Identifier};
use flexer::group;
use flexer::prelude::*;
use flexer::State;
use flexer;
// ====================
// === Type Aliases ===
// ====================
type Logger = Disabled;
// ====================
// === Shared Setup ===
// ====================
/// A token type for these lexers.
#[derive(Copy,Clone,Debug,PartialEq)]
pub enum Token {
Foo,
Bar
}
/// An output type for these lexers.
#[allow(missing_docs)]
#[derive(Clone,Debug,Default,PartialEq)]
pub struct Output {
tokens:Vec<Token>
}
/// A testing lexer state.
pub struct LexerState {
lexer_states:group::Registry,
initial_state:group::Identifier,
}
impl flexer::State for LexerState {
fn new(_logger:&impl AnyLogger) -> Self {
let mut lexer_states = group::Registry::default();
let initial_state = lexer_states.define_group("ROOT",None);
LexerState{lexer_states,initial_state}
}
fn initial_state(&self) -> Identifier {
self.initial_state
}
fn groups(&self) -> &Registry {
&self.lexer_states
}
fn groups_mut(&mut self) -> &mut Registry {
&mut self.lexer_states
}
fn bookmarks(&self) -> &BookmarkManager {
unimplemented!()
}
fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
unimplemented!()
}
fn specialize(&self) -> Result<String,GenError> {
// Note [Naming "Lexer"]
generate::specialize(self,"Lexer","Output")
}
}
/* Note [Naming "Lexer"]
* ~~~~~~~~~~~~~~~~~~~~~
* In general, the name passed to `specialize` should match that of your lexer definition. However
* here, as we never compile the code, we set it to a generic constant that is a valid rust
* identifier so as to reduce testing boilerplate.
*/
// ====================
// === Definition 1 ===
// ====================
pub struct Lexer1 {
lexer:Flexer<LexerState,Output,Logger>
}
impl Deref for Lexer1 {
type Target = Flexer<LexerState,Output,Logger>;
fn deref(&self) -> &Self::Target {
&self.lexer
}
}
impl DerefMut for Lexer1 {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.lexer
}
}
impl Lexer1 {
pub fn new() -> Lexer1 {
let logger = Logger::new("Lexer1");
let lexer = Flexer::new(logger);
Lexer1 {lexer}
}
pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
unimplemented!()
}
}
impl flexer::Definition for Lexer1 {
fn define() -> Self {
let mut lexer = Self::new();
let foo = Pattern::all_of("foo");
let root_group_id = lexer.initial_state();
let root_group = lexer.groups_mut().group_mut(root_group_id);
root_group.create_rule(&foo, "ETERNAL SCREAMING");
lexer
}
fn groups(&self) -> &Registry {
self.lexer.groups()
}
fn set_up(&mut self) {
unimplemented!()
}
fn tear_down(&mut self) {
unimplemented!()
}
}
#[test]
fn test_bad_rule_expression() {
let lexer = Lexer1::define();
let result = lexer.specialize();
assert!(result.is_err());
let message = result.unwrap_err().to_string();
assert_eq!(message,"`ETERNAL SCREAMING` is not a valid rust expression.");
}
// ====================
// === Definition 2 ===
// ====================
pub struct Lexer2 {
lexer:Flexer<LexerState,Output,Logger>
}
impl Deref for Lexer2 {
type Target = Flexer<LexerState,Output,Logger>;
fn deref(&self) -> &Self::Target {
&self.lexer
}
}
impl DerefMut for Lexer2 {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.lexer
}
}
impl Lexer2 {
pub fn new() -> Lexer2 {
let logger = Logger::new("Lexer2");
let lexer = Flexer::new(logger);
Lexer2{lexer}
}
pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
unimplemented!()
}
}
impl flexer::Definition for Lexer2 {
fn define() -> Self {
let mut lexer = Self::new();
let foo = Pattern::all_of("foo");
let root_group_id = lexer.initial_state();
let root_group = lexer.groups_mut().group_mut(root_group_id);
root_group.create_rule(&foo, "self.test_function_no_reader()");
lexer
}
fn groups(&self) -> &Registry {
self.lexer.groups()
}
fn set_up(&mut self) {
unimplemented!()
}
fn tear_down(&mut self) {
unimplemented!()
}
}
#[test]
pub fn test_no_reader_arg() {
let lexer = Lexer2::define();
let result = lexer.specialize();
let expected_message =
"Bad argument to a callback function. It must take a single argument `reader`.";
assert!(result.is_err());
let message = result.unwrap_err().to_string();
assert_eq!(message,expected_message);
}
// ====================
// === Definition 3 ===
// ====================
pub struct Lexer3 {
lexer:Flexer<LexerState1,Output,Logger>
}
impl Deref for Lexer3 {
type Target = Flexer<LexerState1,Output,Logger>;
fn deref(&self) -> &Self::Target {
&self.lexer
}
}
impl DerefMut for Lexer3 {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.lexer
}
}
impl Lexer3 {
pub fn new() -> Lexer3 {
let logger = Logger::new("Lexer3");
let lexer = Flexer::new(logger);
Lexer3{lexer}
}
pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
unimplemented!()
}
}
impl flexer::Definition for Lexer3 {
fn define() -> Self {
let mut lexer = Self::new();
let foo = Pattern::all_of("foo");
let root_group_id = lexer.initial_state();
let root_group = lexer.groups_mut().group_mut(root_group_id);
root_group.create_rule(&foo, "self.test_function_reader(reader)");
lexer
}
fn groups(&self) -> &Registry {
self.lexer.groups()
}
fn set_up(&mut self) {
unimplemented!()
}
fn tear_down(&mut self) {
unimplemented!()
}
}
pub struct LexerState1 {
lexer_states:group::Registry,
initial_state:group::Identifier,
}
impl flexer::State for LexerState1 {
fn new(_logger:&impl AnyLogger) -> Self {
let mut lexer_states = group::Registry::default();
let initial_state = lexer_states.define_group("ROOT",None);
LexerState1 {lexer_states,initial_state}
}
fn initial_state(&self) -> Identifier {
self.initial_state
}
fn groups(&self) -> &Registry {
&self.lexer_states
}
fn groups_mut(&mut self) -> &mut Registry {
&mut self.lexer_states
}
fn bookmarks(&self) -> &BookmarkManager {
unimplemented!()
}
fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
unimplemented!()
}
fn specialize(&self) -> Result<String,GenError> {
generate::specialize(self,"Bad Lexer Name","Output")
}
}
#[test]
pub fn test_bad_state_name() {
let lexer = Lexer3::define();
let result = lexer.specialize();
assert!(result.is_err());
let message = result.unwrap_err().to_string();
assert_eq!(message,"`Bad Lexer Name` is not a valid rust identifier.");
}
// ====================
// === Definition 4 ===
// ====================
pub struct Lexer4 {
lexer:Flexer<LexerState2,Output,Logger>
}
impl Deref for Lexer4 {
type Target = Flexer<LexerState2,Output,Logger>;
fn deref(&self) -> &Self::Target {
&self.lexer
}
}
impl DerefMut for Lexer4 {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.lexer
}
}
impl Lexer4 {
pub fn new() -> Lexer4 {
let logger = Logger::new("Lexer4");
let lexer = Flexer::new(logger);
Lexer4{lexer}
}
pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
unimplemented!()
}
}
impl flexer::Definition for Lexer4 {
fn define() -> Self {
let mut lexer = Self::new();
let foo = Pattern::all_of("foo");
let root_group_id = lexer.initial_state();
let root_group = lexer.groups_mut().group_mut(root_group_id);
root_group.create_rule(&foo, "self.test_function_reader(reader)");
lexer
}
fn groups(&self) -> &Registry {
self.lexer.groups()
}
fn set_up(&mut self) {
unimplemented!()
}
fn tear_down(&mut self) {
unimplemented!()
}
}
pub struct LexerState2 {
lexer_states:group::Registry,
initial_state:group::Identifier,
}
impl flexer::State for LexerState2 {
fn new(_logger:&impl AnyLogger) -> Self {
let mut lexer_states = group::Registry::default();
let initial_state = lexer_states.define_group("ROOT",None);
LexerState2 {lexer_states,initial_state}
}
fn initial_state(&self) -> Identifier {
self.initial_state
}
fn groups(&self) -> &Registry {
&self.lexer_states
}
fn groups_mut(&mut self) -> &mut Registry {
&mut self.lexer_states
}
fn bookmarks(&self) -> &BookmarkManager {
unimplemented!()
}
fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
unimplemented!()
}
fn specialize(&self) -> Result<String,GenError> {
generate::specialize(self,"Lexer4","Bad output name")
}
}
#[test]
pub fn test_bad_output_name() {
let lexer = Lexer4::define();
let result = lexer.specialize();
assert!(result.is_err());
let message = result.unwrap_err().to_string();
assert_eq!(message,"`Bad output name` is not a valid rust path.");
}

View File

@ -12,7 +12,7 @@ test = true
bench = true
[dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }
enso-prelude = { version = "0.1.3" }
enso-flexer = { version = "0.1.3" }
enso-prelude = { version = "0.1.7" }
uuid = { version = "0.8.1" , features = ["serde","v4","wasm-bindgen"] }

View File

@ -0,0 +1,344 @@
//! This crate describes valid escape sequences inside Enso text literals.
use crate::prelude::*;
use crate::lexeme;
use crate::library::token;
use crate::token::Shape;
use crate::token::EscapeStyle;
// =======================
// === EscapeCharacter ===
// =======================
/// A representation of an escape character.
#[derive(Clone,Debug,Default,Eq,PartialEq)]
pub struct EscapeCharacter {
/// The lexing representation of the escape.
///
/// This is the literal string that must occur in the Enso source code to be interpreted as this
/// escape code.
pub pattern : String,
/// The literal representation of the escape.
///
/// This is the character-level encoding of this escape character in Rust, as the Rust escape
/// representation and the Enso escape representation may differ, or Rust may not support the
/// same literal escape code as Enso.
pub repr : String,
}
impl EscapeCharacter {
fn new(pattern:impl Str, repr:impl Str) -> EscapeCharacter {
let pattern = pattern.into();
let repr = repr.into();
Self{pattern,repr}
}
/// The set of character escape codes that Enso supports.
pub fn codes() -> Vec<EscapeCharacter> {
vec![
// === Null ===
Self::new(r"\0","\0"),
// === Newlines ===
Self::new(r"\n","\n"),
Self::new(r"\r","\r"),
Self::new(r"\f","\x0C"),
// === Tabs ===
Self::new(r"\t","\t"),
Self::new(r"\v","\x0B"),
// === Backspace ===
Self::new(r"\b","\x08"),
// === Misc ===
Self::new(r"\a","\x07"),
]
}
}
// =================
// === Utilities ===
// =================
/// Check if `c` is a hexadecimal digit.
fn is_hex_digit(c:char) -> bool {
let small_letters = 'a'..='f';
let large_letters = 'A'..='F';
let digits = '0'..='9';
small_letters.contains(&c) || large_letters.contains(&c) || digits.contains(&c)
}
// ======================
// === EscapeSequence ===
// ======================
/// A trait representing various kinds of escape sequence.
///
/// An escape sequence built using this trait will have its digits calculated by stripping the
/// [`Self::prefix_length()`] and [`Self::suffix_length()`] from the input string, and then
/// validated using [`Self::digits_min_length()`], [`Self::digits_max_length()`], and
/// [`Self::validator()`]. All digits must be valid hexadecimal digits as defined by
/// [`is_hex_digit`] above.
///
/// In addition, the implementation must define [`Self::style_on_success()`] and
/// [`Self::style_on_failure()`] to determine the type of escape output on success and failure.
pub trait EscapeSequence {
/// Create a token of the relevant escape sequence type.
///
/// This function should be passed the _full_ match for the escape sequence as `repr`, including
/// the delimiters. For example, if we have the escape sequence `\uAFAF`, we want to pass the
/// whole string `"\uAFAF"`, not just `"AFAF"`, to this function.
fn build(repr:impl Str) -> Shape {
if let Some(digits) = Self::validate(repr.as_ref()) {
Shape::text_segment_escape(Self::style_on_success(),digits)
} else {
Shape::text_segment_escape(Self::style_on_failure(),repr)
}
}
/// Obtain the digits portion of the escape sequence.
fn get_digits(repr:&str) -> &str {
let start = Self::prefix_length();
let end = repr.len().saturating_sub(Self::suffix_length());
&repr[start..end]
}
/// Validate the provided unicode string for this type of escape sequence.
fn validate(repr:&str) -> Option<String> {
let digits = Self::get_digits(repr);
let ge_min = digits.len() >= Self::digits_min_length();
let le_max = digits.len() <= Self::digits_max_length();
let valid_length = ge_min && le_max;
let valid_escape = Self::validator(digits);
let valid_digits = digits.chars().all(is_hex_digit);
let is_valid = valid_length && valid_escape && valid_digits;
is_valid.as_some(digits.into())
}
/// Return the length of the escape prefix.
///
/// The prefix is the characters that need to be stripped from the front of the escape sequence
/// to get, in conjunction with [`EscapeSequence::suffix_length()`], the escape value itself.
fn prefix_length() -> usize;
/// Return the length of the escape suffix.
///
/// The suffix is the characters that need to be stripped from the end of the escape sequence to
/// get, in conjunction with [`EscapeSequence::prefix_length()`], the escape value itself.
///
/// This defaults to `0`.
fn suffix_length() -> usize { 0 }
/// Return the minimum number of digits accepted by the escape sequence type.
fn digits_min_length() -> usize;
/// Return the maximum number of digits accepted by the escape sequence type.
///
/// This defaults to `digits_min_length()`.
fn digits_max_length() -> usize { Self::digits_min_length() }
/// A validator for any additional properties of the escape sequence.
///
/// It will be passed the _digits_ of the escape sequence, as defined by
/// [`EscapeSequence::get_digits()`], and has a default implementation that always succeeds.
/// Please implement this validator yourself if you would like to assert _additional_ properties
/// on your escape sequence.
fn validator(_digits:&str) -> bool { true }
/// The style of escape after successful validation.
fn style_on_success() -> token::EscapeStyle;
/// The style of escape after unsuccessful validation.
fn style_on_failure() -> token::EscapeStyle;
}
// ==================
// === ByteEscape ===
// ==================
/// A validator for ASCII escapes.
///
/// An ASCII escape begins with the sequence `\x` and is followed by two hexadecimal digits (e.g.
/// `\x0F`).
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct Byte;
impl EscapeSequence for Byte {
fn prefix_length() -> usize { lexeme::len(lexeme::literal::BYTE_ESCAPE_START) }
fn digits_min_length() -> usize { 2 }
fn style_on_success() -> EscapeStyle { token::EscapeStyle::Byte }
fn style_on_failure() -> EscapeStyle { token::EscapeStyle::Invalid }
}
// ===========
// === U16 ===
// ===========
/// A validator for U16 unicode escapes.
///
/// A U16 unicode escape begins with the sequence `\u` and is followed by four hexadecimal digits,
/// e.g. `\u0F0F`.
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct U16;
impl EscapeSequence for U16 {
fn prefix_length() -> usize { lexeme::len(lexeme::literal::U16_ESCAPE_START) }
fn digits_min_length() -> usize { 4 }
fn style_on_success() -> EscapeStyle { token::EscapeStyle::U16 }
fn style_on_failure() -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
}
// ===========
// === U21 ===
// ===========
/// A validator for U21 unicode escapes.
///
/// A U21 unicode escape begins with the sequence `\u`, followed by a sequence of 1-6 hexadecimal
/// digits enclosed in braces (`{}`). Both `\u{F}` and `\u{AFAFAF}` are valid U21 escapes.
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct U21;
impl EscapeSequence for U21 {
fn prefix_length() -> usize { lexeme::len(lexeme::literal::U21_ESCAPE_START) }
fn suffix_length() -> usize { lexeme::len(lexeme::literal::U21_ESCAPE_END) }
fn digits_min_length() -> usize { 1 }
fn digits_max_length() -> usize { 6 }
fn style_on_success() -> EscapeStyle { token::EscapeStyle::U21 }
fn style_on_failure() -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
}
// ===========
// === U32 ===
// ===========
/// A validator for U32 unicode escapes.
///
/// A U32 unicode escape begins with the sequence `\U`, followed by 8 hexadecimal digits. Due to the
/// restrictions of unicode, the first two digits _must_ be zero (e.g. `\U00AFAFAF`).
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct U32;
impl EscapeSequence for U32 {
fn prefix_length() -> usize { lexeme::len(lexeme::literal::U32_ESCAPE_START) }
fn digits_min_length() -> usize { 8 }
fn validator(digits: &str) -> bool { digits.starts_with("00") }
fn style_on_success() -> EscapeStyle { token::EscapeStyle::U32 }
fn style_on_failure() -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
}
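// Editorial sketch (not part of this commit): the hooks above compose into the trait's `build`
// roughly as follows. The digits are obtained by stripping the prefix and suffix, and the escape
// is accepted only if the digit count is in range, the digits are hexadecimal, and the
// type-specific validator passes; otherwise the failure style is reported over the full `repr`.
// The real `build` is defined earlier in this file; this version is inferred from the tests below.
fn build_sketch<Esc:EscapeSequence>(repr:&str) -> Shape {
let chars:Vec<char> = repr.chars().collect();
let digits_end       = chars.len().saturating_sub(Esc::suffix_length());
let digits_start     = Esc::prefix_length().min(digits_end);
let digits:String    = chars[digits_start..digits_end].iter().collect();
let count            = digits.chars().count();
let length_ok        = count >= Esc::digits_min_length() && count <= Esc::digits_max_length();
let digits_ok        = digits.chars().all(is_hex_digit);
if length_ok && digits_ok && Esc::validator(&digits) {
Shape::text_segment_escape(Esc::style_on_success(),digits.as_str())
} else {
Shape::text_segment_escape(Esc::style_on_failure(),repr)
}
}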
// =============
// === Tests ===
// =============
#[cfg(test)]
mod test {
use super::*;
// === Utilities ===
/// Tests a valid input to ensure that it succeeds.
fn test_valid<Esc:EscapeSequence>(escape:&str, out:&str, out_style:token::EscapeStyle) {
let shape = Shape::text_segment_escape(out_style,out);
assert_eq!(Esc::build(escape),shape);
}
/// Tests invalid inputs to ensure they fail for the provided escape type `Esc`.
fn test_invalid<Esc:EscapeSequence>(invalid_cases:Vec<&str>, fail_with:token::EscapeStyle) {
for escape in invalid_cases {
let shape = Shape::text_segment_escape(fail_with,escape);
assert_eq!(Esc::build(escape),shape)
}
}
// === Is Hex Digit ===
#[test]
fn test_is_hex_digit() {
for val in u8::min_value()..=u8::max_value() {
let char = char::from(val);
let is_in_small = ('a'..='f').contains(&char);
let is_in_large = ('A'..='F').contains(&char);
let is_in_dec_digit = ('0'..='9').contains(&char);
let expected_result = is_in_small || is_in_large || is_in_dec_digit;
assert_eq!(is_hex_digit(char),expected_result);
}
}
// === Build ===
#[test]
fn test_byte_build_valid() {
test_valid::<Byte>(r"\x05","05",token::EscapeStyle::Byte);
}
#[test]
fn test_byte_build_invalid() {
test_invalid::<Byte>(vec![
r"\x5",
r"\x",
r"\x033",
r"\xz2",
],token::EscapeStyle::Invalid);
}
#[test]
fn test_u16_build_valid() {
test_valid::<U16>(r"\u4fe3","4fe3",token::EscapeStyle::U16);
}
#[test]
fn test_u16_build_invalid() {
test_invalid::<U16>(vec![
r"\u123",
r"\u",
r"\u123aff",
r"\uazaz",
],token::EscapeStyle::InvalidUnicode);
}
#[test]
fn test_u21_build_valid() {
test_valid::<U21>(r"\u{fa4e}","fa4e",token::EscapeStyle::U21);
}
#[test]
fn test_u21_build_invalid() {
test_invalid::<U21>(vec![
r"\u{1234567}",
r"\u{}",
],token::EscapeStyle::InvalidUnicode);
}
#[test]
fn test_u32_build_valid() {
test_valid::<U32>(r"\U0014A890","0014A890",token::EscapeStyle::U32);
}
#[test]
fn test_u32_build_invalid() {
test_invalid::<U32>(vec![
r"\U12121212",
r"\U",
r"\U001234",
r"\U001234567"
],token::EscapeStyle::InvalidUnicode);
}
}

View File

@ -0,0 +1,301 @@
//! This module defines the base lexemes for the Enso language.
use crate::prelude::*;
use enso_flexer::automata::pattern::Pattern;
// =================================
// === Basic Pattern Definitions ===
// =================================
/// Basic lexemes as patterns.
///
/// These must _only_ be used as part of the lexer definition, not at runtime, as they are not at
/// all performant.
pub mod definition_pattern {
use super::*;
/// Match lower-case ASCII letters.
pub fn lower_ascii_letter() -> Pattern {
Pattern::range('a'..='z')
}
/// Match upper-case ASCII letters.
pub fn upper_ascii_letter() -> Pattern {
Pattern::range('A'..='Z')
}
/// Match ASCII digits.
pub fn ascii_digit() -> Pattern {
Pattern::range('0'..='9')
}
/// Match ASCII letters.
pub fn ascii_letter() -> Pattern {
lower_ascii_letter() | upper_ascii_letter()
}
/// Match ASCII alphanumeric characters.
pub fn ascii_alpha_num() -> Pattern {
ascii_digit() | ascii_letter()
}
/// Match at least one ASCII space character.
pub fn spaces() -> Pattern {
into_pattern(literal::SPACE).many1()
}
/// Match the end-of-file character.
pub fn eof() -> Pattern {
Pattern::eof()
}
/// Match a newline.
///
/// This matches both Unix (LF) and Windows (CRLF) styles of newlines. This is particularly
/// important so as not to result in incorrect spans on Windows clients.
pub fn newline() -> Pattern {
let lf = into_pattern(literal::LF);
let crlf = into_pattern(literal::CRLF);
lf | crlf
}
/// The characters that break tokens in Enso.
pub fn whitespace_break_chars() -> String {
[literal::TAB,literal::LF,literal::CR].concat()
}
/// The characters that break token lexing in Enso.
pub fn break_chars() -> String {
[
literal::INTERPOLATE_QUOTE,
literal::COMMENT,
literal::ANNOTATION_SYMBOL,
literal::SPACE,
literal::COMMA,
literal::DOT,
literal::OPERATOR_CHARS,
literal::GROUP_CHARS,
&whitespace_break_chars()
].concat()
}
/// Adds the basic characters not allowed in a raw segment in a format text literal.
fn add_base_format_disallows(chars:&mut String) {
chars.push_str(literal::INTERPOLATE_QUOTE);
chars.push_str(literal::SLASH);
chars.push_str(literal::LF);
chars.push_str(literal::CR);
}
/// Characters allowable inside a raw segment in a format line.
pub fn format_line_raw_char() -> Pattern {
let mut chars = String::new();
chars.push_str(literal::FORMAT_QUOTE);
add_base_format_disallows(&mut chars);
Pattern::none_of(&chars)
}
/// Characters allowable inside a raw segment in a format block.
pub fn format_block_raw_char() -> Pattern {
let mut chars = String::new();
add_base_format_disallows(&mut chars);
Pattern::none_of(&chars)
}
/// Adds the basic characters not allowed in a raw segment in a raw text literal.
fn add_base_raw_disallows(chars:&mut String) {
chars.push_str(literal::SLASH);
chars.push_str(literal::LF);
chars.push_str(literal::CR);
}
/// Characters allowable inside a raw segment in a raw line.
pub fn raw_line_raw_char() -> Pattern {
let mut chars = String::new();
chars.push_str(literal::RAW_QUOTE);
add_base_raw_disallows(&mut chars);
Pattern::none_of(&chars)
}
/// Characters allowable inside a raw segment in a raw block.
pub fn raw_block_raw_char() -> Pattern {
let mut chars = String::new();
add_base_raw_disallows(&mut chars);
Pattern::none_of(&chars)
}
/// The characters allowed as digits in a unicode escape.
pub fn unicode_escape_digit() -> Pattern {
let chars = &[
literal::FORMAT_QUOTE,
literal::RAW_QUOTE,
literal::INTERPOLATE_QUOTE,
literal::SLASH,
literal::LF,
literal::CR,
"{}"
].concat();
Pattern::none_of(chars)
}
}
// ===============================
// === Enso Lexeme Definitions ===
// ===============================
/// The literal lexemes that make up the Enso language.
pub mod literal {
/// The type of a literal lexeme.
pub type Literal = &'static str;
// === The Lexemes ===
/// The space character.
pub const SPACE:Literal = " ";
/// The line-feed character.
pub const LF:Literal = "\n";
/// The carriage-return character.
pub const CR:Literal = "\r";
/// The CRLF Windows-style line ending.
pub const CRLF:Literal = "\r\n";
/// The tab character.
pub const TAB:Literal = "\t";
/// The comment character.
pub const COMMENT:Literal = "#";
/// The doc comment character.
pub const DOC_COMMENT:Literal = "##";
/// The symbol for beginning an annotation.
pub const ANNOTATION_SYMBOL:Literal = "@";
/// The dot symbol.
pub const DOT:Literal = ".";
/// Two dots.
pub const TWO_DOTS:Literal = "..";
/// Three dots.
pub const THREE_DOTS:Literal = "...";
/// The comma.
pub const COMMA:Literal = ",";
/// The `in` operator.
pub const OPERATOR_IN:Literal = "in";
/// The tick allowable at the end of an identifier.
pub const IDENTIFIER_TICK:Literal = "'";
/// The quote used to delimit interpolations in format text literals.
pub const INTERPOLATE_QUOTE:Literal = "`";
/// The quote used to delimit format text literals.
pub const FORMAT_QUOTE:Literal = "'";
/// The quote used to delimit format block literals.
pub const FORMAT_BLOCK_QUOTE:Literal = "'''";
/// The quote used to delimit raw text literals.
pub const RAW_QUOTE:Literal = "\"";
/// The quote used to delimit raw block literals.
pub const RAW_BLOCK_QUOTE:Literal = "\"\"\"";
/// The equals operator.
pub const EQUALS:Literal = "=";
/// The equality comparison operator.
pub const EQUALS_COMP:Literal = "==";
/// Greater-than or equal.
pub const GE_OPERATOR:Literal = ">=";
/// Less-than or equal.
pub const LE_OPERATOR:Literal = "<=";
/// Inequality comparison operator.
pub const NOT_EQUAL:Literal = "!=";
/// The hash eq operator.
pub const HASH_EQ:Literal = "#=";
/// The wide arrow operator.
pub const WIDE_ARROW:Literal = "=>";
/// The blank identifier.
pub const BLANK_IDENT:Literal = "_";
/// The identifier segment separator.
pub const IDENT_SEGMENT_SEPARATOR:Literal = "_";
/// The separator between a number literal's explicit base and the number itself.
pub const NUMBER_BASE_SEPARATOR:Literal = "_";
/// The separator between the integer and fractional parts of the number literal.
pub const DECIMAL_SEPARATOR:Literal = ".";
/// The backslash character.
pub const SLASH:Literal = r"\";
/// An escaped [`SLASH`].
pub const ESCAPED_SLASH:Literal = r"\\";
/// The beginning of a byte escape.
pub const BYTE_ESCAPE_START:Literal = r"\x";
/// The beginning of a u16 escape.
pub const U16_ESCAPE_START:Literal = r"\u";
/// The beginning of a u21 escape.
pub const U21_ESCAPE_START:Literal = r"\u{";
/// The end of a u21 escape.
pub const U21_ESCAPE_END:Literal = "}";
/// The beginning of a u32 escape.
pub const U32_ESCAPE_START:Literal = r"\U";
/// The allowable group characters in Enso.
pub const GROUP_CHARS:Literal = "()[]{}";
/// The allowable operator characters in Enso.
pub const OPERATOR_CHARS:Literal = ";!$%&*+-/<>?^~|:\\";
}
// =========================
// === Utility Functions ===
// =========================
/// Get the first character of the lexeme, if it exists.
pub fn char(literal:&'static str) -> Option<char> {
literal.chars().nth(0)
}
/// Get the first character of the lexeme, assuming that it exists.
pub fn unsafe_char(literal:&'static str) -> char {
char(literal).expect("The first character of the literal exists.")
}
/// Convert the lexeme into a pattern.
pub fn into_pattern(literal:&'static str) -> Pattern {
literal.into()
}
/// The length of the `literal`, counted in `char`s.
pub fn len(literal:&'static str) -> usize {
literal.chars().count()
}
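A minimal usage sketch for these helpers (editorial, not part of this commit; the import path is assumed from the `library` re-exports in `lib.rs` below):

use lexer_definition::library::lexeme;

fn lexeme_demo() {
// Lengths are counted in `char`s, so CRLF is two characters and a lone quote is one.
assert_eq!(lexeme::len(lexeme::literal::CRLF), 2);
assert_eq!(lexeme::char(lexeme::literal::COMMENT), Some('#'));
// Lexemes convert into flexer patterns when defining lexer rules.
let _spaces = lexeme::into_pattern(lexeme::literal::SPACE).many1();
}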

File diff suppressed because it is too large

View File

@ -10,7 +10,10 @@
//! This library defines the lexer for the syntax of the Enso language.
pub mod escape;
pub mod lexeme;
pub mod lexer;
pub mod rule;
pub mod token;
/// A module that can be re-exported under the same name in the generation crate.
@ -19,11 +22,14 @@ pub mod token;
/// Enso lexer definition. In this project, imports should _not_ be made from the crate root
/// _except_ through use of this `library` module.
pub mod library {
pub use crate::escape;
pub use crate::lexeme;
pub use crate::token;
pub use crate::rules;
}
/// A collection of functionality for working with the lexer definition.
pub mod prelude {
pub use flexer::prelude::*;
pub use flexer::prelude::logger::*;
pub use enso_flexer::prelude::*;
pub use enso_flexer::prelude::logger::*;
}

View File

@ -0,0 +1,26 @@
//! This file contains a macro to simplify writing the lexer rules.
// ===================
// === Rules Macro ===
// ===================
/// Define a group of rules for the lexer.
///
/// All of the rules must be defined for the same `state_name`, which must be the in-scope name of
/// the state for which the rules are being defined. Each `pattern` is a non-reference pattern that
/// the rule is being defined to match, and `code` is the code that will be executed when the rule
/// matches, omitting the (first) `reader` argument.
///
/// Branches are matched _in order_, from top-to-bottom, much like a standard `match` statement.
///
/// Please see `lexer.rs` for myriad examples of this macro's use.
#[macro_export]
macro_rules! rules {
($state_name:ident with $($pattern:expr => $path_root:ident $(.$path:ident)* ($($arg:tt)*)),+ $(,)?) => {
$($state_name.create_rule(&$pattern,stringify!{
$path_root $(.$path)* (reader,$($arg)*)
});)*
};
}
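For illustration, an invocation might look like the following (a hypothetical sketch; the state, patterns, and callback names are placeholders rather than rules from this commit):

rules! { root_state with
definition_pattern::spaces()  => lexer.on_whitespace(),
definition_pattern::newline() => lexer.on_newline(true),
}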

View File

@ -6,6 +6,8 @@
use crate::prelude::*;
use crate::lexeme;
// =============
@ -24,6 +26,11 @@ pub struct Token {
}
impl Token {
/// Constructor.
pub fn new(shape:Shape, length:usize, offset:usize) -> Token {
Token{shape,length,offset}
}
/// Get the length that the token takes up in the program source.
pub fn source_length(&self) -> usize {
self.length + self.offset
@ -31,10 +38,9 @@ impl Token {
}
/// Constructors for the various forms of token.
#[allow(non_snake_case)]
impl Token {
/// Construct a token representing a referent identifier.
pub fn Referent(name:impl Str, offset:usize) -> Token {
pub fn referent(name:impl Str, offset:usize) -> Token {
let str = name.into();
let length = str.chars().count();
let shape = Shape::Referent(str);
@ -42,7 +48,7 @@ impl Token {
}
/// Construct a token representing a variable identifier.
pub fn Variable(name:impl Str, offset:usize) -> Token {
pub fn variable(name:impl Str, offset:usize) -> Token {
let str = name.into();
let length = str.chars().count();
let shape = Shape::Variable(str);
@ -50,7 +56,7 @@ impl Token {
}
/// Construct a token representing an external identifier.
pub fn External(name:impl Str, offset:usize) -> Token {
pub fn external(name:impl Str, offset:usize) -> Token {
let str = name.into();
let length = str.chars().count();
let shape = Shape::External(str);
@ -58,61 +64,157 @@ impl Token {
}
/// Construct a token representing a blank identifier.
pub fn Blank(offset:usize) -> Token {
pub fn blank(offset:usize) -> Token {
let shape = Shape::Blank;
let length = 1;
let length = lexeme::len(lexeme::literal::BLANK_IDENT);
Token{shape,length,offset}
}
/// Construct a token representing an operator.
pub fn Operator(name:impl Str, offset:usize) -> Token {
let str = name.into();
let length = str.chars().count();
let shape = Shape::Operator(str);
pub fn operator(name:impl Str, offset:usize) -> Token {
let name = name.into();
let length = name.chars().count();
let shape = Shape::Operator(name);
Token{shape,length,offset}
}
/// Construct a token representing a modifier operator.
pub fn Modifier(name:impl Str, offset:usize) -> Token {
let str = name.into();
let length = str.chars().count() + 1;
let shape = Shape::Modifier(str);
pub fn modifier(name:impl Str, offset:usize) -> Token {
let name = name.into();
let modifier_len = lexeme::len(lexeme::literal::EQUALS);
let length = name.chars().count() + modifier_len;
let shape = Shape::Modifier(name);
Token{shape,length,offset}
}
/// Construct a token representing an annotation.
pub fn annotation(name_str:impl Str, offset:usize) -> Token {
let name = name_str.into();
let annotation_len = lexeme::len(lexeme::literal::ANNOTATION_SYMBOL);
let length = name.chars().count() + annotation_len;
let shape = Shape::Annotation(name);
Token{shape,length,offset}
}
/// Construct a token representing a number literal.
pub fn Number(base:impl Str, num:impl Into<String>, offset:usize) -> Token {
let str = num.into();
let base_str = base.into();
let length = if base_str.is_empty() {
str.chars().count()
pub fn number(base:impl Str, num:impl Into<String>, offset:usize) -> Token {
let number = num.into();
let base = base.into();
let length = if base.is_empty() {
number.chars().count()
} else {
base_str.chars().count() + 1 + str.chars().count()
let base_sep_len = lexeme::len(lexeme::literal::NUMBER_BASE_SEPARATOR);
base.chars().count() + base_sep_len + number.chars().count()
};
let shape = Shape::Number{base:base_str,number:str};
let shape = Shape::Number{base,number};
Token{shape,length,offset}
}
/// Construct a token representing a dangling number base.
pub fn DanglingBase(base:impl Str, offset:usize) -> Token {
pub fn dangling_base(base:impl Str, offset:usize) -> Token {
let base_str = base.into();
let length = base_str.chars().count() + 1;
let base_sep_len = lexeme::len(lexeme::literal::NUMBER_BASE_SEPARATOR);
let length = base_str.chars().count() + base_sep_len;
let shape = Shape::DanglingBase(base_str);
Token{shape,length,offset}
}
/// Construct a token representing a text literal.
pub fn Text(text:impl Str, offset:usize) -> Token {
let str = text.into();
let length = str.chars().count();
let shape = Shape::Text(str);
/// Construct a token representing a line of text.
pub fn text_line(style:TextStyle, segments:Vec<Token>, offset:usize) -> Token {
let segments_len:usize = segments.iter().map(|s| s.source_length()).sum();
let length = style.length() + segments_len;
let shape = Shape::TextLine{style,segments};
Token{shape,length,offset}
}
/// Construct a token representing an inline block text literal.
pub fn text_inline_block
( style : TextStyle
, segments : Vec<Token>
, offset : usize
) -> Token {
let segments_length:usize = segments.iter().map(|s| s.source_length()).sum();
let length = style.length() + segments_length;
let shape = Shape::TextInlineBlock{style,segments};
Token{shape,length,offset}
}
/// Construct a token representing a block of text.
pub fn text_block
( start_line_ending : LineEnding
, style : TextStyle
, lines : Vec<Token>
, indent : usize
, offset : usize
) -> Token {
let length = style.length() + start_line_ending.size() + lines.iter().fold(0, |l,r|
l + match r.shape {
Shape::Line {..} => indent + r.source_length(),
Shape::BlankLine(_) => r.source_length(),
_ => unreachable_panic!("Text blocks should only contain lines."),
}
);
let shape = Shape::TextBlock{start_line_ending,style,lines};
Token{shape,length,offset}
}
/// Construct a token representing an invalid quote.
pub fn invalid_quote(bad_quotes:impl Str, offset:usize) -> Token {
let bad_string = bad_quotes.into();
let length = bad_string.chars().count();
let shape = Shape::InvalidQuote(bad_string);
Token{shape,length,offset}
}
/// Construct a token representing a raw text segment.
pub fn text_segment_raw(str:impl Str, offset:usize) -> Token {
let string = str.into();
let length = string.chars().count();
let shape = Shape::TextSegmentRaw(string);
Token{shape,length,offset}
}
/// Construct a token representing an escape sequence.
pub fn text_segment_escape(style:EscapeStyle, repr_str:impl Str, offset:usize) -> Token {
let repr = repr_str.into();
let length = style.size() + repr.chars().count();
let shape = Shape::TextSegmentEscape{style,repr};
Token{shape,length,offset}
}
/// Construct a token representing an escape sequence using a literal `shape`.
pub fn text_segment_escape_from_shape(shape:Shape, offset:usize) -> Token {
match &shape {
Shape::TextSegmentEscape{style,repr} => {
let length = style.size() + repr.chars().count();
Token{shape,length,offset}
},
_ => unreachable_panic!("Shape must be a TextSegmentEscape.")
}
}
/// Construct a token representing an interpolated text segment.
pub fn text_segment_interpolate(tokens:Vec<Token>, offset:usize) -> Token {
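// The two backtick delimiters around an interpolated segment contribute a fixed length of 2.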
let length_of_interpolation_ticks = 2;
let length =
length_of_interpolation_ticks + tokens.iter().fold(0,|l,r| l + r.source_length());
let shape = Shape::TextSegmentInterpolate{tokens};
Token{shape,length,offset}
}
/// Construct a token representing an unclosed interpolated text segment.
pub fn text_segment_unclosed_interpolate(tokens:Vec<Token>, offset:usize) -> Token {
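// An unclosed interpolation has only the single opening backtick, hence a fixed length of 1.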
let length_of_interpolation_tick = 1;
let length =
length_of_interpolation_tick + tokens.iter().fold(0,|l,r| l + r.source_length());
let shape = Shape::TextSegmentUnclosedInterpolate{tokens};
Token{shape,length,offset}
}
/// Construct a token representing a line of tokens.
pub fn Line(tokens:Vec<Token>, offset:usize, trailing_line_ending:LineEnding) -> Token {
pub fn line(tokens:Vec<Token>, offset:usize, trailing_line_ending:LineEnding) -> Token {
let line_ending_len = trailing_line_ending.size();
let length = tokens.iter().fold(line_ending_len,|l,r| l + r.offset + r.length);
let length = tokens.iter().fold(line_ending_len,|l,r| l + r.source_length());
let shape = Shape::Line{tokens,trailing_line_ending};
Token{shape,length,offset}
}
@ -121,26 +223,25 @@ impl Token {
///
/// The `offset` for blank lines is from the leftmost column, not from the parent block's
/// indentation.
pub fn BlankLine(offset:usize, trailing_line_ending:LineEnding) -> Token {
pub fn blank_line(offset:usize, trailing_line_ending:LineEnding) -> Token {
let length = trailing_line_ending.size();
let shape = Shape::BlankLine(trailing_line_ending);
Token{shape,length,offset}
}
/// Construct a token representing a block.
pub fn Block
pub fn block
( block_type : BlockType
, indent : usize
, lines : Vec<Token>
, offset : usize
) -> Token {
let length = lines.iter().map(|line| {
let line_length = line.length;
let line_offset = line.offset;
match line.shape {
Shape::Line{..} => indent + line_offset + line_length,
Shape::BlankLine(_) => line_offset + line_length,
_ => unreachable_panic!("Tokens in a blocks should always be lines."),
Shape::Line{..} => indent + line.source_length(),
Shape::BlankLine(_) => line.source_length(),
_ =>
unreachable_panic!("Tokens in a blocks should always be lines."),
}
}).sum();
let shape = Shape::Block{block_type,indent,lines};
@ -148,18 +249,40 @@ impl Token {
}
/// Construct a token representing an invalid suffix.
pub fn InvalidSuffix(text:impl Str, offset:usize) -> Token {
let str = text.into();
let length = str.chars().count();
let shape = Shape::InvalidSuffix(str);
pub fn invalid_suffix(text:impl Str, offset:usize) -> Token {
let text = text.into();
let length = text.chars().count();
let shape = Shape::InvalidSuffix(text);
Token{shape,length,offset}
}
/// Construct a token representing an unrecognised lexeme.
pub fn Unrecognized(text:impl Str, offset:usize) -> Token {
let str = text.into();
let length = str.chars().count();
let shape = Shape::Unrecognized(str);
pub fn unrecognized(text:impl Str, offset:usize) -> Token {
let text = text.into();
let length = text.chars().count();
let shape = Shape::Unrecognized(text);
Token{shape,length,offset}
}
/// Construct a token representing a disable comment.
pub fn disable_comment(text:impl Str, offset:usize) -> Token {
let text = text.into();
let comment_len = lexeme::len(lexeme::literal::COMMENT);
let length = text.chars().count() + comment_len;
let shape = Shape::DisableComment(text);
Token{shape,length,offset}
}
/// Construct a token representing a documentation comment.
pub fn doc_comment(lines:Vec<Token>, indent:usize, offset:usize) -> Token {
let length = lines.iter().map(|line| {
match line.shape {
Shape::Line{..} => indent + line.source_length(),
Shape::BlankLine(_) => line.source_length(),
_ => unreachable_panic!("Tokens in a doc comment should always be lines."),
}
}).sum();
let shape = Shape::DocComment{lines,indent};
Token{shape,length,offset}
}
}
@ -179,9 +302,11 @@ pub enum BlockType {
Discontinuous,
}
// ===================
// === NewlineType ===
// ===================
// ==================
// === LineEnding ===
// ==================
/// The type of newline associated with the line.
#[derive(Copy,Clone,Debug,Display,PartialEq,Eq)]
@ -195,12 +320,14 @@ pub enum LineEnding {
}
impl LineEnding {
const NO_LENGTH:usize = 0;
/// Get the number of Rust `char`s that the newline type takes up.
pub fn size(self) -> usize {
match self {
Self::None => 0,
Self::LF => 1,
Self::CRLF => 2,
Self::None => Self::NO_LENGTH,
Self::LF => lexeme::len(lexeme::literal::LF),
Self::CRLF => lexeme::len(lexeme::literal::CRLF),
}
}
}
@ -216,6 +343,128 @@ impl Default for LineEnding {
// =================
// === TextStyle ===
// =================
/// The style of the text literal.
#[derive(Copy,Clone,Debug,Eq,PartialEq)]
pub enum TextStyle {
// === Line ===
/// An interpolated text line literal.
FormatLine,
/// A raw text line literal.
RawLine,
/// An unclosed text line literal.
UnclosedLine,
// === Inline Block ===
/// A format inline block text literal.
FormatInlineBlock,
/// A raw inline block text literal.
RawInlineBlock,
// === Block ===
/// An interpolated text block literal.
FormatBlock,
/// A raw text block literal.
RawBlock,
}
impl TextStyle {
/// Calculate the length of the delimiters for a particular style of text literal.
pub fn length(self) -> usize {
match self {
TextStyle::FormatLine => lexeme::len(lexeme::literal::FORMAT_QUOTE) * 2,
TextStyle::RawLine => lexeme::len(lexeme::literal::RAW_QUOTE) * 2,
TextStyle::FormatInlineBlock => lexeme::len(lexeme::literal::FORMAT_BLOCK_QUOTE),
TextStyle::RawInlineBlock => lexeme::len(lexeme::literal::RAW_BLOCK_QUOTE),
TextStyle::UnclosedLine => lexeme::len(lexeme::literal::FORMAT_QUOTE),
TextStyle::FormatBlock => lexeme::len(lexeme::literal::FORMAT_BLOCK_QUOTE),
TextStyle::RawBlock => lexeme::len(lexeme::literal::RAW_BLOCK_QUOTE),
}
}
/// Check if the text literal is a line literal.
pub fn is_line_literal(self) -> bool {
match self {
TextStyle::RawLine => true,
TextStyle::FormatLine => true,
TextStyle::UnclosedLine => true,
_ => false,
}
}
/// Check if the text literal is an inline block literal.
pub fn is_inline_block_literal(self) -> bool {
match self {
TextStyle::FormatInlineBlock => true,
TextStyle::RawInlineBlock => true,
_ => false,
}
}
/// Check if the text literal is a block literal.
pub fn is_block_literal(self) -> bool {
match self {
TextStyle::FormatBlock => true,
TextStyle::RawBlock => true,
_ => false,
}
}
}
// ===================
// === EscapeStyle ===
// ===================
/// A description of the style of escape sequence seen.
#[derive(Clone,Copy,Debug,Eq,PartialEq)]
pub enum EscapeStyle {
/// A `\xNN`-style byte escape.
Byte,
/// Unicode 16-bit escape sequence.
U16,
/// Unicode 21-bit escape sequence.
U21,
/// Unicode 32-bit escape sequence.
U32,
/// A literal escape character.
Literal,
/// An invalid unicode escape.
InvalidUnicode,
/// An invalid escape.
Invalid,
/// An escape slash without any following escape.
Unfinished,
}
impl EscapeStyle {
const NO_ADDITIONAL_LENGTH:usize = 0;
/// Get the length taken up in the source by the delimiters of an escape type.
pub fn size(self) -> usize {
match self {
EscapeStyle::Byte => lexeme::len(lexeme::literal::BYTE_ESCAPE_START),
EscapeStyle::Literal => lexeme::len(lexeme::literal::SLASH),
EscapeStyle::U16 => lexeme::len(lexeme::literal::U16_ESCAPE_START),
EscapeStyle::U32 => lexeme::len(lexeme::literal::U32_ESCAPE_START),
EscapeStyle::U21 => {
let start_len = lexeme::len(lexeme::literal::U21_ESCAPE_START);
let end_len = lexeme::len(lexeme::literal::U21_ESCAPE_END);
start_len + end_len
}
_ => Self::NO_ADDITIONAL_LENGTH,
}
}
}
// =============
// === Shape ===
// =============
@ -241,25 +490,73 @@ pub enum Shape {
Operator(String),
/// A modifier identifier.
Modifier(String),
/// An annotation.
Annotation(String),
// === Literals ===
/// A literal number.
Number{base:String, number:String},
Number {
/// The (optional) base for the number to be interpreted in.
base:String,
/// The number itself, possibly with a decimal point.
number:String
},
/// A dangling base from a number literal.
DanglingBase(String),
/// A text literal.
///
/// This is currently way too simplistic to actually represent text, but it is a good
/// placeholder.
Text(String),
/// A text line literal.
TextLine {
/// The type of literal being encoded.
style : TextStyle,
/// The segments that make up the line of text.
segments : Vec<Token>,
},
/// An inline block text literal.
TextInlineBlock {
/// The type of literal being encoded.
style : TextStyle,
/// The segments that make up the line of text.
segments : Vec<Token>,
},
/// A text block literal.
TextBlock {
/// The line ending that occurs directly after the opening quote marks.
start_line_ending : LineEnding,
/// The type of literal being encoded.
style : TextStyle,
/// The lines in the text block literal.
lines : Vec<Token>
},
/// An invalid quote for a text literal.
InvalidQuote(String),
/// A segment of a line of text containing only literal text.
TextSegmentRaw(String),
/// A segment of a line of text that represents an escape sequence.
TextSegmentEscape {
/// The type of escape being represented.
style : EscapeStyle,
/// The literal escape sequence.
repr : String,
},
/// A segment of a line of text that contains an interpolated expression.
TextSegmentInterpolate {
/// The tokens making up the interpolated expression.
tokens : Vec<Token>,
},
/// An interpolated expression that hasn't been closed.
TextSegmentUnclosedInterpolate {
/// The tokens making up the interpolated expression.
tokens : Vec<Token>
},
/// An invalid text segment (e.g. unclosed interpolate segment).
TextSegmentInvalid(String),
// === Lines ===
/// A line containing tokens.
///
/// The offset for a line is always zero, as it is contained in a block with a defined
/// indentation.
Line{
Line {
/// The tokens on the line.
tokens : Vec<Token>,
/// The line ending that _ends_ the line.
@ -290,6 +587,17 @@ pub enum Shape {
InvalidSuffix(String),
/// An unrecognized token.
Unrecognized(String),
// === Comments ===
/// A disable comment (`# ...`).
DisableComment(String),
/// An Enso documentation comment (`## ...`).
DocComment {
/// The lines in the doc comment body. Each line must contain raw text segments only.
lines : Vec<Token>,
/// The indentation of the doc comment's body from the baseline.
indent : usize
}
}
impl Shape {
@ -326,9 +634,16 @@ impl Shape {
Shape::Modifier(opr.into())
}
/// Construct an annotation identifier.
pub fn annotation(name:impl Into<String>) -> Shape {
Shape::Annotation(name.into())
}
/// Construct a number literal.
pub fn number(base:impl Into<String>, num:impl Into<String>) -> Shape {
Shape::Number{base:base.into(),number:num.into()}
let base = base.into();
let number = num.into();
Shape::Number{base,number}
}
/// Construct a dangling base literal.
@ -336,9 +651,50 @@ impl Shape {
Shape::DanglingBase(base.into())
}
/// Construct a text literal.
pub fn text(text:impl Into<String>) -> Shape {
Shape::Text(text.into())
/// Construct a text line literal.
pub fn text_line(style:TextStyle, segments:Vec<Token>) -> Shape {
Shape::TextLine{style,segments}
}
/// Construct an inline block text literal.
pub fn text_inline_block(style:TextStyle, segments:Vec<Token>) -> Shape {
Shape::TextInlineBlock{style,segments}
}
/// Construct a text block literal.
pub fn text_block(start_line_ending: LineEnding, style:TextStyle, lines:Vec<Token>) -> Shape {
Shape::TextBlock{start_line_ending,style,lines}
}
/// Construct an invalid quote literal.
pub fn invalid_quote(bad_quotes:impl Str) -> Shape {
Shape::InvalidQuote(bad_quotes.into())
}
/// Construct a raw text segment.
pub fn text_segment_raw(text:impl Str) -> Shape {
Shape::TextSegmentRaw(text.into())
}
/// Construct a text segment containing an escape sequence.
pub fn text_segment_escape(style:EscapeStyle, repr_str:impl Str) -> Shape {
let repr = repr_str.into();
Shape::TextSegmentEscape{style,repr}
}
/// Construct a text segment containing an interpolated expression.
pub fn text_segment_interpolate(tokens:Vec<Token>) -> Shape {
Shape::TextSegmentInterpolate{tokens}
}
/// Construct a text segment containing an unclosed interpolated expression.
pub fn text_segment_unclosed_interpolate(tokens:Vec<Token>) -> Shape {
Shape::TextSegmentUnclosedInterpolate{tokens}
}
/// Construct an invalid text segment.
pub fn text_segment_invalid(str:impl Str) -> Shape {
Shape::TextSegmentInvalid(str.into())
}
/// Construct a line that contains tokens.
@ -365,6 +721,16 @@ impl Shape {
pub fn unrecognized(text:impl Into<String>) -> Shape {
Shape::Unrecognized(text.into())
}
/// Construct a disable comment shape.
pub fn disable_comment(text:impl Str) -> Shape {
Shape::DisableComment(text.into())
}
/// Construct a doc comment shape.
pub fn doc_comment(lines:Vec<Token>, indent:usize) -> Shape {
Shape::DocComment{lines,indent}
}
}
@ -425,146 +791,3 @@ impl Into<Vec<Token>> for Stream {
self.tokens
}
}
// =============
// === Tests ===
// =============
#[cfg(test)]
mod tests {
use super::*;
use crate::token::BlockType;
// === Testing Utilities ===
/// Asserts that the `token` has the provided `shape`.
pub fn assert_shape(token:&Token, shape:Shape) {
assert_eq!(token.shape,shape);
}
/// Asserts that the `token` has the provided `length`.
pub fn assert_length(token:&Token, length:usize) {
assert_eq!(token.length,length)
}
// === Tests for Token Construction ===
#[test]
fn construct_referent_token() {
let token = Token::Referent("Some_Ref_Name",0);
assert_shape(&token,Shape::referent("Some_Ref_Name"));
assert_length(&token,13);
}
#[test]
fn construct_variable_token() {
let token = Token::Variable("some_variable_name",0);
assert_shape(&token,Shape::variable("some_variable_name"));
assert_length(&token,18);
}
#[test]
fn construct_external_name_token() {
let token = Token::External("camelCase",0);
assert_shape(&token,Shape::external("camelCase"));
assert_length(&token,9);
}
#[test]
fn construct_blank_token() {
let token = Token::Blank(0);
assert_shape(&token,Shape::blank());
assert_length(&token,1);
}
#[test]
fn construct_operator_token() {
let token = Token::Operator("==>",0);
assert_shape(&token,Shape::operator("==>"));
assert_length(&token,3);
}
#[test]
fn construct_modifier_token() {
let token = Token::Modifier("+",0);
assert_shape(&token,Shape::modifier("+"));
assert_length(&token,2);
}
#[test]
fn construct_number_token() {
let token = Token::Number("","1231",0);
assert_shape(&token,Shape::number("","1231"));
assert_length(&token,4);
}
#[test]
fn construct_dangling_base_token() {
let token = Token::DanglingBase("15",0);
assert_shape(&token,Shape::dangling_base("15"));
assert_length(&token,3);
}
#[test]
fn construct_text_token() {
let token = Token::Text("some prose goes here",0);
assert_shape(&token,Shape::text("some prose goes here"));
assert_length(&token,20);
// TODO [AA] Make this internally account for length of quotes.
}
#[test]
fn construct_line_token() {
let tokens = vec![Token::Variable("aa",0),Token::Referent("Abc",1)];
let token = Token::Line(tokens.clone(), 4, LineEnding::LF);
assert_shape(&token,Shape::line(tokens.clone(), LineEnding::LF));
assert_length(&token,7);
}
#[test]
fn construct_blank_line_token() {
let token = Token::BlankLine(13,LineEnding::LF);
assert_shape(&token, Shape::blank_line(LineEnding::LF));
assert_length(&token,1);
}
#[test]
fn construct_block_token_lf() {
let lines = vec![
Token::Line(vec![],0,LineEnding::LF),
Token::Line(vec![],4,LineEnding::LF)
];
let token = Token::Block(BlockType::Continuous,4,lines.clone(),0);
assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone()));
assert_length(&token,14);
}
#[test]
fn construct_block_token_crlf() {
let lines = vec![
Token::Line(vec![],0,LineEnding::CRLF),
Token::Line(vec![],4,LineEnding::CRLF)
];
let token = Token::Block(BlockType::Continuous,4,lines.clone(),0);
assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone()));
assert_length(&token,16);
}
#[test]
fn construct_invalid_suffix_token() {
let token = Token::InvalidSuffix("aaa",0);
assert_shape(&token,Shape::invalid_suffix("aaa"));
assert_length(&token,3);
}
#[test]
fn construct_unrecognized_token() {
let token = Token::Unrecognized("a",0);
assert_shape(&token,Shape::unrecognized("a"));
assert_length(&token,1);
}
}

View File

@ -12,10 +12,17 @@ test = true
bench = true
[dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }
enso-prelude = { version = "0.1.3" }
enso-flexer = { version = "0.1.3" }
enso-prelude = { version = "0.1.7" }
lexer-definition = { path = "../definition", version = "0.1.0" }
[build-dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }
enso-flexer = { version = "0.1.3" }
lexer-definition = { path = "../definition", version = "0.1.0" }
[dev-dependencies]
criterion = "0.3"
[[bench]]
name = "lexer_time_bench"
harness = false

View File

@ -0,0 +1,337 @@
//! This file contains the sources that are replicated many times over for the purposes of
//! benchmarking the Enso lexer.
use criterion::{black_box, Criterion, Throughput};
use enso_flexer::prelude::Reader;
use enso_flexer::prelude::reader::decoder::DecoderUTF8;
use lexer::generated::engine::EnsoLexer;
use std::time::Duration;
// ===============================
// === Benchmark Configuration ===
// ===============================
/// Configures the benchmarking process.
pub fn bench_config() -> Criterion {
Criterion::default()
.measurement_time(Duration::from_secs(60))
.warm_up_time(Duration::from_secs(3))
.sample_size(25)
.retain_baseline("EnsoLexer".to_string())
}
// =======================
// === Benchmark Setup ===
// =======================
/// The sizes of text to run the benchmarks over.
pub const SIZES:[(usize,&str);4] = [
(1024 , "1KB" ),
(1024*100 , "100KB"),
(1024*1024 , "1MB" ),
(1024*1024*10 , "10MB" ),
];
// ==============================
// === Benchmarking Utilities ===
// ==============================
/// Execute the provided benchmark for each of the [`SIZES`] above.
pub fn run_bench_sizes(name:&str, input:&str, add_newline:bool, c:&mut Criterion) {
let mut group = c.benchmark_group(name);
SIZES.iter().for_each(|(size,size_name)| {
group.throughput(Throughput::Bytes(*size as u64));
let input = replicate_to_size(input,*size,add_newline);
group.bench_function(
*size_name,
|b| b.iter(|| {
let mut lexer = EnsoLexer::new();
let reader = Reader::new(input.as_str().as_bytes(),DecoderUTF8());
lexer.run(black_box(reader));
})
);
})
}
/// This function replicates `input` until it reaches `size` (in bytes).
///
/// If this cannot be done exactly, it errs on the side of over-replication, meaning that the
/// output will be _larger_ than `size` bytes. If the input is already larger than `size`, it is
/// replicated just once (still with the trailing separator appended).
pub fn replicate_to_size(input:&str, size:usize, add_newline:bool) -> String {
let input_size = input.len();
let times = 1 + (size / input_size);
let mut input_newline = input.to_string();
let to_add = if add_newline { '\n' } else { ' ' };
input_newline.push(to_add);
input_newline.repeat(times)
}
/// Replace any Windows-style line-endings in `input` with Unix-style line-endings.
fn preprocess(input:&str) -> String {
input.replace("\r\n","\n")
}
// ==============
// === Macros ===
// ==============
#[macro_export]
macro_rules! bench {
(bench_name = $bench_name:literal; fun_name = $fun_name:ident; bench_input = $bench_input:expr;) => {
pub fn $fun_name(c:&mut Criterion) {
src::run_bench_sizes(
$bench_name,
$bench_input.as_str(),
true,
c
)
}
}
}
// =================================
// === Literal Benchmark Sources ===
// =================================
#[allow(missing_docs)]
pub mod literal {
use super::*;
pub mod number {
use super::*;
pub fn integer() -> String {
preprocess("12345")
}
pub fn integer_explicit_base() -> String {
preprocess("16_a4fd31")
}
pub fn decimal() -> String {
preprocess("1.3141")
}
pub fn decimal_explicit_base() -> String {
preprocess("10_1.000999")
}
pub fn error_base() -> String {
preprocess("10.2_2")
}
}
pub mod text {
use super::*;
pub fn format_line() -> String {
preprocess(r"'dearest creature in \n creation studying english pronunciation'")
}
pub fn format_inline_block() -> String {
preprocess(r"''' An inline block. It's a very good inline block carl \u{AB}")
}
pub fn format_block() -> String {
preprocess(
r#"''' Here is my block of format text. I can `interpolate + things` like that.
It goes on and on and on for `times` times because I feel like it.
Complex interpolated expression `x -> y ~> x | y` woo!
"#)
}
pub fn raw_line() -> String {
preprocess(r#""dearest creature in '''' creation studying english pronunciation""#)
}
pub fn raw_inline_block() -> String {
preprocess(r#"""" An inline block. It's a very good inline block carl ""#)
}
pub fn raw_block() -> String {
preprocess(
r#"""" Here is my block of raw text. `Interpolations` are nothing special here.
It goes on and on and on for I can escape \" though.
It also supports blank lines!
"#)
}
}
}
// ==============================
// === Name Benchmark Sources ===
// ==============================
#[allow(missing_docs)]
pub mod name {
use super::*;
pub fn line_of() -> String {
preprocess(
"Referent_Ident var_ident JavaType _ @annotation ticked_ident' number_1"
)
}
pub fn invalid_suffix() -> String {
preprocess("some_var'iable some_varД")
}
}
// ===================================
// === Operator Benchmarks Sources ===
// ===================================
#[allow(missing_docs)]
pub mod operator {
use super::*;
pub fn line_of() -> String {
preprocess("+ - * -> ~> <~ <- ! & | /")
}
pub fn dot_call() -> String {
preprocess(".== . != .<*> .*> .|>")
}
pub fn invalid_suffix() -> String {
preprocess(".... +==")
}
}
// ================================
// === Block Benchmarks Sources ===
// ================================
#[allow(missing_docs)]
pub mod block {
use super::*;
pub fn top_level() -> String {
preprocess("foo\nbar\nbaz")
}
pub fn nested() -> String {
preprocess("foo\nbar\n baz\n quux")
}
pub fn deeply_nested() -> String {
preprocess(
r#"foo
bar
baz
quux
bim
bam
oh
no
"#)
}
}
// ===================================
// === Comments Benchmarks Sources ===
// ===================================
#[allow(missing_docs)]
pub mod comment {
use super::*;
pub fn line() -> String {
preprocess("# foo bar baz I have a really long line comment here that goes on and on")
}
pub fn in_line() -> String {
preprocess("a + b # A useless comment: add a to b")
}
pub fn doc() -> String {
preprocess(
r#"## I have a really big doc comment here
That just keeps prattling on and on and on.
With blank lines
Forever
and
ever
and
ever
documented
"#)
}
}
// ===========================
// === Combined Benchmarks ===
// ===========================
pub mod combined {
use super::*;
pub fn simple() -> String {
preprocess(
r#"
import Base.Meta
## Decompose the value using runtime reflection and print its decomposition.
Main.print_decomp a b =
y = a + b
decomp = Meta.decompose y
Io.println decomp
"#)
}
pub fn complex() -> String {
preprocess(
r#"
import Base.Meta
## Frobnicate the doodads by constructing a new type operator through runtime reflection such that
it can be passed to another language.
! WARNING
Type-checking code like this is virtually impossible, and it is treated as `Dynamic` inside
Enso code.
Main.foo a b =
y = x -> z ->
ty = a.gen_type (~>) (<-) b
ty (z x)
decomp = Meta.decompose (y a b)
Io.println decomp
## Execute the main function of this project.
main =
func = Meta.reify (here.foo "My_Name" "my_field")
Io.println(func)
"#)
}
}

View File

@ -0,0 +1,295 @@
//! This file contains the time-based benchmarks for the Enso lexer.
mod lexer_bench_sources;
use criterion::{criterion_group, criterion_main, Criterion, Throughput, black_box};
use lexer_bench_sources as src;
// ==========================
// === Literal Benchmarks ===
// ==========================
bench! {
bench_name = "Integer";
fun_name = bench_literal_number_integer;
bench_input = src::literal::number::integer();
}
bench! {
bench_name = "Integer Explicit Base";
fun_name = bench_literal_number_integer_explicit_base;
bench_input = src::literal::number::integer_explicit_base();
}
bench! {
bench_name = "Decimal";
fun_name = bench_literal_number_decimal;
bench_input = src::literal::number::decimal();
}
bench! {
bench_name = "Decimal Explicit Base";
fun_name = bench_literal_number_decimal_explicit_base;
bench_input = src::literal::number::decimal_explicit_base();
}
bench! {
bench_name = "Number Error Base";
fun_name = bench_literal_number_error_base;
bench_input = src::literal::number::error_base();
}
bench! {
bench_name = "Text Format Line";
fun_name = bench_literal_text_format_line;
bench_input = src::literal::text::format_line();
}
bench! {
bench_name = "Text Format Inline Block";
fun_name = bench_literal_text_format_inline_block;
bench_input = src::literal::text::format_inline_block();
}
bench! {
bench_name = "Text Format Block";
fun_name = bench_literal_text_format_block;
bench_input = src::literal::text::format_block();
}
bench! {
bench_name = "Text Raw Line";
fun_name = bench_literal_text_raw_line;
bench_input = src::literal::text::raw_line();
}
bench! {
bench_name = "Text Raw Inline Block";
fun_name = bench_literal_text_raw_inline_block;
bench_input = src::literal::text::raw_inline_block();
}
bench! {
bench_name = "Text Raw Block";
fun_name = bench_literal_text_raw_block;
bench_input = src::literal::text::raw_block();
}
criterion_group!{
name = literal_benchmarks;
config = src::bench_config();
targets =
bench_literal_number_integer,
bench_literal_number_integer_explicit_base,
bench_literal_number_decimal,
bench_literal_number_decimal_explicit_base,
bench_literal_number_error_base,
bench_literal_text_format_line,
bench_literal_text_format_inline_block,
bench_literal_text_format_block,
bench_literal_text_raw_line,
bench_literal_text_raw_inline_block,
bench_literal_text_raw_block,
}
// ========================
// === Names Benchmarks ===
// ========================
bench! {
bench_name = "Line of Names";
fun_name = bench_names_line_of;
bench_input = src::name::line_of();
}
bench! {
bench_name = "Names with invalid Suffixes";
fun_name = bench_names_invalid_suffix;
bench_input = src::name::invalid_suffix();
}
criterion_group! {
name = name_benchmarks;
config = src::bench_config();
targets =
bench_names_line_of,
bench_names_invalid_suffix,
}
// ===========================
// === Operator Benchmarks ===
// ===========================
bench! {
bench_name = "Line of Operators";
fun_name = bench_operator_line_of;
bench_input = src::operator::line_of();
}
bench! {
bench_name = "Dot Call Operators";
fun_name = bench_operator_dot_call;
bench_input = src::operator::dot_call();
}
bench! {
bench_name = "Operators with Invalid Suffixes";
fun_name = bench_operator_invalid_suffix;
bench_input = src::operator::invalid_suffix();
}
criterion_group! {
name = operator_benchmarks;
config = src::bench_config();
targets =
bench_operator_line_of,
bench_operator_dot_call,
bench_operator_invalid_suffix
}
// ========================
// === Block Benchmarks ===
// ========================
bench! {
bench_name = "Top Level Block";
fun_name = bench_block_top_level;
bench_input = src::block::top_level();
}
bench! {
bench_name = "Nested Block";
fun_name = bench_block_nested;
bench_input = src::block::nested();
}
bench! {
bench_name = "Deeply Nested Blocks";
fun_name = bench_block_deeply_nested;
bench_input = src::block::deeply_nested();
}
criterion_group! {
name = block_benchmarks;
config = src::bench_config();
targets =
bench_block_top_level,
bench_block_nested,
bench_block_deeply_nested,
}
// ==========================
// === Comment Benchmarks ===
// ==========================
bench! {
bench_name = "Line Comment";
fun_name = bench_comment_line;
bench_input = src::comment::line();
}
bench! {
bench_name = "Comment in Line";
fun_name = bench_comment_in_line;
bench_input = src::comment::in_line();
}
bench! {
bench_name = "Doc Comment";
fun_name = bench_comment_doc;
bench_input = src::comment::doc();
}
criterion_group! {
name = comment_benchmarks;
config = src::bench_config();
targets =
bench_comment_line,
bench_comment_in_line,
bench_comment_doc,
}
// ===========================
// === Combined Benchmarks ===
// ===========================
bench! {
bench_name = "Simple Combined Example";
fun_name = bench_combined_simple;
bench_input = src::combined::simple();
}
bench! {
bench_name = "Complex Combined Example";
fun_name = bench_combined_complex;
bench_input = src::combined::complex();
}
criterion_group! {
name = combined_benchmarks;
config = src::bench_config();
targets =
bench_combined_simple,
bench_combined_complex,
}
// ===================
// === Comparisons ===
// ===================
fn bench_rust_reader(c:&mut Criterion) {
let mut group = c.benchmark_group("Rust Vector");
src::SIZES.iter().for_each(|(size,name)| {
group.throughput(Throughput::Bytes(*size as u64));
let input = "abcdefghijklmnopqrstuvwxyz".repeat(1 + size / 26);
group.bench_function(
*name,
|b| b.iter(|| {
let mut counter = 0usize;
for c in black_box(input.as_str()).chars() {
if c == 'f' {
counter += 1;
}
}
counter
})
);
})
}
criterion_group! {
name = rust_comparison;
config = src::bench_config();
targets =
bench_rust_reader,
}
// ===================
// === The Harness ===
// ===================
criterion_main!(
literal_benchmarks,
name_benchmarks,
operator_benchmarks,
block_benchmarks,
comment_benchmarks,
combined_benchmarks,
rust_comparison,
);

View File

@ -1,8 +1,8 @@
use std::fs::File;
use lexer_definition::lexer::EnsoLexer;
use std::io::prelude::*;
use flexer::Definition;
use flexer::State;
use enso_flexer::Definition;
use enso_flexer::State;
@ -23,6 +23,7 @@ fn generate_engine() -> std::io::Result<()> {
let engine = lexer.specialize().unwrap();
lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition.");
file.write_all(contents.as_bytes()).expect("Unable to write lexer definition.");
file.write_all("\n".as_bytes())?;
file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization.");
Ok(())
}

View File

@ -19,6 +19,7 @@ mod library {
pub use lexer_definition::library::*;
}
/// A library of commonly useful functionality.
mod prelude {
pub use lexer_definition::prelude::*;

View File

@ -1,759 +0,0 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for the Enso Lexer.
// TODO [AA] Tests for error scenarios once it's done.
use flexer::*;
use lexer_definition::library::*;
use flexer::prelude::reader::decoder::DecoderUTF8;
use flexer::prelude::Reader;
use lexer::generated::engine::EnsoLexer;
use lexer_definition::library::token::Token;
use lexer_definition::token::BlockType;
use lexer_definition::token::LineEnding;
// =================
// === Utilities ===
// =================
/// Assert that `result` is a success with tokens `expected`.
fn assert_succeeds_as(result:&LexingResult<token::Stream>, expected:token::Stream) {
match result.kind {
ResultKind::Success => assert_eq!(result.tokens,expected),
_ => panic!("Lexing failed.")
}
}
/// Assert that the provided input lexes as `expected`.
fn assert_lexes(input:impl AsRef<str>, expected:token::Stream) {
let input_len = input.as_ref().chars().count();
let result = lex(input);
assert_succeeds_as(&result,expected);
let tokens_vec : Vec<_> = result.tokens.into();
let total_length : usize = tokens_vec.iter().map(|token| token.offset + token.length).sum();
assert_eq!(total_length,input_len);
}
/// Lex the provided string.
fn lex(input:impl AsRef<str>) -> LexingResult<token::Stream> {
let mut lexer = EnsoLexer::new();
let reader = Reader::new(input.as_ref().as_bytes(),DecoderUTF8());
lexer.run(reader)
}
/// Asserts that the input is a block and has a length equal to `length`.
fn assert_block_has_length(input:impl AsRef<str>, expected_length:usize) {
let result = lex(input);
match result.kind {
ResultKind::Success => {
let tokens = result.tokens.tokens();
match tokens.first().expect("Token should be present.") {
Token{shape:token::Shape::Block{..},length,..} =>
assert_eq!(*length,expected_length),
_ => panic!("Token not a block."),
}
},
_ => panic!("Lexing failed"),
}
}
/// Makes the test text have unix line endings to ensure consistency regardless of git checkout
/// style.
fn make_unix_line_endings(input:&str) -> String {
let string = String::from(input);
string.chars().filter(|c| *c != '\r').collect()
}
// =================
// === Operators ===
// =================
#[test]
fn function_operator() {
let input = "->";
let expected = token::Stream::from(vec![Token::Operator("->",0)]);
assert_lexes(input,expected);
}
#[test]
fn bind_operator() {
let input = "<-";
let expected = token::Stream::from(vec![Token::Operator("<-",0)]);
assert_lexes(input,expected);
}
#[test]
fn left_pipe_operator() {
let input = "<|";
let expected = token::Stream::from(vec![Token::Operator("<|",0)]);
assert_lexes(input,expected);
}
#[test]
fn right_pipe_operator() {
let input = "|>";
let expected = token::Stream::from(vec![Token::Operator("|>",0)]);
assert_lexes(input,expected);
}
#[test]
fn eq_operator() {
let input = "=";
let expected = token::Stream::from(vec![Token::Operator("=",0)]);
assert_lexes(input,expected);
}
#[test]
fn eq_compare_operator() {
let input = "==";
let expected = token::Stream::from(vec![Token::Operator("==",0)]);
assert_lexes(input,expected);
}
#[test]
fn geq_operator() {
let input = ">=";
let expected = token::Stream::from(vec![Token::Operator(">=",0)]);
assert_lexes(input,expected);
}
#[test]
fn neq_operator() {
let input = "!=";
let expected = token::Stream::from(vec![Token::Operator("!=",0)]);
assert_lexes(input,expected);
}
#[test]
fn dot_operator() {
let input = ".";
let expected = token::Stream::from(vec![Token::Operator(".",0)]);
assert_lexes(input,expected);
}
#[test]
fn comma_operator() {
let input = ",";
let expected = token::Stream::from(vec![Token::Operator(",",0)]);
assert_lexes(input,expected);
}
#[test]
fn double_dot_operator() {
let input = "..";
let expected = token::Stream::from(vec![Token::Operator("..",0)]);
assert_lexes(input,expected);
}
#[test]
fn triple_dot_operator() {
let input = "...";
let expected = token::Stream::from(vec![Token::Operator("...",0)]);
assert_lexes(input,expected);
}
#[test]
fn error_operator() {
let input = "!";
let expected = token::Stream::from(vec![Token::Operator("!",0)]);
assert_lexes(input,expected);
}
#[test]
fn type_ascription_operator() {
let input = ":";
let expected = token::Stream::from(vec![Token::Operator(":",0)]);
assert_lexes(input,expected);
}
#[test]
fn in_operator() {
let input = "in";
let expected = token::Stream::from(vec![Token::Operator("in",0)]);
assert_lexes(input,expected);
}
#[test]
fn typeset_union_operator() {
let input = "|";
let expected = token::Stream::from(vec![Token::Operator("|",0)]);
assert_lexes(input,expected);
}
#[test]
fn typeset_intersection_operator() {
let input = "&";
let expected = token::Stream::from(vec![Token::Operator("&",0)]);
assert_lexes(input,expected);
}
#[test]
fn typeset_subtraction_operator() {
let input = "\\";
let expected = token::Stream::from(vec![Token::Operator("\\",0)]);
assert_lexes(input,expected);
}
#[test]
fn disable_comment() {
let input = "#";
let expected = token::Stream::from(vec![Token::Operator("#",0)]);
assert_lexes(input,expected);
}
#[test]
fn doc_comment() {
let input = "##";
let expected = token::Stream::from(vec![Token::Operator("##",0)]);
assert_lexes(input,expected);
}
#[test]
fn arbitrary_left_operator() {
let input = "<!!-";
let expected = token::Stream::from(vec![Token::Operator("<!!-",0)]);
assert_lexes(input,expected);
}
#[test]
fn arbitrary_right_operator() {
let input = "-->>";
let expected = token::Stream::from(vec![Token::Operator("-->>",0)]);
assert_lexes(input,expected);
}
#[test]
fn modifier_plus() {
let input = "+=";
let expected = token::Stream::from(vec![Token::Modifier("+",0)]);
assert_lexes(input,expected);
}
#[test]
fn modifier_minus() {
let input = "-=";
let expected = token::Stream::from(vec![Token::Modifier("-",0)]);
assert_lexes(input,expected);
}
#[test]
fn arbitrary_modifier() {
let input = "<%=";
let expected = token::Stream::from(vec![Token::Modifier("<%",0)]);
assert_lexes(input,expected);
}
#[test]
fn invalid_eq_suffix() {
let input = "===";
let expected = token::Stream::from(vec![Token::Operator("==",0),Token::InvalidSuffix("=",0)]);
assert_lexes(input,expected);
}
#[test]
fn invalid_dots_suffix() {
let input = "....";
let expected = token::Stream::from(vec![Token::Operator("...",0),Token::InvalidSuffix(".",0)]);
assert_lexes(input,expected);
}
#[test]
fn invalid_modifier_suffix() {
let input = "+==";
let expected = token::Stream::from(vec![Token::Operator("+",0),Token::InvalidSuffix("==",0)]);
assert_lexes(input,expected);
}
// ===================
// === Identifiers ===
// ===================
#[test]
fn variable_ident() {
let input = "some_variable_name";
let expected = token::Stream::from(vec![Token::Variable("some_variable_name",0)]);
assert_lexes(input,expected)
}
#[test]
fn referent_ident() {
let input = "Some_Referent_Name";
let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name",0)]);
assert_lexes(input,expected)
}
#[test]
fn external_ident() {
let input = "__camelCaseIdentifier";
let expected = token::Stream::from(vec![Token::External("__camelCaseIdentifier",0)]);
assert_lexes(input,expected)
}
#[test]
fn blank_ident() {
let input = "_";
let expected = token::Stream::from(vec![Token::Blank(0)]);
assert_lexes(input,expected)
}
#[test]
fn ticked_variable_ident() {
let input = "some_variable_name'";
let expected = token::Stream::from(vec![Token::Variable("some_variable_name'",0)]);
assert_lexes(input,expected)
}
#[test]
fn ticked_referent_ident() {
let input = "Some_Referent_Name'";
let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'",0)]);
assert_lexes(input,expected)
}
#[test]
fn multi_ticked_variable_ident() {
let input = "some_variable_name'''";
let expected = token::Stream::from(vec![Token::Variable("some_variable_name'''",0)]);
assert_lexes(input,expected)
}
#[test]
fn multi_ticked_referent_ident() {
let input = "Some_Referent_Name'''";
let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'''",0)]);
assert_lexes(input,expected)
}
#[test]
fn variable_with_numbers() {
let input = "some0_1";
let expected = token::Stream::from(vec![Token::Variable("some0_1",0)]);
assert_lexes(input,expected)
}
#[test]
fn referent_with_numbers() {
let input = "Some_1821";
let expected = token::Stream::from(vec![Token::Referent("Some_1821",0)]);
assert_lexes(input,expected)
}
#[test]
fn tick_not_at_end_variable() {
let input = "some_var'iable";
let expected = token::Stream::from(vec![
Token::Variable("some_var'",0),
Token::InvalidSuffix("iable",0),
]);
assert_lexes(input,expected)
}
#[test]
fn trailing_underscore() {
let input = "some_var_";
let expected = token::Stream::from(vec![Token::External("some_var_",0)]);
assert_lexes(input,expected)
}
#[test]
fn trailing_underscore_with_tick() {
let input = "some_var_'";
let expected = token::Stream::from(vec![Token::External("some_var_'",0)]);
assert_lexes(input,expected)
}
#[test]
fn invalid_suffix() {
let input = "some_varД";
let expected = token::Stream::from(vec![
Token::Variable("some_var",0),
Token::InvalidSuffix("Д",0),
]);
assert_lexes(input,expected)
}
#[test]
fn unrecognized_token() {
let input = "some_var`";
let expected = token::Stream::from(vec![
Token::Variable("some_var",0),
Token::Unrecognized("`",0),
]);
assert_lexes(input,expected)
}
#[test]
fn chained_identifiers() {
let input = "my_func A' someJavaValue some_python_value";
let expected = token::Stream::from(vec![
Token::Variable("my_func",0),
Token::Referent("A'",1),
Token::External("someJavaValue",1),
Token::Variable("some_python_value",1),
]);
assert_lexes(input,expected)
}
// ===============
// === Numbers ===
// ===============
#[test]
fn integer() {
let input = "13831";
let expected = token::Stream::from(vec![Token::Number("","13831",0)]);
assert_lexes(input,expected);
}
#[test]
fn integer_with_explicit_base() {
let input = "10_13831";
let expected = token::Stream::from(vec![Token::Number("10","13831",0)]);
assert_lexes(input,expected);
}
#[test]
fn dangling_base() {
let input = "10_";
let expected = token::Stream::from(vec![Token::DanglingBase("10",0)]);
assert_lexes(input,expected);
}
#[test]
fn hex_number() {
let input = "16_ff";
let expected = token::Stream::from(vec![Token::Number("16","ff",0)]);
assert_lexes(input,expected);
}
#[test]
fn decimal() {
let input = "2.71828";
let expected = token::Stream::from(vec![Token::Number("","2.71828",0)]);
assert_lexes(input,expected);
}
#[test]
fn decimal_with_explicit_base() {
let input = "10_2.71828";
let expected = token::Stream::from(vec![Token::Number("10","2.71828",0)]);
assert_lexes(input,expected);
}
#[test]
fn error_base() {
let input = "10.2_2";
let expected = token::Stream::from(vec![
Token::Number("","10.2",0),
Token::InvalidSuffix("_2",0),
]);
assert_lexes(input,expected);
}
#[test]
fn offset_number() {
let input = " 10.2";
let expected = token::Stream::from(vec![
Token::Number("","10.2",4),
]);
assert_lexes(input,expected);
}
// ============
// === Text ===
// ============
// ==============
// === Blocks ===
// ==============
#[test]
fn block_function_call() {
let input = make_unix_line_endings(
r#"f
argument_1
argument_2
fn a1 a2 a3
argument_4
argument_5"#);
let block_fn_args =
Token::Block(
BlockType::Continuous,
4,
vec![
Token::Line(
vec![Token::Variable("argument_1",0)],
0,
LineEnding::LF
),
Token::Line(
vec![
Token::Variable("argument_2",0),
],
0,
LineEnding::LF
),
Token::Line(
vec![
Token::Variable("fn",0),
Token::Variable("a1",1),
Token::Variable("a2",1),
Token::Variable("a3",1),
],
0,
LineEnding::LF
),
Token::Line(
vec![
Token::Variable("argument_4",0),
],
0,
LineEnding::LF
),
Token::Line(
vec![
Token::Variable("argument_5",0),
],
0,
LineEnding::None
),
],
0
);
let top_level_first_line = Token::Line(
vec![
Token::Variable("f",0),
block_fn_args
],
0,
LineEnding::LF
);
let top_level_block = token::Stream::from(vec![
Token::Block(
BlockType::Continuous,
0,
vec![top_level_first_line],
0
)
]);
assert_lexes(input,top_level_block);
}
#[test]
fn block_empty_lines() {
let input = "f\r\n a\n\n b\n";
let nested_block = Token::Block(
BlockType::Continuous,
4,
vec![
Token::Line(vec![Token::Variable("a",0)],0,LineEnding::LF),
Token::BlankLine(0,LineEnding::LF),
Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF),
],
0
);
let top_line = Token::Line(
vec![
Token::Variable("f",0),
nested_block
],
0,
LineEnding::CRLF
);
let expected = token::Stream::from(vec![
Token::Block(
BlockType::Continuous,
0,
vec![top_line],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn block_top_level() {
let input = make_unix_line_endings(
r#"
foo
bar
baz
"#);
let expected = token::Stream::from(vec![
Token::Block(
BlockType::Continuous,
0,
vec![
Token::BlankLine(0,LineEnding::LF),
Token::BlankLine(0,LineEnding::LF),
Token::Line(vec![Token::Variable("foo",0)],0,LineEnding::LF),
Token::Line(vec![Token::Variable("bar",0)],0,LineEnding::LF),
Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn block_with_operator() {
let input = make_unix_line_endings(
r#"x ->
foo x 1
"#);
let nested_block = Token::Block(
BlockType::Discontinuous,
4,
vec![
Token::Line(vec![
Token::Variable("foo",0),
Token::Variable("x",1),
Token::Number("","1",1),
], 0, LineEnding::LF)
],
0
);
let expected = token::Stream::from(vec![
Token::Block(
BlockType::Continuous,
0,
vec![
Token::Line(vec![
Token::Variable("x",0),
Token::Operator("->",1),
nested_block
], 0, LineEnding::LF)
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn block_with_nesting() {
let input = make_unix_line_endings(r#"
some_long_thing
foo ->
Bar
baz
quux
"#);
let function_block = Token::Block(
BlockType::Discontinuous,
8,
vec![
Token::Line(vec![Token::Referent("Bar",0)],0,LineEnding::LF),
Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF),
Token::BlankLine(0,LineEnding::LF),
],
0
);
let foo_block = Token::Block(
BlockType::Continuous,
4,
vec![
Token::Line(vec![
Token::Variable("foo",0),
Token::Operator("->",1),
function_block,
], 0, LineEnding::LF),
Token::Line(vec![Token::Variable("quux",0)],0,LineEnding::LF),
],
0
);
let expected = token::Stream::from(vec![
Token::Block(
BlockType::Continuous,
0,
vec![
Token::BlankLine(0,LineEnding::LF),
Token::Line(vec![
Token::Variable("some_long_thing",0),
foo_block
], 0, LineEnding::LF),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn block_extra_indented_blank_lines() {
let input = "a\n b\n \n \n c";
let indented_block = Token::Block(
BlockType::Continuous,
4,
vec![
Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF),
Token::BlankLine(8,LineEnding::LF),
Token::BlankLine(2,LineEnding::LF),
Token::Line(vec![Token::Variable("c",0)],0,LineEnding::None),
],
0
);
let top_level_line = Token::Line(vec![
Token::Variable("a",0),
indented_block
],0,LineEnding::LF);
let expected = token::Stream::from(vec![
Token::Block(
BlockType::Continuous,
0,
vec![top_level_line],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn block_length_unix() {
let input = "a\n b\n c";
assert_block_has_length(input,13);
}
#[test]
fn block_length_windows() {
let input = "a\r\n b\r\n c";
assert_block_has_length(input,15);
}
#[test]
fn block_length_mixed() {
let input = "a\r\n b\n c\n d";
assert_block_has_length(input,20);
}
// ================
// === Combined ===
// ================

View File

@ -0,0 +1,337 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for lexing blocks in the Enso lexer.
mod test_utils;
use lexer_definition::library::*;
use test_utils::*;
use lexer_definition::library::token::Token;
use lexer_definition::token::BlockType;
use lexer_definition::token::LineEnding;
// ==============
// === Blocks ===
// ==============
#[test]
fn function_call() {
let input = make_unix_line_endings(
r#"f
argument_1
argument_2
fn a1 a2 a3
argument_4
argument_5"#);
let block_fn_args =
Token::block(
BlockType::Continuous,
4,
vec![
Token::line(
vec![Token::variable("argument_1", 0)],
0,
LineEnding::LF
),
Token::line(
vec![
Token::variable("argument_2", 0),
],
0,
LineEnding::LF
),
Token::line(
vec![
Token::variable("fn", 0),
Token::variable("a1", 1),
Token::variable("a2", 1),
Token::variable("a3", 1),
],
0,
LineEnding::LF
),
Token::line(
vec![
Token::variable("argument_4", 0),
],
0,
LineEnding::LF
),
Token::line(
vec![
Token::variable("argument_5", 0),
],
0,
LineEnding::None
),
],
0
);
let top_level_first_line = Token::line(
vec![
Token::variable("f", 0),
block_fn_args
],
0,
LineEnding::LF
);
let top_level_block = token::Stream::from(vec![
Token::block(
BlockType::Continuous,
0,
vec![top_level_first_line],
0
)
]);
assert_lexes(input,top_level_block);
}
#[test]
fn empty_lines() {
let input = "f\r\n a\n\n b\n";
let nested_block = Token::block(
BlockType::Continuous,
4,
vec![
Token::line(vec![Token::variable("a", 0)], 0, LineEnding::LF),
Token::blank_line(0, LineEnding::LF),
Token::line(vec![Token::variable("b", 0)], 0, LineEnding::LF),
],
0
);
let top_line = Token::line(
vec![
Token::variable("f", 0),
nested_block
],
0,
LineEnding::CRLF
);
let expected = token::Stream::from(vec![
Token::block(
BlockType::Continuous,
0,
vec![top_line],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn top_level() {
let input = make_unix_line_endings(
r#"
foo
bar
baz
"#);
let expected = token::Stream::from(vec![
Token::block(
BlockType::Continuous,
0,
vec![
Token::blank_line(0, LineEnding::LF),
Token::blank_line(0, LineEnding::LF),
Token::line(vec![Token::variable("foo", 0)], 0, LineEnding::LF),
Token::line(vec![Token::variable("bar", 0)], 0, LineEnding::LF),
Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn with_operator() {
let input = make_unix_line_endings(
r#"x ->
foo x 1
"#);
let nested_block = Token::block(
BlockType::Discontinuous,
4,
vec![
Token::line(vec![
Token::variable("foo", 0),
Token::variable("x", 1),
Token::number("", "1", 1),
], 0, LineEnding::LF)
],
0
);
let expected = token::Stream::from(vec![
Token::block(
BlockType::Continuous,
0,
vec![
Token::line(vec![
Token::variable("x", 0),
Token::operator("->", 1),
nested_block
], 0, LineEnding::LF)
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn with_nesting() {
let input = make_unix_line_endings(r#"
some_long_thing
foo ->
Bar
baz
quux
"#);
let function_block = Token::block(
BlockType::Discontinuous,
8,
vec![
Token::line(vec![Token::referent("Bar", 0)], 0, LineEnding::LF),
Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF),
Token::blank_line(0, LineEnding::LF),
],
0
);
let foo_block = Token::block(
BlockType::Continuous,
4,
vec![
Token::line(vec![
Token::variable("foo", 0),
Token::operator("->", 1),
function_block,
], 0, LineEnding::LF),
Token::line(vec![Token::variable("quux", 0)], 0, LineEnding::LF),
],
0
);
let expected = token::Stream::from(vec![
Token::block(
BlockType::Continuous,
0,
vec![
Token::blank_line(0, LineEnding::LF),
Token::line(vec![
Token::variable("some_long_thing", 0),
foo_block
], 0, LineEnding::LF),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn multiple_dedent() {
let input = make_unix_line_endings(r#"
some_long_thing
foo ->
Bar
baz
quux
"#);
let function_block = Token::block(
BlockType::Discontinuous,
8,
vec![
Token::line(vec![Token::referent("Bar", 0)], 0, LineEnding::LF),
Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF),
],
0
);
let foo_block = Token::block(
BlockType::Continuous,
4,
vec![
Token::line(vec![
Token::variable("foo", 0),
Token::operator("->", 1),
function_block,
], 0, LineEnding::LF),
],
0
);
let expected = token::Stream::from(vec![
Token::block(
BlockType::Continuous,
0,
vec![
Token::blank_line(0, LineEnding::LF),
Token::line(vec![
Token::variable("some_long_thing", 0),
foo_block
], 0, LineEnding::LF),
Token::line(vec![Token::variable("quux", 0)], 0, LineEnding::LF),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn extra_indented_blank_lines() {
let input = "a\n b\n \n \n c";
let indented_block = Token::block(
BlockType::Continuous,
4,
vec![
Token::line(vec![Token::variable("b", 0)], 0, LineEnding::LF),
Token::blank_line(8, LineEnding::LF),
Token::blank_line(2, LineEnding::LF),
Token::line(vec![Token::variable("c", 0)], 0, LineEnding::None),
],
0
);
let top_level_line = Token::line(vec![
Token::variable("a", 0),
indented_block
], 0, LineEnding::LF);
let expected = token::Stream::from(vec![
Token::block(
BlockType::Continuous,
0,
vec![top_level_line],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn length_unix() {
let input = "a\n b\n c";
assert_block_has_length(input,13);
}
#[test]
fn length_windows() {
let input = "a\r\n b\r\n c";
assert_block_has_length(input,15);
}
#[test]
fn length_mixed() {
let input = "a\r\n b\n c\n d";
assert_block_has_length(input,20);
}

View File

@ -0,0 +1,660 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for lexing full-on Enso with the lexer.
mod test_utils;
use lexer_definition::library::*;
use test_utils::*;
use lexer_definition::library::token::Token;
// ================
// === Combined ===
// ================
#[test]
fn method_definition() {
let input = make_unix_line_endings(
r#"## Traverse the heterogeneous list, applying the provided polymorphic function
wherever it matches.
@Tail_Call
map : forall ts ts' => (this : H_List ts) -> (exists a b . a ~> b) -> H_List ts'
map this fn -> case this.types of
Cons x xs ->
x' = fn x
x.Cons (map xs)
x -> fn x
"#);
let doc_comment = Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![
Token::text_segment_raw(
"Traverse the heterogeneous list, applying the provided polymorphic \
function",
0
)
],
0,
token::LineEnding::LF,
),
Token::line(
vec![Token::text_segment_raw("wherever it matches.", 0)],
0,
token::LineEnding::LF
)
],
4,
0
),
],
0,
token::LineEnding::None,
);
let annotation = Token::line(
vec![Token::annotation("Tail_Call", 0)],
0,
token::LineEnding::LF,
);
let signature = Token::line(
vec![
Token::variable("map", 0),
Token::operator(":", 1),
Token::variable("forall", 1),
Token::variable("ts", 1),
Token::variable("ts'", 1),
Token::operator("=>", 1),
Token::operator("(", 1),
Token::variable("this", 0),
Token::operator(":", 1),
Token::referent("H_List", 1),
Token::variable("ts", 1),
Token::operator(")", 0),
Token::operator("->", 1),
Token::operator("(", 1),
Token::variable("exists", 0),
Token::variable("a", 1),
Token::variable("b", 1),
Token::operator(".", 1),
Token::variable("a", 1),
Token::operator("~>", 1),
Token::variable("b", 1),
Token::operator(")", 0),
Token::operator("->", 1),
Token::referent("H_List", 1),
Token::variable("ts'", 1),
],
0,
token::LineEnding::LF
);
let cons_branch_body = Token::block(
token::BlockType::Discontinuous,
8,
vec![
Token::line(
vec![
Token::variable("x'", 0),
Token::operator("=", 1),
Token::variable("fn", 1),
Token::variable("x", 1),
],
0,
token::LineEnding::LF
),
Token::line(
vec![
Token::variable("x", 0),
Token::operator(".", 0),
Token::referent("Cons", 0),
Token::operator("(", 1),
Token::variable("map", 0),
Token::variable("xs", 1),
Token::operator(")", 0),
],
0,
token::LineEnding::LF
),
],
0
);
let case_body = Token::block(
token::BlockType::Continuous,
4,
vec![
Token::line(
vec![
Token::referent("Cons", 0),
Token::variable("x", 1),
Token::variable("xs", 1),
Token::operator("->", 1),
cons_branch_body,
],
0,
token::LineEnding::LF
),
Token::line(
vec![
Token::variable("x", 0),
Token::operator("->", 1),
Token::variable("fn", 1),
Token::variable("x", 1)
],
0,
token::LineEnding::LF,
)
],
0
);
let function = Token::line(
vec![
Token::variable("map", 0),
Token::variable("this", 1),
Token::variable("fn", 1),
Token::operator("->", 1),
Token::variable("case", 1),
Token::variable("this", 1),
Token::operator(".", 0),
Token::variable("types", 0),
Token::variable("of", 1),
case_body,
],
0,
token::LineEnding::LF
);
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![doc_comment,annotation,signature,function],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn complex_type() {
let input = make_unix_line_endings(
r#"
type Maybe a
type Just item:a
Nothing
is_just = case this of
Just _ -> True
Nothing -> False
"#);
let case_block = Token::block(
token::BlockType::Continuous,
8,
vec![
Token::line(
vec![
Token::referent("Just", 0),
Token::blank(1),
Token::operator("->", 2),
Token::referent("True", 1),
],
0,
token::LineEnding::LF
),
Token::line(
vec![
Token::referent("Nothing", 0),
Token::operator("->", 1),
Token::referent("False", 1)
],
0,
token::LineEnding::LF
),
],
0
);
let type_body = Token::block(
token::BlockType::Continuous,
4,
vec![
Token::line(
vec![
Token::variable("type", 0),
Token::referent("Just", 1),
Token::variable("item", 1),
Token::operator(":", 0),
Token::variable("a", 0),
],
0,
token::LineEnding::LF
),
Token::line(vec![Token::referent("Nothing", 0)], 0, token::LineEnding::LF),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::variable("is_just", 0),
Token::operator("=", 1),
Token::variable("case", 1),
Token::variable("this", 1),
Token::variable("of", 1),
case_block,
],
0,
token::LineEnding::LF
)
],
0
);
let complex_type = Token::line(
vec![
Token::variable("type", 0),
Token::referent("Maybe", 1),
Token::variable("a", 1),
type_body,
],
0,
token::LineEnding::LF
);
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::blank_line(0, token::LineEnding::LF),
complex_type
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn imports_exports() {
let input = make_unix_line_endings(
r#"import Base.List
import Base.Number.Extensions
from Builtins import Unit, Number, Integer, Any, True, False
from Builtins export all
from Base.List export Nil, Cons
from Base.Number.Extensions export all hiding Math
polyglot java import com.ibm.icu.text.BreakIterator
polyglot java import org.enso.base.Text_Utils
"#);
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::line(
vec![
Token::variable("import", 0),
Token::referent("Base", 1),
Token::operator(".", 0),
Token::referent("List", 0),
],
0,
token::LineEnding::LF
),
Token::line(
vec![
Token::variable("import", 0),
Token::referent("Base", 1),
Token::operator(".", 0),
Token::referent("Number", 0),
Token::operator(".", 0),
Token::referent("Extensions", 0),
],
0,
token::LineEnding::LF
),
Token::line(
vec![
Token::variable("from", 0),
Token::referent("Builtins", 1),
Token::variable("import", 1),
Token::referent("Unit", 1),
Token::operator(",", 0),
Token::referent("Number", 1),
Token::operator(",", 0),
Token::referent("Integer", 1),
Token::operator(",", 0),
Token::referent("Any", 1),
Token::operator(",", 0),
Token::referent("True", 1),
Token::operator(",", 0),
Token::referent("False", 1),
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::variable("from", 0),
Token::referent("Builtins", 1),
Token::variable("export", 1),
Token::variable("all", 1),
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::variable("from", 0),
Token::referent("Base", 1),
Token::operator(".", 0),
Token::referent("List", 0),
Token::variable("export", 1),
Token::referent("Nil", 1),
Token::operator(",", 0),
Token::referent("Cons", 1),
],
0,
token::LineEnding::LF
),
Token::line(
vec![
Token::variable("from", 0),
Token::referent("Base", 1),
Token::operator(".", 0),
Token::referent("Number", 0),
Token::operator(".", 0),
Token::referent("Extensions", 0),
Token::variable("export", 1),
Token::variable("all", 1),
Token::variable("hiding", 1),
Token::referent("Math", 1),
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::variable("polyglot", 0),
Token::variable("java", 1),
Token::variable("import", 1),
Token::variable("com", 1),
Token::operator(".", 0),
Token::variable("ibm", 0),
Token::operator(".", 0),
Token::variable("icu", 0),
Token::operator(".", 0),
Token::variable("text", 0),
Token::operator(".", 0),
Token::external("BreakIterator", 0),
],
0,
token::LineEnding::LF
),
Token::line(
vec![
Token::variable("polyglot", 0),
Token::variable("java", 1),
Token::variable("import", 1),
Token::variable("org", 1),
Token::operator(".", 0),
Token::variable("enso", 0),
Token::operator(".", 0),
Token::variable("base", 0),
Token::operator(".", 0),
Token::referent("Text_Utils", 0),
],
0,
token::LineEnding::LF
),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn some_stdlib() {
let input = make_unix_line_endings(
r#"from Base import all
## The top-level entry point for a test suite.
type Suite specs
## PRIVATE
type Spec name behaviors
## PRIVATE
type Behavior name result
## PRIVATE
Behavior.is_fail = this.result.is_fail
## PRIVATE
Spec.is_fail = this.behaviors.any is_fail
## PRIVATE
Suite.is_fail = this.specs.any is_fail
"#);
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::line(
vec![
Token::variable("from", 0),
Token::referent("Base", 1),
Token::variable("import", 1),
Token::variable("all", 1),
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![
Token::text_segment_raw(
"The top-level entry point for a test suite.",
0
),
],
0,
token::LineEnding::LF,
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::line(
vec![
Token::variable("type", 0),
Token::referent("Suite", 1),
Token::variable("specs", 1),
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("PRIVATE", 0),],
0,
token::LineEnding::LF,
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::line(
vec![
Token::variable("type", 0),
Token::referent("Spec", 1),
Token::variable("name", 1),
Token::variable("behaviors", 1)
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("PRIVATE", 0),],
0,
token::LineEnding::LF,
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::line(
vec![
Token::variable("type", 0),
Token::referent("Behavior", 1),
Token::variable("name", 1),
Token::variable("result", 1)
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("PRIVATE", 0),],
0,
token::LineEnding::LF,
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::line(
vec![
Token::referent("Behavior", 0),
Token::operator(".", 0),
Token::variable("is_fail", 0),
Token::operator("=", 1),
Token::variable("this", 1),
Token::operator(".", 0),
Token::variable("result", 0),
Token::operator(".", 0),
Token::variable("is_fail", 0),
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("PRIVATE", 0),],
0,
token::LineEnding::LF,
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::line(
vec![
Token::referent("Spec", 0),
Token::operator(".", 0),
Token::variable("is_fail", 0),
Token::operator("=", 1),
Token::variable("this", 1),
Token::operator(".", 0),
Token::variable("behaviors", 0),
Token::operator(".", 0),
Token::variable("any", 0),
Token::variable("is_fail", 1)
],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("PRIVATE", 0),],
0,
token::LineEnding::LF,
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::line(
vec![
Token::referent("Suite", 0),
Token::operator(".", 0),
Token::variable("is_fail", 0),
Token::operator("=", 1),
Token::variable("this", 1),
Token::operator(".", 0),
Token::variable("specs", 0),
Token::operator(".", 0),
Token::variable("any", 0),
Token::variable("is_fail", 1)
],
0,
token::LineEnding::LF
),
],
0
)
]);
assert_lexes(input,expected);
}

View File

@ -0,0 +1,318 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for lexing comments in the Enso lexer.
mod test_utils;
use lexer_definition::library::*;
use test_utils::*;
use lexer_definition::library::token::Token;
// ================
// === Comments ===
// ================
#[test]
fn disable_eof() {
let input = "# Here is a nice long comment string.";
let expected = token::Stream::from(vec![
Token::disable_comment(" Here is a nice long comment string.", 0)
]);
assert_lexes(input,expected);
}
#[test]
fn disable_lf() {
let input = "# Here is a nice long comment string.\n";
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::line(
vec![Token::disable_comment(" Here is a nice long comment string.", 0)],
0,
token::LineEnding::LF
)
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn disable_crlf() {
let input = "# Here is a nice long comment string.\r\n";
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::line(
vec![Token::disable_comment(" Here is a nice long comment string.", 0)],
0,
token::LineEnding::CRLF
)
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn disable_in_line() {
let input = "a + b <*> N # Compare the frobnicators.";
let expected = token::Stream::from(vec![
Token::variable("a", 0),
Token::operator("+", 1),
Token::variable("b", 1),
Token::operator("<*>", 1),
Token::referent("N", 1),
Token::disable_comment(" Compare the frobnicators.", 1),
]);
assert_lexes(input,expected)
}
#[test]
fn disable_in_interpolate() {
let input = "'String `1 + 1 # add` stuff.'";
let expected = token::Stream::from(vec![
Token::text_line(
token::TextStyle::FormatLine,
vec![
Token::text_segment_raw("String ", 0),
Token::text_segment_interpolate(
vec![
Token::number("", "1", 0),
Token::operator("+", 1),
Token::number("", "1", 1),
Token::unrecognized("#", 1),
Token::variable("add", 1)
],
0
),
Token::text_segment_raw(" stuff.", 0),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn doc_single_line_eof() {
let input = "## Foo bar baz";
let expected = token::Stream::from(vec![
Token::doc_comment(
vec![
Token::line(vec![Token::text_segment_raw("Foo bar baz", 0)], 0, token::LineEnding::None)
],
3,
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn doc_single_line_lf() {
let input = "## Foo bar baz\n";
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("Foo bar baz", 0)],
0,
token::LineEnding::LF
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::blank_line(0, token::LineEnding::None),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn doc_single_line_crlf() {
let input = "## Foo bar baz\r\n";
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::line(
vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("Foo bar baz", 0)],
0,
token::LineEnding::CRLF
)
],
3,
0
)
],
0,
token::LineEnding::None
),
Token::blank_line(0, token::LineEnding::None),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn doc_in_interpolate() {
let input = "'String `1 + 1 ## add` stuff.'";
let expected = token::Stream::from(vec![
Token::text_line(
token::TextStyle::FormatLine,
vec![
Token::text_segment_raw("String ", 0),
Token::text_segment_interpolate(
vec![
Token::number("", "1", 0),
Token::operator("+", 1),
Token::number("", "1", 1),
Token::unrecognized("##", 1),
Token::variable("add", 1)
],
0
),
Token::text_segment_raw(" stuff.", 0),
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn doc_multi_line() {
let input = make_unix_line_endings(
r#"## Here is a doc comment.
It spans multiple lines.
Some are indented much further.
And this is okay.
It keeps going, even with blank lines.
Until the indentation decreases back.
trailing_blanks_not_part_of_comment"#);
let doc_comment = Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("Here is a doc comment.", 0)],
0,
token::LineEnding::LF
),
Token::line(
vec![Token::text_segment_raw("It spans multiple lines.", 0)],
0,
token::LineEnding::LF
),
Token::line(
vec![Token::text_segment_raw(" Some are indented much further.", 0)],
0,
token::LineEnding::LF
),
Token::line(
vec![Token::text_segment_raw(" And this is okay.", 0)],
0,
token::LineEnding::LF
),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![Token::text_segment_raw("It keeps going, even with blank lines.", 0)],
0,
token::LineEnding::LF
),
Token::line(
vec![Token::text_segment_raw("Until the indentation decreases back.", 0)],
0,
token::LineEnding::LF
),
],
4,
0
);
let expected = token::Stream::from(vec![
Token::block(
token::BlockType::Continuous,
0,
vec![
Token::line(vec![doc_comment], 0, token::LineEnding::None),
Token::blank_line(0, token::LineEnding::LF),
Token::line(
vec![Token::variable("trailing_blanks_not_part_of_comment", 0)],
0,
token::LineEnding::None
)
],
0
)
]);
assert_lexes(input,expected);
}
#[test]
fn doc_mixed_line_endings() {
let input = "## Start a doc comment\n It has indent 3.\r\n \n An indented blank too.";
let expected = token::Stream::from(vec![
Token::doc_comment(
vec![
Token::line(
vec![Token::text_segment_raw("Start a doc comment", 0)],
0,
token::LineEnding::LF
),
Token::line(
vec![Token::text_segment_raw("It has indent 3.", 0)],
0,
token::LineEnding::CRLF
),
Token::blank_line(4, token::LineEnding::LF),
Token::line(
vec![Token::text_segment_raw(" An indented blank too.", 0)],
0,
token::LineEnding::None
)
],
3,
0
)
]);
assert_lexes(input,expected);
}

View File

@ -0,0 +1,178 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for lexing identifiers in the Enso lexer.
mod test_utils;
use lexer_definition::library::*;
use test_utils::*;
use lexer_definition::library::token::Token;
// ===================
// === Identifiers ===
// ===================
#[test]
fn variable_ident() {
let input = "some_variable_name";
let expected = token::Stream::from(vec![Token::variable("some_variable_name", 0)]);
assert_lexes(input,expected)
}
#[test]
fn referent_ident() {
let input = "Some_Referent_Name";
let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name", 0)]);
assert_lexes(input,expected)
}
#[test]
fn external_ident() {
let input = "__camelCaseIdentifier";
let expected = token::Stream::from(vec![Token::external("__camelCaseIdentifier", 0)]);
assert_lexes(input,expected)
}
#[test]
fn blank_ident() {
let input = "_";
let expected = token::Stream::from(vec![Token::blank(0)]);
assert_lexes(input,expected)
}
#[test]
fn annotation() {
let input = "@debug";
let expected = token::Stream::from(vec![Token::annotation("debug", 0)]);
assert_lexes(input,expected);
}
#[test]
fn ticked_variable_ident() {
let input = "some_variable_name'";
let expected = token::Stream::from(vec![Token::variable("some_variable_name'", 0)]);
assert_lexes(input,expected)
}
#[test]
fn ticked_referent_ident() {
let input = "Some_Referent_Name'";
let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name'", 0)]);
assert_lexes(input,expected)
}
#[test]
fn ticked_annotation() {
let input = "@debug'";
let expected = token::Stream::from(vec![Token::annotation("debug'", 0)]);
assert_lexes(input,expected);
}
#[test]
fn multi_ticked_variable_ident() {
let input = "some_variable_name'''";
let expected = token::Stream::from(vec![Token::variable("some_variable_name'''", 0)]);
assert_lexes(input,expected)
}
#[test]
fn multi_ticked_referent_ident() {
let input = "Some_Referent_Name'''";
let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name'''", 0)]);
assert_lexes(input,expected)
}
#[test]
fn multi_ticked_annotation() {
let input = "@debug''";
let expected = token::Stream::from(vec![Token::annotation("debug''", 0)]);
assert_lexes(input,expected);
}
#[test]
fn variable_with_numbers() {
let input = "some0_1";
let expected = token::Stream::from(vec![Token::variable("some0_1", 0)]);
assert_lexes(input,expected)
}
#[test]
fn referent_with_numbers() {
let input = "Some_1821";
let expected = token::Stream::from(vec![Token::referent("Some_1821", 0)]);
assert_lexes(input,expected)
}
#[test]
fn annotation_with_numbers() {
let input = "@debug_1";
let expected = token::Stream::from(vec![Token::annotation("debug_1", 0)]);
assert_lexes(input,expected);
}
#[test]
fn tick_not_at_end_variable() {
let input = "some_var'iable";
let expected = token::Stream::from(vec![
Token::variable("some_var'", 0),
Token::invalid_suffix("iable", 0),
]);
assert_lexes(input,expected)
}
#[test]
fn trailing_underscore() {
let input = "some_var_";
let expected = token::Stream::from(vec![Token::external("some_var_", 0)]);
assert_lexes(input,expected)
}
#[test]
fn trailing_underscore_with_tick() {
let input = "some_var_'";
let expected = token::Stream::from(vec![Token::external("some_var_'", 0)]);
assert_lexes(input,expected)
}
#[test]
fn invalid_suffix() {
let input = "some_varД";
let expected = token::Stream::from(vec![
Token::variable("some_var", 0),
Token::invalid_suffix("Д", 0),
]);
assert_lexes(input,expected)
}
#[test]
fn unrecognized_token() {
let input = "some_var@";
let expected = token::Stream::from(vec![
Token::variable("some_var", 0),
Token::unrecognized("@", 0),
]);
assert_lexes(input,expected)
}
#[test]
fn chained_identifiers() {
let input = "my_func A' someJavaValue some_python_value";
let expected = token::Stream::from(vec![
Token::variable("my_func", 0),
Token::referent("A'", 1),
Token::external("someJavaValue", 1),
Token::variable("some_python_value", 1),
]);
assert_lexes(input,expected)
}

View File

@ -0,0 +1,85 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for lexing number literals in the Enso lexer.
mod test_utils;
use lexer_definition::library::*;
use test_utils::*;
use lexer_definition::library::token::Token;
// ===============
// === Numbers ===
// ===============
#[test]
fn integer() {
let input = "13831";
let expected = token::Stream::from(vec![Token::number("", "13831", 0)]);
assert_lexes(input,expected);
}
#[test]
fn integer_with_explicit_base() {
let input = "10_13831";
let expected = token::Stream::from(vec![Token::number("10", "13831", 0)]);
assert_lexes(input,expected);
}
#[test]
fn dangling_base() {
let input = "10_";
let expected = token::Stream::from(vec![Token::dangling_base("10", 0)]);
assert_lexes(input,expected);
}
#[test]
fn hex_number() {
let input = "16_ff";
let expected = token::Stream::from(vec![Token::number("16", "ff", 0)]);
assert_lexes(input,expected);
}
#[test]
fn decimal() {
let input = "2.71828";
let expected = token::Stream::from(vec![Token::number("", "2.71828", 0)]);
assert_lexes(input,expected);
}
#[test]
fn decimal_with_explicit_base() {
let input = "10_2.71828";
let expected = token::Stream::from(vec![Token::number("10", "2.71828", 0)]);
assert_lexes(input,expected);
}
#[test]
fn error_base() {
let input = "10.2_2";
let expected = token::Stream::from(vec![
Token::number("", "10.2", 0),
Token::invalid_suffix("_2", 0),
]);
assert_lexes(input,expected);
}
#[test]
fn offset_number() {
let input = " 10.2";
let expected = token::Stream::from(vec![
Token::number("", "10.2", 4),
]);
assert_lexes(input,expected);
}

View File

@ -0,0 +1,230 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]
//! This file contains tests for lexing operators in the Enso lexer.
mod test_utils;
use lexer_definition::library::*;
use test_utils::*;
use lexer_definition::library::token::Token;
// =================
// === Operators ===
// =================
#[test]
fn function_operator() {
let input = "->";
let expected = token::Stream::from(vec![Token::operator("->", 0)]);
assert_lexes(input,expected);
}
#[test]
fn bind_operator() {
let input = "<-";
let expected = token::Stream::from(vec![Token::operator("<-", 0)]);
assert_lexes(input,expected);
}
#[test]
fn left_pipe_operator() {
let input = "<|";
let expected = token::Stream::from(vec![Token::operator("<|", 0)]);
assert_lexes(input,expected);
}
#[test]
fn right_pipe_operator() {
let input = "|>";
let expected = token::Stream::from(vec![Token::operator("|>", 0)]);
assert_lexes(input,expected);
}
#[test]
fn eq_operator() {
let input = "=";
let expected = token::Stream::from(vec![Token::operator("=", 0)]);
assert_lexes(input,expected);
}
#[test]
fn eq_compare_operator() {
let input = "==";
let expected = token::Stream::from(vec![Token::operator("==", 0)]);
assert_lexes(input,expected);
}
#[test]
fn geq_operator() {
let input = ">=";
let expected = token::Stream::from(vec![Token::operator(">=", 0)]);
assert_lexes(input,expected);
}
#[test]
fn neq_operator() {
let input = "!=";
let expected = token::Stream::from(vec![Token::operator("!=", 0)]);
assert_lexes(input,expected);
}
#[test]
fn dot_operator() {
let input = ".";
let expected = token::Stream::from(vec![Token::operator(".", 0)]);
assert_lexes(input,expected);
}
#[test]
fn comma_operator() {
let input = ",";
let expected = token::Stream::from(vec![Token::operator(",", 0)]);
assert_lexes(input,expected);
}
#[test]
fn double_dot_operator() {
let input = "..";
let expected = token::Stream::from(vec![Token::operator("..", 0)]);
assert_lexes(input,expected);
}
#[test]
fn triple_dot_operator() {
let input = "...";
let expected = token::Stream::from(vec![Token::operator("...", 0)]);
assert_lexes(input,expected);
}
#[test]
fn error_operator() {
let input = "!";
let expected = token::Stream::from(vec![Token::operator("!", 0)]);
assert_lexes(input,expected);
}
#[test]
fn type_ascription_operator() {
let input = ":";
let expected = token::Stream::from(vec![Token::operator(":", 0)]);
assert_lexes(input,expected);
}
#[test]
fn in_operator() {
let input = "in";
let expected = token::Stream::from(vec![Token::operator("in", 0)]);
assert_lexes(input,expected);
}
#[test]
fn typeset_union_operator() {
let input = "|";
let expected = token::Stream::from(vec![Token::operator("|", 0)]);
assert_lexes(input,expected);
}
#[test]
fn typeset_intersection_operator() {
let input = "&";
let expected = token::Stream::from(vec![Token::operator("&", 0)]);
assert_lexes(input,expected);
}
#[test]
fn typeset_subtraction_operator() {
let input = "\\";
let expected = token::Stream::from(vec![Token::operator("\\", 0)]);
assert_lexes(input,expected);
}
#[test]
fn arbitrary_left_operator() {
let input = "<!!-";
let expected = token::Stream::from(vec![Token::operator("<!!-", 0)]);
assert_lexes(input,expected);
}
#[test]
fn arbitrary_right_operator() {
let input = "-->>";
let expected = token::Stream::from(vec![Token::operator("-->>", 0)]);
assert_lexes(input,expected);
}
#[test]
fn modifier_plus() {
let input = "+=";
let expected = token::Stream::from(vec![Token::modifier("+", 0)]);
assert_lexes(input,expected);
}
#[test]
fn modifier_minus() {
let input = "-=";
let expected = token::Stream::from(vec![Token::modifier("-", 0)]);
assert_lexes(input,expected);
}
#[test]
fn arbitrary_modifier() {
let input = "<%=";
let expected = token::Stream::from(vec![Token::modifier("<%", 0)]);
assert_lexes(input,expected);
}
#[test]
fn invalid_eq_suffix() {
let input = "===";
let expected = token::Stream::from(vec![Token::operator("==", 0), Token::invalid_suffix("=", 0)]);
assert_lexes(input,expected);
}
#[test]
fn invalid_dots_suffix() {
let input = "....";
let expected = token::Stream::from(vec![Token::operator("...", 0), Token::invalid_suffix(".", 0)]);
assert_lexes(input,expected);
}
#[test]
fn invalid_modifier_suffix() {
let input = "+==";
let expected = token::Stream::from(vec![Token::operator("+", 0), Token::invalid_suffix("==", 0)]);
assert_lexes(input,expected);
}
#[test]
fn dot_call_operator() {
let input = ".+ .<*>";
let expected = token::Stream::from(vec![
Token::operator(".", 0),
Token::operator("+", 0),
Token::operator(".", 1),
Token::operator("<*>", 0),
]);
assert_lexes(input,expected)
}
#[test]
fn dot_eq_operator() {
let input = ".== . !=";
let expected = token::Stream::from(vec![
Token::operator(".", 0),
Token::operator("==", 0),
Token::operator(".", 1),
Token::operator("!=", 2),
]);
assert_lexes(input,expected);
}

File diff suppressed because it is too large

View File

@ -0,0 +1,65 @@
//! Utilities for testing the Enso lexer.
#![allow(dead_code)]
use enso_flexer::*;
use lexer_definition::library::*;
use enso_flexer::prelude::reader::decoder::DecoderUTF8;
use enso_flexer::prelude::Reader;
use lexer::generated::engine::EnsoLexer;
use lexer_definition::library::token::Token;
// =================
// === Utilities ===
// =================
/// Assert that `result` is a success with tokens `expected`.
pub fn assert_succeeds_as(result:&LexingResult<token::Stream>, expected:token::Stream) {
match result.kind {
ResultKind::Success => assert_eq!(result.tokens,expected),
_ => panic!("Lexing failed.")
}
}
/// Assert that the provided input lexes as `expected`.
pub fn assert_lexes(input:impl AsRef<str>, expected:token::Stream) {
let input_len = input.as_ref().chars().count();
let result = lex(input);
assert_succeeds_as(&result,expected);
let tokens_vec : Vec<_> = result.tokens.into();
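    // Sanity check: the sum of every token's offset and length must account for
    // each character of the input exactly once.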
let total_length : usize = tokens_vec.iter().map(|token| token.offset + token.length).sum();
assert_eq!(total_length,input_len);
}
/// Lex the provided string.
pub fn lex(input:impl AsRef<str>) -> LexingResult<token::Stream> {
let mut lexer = EnsoLexer::new();
let reader = Reader::new(input.as_ref().as_bytes(),DecoderUTF8());
lexer.run(reader)
}
/// Asserts that the input is a block and has a length equal to `length`.
pub fn assert_block_has_length(input:impl AsRef<str>, expected_length:usize) {
let result = lex(input);
match result.kind {
ResultKind::Success => {
let tokens = result.tokens.tokens();
match tokens.first().expect("Token should be present.") {
Token{shape:token::Shape::Block{..},length,..} =>
assert_eq!(*length,expected_length),
_ => panic!("Token not a block."),
}
},
_ => panic!("Lexing failed"),
}
}
/// Converts the test text to Unix line endings, ensuring consistency regardless of the git
/// checkout style.
pub fn make_unix_line_endings(input:&str) -> String {
let string = String::from(input);
string.chars().filter(|c| *c != '\r').collect()
}
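// The following is an illustrative sketch only (not part of the original commit),
// showing how the helpers above combine in a typical test: lex an input and
// compare it against a hand-built token stream.
#[test]
fn example_test_utils_usage() {
    let input    = make_unix_line_endings("foo 10");
    let expected = token::Stream::from(vec![
        Token::variable("foo", 0),
        Token::number("", "10", 1),
    ]);
    assert_lexes(input, expected);
}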

View File

@ -516,6 +516,9 @@ case class ParserDef() extends flexer.Parser[AST.Module] {
logger.trace {
onEscape(Segment.Escape.Slash)
}
def onEscapeFormatQuote(): Unit = logger.trace {
onEscape(Segment.Escape.Quote)
}
def onEscapeQuote(): Unit =
logger.trace {
@ -619,46 +622,23 @@ case class ParserDef() extends flexer.Parser[AST.Module] {
}
ROOT || '`' || text.onInterpolateEnd()
text.FMT || '`' || text.onInterpolateBegin()
ROOT || "'''" >> "'".many1 || text.onInvalidQuote()
ROOT || "\"\"\"" >> "\"".many1 || text.onInvalidQuote()
ROOT || "'" || text.onBegin(text.FMT_LINE)
text.FMT_LINE || "'" || text.submit()
text.FMT_LINE || "'".many1 || text.submitInvalidQuote()
text.FMT_LINE || text.fmtSeg || text.submitPlainSegment()
text.FMT_LINE || eof || text.submitMissingQuote()
text.FMT_LINE || newline || text.submitMissingQuote()
block.FIRSTCHAR || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK)
ROOT || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK)
ROOT || "'''" || text.onInlineBlock()
text.FMT_BLCK || text.fmtBSeg || text.submitPlainSegment()
text.FMT_BLCK || eof || text.onEndOfBlock()
text.FMT_BLCK || newline || text.onEndOfLine()
ROOT || "\"\"\"" >> "\"".many1 || text.onInvalidQuote()
ROOT || '"' || text.onBegin(text.RAW_LINE)
text.RAW_LINE || '"' || text.submit()
text.RAW_LINE || '"'.many1 || text.submitInvalidQuote()
text.RAW_LINE || text.rawSeg || text.submitPlainSegment()
text.RAW_LINE || eof || text.submitMissingQuote()
text.RAW_LINE || newline || text.submitMissingQuote()
block.FIRSTCHAR || text.rawBlock || text.onBeginBlock(text.RAW_BLCK)
ROOT || text.rawBlock || text.onBeginBlock(text.RAW_BLCK)
ROOT || "\"\"\"" || text.onInlineBlock()
text.RAW_BLCK || text.rawBSeg || text.submitPlainSegment()
text.RAW_BLCK || eof || text.onEndOfBlock()
text.RAW_BLCK || newline || text.onEndOfLine()
text.NEWLINE || space.opt || text.onNewLine()
text.NEWLINE || space.opt >> newline || text.onEmptyLine()
text.NEWLINE || space.opt >> eof || text.onEOFNewLine()
block.FIRSTCHAR || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK)
block.FIRSTCHAR || text.rawBlock || text.onBeginBlock(text.RAW_BLCK)
text.FMT || '`' || text.onInterpolateBegin()
AST.Text.Segment.Escape.Character.codes.foreach { code =>
val char = s"text.Segment.Escape.Character.$code"
text.FMT || s"\\$code" run s"text.onEscape($char)"
}
AST.Text.Segment.Escape.Control.codes.foreach { code =>
val ctrl = s"text.Segment.Escape.Control.$code"
text.FMT || s"\\$code" run s"text.onEscape($ctrl)"
@ -668,16 +648,39 @@ case class ParserDef() extends flexer.Parser[AST.Module] {
text.FMT || text.escape_u16 || text.onEscapeU16()
text.FMT || text.escape_u32 || text.onEscapeU32()
text.FMT || text.escape_int || text.onEscapeInt()
text.FMT || "\\'" || text.onEscapeFormatQuote()
text.FMT || "\\\\" || text.onEscapeSlash()
text.FMT || "\\'" || text.onEscapeQuote()
text.FMT || "\\" >> any || text.onEscapeInvalid()
text.FMT || "\\" || text.onEscapeUnfinished()
text.FMT_LINE || "'" || text.submit()
text.FMT_LINE || "'".many1 || text.submitInvalidQuote()
text.FMT_LINE || text.fmtSeg || text.submitPlainSegment()
text.FMT_LINE || eof || text.submitMissingQuote()
text.FMT_LINE || newline || text.submitMissingQuote()
text.FMT_BLCK || text.fmtBSeg || text.submitPlainSegment()
text.FMT_BLCK || eof || text.onEndOfBlock()
text.FMT_BLCK || newline || text.onEndOfLine()
text.RAW_LINE || '"' || text.submit()
text.RAW_LINE || '"'.many1 || text.submitInvalidQuote()
text.RAW_LINE || text.rawSeg || text.submitPlainSegment()
text.RAW_LINE || eof || text.submitMissingQuote()
text.RAW_LINE || newline || text.submitMissingQuote()
text.RAW_LINE || "\\\"" || text.onEscapeRawQuote()
text.RAW_LINE || "\\\\" || text.onEscapeSlash()
text.RAW_LINE || "\\" >> any || text.onEscapeInvalid()
text.RAW_LINE || "\\" || text.onEscapeUnfinished()
text.RAW_BLCK || text.rawBSeg || text.submitPlainSegment()
text.RAW_BLCK || eof || text.onEndOfBlock()
text.RAW_BLCK || newline || text.onEndOfLine()
text.NEWLINE || space.opt || text.onNewLine()
text.NEWLINE || space.opt >> newline || text.onEmptyLine()
text.NEWLINE || space.opt >> eof || text.onEOFNewLine()
//////////////
/// Blocks ///
//////////////

View File

@ -0,0 +1,169 @@
package org.enso.syntax;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.BenchmarkParams;
import org.openjdk.jmh.infra.Blackhole;
import java.util.concurrent.TimeUnit;
@BenchmarkMode(Mode.AverageTime)
@Fork(1)
@Warmup(iterations = 5)
@Measurement(iterations = 10)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class LexerBench {
@State(Scope.Thread)
public static class BenchState {
@Param({"1024" /* 1KB */, "102400" /* 100KB */, "1048576" /* 1MB */, "10485760" /* 10MB */})
public int bytesSize;
public String myInput;
@Setup(Level.Trial)
public void doSetup(BenchmarkParams params) {
var benchNameSegments = params.getBenchmark().split("\\.");
var benchName = benchNameSegments[benchNameSegments.length - 1];
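      // e.g. "org.enso.syntax.LexerBench.blockNested" yields "blockNested", which must
      // match a key in LexerBenchFixtures.benchmarks().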
var benchInput = LexerBenchFixtures.benchmarks().get(benchName).get();
this.myInput = LexerBenchFixtures.replicate(benchInput, bytesSize, false);
}
}
// === Literals ===
@Benchmark
public void literalNumberInteger(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalNumberIntegerExplicitBase(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalNumberDecimal(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalNumberDecimalExplicitBase(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalNumberErrorBase(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalTextFormatLine(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalTextFormatInlineBlock(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalTextFormatBlock(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalTextRawLine(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalTextRawInlineBlock(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void literalTextRawBlock(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
// === Names ===
@Benchmark
public void nameLineOf(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void nameInvalidSuffix(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
// === Operators ===
@Benchmark
public void operatorLineOf(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void operatorDotCall(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void operatorInvalidSuffix(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
// === Blocks ===
@Benchmark
public void blockTopLevel(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void blockNested(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void blockDeeplyNested(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
// === Comments ===
@Benchmark
public void commentLine(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void commentInLine(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void commentDoc(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
// === Combined ===
@Benchmark
public void combinedSimple(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
@Benchmark
public void combinedComplex(Blackhole blackhole, BenchState state) {
blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
}
}

View File

@ -0,0 +1,223 @@
package org.enso.syntax
import java.nio.charset.StandardCharsets
import org.enso.flexer
import org.enso.flexer.Reader
import org.enso.syntax.text.AST
import org.enso.syntax.text.spec.{ParserDef, ParserDef2}
object LexerBenchFixtures {
private val newEngine = flexer.Parser.compile(ParserDef())
// === Lexer Runner ===
/** Execute the lexer on the provided `input`.
*
* @param input the source code
* @return the result of lexing `input`
*/
def runLexer(input: String): ParserDef2.Result[AST.Module] = {
val engine = newEngine()
val reader = new Reader(input)
engine.run(reader)
}
// === Utilities ===
/** Replicate the provided `input` until its UTF-8 encoded size is at least
* `size` bytes.
*
* @param input the text to replicate
* @param size the minimum size, in bytes, to replicate `input` to
* @param addNewline whether a newline should be appended after each repetition
* of `input` (a single space is appended otherwise)
* @return `input`, repeated enough times that its size is >= `size`
*/
def replicate(
input: String,
size: Int,
addNewline: Boolean = false
): String = {
val inputSize = input.getBytes(StandardCharsets.UTF_8).length
val times = 1 + size / inputSize
val inputNewline = if (addNewline) input + "\n" else input + " "
inputNewline.repeat(times)
}
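  // Illustrative examples (not part of the original commit):
  //   replicate("ab", 5)                    == "ab ab ab "    (9 bytes >= 5)
  //   replicate("ab", 5, addNewline = true) == "ab\nab\nab\n"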
/** Replace all CRLF line endings in the input by LF.
*
* @param input the input text
* @return `input` with all `\r\n` replaced by `\n`
*/
def preprocess(input: String): String = {
input.replace("\r\n", "\n")
}
// === Benchmarks ===
val benchmarks: Map[String, String] = Map(
// Literals
("literalNumberInteger", Literal.Number.integer),
("literalNumberIntegerExplicitBase", Literal.Number.integerExplicitBase),
("literalNumberDecimal", Literal.Number.decimal),
("literalNumberDecimalExplictBase", Literal.Number.decimalExplicitBase),
("literalNumberErrorBase", Literal.Number.errorBase),
("literalTextFormatLine", Literal.Text.formatLine),
("literalTextFormatInlineBlock", Literal.Text.formatInlineBlock),
("literalTextFormatBlock", Literal.Text.formatBlock),
("literalTextRawLine", Literal.Text.rawLine),
("literalTextRawInlineBlock", Literal.Text.rawInlineBlock),
("literalTextRawBlock", Literal.Text.rawBlock),
// Names
("nameLineOf", Name.lineOf),
("nameInvalidSuffix", Name.invalidSuffix),
// Operators
("operatorLineOf", Operator.lineOf),
("operatorDotCall", Operator.dotCall),
("operatorInvalidSuffix", Operator.invalidSuffix),
// Blocks
("blockTopLevel", Block.topLevel),
("blockNested", Block.nested),
("blockDeeplyNested", Block.deeplyNested),
// Comments
("commentLine", Comment.line),
("commentInLine", Comment.inLine),
("commentDoc", Comment.doc),
// Combined
("combinedSimple", Combined.simple),
("combinedComplex", Combined.complex)
)
// === Inputs ===
object Literal {
object Number {
val integer: String = preprocess("12345")
val integerExplicitBase: String = preprocess("16_a4fd31")
val decimal: String = preprocess("1.3141")
val decimalExplicitBase: String = preprocess("10_1.000999")
val errorBase: String = preprocess("10.2_2")
}
object Text {
val formatLine: String =
"'dearest creature in \\n creation studying english pronunciation'"
val formatInlineBlock: String =
"''' An inline block. It's a very good inline block carl \\u{AB}"
val formatBlock: String =
"""''' Here is my block of format text. I can `interpolate + things` like that.
| It goes on and on and on for `times` times because I feel like it.
|
| Complex interpolated expression `x -> y ~> x | y` woo!
|""".stripMargin
val rawLine: String =
"\"dearest creature in '''' creation studying english pronunciation\""
val rawInlineBlock: String =
"\"\"\" An inline block. It's a very good inline block carl "
val tQuote = "\"\"\""
val rawBlock: String =
s"""$tQuote Here is my block of raw text. `Interpolations` are nothing special here.
| It goes on and on and on for I can escape \" though.
|
| It also supports blank lines!
|""".stripMargin
}
}
object Name {
val lineOf: String =
"Referent_Ident var_ident JavaType _ @annotation ticked_ident' number_1"
val invalidSuffix: String = "some_var'iable some_varД"
}
object Operator {
val lineOf: String = "+ - * -> ~> <~ <- ! & | /"
val dotCall: String = ".== . != .<*> .*> .|>"
val invalidSuffix: String = ".... +=="
}
object Block {
val topLevel: String = "foo\nbar\nbaz"
val nested: String = "foo\nbar\n baz\n quux"
val deeplyNested: String =
"""foo
|bar
| baz
| quux
| bim
| bam
| oh
|no
|""".stripMargin
}
object Comment {
val line: String =
"# foo bar baz I have a really long line comment here that goes on and on"
val inLine: String = "a + b # A useless comment: add a to b"
val doc: String =
"""## I have a really big doc comment here
| That just keeps prattling on and on and on.
|
| With blank lines
|
| Forever
|
| and
| ever
|
| and
|
|
|
|
| ever
|documented
|""".stripMargin
}
object Combined {
val simple: String =
"""
|import Base.Meta
|
|## Decompose the value using runtime reflection and print its decomposition.
|Main.print_decomp a b =
| y = a + b
| decomp = Meta.decompose y
| Io.println decomp
|""".stripMargin
val complex: String =
"""import Base.Meta
|
|## Frobnicate the doodads by constructing a new type operator through runtime reflection such that
| it can be passed to another language.
|
| ! WARNING
| Type-checking code like this is virtually impossible, and it is treated as `Dynamic` inside
| Enso code.
|Main.foo a b =
| y = x -> z ->
| ty = a.gen_type (~>) (<-) b
| ty (z x)
| decomp = Meta.decompose (y a b)
| Io.println decomp
|
|## Execute the main function of this project.
|main =
| func = Meta.reify (here.foo "My_Name" "my_field")
| Io.println(func)
|""".stripMargin
}
}
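A minimal sketch (not part of the original commit) of driving these fixtures by hand, for example to sanity-check a benchmark input outside of JMH. It assumes only the definitions above; `LexerBenchFixturesDemo` is a hypothetical name.

package org.enso.syntax

object LexerBenchFixturesDemo {
  def main(args: Array[String]): Unit = {
    // Build a ~1KB input from the "blockTopLevel" fixture and lex it once.
    val input  = LexerBenchFixtures.replicate(LexerBenchFixtures.Block.topLevel, 1024)
    val result = LexerBenchFixtures.runLexer(input)
    println(s"Lexed ${input.length} characters: $result")
  }
}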