mirror of https://github.com/enso-org/enso.git
synced 2024-11-22 22:10:15 +03:00

Complete the implementation of the Enso lexer (#1177)
This commit is contained in:
parent 35efd8ea55
commit e5695e6f5d

.gitignore (vendored): 1 change
@@ -97,6 +97,7 @@ bench-report.xml
.editorconfig
.bloop/
.bsp/
project/metals.sbt

#################
## Build Cache ##
Cargo.toml: 25 changes
@@ -2,9 +2,6 @@

members = [
    "lib/rust/ast",
    "lib/rust/flexer",
    "lib/rust/flexer-testing/definition",
    "lib/rust/flexer-testing/generation",
    "lib/rust/launcher-shims",
    "lib/rust/lexer/definition",
    "lib/rust/lexer/generation",
@@ -15,16 +12,17 @@ members = [
# assumes you have `rust-lib` in the same directory as `enso`. See:
# https://github.com/enso-org/rust-lib/blob/main/docs/CONTRIBUTING.md#developing-in-conjunction-with-enso--ide
[patch.crates-io]
# enso-automata = { path = '../rust-lib/src/automata' }
# enso-data = { path = '../rust-lib/src/data' }
# enso-generics = { path = '../rust-lib/src/generics' }
# enso-lazy-reader = { path = '../rust-lib/src/lazy-reader' }
# enso-logger = { path = '../rust-lib/src/logger' }
# enso-macro-utils = { path = '../rust-lib/src/macro-utils' }
# enso-optics = { path = '../rust-lib/src/optics' }
# enso-prelude = { path = '../rust-lib/src/prelude' }
# enso-shapely = { path = '../rust-lib/src/shapely/impl' }
# enso-shapely-macros = { path = '../rust-lib/src/shapely/macros' }
#enso-automata = { path = '../rust-lib/src/automata' }
#enso-data = { path = '../rust-lib/src/data' }
#enso-flexer = { path = '../rust-lib/src/flexer' }
#enso-generics = { path = '../rust-lib/src/generics' }
#enso-lazy-reader = { path = '../rust-lib/src/lazy-reader' }
#enso-logger = { path = '../rust-lib/src/logger' }
#enso-macro-utils = { path = '../rust-lib/src/macro-utils' }
#enso-optics = { path = '../rust-lib/src/optics' }
#enso-prelude = { path = '../rust-lib/src/prelude' }
#enso-shapely = { path = '../rust-lib/src/shapely/impl' }
#enso-shapely-macros = { path = '../rust-lib/src/shapely/macros' }

[profile.dev]
opt-level = 0
@@ -36,7 +34,6 @@ debug-assertions = true
opt-level = 3
lto = true
debug = false
panic = 'abort'
debug-assertions = false

[profile.bench]
build.sbt: 39 changes
@@ -471,6 +471,45 @@ lazy val syntax = crossProject(JVMPlatform, JSPlatform)
    Compile / fullOptJS / artifactPath := file("target/scala-parser.js")
  )

lazy val `lexer-bench` =
  (project in file("lib/scala/syntax/specialization/lexer-bench"))
    .settings(
      commands += WithDebugCommand.withDebug,
      inConfig(Compile)(truffleRunOptionsSettings),
      inConfig(Benchmark)(Defaults.testSettings),
      parallelExecution in Test := false,
      logBuffered in Test := false,
      Test / fork := true,
      libraryDependencies ++= jmh
    )
    .configs(Test)
    .configs(Benchmark)
    .dependsOn(syntax.jvm)
    .dependsOn(flexer.jvm)
    .settings(
      javaOptions ++= Seq(
        "-Xms4096m",
        "-Xmx4096m",
        "-XX:+FlightRecorder",
      ),
      mainClass in Benchmark := Some("org.openjdk.jmh.Main"),
      bench := Def.task {
        (run in Benchmark).toTask("").value
      },
      benchOnly := Def.inputTaskDyn {
        import complete.Parsers.spaceDelimited
        val name = spaceDelimited("<name>").parsed match {
          case List(name) => name
          case _ =>
            throw new IllegalArgumentException("Expected one argument.")
        }
        Def.task {
          (testOnly in Benchmark).toTask(" -- -z " + name).value
        }
      }.evaluated,
      parallelExecution in Benchmark := false
    )

lazy val `parser-service` = (project in file("lib/scala/parser-service"))
  .dependsOn(syntax.jvm)
  .settings(
@@ -26,8 +26,6 @@ below:
  the implementation technologies for the parser.
- [**Parser Architecture:**](./architecture.md) An overview of the architecture
  of the parser as a whole.
- [**Flexer:**](./flexer.md) An overview of the design and architecture of the
  flexer, a generic, DFA-based lexing engine.
- [**Lexer:**](./lexer.md) The Enso lexer, responsible for tokenising the input
  stream of source code.
- [**Macro Resolution:**](./macro-resolution.md) The system for defining and
@@ -3,7 +3,7 @@ layout: developer-doc
title: AST
category: parser
tags: [parser, ast]
order: 9
order: 8
---

# AST
@@ -3,7 +3,7 @@ layout: developer-doc
title: Construct Resolution
category: parser
tags: [parser, construct, resolution]
order: 7
order: 6
---

# Construct Resolution
@@ -1,199 +0,0 @@
---
layout: developer-doc
title: Flexer
category: syntax
tags: [parser, flexer, lexer, dfa]
order: 3
---

# Flexer

The flexer is a finite-automata-based engine for the definition and generation
of lexers. Akin to `flex` and other lexer generators, the user may use it to
define a series of rules for lexing their language, which are then used by the
flexer to generate a highly-efficient lexer implementation.

Where the flexer differs from other programs in this space, however, is the
power that it gives users. When matching a rule, the flexer allows its users to
execute _arbitrary_ Rust code, which may even manipulate the lexer's state and
position. This means that the languages that can be lexed by the flexer extend
from the simplest regular grammars right up to unrestricted grammars (but please
don't write a programming language whose syntax falls into this category). It
also differs in that it chooses the first complete match for a rule, rather than
the longest one, which makes lexers much easier to define and maintain.

For detailed library documentation, please see the
[crate documentation](../../lib/rust/flexer/src/lib.rs) itself. This includes a
comprehensive tutorial on how to define a lexer using the flexer.
<!-- MarkdownTOC levels="2,3" autolink="true" -->

- [The Lexing Process](#the-lexing-process)
- [Lexing Rules](#lexing-rules)
  - [Groups](#groups)
  - [Patterns](#patterns)
  - [Transition Functions](#transition-functions)
- [Code Generation](#code-generation)
  - [Automated Code Generation](#automated-code-generation)
- [Structuring the Flexer Code](#structuring-the-flexer-code)
  - [Supporting Code Generation](#supporting-code-generation)

<!-- /MarkdownTOC -->
## The Lexing Process

In the flexer, the lexing process proceeds from the top to the bottom of the
user-defined rules, and selects the first expression that _matches fully_. Once
a pattern has been matched against the input, the associated code is executed
and the process starts again until the input stream has been consumed.

This point about _matching fully_ is particularly important to keep in mind, as
it differs from other lexer generators that tend to prefer the _longest_ match
instead.
## Lexing Rules

A lexing rule for the flexer is a combination of three things:

1. A group.
2. A pattern.
3. A transition function.

An example of defining a rule is as follows:

```rust
fn define() -> Self {
    let mut lexer = TestLexer::new();
    let a_word = Pattern::char('a').many1();
    let root_group_id = lexer.initial_state;
    let root_group = lexer.groups_mut().group_mut(root_group_id);
    // Here is the rule definition.
    root_group.create_rule(&a_word,"self.on_first_word(reader)");
    lexer
}
```
### Groups

A group is a mechanism that the flexer provides to allow grouping of rules
together. The flexer has a concept of a "state stack", which records the
currently active state, and which can be manipulated by the user-defined
[transition functions](#transition-functions).

A state can be made active by using `flexer::push_state(state)`, and can be
deactivated by using `flexer::pop_state(state)` or
`flexer::pop_states_until(state)`. In addition, states may also have _parents_,
from which they can inherit rules. This is fantastic for removing the need to
repeat yourself when defining the lexer.

When inheriting rules from a parent group, the rules from the parent group are
matched strictly _after_ the rules from the child group. This means that groups
are able to selectively "override" the rules of their parents. Rules are still
matched in order for each group's set of rules.
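
To make this concrete, the sketch below uses the `group::Registry` API that
appears in the test lexer later in this commit. The `IN_STRING` name is
illustrative, and it is assumed (from the `None` arguments seen in that lexer)
that the second argument to `define_group` names an optional parent group.

```rust
// A minimal sketch using the registry API from this commit. `None` means
// "no parent"; `Some(root)` is assumed to declare `root` the parent, letting
// the new group inherit (and selectively override) the root group's rules.
let mut registry = group::Registry::default();
let root         = registry.define_group("ROOT", None);
let in_string    = registry.define_group("IN_STRING", Some(root));

// Inside a transition function, the state stack can then be manipulated:
// self.push_state(in_string); // make IN_STRING the active group
// self.pop_state();           // return to the previous group
```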
### Patterns

Rules are defined to match _patterns_. Patterns are regular-grammar-like
descriptions of the textual content (as characters) that should be matched. For
a description of the various patterns provided by the flexer, see
[pattern.rs](../../lib/rust/flexer/src/automata/pattern.rs).

When a pattern is matched, the associated
[transition function](#transition-functions) is executed.
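
The patterns used by the test lexer in this commit give a flavour of the
available combinators (the `>>` operator sequences two patterns):

```rust
// Pattern combinators, as used by the test lexer in this commit.
let a_word      = Pattern::char('a').many1(); // one or more 'a' characters
let space       = Pattern::char(' ');         // a single space
let spaced_word = &space >> &a_word;          // a space followed by a word
let any         = Pattern::any();             // any single symbol
let end         = Pattern::eof();             // the end of the input
```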
### Transition Functions

The transition function is a piece of arbitrary Rust code that is executed when
the pattern for a given rule is matched by the flexer. This code may perform
arbitrary manipulations of the lexer state, and is where the majority of the
power of the flexer stems from.
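
For example, the following transition function, taken from the test lexer
removed later in this commit, records a token and then switches the active
state:

```rust
// From `flexer-testing/definition` in this commit: record the matched word as
// a token, then push the "seen first word" state onto the state stack.
fn on_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
    let str = self.current_match.clone();
    let ast = Token::Word(str);
    self.output.push(ast);
    let id = self.seen_first_word_state;
    self.push_state(id);
}
```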
## Code Generation

While it would be possible to interpret the flexer definition directly at
runtime, this would involve far too much dynamism and non-cache-local lookup
to be at all fast.

Instead, the flexer includes
[`generate.rs`](../../lib/rust/flexer/src/generate.rs), a library for generating
highly-specialized lexer implementations based on the definition provided by the
user. The transformation that it implements operates as follows for each group
of rules.

1. The set of rules in a group is used to generate a
   [Nondeterministic Finite Automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton)
   (NFA).
2. The NFA is transformed into a
   [Deterministic Finite Automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton)
   (DFA), using a variant of the standard
   [powerset construction](https://en.wikipedia.org/wiki/Powerset_construction)
   algorithm. This variant has been modified to ensure that the following
   additional properties hold:
   - Patterns are matched in the order in which they are defined.
   - The associated transition functions are maintained correctly through the
     transformation.
   - The lexing process is `O(n)`, where `n` is the size of the input.
3. The DFA is then used to generate the Rust code that implements that lexer.

The generated lexer contains a main loop that consumes the input stream
character-by-character, evaluating what is effectively a big `match` expression
that processes the input to evaluate the user-provided transition functions as
appropriate.
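
A self-contained sketch of the kind of loop this describes is shown below. All
names are illustrative, not the flexer's actual output: the real generated code
dispatches on the alphabet segmentation rather than on raw symbol indices.

```rust
/// Illustrative only: drive a DFA transition matrix over the input, firing the
/// callback attached to the last state whenever the matrix reports a dead end.
fn run(links: &[Vec<usize>], callbacks: &[Option<fn()>], input: &[usize]) {
    const INVALID: usize = usize::MAX; // stand-in for state::Identifier::INVALID
    let mut state = 0;
    for &symbol in input {
        let next = links[state][symbol];
        if next == INVALID {
            // A rule has been matched fully: execute its transition function.
            if let Some(Some(callback)) = callbacks.get(state) {
                callback();
            }
            state = 0; // restart matching from the root state
        } else {
            state = next;
        }
    }
}
```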
### Automated Code Generation

In order to avoid the lexer definition getting out of sync with its
implementation (the generated engine), it is necessary to create a separate
crate for the generated engine that has the lexer definition as one of its
dependencies.

This separation enables a call to `flexer::State::specialize()` in the crate's
`build.rs` (or a macro) during compilation. The output can be stored in a new
file, e.g. `engine.rs`, and exported from the library as needed. The project
structure would therefore appear as follows.

```
- lib/rust/lexer/
  - definition/
    - src/
      - lib.rs
    - cargo.toml

  - generation/
    - src/
      - engine.rs <-- the generated file
      - lib.rs    <-- `pub mod engine`
    - build.rs    <-- calls `flexer::State::specialize()` and saves its output
                      to `src/engine.rs`
    - cargo.toml  <-- lexer-definition is in dependencies and build-dependencies
```

With this design, `flexer.generate_specialized_code()` is going to be executed
on each rebuild of `lexer/generation`. Therefore, `generation` should contain
only the minimum amount of logic, and should endeavor to minimize any
unnecessary dependencies to avoid recompiling too often.
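
The `build.rs` removed later in this commit (under `flexer-testing/generation`)
shows this in practice; condensed, its core amounts to the following (the real
file also prepends the definition's source to the output):

```rust
// Condensed from the `build.rs` in this commit: define the lexer, specialize
// it, and write the generated engine into the generation crate's sources.
use flexer::Definition;
use flexer::State;
use flexer_test_definition::TestLexer;

fn main() -> std::io::Result<()> {
    let lexer  = TestLexer::define();
    let engine = lexer.specialize().unwrap();
    std::fs::write("src/generated/engine.rs", engine)
}
```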
## Structuring the Flexer Code

In order to unify the API between the definition and generated usages of the
flexer, the API is separated into the following components:

- `Flexer`: The main flexer definition itself, providing functionality common to
  the definition and implementation of all lexers.
- `flexer::State`: The stateful components of a lexer definition. This trait is
  implemented for a particular lexer definition, allowing the user to store
  arbitrary data in their lexer, as needed.
- **User-Defined Lexer:** The user can then define a lexer that _wraps_ the
  flexer, specialised to the particular `flexer::State` that the user has
  defined. It is recommended to implement `Deref` and `DerefMut` between the
  defined lexer and the `Flexer`, to allow for ease of use, as sketched below.
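
The test lexer in this commit follows this recommendation exactly:

```rust
// From `flexer-testing/definition`: the user-defined lexer wraps the Flexer,
// specialised to its own state, output, and logger types...
pub struct TestLexer {
    lexer: Flexer<TestState, TokenStream, Logger>,
}

// ...and dereferences to it so that the wrapped API is directly usable.
impl Deref for TestLexer {
    type Target = Flexer<TestState, TokenStream, Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}
```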
### Supporting Code Generation

This architecture separates out the generated code (which can be defined purely
on the user-defined lexer), from the code that is defined as part of the lexer
definition. This means that the same underlying structures can be used to both
_define_ the lexer, and be used by the generated code from that definition.

For an example of how these components are used in the generated lexer, please
see [`generated_api_test`](../../lib/rust/flexer/tests/generated_api_test.rs).
@@ -3,7 +3,7 @@ layout: developer-doc
title: JVM Object Generation
category: parser
tags: [parser, jvm, object-generation]
order: 10
order: 9
---

# JVM Object Generation
@@ -14,8 +14,6 @@ the compiler and runtime to work with the AST.

<!-- MarkdownTOC levels="2,3" autolink="true" -->

- [Overall Architecture](#overall-architecture)

<!-- /MarkdownTOC -->

# Overall Architecture
@@ -3,7 +3,7 @@ layout: developer-doc
title: Lexer
category: syntax
tags: [parser, lexer]
order: 4
order: 3
---

# Lexer
@@ -19,6 +19,9 @@ identify blocks

- [Libraries in the Lexer Definition](#libraries-in-the-lexer-definition)
- [Lexer Functionality](#lexer-functionality)
- [The Lexer AST](#the-lexer-ast)
- [Benchmarking the Lexer](#benchmarking-the-lexer)
  - [Running a Subset of the Benchmarks](#running-a-subset-of-the-benchmarks)
  - [Changing the Lexer](#changing-the-lexer)

<!-- /MarkdownTOC -->
@@ -43,12 +46,12 @@ paths directly from the crate root.

## Lexer Functionality

The lexer needs to provide the following functionality as part of the parser.
The lexer provides the following functionality as part of the parser.

- It consumes the source lazily, character by character, and produces a
  structured token stream consisting of the lexer [ast](#the-lexer-ast).
- It must succeed on _any_ input, even if there are invalid constructs in the
  token stream, represented by `Invalid` tokens.
- It succeeds on _any_ input, even if there are invalid constructs in the token
  stream, represented by `Invalid` tokens.

## The Lexer AST

@@ -69,15 +72,29 @@ It contains the following constructs:

- `Blank`: The blank name `_`.
- `Operator`: Operator identifiers (e.g. `-->>`).
- `Modifier`: Modifier operators (e.g. `+=`).
- `Annotation`: An annotation (e.g. `@Tail_Call`).
- `Number`: Numbers (`16_FFFF`).
- `DanglingBase`: An explicit base without an associated number (e.g. `16_`).
- `Text`: Text (e.g. `"Some text goes here."`).
- `TextLine`: A single-line text literal.
- `TextInlineBlock`: An inline block text literal.
- `TextBlock`: A text block literal.
- `InvalidQuote`: An invalid set of quotes for a text literal.
- `TextSegmentRaw`: A raw text segment in which the contents should be
  interpreted literally.
- `TextSegmentEscape`: A text segment containing an escape sequence.
- `TextSegmentInterpolate`: A text segment containing an arbitrary interpolated
  expression.
- `TextSegmentUnclosedInterpolate`: An unclosed interpolation text segment.
- `Line`: A line in a block that contains tokens.
- `BlankLine`: A line in a block that contains only whitespace.
- `Block`: Syntactic blocks in the language.
- `InvalidSuffix`: Invalid tokens when in a given state that would otherwise be
  valid.
- `Unrecognized`: Tokens that the lexer doesn't recognise.
- `DisableComment`: A standard comment that disables interpretation of the
  commented code (i.e. `#`).
- `DocComment`: A documentation comment (e.g. `##`). Documentation syntax is
  _not_ lexed by this lexer.

The distinction is made here between the various kinds of identifiers in order
to keep lexing fast, but also in order to allow macros to switch on the kinds of
@@ -87,3 +104,61 @@ identifiers.
>
> - Determine if we want to have separate ASTs for the lexer and the parser, or
>   not.

## Benchmarking the Lexer

As the lexer is the first port of call when getting an Enso program to run, it
needs to be quick. To that end, we insist on comprehensive benchmarks for any
change made to the lexer. The lexer benchmarks are written using
[criterion.rs](https://github.com/bheisler/criterion.rs), and include both
examples of whole program definitions and more specific benchmark examples.

**Baseline Commit:** TBC (use head of this branch for now).

The benchmarking process for the lexer is as follows:

1. Check out the current _baseline commit_, listed above.
2. In `lexer_bench_sources.rs` change the line that reads `.retain_baseline` to
   instead read `.save_baseline`. This will save the current baseline (taken on
   your machine).
3. Run the benchmarks using `cargo bench`. Please note that running these
   benchmarks takes approximately two hours, so sufficient time should be
   allotted.
4. Once the baseline run has completed, change the above-mentioned line back to
   `.retain_baseline`. This will disable overwriting the saved baseline, and
   the suite will perform its regression reporting against it.
5. Make your changes.
6. Run the benchmark suite again. It will report any performance regressions in
   the benchmark report, measured against your saved baseline.

Unfortunately, the use of time-based benchmarks means that we can't commit the
baseline to the repository. There is far too much variance between machines for
this to be useful.
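
For orientation, the baseline toggle described in steps 2 and 4 is a single
call on the criterion configuration. A hedged sketch follows; the real
`bench_config` lives in `lexer_bench_sources.rs`, and the baseline name here is
illustrative.

```rust
use criterion::Criterion;

// Sketch of the toggle described above: `save_baseline` records a fresh
// baseline, while `retain_baseline` keeps the saved one and reports
// regressions against it.
fn bench_config() -> Criterion {
    Criterion::default()
        .save_baseline("lexer-baseline".into())
        // .retain_baseline("lexer-baseline".into())
}
```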
### Running a Subset of the Benchmarks

The benchmarks are very comprehensive, running a wide range of program text
through the lexer while replicating it out to various sizes (see
`lexer_bench_sources.rs` for the full list). However, in order to decrease
iteration time it can be useful to run a subset of these.

There are two main tuning points for this:

1. The _sizes_ of inputs being executed on.
2. The benchmarks being executed.

The sizes can be tuned by editing the `SIZES` array in the
`lexer_bench_sources.rs` file. The benchmarks themselves are best tuned by
changing the macro definitions in `lexer_time_bench.rs` to exclude certain
benchmarks or groups of benchmarks.

While it is _possible_ to tune the benchmarking config (`bench_config` in
`lexer_bench_sources.rs`) to decrease benchmarking time, this is not
recommended. The current settings are tuned to provide reliable results.

### Changing the Lexer

When changing the lexer the _full_ benchmark suite must be run against the
current baseline before the changes can be merged. This suite run must use the
provided settings for the benchmarking library, and should be performed using
the process described above.
@@ -3,7 +3,7 @@ layout: developer-doc
title: Macro Resolution
category: parser
tags: [parser, macro, resolution]
order: 5
order: 4
---

# Macro Resolution

@@ -3,7 +3,7 @@ layout: developer-doc
title: Operator Resolution
category: parser
tags: [parser, operator, resolution]
order: 6
order: 5
---

# Operator Resolution

@@ -3,7 +3,7 @@ layout: developer-doc
title: Parser Driver
category: parser
tags: [parser, driver]
order: 8
order: 7
---

# Parser Driver
@@ -3,7 +3,7 @@ layout: developer-doc
title: Reading Source Code
category: parser
tags: [parser, reader]
order: 11
order: 10
---

# Reading Source Code
@@ -15,9 +15,14 @@ project is going to use, as well as backing formats for the stream.
<!-- MarkdownTOC levels="2,3" autolink="true" -->

- [Reader Functionality](#reader-functionality)
- [Provided Readers](#provided-readers)
  - [UTF-8 Reader](#utf-8-reader)
  - [UTF-16 Reader](#utf-16-reader)
- [Reader Structure](#reader-structure)
  - [Read](#read)
  - [Decoder](#decoder)
- [Provided Encodings](#provided-encodings)
  - [UTF-8](#utf-8)
  - [UTF-16](#utf-16)
  - [UTF-32](#utf-32)
- [Benchmarks](#benchmarks)

<!-- /MarkdownTOC -->
@@ -23,6 +23,7 @@ Enso supports a variety of types of comments:
<!-- MarkdownTOC levels="2,3" autolink="true" -->

- [Disable Comments](#disable-comments)
- [Freeze Comments](#freeze-comments)
- [Documentation Comments](#documentation-comments)
  - [Tags](#tags)
  - [Sections](#sections)
@@ -39,13 +40,35 @@ Disable comments are the standard form of comment seen in a programming language
in that they prevent a given piece of code from executing. In Enso, they are
created by prefixing the expression to disable with the `#` character.

These aren't _exactly_ like most languages' disable comments, however:
Disable comments in Enso do not have their contents validated, and continue from
the `#` character to the end of the line.

- When you disable a line in Enso, it is still run through the parser to see if
  it is syntactically valid.
- No identifiers in it are resolved, however.
- This is important as it allows the disabled expression to still be displayed
  in the visual syntax.

```ruby
x = y + z # here is some commented text
```

Disable comments are _not_ allowed inside textual interpolations.
## Freeze Comments

Freeze comments are a special type of comment used to enable the 'freezing' or
caching of expensive computations in Enso. When used, they cache the result of
an expression, reusing the value instead of recomputing it even if the
underlying data changes.

A portion of code that is frozen has the following properties:

- It is still lexed as if it were code, and validated by the parser to check for
  validity.
- No identifier resolution takes place.

These are very important as they still allow the frozen expression to be
displayed properly in the visual syntax.

> The actionables for this section are:
>
> - Work out what they should look like visually.
> - Work out how best to implement this.
## Documentation Comments

@@ -66,6 +89,8 @@ for more information). By way of example:
    until I unindent again.
```

Documentation comments are _not_ allowed inside textual interpolations.

The tool that generates this documentation aims to be fairly robust, and tries
to produce sensible results even if the user makes a mistake. Such mistakes
will be highlighted to the user.
@@ -17,6 +17,8 @@ types in literal form in the source code.
- [Text Literals](#text-literals)
  - [Inline Text Literals](#inline-text-literals)
  - [Text Block Literals](#text-block-literals)
  - [Inline Block Literals](#inline-block-literals)
  - [Escape Sequences](#escape-sequences)
- [Vector Literals](#vector-literals)

<!-- /MarkdownTOC -->
@@ -65,7 +67,7 @@ Enso provides rich support for textual literals in the language, supporting both
raw and interpolated strings natively.

- **Raw Strings:** Raw strings are delimited using the standard double-quote
  character (`"`). Raw strings have support for escape sequences.
  character (`"`). Raw strings don't support escape sequences except for `\"`.

  ```ruby
  raw_string = "Hello, world!"
@@ -75,7 +77,8 @@ raw and interpolated strings natively.
  executable Enso expressions into the string. Such strings are delimited using
  the single-quote (`'`) character, and splices are delimited using the backtick
  (`` ` ``) character. Splices are run, and then the result is converted to a
  string using `show`. These strings also have support for escape sequences.
  string using `show`. These strings also have support for all kinds of
  [escape sequences](#escape-sequences).

  ```ruby
  fmt_string = 'Hello, my age is `time.now.year - person.birthday.year`'
@@ -104,7 +107,7 @@ following layout rules:
- Any indentation further than this baseline will be retained as part of the
  text literal.
- The literal is _closed_ by the first line with a _lower_ level of indentation
  than the first child lineand will not contain the final blank line.
  than the first child line and will not contain the final blank line.

```
block_raw = '''
@@ -116,6 +119,48 @@ block_raw = '''
not_string_expr = foo bar
```
### Inline Block Literals

In order to easily transition between using text blocks and single-line
literals, we allow for defining an inline block literal. This is a literal that
uses the same start delimiter as a block literal (see above), but rather than
ending the literal through de-indenting from the block's level of indentation,
the literal ends at the end of the line.

```
inline_block =
    """this is all part of the literal
but_this_is_not
```
### Escape Sequences

Format literals in Enso support many kinds of escape sequence. These are
described below.

| Name         | Escape Sequence | Unicode    | Notes                                                                                      |
| :----------- | :-------------: | :--------: | :----------------------------------------------------------------------------------------- |
| Byte Escape  | `\x##`          | `U+00##`   | 8-bit character specification.                                                             |
| U16 Escape   | `\u####`        | `U+####`   | 16-bit unicode character, where each `#` is a hex digit.                                   |
| U21 Escape   | `\u{######}`    | `U+######` | 21-bit unicode character, where `######` is 1-6 hex digits.                                |
| U32 Escape   | `\U########`    | `U+######` | 32-bit unicode character, where each `#` is a hex digit and the first two bytes are `00`.  |
| Null         | `\0`            | `U+0000`   | The null character.                                                                        |
| Alert        | `\a`            | `U+0007`   | The bell/alert character.                                                                  |
| Backspace    | `\b`            | `U+0008`   | The backspace character.                                                                   |
| Form Feed    | `\f`            | `U+000C`   | The form-feed character.                                                                   |
| LF           | `\n`            | `U+000A`   | The line-feed character (newline on unix systems).                                         |
| CR           | `\r`            | `U+000D`   | The carriage return character (part of newline on windows systems).                        |
| Tab          | `\t`            | `U+0009`   | The horizontal tab character.                                                              |
| Vertical Tab | `\v`            | `U+000B`   | The vertical tab character.                                                                |
| Backslash    | `\\`            | `U+005C`   | A literal backslash character.                                                             |
| Double Quote | `\"`            | `U+0022`   | A literal double quote character.                                                          |
| Single Quote | `\'`            | `U+0027`   | A literal single quote character.                                                          |
| Backtick     | `` \` ``        | `U+0060`   | A literal backtick character.                                                              |

The only one of the above escape sequences that is supported in a raw text
literal is `\"`. All other occurrences of `\` in such literals are treated as a
literal backslash.
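
As a rough illustration of the behaviour this table implies, a recogniser for
the single-character escapes might look as follows (an editor's sketch, not the
Enso lexer's actual code):

```rust
/// Maps the single-character escape codes from the table above to their
/// character values. The numeric escapes (`\x`, `\u`, `\U`) need separate,
/// multi-character handling.
fn simple_escape(code: char) -> Option<char> {
    Some(match code {
        '0' => '\0',        // null
        'a' => '\u{0007}',  // alert / bell
        'b' => '\u{0008}',  // backspace
        'f' => '\u{000C}',  // form feed
        'n' => '\n',        // line feed
        'r' => '\r',        // carriage return
        't' => '\t',        // horizontal tab
        'v' => '\u{000B}',  // vertical tab
        '\\' | '"' | '\'' | '`' => code, // literal punctuation escapes
        _ => return None,   // not a simple escape
    })
}
```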
## Vector Literals

Enso also supports vector literals, which allow users to create literal vectors
@@ -28,6 +28,7 @@ provide their users with access to the compilation and type-checking phases
<!-- MarkdownTOC levels="2,3" autolink="true" -->

- [Annotations](#annotations)
  - [Annotation Naming](#annotation-naming)
- [Automatic Deriving](#automatic-deriving)

<!-- /MarkdownTOC -->
@@ -66,6 +67,13 @@ that we are able to reserve words such as `type` to ensure that users can always
have a good sense of what the most common constructs in the language mean,
rather than allowing them to be overridden outside of the stdlib.

### Annotation Naming

The naming of annotations follows the standard rules that Enso uses for naming
its [identifiers](./naming.md#naming-constructs). This means that they can be in
either referent or variable form, as the annotation head is _not_ a
[pattern context](./naming.md#pattern-contexts).

## Automatic Deriving

In order to make the language easier to debug, we have all types automatically
@@ -1,15 +0,0 @@
[package]
name = "flexer-test-definition"
version = "0.1.0"
authors = ["Enso Team <enso-dev@enso.org>"]
edition = "2018"

publish = false

[lib]
crate-type = ["cdylib", "rlib"]
test = true
bench = true

[dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }
@@ -1,282 +0,0 @@
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains the code defining a lexer for the following small language. Due to the way in
//! which the code-generation from the flexer is used, it has to be defined in a separate crate from
//! the site at which it's used. For the actual tests of this code, please see
//! `flexer-testing/generation`.
//!
//! The language here is being defined as follows:
//!
//! a-word      = 'a'+;
//! b-word      = 'b'+;
//! word        = a-word | b-word;
//! space       = ' ';
//! spaced-word = space, word;
//! language    = word, spaced-word*;
//!
//! Please note that there is a fair amount of duplicated code between this test and the
//! `lexer_generated_api_test` file. This is to present the full view of what each portion of the
//! process looks like.

use flexer::prelude::*;

use flexer::*;
use flexer;
use flexer::automata::pattern::Pattern;
use flexer::group::Registry;
use flexer::prelude::logger::Disabled;
use flexer::prelude::reader::BookmarkManager;



// ====================
// === Type Aliases ===
// ====================

type Logger = Disabled;



// ===========
// === AST ===
// ===========

/// A very simple AST, sufficient for the simple language being defined.
#[derive(Clone,Debug,PartialEq)]
pub enum Token {
    /// A word from the input, consisting of a sequence of all `a` or all `b`.
    Word(String),
    /// A token that the lexer is unable to recognise.
    Unrecognized(String),
}

impl Token {
    /// Construct a new word token.
    pub fn word(name:impl Into<String>) -> Token {
        Token::Word(name.into())
    }

    /// Construct a new unrecognized token.
    pub fn unrecognized(name:impl Into<String>) -> Token {
        Token::Unrecognized(name.into())
    }
}

/// A representation of a stream of tokens.
#[allow(missing_docs)]
#[derive(Clone,Debug,Default,PartialEq)]
pub struct TokenStream {
    tokens:Vec<Token>
}

impl TokenStream {
    /// Append the provided token to the token stream.
    pub fn push(&mut self,token:Token) {
        self.tokens.push(token);
    }
}


// === Trait Impls ===

impl From<Vec<Token>> for TokenStream {
    fn from(tokens: Vec<Token>) -> Self {
        TokenStream {tokens}
    }
}


// ==================
// === Test Lexer ===
// ==================

/// The definition of a test lexer for the above-described language.
#[derive(Debug)]
pub struct TestLexer {
    lexer:Flexer<TestState,TokenStream,Logger>
}

impl Deref for TestLexer {
    type Target = Flexer<TestState,TokenStream,Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}

impl DerefMut for TestLexer {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.lexer
    }
}

impl TestLexer {
    /// Creates a new instance of this lexer.
    pub fn new() -> Self {
        let logger = Logger::new("TestLexer");
        let lexer  = Flexer::new(logger);
        TestLexer{lexer}
    }
}

/// Rules for the root state.
#[allow(dead_code,missing_docs)]
impl TestLexer {
    fn on_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
        let str = self.current_match.clone();
        let ast = Token::Word(str);
        self.output.push(ast);
        let id = self.seen_first_word_state;
        self.push_state(id);
    }

    fn on_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) {
        let ast = Token::Unrecognized(self.current_match.clone());
        self.output.push(ast);
    }

    fn on_no_err_suffix_first_word<R:LazyReader>(&mut self, _reader:&mut R) {}

    fn rules_in_root(lexer:&mut TestLexer) {
        let a_word = Pattern::char('a').many1();
        let b_word = Pattern::char('b').many1();
        let any    = Pattern::any();
        let end    = Pattern::eof();

        let root_group_id = lexer.initial_state;
        let root_group    = lexer.groups_mut().group_mut(root_group_id);

        root_group.create_rule(&a_word,"self.on_first_word(reader)");
        root_group.create_rule(&b_word,"self.on_first_word(reader)");
        root_group.create_rule(&end,   "self.on_no_err_suffix_first_word(reader)");
        root_group.create_rule(&any,   "self.on_err_suffix_first_word(reader)");
    }
}

/// Rules for the "seen first word" state.
#[allow(dead_code,missing_docs)]
impl TestLexer {
    fn on_spaced_word<R:LazyReader>(&mut self, _reader:&mut R) {
        let str = self.current_match.clone();
        let ast = Token::Word(String::from(str.trim()));
        self.output.push(ast);
    }

    fn on_err_suffix<R:LazyReader>(&mut self, reader:&mut R) {
        self.on_err_suffix_first_word(reader);
        self.pop_state();
    }

    fn on_no_err_suffix<R:LazyReader>(&mut self, reader:&mut R) {
        self.on_no_err_suffix_first_word(reader);
        self.pop_state();
    }

    fn rules_in_seen_first_word(lexer:&mut TestLexer) {
        let a_word        = Pattern::char('a').many1();
        let b_word        = Pattern::char('b').many1();
        let space         = Pattern::char(' ');
        let spaced_a_word = &space >> &a_word;
        let spaced_b_word = &space >> &b_word;
        let any           = Pattern::any();
        let end           = Pattern::eof();

        let seen_first_word_group_id = lexer.seen_first_word_state;
        let seen_first_word_group    = lexer.groups_mut().group_mut(seen_first_word_group_id);

        seen_first_word_group.create_rule(&spaced_a_word,"self.on_spaced_word(reader)");
        seen_first_word_group.create_rule(&spaced_b_word,"self.on_spaced_word(reader)");
        seen_first_word_group.create_rule(&end,          "self.on_no_err_suffix(reader)");
        seen_first_word_group.create_rule(&any,          "self.on_err_suffix(reader)");
    }
}


// === Trait Impls ===

impl flexer::Definition for TestLexer {
    fn define() -> Self {
        let mut lexer = TestLexer::new();

        TestLexer::rules_in_seen_first_word(&mut lexer);
        TestLexer::rules_in_root(&mut lexer);

        lexer
    }

    fn groups(&self) -> &Registry {
        self.lexer.groups()
    }

    fn set_up(&mut self) {}

    fn tear_down(&mut self) {}
}

impl Default for TestLexer {
    fn default() -> Self {
        TestLexer::new()
    }
}



// ===================
// === Lexer State ===
// ===================

/// The stateful components of the test lexer.
#[derive(Debug)]
pub struct TestState {
    /// The registry for groups in the lexer.
    lexer_states:group::Registry,
    /// The initial state of the lexer.
    initial_state:group::Identifier,
    /// The state entered when the first word has been seen.
    seen_first_word_state:group::Identifier,
    /// The bookmarks for this lexer.
    bookmarks:BookmarkManager
}


// === Trait Impls ===

impl flexer::State for TestState {
    fn new(_logger:&impl AnyLogger) -> Self {
        let mut lexer_states      = group::Registry::default();
        let initial_state         = lexer_states.define_group("ROOT",None);
        let seen_first_word_state = lexer_states.define_group("SEEN FIRST WORD",None);
        let bookmarks             = BookmarkManager::new();
        Self{lexer_states,initial_state,seen_first_word_state,bookmarks}
    }

    fn initial_state(&self) -> group::Identifier {
        self.initial_state
    }

    fn groups(&self) -> &group::Registry {
        &self.lexer_states
    }

    fn groups_mut(&mut self) -> &mut group::Registry {
        &mut self.lexer_states
    }

    fn bookmarks(&self) -> &BookmarkManager {
        &self.bookmarks
    }

    fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
        &mut self.bookmarks
    }

    fn specialize(&self) -> Result<String,GenError> {
        generate::specialize(self,"TestLexer","TokenStream")
    }
}
@@ -1,20 +0,0 @@
[package]
name = "flexer-test-generation"
version = "0.1.0"
authors = ["Enso Team <enso-dev@enso.org>"]
edition = "2018"

publish = false

[lib]
crate-type = ["cdylib", "rlib"]
test = true
bench = true

[dependencies]
flexer = { path = "../../flexer" , version = "0.1.0" }
flexer-test-definition = { path = "../definition", version = "0.1.0" }

[build-dependencies]
flexer = { path = "../../flexer" , version = "0.1.0" }
flexer-test-definition = { path = "../definition", version = "0.1.0" }
@@ -1,32 +0,0 @@
use std::fs::File;
use std::io::prelude::*;
use flexer_test_definition::TestLexer;
use flexer::Definition;
use flexer::State;



/// Generates the lexer engine and saves the result into the file `src/engine.rs`.
///
/// The content of the generated file can be used with the `include!` macro.
fn generate_engine() -> std::io::Result<()> {
    let definition_path  = "../definition/src/lib.rs";
    let output_directory = "src/generated";
    let _                = std::fs::create_dir(output_directory);
    let output_path      = "src/generated/engine.rs";
    let definition_error = format!("The lexer definition should exist at {}.",definition_path);
    let output_error     = format!("Cannot open output file at {}.",output_path);
    let mut lexer_def    = File::open(definition_path).expect(definition_error.as_str());
    let mut contents     = String::new();
    let mut file         = File::create(output_path).expect(output_error.as_str());
    let lexer            = TestLexer::define();
    let engine           = lexer.specialize().unwrap();
    lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition.");
    file.write_all(contents.as_bytes()).expect("Unable to write lexer definition.");
    file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization.");
    Ok(())
}

fn main() -> std::io::Result<()> {
    generate_engine()
}
@@ -1,3 +0,0 @@
//! This module serves to re-export the generated lexer.

pub mod engine;
@@ -1,19 +0,0 @@
//! This library exposes the specialized version of the Enso lexer.
//!
//! Its sole purpose is to avoid the lexer definition getting out of sync with its implementation
//! (the generated engine), which requires the engine to live in a separate crate.
//!
//! This separation enables generation of the enso lexer source code with `build.rs` during
//! compilation. Its output is then stored in a new file `engine.rs` and exported by `lexer.rs`.

#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

pub mod generated;
@@ -1,110 +0,0 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for the generated lexer.

use flexer::prelude::*;

use flexer::prelude::reader::decoder::DecoderUTF8;
use flexer_test_generation::generated::engine::TestLexer;
use flexer_test_generation::generated::engine::Token;
use flexer_test_generation::generated::engine::TokenStream;



// =============
// === Tests ===
// =============

/// Executes the test on the provided input string slice.
fn run_test_on(str:impl AsRef<str>) -> TokenStream {
    // Hardcoded for ease of use here.
    let reader     = Reader::new(str.as_ref().as_bytes(), DecoderUTF8());
    let mut lexer  = TestLexer::new();
    let run_result = lexer.run(reader);

    match run_result.kind {
        flexer::ResultKind::Success => run_result.tokens,
        _ => default()
    }
}
#[test]
fn test_single_a_word() {
    let input = "aaaaa";
    let expected_output = TokenStream::from(vec![Token::word(input)]);
    let result = run_test_on(input);
    assert_eq!(result, expected_output);
}

#[test]
fn test_single_b_word() {
    let input = "bbbbb";
    let expected_output = TokenStream::from(vec![Token::word(input)]);
    let result = run_test_on(input);
    assert_eq!(result, expected_output);
}

#[test]
fn test_two_word() {
    let input = "aaaaa bbbbb";
    let expected_output = TokenStream::from(vec![Token::word("aaaaa"), Token::word("bbbbb")]);
    let result = run_test_on(input);
    assert_eq!(result, expected_output);
}

#[test]
fn test_multi_word() {
    let input = "bbb aa a b bbbbb aa";
    let expected_output = TokenStream::from(vec![
        Token::word("bbb"),
        Token::word("aa"),
        Token::word("a"),
        Token::word("b"),
        Token::word("bbbbb"),
        Token::word("aa")
    ]);
    let result = run_test_on(input);
    assert_eq!(result, expected_output);
}

#[test]
fn test_invalid_single_word() {
    let input = "c";
    let expected_output = TokenStream::from(vec![Token::unrecognized(input)]);
    let result = run_test_on(input);
    assert_eq!(result, expected_output);
}

#[test]
fn test_multi_word_invalid() {
    let input = "aaaaaa c bbbbbb";
    let expected_output = TokenStream::from(vec![
        Token::word("aaaaaa"),
        Token::unrecognized(" "),
        Token::unrecognized("c"),
        Token::unrecognized(" "),
        Token::word("bbbbbb"),
    ]);
    let result = run_test_on(input);
    assert_eq!(result, expected_output);
}

#[test]
fn test_end_invalid() {
    let input = "bbbbbb c";
    let expected_output = TokenStream::from(vec![
        Token::word("bbbbbb"),
        Token::unrecognized(" "),
        Token::unrecognized("c"),
    ]);
    let result = run_test_on(input);
    assert_eq!(result, expected_output);
}
@@ -1,39 +0,0 @@
[package]
name = "flexer"
version = "0.1.0"
authors = ["Enso Team <enso-dev@enso.org>"]
edition = "2018"

description = "A finite-automata-based lexing engine."
readme = "README.md"
homepage = "https://github.com/enso-org/enso/lib/rust/flexer"
repository = "https://github.com/enso-org/enso"
license-file = "../../../LICENSE"

keywords = ["lexer", "finite-automata"]
categories = ["parsing"]

publish = false

[lib]
name = "flexer"
crate-type = ["cdylib", "rlib"]
test = true
bench = true

[dependencies]
enso-logger = { version = "0.1.1" }
enso-prelude = { version = "0.1.3" }
enso-lazy-reader = { version = "= 0.1.0" }
enso-macro-utils = { version = "0.1.1" }

itertools = "0.8"
proc-macro2 = "1.0.19"
nonempty = "0.1.5"
quote = "1.0"
syn = { version = "1.0.12", features = ["full", "extra-traits", "visit-mut", "visit", "parsing", "printing"] }
unicode-segmentation = "1.6.0"
wasm-bindgen = "0.2"

[dev-dependencies]
wasm-bindgen-test = "0.2"
@@ -1,4 +0,0 @@
# Flexer

This library provides a finite-automata-based lexing engine that can flexibly
tokenize an input stream.
@@ -1,9 +0,0 @@
//! Provides an API for the construction of finite state automata, in both their deterministic and
//! non-deterministic forms.

pub mod alphabet;
pub mod dfa;
pub mod nfa;
pub mod pattern;
pub mod state;
pub mod symbol;
@@ -1,130 +0,0 @@
//! Exports an alphabet for an arbitrary finite state automaton.

use crate::prelude::*;

use crate::automata::symbol::Symbol;

use std::collections::BTreeSet;
use std::ops::RangeInclusive;



// ====================
// === Segmentation ===
// ====================

/// A representation of the distinct intervals over the input alphabet for a given finite state
/// automaton.
///
/// These intervals are defined by a set of _divisions_ of the input alphabet, where each division
/// is represented as a point in that alphabet. This is necessary to allow for efficient encoding of
/// state transitions that trigger not just on _one_, but potentially on _many_ of the input
/// symbols in the automaton's alphabet.
///
/// This is best explained by way of example. Consider the original unbounded alphabet:
///
/// ```text
///  ... a b c d e f g h ... z ...
/// ```
///
/// We want to add a rule that matches on the interval `[b, d]`. This results in there being three
/// intervals on the alphabet, as there are two divisions (annotated below):
///
/// ```text
///  ... a | b c d | e f g h ... z ...
/// div:   1       2
/// seg: 1     2        3
/// ```
///
/// If we then add a rule that matches on the interval `[d, f]`, we end up with five intervals on
/// the alphabet, with four divisions (annotated below):
///
/// ```text
///  ... a | b c | d | e f | g h ... z ...
/// div:   1     2   3     4
/// seg: 1    2    3    4      5
/// ```
///
/// This type tracks these divisions explicitly for an input alphabet defined for all automata in
/// this library as `0u32..=u32::max_value()`.
#[derive(Clone,Debug,PartialEq,Eq)]
#[allow(missing_docs)]
pub struct Segmentation {
    pub divisions:BTreeSet<Symbol>
}

impl Segmentation {
    /// Inserts a range of symbols into the alphabet.
    pub fn insert(&mut self, range:RangeInclusive<Symbol>) {
        self.divisions.insert(Symbol::from(range.start()));
        if range.end().value != Symbol::EOF_CODE.value {
            self.divisions.insert(Symbol{value:range.end().value + 1});
        }
    }

    /// Creates a [`Segmentation`] from an input set of divisions.
    pub fn from_divisions(divisions:&[u32]) -> Self {
        let mut dict = Self::default();
        for val in divisions {
            dict.divisions.insert(Symbol::from(*val));
        }
        dict
    }

    /// Obtains the divisions in the alphabet segmentation as a vector.
    pub fn divisions_as_vec(&self) -> Vec<Division> {
        self.divisions.iter().copied().enumerate().map(From::from).collect()
    }
}
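
// Editor's note: an illustrative use of the API above (not part of the
// original file). Inserting the range 'b'..='d' records a division at 'b' and
// one just past 'd', reproducing the first example in the doc comment:
//
//     let mut seg = Segmentation::default();
//     seg.insert(Symbol::from(98u32)..=Symbol::from(100u32)); // 'b'..='d'
//     assert_eq!(seg.divisions_as_vec().len(), 3);            // 0, 'b', 'e'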


// === Trait Impls ===

impl Default for Segmentation {
    fn default() -> Self {
        let mut divisions: BTreeSet<Symbol> = default();
        // The existence of the default (0) member in the set is assumed by the implementation of
        // the NFA -> DFA conversion.
        divisions.insert(default());
        Segmentation{divisions}
    }
}



// ================
// === Division ===
// ================

/// A division of the alphabet used by the lexer.
#[derive(Copy,Clone,Debug,PartialEq,Eq)]
pub struct Division {
    /// The position of the division.
    pub position : usize,
    /// The symbol at which it divides the alphabet.
    pub symbol : Symbol,
}

impl Division {
    /// Create a new division.
    pub fn new(position:usize, symbol:Symbol) -> Division {
        Division{position,symbol}
    }
}


// === Trait Impls ===

impl Into<(usize,Symbol)> for Division {
    fn into(self) -> (usize, Symbol) {
        (self.position,self.symbol)
    }
}

impl From<(usize,Symbol)> for Division {
    fn from((position, symbol): (usize, Symbol)) -> Self {
        Division::new(position,symbol)
    }
}
@@ -1,178 +0,0 @@
//! The structure for defining deterministic finite automata.

use crate::automata::alphabet;
use crate::automata::state;
use crate::data::matrix::Matrix;



// =====================================
// === Deterministic Finite Automata ===
// =====================================

/// The definition of a [DFA](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) for a
/// given set of symbols, states, and transitions.
///
/// A DFA is a finite state automaton that accepts or rejects a given sequence of symbols by
/// executing on a sequence of states _uniquely_ determined by the sequence of input symbols.
///
/// ```text
///  ┌───┐  'D'  ┌───┐  'F'  ┌───┐  'A'  ┌───┐
///  │ 0 │ ----> │ 1 │ ----> │ 2 │ ----> │ 3 │
///  └───┘       └───┘       └───┘       └───┘
/// ```
#[derive(Clone,Debug,Default,Eq,PartialEq)]
pub struct DFA {
    /// A set of disjoint intervals over the allowable input alphabet.
    pub alphabet_segmentation:alphabet::Segmentation,
    /// The transition matrix for the DFA.
    ///
    /// It represents a function of type `(state, symbol) -> state`, returning the identifier for
    /// the new state.
    ///
    /// For example, the transition matrix for an automaton that accepts the language
    /// `{"A" | "B"}*` would appear as follows, with `-` denoting
    /// [the invalid state](state::Identifier::INVALID). The leftmost column encodes the input
    /// state, while the topmost row encodes the input symbols.
    ///
    /// |   | A | B |
    /// |:-:|:-:|:-:|
    /// | 0 | 1 | - |
    /// | 1 | - | 0 |
    ///
    pub links:Matrix<state::Identifier>,
    /// A collection of callbacks for each state (indexable in order).
    pub callbacks:Vec<Option<RuleExecutable>>,
}

impl DFA {
    /// Check whether the DFA has a rule for the target state.
    ///
    /// This method should only be used in generated code, where its invariants are already checked.
    ///
    /// # Panics
    ///
    /// If no callback exists for `target_state`.
    pub fn has_rule_for(&self, target_state:state::Identifier) -> bool {
        self.callbacks.get(target_state.id).unwrap().is_some()
    }
}


// === Trait Impls ===

impl From<Vec<Vec<usize>>> for Matrix<state::Identifier> {
    fn from(input:Vec<Vec<usize>>) -> Self {
        let rows       = input.len();
        let columns    = if rows == 0 {0} else {input[0].len()};
        let mut matrix = Self::new(rows,columns);
        for row in 0..rows {
            for column in 0..columns {
                matrix[(row,column)] = state::Identifier::from(input[row][column]);
            }
        }
        matrix
    }
}
// ================
// === Callback ===
// ================

/// The callback associated with an arbitrary state of a finite automaton.
///
/// It contains the Rust code that is intended to be executed after encountering a
/// [`pattern`](super::pattern::Pattern) that causes the associated state transition. This pattern
/// is declared in [`Rule.pattern`](crate::group::rule::Rule::pattern).
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct RuleExecutable {
    /// A description of the priority with which the callback is constructed during codegen.
    pub priority:usize,
    /// The Rust code that will be executed when running this callback.
    pub code:String,
}

impl RuleExecutable {
    /// Creates a new rule executable with the provided `priority` and `code`.
    pub fn new(priority:usize, code_str:impl Into<String>) -> RuleExecutable {
        let code = code_str.into();
        RuleExecutable{priority,code}
    }
}



// =============
// === Tests ===
// =============

#[cfg(test)]
pub mod tests {
    use crate::automata::state;

    use super::*;

    const INVALID:usize = state::Identifier::INVALID.id;

    /// A DFA that accepts the newline character '\n'.
    pub fn newline() -> DFA {
        DFA {
            alphabet_segmentation:alphabet::Segmentation::from_divisions(&[10,11]),
            links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
            callbacks:vec![
                None,
                Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}),
            ],
        }
    }

    /// A DFA that accepts any letter a..=z.
    pub fn letter() -> DFA {
        DFA {
            alphabet_segmentation:alphabet::Segmentation::from_divisions(&[97,123]),
            links:Matrix::from(vec![vec![INVALID,1,INVALID], vec![INVALID,INVALID,INVALID]]),
            callbacks:vec![
                None,
                Some(RuleExecutable {priority:2, code:"group_0_rule_0".into()}),
            ],
        }
    }

    /// A DFA that accepts any number of spaces ' '.
    pub fn spaces() -> DFA {
        DFA {
            alphabet_segmentation:alphabet::Segmentation::from_divisions(&[0,32,33]),
            links:Matrix::from(vec![
                vec![INVALID,1,INVALID],
                vec![INVALID,2,INVALID],
                vec![INVALID,2,INVALID],
            ]),
            callbacks:vec![
                None,
                Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}),
                Some(RuleExecutable {priority:3, code:"group_0_rule_0".into()}),
            ],
        }
    }

    /// A DFA that accepts one letter a..=z, or any number of spaces.
    pub fn letter_and_spaces() -> DFA {
        DFA {
            alphabet_segmentation:alphabet::Segmentation::from_divisions(&[32,33,97,123]),
            links:Matrix::from(vec![
                vec![INVALID,      1,INVALID,      2,INVALID],
                vec![INVALID,      3,INVALID,INVALID,INVALID],
                vec![INVALID,INVALID,INVALID,INVALID,INVALID],
                vec![INVALID,      3,INVALID,INVALID,INVALID],
            ]),
            callbacks:vec![
                None,
                Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}),
                Some(RuleExecutable {priority:4, code:"group_0_rule_0".into()}),
                Some(RuleExecutable {priority:4, code:"group_0_rule_1".into()}),
            ],
        }
    }
}
@ -1,345 +0,0 @@
//! The structure for defining non-deterministic finite automata.

use crate::automata::alphabet;
use crate::automata::dfa::DFA;
use crate::automata::dfa::RuleExecutable;
use crate::automata::pattern::Pattern;
use crate::automata::state::State;
use crate::automata::state::Transition;
use crate::automata::state;
use crate::automata::symbol::Symbol;
use crate::data::matrix::Matrix;

use itertools::Itertools;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::ops::RangeInclusive;

use crate::prelude::*;



// =========================================
// === Non-Deterministic Finite Automata ===
// =========================================

/// A state identifier based on a set of states.
///
/// This is used during the NFA -> DFA transformation, where multiple states can merge together due
/// to the collapsing of epsilon transitions.
type StateSetId = BTreeSet<state::Identifier>;

/// The definition of a [NFA](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) for a
/// given set of symbols, states, and transitions (specifically a NFA with ε-moves).
///
/// A NFA is a finite state automaton that accepts or rejects a given sequence of symbols. In
/// contrast with a DFA, the NFA may transition between states _without_ reading any new symbol
/// through use of
/// [epsilon links](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton#NFA_with_%CE%B5-moves).
///
/// ```text
///  ┌───┐  'N'  ┌───┐    ┌───┐  'F'  ┌───┐    ┌───┐  'A'  ┌───┐
///  │ 0 │ ----> │ 1 │ -> │ 2 │ ----> │ 3 │ -> │ 4 │ ----> │ 5 │
///  └───┘       └───┘ ε  └───┘       └───┘ ε  └───┘       └───┘
/// ```
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct NFA {
    /// A set of disjoint intervals over the input alphabet.
    pub alphabet_segmentation:alphabet::Segmentation,
    /// A set of named NFA states, with (epsilon) transitions.
    pub states:Vec<State>,
}

impl NFA {
    /// Adds a new state to the NFA and returns its identifier.
    pub fn new_state(&mut self) -> state::Identifier {
        let id = self.states.len();
        self.states.push(State::default());
        state::Identifier{id}
    }

    /// Creates an epsilon transition between two states.
    ///
    /// Whenever the automaton happens to be in `source` state it can immediately transition to the
    /// `target` state. It is, however, not _required_ to do so.
    pub fn connect(&mut self, source:state::Identifier, target:state::Identifier) {
        self.states[source.id].epsilon_links.push(target);
    }

    /// Creates an ordinary transition for a range of symbols.
    ///
    /// If any symbol from such a range happens to be the input when the automaton is in the
    /// `source` state, it will immediately transition to the `target` state.
    pub fn connect_via
    ( &mut self
    , source       : state::Identifier
    , target_state : state::Identifier
    , symbols      : &RangeInclusive<Symbol>
    ) {
        self.alphabet_segmentation.insert(symbols.clone());
        self.states[source.id].links.push(Transition{symbols:symbols.clone(),target_state});
    }

    /// Transforms a pattern to an NFA using the algorithm described
    /// [here](https://www.youtube.com/watch?v=RYNN-tb9WxI).
    /// The asymptotic complexity is linear in the number of symbols.
    pub fn new_pattern(&mut self, source:state::Identifier, pattern:&Pattern) -> state::Identifier {
        let current = self.new_state();
        self.connect(source,current);
        match pattern {
            Pattern::Range(range) => {
                let state = self.new_state();
                self.connect_via(current,state,range);
                state
            },
            Pattern::Many(body) => {
                // The classic Kleene-star construction: the body can be skipped entirely
                // (current -> s3), or looped over any number of times (s3 -> s1 -> s2 -> s3).
                let s1 = self.new_state();
                let s2 = self.new_pattern(s1,body);
                let s3 = self.new_state();
                self.connect(current,s1);
                self.connect(current,s3);
                self.connect(s2,s3);
                self.connect(s3,s1);
                s3
            },
            Pattern::Seq(patterns) => {
                patterns.iter().fold(current,|s,pat| self.new_pattern(s,pat))
            },
            Pattern::Or(patterns) => {
                let states = patterns.iter().map(|pat| self.new_pattern(current,pat)).collect_vec();
                let end    = self.new_state();
                for state in states {
                    self.connect(state,end);
                }
                end
            },
            Pattern::Always => current,
        }
    }

    /// Merges states that are connected by epsilon links, using an algorithm based on the one shown
    /// [here](https://www.youtube.com/watch?v=taClnxU-nao).
    fn eps_matrix(&self) -> Vec<StateSetId> {
        fn fill_eps_matrix
        ( nfa     : &NFA
        , states  : &mut Vec<StateSetId>
        , visited : &mut Vec<bool>
        , state   : state::Identifier
        ) {
            let mut state_set = StateSetId::new();
            visited[state.id] = true;
            state_set.insert(state);
            for &target in &nfa.states[state.id].epsilon_links {
                if !visited[target.id] {
                    fill_eps_matrix(nfa,states,visited,target);
                }
                state_set.insert(target);
                state_set.extend(states[target.id].iter());
            }
            states[state.id] = state_set;
        }

        let mut states = vec![StateSetId::new(); self.states.len()];
        for id in 0..self.states.len() {
            let mut visited = vec![false; states.len()];
            fill_eps_matrix(self,&mut states,&mut visited,state::Identifier{id});
        }
        states
    }

    /// Computes a transition matrix `(state, symbol) => state` for the NFA, ignoring epsilon links.
    fn nfa_matrix(&self) -> Matrix<state::Identifier> {
        let mut matrix = Matrix::new(self.states.len(),self.alphabet_segmentation.divisions.len());

        for (state_ix, source) in self.states.iter().enumerate() {
            let targets = source.targets(&self.alphabet_segmentation);
            for (voc_ix, &target) in targets.iter().enumerate() {
                matrix[(state_ix,voc_ix)] = target;
            }
        }
        matrix
    }
}
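// A brief usage sketch (not part of the original file): building an NFA for the pattern
// `('a'..='z')*` with `new_pattern`, which performs the Thompson construction described above.
#[allow(dead_code)]
fn letter_star() -> (NFA,state::Identifier) {
    let mut nfa = NFA::default();
    let start   = nfa.new_state();
    let end     = nfa.new_pattern(start,&Pattern::range('a'..='z').many());
    (nfa,end)
}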
// === Trait Impls ===

impl From<&NFA> for DFA {
    /// Transforms an NFA into a DFA, based on the algorithm described
    /// [here](https://www.youtube.com/watch?v=taClnxU-nao).
    /// The asymptotic complexity is quadratic in the number of states.
    fn from(nfa:&NFA) -> Self {
        let     nfa_mat     = nfa.nfa_matrix();
        let     eps_mat     = nfa.eps_matrix();
        let mut dfa_mat     = Matrix::new(0,nfa.alphabet_segmentation.divisions.len());
        let mut dfa_eps_ixs = Vec::<StateSetId>::new();
        let mut dfa_eps_map = HashMap::<StateSetId,state::Identifier>::new();

        dfa_eps_ixs.push(eps_mat[0].clone());
        dfa_eps_map.insert(eps_mat[0].clone(),state::Identifier::from(0));

        // Breadth-first exploration of the epsilon-closure sets reachable from the start state.
        // Each distinct set becomes one DFA state.
        let mut i = 0;
        while i < dfa_eps_ixs.len() {
            dfa_mat.new_row();
            for voc_ix in 0..nfa.alphabet_segmentation.divisions.len() {
                let mut eps_set = StateSetId::new();
                for &eps_ix in &dfa_eps_ixs[i] {
                    let tgt = nfa_mat[(eps_ix.id,voc_ix)];
                    if tgt != state::Identifier::INVALID {
                        eps_set.extend(eps_mat[tgt.id].iter());
                    }
                }
                if !eps_set.is_empty() {
                    dfa_mat[(i,voc_ix)] = match dfa_eps_map.get(&eps_set) {
                        Some(&id) => id,
                        None => {
                            let id = state::Identifier::new(dfa_eps_ixs.len());
                            dfa_eps_ixs.push(eps_set.clone());
                            dfa_eps_map.insert(eps_set,id);
                            id
                        },
                    };
                }
            }
            i += 1;
        }

        let mut callbacks = vec![None; dfa_eps_ixs.len()];
        let     priority  = dfa_eps_ixs.len();
        for (dfa_ix, epss) in dfa_eps_ixs.into_iter().enumerate() {
            let has_name = |&key:&state::Identifier| nfa.states[key.id].name.is_some();
            if let Some(eps) = epss.into_iter().find(has_name) {
                let code          = nfa.states[eps.id].name.as_ref().cloned().unwrap();
                callbacks[dfa_ix] = Some(RuleExecutable {code,priority});
            }
        }

        let alphabet_segmentation = nfa.alphabet_segmentation.clone();
        let links                 = dfa_mat;

        DFA{alphabet_segmentation,links,callbacks}
    }
}
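// Converting an NFA to a DFA is then a single call (a sketch, reusing the illustrative
// `letter_star` helper from above): the subset construction collapses each reachable set of
// epsilon-linked NFA states into one DFA state.
#[allow(dead_code)]
fn letter_star_dfa() -> DFA {
    let (nfa,_end) = letter_star();
    DFA::from(&nfa)
}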
// ===========
// == Tests ==
// ===========

#[cfg(test)]
pub mod tests {
    extern crate test;

    use crate::automata::dfa;

    use super::*;
    use test::Bencher;

    /// NFA that accepts a newline '\n'.
    pub fn newline() -> NFA {
        NFA {
            states:vec![
                State::from(vec![1]),
                State::from(vec![(10..=10,2)]),
                State::from(vec![3]).named("group_0_rule_0"),
                State::default(),
            ],
            alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![10, 11].as_slice()),
        }
    }

    /// NFA that accepts any letter in the range a..=z.
    pub fn letter() -> NFA {
        NFA {
            states:vec![
                State::from(vec![1]),
                State::from(vec![(97..=122,2)]),
                State::from(vec![3]).named("group_0_rule_0"),
                State::default(),
            ],
            alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![97, 123].as_slice()),
        }
    }

    /// NFA that accepts any number of spaces ' '.
    pub fn spaces() -> NFA {
        NFA {
            states:vec![
                State::from(vec![1]),
                State::from(vec![2]),
                State::from(vec![(32..=32,3)]),
                State::from(vec![4]),
                State::from(vec![5,8]),
                State::from(vec![6]),
                State::from(vec![(32..=32,7)]),
                State::from(vec![8]),
                State::from(vec![5,9]).named("group_0_rule_0"),
                State::default(),
            ],
            alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![0, 32, 33].as_slice()),
        }
    }

    /// NFA that accepts one letter a..=z or many spaces ' '.
    pub fn letter_and_spaces() -> NFA {
        NFA {
            states:vec![
                State::from(vec![1,3]),
                State::from(vec![(97..=122,2)]),
                State::from(vec![11]).named("group_0_rule_0"),
                State::from(vec![4]),
                State::from(vec![(32..=32,5)]),
                State::from(vec![6]),
                State::from(vec![7,10]),
                State::from(vec![8]),
                State::from(vec![(32..=32,9)]),
                State::from(vec![10]),
                State::from(vec![7,11]).named("group_0_rule_1"),
                State::default(),
            ],
            alphabet_segmentation:alphabet::Segmentation::from_divisions(vec![32, 33, 97, 123].as_slice()),
        }
    }

    #[test]
    fn test_to_dfa_newline() {
        assert_eq!(DFA::from(&newline()),dfa::tests::newline());
    }

    #[test]
    fn test_to_dfa_letter() {
        assert_eq!(DFA::from(&letter()),dfa::tests::letter());
    }

    #[test]
    fn test_to_dfa_spaces() {
        assert_eq!(DFA::from(&spaces()),dfa::tests::spaces());
    }

    #[test]
    fn test_to_dfa_letter_and_spaces() {
        assert_eq!(DFA::from(&letter_and_spaces()),dfa::tests::letter_and_spaces());
    }

    #[bench]
    fn bench_to_dfa_newline(bencher:&mut Bencher) {
        bencher.iter(|| DFA::from(&newline()))
    }

    #[bench]
    fn bench_to_dfa_letter(bencher:&mut Bencher) {
        bencher.iter(|| DFA::from(&letter()))
    }

    #[bench]
    fn bench_to_dfa_spaces(bencher:&mut Bencher) {
        bencher.iter(|| DFA::from(&spaces()))
    }

    #[bench]
    fn bench_to_dfa_letter_and_spaces(bencher:&mut Bencher) {
        bencher.iter(|| DFA::from(&letter_and_spaces()))
    }
}
@ -1,194 +0,0 @@
//! Simple API for constructing regex patterns that are used in parser implementation.

#[macro_use]
mod macros;

use crate::automata::symbol::Symbol;

use core::iter;
use itertools::Itertools;
use std::ops::BitOr;
use std::ops::RangeInclusive;
use std::ops::Shr;

use Pattern::*;



// =============
// == Pattern ==
// =============

/// A representation of a simple regular pattern.
#[derive(Clone,Debug)]
pub enum Pattern {
    /// The pattern that triggers on any symbol from the given range.
    Range(RangeInclusive<Symbol>),
    /// The pattern that triggers on any given pattern from a sequence.
    Or(Vec<Pattern>),
    /// The pattern that triggers when a sequence of patterns is encountered.
    Seq(Vec<Pattern>),
    /// The pattern that triggers on 0..N repetitions of the given pattern.
    Many(Box<Pattern>),
    /// The pattern that always triggers.
    Always,
}

impl Pattern {

    /// A pattern that never triggers.
    pub fn never() -> Self {
        Pattern::symbol(Symbol::INVALID_SYMBOL)
    }

    /// A pattern that always triggers.
    pub fn always() -> Self {
        Pattern::Always
    }

    /// A pattern that triggers on any character.
    pub fn any() -> Self {
        Pattern::symbols(Symbol::from(0)..=Symbol::from(u32::max_value()))
    }

    /// A pattern that triggers on 0..N repetitions of the pattern described by `self`.
    pub fn many(&self) -> Self {
        Many(Box::new(self.clone()))
    }

    /// A pattern that triggers on 1..N repetitions of the pattern described by `self`.
    pub fn many1(&self) -> Self {
        self.clone() >> self.many()
    }

    /// A pattern that triggers on 0..=1 repetitions of the pattern described by `self`.
    pub fn opt(&self) -> Self {
        self.clone() | Self::always()
    }

    /// A pattern that triggers on the given character.
    pub fn char(character:char) -> Self {
        Self::symbol(Symbol::from(character))
    }

    /// A pattern that triggers on the given symbol.
    pub fn symbol(symbol:Symbol) -> Self {
        Pattern::symbols(symbol..=symbol)
    }

    /// A pattern that triggers on any of the provided `symbols`.
    pub fn symbols(symbols:RangeInclusive<Symbol>) -> Self {
        Pattern::Range(symbols)
    }

    /// A pattern that triggers at the end of the file.
    pub fn eof() -> Self {
        Self::symbol(Symbol::EOF_CODE)
    }

    /// A pattern that triggers on any character in the provided `range`.
    pub fn range(range:RangeInclusive<char>) -> Self {
        Pattern::symbols(Symbol::from(*range.start())..=Symbol::from(*range.end()))
    }

    /// A pattern that triggers when the sequence of characters given by `chars` is encountered.
    pub fn all_of(chars:&str) -> Self {
        let mut chars_iter = chars.chars();
        if let Some(first) = chars_iter.next() {
            chars_iter.fold(Self::char(first),|pat, char| pat >> Self::char(char))
        } else {
            Pattern::never()
        }
    }

    /// The pattern that triggers on any character contained in `chars`.
    pub fn any_of(chars:&str) -> Self {
        chars.chars().fold(Self::never(),|pat,char| pat | Self::char(char))
    }

    /// The pattern that doesn't trigger on any character contained in `chars`.
    pub fn none_of(chars:&str) -> Self {
        let max        = u32::max_value();
        let char_iter  = chars.chars().map(|char| char as u32);
        let char_iter2 = iter::once(0).chain(char_iter).chain(iter::once(max));
        let mut codes  = char_iter2.collect_vec();
        codes.sort();
        // Match every range of code points that falls strictly between two excluded characters.
        codes.iter().tuple_windows().fold(Self::never(),|pat,(prev_code,next_code)| {
            let start = prev_code + 1;
            let end   = next_code - 1;
            if end < start {pat} else {
                pat | Pattern::symbols(Symbol::from(start)..=Symbol::from(end))
            }
        })
    }

    /// The pattern that triggers on any character but `char`.
    pub fn not(char:char) -> Self {
        Self::none_of(&char.to_string())
    }

    /// The pattern that triggers on `num` repetitions of `pat`.
    pub fn repeat(pat:Pattern, num:usize) -> Self {
        (0..num).fold(Self::always(),|p,_| p >> pat.clone())
    }

    /// A pattern that triggers on `min`..`max` repetitions of `pat`.
    pub fn repeat_between(pat:Pattern, min:usize, max:usize) -> Self {
        (min..max).fold(Self::never(),|p,n| p | Self::repeat(pat.clone(),n))
    }
}
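// An illustrative sketch of composing the combinators above (not part of the original file);
// the names `digit`, `integer`, `exponent` and `float_literal` are ours. `>>` sequences
// patterns, while `|` chooses between them.
#[allow(dead_code)]
fn float_literal() -> Pattern {
    let digit    = Pattern::range('0'..='9');
    let integer  = digit.many1();
    let exponent = Pattern::any_of("eE") >> Pattern::any_of("+-").opt() >> integer.clone();
    integer.clone() >> Pattern::char('.') >> integer >> exponent.opt()
}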
// === Trait Impls ===

impl BitOr<Pattern> for Pattern {
    type Output = Pattern;
    fn bitor(self, rhs:Pattern) -> Self::Output {
        match (self, rhs) {
            (Or(mut lhs), Or(    rhs)) => {lhs.extend(rhs)   ; Or(lhs)},
            (Or(mut lhs), rhs        ) => {lhs.push(rhs)     ; Or(lhs)},
            (lhs        , Or(mut rhs)) => {rhs.insert(0,lhs) ; Or(rhs)},
            (lhs        , rhs        ) => Or(vec![lhs,rhs]),
        }
    }
}
gen_ref_versions!(Pattern,BitOr,bitor);

impl Shr<Pattern> for Pattern {
    type Output = Pattern;
    fn shr(self, rhs:Pattern) -> Self::Output {
        match (self, rhs) {
            (Seq(mut lhs), Seq(rhs)    ) => {lhs.extend(rhs)   ; Seq(lhs)},
            (Seq(mut lhs), rhs         ) => {lhs.push(rhs)     ; Seq(lhs)},
            (lhs         , Seq(mut rhs)) => {rhs.insert(0,lhs) ; Seq(rhs)},
            (lhs         , rhs         ) => Seq(vec![lhs, rhs]),
        }
    }
}
gen_ref_versions!(Pattern,Shr,shr);



// =================
// === Utilities ===
// =================

/// Quote a character as a character pattern.
///
/// It is equivalent to `Pattern::char(...)`.
#[macro_export]
macro_rules! c {
    ($char:literal) => {
        Pattern::char($char)
    }
}

/// Quote a string as a literal pattern.
///
/// It is equivalent to `Pattern::all_of(...)`.
#[macro_export]
macro_rules! l {
    ($lit:literal) => {
        Pattern::all_of($lit)
    }
}
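// A short sketch of the macros in use (illustrative only, assuming the macros and `Pattern`
// are in scope): `c!` and `l!` shorten common character and literal patterns when defining
// lexer rules.
#[allow(dead_code)]
fn let_keyword() -> Pattern {
    // Equivalent to `Pattern::all_of("let") >> Pattern::char(' ')`.
    l!("let") >> c!(' ')
}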
@ -1,28 +0,0 @@
//! Useful macros for defining operators over patterns.

/// Generates versions of an operator taking various combinations of by-reference and by-value.
#[macro_export]
macro_rules! gen_ref_versions {
    ($ty_name:ty,$opr_name:ident,$fn_name:ident) => (
        impl $opr_name<&$ty_name> for &$ty_name {
            type Output = $ty_name;
            fn $fn_name(self, rhs:&$ty_name) -> Self::Output {
                self.clone().$fn_name(rhs.clone())
            }
        }

        impl $opr_name<&$ty_name> for $ty_name {
            type Output = $ty_name;
            fn $fn_name(self, rhs:&$ty_name) -> Self::Output {
                self.$fn_name(rhs.clone())
            }
        }

        impl $opr_name<$ty_name> for &$ty_name {
            type Output = $ty_name;
            fn $fn_name(self, rhs:$ty_name) -> Self::Output {
                self.clone().$fn_name(rhs)
            }
        }
    )
}
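// To illustrate the macro's effect (a sketch, assuming `Pattern` from the pattern module is in
// scope): after `gen_ref_versions!(Pattern,Shr,shr)`, all by-reference operand combinations
// compile, with cloning happening inside the generated impls rather than at every call site.
#[allow(dead_code)]
fn sequence_both_orders(a:&Pattern, b:&Pattern) -> (Pattern,Pattern) {
    (a >> b, b >> a)
}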
@ -1,136 +0,0 @@
//! This module exports the `State` implementation for nondeterministic finite automata.

use crate::automata::alphabet;
use crate::automata::symbol::Symbol;

use crate::prelude::*;



// ===========
// == State ==
// ===========

/// A named state for a [`super::nfa::NFA`].
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct State {
    /// A set of transitions that can trigger without consuming a symbol (ε-transitions).
    pub epsilon_links:Vec<Identifier>,
    /// The set of transitions that trigger while consuming a specific symbol.
    ///
    /// When triggered, the automaton will transition to the [`Transition::target_state`].
    pub links:Vec<Transition>,
    /// The name of the state.
    ///
    /// This is used to auto-generate a call to the rust method of the same name.
    pub name:Option<String>,
    /// The function to call when evaluating the state.
    pub callback:String
}

impl State {
    /// Updater for the `name` field. Returns the updated state.
    pub fn named(mut self, name:&str) -> Self {
        self.name = Some(name.to_owned());
        self
    }

    /// Returns the transition (next state) for each symbol in the alphabet.
    pub fn targets(&self, alphabet:&alphabet::Segmentation) -> Vec<Identifier> {
        let mut targets = vec![];
        let mut index   = 0;
        let mut links   = self.links.clone();
        links.sort_by_key(|link| *link.symbols.start());
        for &symbol in &alphabet.divisions {
            // Skip any links whose symbol ranges end before the current division starts.
            while links.len() > index && *links[index].symbols.end() < symbol {
                index += 1;
            }
            if links.len() <= index || *links[index].symbols.start() > symbol {
                targets.push(Identifier::INVALID);
            } else {
                targets.push(links[index].target_state);
            }
        }
        targets
    }
}
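// A usage sketch for `targets` (not part of the original file). A state linking 'a'..='z'
// (codes 97..=122) to state 1, probed against the segmentation built from {97,123}, yields one
// entry per division: here `[INVALID, 1, INVALID]`, assuming the segmentation implicitly
// contains the division at 0.
#[allow(dead_code)]
fn targets_demo() -> Vec<Identifier> {
    let state        = State::from(vec![(97..=122,1)]);
    let segmentation = alphabet::Segmentation::from_divisions(&[97,123]);
    state.targets(&segmentation)
}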
// === Trait Impls ===

impl From<Vec<usize>> for State {
    /// Creates a state with epsilon links.
    fn from(vec:Vec<usize>) -> Self {
        let epsilon_links = vec.iter().cloned().map(|id| Identifier{id}).collect();
        State{epsilon_links,..Default::default()}
    }
}

impl From<Vec<(RangeInclusive<u32>, usize)>> for State {
    /// Creates a state with ordinary links.
    fn from(vec:Vec<(RangeInclusive<u32>, usize)>) -> Self {
        let link = |(range, id): (RangeInclusive<u32>, usize)| {
            let start = Symbol{value:*range.start()};
            let end   = Symbol{value:*range.end()};
            Transition{symbols:start..=end,target_state:Identifier{id}}
        };
        let links = vec.iter().cloned().map(link).collect();
        State{links,..Default::default()}
    }
}



// ================
// == Identifier ==
// ================

/// A state identifier for an arbitrary finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
#[allow(missing_docs)]
pub struct Identifier {
    pub id: usize
}

impl Identifier {
    /// An identifier representing the invalid state.
    ///
    /// When in an invalid state, a finite automaton will reject the sequence of input symbols.
    pub const INVALID:Identifier = Identifier{id:usize::max_value()};

    /// Constructs a new state identifier.
    pub fn new(id:usize) -> Identifier {
        Identifier{id}
    }
}

// === Trait Impls ===

impl Default for Identifier {
    /// Returns `Identifier::INVALID`. This is because every finite automaton has an invalid
    /// state, and because all transitions in an automaton's transition matrix lead to the
    /// invalid state by default.
    fn default() -> Self {
        Identifier::INVALID
    }
}

impl From<usize> for Identifier {
    fn from(id: usize) -> Self {
        Identifier{id}
    }
}



// ============
// === Link ===
// ============

/// A transition between states in a finite automaton that must consume a symbol to trigger.
#[derive(Clone,Debug,PartialEq,Eq)]
pub struct Transition {
    /// The range of symbols on which this transition will trigger.
    pub symbols:RangeInclusive<Symbol>,
    /// The state that is entered after the transition has triggered.
    pub target_state:Identifier,
}
@ -1,53 +0,0 @@
//! Defines a Symbol that is operated on by the finite automata.



// ==============
// === Symbol ===
// ==============

/// An input symbol to a finite automaton.
#[derive(Clone,Copy,Debug,PartialEq,Eq,PartialOrd,Ord,Hash)]
pub struct Symbol {
    /// The 4-byte representation of the symbol.
    pub value:u32
}

impl Symbol {
    /// A representation of the null symbol.
    pub const NULL:Symbol = Symbol{value:0};
    /// A representation of the end of the file.
    pub const EOF_CODE:Symbol = Symbol{value:u32::max_value()};
    /// A representation of an arbitrary invalid unicode symbol.
    pub const INVALID_SYMBOL:Symbol = Symbol{value:0xFFFF};
    /// A representation of the group reaching its end without matching.
    pub const INCOMPLETE_GROUP:Symbol = Symbol{value:u32::max_value() - 1};
}


// === Trait Impls ===

impl Default for Symbol {
    fn default() -> Self {
        Symbol::NULL
    }
}

impl From<u32> for Symbol {
    fn from(value:u32) -> Symbol {
        Symbol{value}
    }
}

impl From<char> for Symbol {
    fn from(value:char) -> Symbol {
        Symbol{value:value as u32}
    }
}

impl From<&Symbol> for Symbol {
    fn from(symbol:&Symbol) -> Self {
        let value = symbol.value;
        Symbol{value}
    }
}
@ -1,3 +0,0 @@
//! Generic data-structures to support multiple use-cases.

pub mod matrix;
@ -1,75 +0,0 @@
//! An efficient representation of a 2D matrix.

use crate::prelude::*;

use std::ops::Index;
use std::ops::IndexMut;



// ============
// == Matrix ==
// ============

/// An efficient 2D matrix implemented on top of [`std::vec::Vec`].
#[derive(Clone,Debug,Default,PartialEq,Eq)]
pub struct Matrix<T> {
    /// The number of rows in the matrix.
    rows:usize,
    /// The number of columns in the matrix.
    columns:usize,
    /// The matrix data itself, stored row-major in a flat vector.
    matrix:Vec<T>,
}

impl<T> Matrix<T> {
    /// Get the number of rows in the matrix.
    pub fn rows(&self) -> usize {
        self.rows
    }

    /// Get the number of columns in the matrix.
    pub fn columns(&self) -> usize {
        self.columns
    }

    /// Obtain the indices for the rows in this matrix.
    pub fn row_indices(&self) -> Range<usize> {
        0..self.rows()
    }
}

impl<T:Default> Matrix<T> {
    /// Constructs a matrix with the dimensions given by `rows` and `columns`.
    pub fn new(rows:usize, columns:usize) -> Self {
        let mut matrix = Vec::with_capacity(rows*columns);
        for _ in 0..matrix.capacity() {
            matrix.push(default())
        }
        Self{rows,columns,matrix}
    }

    /// Adds a new row to the matrix `self`, filled with default values.
    pub fn new_row(&mut self) {
        for _ in 0..self.columns {
            self.matrix.push(default());
        }
        self.rows += 1;
    }
}
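// A brief usage sketch (illustrative only): construct a default-filled matrix, write through
// `(row, column)` indices, and grow it one row at a time.
#[allow(dead_code)]
fn matrix_demo() -> usize {
    let mut m:Matrix<usize> = Matrix::new(2,2);
    m[(0,1)] = 7;
    m.new_row();             // `m` is now 3x2, with the new row default-filled.
    m[(2,0)] = m[(0,1)] + 1;
    m[(2,0)]                 // 8
}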
// === Trait Impls ===

impl<T> Index<(usize,usize)> for Matrix<T> {
    type Output = T;
    fn index(&self, index:(usize,usize)) -> &T {
        &self.matrix[index.0*self.columns+index.1]
    }
}

impl<T> IndexMut<(usize,usize)> for Matrix<T> {
    fn index_mut(&mut self, index:(usize,usize)) -> &mut T {
        &mut self.matrix[index.0*self.columns+index.1]
    }
}
@ -1,541 +0,0 @@
//! This file contains utilities for generating rust code from lexer definitions, allowing the
//! flexer to be specialised for a specific language.

use crate::prelude::*;
use quote::*;
use syn::*;

use crate::automata::dfa::DFA;
use crate::automata::dfa::RuleExecutable;
use crate::automata::state::Identifier;
use crate::automata::state::State;
use crate::group::Group;
use crate::group;

use enso_macro_utils::repr;
use proc_macro2::Literal;
use std::hash::BuildHasher;
use std::result::Result;
use std::fmt;

use crate as flexer;



// =======================
// === Code Generation ===
// =======================

/// Generate specialized code for the provided lexer `definition`.
///
/// This specialized code is a highly-optimised and tailored lexer that dispatches based on simple
/// code-point switches, with no dynamic lookup. This means that it is very fast and has very low
/// overhead.
pub fn specialize
( definition       : &impl flexer::State
, state_type_name  : impl Str
, output_type_name : impl Str
) -> Result<String,GenError> {
    let group_registry = definition.groups();
    let mut body_items = Vec::new();
    body_items.push(run_function(output_type_name)?);
    body_items.push(run_current_state_function());
    body_items.push(step(group_registry));
    for group in group_registry.all().iter() {
        body_items.extend(automaton_for_group(group,group_registry)?)
    }
    let result = wrap_in_impl_for(state_type_name,body_items)?;
    let code   = show_code(&result);
    Ok(code)
}
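// A hedged sketch of driving the generator (not part of the original file): the caller's type
// implementing `flexer::State` and the `"Lexer"` / `"Token"` names passed in are the user's
// own; the returned string is rust source text ready to be written into a generated module.
#[allow(dead_code)]
fn generate_lexer_source(definition:&impl flexer::State) -> String {
    specialize(definition,"Lexer","Token")
        .unwrap_or_else(|err| panic!("Lexer code generation failed: {}.",err))
}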
// === Whole-Lexer Codegen Utilities ===

/// Wrap the provided implementation items into an `impl` block for the provided `state_name` type.
pub fn wrap_in_impl_for
( state_name : impl Into<String>
, body       : Vec<ImplItem>
) -> Result<ItemImpl,GenError> {
    let state_name:Ident  = str_to_ident(state_name.into().as_str())?;
    let mut tree:ItemImpl = parse_quote! {
        #[allow(missing_docs,dead_code,clippy::all)]
        impl #state_name {}
    };
    tree.items.extend(body);
    Ok(tree)
}

/// Generate the `run` function for the specialized lexer.
///
/// This function is what the user of the lexer will call to begin execution.
pub fn run_function(output_type_name:impl Str) -> Result<ImplItem,GenError> {
    let output_type_name = str_to_path(output_type_name)?;
    let tree:ImplItem = parse_quote! {
        pub fn run<R:LazyReader>(&mut self, mut reader:R) -> LexingResult<#output_type_name> {
            self.set_up();
            reader.advance_char(&mut self.bookmarks);
            while self.run_current_state(&mut reader) == StageStatus::ExitSuccess {}
            let result = match self.status {
                StageStatus::ExitFinished => LexingResult::success(
                    mem::take(&mut self.output)
                ),
                StageStatus::ExitFail => LexingResult::failure(
                    mem::take(&mut self.output)
                ),
                _ => LexingResult::partial(mem::take(&mut self.output))
            };
            self.tear_down();
            result
        }
    };
    Ok(tree)
}

/// Generate the function responsible for executing the lexer in its current state.
pub fn run_current_state_function() -> ImplItem {
    let tree:ImplItem = parse_quote! {
        fn run_current_state<R:LazyReader>(&mut self, reader:&mut R) -> StageStatus {
            self.status = StageStatus::Initial;
            let mut finished = false;

            // Runs until reaching a state that no longer says to continue.
            while let Some(next_state) = self.status.continue_as() {
                self.logger.debug(||format!("Current character is {:?}.",reader.character().char));
                self.logger.debug(||format!("Continuing in {:?}.",next_state));
                self.status = self.step(next_state,reader);

                if finished && reader.finished(self.bookmarks()) {
                    self.logger.info("Input finished.");
                    self.status = StageStatus::ExitFinished
                }
                finished = reader.character().is_eof();

                if self.status.should_continue() {
                    match reader.character().char {
                        Ok(char) => {
                            reader.append_result(char);
                            self.logger.info(||format!("Result is {:?}.",reader.result()));
                        },
                        Err(flexer::prelude::reader::Error::EOF) => {
                            self.logger.info("Reached EOF.");
                        },
                        Err(flexer::prelude::reader::Error::EndOfGroup) => {
                            let current_state = self.current_state();
                            let group_name    = self.groups().group(current_state).name.as_str();
                            let err = format!("Missing rules for state {}.", group_name);
                            self.logger.error(err.as_str());
                            panic!(err)
                        }
                        Err(_) => {
                            self.logger.error("Unexpected error!");
                            panic!("Unexpected error!")
                        }
                    }
                    reader.advance_char(&mut self.bookmarks);
                }
            }

            self.status
        }
    };
    tree
}

/// Generate the `step` function for the lexer.
///
/// This function is responsible for dispatching based on the current state, consuming a character,
/// and returning the state to transition to.
pub fn step(groups:&group::Registry) -> ImplItem {
    let arms = groups.all().iter().map(|g| step_match_arm(g.id.into())).collect_vec();
    parse_quote! {
        fn step<R:LazyReader>(&mut self, next_state:SubStateId, reader:&mut R) -> StageStatus {
            let current_state:usize = self.current_state().into();
            match current_state {
                #(#arms)*
                _ => unreachable_panic!("Unreachable state reached in lexer."),
            }
        }
    }
}

/// Generate a match arm for the step function.
///
/// There is one match arm per lexer state.
pub fn step_match_arm(number:usize) -> Arm {
    let literal           = Literal::usize_unsuffixed(number);
    let function_name_str = format!("dispatch_in_state_{}",number);
    let func_name:Ident   = parse_str(function_name_str.as_str()).unwrap();
    let arm:Arm = parse_quote! {
        #literal => self.#func_name(next_state,reader),
    };
    arm
}


// === Generation for a Specific Lexer State ===

/// Generate the functions that implement the lexer automaton for a given lexer state.
pub fn automaton_for_group
( group    : &Group
, registry : &group::Registry
) -> Result<Vec<ImplItem>,GenError> {
    let nfa       = registry.to_nfa_from(group.id);
    let mut rules = Vec::with_capacity(nfa.states.len());
    for state in nfa.states.iter() {
        if state.name.is_some() {
            rules.push(rule_for_state(state)?);
        }
    }
    let mut dfa             = DFA::from(&nfa);
    let dispatch_for_dfa    = dispatch_in_state(&dfa,group.id.into())?;
    let mut dfa_transitions = transitions_for_dfa(&mut dfa,group.id.into())?;
    dfa_transitions.push(dispatch_for_dfa);
    dfa_transitions.extend(rules);
    Ok(dfa_transitions)
}

/// Generate a set of transition functions for the provided `dfa`, with identifier `id`.
pub fn transitions_for_dfa(dfa:&mut DFA, id:usize) -> Result<Vec<ImplItem>,GenError> {
    let mut state_has_overlapping_rules:HashMap<usize,bool> = HashMap::new();
    state_has_overlapping_rules.insert(0,false);
    let state_names:Vec<_> = dfa.links.row_indices().map(|ix| (ix, name_for_step(id, ix))).collect();
    let mut transitions    = Vec::with_capacity(state_names.len());
    for (ix,name) in state_names.into_iter() {
        transitions.push(transition_for_dfa(dfa,name,ix,&mut state_has_overlapping_rules)?)
    }
    Ok(transitions)
}

/// Generate a specific transition function for the state `state_ix` of the provided `dfa`.
#[allow(clippy::implicit_hasher)]
pub fn transition_for_dfa<S:BuildHasher>
( dfa             : &mut DFA
, transition_name : Ident
, state_ix        : usize
, has_overlaps    : &mut HashMap<usize,bool,S>
) -> Result<ImplItem,GenError> {
    let match_expr:Expr   = match_for_transition(dfa,state_ix,has_overlaps)?;
    let function:ImplItem = parse_quote! {
        fn #transition_name<R:LazyReader>(&mut self, reader:&mut R) -> StageStatus {
            #match_expr
        }
    };
    Ok(function)
}

/// Generate the pattern match for a given transition function.
pub fn match_for_transition<S:BuildHasher>
( dfa          : &mut DFA
, state_ix     : usize
, has_overlaps : &mut HashMap<usize,bool,S>
) -> Result<Expr,GenError> {
    let overlaps          = *has_overlaps.get(&state_ix).unwrap_or(&false);
    let state             = dfa.callbacks.get(state_ix).expect("Internal error.").clone();
    let mut trigger_state = dfa.links[(state_ix,0)];
    let mut range_start   = u32::min_value();
    let divisions:Vec<_>  = dfa.alphabet_segmentation.divisions_as_vec();
    let mut branches      = Vec::with_capacity(divisions.len());
    for division in divisions.into_iter() {
        let ix                = division.position;
        let sym               = division.symbol;
        let new_trigger_state = dfa.links[(state_ix,ix)];
        if new_trigger_state != trigger_state {
            let range_end             = if sym.value != 0 { sym.value - 1 } else { sym.value };
            let current_trigger_state = trigger_state;
            let current_range_start   = range_start;
            trigger_state             = new_trigger_state;
            range_start               = sym.value;
            let body =
                branch_body(dfa,current_trigger_state,&state,has_overlaps,overlaps)?;
            branches.push(Branch::new(Some(current_range_start..=range_end),body))
        }
    }
    let catch_all_branch_body = branch_body(dfa,trigger_state,&state,has_overlaps,overlaps)?;
    let catch_all_branch      = Branch::new(None,catch_all_branch_body);
    branches.push(catch_all_branch);
    let arms:Vec<Arm> = branches.into_iter().map(Into::into).collect();
    let mut match_expr:ExprMatch = parse_quote! {
        match u32::from(reader.character()) {
            #(#arms)*
        }
    };
    match_expr.arms = arms;
    Ok(Expr::Match(match_expr))
}

/// Generate the branch body for a transition in the DFA.
pub fn branch_body<S:BuildHasher>
( dfa           : &mut DFA
, target_state  : Identifier
, maybe_state   : &Option<RuleExecutable>
, has_overlaps  : &mut HashMap<usize,bool,S>
, rules_overlap : bool
) -> Result<Block,GenError> {
    if target_state == Identifier::INVALID {
        match maybe_state {
            None => {
                Ok(parse_quote! {{
                    StageStatus::ExitFail
                }})
            },
            Some(rule_exec) => {
                let rule:Expr = match parse_str(rule_exec.code.as_str()) {
                    Ok(rule) => rule,
                    Err(_)   => return Err(GenError::BadExpression(rule_exec.code.clone()))
                };
                if rules_overlap {
                    Ok(parse_quote! {{
                        let rule_bookmark    = self.bookmarks.rule_bookmark;
                        let matched_bookmark = self.bookmarks.matched_bookmark;
                        self.bookmarks.rewind(rule_bookmark,reader);
                        self.current_match = reader.pop_result();
                        self.#rule(reader);
                        self.bookmarks.bookmark(matched_bookmark,reader);
                        StageStatus::ExitSuccess
                    }})
                } else {
                    Ok(parse_quote! {{
                        let matched_bookmark = self.bookmarks.matched_bookmark;
                        self.current_match   = reader.pop_result();
                        self.#rule(reader);
                        self.bookmarks.bookmark(matched_bookmark,reader);
                        StageStatus::ExitSuccess
                    }})
                }
            }
        }
    } else {
        let target_state_has_no_rule = match maybe_state {
            Some(state) => if !dfa.has_rule_for(target_state) {
                dfa.callbacks[target_state.id] = Some(state.clone());
                has_overlaps.insert(target_state.id,true);
                true
            } else {
                false
            },
            None => false
        };

        let state_id = Literal::usize_unsuffixed(target_state.id);
        let ret:Expr = parse_quote! {
            StageStatus::ContinueWith(#state_id.into())
        };

        if target_state_has_no_rule && !rules_overlap {
            Ok(parse_quote! {{
                let rule_bookmark = self.bookmarks.rule_bookmark;
                self.bookmarks.bookmark(rule_bookmark,reader);
                #ret
            }})
        } else {
            Ok(parse_quote! {{
                #ret
            }})
        }
    }
}

/// Generate the dispatch function for a given lexer state.
///
/// This dispatch function is responsible for dispatching based on the sub-state of any given lexer
/// state, and is the main part of implementing the actual lexer transitions.
pub fn dispatch_in_state(dfa:&DFA, id:usize) -> Result<ImplItem,GenError> {
    let dispatch_name:Ident = str_to_ident(format!("dispatch_in_state_{}",id))?;
    let state_names  = dfa.links.row_indices().map(|ix| (ix, name_for_step(id,ix))).collect_vec();
    let mut branches = Vec::with_capacity(state_names.len());
    for (ix,name) in state_names.into_iter() {
        let literal = Literal::usize_unsuffixed(ix);
        let arm:Arm = parse_quote! {
            #literal => self.#name(reader),
        };
        branches.push(arm);
    }

    let pattern_match:ExprMatch = parse_quote! {
        match new_state_index.into() {
            #(#branches)*
            _ => unreachable_panic!("Unreachable state reached in lexer.")
        }
    };
    let func:ImplItem = parse_quote! {
        fn #dispatch_name<R:LazyReader>
        ( &mut self
        , new_state_index:SubStateId
        , reader:&mut R
        ) -> StageStatus {
            #pattern_match
        }
    };

    Ok(func)
}

/// Generate a name for a given step function.
pub fn name_for_step(in_state:usize, to_state:usize) -> Ident {
    let name_str = format!("state_{}_to_{}",in_state,to_state);
    parse_str(name_str.as_str()).expect("The constructed name must be a valid identifier.")
}

/// Generate an executable rule function for a given lexer state.
pub fn rule_for_state(state:&State) -> Result<ImplItem,GenError> {
    match &state.name {
        None => unreachable_panic!("Rule for state requested, but state has none."),
        Some(name) => {
            let rule_name = str_to_ident(name)?;
            let code:Expr = match parse_str(state.callback.as_str()) {
                Ok(expr) => expr,
                Err(_)   => return Err(GenError::BadExpression(state.callback.clone()))
            };
            if !has_reader_arg(&code) {
                return Err(GenError::BadCallbackArgument)
            }

            let tree:ImplItem = parse_quote! {
                fn #rule_name<R:LazyReader>(&mut self, reader:&mut R) {
                    #code
                }
            };
            Ok(tree)
        }
    }
}

/// Checks if the given `expr` is a function or method call that passes `reader` as its first
/// argument.
#[allow(clippy::cmp_owned)]
pub fn has_reader_arg(expr:&Expr) -> bool {
    match expr {
        Expr::MethodCall(expr) => match expr.args.first() {
            Some(Expr::Path(path)) => {
                match path.path.segments.first() {
                    Some(segment) => {
                        segment.ident.to_string() == "reader"
                    }
                    _ => false
                }
            }
            _ => false
        },
        Expr::Call(expr) => match expr.args.first() {
            Some(Expr::Path(path)) => {
                match path.path.segments.first() {
                    Some(segment) => {
                        segment.ident.to_string() == "reader"
                    }
                    _ => false
                }
            }
            _ => false
        }
        _ => false
    }
}



// ================
// === GenError ===
// ================

/// Errors that arise during code generation.
#[derive(Clone,Debug,PartialEq)]
pub enum GenError {
    /// The callback function does not take a single argument `reader`.
    BadCallbackArgument,
    /// The provided string is not a valid rust identifier.
    BadIdentifier(String),
    /// The provided expression isn't a valid rust expression.
    BadExpression(String),
    /// The provided string is not a valid rust literal.
    BadLiteral(String),
    /// The provided string is not a valid rust path.
    BadPath(String),
}


// === Trait Impls ===

impl Display for GenError {
    fn fmt(&self, f:&mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            GenError::BadCallbackArgument => write!(f,
                "Bad argument to a callback function. It must take a single argument `reader`."
            ),
            GenError::BadIdentifier(str) => write!(f,"`{}` is not a valid rust identifier.",str),
            GenError::BadExpression(str) => write!(f,"`{}` is not a valid rust expression.",str),
            GenError::BadLiteral(str)    => write!(f,"`{}` is not a valid rust literal.",str),
            GenError::BadPath(str)       => write!(f,"`{}` is not a valid rust path.",str),
        }
    }
}



// ==============
// === Branch ===
// ==============

/// A representation of a dispatch branch for helping to generate pattern arms.
#[allow(missing_docs)]
#[derive(Clone,Debug,PartialEq)]
struct Branch {
    pub range:Option<RangeInclusive<u32>>,
    pub body:Block
}

impl Branch {
    /// Create a new branch, from the provided `range` and with `body` as the code it executes.
    pub fn new(range:Option<RangeInclusive<u32>>, body:Block) -> Branch {
        Branch {range,body}
    }
}


// === Trait Impls ===

impl Into<Arm> for Branch {
    fn into(self) -> Arm {
        let body = self.body;
        match self.range {
            Some(range) => {
                let range_start = Literal::u32_unsuffixed(*range.start());
                let range_end   = Literal::u32_unsuffixed(*range.end());
                if range.start() == range.end() {
                    parse_quote! {
                        #range_start => #body,
                    }
                } else {
                    parse_quote! {
                        #range_start..=#range_end => #body,
                    }
                }
            }
            None => parse_quote! {
                _ => #body,
            }
        }
    }
}



// =================
// === Utilities ===
// =================

/// Convert a string to an identifier.
pub fn str_to_ident(str:impl Str) -> Result<Ident,GenError> {
    parse_str(str.as_ref()).map_err(|_| GenError::BadIdentifier(str.into()))
}

/// Convert a string to a path.
pub fn str_to_path(str:impl Str) -> Result<Path,GenError> {
    parse_str(str.as_ref()).map_err(|_| GenError::BadPath(str.into()))
}

/// Convert the syntax tree into a string.
pub fn show_code(tokens:&impl ToTokens) -> String {
    repr(tokens)
}
@ -1,366 +0,0 @@
//! This module provides an API for grouping multiple flexer rules.

use crate::automata::nfa::NFA;
use crate::automata::pattern::Pattern;
use crate::group::rule::Rule;

use itertools::Itertools;
use std::fmt::Display;
use wasm_bindgen::__rt::core::fmt::Formatter;

pub mod rule;



// ================
// === Registry ===
// ================

/// The group Registry is a container for [`Group`]s in the flexer implementation.
///
/// It allows groups to contain associations between themselves, and also implements useful
/// conversions for groups.
#[derive(Clone,Debug,Default)]
pub struct Registry {
    /// The groups defined for the lexer.
    groups:Vec<Group>
}

impl Registry {
    /// Defines a new group of rules for the lexer with the specified `name` and `parent`.
    ///
    /// It returns the identifier of the newly-created group.
    pub fn define_group
    ( &mut self
    , name         : impl Into<String>
    , parent_index : Option<Identifier>
    ) -> Identifier {
        let id    = self.next_id();
        let group = Group::new(id,name.into(),parent_index);
        self.groups.push(group);
        id
    }

    /// Adds an existing `group` to the registry, updating and returning its identifier.
    pub fn add_group(&mut self, mut group:Group) -> Identifier {
        let new_id = self.next_id();
        group.id   = new_id;
        self.groups.push(group);
        new_id
    }

    /// Creates a rule that matches `pattern` for the group identified by `group_id`.
    ///
    /// Panics if `group_id` refers to a nonexistent group.
    pub fn create_rule(&mut self, group:Identifier, pattern:&Pattern, callback:impl AsRef<str>) {
        let group = self.group_mut(group);
        group.create_rule(pattern,callback.as_ref());
    }

    /// Associates the provided `rule` with the group identified by `group_id`.
    ///
    /// Panics if `group_id` refers to a nonexistent group.
    pub fn add_rule(&mut self, group:Identifier, rule:Rule) {
        let group = self.group_mut(group);
        group.add_rule(rule);
    }

    /// Collates the entire set of rules that are matchable when the lexer has the group identified
    /// by `group_id` as active.
    ///
    /// This set of rules includes the rules inherited from any parent groups.
    pub fn rules_for(&self, group:Identifier) -> Vec<&Rule> {
        let group_handle = self.group(group);
        let mut parent   = group_handle.parent_index.map(|p| self.group(p));
        let mut rules    = (&group_handle.rules).iter().collect_vec();
        while let Some(parent_group) = parent {
            if parent_group.id == group_handle.id {
                panic!("There should not be cycles in parent links for lexer groups.")
            }
            rules.extend((&parent_group.rules).iter());
            parent = parent_group.parent_index.map(|p| self.group(p));
        }
        rules
    }

    /// Obtains a reference to the group for the given `group_id`.
    ///
    /// As group identifiers can only be created by use of this `Registry`, this will always
    /// succeed.
    pub fn group(&self, group:Identifier) -> &Group {
        self.groups.get(group.0).expect("The group must exist.")
    }

    /// Obtains a mutable reference to the group for the given `group_id`.
    ///
    /// As group identifiers can only be created by use of this `Registry`, this will always
    /// succeed.
    pub fn group_mut(&mut self, group:Identifier) -> &mut Group {
        self.groups.get_mut(group.0).expect("The group should exist.")
    }

    /// Converts the group identified by `group_id` into an NFA.
    ///
    /// Panics if the group does not exist.
    pub fn to_nfa_from(&self, group:Identifier) -> NFA {
        let group     = self.group(group);
        let mut nfa   = NFA::default();
        let start     = nfa.new_state();
        let build     = |rule:&Rule| nfa.new_pattern(start,&rule.pattern);
        let rules     = self.rules_for(group.id);
        let callbacks = rules.iter().map(|r| r.callback.clone()).collect_vec();
        let states    = rules.into_iter().map(build).collect_vec();
        let end       = nfa.new_state();
        for (ix,state) in states.into_iter().enumerate() {
            nfa.states[state.id].name     = Some(group.callback_name(ix));
            nfa.states[state.id].callback = callbacks.get(ix).unwrap().clone();
            nfa.connect(state,end);
        }
        nfa
    }

    /// Generates the next group identifier for this registry.
    fn next_id(&self) -> Identifier {
        let val = self.groups.len();
        Identifier(val)
    }

    /// Get an immutable reference to the groups contained within the registry.
    pub fn all(&self) -> &Vec<Group> {
        &self.groups
    }
}
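// A sketch of the registry API in use (not part of the original file); the group names, rules,
// and callback strings are illustrative. Callbacks must name a method that takes `reader` as
// its argument, as required by the code generator.
#[allow(dead_code)]
fn example_registry() -> Registry {
    let mut registry = Registry::default();
    let root         = registry.define_group("ROOT",None);
    registry.create_rule(root,&Pattern::range('a'..='z').many1(),"self.on_ident(reader)");
    // Groups may inherit rules from a parent, here from `root`.
    let in_text      = registry.define_group("IN_TEXT",Some(root));
    registry.create_rule(in_text,&Pattern::not('"'),"self.on_text_char(reader)");
    registry
}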
// ==================
// === Identifier ===
// ==================

/// An identifier for a group.
#[allow(missing_docs)]
#[derive(Copy,Clone,Debug,Default,Eq,PartialEq)]
pub struct Identifier(usize);


// === Trait Impls ===

impl From<usize> for Identifier {
    fn from(id:usize) -> Self {
        Identifier(id)
    }
}

impl From<&usize> for Identifier {
    fn from(id:&usize) -> Self {
        Identifier(*id)
    }
}

impl Into<usize> for Identifier {
    fn into(self) -> usize {
        self.0
    }
}



// ===========
// == Group ==
// ===========

/// A group is a structure for associating multiple rules with each other, and is the basic building
/// block of the flexer.
///
/// A group consists of the following:
///
/// - A set of [`Rule`s](Rule), each containing a regex pattern and associated callback.
/// - Inherited rules from a parent group, if such a group exists.
///
/// Internally, the flexer maintains a stack of groups, where only one group can be active at any
/// given time. Rules are matched _in order_, and hence overlaps are handled by the order in which
/// the rules are matched, with the first callback being triggered.
///
/// Whenever a [`rule.pattern`](Rule::pattern) from the active group is matched against part of the
/// input, the associated [`rule.callback`](Rule::callback) is executed. This callback may exit the
/// current group or even enter a new one. As a result, groups allow us to elegantly model a
/// situation where certain parts of a program (e.g. within a string literal) have very different
/// lexing rules than other portions of a program (e.g. the body of a function).
#[derive(Clone,Debug,Default)]
pub struct Group {
    /// A unique identifier for the group.
    pub id:Identifier,
    /// A name for the group (useful in debugging).
    pub name:String,
    /// The parent group from which rules are inherited.
    ///
    /// It is ensured that the group is held mutably.
    pub parent_index:Option<Identifier>,
    /// A set of flexer rules.
    pub rules:Vec<Rule>,
}

impl Group {

    /// Creates a new group.
    pub fn new(id:Identifier, name:impl Into<String>, parent_index:Option<Identifier>) -> Self {
        let rules = Vec::new();
        Group{id,name:name.into(),parent_index,rules}
    }

    /// Adds a new rule to the current group.
    pub fn add_rule(&mut self, rule:Rule) {
        self.rules.push(rule)
    }

    /// Creates a new rule.
    pub fn create_rule(&mut self, pattern:&Pattern, code:&str) {
        let pattern_clone = pattern.clone();
        let rule          = Rule::new(pattern_clone,code);
        self.rules.push(rule)
    }

    /// The canonical name for a given rule.
    pub fn callback_name(&self, rule_ix:usize) -> String {
        format!("group_{}_rule_{}",self.id.0,rule_ix)
    }
}

// === Trait Impls ===

impl Into<Registry> for Group {
    fn into(self) -> Registry {
        let mut registry = Registry::default();
        registry.add_group(self);
        registry
    }
}

impl Display for Group {
    fn fmt(&self, f:&mut Formatter<'_>) -> std::fmt::Result {
        write!(f,"Group {}",self.name)
    }
}



// =============
// === Tests ===
// =============

#[cfg(test)]
pub mod tests {
    extern crate test;

    use crate::automata::nfa;
    use crate::automata::pattern::Pattern;
    use crate::group::Group;
    use crate::group::Registry;
    use crate::group::rule::Rule;

    use std::default::Default;
    use test::Bencher;
    use enso_prelude::default;

    fn newline() -> Registry {
        let     pattern  = Pattern::char('\n');
        let mut group    = Group::default();
        group.add_rule(Rule::new(pattern,""));
        let mut registry = Registry::default();
        registry.add_group(group);
        registry
    }

    fn letter() -> Registry {
        let     pattern = Pattern::range('a'..='z');
        let mut group   = Group::default();
        group.add_rule(Rule::new(pattern,""));
        group.into()
    }

    fn spaces() -> Registry {
        let     pattern = Pattern::char(' ').many1();
        let mut group   = Group::default();
        group.add_rule(Rule::new(pattern,""));
        group.into()
    }

    fn letter_and_spaces() -> Registry {
        let     letter = Pattern::range('a'..='z');
        let     spaces = Pattern::char(' ').many1();
        let mut group  = Group::default();
        group.add_rule(Rule::new(letter,""));
        group.add_rule(Rule::new(spaces,""));
        group.into()
    }

    fn complex_rules(count:usize) -> Registry {
        let mut group = Group::default();
        for ix in 0..count {
            let string       = ix.to_string();
            let all          = Pattern::all_of(&string);
            let any          = Pattern::any_of(&string);
            let none         = Pattern::none_of(&string);
            let all_any_none = all >> any >> none;
            let pattern      = Pattern::many(&all_any_none);
            group.add_rule(Rule::new(pattern.clone(),""));
        }
        group.into()
    }

    #[test]
    fn test_to_nfa_newline() {
        assert_eq!(newline().to_nfa_from(default()),nfa::tests::newline());
    }

    #[test]
    fn test_to_nfa_letter() {
        assert_eq!(letter().to_nfa_from(default()),nfa::tests::letter());
    }

    #[test]
    fn test_to_nfa_spaces() {
        assert_eq!(spaces().to_nfa_from(default()),nfa::tests::spaces());
    }

    #[test]
    fn test_to_nfa_letter_and_spaces() {
|
||||
let expected = nfa::tests::letter_and_spaces();
|
||||
assert_eq!(letter_and_spaces().to_nfa_from(default()),expected);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_nfa_newline(bencher:&mut Bencher) {
|
||||
bencher.iter(|| newline().to_nfa_from(default()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_nfa_letter(bencher:&mut Bencher) {
|
||||
bencher.iter(|| letter().to_nfa_from(default()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_nfa_spaces(bencher:&mut Bencher) {
|
||||
bencher.iter(|| spaces().to_nfa_from(default()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_to_nfa_letter_and_spaces(bencher:&mut Bencher) {
|
||||
bencher.iter(|| letter_and_spaces().to_nfa_from(default()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_ten_rules(bencher:&mut Bencher) {
|
||||
bencher.iter(|| complex_rules(10).to_nfa_from(default()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_hundred_rules(bencher:&mut Bencher) {
|
||||
bencher.iter(|| complex_rules(100).to_nfa_from(default()))
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_thousand_rules(bencher:&mut Bencher) {
|
||||
bencher.iter(|| complex_rules(1000).to_nfa_from(default()))
|
||||
}
|
||||
}
|
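For orientation, a minimal sketch of how the pieces above compose, mirroring the test helpers in this file; module paths are taken from the test imports, and `default()` yields `Identifier(0)`, the identifier of the first group registered:

use crate::automata::nfa::NFA;
use crate::automata::pattern::Pattern;
use crate::group::rule::Rule;
use crate::group::{Group,Registry};
use enso_prelude::default;

fn spaces_to_nfa() -> NFA {
    // A single group whose only rule matches one or more spaces.
    let mut group = Group::default();
    group.add_rule(Rule::new(Pattern::char(' ').many1(),""));
    // Register the group, then compile its rules into an NFA.
    let mut registry = Registry::default();
    registry.add_group(group);
    registry.to_nfa_from(default())
}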
@ -1,34 +0,0 @@
//! An API for declaring rust-code callbacks to be executed when a given pattern is matched.
//!
//! A flexer rule is a [`crate::automata::pattern`] associated with rust code to be executed as a
//! callback.

use crate::automata::pattern::Pattern;



// ==========
// == Rule ==
// ==========

/// A flexer rule.
#[derive(Clone,Debug)]
pub struct Rule {
    /// The pattern that triggers the callback.
    pub pattern:Pattern,

    /// The code to execute when [`Rule::pattern`] matches, containing rust code as a
    /// [`std::string::String`].
    ///
    /// This code will be called directly from a method defined on your Lexer (the one that contains
    /// a [`crate::Flexer`] instance). To this end, the code you provide as a string must be valid in
    /// that context.
    pub callback:String,
}

impl Rule {
    /// Creates a new rule.
    pub fn new(pattern:Pattern, callback:impl Into<String>) -> Self {
        Rule{pattern,callback:callback.into()}
    }
}
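This file was removed here in favour of the published `enso-flexer` crate, where the same API lives on. A minimal sketch of its use; the `enso_flexer` paths and the `on_foo` handler name are assumptions for illustration:

use enso_flexer::automata::pattern::Pattern;
use enso_flexer::group::rule::Rule;

fn rule_demo() {
    let pattern = Pattern::all_of("foo");
    // The callback is source text, not a closure: `specialize` later splices it
    // into the generated lexer, where it receives the `reader` argument.
    let rule = Rule::new(pattern,"self.on_foo(reader)");
    assert_eq!(rule.callback,"self.on_foo(reader)");
}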
File diff suppressed because it is too large
@ -1,446 +0,0 @@
//! This file contains tests for the user-facing error-handling logic in the flexer code generator.
//!
//! This file includes quite a bit of duplicated code, but this is known and intentional as it
//! allows for increased clarity in the testing.

#![allow(missing_docs)]

use crate::prelude::LazyReader;
use crate::prelude::logger::AnyLogger;
use crate::prelude::logger::Disabled;
use crate::prelude::reader::BookmarkManager;
use flexer::*;
use flexer::automata::pattern::Pattern;
use flexer::Flexer;
use flexer::generate;
use flexer::group::{Registry, Identifier};
use flexer::group;
use flexer::prelude::*;
use flexer::State;
use flexer;



// ====================
// === Type Aliases ===
// ====================

type Logger = Disabled;



// ====================
// === Shared Setup ===
// ====================

/// A token type for these lexers.
#[derive(Copy,Clone,Debug,PartialEq)]
pub enum Token {
    Foo,
    Bar
}

/// An output type for these lexers.
#[allow(missing_docs)]
#[derive(Clone,Debug,Default,PartialEq)]
pub struct Output {
    tokens:Vec<Token>
}

/// A testing lexer state.
pub struct LexerState {
    lexer_states:group::Registry,
    initial_state:group::Identifier,
}
impl flexer::State for LexerState {
    fn new(_logger:&impl AnyLogger) -> Self {
        let mut lexer_states = group::Registry::default();
        let initial_state = lexer_states.define_group("ROOT",None);
        LexerState{lexer_states,initial_state}
    }

    fn initial_state(&self) -> Identifier {
        self.initial_state
    }

    fn groups(&self) -> &Registry {
        &self.lexer_states
    }

    fn groups_mut(&mut self) -> &mut Registry {
        &mut self.lexer_states
    }

    fn bookmarks(&self) -> &BookmarkManager {
        unimplemented!()
    }

    fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
        unimplemented!()
    }

    fn specialize(&self) -> Result<String,GenError> {
        // Note [Naming "Lexer"]
        generate::specialize(self,"Lexer","Output")
    }
}

/* Note [Naming "Lexer"]
 * ~~~~~~~~~~~~~~~~~~~~~
 * In general, the name passed to `specialize` should match that of your lexer definition. However
 * here, as we never compile the code, we set it to a generic constant that is a valid rust
 * identifier so as to reduce testing boilerplate.
 */



// ====================
// === Definition 1 ===
// ====================

pub struct Lexer1 {
    lexer:Flexer<LexerState,Output,Logger>
}

impl Deref for Lexer1 {
    type Target = Flexer<LexerState,Output,Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}

impl DerefMut for Lexer1 {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.lexer
    }
}

impl Lexer1 {
    pub fn new() -> Lexer1 {
        let logger = Logger::new("Lexer1");
        let lexer = Flexer::new(logger);
        Lexer1 {lexer}
    }

    pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
        unimplemented!()
    }
}

impl flexer::Definition for Lexer1 {
    fn define() -> Self {
        let mut lexer = Self::new();

        let foo = Pattern::all_of("foo");

        let root_group_id = lexer.initial_state();
        let root_group = lexer.groups_mut().group_mut(root_group_id);
        root_group.create_rule(&foo, "ETERNAL SCREAMING");

        lexer
    }

    fn groups(&self) -> &Registry {
        self.lexer.groups()
    }

    fn set_up(&mut self) {
        unimplemented!()
    }

    fn tear_down(&mut self) {
        unimplemented!()
    }
}

#[test]
fn test_bad_rule_expression() {
    let lexer = Lexer1::define();
    let result = lexer.specialize();
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert_eq!(message,"`ETERNAL SCREAMING` is not a valid rust expression.");
}



// ====================
// === Definition 2 ===
// ====================

pub struct Lexer2 {
    lexer:Flexer<LexerState,Output,Logger>
}

impl Deref for Lexer2 {
    type Target = Flexer<LexerState,Output,Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}

impl DerefMut for Lexer2 {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.lexer
    }
}

impl Lexer2 {
    pub fn new() -> Lexer2 {
        let logger = Logger::new("Lexer2");
        let lexer = Flexer::new(logger);
        Lexer2{lexer}
    }

    pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
        unimplemented!()
    }
}

impl flexer::Definition for Lexer2 {
    fn define() -> Self {
        let mut lexer = Self::new();

        let foo = Pattern::all_of("foo");

        let root_group_id = lexer.initial_state();
        let root_group = lexer.groups_mut().group_mut(root_group_id);
        root_group.create_rule(&foo, "self.test_function_no_reader()");

        lexer
    }

    fn groups(&self) -> &Registry {
        self.lexer.groups()
    }

    fn set_up(&mut self) {
        unimplemented!()
    }

    fn tear_down(&mut self) {
        unimplemented!()
    }
}

#[test]
pub fn test_no_reader_arg() {
    let lexer = Lexer2::define();
    let result = lexer.specialize();
    let expected_message =
        "Bad argument to a callback function. It must take a single argument `reader`.";
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert_eq!(message,expected_message);
}



// ====================
// === Definition 3 ===
// ====================

pub struct Lexer3 {
    lexer:Flexer<LexerState1,Output,Logger>
}

impl Deref for Lexer3 {
    type Target = Flexer<LexerState1,Output,Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}

impl DerefMut for Lexer3 {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.lexer
    }
}

impl Lexer3 {
    pub fn new() -> Lexer3 {
        let logger = Logger::new("Lexer3");
        let lexer = Flexer::new(logger);
        Lexer3{lexer}
    }

    pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
        unimplemented!()
    }
}

impl flexer::Definition for Lexer3 {
    fn define() -> Self {
        let mut lexer = Self::new();

        let foo = Pattern::all_of("foo");

        let root_group_id = lexer.initial_state();
        let root_group = lexer.groups_mut().group_mut(root_group_id);
        root_group.create_rule(&foo, "self.test_function_reader(reader)");

        lexer
    }

    fn groups(&self) -> &Registry {
        self.lexer.groups()
    }

    fn set_up(&mut self) {
        unimplemented!()
    }

    fn tear_down(&mut self) {
        unimplemented!()
    }
}

pub struct LexerState1 {
    lexer_states:group::Registry,
    initial_state:group::Identifier,
}
impl flexer::State for LexerState1 {
    fn new(_logger:&impl AnyLogger) -> Self {
        let mut lexer_states = group::Registry::default();
        let initial_state = lexer_states.define_group("ROOT",None);
        LexerState1 {lexer_states,initial_state}
    }

    fn initial_state(&self) -> Identifier {
        self.initial_state
    }

    fn groups(&self) -> &Registry {
        &self.lexer_states
    }

    fn groups_mut(&mut self) -> &mut Registry {
        &mut self.lexer_states
    }

    fn bookmarks(&self) -> &BookmarkManager {
        unimplemented!()
    }

    fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
        unimplemented!()
    }

    fn specialize(&self) -> Result<String,GenError> {
        generate::specialize(self,"Bad Lexer Name","Output")
    }
}

#[test]
pub fn test_bad_state_name() {
    let lexer = Lexer3::define();
    let result = lexer.specialize();
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert_eq!(message,"`Bad Lexer Name` is not a valid rust identifier.");
}



// ====================
// === Definition 4 ===
// ====================

pub struct Lexer4 {
    lexer:Flexer<LexerState2,Output,Logger>
}

impl Deref for Lexer4 {
    type Target = Flexer<LexerState2,Output,Logger>;
    fn deref(&self) -> &Self::Target {
        &self.lexer
    }
}

impl DerefMut for Lexer4 {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.lexer
    }
}

impl Lexer4 {
    pub fn new() -> Lexer4 {
        let logger = Logger::new("Lexer4");
        let lexer = Flexer::new(logger);
        Lexer4{lexer}
    }

    pub fn my_test_fun<R:LazyReader>(&mut self, _reader:&mut R) {
        unimplemented!()
    }
}

impl flexer::Definition for Lexer4 {
    fn define() -> Self {
        let mut lexer = Self::new();

        let foo = Pattern::all_of("foo");

        let root_group_id = lexer.initial_state();
        let root_group = lexer.groups_mut().group_mut(root_group_id);
        root_group.create_rule(&foo, "self.test_function_reader(reader)");

        lexer
    }

    fn groups(&self) -> &Registry {
        self.lexer.groups()
    }

    fn set_up(&mut self) {
        unimplemented!()
    }

    fn tear_down(&mut self) {
        unimplemented!()
    }
}

pub struct LexerState2 {
    lexer_states:group::Registry,
    initial_state:group::Identifier,
}
impl flexer::State for LexerState2 {
    fn new(_logger:&impl AnyLogger) -> Self {
        let mut lexer_states = group::Registry::default();
        let initial_state = lexer_states.define_group("ROOT",None);
        LexerState2 {lexer_states,initial_state}
    }

    fn initial_state(&self) -> Identifier {
        self.initial_state
    }

    fn groups(&self) -> &Registry {
        &self.lexer_states
    }

    fn groups_mut(&mut self) -> &mut Registry {
        &mut self.lexer_states
    }

    fn bookmarks(&self) -> &BookmarkManager {
        unimplemented!()
    }

    fn bookmarks_mut(&mut self) -> &mut BookmarkManager {
        unimplemented!()
    }

    fn specialize(&self) -> Result<String,GenError> {
        generate::specialize(self,"Lexer4","Bad output name")
    }
}

#[test]
pub fn test_bad_output_name() {
    let lexer = Lexer4::define();
    let result = lexer.specialize();
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert_eq!(message,"`Bad output name` is not a valid rust path.");
}
@ -12,7 +12,7 @@ test = true
bench = true

[dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }
enso-prelude = { version = "0.1.3" }
enso-flexer = { version = "0.1.3" }
enso-prelude = { version = "0.1.7" }

uuid = { version = "0.8.1" , features = ["serde","v4","wasm-bindgen"] }
344 lib/rust/lexer/definition/src/escape.rs Normal file
@ -0,0 +1,344 @@
//! This crate describes valid escape sequences inside Enso text literals.

use crate::prelude::*;

use crate::lexeme;
use crate::library::token;
use crate::token::Shape;
use crate::token::EscapeStyle;



// =======================
// === EscapeCharacter ===
// =======================

/// A representation of an escape character.
#[derive(Clone,Debug,Default,Eq,PartialEq)]
pub struct EscapeCharacter {
    /// The lexing representation of the escape.
    ///
    /// This is the literal string that must occur in the Enso source code to be interpreted as this
    /// escape code.
    pub pattern : String,
    /// The literal representation of the escape.
    ///
    /// This is the character-level encoding of this escape character in Rust, as the Rust escape
    /// representation and the Enso escape representation may differ, or Rust may not support the
    /// same literal escape code as Enso.
    pub repr : String,
}
impl EscapeCharacter {
    fn new(pattern:impl Str, repr:impl Str) -> EscapeCharacter {
        let pattern = pattern.into();
        let repr = repr.into();
        Self{pattern,repr}
    }

    /// The set of character escape codes that Enso supports.
    pub fn codes() -> Vec<EscapeCharacter> {
        vec![
            // === Null ===
            Self::new(r"\0","\0"),

            // === Newlines ===
            Self::new(r"\n","\n"),
            Self::new(r"\r","\r"),
            Self::new(r"\f","\x0C"),

            // === Tabs ===
            Self::new(r"\t","\t"),
            Self::new(r"\v","\x0B"),

            // === Backspace ===
            Self::new(r"\b","\x08"),

            // === Misc ===
            Self::new(r"\a","\x07"),
        ]
    }
}



// =================
// === Utilities ===
// =================

/// Check if `c` is a hexadecimal digit.
fn is_hex_digit(c:char) -> bool {
    let small_letters = 'a'..='f';
    let large_letters = 'A'..='F';
    let digits = '0'..='9';
    small_letters.contains(&c) || large_letters.contains(&c) || digits.contains(&c)
}



// ======================
// === EscapeSequence ===
// ======================

/// A trait representing various kinds of escape sequence.
///
/// An escape sequence built using this trait will have its digits calculated by stripping the
/// [`Self::prefix_length()`] and [`Self::suffix_length()`] from the input string, and then
/// validated using [`Self::digits_min_length()`], [`Self::digits_max_length()`], and
/// [`Self::validator()`]. All digits must be valid hexadecimal digits as defined by
/// [`is_hex_digit`] above.
///
/// In addition, the implementation must define [`Self::style_on_success()`] and
/// [`Self::style_on_failure()`] to determine the type of escape output on success and failure.
pub trait EscapeSequence {
    /// Create a token of the relevant escape sequence type.
    ///
    /// This function should be passed the _full_ match for the escape sequence as `repr`, including
    /// the delimiters. For example, if we have the escape sequence `\uAFAF`, we want to pass the
    /// whole string `"\uAFAF"`, not just `"AFAF"`, to this function.
    fn build(repr:impl Str) -> Shape {
        if let Some(digits) = Self::validate(repr.as_ref()) {
            Shape::text_segment_escape(Self::style_on_success(),digits)
        } else {
            Shape::text_segment_escape(Self::style_on_failure(),repr)
        }
    }

    /// Obtain the digits portion of the escape sequence.
    fn get_digits(repr:&str) -> &str {
        let start = Self::prefix_length();
        let end = repr.len().saturating_sub(Self::suffix_length());
        &repr[start..end]
    }

    /// Validate the provided unicode string for this type of escape sequence.
    fn validate(repr:&str) -> Option<String> {
        let digits = Self::get_digits(repr);
        let ge_min = digits.len() >= Self::digits_min_length();
        let le_max = digits.len() <= Self::digits_max_length();
        let valid_length = ge_min && le_max;
        let valid_escape = Self::validator(digits);
        let valid_digits = digits.chars().all(is_hex_digit);
        let is_valid = valid_length && valid_escape && valid_digits;
        is_valid.as_some(digits.into())
    }

    /// Return the length of the escape prefix.
    ///
    /// The prefix is the characters that need to be stripped from the front of the escape sequence
    /// to get, in conjunction with [`EscapeSequence::suffix_length()`], the escape value itself.
    fn prefix_length() -> usize;

    /// Return the length of the escape suffix.
    ///
    /// The suffix is the characters that need to be stripped from the end of the escape sequence to
    /// get, in conjunction with [`EscapeSequence::prefix_length()`], the escape value itself.
    ///
    /// This defaults to `0`.
    fn suffix_length() -> usize { 0 }

    /// Return the minimum number of digits accepted by the escape sequence type.
    fn digits_min_length() -> usize;

    /// Return the maximum number of digits accepted by the escape sequence type.
    ///
    /// This defaults to `digits_min_length()`.
    fn digits_max_length() -> usize { Self::digits_min_length() }

    /// A validator for any additional properties of the escape sequence.
    ///
    /// It will be passed the _digits_ of the escape sequence, as defined by
    /// [`EscapeSequence::get_digits()`], and has a default implementation that always succeeds.
    /// Please implement this validator yourself if you would like to assert _additional_ properties
    /// on your escape sequence.
    fn validator(_digits:&str) -> bool { true }

    /// The style of escape after successful validation.
    fn style_on_success() -> token::EscapeStyle;

    /// The style of escape after unsuccessful validation.
    fn style_on_failure() -> token::EscapeStyle;
}



// ==================
// === ByteEscape ===
// ==================

/// A validator for ASCII escapes.
///
/// An ascii escape begins with the sequence `\x` and is followed by two hexadecimal digits (e.g.
/// `\x0F`).
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct Byte;
impl EscapeSequence for Byte {
    fn prefix_length() -> usize { lexeme::len(lexeme::literal::BYTE_ESCAPE_START) }
    fn digits_min_length() -> usize { 2 }
    fn style_on_success() -> EscapeStyle { token::EscapeStyle::Byte }
    fn style_on_failure() -> EscapeStyle { token::EscapeStyle::Invalid }
}



// ===========
// === U16 ===
// ===========

/// A validator for U16 unicode escapes.
///
/// A U16 unicode escape begins with the sequence `\u` and is followed by four hexadecimal digits,
/// e.g. `\u0F0F`.
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct U16;
impl EscapeSequence for U16 {
    fn prefix_length() -> usize { lexeme::len(lexeme::literal::U16_ESCAPE_START) }
    fn digits_min_length() -> usize { 4 }
    fn style_on_success() -> EscapeStyle { token::EscapeStyle::U16 }
    fn style_on_failure() -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
}



// ===========
// === U21 ===
// ===========

/// A validator for U21 unicode escapes.
///
/// A U21 unicode escape begins with the sequence `\u`, followed by a sequence of 1-6 hexadecimal
/// digits enclosed in braces (`{}`). Both `\u{F}` and `\u{AFAFAF}` are valid U21 escapes.
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct U21;
impl EscapeSequence for U21 {
    fn prefix_length() -> usize { lexeme::len(lexeme::literal::U21_ESCAPE_START) }
    fn suffix_length() -> usize { lexeme::len(lexeme::literal::U21_ESCAPE_END) }
    fn digits_min_length() -> usize { 1 }
    fn digits_max_length() -> usize { 6 }
    fn style_on_success() -> EscapeStyle { token::EscapeStyle::U21 }
    fn style_on_failure() -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
}



// ===========
// === U32 ===
// ===========

/// A validator for U32 unicode escapes.
///
/// A U32 unicode escape begins with the sequence `\U`, followed by 8 hexadecimal digits. Due to the
/// restrictions of unicode, the first two digits _must_ be zero (e.g. `\U00AFAFAF`).
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct U32;
impl EscapeSequence for U32 {
    fn prefix_length() -> usize { lexeme::len(lexeme::literal::U32_ESCAPE_START) }
    fn digits_min_length() -> usize { 8 }
    fn validator(digits: &str) -> bool { digits.starts_with("00") }
    fn style_on_success() -> EscapeStyle { token::EscapeStyle::U32 }
    fn style_on_failure() -> EscapeStyle { token::EscapeStyle::InvalidUnicode }
}



// =============
// === Tests ===
// =============

#[cfg(test)]
mod test {
    use super::*;


    // === Utilities ===

    /// Tests a valid input to ensure that it succeeds.
    fn test_valid<Esc:EscapeSequence>(escape:&str, out:&str, out_style:token::EscapeStyle) {
        let shape = Shape::text_segment_escape(out_style,out);
        assert_eq!(Esc::build(escape),shape);
    }

    /// Tests invalid inputs to ensure they fail for the provided escape type `Esc`.
    fn test_invalid<Esc:EscapeSequence>(invalid_cases:Vec<&str>, fail_with:token::EscapeStyle) {
        for escape in invalid_cases {
            let shape = Shape::text_segment_escape(fail_with,escape);
            assert_eq!(Esc::build(escape),shape)
        }
    }


    // === Is Hex Digit ===

    #[test]
    fn test_is_hex_digit() {
        for val in u8::min_value()..=u8::max_value() {
            let char = char::from(val);
            let is_in_small = ('a'..='f').contains(&char);
            let is_in_large = ('A'..='F').contains(&char);
            let is_in_dec_digit = ('0'..='9').contains(&char);
            let expected_result = is_in_small || is_in_large || is_in_dec_digit;
            assert_eq!(is_hex_digit(char),expected_result);
        }
    }


    // === Build ===

    #[test]
    fn test_byte_build_valid() {
        test_valid::<Byte>(r"\x05","05",token::EscapeStyle::Byte);
    }

    #[test]
    fn test_byte_build_invalid() {
        test_invalid::<Byte>(vec![
            r"\x5",
            r"\x",
            r"\x033",
            r"\xz2",
        ],token::EscapeStyle::Invalid);
    }

    #[test]
    fn test_u16_build_valid() {
        test_valid::<U16>(r"\u4fe3","4fe3",token::EscapeStyle::U16);
    }

    #[test]
    fn test_u16_build_invalid() {
        test_invalid::<U16>(vec![
            r"\u123",
            r"\u",
            r"\u123aff",
            r"\uazaz",
        ],token::EscapeStyle::InvalidUnicode);
    }

    #[test]
    fn test_u21_build_valid() {
        test_valid::<U21>(r"\u{fa4e}","fa4e",token::EscapeStyle::U21);
    }

    #[test]
    fn test_u21_build_invalid() {
        test_invalid::<U21>(vec![
            r"\u{1234567}",
            r"\u{}",
        ],token::EscapeStyle::InvalidUnicode);
    }

    #[test]
    fn test_u32_build_valid() {
        test_valid::<U32>(r"\U0014A890","0014A890",token::EscapeStyle::U32);
    }

    #[test]
    fn test_u32_build_invalid() {
        test_invalid::<U32>(vec![
            r"\U12121212",
            r"\U",
            r"\U001234",
            r"\U001234567"
        ],token::EscapeStyle::InvalidUnicode);
    }
}
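The trait's defaults make new escape kinds cheap to define. A minimal sketch of a hypothetical two-digit `\d`-style escape, shown only to illustrate how the pieces compose; the `\d` prefix and its hard-coded length are assumptions, not Enso lexemes:

/// A validator for a hypothetical `\dNN` escape (illustration only).
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct D2;
impl EscapeSequence for D2 {
    fn prefix_length() -> usize { 2 } // Length of the assumed `\d` prefix.
    fn digits_min_length() -> usize { 2 }
    fn style_on_success() -> EscapeStyle { token::EscapeStyle::Byte }
    fn style_on_failure() -> EscapeStyle { token::EscapeStyle::Invalid }
}

// `D2::build(r"\d0f")` validates and keeps the digits "0f", while
// `D2::build(r"\dzz")` fails `is_hex_digit` and keeps the full repr
// under the failure style.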
301 lib/rust/lexer/definition/src/lexeme.rs Normal file
@ -0,0 +1,301 @@
//! This module defines the base lexemes for the Enso language.

use crate::prelude::*;

use enso_flexer::automata::pattern::Pattern;



// =================================
// === Basic Pattern Definitions ===
// =================================

/// Basic lexemes as patterns.
///
/// These must _only_ be used as part of the lexer definition, not at runtime, as they are not
/// performant.
pub mod definition_pattern {
    use super::*;

    /// Match lower-case ASCII letters.
    pub fn lower_ascii_letter() -> Pattern {
        Pattern::range('a'..='z')
    }

    /// Match upper-case ASCII letters.
    pub fn upper_ascii_letter() -> Pattern {
        Pattern::range('A'..='Z')
    }

    /// Match ASCII digits.
    pub fn ascii_digit() -> Pattern {
        Pattern::range('0'..='9')
    }

    /// Match ASCII letters.
    pub fn ascii_letter() -> Pattern {
        lower_ascii_letter() | upper_ascii_letter()
    }

    /// Match ASCII alphanumeric characters.
    pub fn ascii_alpha_num() -> Pattern {
        ascii_digit() | ascii_letter()
    }

    /// Match at least one ASCII space character.
    pub fn spaces() -> Pattern {
        into_pattern(literal::SPACE).many1()
    }

    /// Match the end-of-file character.
    pub fn eof() -> Pattern {
        Pattern::eof()
    }

    /// Match a newline.
    ///
    /// This matches both Unix (LF) and Windows (CRLF) styles of newlines. This is particularly
    /// important so as not to result in incorrect spans on Windows clients.
    pub fn newline() -> Pattern {
        let lf = into_pattern(literal::LF);
        let crlf = into_pattern(literal::CRLF);
        lf | crlf
    }

    /// The characters that break tokens in Enso.
    pub fn whitespace_break_chars() -> String {
        [literal::TAB,literal::LF,literal::CR].concat()
    }

    /// The characters that break token lexing in Enso.
    pub fn break_chars() -> String {
        [
            literal::INTERPOLATE_QUOTE,
            literal::COMMENT,
            literal::ANNOTATION_SYMBOL,
            literal::SPACE,
            literal::COMMA,
            literal::DOT,
            literal::OPERATOR_CHARS,
            literal::GROUP_CHARS,
            &whitespace_break_chars()
        ].concat()
    }

    /// Adds the basic characters not allowed in a raw segment in a format text literal.
    fn add_base_format_disallows(chars:&mut String) {
        chars.push_str(literal::INTERPOLATE_QUOTE);
        chars.push_str(literal::SLASH);
        chars.push_str(literal::LF);
        chars.push_str(literal::CR);
    }

    /// Characters allowable inside a raw segment in a format line.
    pub fn format_line_raw_char() -> Pattern {
        let mut chars = String::new();
        chars.push_str(literal::FORMAT_QUOTE);
        add_base_format_disallows(&mut chars);
        Pattern::none_of(&chars)
    }

    /// Characters allowable inside a raw segment in a format block.
    pub fn format_block_raw_char() -> Pattern {
        let mut chars = String::new();
        add_base_format_disallows(&mut chars);
        Pattern::none_of(&chars)
    }

    /// Adds the basic characters not allowed in a raw segment in a raw text literal.
    fn add_base_raw_disallows(chars:&mut String) {
        chars.push_str(literal::SLASH);
        chars.push_str(literal::LF);
        chars.push_str(literal::CR);
    }

    /// Characters allowable inside a raw segment in a raw line.
    pub fn raw_line_raw_char() -> Pattern {
        let mut chars = String::new();
        chars.push_str(literal::RAW_QUOTE);
        add_base_raw_disallows(&mut chars);
        Pattern::none_of(&chars)
    }

    /// Characters allowable inside a raw segment in a raw block.
    pub fn raw_block_raw_char() -> Pattern {
        let mut chars = String::new();
        add_base_raw_disallows(&mut chars);
        Pattern::none_of(&chars)
    }

    /// The characters allowed as digits in a unicode escape.
    pub fn unicode_escape_digit() -> Pattern {
        let chars = &[
            literal::FORMAT_QUOTE,
            literal::RAW_QUOTE,
            literal::INTERPOLATE_QUOTE,
            literal::SLASH,
            literal::LF,
            literal::CR,
            "{}"
        ].concat();
        Pattern::none_of(chars)
    }
}



// ===============================
// === Enso Lexeme Definitions ===
// ===============================

/// The literal lexemes that make up the Enso language.
pub mod literal {

    /// The type of a literal lexeme.
    pub type Literal = &'static str;

    // === The Lexemes ===

    /// The space character.
    pub const SPACE:Literal = " ";

    /// The line-feed character.
    pub const LF:Literal = "\n";

    /// The carriage-return character.
    pub const CR:Literal = "\r";

    /// The crlf windows-style line ending.
    pub const CRLF:Literal = "\r\n";

    /// The tab character.
    pub const TAB:Literal = "\t";

    /// The comment character.
    pub const COMMENT:Literal = "#";

    /// The doc comment character.
    pub const DOC_COMMENT:Literal = "##";

    /// The symbol for beginning an annotation.
    pub const ANNOTATION_SYMBOL:Literal = "@";

    /// The dot symbol.
    pub const DOT:Literal = ".";

    /// Two dots.
    pub const TWO_DOTS:Literal = "..";

    /// Three dots.
    pub const THREE_DOTS:Literal = "...";

    /// The comma.
    pub const COMMA:Literal = ",";

    /// The `in` operator.
    pub const OPERATOR_IN:Literal = "in";

    /// The tick allowable at the end of an identifier.
    pub const IDENTIFIER_TICK:Literal = "'";

    /// The quote used to delimit interpolations in format text literals.
    pub const INTERPOLATE_QUOTE:Literal = "`";

    /// The quote used to delimit format text literals.
    pub const FORMAT_QUOTE:Literal = "'";

    /// The quote used to delimit format block literals.
    pub const FORMAT_BLOCK_QUOTE:Literal = "'''";

    /// The quote used to delimit raw text literals.
    pub const RAW_QUOTE:Literal = "\"";

    /// The quote used to delimit raw block literals.
    pub const RAW_BLOCK_QUOTE:Literal = "\"\"\"";

    /// The equals operator.
    pub const EQUALS:Literal = "=";

    /// The equality comparison operator.
    pub const EQUALS_COMP:Literal = "==";

    /// Greater-than or equal.
    pub const GE_OPERATOR:Literal = ">=";

    /// Less-than or equal.
    pub const LE_OPERATOR:Literal = "<=";

    /// Inequality comparison operator.
    pub const NOT_EQUAL:Literal = "!=";

    /// The hash eq operator.
    pub const HASH_EQ:Literal = "#=";

    /// The wide arrow operator.
    pub const WIDE_ARROW:Literal = "=>";

    /// The blank identifier.
    pub const BLANK_IDENT:Literal = "_";

    /// The identifier segment separator.
    pub const IDENT_SEGMENT_SEPARATOR:Literal = "_";

    /// The separator between a number literal's explicit base and the number itself.
    pub const NUMBER_BASE_SEPARATOR:Literal = "_";

    /// The separator between the integer and fractional parts of the number literal.
    pub const DECIMAL_SEPARATOR:Literal = ".";

    /// The backslash character.
    pub const SLASH:Literal = r"\";

    /// An escaped [`SLASH`].
    pub const ESCAPED_SLASH:Literal = r"\\";

    /// The beginning of a byte escape.
    pub const BYTE_ESCAPE_START:Literal = r"\x";

    /// The beginning of a u16 escape.
    pub const U16_ESCAPE_START:Literal = r"\u";

    /// The beginning of a u21 escape.
    pub const U21_ESCAPE_START:Literal = r"\u{";

    /// The end of a u21 escape.
    pub const U21_ESCAPE_END:Literal = "}";

    /// The beginning of a u32 escape.
    pub const U32_ESCAPE_START:Literal = r"\U";

    /// The allowable group characters in Enso.
    pub const GROUP_CHARS:Literal = "()[]{}";

    /// The allowable operator characters in Enso.
    pub const OPERATOR_CHARS:Literal = ";!$%&*+-/<>?^~|:\\";
}



// =========================
// === Utility Functions ===
// =========================

/// Get the first character of the lexeme, if it exists.
pub fn char(literal:&'static str) -> Option<char> {
    literal.chars().nth(0)
}

/// Get the first character of the lexeme, assuming that it exists.
pub fn unsafe_char(literal:&'static str) -> char {
    char(literal).expect("The first character of the literal exists.")
}

/// Convert the lexeme into a pattern.
pub fn into_pattern(literal:&'static str) -> Pattern {
    literal.into()
}

/// The proper length of the `literal`.
pub fn len(literal:&'static str) -> usize {
    literal.chars().count()
}
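To make the utilities concrete, a small sketch of relationships that follow directly from the tables above (the assertions restate the definitions rather than add new ones):

fn lexeme_demo() {
    // `len` counts `char`s rather than bytes, so multi-character lexemes measure correctly.
    assert_eq!(len(literal::CRLF),2);
    assert_eq!(len(literal::FORMAT_BLOCK_QUOTE),3);
    // `char` exposes the first character of a lexeme where one is needed directly.
    assert_eq!(char(literal::COMMENT),Some('#'));
    // `into_pattern` lifts a literal into a flexer `Pattern` for use in rule definitions.
    let _dot = into_pattern(literal::DOT);
}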
File diff suppressed because it is too large
@ -10,7 +10,10 @@

//! This library defines the lexer for the syntax of the Enso language.

pub mod escape;
pub mod lexeme;
pub mod lexer;
pub mod rule;
pub mod token;

/// A module that can be re-exported under the same name in the generation crate.
@ -19,11 +22,14 @@ pub mod token;
/// Enso lexer definition. In this project, imports should _not_ be made from the crate root
/// _except_ through use of this `library` module.
pub mod library {
    pub use crate::escape;
    pub use crate::lexeme;
    pub use crate::token;
    pub use crate::rules;
}

/// A collection of functionality for working with the lexer definition.
pub mod prelude {
    pub use flexer::prelude::*;
    pub use flexer::prelude::logger::*;
    pub use enso_flexer::prelude::*;
    pub use enso_flexer::prelude::logger::*;
}
26 lib/rust/lexer/definition/src/rule.rs Normal file
@ -0,0 +1,26 @@
//! This file contains a macro to simplify writing the lexer rules.



// ===================
// === Rules Macro ===
// ===================

/// Define a group of rules for the lexer.
///
/// All of the rules must be defined for the same `state_name`, which must be the in-scope name of
/// the state for which the rules are being defined. Each `pattern` is a non-reference pattern that
/// the rule is being defined to match, and `code` is the code that will be executed when the rule
/// matches, omitting the (first) `reader` argument.
///
/// Branches are matched _in order_, from top-to-bottom, much like a standard `match` statement.
///
/// Please see `lexer.rs` for myriad examples of this macro's use.
#[macro_export]
macro_rules! rules {
    ($state_name:ident with $($pattern:expr => $path_root:ident $(.$path:ident)* ($($arg:tt)*)),+ $(,)?) => {
        $($state_name.create_rule(&$pattern,stringify!{
            $path_root $(.$path)* (reader,$($arg)*)
        });)*
    };
}
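For illustration, a sketch of how a lexer definition might invoke this macro; the `root_group` variable and the handler names are hypothetical (real invocations live in `lexer.rs`):

rules!(root_group with
    definition_pattern::spaces()  => self.on_spaces(),
    definition_pattern::newline() => self.on_newline(),
);

Each branch expands to a `create_rule` call on `root_group`, stringifying the callback with `reader` prepended to the argument list, e.g. `self.on_spaces(reader,)` for the first branch.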
@ -6,6 +6,8 @@

use crate::prelude::*;

use crate::lexeme;



// =============
@ -24,6 +26,11 @@ pub struct Token {
}

impl Token {
    /// Constructor.
    pub fn new(shape:Shape, length:usize, offset:usize) -> Token {
        Token{shape,length,offset}
    }

    /// Get the length that the token takes up in the program source.
    pub fn source_length(&self) -> usize {
        self.length + self.offset
@ -31,10 +38,9 @@ impl Token {
}

/// Constructors for the various forms of token.
#[allow(non_snake_case)]
impl Token {
    /// Construct a token representing a referent identifier.
    pub fn Referent(name:impl Str, offset:usize) -> Token {
    pub fn referent(name:impl Str, offset:usize) -> Token {
        let str = name.into();
        let length = str.chars().count();
        let shape = Shape::Referent(str);
@ -42,7 +48,7 @@
    }

    /// Construct a token representing a variable identifier.
    pub fn Variable(name:impl Str, offset:usize) -> Token {
    pub fn variable(name:impl Str, offset:usize) -> Token {
        let str = name.into();
        let length = str.chars().count();
        let shape = Shape::Variable(str);
@ -50,7 +56,7 @@
    }

    /// Construct a token representing an external identifier.
    pub fn External(name:impl Str, offset:usize) -> Token {
    pub fn external(name:impl Str, offset:usize) -> Token {
        let str = name.into();
        let length = str.chars().count();
        let shape = Shape::External(str);
@ -58,61 +64,157 @@
    }

    /// Construct a token representing a blank identifier.
    pub fn Blank(offset:usize) -> Token {
    pub fn blank(offset:usize) -> Token {
        let shape = Shape::Blank;
        let length = 1;
        let length = lexeme::len(lexeme::literal::BLANK_IDENT);
        Token{shape,length,offset}
    }

    /// Construct a token representing an operator.
    pub fn Operator(name:impl Str, offset:usize) -> Token {
        let str = name.into();
        let length = str.chars().count();
        let shape = Shape::Operator(str);
    pub fn operator(name:impl Str, offset:usize) -> Token {
        let name = name.into();
        let length = name.chars().count();
        let shape = Shape::Operator(name);
        Token{shape,length,offset}
    }

    /// Construct a token representing a modifier operator.
    pub fn Modifier(name:impl Str, offset:usize) -> Token {
        let str = name.into();
        let length = str.chars().count() + 1;
        let shape = Shape::Modifier(str);
    pub fn modifier(name:impl Str, offset:usize) -> Token {
        let name = name.into();
        let modifier_len = lexeme::len(lexeme::literal::EQUALS);
        let length = name.chars().count() + modifier_len;
        let shape = Shape::Modifier(name);
        Token{shape,length,offset}
    }

    /// Construct a token representing an annotation.
    pub fn annotation(name_str:impl Str, offset:usize) -> Token {
        let name = name_str.into();
        let annotation_len = lexeme::len(lexeme::literal::ANNOTATION_SYMBOL);
        let length = name.chars().count() + annotation_len;
        let shape = Shape::Annotation(name);
        Token{shape,length,offset}
    }

    /// Construct a token representing a number literal.
    pub fn Number(base:impl Str, num:impl Into<String>, offset:usize) -> Token {
        let str = num.into();
        let base_str = base.into();
        let length = if base_str.is_empty() {
            str.chars().count()
    pub fn number(base:impl Str, num:impl Into<String>, offset:usize) -> Token {
        let number = num.into();
        let base = base.into();
        let length = if base.is_empty() {
            number.chars().count()
        } else {
            base_str.chars().count() + 1 + str.chars().count()
            let base_sep_len = lexeme::len(lexeme::literal::NUMBER_BASE_SEPARATOR);
            base.chars().count() + base_sep_len + number.chars().count()
        };
        let shape = Shape::Number{base:base_str,number:str};
        let shape = Shape::Number{base,number};
        Token{shape,length,offset}
    }

    /// Construct a token representing a dangling number base.
    pub fn DanglingBase(base:impl Str, offset:usize) -> Token {
        let base_str = base.into();
        let length = base_str.chars().count() + 1;
        let shape = Shape::DanglingBase(base_str);
    pub fn dangling_base(base:impl Str, offset:usize) -> Token {
        let base_str = base.into();
        let base_sep_len = lexeme::len(lexeme::literal::NUMBER_BASE_SEPARATOR);
        let length = base_str.chars().count() + base_sep_len;
        let shape = Shape::DanglingBase(base_str);
        Token{shape,length,offset}
    }

    /// Construct a token representing a text literal.
    pub fn Text(text:impl Str, offset:usize) -> Token {
        let str = text.into();
        let length = str.chars().count();
        let shape = Shape::Text(str);
    /// Construct a token representing a line of text.
    pub fn text_line(style:TextStyle, segments:Vec<Token>, offset:usize) -> Token {
        let segments_len:usize = segments.iter().map(|s| s.source_length()).sum();
        let length = style.length() + segments_len;
        let shape = Shape::TextLine{style,segments};
        Token{shape,length,offset}
    }

    /// Construct a token representing an inline block text literal.
    pub fn text_inline_block
    ( style    : TextStyle
    , segments : Vec<Token>
    , offset   : usize
    ) -> Token {
        let segments_length:usize = segments.iter().map(|s| s.source_length()).sum();
        let length = style.length() + segments_length;
        let shape = Shape::TextInlineBlock{style,segments};
        Token{shape,length,offset}
    }

    /// Construct a token representing a block of text.
    pub fn text_block
    ( start_line_ending : LineEnding
    , style             : TextStyle
    , lines             : Vec<Token>
    , indent            : usize
    , offset            : usize
    ) -> Token {
        let length = style.length() + start_line_ending.size() + lines.iter().fold(0, |l,r|
            l + match r.shape {
                Shape::Line {..} => indent + r.source_length(),
                Shape::BlankLine(_) => r.source_length(),
                _ => unreachable_panic!("Text blocks should only contain lines."),
            }
        );
        let shape = Shape::TextBlock{start_line_ending,style,lines};
        Token{shape,length,offset}
    }

    /// Construct a token representing an invalid quote.
    pub fn invalid_quote(bad_quotes:impl Str, offset:usize) -> Token {
        let bad_string = bad_quotes.into();
        let length = bad_string.chars().count();
        let shape = Shape::InvalidQuote(bad_string);
        Token{shape,length,offset}
    }

    /// Construct a token representing a raw text segment.
    pub fn text_segment_raw(str:impl Str, offset:usize) -> Token {
        let string = str.into();
        let length = string.chars().count();
        let shape = Shape::TextSegmentRaw(string);
        Token{shape,length,offset}
    }

    /// Construct a token representing an escape sequence.
    pub fn text_segment_escape(style:EscapeStyle, repr_str:impl Str, offset:usize) -> Token {
        let repr = repr_str.into();
        let length = style.size() + repr.chars().count();
        let shape = Shape::TextSegmentEscape{style,repr};
        Token{shape,length,offset}
    }

    /// Construct a token representing an escape sequence using a literal `shape`.
    pub fn text_segment_escape_from_shape(shape:Shape, offset:usize) -> Token {
        match &shape {
            Shape::TextSegmentEscape{style,repr} => {
                let length = style.size() + repr.chars().count();
                Token{shape,length,offset}
            },
            _ => unreachable_panic!("Shape must be a TextSegmentEscape.")
        }
    }

    /// Construct a token representing an interpolated text segment.
    pub fn text_segment_interpolate(tokens:Vec<Token>, offset:usize) -> Token {
        let length_of_interpolation_ticks = 2;
        let length =
            length_of_interpolation_ticks + tokens.iter().fold(0,|l,r| l + r.source_length());
        let shape = Shape::TextSegmentInterpolate{tokens};
        Token{shape,length,offset}
    }

    /// Construct a token representing an unclosed interpolated text segment.
    pub fn text_segment_unclosed_interpolate(tokens:Vec<Token>, offset:usize) -> Token {
        let length_of_interpolation_tick = 1;
        let length =
            length_of_interpolation_tick + tokens.iter().fold(0,|l,r| l + r.source_length());
        let shape = Shape::TextSegmentUnclosedInterpolate{tokens};
        Token{shape,length,offset}
    }

    /// Construct a token representing a line of tokens.
    pub fn Line(tokens:Vec<Token>, offset:usize, trailing_line_ending:LineEnding) -> Token {
    pub fn line(tokens:Vec<Token>, offset:usize, trailing_line_ending:LineEnding) -> Token {
        let line_ending_len = trailing_line_ending.size();
        let length = tokens.iter().fold(line_ending_len,|l,r| l + r.offset + r.length);
        let length = tokens.iter().fold(line_ending_len,|l,r| l + r.source_length());
        let shape = Shape::Line{tokens,trailing_line_ending};
        Token{shape,length,offset}
    }
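Two consequences of deriving `length` from the lexeme tables rather than from hard-coded numbers, and of `source_length` combining a token's length with its preceding offset, as a sketch:

// A token's source extent is its own length plus its offset: `modifier` adds
// the trailing `=` to the name's length, so `+=` spans two characters.
let plus_eq = Token::modifier("+",0);
assert_eq!(plus_eq.source_length(),2);

// `number` adds the `_` base separator only when an explicit base is present:
// `16_ff` has length 5, so with an offset of 1 it spans 6 source characters.
let hex = Token::number("16","ff",1);
assert_eq!(hex.source_length(),6);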
@ -121,26 +223,25 @@ impl Token {
    ///
    /// The `offset` for blank lines is from the leftmost column, not from the parent block's
    /// indentation.
    pub fn BlankLine(offset:usize, trailing_line_ending:LineEnding) -> Token {
    pub fn blank_line(offset:usize, trailing_line_ending:LineEnding) -> Token {
        let length = trailing_line_ending.size();
        let shape = Shape::BlankLine(trailing_line_ending);
        Token{shape,length,offset}
    }

    /// Construct a token representing a block.
    pub fn Block
    pub fn block
    ( block_type : BlockType
    , indent     : usize
    , lines      : Vec<Token>
    , offset     : usize
    ) -> Token {
        let length = lines.iter().map(|line| {
            let line_length = line.length;
            let line_offset = line.offset;
            match line.shape {
                Shape::Line{..} => indent + line_offset + line_length,
                Shape::BlankLine(_) => line_offset + line_length,
                _ => unreachable_panic!("Tokens in a block should always be lines."),
                Shape::Line{..} => indent + line.source_length(),
                Shape::BlankLine(_) => line.source_length(),
                _ =>
                    unreachable_panic!("Tokens in a block should always be lines."),
            }
        }).sum();
        let shape = Shape::Block{block_type,indent,lines};
@ -148,18 +249,40 @@
    }

    /// Construct a token representing an invalid suffix.
    pub fn InvalidSuffix(text:impl Str, offset:usize) -> Token {
        let str = text.into();
        let length = str.chars().count();
        let shape = Shape::InvalidSuffix(str);
    pub fn invalid_suffix(text:impl Str, offset:usize) -> Token {
        let text = text.into();
        let length = text.chars().count();
        let shape = Shape::InvalidSuffix(text);
        Token{shape,length,offset}
    }

    /// Construct a token representing an unrecognised lexeme.
    pub fn Unrecognized(text:impl Str, offset:usize) -> Token {
        let str = text.into();
        let length = str.chars().count();
        let shape = Shape::Unrecognized(str);
    pub fn unrecognized(text:impl Str, offset:usize) -> Token {
        let text = text.into();
        let length = text.chars().count();
        let shape = Shape::Unrecognized(text);
        Token{shape,length,offset}
    }

    /// Construct a token representing a disable comment.
    pub fn disable_comment(text:impl Str, offset:usize) -> Token {
        let text = text.into();
        let comment_len = lexeme::len(lexeme::literal::COMMENT);
        let length = text.chars().count() + comment_len;
        let shape = Shape::DisableComment(text);
        Token{shape,length,offset}
    }

    /// Construct a token representing a documentation comment.
    pub fn doc_comment(lines:Vec<Token>, indent:usize, offset:usize) -> Token {
        let length = lines.iter().map(|line| {
            match line.shape {
                Shape::Line{..} => indent + line.source_length(),
                Shape::BlankLine(_) => line.source_length(),
                _ => unreachable_panic!("Tokens in a doc comment should always be lines."),
            }
        }).sum();
        let shape = Shape::DocComment{lines,indent};
        Token{shape,length,offset}
    }
}
@ -179,9 +302,11 @@ pub enum BlockType {
    Discontinuous,
}

// ===================
// === NewlineType ===
// ===================


// ==================
// === LineEnding ===
// ==================

/// The type of newline associated with the line.
#[derive(Copy,Clone,Debug,Display,PartialEq,Eq)]
@ -195,12 +320,14 @@ pub enum LineEnding {
}

impl LineEnding {
    const NO_LENGTH:usize = 0;

    /// Get the number of rust `char`s that the newline type takes up.
    pub fn size(self) -> usize {
        match self {
            Self::None => 0,
            Self::LF => 1,
            Self::CRLF => 2,
            Self::None => Self::NO_LENGTH,
            Self::LF => lexeme::len(lexeme::literal::LF),
            Self::CRLF => lexeme::len(lexeme::literal::CRLF),
        }
    }
}
@@ -216,6 +343,128 @@ impl Default for LineEnding {



// =================
// === TextStyle ===
// =================

/// The style of the text literal.
#[derive(Copy,Clone,Debug,Eq,PartialEq)]
pub enum TextStyle {
    // === Line ===

    /// An interpolated text line literal.
    FormatLine,
    /// A raw text line literal.
    RawLine,
    /// An unclosed text line literal.
    UnclosedLine,

    // === Inline Block ===

    /// A format inline block text literal.
    FormatInlineBlock,
    /// A raw inline block text literal.
    RawInlineBlock,

    // === Block ===

    /// An interpolated text block literal.
    FormatBlock,
    /// A raw text block literal.
    RawBlock,
}

impl TextStyle {
    /// Calculate the length of the delimiters for a particular style of text literal.
    pub fn length(self) -> usize {
        match self {
            TextStyle::FormatLine        => lexeme::len(lexeme::literal::FORMAT_QUOTE) * 2,
            TextStyle::RawLine           => lexeme::len(lexeme::literal::RAW_QUOTE) * 2,
            TextStyle::FormatInlineBlock => lexeme::len(lexeme::literal::FORMAT_BLOCK_QUOTE),
            TextStyle::RawInlineBlock    => lexeme::len(lexeme::literal::RAW_BLOCK_QUOTE),
            TextStyle::UnclosedLine      => lexeme::len(lexeme::literal::FORMAT_QUOTE),
            TextStyle::FormatBlock       => lexeme::len(lexeme::literal::FORMAT_BLOCK_QUOTE),
            TextStyle::RawBlock          => lexeme::len(lexeme::literal::RAW_BLOCK_QUOTE),
        }
    }

    /// Check if the text literal is a line literal.
    pub fn is_line_literal(self) -> bool {
        match self {
            TextStyle::RawLine      => true,
            TextStyle::FormatLine   => true,
            TextStyle::UnclosedLine => true,
            _                       => false,
        }
    }

    /// Check if the text literal is an inline block literal.
    pub fn is_inline_block_literal(self) -> bool {
        match self {
            TextStyle::FormatInlineBlock => true,
            TextStyle::RawInlineBlock    => true,
            _                            => false,
        }
    }

    /// Check if the text literal is a block literal.
    pub fn is_block_literal(self) -> bool {
        match self {
            TextStyle::FormatBlock => true,
            TextStyle::RawBlock    => true,
            _                      => false,
        }
    }
}



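// Editorial sketch (not part of this commit): the predicates above partition
// `TextStyle`; every style is exactly one of line, inline block, or block.
#[test]
fn text_style_partition_sketch() {
    let styles = [
        TextStyle::FormatLine, TextStyle::RawLine, TextStyle::UnclosedLine,
        TextStyle::FormatInlineBlock, TextStyle::RawInlineBlock,
        TextStyle::FormatBlock, TextStyle::RawBlock,
    ];
    for style in &styles {
        let flags = [
            style.is_line_literal(),
            style.is_inline_block_literal(),
            style.is_block_literal(),
        ];
        assert_eq!(flags.iter().filter(|f| **f).count(), 1);
    }
}
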
// ===================
// === EscapeStyle ===
// ===================

/// A description of the style of escape sequence seen.
#[derive(Clone,Copy,Debug,Eq,PartialEq)]
pub enum EscapeStyle {
    /// A \xNN-style byte escape.
    Byte,
    /// Unicode 16-bit escape sequence.
    U16,
    /// Unicode 21-bit escape sequence.
    U21,
    /// Unicode 32-bit escape sequence.
    U32,
    /// A literal escape character.
    Literal,
    /// An invalid unicode escape.
    InvalidUnicode,
    /// An invalid escape.
    Invalid,
    /// An escape slash without any following escape.
    Unfinished,
}
impl EscapeStyle {
    const NO_ADDITIONAL_LENGTH:usize = 0;

    /// Get the length taken up in source by the delimiters to an escape type.
    pub fn size(self) -> usize {
        match self {
            EscapeStyle::Byte    => lexeme::len(lexeme::literal::BYTE_ESCAPE_START),
            EscapeStyle::Literal => lexeme::len(lexeme::literal::SLASH),
            EscapeStyle::U16     => lexeme::len(lexeme::literal::U16_ESCAPE_START),
            EscapeStyle::U32     => lexeme::len(lexeme::literal::U32_ESCAPE_START),
            EscapeStyle::U21     => {
                let start_len = lexeme::len(lexeme::literal::U21_ESCAPE_START);
                let end_len   = lexeme::len(lexeme::literal::U21_ESCAPE_END);
                start_len + end_len
            }
            _ => Self::NO_ADDITIONAL_LENGTH,
        }
    }
}



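// Editorial sketch (not part of this commit): only delimited escape styles
// report extra length; the error styles fall through to NO_ADDITIONAL_LENGTH.
#[test]
fn escape_style_size_sketch() {
    assert_eq!(EscapeStyle::Invalid.size(), 0);
    assert_eq!(EscapeStyle::Unfinished.size(), 0);
    assert_eq!(EscapeStyle::InvalidUnicode.size(), 0);
}
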
// =============
// === Shape ===
// =============

@@ -241,25 +490,73 @@ pub enum Shape {
    Operator(String),
    /// A modifier identifier.
    Modifier(String),
    /// An annotation.
    Annotation(String),

    // === Literals ===

    /// A literal number.
    Number{base:String, number:String},
    Number {
        /// The (optional) base for the number to be interpreted in.
        base:String,
        /// The number itself, possibly with a decimal point.
        number:String
    },
    /// A dangling base from a number literal.
    DanglingBase(String),
    /// A text literal.
    ///
    /// This is currently way too simplistic to actually represent text, but it is a good
    /// placeholder.
    Text(String),
    /// A text line literal.
    TextLine {
        /// The type of literal being encoded.
        style : TextStyle,
        /// The segments that make up the line of text.
        segments : Vec<Token>,
    },
    /// An inline block text literal.
    TextInlineBlock {
        /// The type of literal being encoded.
        style : TextStyle,
        /// The segments that make up the line of text.
        segments : Vec<Token>,
    },
    /// A text block literal.
    TextBlock {
        /// The line ending that occurs directly after the opening quote marks.
        start_line_ending : LineEnding,
        /// The type of literal being encoded.
        style : TextStyle,
        /// The lines in the text block literal.
        lines : Vec<Token>
    },
    /// An invalid quote for a text literal.
    InvalidQuote(String),
    /// A segment of a line of text containing only literal text.
    TextSegmentRaw(String),
    /// A segment of a line of text that represents an escape sequence.
    TextSegmentEscape {
        /// The type of escape being represented.
        style : EscapeStyle,
        /// The literal escape sequence.
        repr : String,
    },
    /// A segment of a line of text that contains an interpolated expression.
    TextSegmentInterpolate {
        /// The tokens making up the interpolated expression.
        tokens : Vec<Token>,
    },
    /// An interpolated expression that hasn't been closed.
    TextSegmentUnclosedInterpolate {
        /// The tokens making up the interpolated expression.
        tokens : Vec<Token>
    },
    /// An invalid text segment (e.g. unclosed interpolate segment).
    TextSegmentInvalid(String),

    // === Lines ===
    /// A line containing tokens.
    ///
    /// The offset for a line is always zero, as it is contained in a block with a defined
    /// indentation.
    Line{
    Line {
        /// The tokens on the line.
        tokens : Vec<Token>,
        /// The line ending that _ends_ the line.
@@ -290,6 +587,17 @@ pub enum Shape {
    InvalidSuffix(String),
    /// An unrecognized token.
    Unrecognized(String),

    // === Comments ===
    /// A disable comment (`# ...`).
    DisableComment(String),
    /// An Enso documentation comment (`## ...`).
    DocComment {
        /// The lines in the doc comment body. Each line must contain raw text segments only.
        lines : Vec<Token>,
        /// The indentation of the doc comment's body from the baseline.
        indent : usize
    }
}

impl Shape {
@@ -326,9 +634,16 @@ impl Shape {
        Shape::Modifier(opr.into())
    }

    /// Construct an annotation identifier.
    pub fn annotation(name:impl Into<String>) -> Shape {
        Shape::Annotation(name.into())
    }

    /// Construct a number literal.
    pub fn number(base:impl Into<String>, num:impl Into<String>) -> Shape {
        Shape::Number{base:base.into(),number:num.into()}
        let base   = base.into();
        let number = num.into();
        Shape::Number{base,number}
    }

    /// Construct a dangling base literal.
@@ -336,9 +651,50 @@
        Shape::DanglingBase(base.into())
    }

    /// Construct a text literal.
    pub fn text(text:impl Into<String>) -> Shape {
        Shape::Text(text.into())
    /// Construct a text line literal.
    pub fn text_line(style:TextStyle, segments:Vec<Token>) -> Shape {
        Shape::TextLine{style,segments}
    }

    /// Construct an inline block text literal.
    pub fn text_inline_block(style:TextStyle, segments:Vec<Token>) -> Shape {
        Shape::TextInlineBlock{style,segments}
    }

    /// Construct a text block literal.
    pub fn text_block(start_line_ending:LineEnding, style:TextStyle, lines:Vec<Token>) -> Shape {
        Shape::TextBlock{start_line_ending,style,lines}
    }

    /// Construct an invalid quote literal.
    pub fn invalid_quote(bad_quotes:impl Str) -> Shape {
        Shape::InvalidQuote(bad_quotes.into())
    }

    /// Construct a raw text segment.
    pub fn text_segment_raw(text:impl Str) -> Shape {
        Shape::TextSegmentRaw(text.into())
    }

    /// Construct a text segment containing an escape sequence.
    pub fn text_segment_escape(style:EscapeStyle, repr_str:impl Str) -> Shape {
        let repr = repr_str.into();
        Shape::TextSegmentEscape{style,repr}
    }

    /// Construct a text segment containing an interpolated expression.
    pub fn text_segment_interpolate(tokens:Vec<Token>) -> Shape {
        Shape::TextSegmentInterpolate{tokens}
    }

    /// Construct a text segment containing an unclosed interpolated expression.
    pub fn text_segment_unclosed_interpolate(tokens:Vec<Token>) -> Shape {
        Shape::TextSegmentUnclosedInterpolate{tokens}
    }

    /// Construct an invalid text segment.
    pub fn text_segment_invalid(str:impl Str) -> Shape {
        Shape::TextSegmentInvalid(str.into())
    }

    /// Construct a line that contains tokens.
@@ -365,6 +721,16 @@ impl Shape {
    pub fn unrecognized(text:impl Into<String>) -> Shape {
        Shape::Unrecognized(text.into())
    }

    /// Construct a disable comment shape.
    pub fn disable_comment(text:impl Str) -> Shape {
        Shape::DisableComment(text.into())
    }

    /// Construct a doc comment shape.
    pub fn doc_comment(lines:Vec<Token>, indent:usize) -> Shape {
        Shape::DocComment{lines,indent}
    }
}


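// Editorial sketch (not part of this commit): the `Shape` constructors above
// mirror the `Token` constructors one level down, without length bookkeeping.
#[test]
fn shape_constructor_sketch() {
    let shape = Shape::doc_comment(vec![], 3);
    assert_eq!(shape, Shape::DocComment{lines:vec![], indent:3});
}
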
@@ -425,146 +791,3 @@ impl Into<Vec<Token>> for Stream {
        self.tokens
    }
}



// =============
// === Tests ===
// =============

#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::BlockType;


    // === Testing Utilities ===

    /// Asserts that the `token` has the provided `shape`.
    pub fn assert_shape(token:&Token, shape:Shape) {
        assert_eq!(token.shape,shape);
    }

    /// Asserts that the `token` has the provided `length`.
    pub fn assert_length(token:&Token, length:usize) {
        assert_eq!(token.length,length)
    }


    // === Tests for Token Construction ===

    #[test]
    fn construct_referent_token() {
        let token = Token::Referent("Some_Ref_Name",0);
        assert_shape(&token,Shape::referent("Some_Ref_Name"));
        assert_length(&token,13);
    }

    #[test]
    fn construct_variable_token() {
        let token = Token::Variable("some_variable_name",0);
        assert_shape(&token,Shape::variable("some_variable_name"));
        assert_length(&token,18);
    }

    #[test]
    fn construct_external_name_token() {
        let token = Token::External("camelCase",0);
        assert_shape(&token,Shape::external("camelCase"));
        assert_length(&token,9);
    }

    #[test]
    fn construct_blank_token() {
        let token = Token::Blank(0);
        assert_shape(&token,Shape::blank());
        assert_length(&token,1);
    }

    #[test]
    fn construct_operator_token() {
        let token = Token::Operator("==>",0);
        assert_shape(&token,Shape::operator("==>"));
        assert_length(&token,3);
    }

    #[test]
    fn construct_modifier_token() {
        let token = Token::Modifier("+",0);
        assert_shape(&token,Shape::modifier("+"));
        assert_length(&token,2);
    }

    #[test]
    fn construct_number_token() {
        let token = Token::Number("","1231",0);
        assert_shape(&token,Shape::number("","1231"));
        assert_length(&token,4);
    }

    #[test]
    fn construct_dangling_base_token() {
        let token = Token::DanglingBase("15",0);
        assert_shape(&token,Shape::dangling_base("15"));
        assert_length(&token,3);
    }

    #[test]
    fn construct_text_token() {
        let token = Token::Text("some prose goes here",0);
        assert_shape(&token,Shape::text("some prose goes here"));
        assert_length(&token,20);
        // TODO [AA] Make this internally account for length of quotes.
    }

    #[test]
    fn construct_line_token() {
        let tokens = vec![Token::Variable("aa",0),Token::Referent("Abc",1)];
        let token  = Token::Line(tokens.clone(), 4, LineEnding::LF);
        assert_shape(&token,Shape::line(tokens.clone(), LineEnding::LF));
        assert_length(&token,7);
    }

    #[test]
    fn construct_blank_line_token() {
        let token = Token::BlankLine(13,LineEnding::LF);
        assert_shape(&token, Shape::blank_line(LineEnding::LF));
        assert_length(&token,1);
    }

    #[test]
    fn construct_block_token_lf() {
        let lines = vec![
            Token::Line(vec![],0,LineEnding::LF),
            Token::Line(vec![],4,LineEnding::LF)
        ];
        let token = Token::Block(BlockType::Continuous,4,lines.clone(),0);
        assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone()));
        assert_length(&token,14);
    }

    #[test]
    fn construct_block_token_crlf() {
        let lines = vec![
            Token::Line(vec![],0,LineEnding::CRLF),
            Token::Line(vec![],4,LineEnding::CRLF)
        ];
        let token = Token::Block(BlockType::Continuous,4,lines.clone(),0);
        assert_shape(&token,Shape::block(BlockType::Continuous,4,lines.clone()));
        assert_length(&token,16);
    }

    #[test]
    fn construct_invalid_suffix_token() {
        let token = Token::InvalidSuffix("aaa",0);
        assert_shape(&token,Shape::invalid_suffix("aaa"));
        assert_length(&token,3);
    }

    #[test]
    fn construct_unrecognized_token() {
        let token = Token::Unrecognized("a",0);
        assert_shape(&token,Shape::unrecognized("a"));
        assert_length(&token,1);
    }
}
@@ -12,10 +12,17 @@ test = true
bench = true

[dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }
enso-prelude = { version = "0.1.3" }
enso-flexer = { version = "0.1.3" }
enso-prelude = { version = "0.1.7" }
lexer-definition = { path = "../definition", version = "0.1.0" }

[build-dependencies]
flexer = { path = "../../flexer", version = "0.1.0" }
enso-flexer = { version = "0.1.3" }
lexer-definition = { path = "../definition", version = "0.1.0" }

[dev-dependencies]
criterion = "0.3"

[[bench]]
name = "lexer_time_bench"
harness = false
337 lib/rust/lexer/generation/benches/lexer_bench_sources.rs (new file)
@@ -0,0 +1,337 @@
//! This file contains the sources that are replicated many times over for the purposes of
//! benchmarking the Enso lexer.

use criterion::{black_box, Criterion, Throughput};
use enso_flexer::prelude::Reader;
use enso_flexer::prelude::reader::decoder::DecoderUTF8;
use lexer::generated::engine::EnsoLexer;
use std::time::Duration;



// ===============================
// === Benchmark Configuration ===
// ===============================

/// Configures the benchmarking process.
pub fn bench_config() -> Criterion {
    Criterion::default()
        .measurement_time(Duration::from_secs(60))
        .warm_up_time(Duration::from_secs(3))
        .sample_size(25)
        .retain_baseline("EnsoLexer".to_string())
}
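
// Editorial note (not part of this commit): `bench_config` is consumed by the
// `criterion_group!` invocations in `lexer_time_bench.rs` below, e.g.
//
//     criterion_group! {
//         name    = literal_benchmarks;
//         config  = src::bench_config();
//         targets = bench_literal_number_integer, /* ... */
//     }
//
// so every group shares the 60s measurement window, 3s warm-up, 25-sample
// budget, and retained "EnsoLexer" baseline configured above.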



// =======================
// === Benchmark Setup ===
// =======================

/// The sizes of text to run the benchmarks over.
pub const SIZES:[(usize,&str);4] = [
    (1024         , "1KB"  ),
    (1024*100     , "100KB"),
    (1024*1024    , "1MB"  ),
    (1024*1024*10 , "10MB" ),
];



// ==============================
// === Benchmarking Utilities ===
// ==============================

/// Execute the provided benchmark for each of the [`SIZES`] above.
pub fn run_bench_sizes(name:&str, input:&str, add_newline:bool, c:&mut Criterion) {
    let mut group = c.benchmark_group(name);
    SIZES.iter().for_each(|(size,size_name)| {
        group.throughput(Throughput::Bytes(*size as u64));
        let input = replicate_to_size(input,*size,add_newline);
        group.bench_function(
            *size_name,
            |b| b.iter(|| {
                let mut lexer = EnsoLexer::new();
                let reader    = Reader::new(input.as_str().as_bytes(),DecoderUTF8());
                lexer.run(black_box(reader));
            })
        );
    })
}

/// This function replicates `input` until it reaches `size` (in bytes).
///
/// If this cannot be done exactly, it will err on the side of over-replication, meaning that the
/// output will be _larger_ than `size` bytes; even an input that already exceeds `size` is
/// replicated once, with the separator appended.
pub fn replicate_to_size(input:&str, size:usize, add_newline:bool) -> String {
    let input_size        = input.len();
    let times             = 1 + (size / input_size);
    let mut input_newline = input.to_string();
    let to_add            = if add_newline { '\n' } else { ' ' };
    input_newline.push(to_add);
    input_newline.repeat(times)
}

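// Editorial sketch (not part of this commit): `replicate_to_size` always
// appends the separator before repeating, so the result length is a multiple
// of `input.len() + 1` and is at least `size` bytes.
#[test]
fn replicate_to_size_sketch() {
    let out = replicate_to_size("abc", 8, false);
    assert_eq!(out.len(), 12); // times = 1 + 8/3 = 3 copies of "abc " (4 bytes each).
    assert!(out.len() >= 8);
    assert!(out.ends_with(' '));
}
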
/// Replace any windows-style line-endings in `input` with unix-style line-endings.
fn preprocess(input:&str) -> String {
    input.replace("\r\n","\n")
}



// ==============
// === Macros ===
// ==============

#[macro_export]
macro_rules! bench {
    (bench_name = $bench_name:literal; fun_name = $fun_name:ident; bench_input = $bench_input:expr;) => {
        pub fn $fun_name(c:&mut Criterion) {
            src::run_bench_sizes(
                $bench_name,
                $bench_input.as_str(),
                true,
                c
            )
        }
    }
}



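// Editorial sketch (not part of this commit): one `bench!` invocation expands
// to a plain function that forwards to `run_bench_sizes`, e.g.
//
//     pub fn bench_literal_number_integer(c:&mut Criterion) {
//         src::run_bench_sizes("Integer", src::literal::number::integer().as_str(), true, c)
//     }
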
// =================================
// === Literal Benchmark Sources ===
// =================================

#[allow(missing_docs)]
pub mod literal {
    use super::*;

    pub mod number {
        use super::*;

        pub fn integer() -> String {
            preprocess("12345")
        }

        pub fn integer_explicit_base() -> String {
            preprocess("16_a4fd31")
        }

        pub fn decimal() -> String {
            preprocess("1.3141")
        }

        pub fn decimal_explicit_base() -> String {
            preprocess("10_1.000999")
        }

        pub fn error_base() -> String {
            preprocess("10.2_2")
        }
    }

    pub mod text {
        use super::*;

        pub fn format_line() -> String {
            preprocess(r"'dearest creature in \n creation studying english pronunciation'")
        }

        pub fn format_inline_block() -> String {
            preprocess(r"''' An inline block. It's a very good inline block carl \u{AB}")
        }

        pub fn format_block() -> String {
            preprocess(
r#"''' Here is my block of format text. I can `interpolate + things` like that.
    It goes on and on and on for `times` times because I feel like it.

    Complex interpolated expression `x -> y ~> x | y` woo!
"#)
        }

        pub fn raw_line() -> String {
            preprocess(r#""dearest creature in '''' creation studying english pronunciation""#)
        }

        pub fn raw_inline_block() -> String {
            preprocess(r#"""" An inline block. It's a very good inline block carl ""#)
        }

        pub fn raw_block() -> String {
            preprocess(
r#"""" Here is my block of raw text. `Interpolations` are nothing special here.
    It goes on and on and on for I can escape \" though.

    It also supports blank lines!
"#)
        }
    }
}



// ==============================
// === Name Benchmark Sources ===
// ==============================

#[allow(missing_docs)]
pub mod name {
    use super::*;

    pub fn line_of() -> String {
        preprocess(
            "Referent_Ident var_ident JavaType _ @annotation ticked_ident' number_1"
        )
    }

    pub fn invalid_suffix() -> String {
        preprocess("some_var'iable some_varД")
    }
}



// ===================================
// === Operator Benchmarks Sources ===
// ===================================

#[allow(missing_docs)]
pub mod operator {
    use super::*;

    pub fn line_of() -> String {
        preprocess("+ - * -> ~> <~ <- ! & | /")
    }

    pub fn dot_call() -> String {
        preprocess(".== . != .<*> .*> .|>")
    }

    pub fn invalid_suffix() -> String {
        preprocess(".... +==")
    }
}



// ================================
// === Block Benchmarks Sources ===
// ================================

#[allow(missing_docs)]
pub mod block {
    use super::*;

    pub fn top_level() -> String {
        preprocess("foo\nbar\nbaz")
    }

    pub fn nested() -> String {
        preprocess("foo\nbar\n    baz\n    quux")
    }

    pub fn deeply_nested() -> String {
        preprocess(
r#"foo
bar
    baz
        quux
            bim
        bam
    oh
no
"#)
    }
}



// ===================================
// === Comments Benchmarks Sources ===
// ===================================

#[allow(missing_docs)]
pub mod comment {
    use super::*;

    pub fn line() -> String {
        preprocess("# foo bar baz I have a really long line comment here that goes on and on")
    }

    pub fn in_line() -> String {
        preprocess("a + b # A useless comment: add a to b")
    }

    pub fn doc() -> String {
        preprocess(
r#"## I have a really big doc comment here
   That just keeps prattling on and on and on.

   With blank lines

   Forever

   and
   ever

   and




   ever
documented
"#)
    }
}



// ===========================
// === Combined Benchmarks ===
// ===========================

pub mod combined {
    use super::*;

    pub fn simple() -> String {
        preprocess(
r#"
import Base.Meta

## Decompose the value using runtime reflection and print its decomposition.
Main.print_decomp a b =
    y      = a + b
    decomp = Meta.decompose y
    Io.println decomp
"#)
    }

    pub fn complex() -> String {
        preprocess(
r#"
import Base.Meta

## Frobnicate the doodads by constructing a new type operator through runtime reflection such that
   it can be passed to another language.

   ! WARNING
   Type-checking code like this is virtually impossible, and it is treated as `Dynamic` inside
   Enso code.
Main.foo a b =
    y = x -> z ->
        ty = a.gen_type (~>) (<-) b
        ty (z x)
    decomp = Meta.decompose (y a b)
    Io.println decomp

## Execute the main function of this project.
main =
    func = Meta.reify (here.foo "My_Name" "my_field")
    Io.println(func)
"#)
    }
}
295 lib/rust/lexer/generation/benches/lexer_time_bench.rs (new file)
@@ -0,0 +1,295 @@
//! This file contains the time-based benchmarks for the Enso lexer.

mod lexer_bench_sources;

use criterion::{criterion_group, criterion_main, Criterion, Throughput, black_box};
use lexer_bench_sources as src;



// ==========================
// === Literal Benchmarks ===
// ==========================

bench! {
    bench_name  = "Integer";
    fun_name    = bench_literal_number_integer;
    bench_input = src::literal::number::integer();
}

bench! {
    bench_name  = "Integer Explicit Base";
    fun_name    = bench_literal_number_integer_explicit_base;
    bench_input = src::literal::number::integer_explicit_base();
}

bench! {
    bench_name  = "Decimal";
    fun_name    = bench_literal_number_decimal;
    bench_input = src::literal::number::decimal();
}

bench! {
    bench_name  = "Decimal Explicit Base";
    fun_name    = bench_literal_number_decimal_explicit_base;
    bench_input = src::literal::number::decimal_explicit_base();
}

bench! {
    bench_name  = "Number Error Base";
    fun_name    = bench_literal_number_error_base;
    bench_input = src::literal::number::error_base();
}

bench! {
    bench_name  = "Text Format Line";
    fun_name    = bench_literal_text_format_line;
    bench_input = src::literal::text::format_line();
}

bench! {
    bench_name  = "Text Format Inline Block";
    fun_name    = bench_literal_text_format_inline_block;
    bench_input = src::literal::text::format_inline_block();
}

bench! {
    bench_name  = "Text Format Block";
    fun_name    = bench_literal_text_format_block;
    bench_input = src::literal::text::format_block();
}

bench! {
    bench_name  = "Text Raw Line";
    fun_name    = bench_literal_text_raw_line;
    bench_input = src::literal::text::raw_line();
}

bench! {
    bench_name  = "Text Raw Inline Block";
    fun_name    = bench_literal_text_raw_inline_block;
    bench_input = src::literal::text::raw_inline_block();
}

bench! {
    bench_name  = "Text Raw Block";
    fun_name    = bench_literal_text_raw_block;
    bench_input = src::literal::text::raw_block();
}

criterion_group!{
    name = literal_benchmarks;
    config = src::bench_config();
    targets =
        bench_literal_number_integer,
        bench_literal_number_integer_explicit_base,
        bench_literal_number_decimal,
        bench_literal_number_decimal_explicit_base,
        bench_literal_number_error_base,
        bench_literal_text_format_line,
        bench_literal_text_format_inline_block,
        bench_literal_text_format_block,
        bench_literal_text_raw_line,
        bench_literal_text_raw_inline_block,
        bench_literal_text_raw_block,
}



// ========================
// === Names Benchmarks ===
// ========================

bench! {
    bench_name  = "Line of Names";
    fun_name    = bench_names_line_of;
    bench_input = src::name::line_of();
}

bench! {
    bench_name  = "Names with invalid Suffixes";
    fun_name    = bench_names_invalid_suffix;
    bench_input = src::name::invalid_suffix();
}

criterion_group! {
    name = name_benchmarks;
    config = src::bench_config();
    targets =
        bench_names_line_of,
        bench_names_invalid_suffix,
}



// ===========================
// === Operator Benchmarks ===
// ===========================

bench! {
    bench_name  = "Line of Operators";
    fun_name    = bench_operator_line_of;
    bench_input = src::operator::line_of();
}

bench! {
    bench_name  = "Dot Call Operators";
    fun_name    = bench_operator_dot_call;
    bench_input = src::operator::dot_call();
}

bench! {
    bench_name  = "Operators with Invalid Suffixes";
    fun_name    = bench_operator_invalid_suffix;
    bench_input = src::operator::invalid_suffix();
}

criterion_group! {
    name = operator_benchmarks;
    config = src::bench_config();
    targets =
        bench_operator_line_of,
        bench_operator_dot_call,
        bench_operator_invalid_suffix
}



// ========================
// === Block Benchmarks ===
// ========================

bench! {
    bench_name  = "Top Level Block";
    fun_name    = bench_block_top_level;
    bench_input = src::block::top_level();
}

bench! {
    bench_name  = "Nested Block";
    fun_name    = bench_block_nested;
    bench_input = src::block::nested();
}

bench! {
    bench_name  = "Deeply Nested Blocks";
    fun_name    = bench_block_deeply_nested;
    bench_input = src::block::deeply_nested();
}

criterion_group! {
    name = block_benchmarks;
    config = src::bench_config();
    targets =
        bench_block_top_level,
        bench_block_nested,
        bench_block_deeply_nested,
}



// ==========================
// === Comment Benchmarks ===
// ==========================

bench! {
    bench_name  = "Line Comment";
    fun_name    = bench_comment_line;
    bench_input = src::comment::line();
}

bench! {
    bench_name  = "Comment in Line";
    fun_name    = bench_comment_in_line;
    bench_input = src::comment::in_line();
}

bench! {
    bench_name  = "Doc Comment";
    fun_name    = bench_comment_doc;
    bench_input = src::comment::doc();
}

criterion_group! {
    name = comment_benchmarks;
    config = src::bench_config();
    targets =
        bench_comment_line,
        bench_comment_in_line,
        bench_comment_doc,
}



// ===========================
// === Combined Benchmarks ===
// ===========================

bench! {
    bench_name  = "Simple Combined Example";
    fun_name    = bench_combined_simple;
    bench_input = src::combined::simple();
}

bench! {
    bench_name  = "Complex Combined Example";
    fun_name    = bench_combined_complex;
    bench_input = src::combined::complex();
}

criterion_group! {
    name = combined_benchmarks;
    config = src::bench_config();
    targets =
        bench_combined_simple,
        bench_combined_complex,
}



// ===================
// === Comparisons ===
// ===================

fn bench_rust_reader(c:&mut Criterion) {
    let mut group = c.benchmark_group("Rust Vector");
    src::SIZES.iter().for_each(|(size,name)| {
        group.throughput(Throughput::Bytes(*size as u64));
        let input = "abcdefghijklmnopqrstuvwxyz".repeat(1 + size / 26);
        group.bench_function(
            *name,
            |b| b.iter(|| {
                let mut counter = 0usize;
                for c in black_box(input.as_str()).chars() {
                    if c == 'f' {
                        counter += 1;
                    }
                }
                counter
            })
        );
    })
}

criterion_group! {
    name = rust_comparison;
    config = src::bench_config();
    targets =
        bench_rust_reader,
}



// ===================
// === The Harness ===
// ===================

criterion_main!(
    literal_benchmarks,
    name_benchmarks,
    operator_benchmarks,
    block_benchmarks,
    comment_benchmarks,
    combined_benchmarks,
    rust_comparison,
);
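
// Editorial note (not part of this commit): `criterion_main!` generates the
// benchmark binary's `main` function, which is why the `[[bench]]` target in
// the generation crate's Cargo.toml sets `harness = false` (see the manifest
// hunk above).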
@@ -1,8 +1,8 @@
use std::fs::File;
use lexer_definition::lexer::EnsoLexer;
use std::io::prelude::*;
use flexer::Definition;
use flexer::State;
use enso_flexer::Definition;
use enso_flexer::State;



@@ -23,6 +23,7 @@ fn generate_engine() -> std::io::Result<()> {
    let engine = lexer.specialize().unwrap();
    lexer_def.read_to_string(&mut contents).expect("Unable to read lexer definition.");
    file.write_all(contents.as_bytes()).expect("Unable to write lexer definition.");
    file.write_all("\n".as_bytes())?;
    file.write_all(engine.as_bytes()).expect("Unable to write lexer specialization.");
    Ok(())
}
@@ -19,6 +19,7 @@ mod library {
    pub use lexer_definition::library::*;
}


/// A library of commonly useful functionality.
mod prelude {
    pub use lexer_definition::prelude::*;
@@ -1,759 +0,0 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for the Enso Lexer.

// TODO [AA] Tests for error scenarios once it's done.

use flexer::*;
use lexer_definition::library::*;

use flexer::prelude::reader::decoder::DecoderUTF8;
use flexer::prelude::Reader;
use lexer::generated::engine::EnsoLexer;
use lexer_definition::library::token::Token;
use lexer_definition::token::BlockType;
use lexer_definition::token::LineEnding;



// =================
// === Utilities ===
// =================

/// Assert that `result` is a success with tokens `expected`.
fn assert_succeeds_as(result:&LexingResult<token::Stream>, expected:token::Stream) {
    match result.kind {
        ResultKind::Success => assert_eq!(result.tokens,expected),
        _                   => panic!("Lexing failed.")
    }
}

/// Assert that the provided input lexes as `expected`.
fn assert_lexes(input:impl AsRef<str>, expected:token::Stream) {
    let input_len = input.as_ref().chars().count();
    let result    = lex(input);
    assert_succeeds_as(&result,expected);
    let tokens_vec   : Vec<_> = result.tokens.into();
    let total_length : usize  = tokens_vec.iter().map(|token| token.offset + token.length).sum();
    assert_eq!(total_length,input_len);
}

/// Lex the provided string.
fn lex(input:impl AsRef<str>) -> LexingResult<token::Stream> {
    let mut lexer = EnsoLexer::new();
    let reader    = Reader::new(input.as_ref().as_bytes(),DecoderUTF8());
    lexer.run(reader)
}

/// Asserts that the input is a block and has a length equal to `length`.
fn assert_block_has_length(input:impl AsRef<str>, expected_length:usize) {
    let result = lex(input);
    match result.kind {
        ResultKind::Success => {
            let tokens = result.tokens.tokens();
            match tokens.first().expect("Token should be present.") {
                Token{shape:token::Shape::Block{..},length,..} =>
                    assert_eq!(*length,expected_length),
                _ => panic!("Token not a block."),
            }
        },
        _ => panic!("Lexing failed"),
    }
}

/// Makes the test text have unix line endings to ensure consistency regardless of git checkout
/// style.
fn make_unix_line_endings(input:&str) -> String {
    let string = String::from(input);
    string.chars().filter(|c| *c != '\r').collect()
}



// =================
// === Operators ===
// =================

#[test]
fn function_operator() {
    let input    = "->";
    let expected = token::Stream::from(vec![Token::Operator("->",0)]);
    assert_lexes(input,expected);
}

#[test]
fn bind_operator() {
    let input    = "<-";
    let expected = token::Stream::from(vec![Token::Operator("<-",0)]);
    assert_lexes(input,expected);
}

#[test]
fn left_pipe_operator() {
    let input    = "<|";
    let expected = token::Stream::from(vec![Token::Operator("<|",0)]);
    assert_lexes(input,expected);
}

#[test]
fn right_pipe_operator() {
    let input    = "|>";
    let expected = token::Stream::from(vec![Token::Operator("|>",0)]);
    assert_lexes(input,expected);
}

#[test]
fn eq_operator() {
    let input    = "=";
    let expected = token::Stream::from(vec![Token::Operator("=",0)]);
    assert_lexes(input,expected);
}

#[test]
fn eq_compare_operator() {
    let input    = "==";
    let expected = token::Stream::from(vec![Token::Operator("==",0)]);
    assert_lexes(input,expected);
}

#[test]
fn geq_operator() {
    let input    = ">=";
    let expected = token::Stream::from(vec![Token::Operator(">=",0)]);
    assert_lexes(input,expected);
}

#[test]
fn neq_operator() {
    let input    = "!=";
    let expected = token::Stream::from(vec![Token::Operator("!=",0)]);
    assert_lexes(input,expected);
}

#[test]
fn dot_operator() {
    let input    = ".";
    let expected = token::Stream::from(vec![Token::Operator(".",0)]);
    assert_lexes(input,expected);
}

#[test]
fn comma_operator() {
    let input    = ",";
    let expected = token::Stream::from(vec![Token::Operator(",",0)]);
    assert_lexes(input,expected);
}

#[test]
fn double_dot_operator() {
    let input    = "..";
    let expected = token::Stream::from(vec![Token::Operator("..",0)]);
    assert_lexes(input,expected);
}

#[test]
fn triple_dot_operator() {
    let input    = "...";
    let expected = token::Stream::from(vec![Token::Operator("...",0)]);
    assert_lexes(input,expected);
}

#[test]
fn error_operator() {
    let input    = "!";
    let expected = token::Stream::from(vec![Token::Operator("!",0)]);
    assert_lexes(input,expected);
}

#[test]
fn type_ascription_operator() {
    let input    = ":";
    let expected = token::Stream::from(vec![Token::Operator(":",0)]);
    assert_lexes(input,expected);
}

#[test]
fn in_operator() {
    let input    = "in";
    let expected = token::Stream::from(vec![Token::Operator("in",0)]);
    assert_lexes(input,expected);
}

#[test]
fn typeset_union_operator() {
    let input    = "|";
    let expected = token::Stream::from(vec![Token::Operator("|",0)]);
    assert_lexes(input,expected);
}

#[test]
fn typeset_intersection_operator() {
    let input    = "&";
    let expected = token::Stream::from(vec![Token::Operator("&",0)]);
    assert_lexes(input,expected);
}

#[test]
fn typeset_subtraction_operator() {
    let input    = "\\";
    let expected = token::Stream::from(vec![Token::Operator("\\",0)]);
    assert_lexes(input,expected);
}

#[test]
fn disable_comment() {
    let input    = "#";
    let expected = token::Stream::from(vec![Token::Operator("#",0)]);
    assert_lexes(input,expected);
}

#[test]
fn doc_comment() {
    let input    = "##";
    let expected = token::Stream::from(vec![Token::Operator("##",0)]);
    assert_lexes(input,expected);
}

#[test]
fn arbitrary_left_operator() {
    let input    = "<!!-";
    let expected = token::Stream::from(vec![Token::Operator("<!!-",0)]);
    assert_lexes(input,expected);
}

#[test]
fn arbitrary_right_operator() {
    let input    = "-->>";
    let expected = token::Stream::from(vec![Token::Operator("-->>",0)]);
    assert_lexes(input,expected);
}

#[test]
fn modifier_plus() {
    let input    = "+=";
    let expected = token::Stream::from(vec![Token::Modifier("+",0)]);
    assert_lexes(input,expected);
}

#[test]
fn modifier_minus() {
    let input    = "-=";
    let expected = token::Stream::from(vec![Token::Modifier("-",0)]);
    assert_lexes(input,expected);
}

#[test]
fn arbitrary_modifier() {
    let input    = "<%=";
    let expected = token::Stream::from(vec![Token::Modifier("<%",0)]);
    assert_lexes(input,expected);
}

#[test]
fn invalid_eq_suffix() {
    let input    = "===";
    let expected = token::Stream::from(vec![Token::Operator("==",0),Token::InvalidSuffix("=",0)]);
    assert_lexes(input,expected);
}

#[test]
fn invalid_dots_suffix() {
    let input    = "....";
    let expected = token::Stream::from(vec![Token::Operator("...",0),Token::InvalidSuffix(".",0)]);
    assert_lexes(input,expected);
}

#[test]
fn invalid_modifier_suffix() {
    let input    = "+==";
    let expected = token::Stream::from(vec![Token::Operator("+",0),Token::InvalidSuffix("==",0)]);
    assert_lexes(input,expected);
}



// ===================
// === Identifiers ===
// ===================

#[test]
fn variable_ident() {
    let input    = "some_variable_name";
    let expected = token::Stream::from(vec![Token::Variable("some_variable_name",0)]);
    assert_lexes(input,expected)
}

#[test]
fn referent_ident() {
    let input    = "Some_Referent_Name";
    let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name",0)]);
    assert_lexes(input,expected)
}

#[test]
fn external_ident() {
    let input    = "__camelCaseIdentifier";
    let expected = token::Stream::from(vec![Token::External("__camelCaseIdentifier",0)]);
    assert_lexes(input,expected)
}

#[test]
fn blank_ident() {
    let input    = "_";
    let expected = token::Stream::from(vec![Token::Blank(0)]);
    assert_lexes(input,expected)
}

#[test]
fn ticked_variable_ident() {
    let input    = "some_variable_name'";
    let expected = token::Stream::from(vec![Token::Variable("some_variable_name'",0)]);
    assert_lexes(input,expected)
}

#[test]
fn ticked_referent_ident() {
    let input    = "Some_Referent_Name'";
    let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'",0)]);
    assert_lexes(input,expected)
}

#[test]
fn multi_ticked_variable_ident() {
    let input    = "some_variable_name'''";
    let expected = token::Stream::from(vec![Token::Variable("some_variable_name'''",0)]);
    assert_lexes(input,expected)
}

#[test]
fn multi_ticked_referent_ident() {
    let input    = "Some_Referent_Name'''";
    let expected = token::Stream::from(vec![Token::Referent("Some_Referent_Name'''",0)]);
    assert_lexes(input,expected)
}

#[test]
fn variable_with_numbers() {
    let input    = "some0_1";
    let expected = token::Stream::from(vec![Token::Variable("some0_1",0)]);
    assert_lexes(input,expected)
}

#[test]
fn referent_with_numbers() {
    let input    = "Some_1821";
    let expected = token::Stream::from(vec![Token::Referent("Some_1821",0)]);
    assert_lexes(input,expected)
}

#[test]
fn tick_not_at_end_variable() {
    let input    = "some_var'iable";
    let expected = token::Stream::from(vec![
        Token::Variable("some_var'",0),
        Token::InvalidSuffix("iable",0),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn trailing_underscore() {
    let input    = "some_var_";
    let expected = token::Stream::from(vec![Token::External("some_var_",0)]);
    assert_lexes(input,expected)
}

#[test]
fn trailing_underscore_with_tick() {
    let input    = "some_var_'";
    let expected = token::Stream::from(vec![Token::External("some_var_'",0)]);
    assert_lexes(input,expected)
}

#[test]
fn invalid_suffix() {
    let input    = "some_varД";
    let expected = token::Stream::from(vec![
        Token::Variable("some_var",0),
        Token::InvalidSuffix("Д",0),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn unrecognized_token() {
    let input    = "some_var`";
    let expected = token::Stream::from(vec![
        Token::Variable("some_var",0),
        Token::Unrecognized("`",0),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn chained_identifiers() {
    let input    = "my_func A' someJavaValue some_python_value";
    let expected = token::Stream::from(vec![
        Token::Variable("my_func",0),
        Token::Referent("A'",1),
        Token::External("someJavaValue",1),
        Token::Variable("some_python_value",1),
    ]);
    assert_lexes(input,expected)
}



// ===============
// === Numbers ===
// ===============

#[test]
fn integer() {
    let input    = "13831";
    let expected = token::Stream::from(vec![Token::Number("","13831",0)]);
    assert_lexes(input,expected);
}

#[test]
fn integer_with_explicit_base() {
    let input    = "10_13831";
    let expected = token::Stream::from(vec![Token::Number("10","13831",0)]);
    assert_lexes(input,expected);
}

#[test]
fn dangling_base() {
    let input    = "10_";
    let expected = token::Stream::from(vec![Token::DanglingBase("10",0)]);
    assert_lexes(input,expected);
}

#[test]
fn hex_number() {
    let input    = "16_ff";
    let expected = token::Stream::from(vec![Token::Number("16","ff",0)]);
    assert_lexes(input,expected);
}

#[test]
fn decimal() {
    let input    = "2.71828";
    let expected = token::Stream::from(vec![Token::Number("","2.71828",0)]);
    assert_lexes(input,expected);
}

#[test]
fn decimal_with_explicit_base() {
    let input    = "10_2.71828";
    let expected = token::Stream::from(vec![Token::Number("10","2.71828",0)]);
    assert_lexes(input,expected);
}

#[test]
fn error_base() {
    let input    = "10.2_2";
    let expected = token::Stream::from(vec![
        Token::Number("","10.2",0),
        Token::InvalidSuffix("_2",0),
    ]);
    assert_lexes(input,expected);
}

#[test]
fn offset_number() {
    let input    = "    10.2";
    let expected = token::Stream::from(vec![
        Token::Number("","10.2",4),
    ]);
    assert_lexes(input,expected);
}



// ============
// === Text ===
// ============



// ==============
// === Blocks ===
// ==============

#[test]
fn block_function_call() {
    let input = make_unix_line_endings(
r#"f
    argument_1
    argument_2
    fn a1 a2 a3
    argument_4
    argument_5"#);
    let block_fn_args =
        Token::Block(
            BlockType::Continuous,
            4,
            vec![
                Token::Line(
                    vec![Token::Variable("argument_1",0)],
                    0,
                    LineEnding::LF
                ),
                Token::Line(
                    vec![
                        Token::Variable("argument_2",0),
                    ],
                    0,
                    LineEnding::LF
                ),
                Token::Line(
                    vec![
                        Token::Variable("fn",0),
                        Token::Variable("a1",1),
                        Token::Variable("a2",1),
                        Token::Variable("a3",1),
                    ],
                    0,
                    LineEnding::LF
                ),
                Token::Line(
                    vec![
                        Token::Variable("argument_4",0),
                    ],
                    0,
                    LineEnding::LF
                ),
                Token::Line(
                    vec![
                        Token::Variable("argument_5",0),
                    ],
                    0,
                    LineEnding::None
                ),
            ],
            0
        );
    let top_level_first_line = Token::Line(
        vec![
            Token::Variable("f",0),
            block_fn_args
        ],
        0,
        LineEnding::LF
    );
    let top_level_block = token::Stream::from(vec![
        Token::Block(
            BlockType::Continuous,
            0,
            vec![top_level_first_line],
            0
        )
    ]);
    assert_lexes(input,top_level_block);
}


#[test]
fn block_empty_lines() {
    let input        = "f\r\n    a\n\n    b\n";
    let nested_block = Token::Block(
        BlockType::Continuous,
        4,
        vec![
            Token::Line(vec![Token::Variable("a",0)],0,LineEnding::LF),
            Token::BlankLine(0,LineEnding::LF),
            Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF),
        ],
        0
    );
    let top_line = Token::Line(
        vec![
            Token::Variable("f",0),
            nested_block
        ],
        0,
        LineEnding::CRLF
    );
    let expected = token::Stream::from(vec![
        Token::Block(
            BlockType::Continuous,
            0,
            vec![top_line],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn block_top_level() {
    let input = make_unix_line_endings(
r#"

foo
bar
baz
"#);
    let expected = token::Stream::from(vec![
        Token::Block(
            BlockType::Continuous,
            0,
            vec![
                Token::BlankLine(0,LineEnding::LF),
                Token::BlankLine(0,LineEnding::LF),
                Token::Line(vec![Token::Variable("foo",0)],0,LineEnding::LF),
                Token::Line(vec![Token::Variable("bar",0)],0,LineEnding::LF),
                Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn block_with_operator() {
    let input = make_unix_line_endings(
r#"x ->
    foo x 1
"#);
    let nested_block = Token::Block(
        BlockType::Discontinuous,
        4,
        vec![
            Token::Line(vec![
                Token::Variable("foo",0),
                Token::Variable("x",1),
                Token::Number("","1",1),
            ], 0, LineEnding::LF)
        ],
        0
    );
    let expected = token::Stream::from(vec![
        Token::Block(
            BlockType::Continuous,
            0,
            vec![
                Token::Line(vec![
                    Token::Variable("x",0),
                    Token::Operator("->",1),
                    nested_block
                ], 0, LineEnding::LF)
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn block_with_nesting() {
    let input = make_unix_line_endings(r#"
some_long_thing
    foo ->
        Bar
        baz

    quux
"#);
    let function_block = Token::Block(
        BlockType::Discontinuous,
        8,
        vec![
            Token::Line(vec![Token::Referent("Bar",0)],0,LineEnding::LF),
            Token::Line(vec![Token::Variable("baz",0)],0,LineEnding::LF),
            Token::BlankLine(0,LineEnding::LF),
        ],
        0
    );
    let foo_block = Token::Block(
        BlockType::Continuous,
        4,
        vec![
            Token::Line(vec![
                Token::Variable("foo",0),
                Token::Operator("->",1),
                function_block,
            ], 0, LineEnding::LF),
            Token::Line(vec![Token::Variable("quux",0)],0,LineEnding::LF),
        ],
        0
    );
    let expected = token::Stream::from(vec![
        Token::Block(
            BlockType::Continuous,
            0,
            vec![
                Token::BlankLine(0,LineEnding::LF),
                Token::Line(vec![
                    Token::Variable("some_long_thing",0),
                    foo_block
                ], 0, LineEnding::LF),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn block_extra_indented_blank_lines() {
    let input          = "a\n    b\n        \n  \n    c";
    let indented_block = Token::Block(
        BlockType::Continuous,
        4,
        vec![
            Token::Line(vec![Token::Variable("b",0)],0,LineEnding::LF),
            Token::BlankLine(8,LineEnding::LF),
            Token::BlankLine(2,LineEnding::LF),
            Token::Line(vec![Token::Variable("c",0)],0,LineEnding::None),
        ],
        0
    );
    let top_level_line = Token::Line(vec![
        Token::Variable("a",0),
        indented_block
    ],0,LineEnding::LF);
    let expected = token::Stream::from(vec![
        Token::Block(
            BlockType::Continuous,
            0,
            vec![top_level_line],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn block_length_unix() {
    let input = "a\n    b\n    c";
    assert_block_has_length(input,13);
}

#[test]
fn block_length_windows() {
    let input = "a\r\n    b\r\n    c";
    assert_block_has_length(input,15);
}

#[test]
fn block_length_mixed() {
    let input = "a\r\n    b\n    c\n    d";
    assert_block_has_length(input,20);
}



// ================
// === Combined ===
// ================
337 lib/rust/lexer/generation/tests/enso_lexer_blocks.rs (new file)
@ -0,0 +1,337 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for lexing blocks in the Enso lexer.

mod test_utils;

use lexer_definition::library::*;
use test_utils::*;

use lexer_definition::library::token::Token;
use lexer_definition::token::BlockType;
use lexer_definition::token::LineEnding;



// ==============
// === Blocks ===
// ==============

#[test]
fn function_call() {
    let input = make_unix_line_endings(
r#"f
    argument_1
    argument_2
    fn a1 a2 a3
    argument_4
    argument_5"#);
    let block_fn_args =
        Token::block(
            BlockType::Continuous,
            4,
            vec![
                Token::line(
                    vec![Token::variable("argument_1", 0)],
                    0,
                    LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("argument_2", 0),
                    ],
                    0,
                    LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("fn", 0),
                        Token::variable("a1", 1),
                        Token::variable("a2", 1),
                        Token::variable("a3", 1),
                    ],
                    0,
                    LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("argument_4", 0),
                    ],
                    0,
                    LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("argument_5", 0),
                    ],
                    0,
                    LineEnding::None
                ),
            ],
            0
        );
    let top_level_first_line = Token::line(
        vec![
            Token::variable("f", 0),
            block_fn_args
        ],
        0,
        LineEnding::LF
    );
    let top_level_block = token::Stream::from(vec![
        Token::block(
            BlockType::Continuous,
            0,
            vec![top_level_first_line],
            0
        )
    ]);
    assert_lexes(input,top_level_block);
}


#[test]
fn empty_lines() {
    let input = "f\r\n    a\n\n    b\n";
    let nested_block = Token::block(
        BlockType::Continuous,
        4,
        vec![
            Token::line(vec![Token::variable("a", 0)], 0, LineEnding::LF),
            Token::blank_line(0, LineEnding::LF),
            Token::line(vec![Token::variable("b", 0)], 0, LineEnding::LF),
        ],
        0
    );
    let top_line = Token::line(
        vec![
            Token::variable("f", 0),
            nested_block
        ],
        0,
        LineEnding::CRLF
    );
    let expected = token::Stream::from(vec![
        Token::block(
            BlockType::Continuous,
            0,
            vec![top_line],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn top_level() {
    let input = make_unix_line_endings(
r#"

foo
bar
baz
"#);
    let expected = token::Stream::from(vec![
        Token::block(
            BlockType::Continuous,
            0,
            vec![
                Token::blank_line(0, LineEnding::LF),
                Token::blank_line(0, LineEnding::LF),
                Token::line(vec![Token::variable("foo", 0)], 0, LineEnding::LF),
                Token::line(vec![Token::variable("bar", 0)], 0, LineEnding::LF),
                Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn with_operator() {
    let input = make_unix_line_endings(
r#"x ->
    foo x 1
"#);
    let nested_block = Token::block(
        BlockType::Discontinuous,
        4,
        vec![
            Token::line(vec![
                Token::variable("foo", 0),
                Token::variable("x", 1),
                Token::number("", "1", 1),
            ], 0, LineEnding::LF)
        ],
        0
    );
    let expected = token::Stream::from(vec![
        Token::block(
            BlockType::Continuous,
            0,
            vec![
                Token::line(vec![
                    Token::variable("x", 0),
                    Token::operator("->", 1),
                    nested_block
                ], 0, LineEnding::LF)
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn with_nesting() {
    let input = make_unix_line_endings(r#"
some_long_thing
    foo ->
        Bar
        baz

    quux
"#);
    let function_block = Token::block(
        BlockType::Discontinuous,
        8,
        vec![
            Token::line(vec![Token::referent("Bar", 0)], 0, LineEnding::LF),
            Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF),
            Token::blank_line(0, LineEnding::LF),
        ],
        0
    );
    let foo_block = Token::block(
        BlockType::Continuous,
        4,
        vec![
            Token::line(vec![
                Token::variable("foo", 0),
                Token::operator("->", 1),
                function_block,
            ], 0, LineEnding::LF),
            Token::line(vec![Token::variable("quux", 0)], 0, LineEnding::LF),
        ],
        0
    );
    let expected = token::Stream::from(vec![
        Token::block(
            BlockType::Continuous,
            0,
            vec![
                Token::blank_line(0, LineEnding::LF),
                Token::line(vec![
                    Token::variable("some_long_thing", 0),
                    foo_block
                ], 0, LineEnding::LF),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn multiple_dedent() {
    let input = make_unix_line_endings(r#"
some_long_thing
    foo ->
        Bar
        baz
quux
"#);
    let function_block = Token::block(
        BlockType::Discontinuous,
        8,
        vec![
            Token::line(vec![Token::referent("Bar", 0)], 0, LineEnding::LF),
            Token::line(vec![Token::variable("baz", 0)], 0, LineEnding::LF),
        ],
        0
    );
    let foo_block = Token::block(
        BlockType::Continuous,
        4,
        vec![
            Token::line(vec![
                Token::variable("foo", 0),
                Token::operator("->", 1),
                function_block,
            ], 0, LineEnding::LF),
        ],
        0
    );
    let expected = token::Stream::from(vec![
        Token::block(
            BlockType::Continuous,
            0,
            vec![
                Token::blank_line(0, LineEnding::LF),
                Token::line(vec![
                    Token::variable("some_long_thing", 0),
                    foo_block
                ], 0, LineEnding::LF),
                Token::line(vec![Token::variable("quux", 0)], 0, LineEnding::LF),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn extra_indented_blank_lines() {
    let input = "a\n    b\n        \n  \n    c";
    let indented_block = Token::block(
        BlockType::Continuous,
        4,
        vec![
            Token::line(vec![Token::variable("b", 0)], 0, LineEnding::LF),
            Token::blank_line(8, LineEnding::LF),
            Token::blank_line(2, LineEnding::LF),
            Token::line(vec![Token::variable("c", 0)], 0, LineEnding::None),
        ],
        0
    );
    let top_level_line = Token::line(vec![
        Token::variable("a", 0),
        indented_block
    ], 0, LineEnding::LF);
    let expected = token::Stream::from(vec![
        Token::block(
            BlockType::Continuous,
            0,
            vec![top_level_line],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn length_unix() {
    let input = "a\n    b\n    c";
    assert_block_has_length(input,13);
}

#[test]
fn length_windows() {
    let input = "a\r\n    b\r\n    c";
    assert_block_has_length(input,15);
}

#[test]
fn length_mixed() {
    let input = "a\r\n    b\n    c\n    d";
    assert_block_has_length(input,20);
}
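The block lengths asserted in the three tests above count every code point the block spans: indentation included, an LF counting as one code point and a CRLF as two. A minimal sketch of that arithmetic, as a hypothetical standalone check rather than part of the suite:

fn expected_block_length(lines: &[(&str, &str)]) -> usize {
    // Each entry is (line text including indentation, line terminator).
    lines.iter().map(|(text, end)| text.chars().count() + end.chars().count()).sum()
}

fn main() {
    // The `length_mixed` input "a\r\n    b\n    c\n    d" totals 20.
    let lines = [("a", "\r\n"), ("    b", "\n"), ("    c", "\n"), ("    d", "")];
    assert_eq!(expected_block_length(&lines), 20);
}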
660
lib/rust/lexer/generation/tests/enso_lexer_combined.rs
Normal file
@ -0,0 +1,660 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for lexing full-on Enso with the lexer.

mod test_utils;

use lexer_definition::library::*;
use test_utils::*;

use lexer_definition::library::token::Token;



// ================
// === Combined ===
// ================

#[test]
fn method_definition() {
    let input = make_unix_line_endings(
r#"## Traverse the heterogeneous list, applying the provided polymorphic function
    wherever it matches.
@Tail_Call
map : forall ts ts' => (this : H_List ts) -> (exists a b . a ~> b) -> H_List ts'
map this fn -> case this.types of
    Cons x xs ->
        x' = fn x
        x.Cons (map xs)
    x -> fn x
"#);
    let doc_comment = Token::line(
        vec![
            Token::doc_comment(
                vec![
                    Token::line(
                        vec![
                            Token::text_segment_raw(
                                "Traverse the heterogeneous list, applying the provided polymorphic \
                                function",
                                0
                            )
                        ],
                        0,
                        token::LineEnding::LF,
                    ),
                    Token::line(
                        vec![Token::text_segment_raw("wherever it matches.", 0)],
                        0,
                        token::LineEnding::LF
                    )
                ],
                4,
                0
            ),
        ],
        0,
        token::LineEnding::None,
    );
    let annotation = Token::line(
        vec![Token::annotation("Tail_Call", 0)],
        0,
        token::LineEnding::LF,
    );
    let signature = Token::line(
        vec![
            Token::variable("map", 0),
            Token::operator(":", 1),
            Token::variable("forall", 1),
            Token::variable("ts", 1),
            Token::variable("ts'", 1),
            Token::operator("=>", 1),
            Token::operator("(", 1),
            Token::variable("this", 0),
            Token::operator(":", 1),
            Token::referent("H_List", 1),
            Token::variable("ts", 1),
            Token::operator(")", 0),
            Token::operator("->", 1),
            Token::operator("(", 1),
            Token::variable("exists", 0),
            Token::variable("a", 1),
            Token::variable("b", 1),
            Token::operator(".", 1),
            Token::variable("a", 1),
            Token::operator("~>", 1),
            Token::variable("b", 1),
            Token::operator(")", 0),
            Token::operator("->", 1),
            Token::referent("H_List", 1),
            Token::variable("ts'", 1),
        ],
        0,
        token::LineEnding::LF
    );
    let cons_branch_body = Token::block(
        token::BlockType::Discontinuous,
        8,
        vec![
            Token::line(
                vec![
                    Token::variable("x'", 0),
                    Token::operator("=", 1),
                    Token::variable("fn", 1),
                    Token::variable("x", 1),
                ],
                0,
                token::LineEnding::LF
            ),
            Token::line(
                vec![
                    Token::variable("x", 0),
                    Token::operator(".", 0),
                    Token::referent("Cons", 0),
                    Token::operator("(", 1),
                    Token::variable("map", 0),
                    Token::variable("xs", 1),
                    Token::operator(")", 0),
                ],
                0,
                token::LineEnding::LF
            ),
        ],
        0
    );
    let case_body = Token::block(
        token::BlockType::Continuous,
        4,
        vec![
            Token::line(
                vec![
                    Token::referent("Cons", 0),
                    Token::variable("x", 1),
                    Token::variable("xs", 1),
                    Token::operator("->", 1),
                    cons_branch_body,
                ],
                0,
                token::LineEnding::LF
            ),
            Token::line(
                vec![
                    Token::variable("x", 0),
                    Token::operator("->", 1),
                    Token::variable("fn", 1),
                    Token::variable("x", 1)
                ],
                0,
                token::LineEnding::LF,
            )
        ],
        0
    );
    let function = Token::line(
        vec![
            Token::variable("map", 0),
            Token::variable("this", 1),
            Token::variable("fn", 1),
            Token::operator("->", 1),
            Token::variable("case", 1),
            Token::variable("this", 1),
            Token::operator(".", 0),
            Token::variable("types", 0),
            Token::variable("of", 1),
            case_body,
        ],
        0,
        token::LineEnding::LF
    );
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![doc_comment,annotation,signature,function],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn complex_type() {
    let input = make_unix_line_endings(
r#"
type Maybe a
    type Just item:a
    Nothing

    is_just = case this of
        Just _ -> True
        Nothing -> False
"#);
    let case_block = Token::block(
        token::BlockType::Continuous,
        8,
        vec![
            Token::line(
                vec![
                    Token::referent("Just", 0),
                    Token::blank(1),
                    Token::operator("->", 2),
                    Token::referent("True", 1),
                ],
                0,
                token::LineEnding::LF
            ),
            Token::line(
                vec![
                    Token::referent("Nothing", 0),
                    Token::operator("->", 1),
                    Token::referent("False", 1)
                ],
                0,
                token::LineEnding::LF
            ),
        ],
        0
    );
    let type_body = Token::block(
        token::BlockType::Continuous,
        4,
        vec![
            Token::line(
                vec![
                    Token::variable("type", 0),
                    Token::referent("Just", 1),
                    Token::variable("item", 1),
                    Token::operator(":", 0),
                    Token::variable("a", 0),
                ],
                0,
                token::LineEnding::LF
            ),
            Token::line(vec![Token::referent("Nothing", 0)], 0, token::LineEnding::LF),
            Token::blank_line(0, token::LineEnding::LF),
            Token::line(
                vec![
                    Token::variable("is_just", 0),
                    Token::operator("=", 1),
                    Token::variable("case", 1),
                    Token::variable("this", 1),
                    Token::variable("of", 1),
                    case_block,
                ],
                0,
                token::LineEnding::LF
            )
        ],
        0
    );
    let complex_type = Token::line(
        vec![
            Token::variable("type", 0),
            Token::referent("Maybe", 1),
            Token::variable("a", 1),
            type_body,
        ],
        0,
        token::LineEnding::LF
    );
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::blank_line(0, token::LineEnding::LF),
                complex_type
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn imports_exports() {
    let input = make_unix_line_endings(
r#"import Base.List
import Base.Number.Extensions
from Builtins import Unit, Number, Integer, Any, True, False

from Builtins export all

from Base.List export Nil, Cons
from Base.Number.Extensions export all hiding Math

polyglot java import com.ibm.icu.text.BreakIterator
polyglot java import org.enso.base.Text_Utils
"#);
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::line(
                    vec![
                        Token::variable("import", 0),
                        Token::referent("Base", 1),
                        Token::operator(".", 0),
                        Token::referent("List", 0),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("import", 0),
                        Token::referent("Base", 1),
                        Token::operator(".", 0),
                        Token::referent("Number", 0),
                        Token::operator(".", 0),
                        Token::referent("Extensions", 0),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("from", 0),
                        Token::referent("Builtins", 1),
                        Token::variable("import", 1),
                        Token::referent("Unit", 1),
                        Token::operator(",", 0),
                        Token::referent("Number", 1),
                        Token::operator(",", 0),
                        Token::referent("Integer", 1),
                        Token::operator(",", 0),
                        Token::referent("Any", 1),
                        Token::operator(",", 0),
                        Token::referent("True", 1),
                        Token::operator(",", 0),
                        Token::referent("False", 1),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::variable("from", 0),
                        Token::referent("Builtins", 1),
                        Token::variable("export", 1),
                        Token::variable("all", 1),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::variable("from", 0),
                        Token::referent("Base", 1),
                        Token::operator(".", 0),
                        Token::referent("List", 0),
                        Token::variable("export", 1),
                        Token::referent("Nil", 1),
                        Token::operator(",", 0),
                        Token::referent("Cons", 1),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("from", 0),
                        Token::referent("Base", 1),
                        Token::operator(".", 0),
                        Token::referent("Number", 0),
                        Token::operator(".", 0),
                        Token::referent("Extensions", 0),
                        Token::variable("export", 1),
                        Token::variable("all", 1),
                        Token::variable("hiding", 1),
                        Token::referent("Math", 1),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::variable("polyglot", 0),
                        Token::variable("java", 1),
                        Token::variable("import", 1),
                        Token::variable("com", 1),
                        Token::operator(".", 0),
                        Token::variable("ibm", 0),
                        Token::operator(".", 0),
                        Token::variable("icu", 0),
                        Token::operator(".", 0),
                        Token::variable("text", 0),
                        Token::operator(".", 0),
                        Token::external("BreakIterator", 0),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::line(
                    vec![
                        Token::variable("polyglot", 0),
                        Token::variable("java", 1),
                        Token::variable("import", 1),
                        Token::variable("org", 1),
                        Token::operator(".", 0),
                        Token::variable("enso", 0),
                        Token::operator(".", 0),
                        Token::variable("base", 0),
                        Token::operator(".", 0),
                        Token::referent("Text_Utils", 0),
                    ],
                    0,
                    token::LineEnding::LF
                ),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn some_stdlib() {
    let input = make_unix_line_endings(
r#"from Base import all

## The top-level entry point for a test suite.
type Suite specs

## PRIVATE
type Spec name behaviors

## PRIVATE
type Behavior name result

## PRIVATE
Behavior.is_fail = this.result.is_fail

## PRIVATE
Spec.is_fail = this.behaviors.any is_fail

## PRIVATE
Suite.is_fail = this.specs.any is_fail
"#);
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::line(
                    vec![
                        Token::variable("from", 0),
                        Token::referent("Base", 1),
                        Token::variable("import", 1),
                        Token::variable("all", 1),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![
                                        Token::text_segment_raw(
                                            "The top-level entry point for a test suite.",
                                            0
                                        ),
                                    ],
                                    0,
                                    token::LineEnding::LF,
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::line(
                    vec![
                        Token::variable("type", 0),
                        Token::referent("Suite", 1),
                        Token::variable("specs", 1),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![Token::text_segment_raw("PRIVATE", 0),],
                                    0,
                                    token::LineEnding::LF,
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::line(
                    vec![
                        Token::variable("type", 0),
                        Token::referent("Spec", 1),
                        Token::variable("name", 1),
                        Token::variable("behaviors", 1)
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![Token::text_segment_raw("PRIVATE", 0),],
                                    0,
                                    token::LineEnding::LF,
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::line(
                    vec![
                        Token::variable("type", 0),
                        Token::referent("Behavior", 1),
                        Token::variable("name", 1),
                        Token::variable("result", 1)
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![Token::text_segment_raw("PRIVATE", 0),],
                                    0,
                                    token::LineEnding::LF,
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::line(
                    vec![
                        Token::referent("Behavior", 0),
                        Token::operator(".", 0),
                        Token::variable("is_fail", 0),
                        Token::operator("=", 1),
                        Token::variable("this", 1),
                        Token::operator(".", 0),
                        Token::variable("result", 0),
                        Token::operator(".", 0),
                        Token::variable("is_fail", 0),
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![Token::text_segment_raw("PRIVATE", 0),],
                                    0,
                                    token::LineEnding::LF,
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::line(
                    vec![
                        Token::referent("Spec", 0),
                        Token::operator(".", 0),
                        Token::variable("is_fail", 0),
                        Token::operator("=", 1),
                        Token::variable("this", 1),
                        Token::operator(".", 0),
                        Token::variable("behaviors", 0),
                        Token::operator(".", 0),
                        Token::variable("any", 0),
                        Token::variable("is_fail", 1)
                    ],
                    0,
                    token::LineEnding::LF
                ),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![Token::text_segment_raw("PRIVATE", 0),],
                                    0,
                                    token::LineEnding::LF,
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::line(
                    vec![
                        Token::referent("Suite", 0),
                        Token::operator(".", 0),
                        Token::variable("is_fail", 0),
                        Token::operator("=", 1),
                        Token::variable("this", 1),
                        Token::operator(".", 0),
                        Token::variable("specs", 0),
                        Token::operator(".", 0),
                        Token::variable("any", 0),
                        Token::variable("is_fail", 1)
                    ],
                    0,
                    token::LineEnding::LF
                ),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}
318
lib/rust/lexer/generation/tests/enso_lexer_comments.rs
Normal file
@ -0,0 +1,318 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for lexing comments in the Enso lexer.

mod test_utils;

use lexer_definition::library::*;
use test_utils::*;

use lexer_definition::library::token::Token;



// ================
// === Comments ===
// ================

#[test]
fn disable_eof() {
    let input = "# Here is a nice long comment string.";
    let expected = token::Stream::from(vec![
        Token::disable_comment(" Here is a nice long comment string.", 0)
    ]);
    assert_lexes(input,expected);
}

#[test]
fn disable_lf() {
    let input = "# Here is a nice long comment string.\n";
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::line(
                    vec![Token::disable_comment(" Here is a nice long comment string.", 0)],
                    0,
                    token::LineEnding::LF
                )
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn disable_crlf() {
    let input = "# Here is a nice long comment string.\r\n";
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::line(
                    vec![Token::disable_comment(" Here is a nice long comment string.", 0)],
                    0,
                    token::LineEnding::CRLF
                )
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn disable_in_line() {
    let input = "a + b <*> N # Compare the frobnicators.";
    let expected = token::Stream::from(vec![
        Token::variable("a", 0),
        Token::operator("+", 1),
        Token::variable("b", 1),
        Token::operator("<*>", 1),
        Token::referent("N", 1),
        Token::disable_comment(" Compare the frobnicators.", 1),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn disable_in_interpolate() {
    let input = "'String `1 + 1 # add` stuff.'";
    let expected = token::Stream::from(vec![
        Token::text_line(
            token::TextStyle::FormatLine,
            vec![
                Token::text_segment_raw("String ", 0),
                Token::text_segment_interpolate(
                    vec![
                        Token::number("", "1", 0),
                        Token::operator("+", 1),
                        Token::number("", "1", 1),
                        Token::unrecognized("#", 1),
                        Token::variable("add", 1)
                    ],
                    0
                ),
                Token::text_segment_raw(" stuff.", 0),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn doc_single_line_eof() {
    let input = "## Foo bar baz";
    let expected = token::Stream::from(vec![
        Token::doc_comment(
            vec![
                Token::line(vec![Token::text_segment_raw("Foo bar baz", 0)], 0, token::LineEnding::None)
            ],
            3,
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn doc_single_line_lf() {
    let input = "## Foo bar baz\n";
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![Token::text_segment_raw("Foo bar baz", 0)],
                                    0,
                                    token::LineEnding::LF
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::blank_line(0, token::LineEnding::None),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn doc_single_line_crlf() {
    let input = "## Foo bar baz\r\n";
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::line(
                    vec![
                        Token::doc_comment(
                            vec![
                                Token::line(
                                    vec![Token::text_segment_raw("Foo bar baz", 0)],
                                    0,
                                    token::LineEnding::CRLF
                                )
                            ],
                            3,
                            0
                        )
                    ],
                    0,
                    token::LineEnding::None
                ),
                Token::blank_line(0, token::LineEnding::None),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn doc_in_interpolate() {
    let input = "'String `1 + 1 ## add` stuff.'";
    let expected = token::Stream::from(vec![
        Token::text_line(
            token::TextStyle::FormatLine,
            vec![
                Token::text_segment_raw("String ", 0),
                Token::text_segment_interpolate(
                    vec![
                        Token::number("", "1", 0),
                        Token::operator("+", 1),
                        Token::number("", "1", 1),
                        Token::unrecognized("##", 1),
                        Token::variable("add", 1)
                    ],
                    0
                ),
                Token::text_segment_raw(" stuff.", 0),
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn doc_multi_line() {
    let input = make_unix_line_endings(
r#"## Here is a doc comment.
    It spans multiple lines.
        Some are indented much further.
        And this is okay.

    It keeps going, even with blank lines.
    Until the indentation decreases back.

trailing_blanks_not_part_of_comment"#);
    let doc_comment = Token::doc_comment(
        vec![
            Token::line(
                vec![Token::text_segment_raw("Here is a doc comment.", 0)],
                0,
                token::LineEnding::LF
            ),
            Token::line(
                vec![Token::text_segment_raw("It spans multiple lines.", 0)],
                0,
                token::LineEnding::LF
            ),
            Token::line(
                vec![Token::text_segment_raw("    Some are indented much further.", 0)],
                0,
                token::LineEnding::LF
            ),
            Token::line(
                vec![Token::text_segment_raw("    And this is okay.", 0)],
                0,
                token::LineEnding::LF
            ),
            Token::blank_line(0, token::LineEnding::LF),
            Token::line(
                vec![Token::text_segment_raw("It keeps going, even with blank lines.", 0)],
                0,
                token::LineEnding::LF
            ),
            Token::line(
                vec![Token::text_segment_raw("Until the indentation decreases back.", 0)],
                0,
                token::LineEnding::LF
            ),
        ],
        4,
        0
    );
    let expected = token::Stream::from(vec![
        Token::block(
            token::BlockType::Continuous,
            0,
            vec![
                Token::line(vec![doc_comment], 0, token::LineEnding::None),
                Token::blank_line(0, token::LineEnding::LF),
                Token::line(
                    vec![Token::variable("trailing_blanks_not_part_of_comment", 0)],
                    0,
                    token::LineEnding::None
                )
            ],
            0
        )
    ]);
    assert_lexes(input,expected);
}

#[test]
fn doc_mixed_line_endings() {
    let input = "## Start a doc comment\n   It has indent 3.\r\n    \n    An indented blank too.";
    let expected = token::Stream::from(vec![
        Token::doc_comment(
            vec![
                Token::line(
                    vec![Token::text_segment_raw("Start a doc comment", 0)],
                    0,
                    token::LineEnding::LF
                ),
                Token::line(
                    vec![Token::text_segment_raw("It has indent 3.", 0)],
                    0,
                    token::LineEnding::CRLF
                ),
                Token::blank_line(4, token::LineEnding::LF),
                Token::line(
                    vec![Token::text_segment_raw(" An indented blank too.", 0)],
                    0,
                    token::LineEnding::None
                )
            ],
            3,
            0
        )
    ]);
    assert_lexes(input,expected);
}
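The mixed-line-endings test above leans on an indent rule implied throughout these suites: a doc comment records a base indent (3 here, the width of "## "), and each continuation line keeps only the text beyond that indent, so a line one column deeper resurfaces with a single leading space. A minimal sketch of that slicing, as a hypothetical helper rather than the lexer's actual code:

fn doc_line_text(raw_line: &str, comment_indent: usize) -> &str {
    // Drop the comment's base indent; any deeper indentation survives as
    // leading spaces in the recovered text segment.
    &raw_line[comment_indent.min(raw_line.len())..]
}

fn main() {
    assert_eq!(doc_line_text("   It has indent 3.", 3), "It has indent 3.");
    assert_eq!(doc_line_text("    An indented blank too.", 3), " An indented blank too.");
}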
178
lib/rust/lexer/generation/tests/enso_lexer_identifiers.rs
Normal file
@ -0,0 +1,178 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for lexing identifiers in the Enso lexer.

mod test_utils;

use lexer_definition::library::*;
use test_utils::*;

use lexer_definition::library::token::Token;



// ===================
// === Identifiers ===
// ===================

#[test]
fn variable_ident() {
    let input = "some_variable_name";
    let expected = token::Stream::from(vec![Token::variable("some_variable_name", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn referent_ident() {
    let input = "Some_Referent_Name";
    let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn external_ident() {
    let input = "__camelCaseIdentifier";
    let expected = token::Stream::from(vec![Token::external("__camelCaseIdentifier", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn blank_ident() {
    let input = "_";
    let expected = token::Stream::from(vec![Token::blank(0)]);
    assert_lexes(input,expected)
}

#[test]
fn annotation() {
    let input = "@debug";
    let expected = token::Stream::from(vec![Token::annotation("debug", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn ticked_variable_ident() {
    let input = "some_variable_name'";
    let expected = token::Stream::from(vec![Token::variable("some_variable_name'", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn ticked_referent_ident() {
    let input = "Some_Referent_Name'";
    let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name'", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn ticked_annotation() {
    let input = "@debug'";
    let expected = token::Stream::from(vec![Token::annotation("debug'", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn multi_ticked_variable_ident() {
    let input = "some_variable_name'''";
    let expected = token::Stream::from(vec![Token::variable("some_variable_name'''", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn multi_ticked_referent_ident() {
    let input = "Some_Referent_Name'''";
    let expected = token::Stream::from(vec![Token::referent("Some_Referent_Name'''", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn multi_ticked_annotation() {
    let input = "@debug''";
    let expected = token::Stream::from(vec![Token::annotation("debug''", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn variable_with_numbers() {
    let input = "some0_1";
    let expected = token::Stream::from(vec![Token::variable("some0_1", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn referent_with_numbers() {
    let input = "Some_1821";
    let expected = token::Stream::from(vec![Token::referent("Some_1821", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn annotation_with_numbers() {
    let input = "@debug_1";
    let expected = token::Stream::from(vec![Token::annotation("debug_1", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn tick_not_at_end_variable() {
    let input = "some_var'iable";
    let expected = token::Stream::from(vec![
        Token::variable("some_var'", 0),
        Token::invalid_suffix("iable", 0),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn trailing_underscore() {
    let input = "some_var_";
    let expected = token::Stream::from(vec![Token::external("some_var_", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn trailing_underscore_with_tick() {
    let input = "some_var_'";
    let expected = token::Stream::from(vec![Token::external("some_var_'", 0)]);
    assert_lexes(input,expected)
}

#[test]
fn invalid_suffix() {
    let input = "some_varД";
    let expected = token::Stream::from(vec![
        Token::variable("some_var", 0),
        Token::invalid_suffix("Д", 0),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn unrecognized_token() {
    let input = "some_var@";
    let expected = token::Stream::from(vec![
        Token::variable("some_var", 0),
        Token::unrecognized("@", 0),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn chained_identifiers() {
    let input = "my_func A' someJavaValue some_python_value";
    let expected = token::Stream::from(vec![
        Token::variable("my_func", 0),
        Token::referent("A'", 1),
        Token::external("someJavaValue", 1),
        Token::variable("some_python_value", 1),
    ]);
    assert_lexes(input,expected)
}
@ -0,0 +1,85 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for lexing number literals in the Enso lexer.

mod test_utils;

use lexer_definition::library::*;
use test_utils::*;

use lexer_definition::library::token::Token;



// ===============
// === Numbers ===
// ===============

#[test]
fn integer() {
    let input = "13831";
    let expected = token::Stream::from(vec![Token::number("", "13831", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn integer_with_explicit_base() {
    let input = "10_13831";
    let expected = token::Stream::from(vec![Token::number("10", "13831", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn dangling_base() {
    let input = "10_";
    let expected = token::Stream::from(vec![Token::dangling_base("10", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn hex_number() {
    let input = "16_ff";
    let expected = token::Stream::from(vec![Token::number("16", "ff", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn decimal() {
    let input = "2.71828";
    let expected = token::Stream::from(vec![Token::number("", "2.71828", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn decimal_with_explicit_base() {
    let input = "10_2.71828";
    let expected = token::Stream::from(vec![Token::number("10", "2.71828", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn error_base() {
    let input = "10.2_2";
    let expected = token::Stream::from(vec![
        Token::number("", "10.2", 0),
        Token::invalid_suffix("_2", 0),
    ]);
    assert_lexes(input,expected);
}

#[test]
fn offset_number() {
    let input = "    10.2";
    let expected = token::Stream::from(vec![
        Token::number("", "10.2", 4),
    ]);
    assert_lexes(input,expected);
}
230
lib/rust/lexer/generation/tests/enso_lexer_operators.rs
Normal file
@ -0,0 +1,230 @@
#![feature(test)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unsafe_code)]
#![warn(unused_import_braces)]

//! This file contains tests for lexing operators in the Enso lexer.

mod test_utils;

use lexer_definition::library::*;
use test_utils::*;

use lexer_definition::library::token::Token;



// =================
// === Operators ===
// =================

#[test]
fn function_operator() {
    let input = "->";
    let expected = token::Stream::from(vec![Token::operator("->", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn bind_operator() {
    let input = "<-";
    let expected = token::Stream::from(vec![Token::operator("<-", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn left_pipe_operator() {
    let input = "<|";
    let expected = token::Stream::from(vec![Token::operator("<|", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn right_pipe_operator() {
    let input = "|>";
    let expected = token::Stream::from(vec![Token::operator("|>", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn eq_operator() {
    let input = "=";
    let expected = token::Stream::from(vec![Token::operator("=", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn eq_compare_operator() {
    let input = "==";
    let expected = token::Stream::from(vec![Token::operator("==", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn geq_operator() {
    let input = ">=";
    let expected = token::Stream::from(vec![Token::operator(">=", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn neq_operator() {
    let input = "!=";
    let expected = token::Stream::from(vec![Token::operator("!=", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn dot_operator() {
    let input = ".";
    let expected = token::Stream::from(vec![Token::operator(".", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn comma_operator() {
    let input = ",";
    let expected = token::Stream::from(vec![Token::operator(",", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn double_dot_operator() {
    let input = "..";
    let expected = token::Stream::from(vec![Token::operator("..", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn triple_dot_operator() {
    let input = "...";
    let expected = token::Stream::from(vec![Token::operator("...", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn error_operator() {
    let input = "!";
    let expected = token::Stream::from(vec![Token::operator("!", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn type_ascription_operator() {
    let input = ":";
    let expected = token::Stream::from(vec![Token::operator(":", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn in_operator() {
    let input = "in";
    let expected = token::Stream::from(vec![Token::operator("in", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn typeset_union_operator() {
    let input = "|";
    let expected = token::Stream::from(vec![Token::operator("|", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn typeset_intersection_operator() {
    let input = "&";
    let expected = token::Stream::from(vec![Token::operator("&", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn typeset_subtraction_operator() {
    let input = "\\";
    let expected = token::Stream::from(vec![Token::operator("\\", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn arbitrary_left_operator() {
    let input = "<!!-";
    let expected = token::Stream::from(vec![Token::operator("<!!-", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn arbitrary_right_operator() {
    let input = "-->>";
    let expected = token::Stream::from(vec![Token::operator("-->>", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn modifier_plus() {
    let input = "+=";
    let expected = token::Stream::from(vec![Token::modifier("+", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn modifier_minus() {
    let input = "-=";
    let expected = token::Stream::from(vec![Token::modifier("-", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn arbitrary_modifier() {
    let input = "<%=";
    let expected = token::Stream::from(vec![Token::modifier("<%", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn invalid_eq_suffix() {
    let input = "===";
    let expected = token::Stream::from(vec![Token::operator("==", 0), Token::invalid_suffix("=", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn invalid_dots_suffix() {
    let input = "....";
    let expected = token::Stream::from(vec![Token::operator("...", 0), Token::invalid_suffix(".", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn invalid_modifier_suffix() {
    let input = "+==";
    let expected = token::Stream::from(vec![Token::operator("+", 0), Token::invalid_suffix("==", 0)]);
    assert_lexes(input,expected);
}

#[test]
fn dot_call_operator() {
    let input = ".+ .<*>";
    let expected = token::Stream::from(vec![
        Token::operator(".", 0),
        Token::operator("+", 0),
        Token::operator(".", 1),
        Token::operator("<*>", 0),
    ]);
    assert_lexes(input,expected)
}

#[test]
fn dot_eq_operator() {
    let input = ".== .  !=";
    let expected = token::Stream::from(vec![
        Token::operator(".", 0),
        Token::operator("==", 0),
        Token::operator(".", 1),
        Token::operator("!=", 2),
    ]);
    assert_lexes(input,expected);
}
1838
lib/rust/lexer/generation/tests/enso_lexer_text_literals.rs
Normal file
File diff suppressed because it is too large
65
lib/rust/lexer/generation/tests/test_utils.rs
Normal file
@ -0,0 +1,65 @@
//! Utilities for testing the Enso lexer.

#![allow(dead_code)]

use enso_flexer::*;
use lexer_definition::library::*;

use enso_flexer::prelude::reader::decoder::DecoderUTF8;
use enso_flexer::prelude::Reader;
use lexer::generated::engine::EnsoLexer;
use lexer_definition::library::token::Token;



// =================
// === Utilities ===
// =================

/// Assert that `result` is a success with tokens `expected`.
pub fn assert_succeeds_as(result:&LexingResult<token::Stream>, expected:token::Stream) {
    match result.kind {
        ResultKind::Success => assert_eq!(result.tokens,expected),
        _ => panic!("Lexing failed.")
    }
}

/// Assert that the provided input lexes as `expected`.
pub fn assert_lexes(input:impl AsRef<str>, expected:token::Stream) {
    let input_len = input.as_ref().chars().count();
    let result = lex(input);
    assert_succeeds_as(&result,expected);
    let tokens_vec : Vec<_> = result.tokens.into();
    let total_length : usize = tokens_vec.iter().map(|token| token.offset + token.length).sum();
    assert_eq!(total_length,input_len);
}

/// Lex the provided string.
pub fn lex(input:impl AsRef<str>) -> LexingResult<token::Stream> {
    let mut lexer = EnsoLexer::new();
    let reader = Reader::new(input.as_ref().as_bytes(),DecoderUTF8());
    lexer.run(reader)
}

/// Assert that the input lexes to a block whose length equals `expected_length`.
pub fn assert_block_has_length(input:impl AsRef<str>, expected_length:usize) {
    let result = lex(input);
    match result.kind {
        ResultKind::Success => {
            let tokens = result.tokens.tokens();
            match tokens.first().expect("Token should be present.") {
                Token{shape:token::Shape::Block{..},length,..} =>
                    assert_eq!(*length,expected_length),
                _ => panic!("Token not a block."),
            }
        },
        _ => panic!("Lexing failed."),
    }
}

/// Makes the test text have unix line endings to ensure consistency regardless of git checkout
/// style.
pub fn make_unix_line_endings(input:&str) -> String {
    let string = String::from(input);
    string.chars().filter(|c| *c != '\r').collect()
}
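Taken together, these helpers give every suite above the same two-step shape: build the expected token stream by hand, then let `assert_lexes` both compare it against the lexer output and cross-check that the tokens' offsets and lengths cover the whole input. A minimal sketch of a test written against them (hypothetical example, mirroring the suites above):

#[test]
fn lone_variable() {
    // make_unix_line_endings is a no-op for this input but keeps the suites uniform.
    let input = make_unix_line_endings("foo");
    let expected = token::Stream::from(vec![Token::variable("foo", 0)]);
    assert_lexes(input,expected);
}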
@ -516,6 +516,9 @@ case class ParserDef() extends flexer.Parser[AST.Module] {
    logger.trace {
      onEscape(Segment.Escape.Slash)
    }
  def onEscapeFormatQuote(): Unit = logger.trace {
    onEscape(Segment.Escape.Quote)
  }

  def onEscapeQuote(): Unit =
    logger.trace {
@ -618,47 +621,24 @@ case class ParserDef() extends flexer.Parser[AST.Module] {
    FMT_BLCK.parent = FMT
  }

  ROOT || '`' || text.onInterpolateEnd()
  text.FMT || '`' || text.onInterpolateBegin()

  ROOT || '`' || text.onInterpolateEnd()
  ROOT || "'''" >> "'".many1 || text.onInvalidQuote()
  ROOT || "'" || text.onBegin(text.FMT_LINE)
  ROOT || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK)
  ROOT || "'''" || text.onInlineBlock()
  ROOT || "\"\"\"" >> "\"".many1 || text.onInvalidQuote()
  ROOT || '"' || text.onBegin(text.RAW_LINE)
  ROOT || text.rawBlock || text.onBeginBlock(text.RAW_BLCK)
  ROOT || "\"\"\"" || text.onInlineBlock()

  ROOT || "'" || text.onBegin(text.FMT_LINE)
  text.FMT_LINE || "'" || text.submit()
  text.FMT_LINE || "'".many1 || text.submitInvalidQuote()
  text.FMT_LINE || text.fmtSeg || text.submitPlainSegment()
  text.FMT_LINE || eof || text.submitMissingQuote()
  text.FMT_LINE || newline || text.submitMissingQuote()
  block.FIRSTCHAR || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK)
  ROOT || text.fmtBlock || text.onBeginBlock(text.FMT_BLCK)
  ROOT || "'''" || text.onInlineBlock()
  text.FMT_BLCK || text.fmtBSeg || text.submitPlainSegment()
  text.FMT_BLCK || eof || text.onEndOfBlock()
  text.FMT_BLCK || newline || text.onEndOfLine()

  ROOT || '"' || text.onBegin(text.RAW_LINE)
  text.RAW_LINE || '"' || text.submit()
  text.RAW_LINE || '"'.many1 || text.submitInvalidQuote()
  text.RAW_LINE || text.rawSeg || text.submitPlainSegment()
  text.RAW_LINE || eof || text.submitMissingQuote()
  text.RAW_LINE || newline || text.submitMissingQuote()
  block.FIRSTCHAR || text.rawBlock || text.onBeginBlock(text.RAW_BLCK)
  ROOT || text.rawBlock || text.onBeginBlock(text.RAW_BLCK)
  ROOT || "\"\"\"" || text.onInlineBlock()
  text.RAW_BLCK || text.rawBSeg || text.submitPlainSegment()
  text.RAW_BLCK || eof || text.onEndOfBlock()
  text.RAW_BLCK || newline || text.onEndOfLine()

  text.NEWLINE || space.opt || text.onNewLine()
  text.NEWLINE || space.opt >> newline || text.onEmptyLine()
  text.NEWLINE || space.opt >> eof || text.onEOFNewLine()

  text.FMT || '`' || text.onInterpolateBegin()
  AST.Text.Segment.Escape.Character.codes.foreach { code =>
    val char = s"text.Segment.Escape.Character.$code"
    text.FMT || s"\\$code" run s"text.onEscape($char)"
  }

  AST.Text.Segment.Escape.Control.codes.foreach { code =>
    val ctrl = s"text.Segment.Escape.Control.$code"
    text.FMT || s"\\$code" run s"text.onEscape($ctrl)"
@ -668,16 +648,39 @@ case class ParserDef() extends flexer.Parser[AST.Module] {
  text.FMT || text.escape_u16 || text.onEscapeU16()
  text.FMT || text.escape_u32 || text.onEscapeU32()
  text.FMT || text.escape_int || text.onEscapeInt()
  text.FMT || "\\'" || text.onEscapeFormatQuote()
  text.FMT || "\\\\" || text.onEscapeSlash()
  text.FMT || "\\'" || text.onEscapeQuote()
  text.FMT || "\\" >> any || text.onEscapeInvalid()
  text.FMT || "\\" || text.onEscapeUnfinished()

  text.FMT_LINE || "'" || text.submit()
  text.FMT_LINE || "'".many1 || text.submitInvalidQuote()
  text.FMT_LINE || text.fmtSeg || text.submitPlainSegment()
  text.FMT_LINE || eof || text.submitMissingQuote()
  text.FMT_LINE || newline || text.submitMissingQuote()

  text.FMT_BLCK || text.fmtBSeg || text.submitPlainSegment()
  text.FMT_BLCK || eof || text.onEndOfBlock()
  text.FMT_BLCK || newline || text.onEndOfLine()

  text.RAW_LINE || '"' || text.submit()
  text.RAW_LINE || '"'.many1 || text.submitInvalidQuote()
  text.RAW_LINE || text.rawSeg || text.submitPlainSegment()
  text.RAW_LINE || eof || text.submitMissingQuote()
  text.RAW_LINE || newline || text.submitMissingQuote()
  text.RAW_LINE || "\\\"" || text.onEscapeRawQuote()
  text.RAW_LINE || "\\\\" || text.onEscapeSlash()
  text.RAW_LINE || "\\" >> any || text.onEscapeInvalid()
  text.RAW_LINE || "\\" || text.onEscapeUnfinished()

  text.RAW_BLCK || text.rawBSeg || text.submitPlainSegment()
  text.RAW_BLCK || eof || text.onEndOfBlock()
  text.RAW_BLCK || newline || text.onEndOfLine()

  text.NEWLINE || space.opt || text.onNewLine()
  text.NEWLINE || space.opt >> newline || text.onEmptyLine()
  text.NEWLINE || space.opt >> eof || text.onEOFNewLine()

  //////////////
  /// Blocks ///
  //////////////
@ -0,0 +1,169 @@
package org.enso.syntax;

import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.BenchmarkParams;
import org.openjdk.jmh.infra.Blackhole;

import java.util.concurrent.TimeUnit;

@BenchmarkMode(Mode.AverageTime)
@Fork(1)
@Warmup(iterations = 5)
@Measurement(iterations = 10)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class LexerBench {

  @State(Scope.Thread)
  public static class BenchState {
    @Param({"1024" /* 1KB */, "102400" /* 100KB */, "1048576" /* 1MB */, "10485760" /* 10MB */})
    public int bytesSize;

    public String myInput;

    @Setup(Level.Trial)
    public void doSetup(BenchmarkParams params) {
      // The benchmark method name doubles as the key into the fixtures map.
      var benchNameSegments = params.getBenchmark().split("\\.");
      var benchName = benchNameSegments[benchNameSegments.length - 1];
      var benchInput = LexerBenchFixtures.benchmarks().get(benchName).get();
      this.myInput = LexerBenchFixtures.replicate(benchInput, bytesSize, false);
    }
  }


  // === Literals ===

  @Benchmark
  public void literalNumberInteger(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalNumberIntegerExplicitBase(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalNumberDecimal(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalNumberDecimalExplicitBase(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalNumberErrorBase(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalTextFormatLine(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalTextFormatInlineBlock(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalTextFormatBlock(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalTextRawLine(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalTextRawInlineBlock(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void literalTextRawBlock(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }


  // === Names ===

  @Benchmark
  public void nameLineOf(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void nameInvalidSuffix(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }


  // === Operators ===

  @Benchmark
  public void operatorLineOf(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void operatorDotCall(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void operatorInvalidSuffix(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }


  // === Blocks ===

  @Benchmark
  public void blockTopLevel(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void blockNested(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void blockDeeplyNested(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }


  // === Comments ===

  @Benchmark
  public void commentLine(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void commentInLine(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void commentDoc(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }


  // === Combined ===

  @Benchmark
  public void combinedSimple(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }

  @Benchmark
  public void combinedComplex(Blackhole blackhole, BenchState state) {
    blackhole.consume(LexerBenchFixtures.runLexer(state.myInput));
  }
}
@ -0,0 +1,223 @@
package org.enso.syntax

import java.nio.charset.StandardCharsets

import org.enso.flexer
import org.enso.flexer.Reader
import org.enso.syntax.text.AST
import org.enso.syntax.text.spec.{ParserDef, ParserDef2}

object LexerBenchFixtures {

  private val newEngine = flexer.Parser.compile(ParserDef())

  // === Lexer Runner ===

  /** Execute the lexer on the provided `input`.
    *
    * @param input the source code
    * @return the result of lexing `input`
    */
  def runLexer(input: String): ParserDef2.Result[AST.Module] = {
    val engine = newEngine()
    val reader = new Reader(input)
    engine.run(reader)
  }
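
  // Illustrative usage (an editorial example, not part of the harness):
  //   runLexer(benchmarks("literalNumberInteger"))
  // yields a `ParserDef2.Result[AST.Module]` wrapping the lexed module (or a
  // failure, if the input does not lex cleanly).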

  // === Utilities ===

  /** Replicate the provided `input` out to the provided `size` in bytes
    * (according to utf-8).
    *
    * @param input the text to replicate
    * @param size the size to replicate `input` to
    * @param addNewline whether or not a newline should be added after each
    *                   repetition of `input`
    * @return `input`, repeated enough times to make the size >= `size`
    */
  def replicate(
    input: String,
    size: Int,
    addNewline: Boolean = false
  ): String = {
    val inputSize    = input.getBytes(StandardCharsets.UTF_8).length
    val times        = 1 + size / inputSize
    val inputNewline = if (addNewline) input + "\n" else input + " "
    inputNewline.repeat(times)
  }
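
  // Worked example (values chosen for illustration): replicate("foo", 10)
  // gives inputSize = 3, times = 1 + 10 / 3 = 4, and hence "foo foo foo foo "
  // (16 bytes, which satisfies >= the requested 10).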

  /** Replace all CRLF line endings in the input by LF.
    *
    * @param input the input text
    * @return `input` with all `\r\n` replaced by `\n`
    */
  def preprocess(input: String): String = {
    input.replace("\r\n", "\n")
  }
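
  // E.g. preprocess("a = 1\r\nb = 2\r\n") == "a = 1\nb = 2\n", keeping the
  // fixtures independent of platform line endings.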

  // === Benchmarks ===

  val benchmarks: Map[String, String] = Map(
    // Literals
    ("literalNumberInteger", Literal.Number.integer),
    ("literalNumberIntegerExplicitBase", Literal.Number.integerExplicitBase),
    ("literalNumberDecimal", Literal.Number.decimal),
("literalNumberDecimalExplictBase", Literal.Number.decimalExplicitBase),
    ("literalNumberErrorBase", Literal.Number.errorBase),
    ("literalTextFormatLine", Literal.Text.formatLine),
    ("literalTextFormatInlineBlock", Literal.Text.formatInlineBlock),
    ("literalTextFormatBlock", Literal.Text.formatBlock),
    ("literalTextRawLine", Literal.Text.rawLine),
    ("literalTextRawInlineBlock", Literal.Text.rawInlineBlock),
    ("literalTextRawBlock", Literal.Text.rawBlock),
    // Names
    ("nameLineOf", Name.lineOf),
    ("nameInvalidSuffix", Name.invalidSuffix),
    // Operators
    ("operatorLineOf", Operator.lineOf),
    ("operatorDotCall", Operator.dotCall),
    ("operatorInvalidSuffix", Operator.invalidSuffix),
    // Blocks
    ("blockTopLevel", Block.topLevel),
    ("blockNested", Block.nested),
    ("blockDeeplyNested", Block.deeplyNested),
    // Comments
    ("commentLine", Comment.line),
    ("commentInLine", Comment.inLine),
    ("commentDoc", Comment.doc),
    // Combined
    ("combinedSimple", Combined.simple),
    ("combinedComplex", Combined.complex)
  )
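
  // NB: each key is presumably resolved by benchmark method name when the JMH
  // state loads its input, so the keys are kept in lockstep with the
  // @Benchmark methods in the bench class above.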

  // === Inputs ===

  object Literal {
    object Number {
      val integer: String             = preprocess("12345")
      val integerExplicitBase: String = preprocess("16_a4fd31")
      val decimal: String             = preprocess("1.3141")
      val decimalExplicitBase: String = preprocess("10_1.000999")
      val errorBase: String           = preprocess("10.2_2")
    }

    object Text {
      val formatLine: String =
        "'dearest creature in \\n creation studying english pronunciation'"

      val formatInlineBlock: String =
        "''' An inline block. It's a very good inline block carl \\u{AB}"

      val formatBlock: String =
        """''' Here is my block of format text. I can `interpolate + things` like that.
          | It goes on and on and on for `times` times because I feel like it.
          |
          | Complex interpolated expression `x -> y ~> x | y` woo!
          |""".stripMargin

      val rawLine: String =
        "\"dearest creature in '''' creation studying english pronunciation\""

      val rawInlineBlock: String =
        "\"\"\" An inline block. It's a very good inline block carl "

      val tQuote = "\"\"\""
      val rawBlock: String =
        s"""$tQuote Here is my block of raw text. `Interpolations` are nothing special here.
           | It goes on and on and on for I can escape \" though.
           |
           | It also supports blank lines!
           |""".stripMargin
    }
  }

  object Name {
    val lineOf: String =
      "Referent_Ident var_ident JavaType _ @annotation ticked_ident' number_1"

    val invalidSuffix: String = "some_var'iable some_varД"
  }

  object Operator {
    val lineOf: String = "+ - * -> ~> <~ <- ! & | /"
    val dotCall: String = ".== .!= .<*> .*> .|>"
    val invalidSuffix: String = ".... +=="
  }

  object Block {
    val topLevel: String = "foo\nbar\nbaz"
    val nested: String = "foo\nbar\n baz\n quux"
    val deeplyNested: String =
      """foo
        |bar
        | baz
        |  quux
        |   bim
        |  bam
        | oh
        |no
        |""".stripMargin
  }

  object Comment {
    val line: String =
      "# foo bar baz I have a really long line comment here that goes on and on"

    val inLine: String = "a + b # A useless comment: add a to b"

    val doc: String =
      """## I have a really big doc comment here
        | That just keeps prattling on and on and on.
        |
        | With blank lines
        |
        | Forever
        |
        | and
        | ever
        |
        | and
        |
        |
        |
        |
        | ever
        |documented
        |""".stripMargin
  }

  object Combined {
    val simple: String =
      """
        |import Base.Meta
        |
        |## Decompose the value using runtime reflection and print its decomposition.
        |Main.print_decomp a b =
        |    y = a + b
        |    decomp = Meta.decompose y
        |    Io.println decomp
        |""".stripMargin

    val complex: String =
      """import Base.Meta
        |
        |## Frobnicate the doodads by constructing a new type operator through runtime reflection such that
        |   it can be passed to another language.
        |
        |   ! WARNING
        |   Type-checking code like this is virtually impossible, and it is treated as `Dynamic` inside
        |   Enso code.
        |Main.foo a b =
        |    y = x -> z ->
        |        ty = a.gen_type (~>) (<-) b
        |        ty (z x)
        |    decomp = Meta.decompose (y a b)
        |    Io.println decomp
        |
        |## Execute the main function of this project.
        |main =
        |    func = Meta.reify (here.foo "My_Name" "my_field")
        |    Io.println(func)
        |""".stripMargin
  }
}
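
As a quick sanity check of the fixtures above, here is a minimal sketch that lexes every fixture once outside of JMH. It is not part of this commit: the object name `LexerBenchSmoke` and the use of a plain `main` method are assumptions for illustration; it relies only on `LexerBenchFixtures` being on the `lexer-bench` classpath.

package org.enso.syntax

// Hypothetical smoke test: run each benchmark fixture through the lexer once
// and print the result keyed by benchmark name.
object LexerBenchSmoke {
  def main(args: Array[String]): Unit = {
    LexerBenchFixtures.benchmarks.foreach { case (name, input) =>
      // preprocess is idempotent, so re-normalising the already-preprocessed
      // numeric fixtures is harmless.
      val result =
        LexerBenchFixtures.runLexer(LexerBenchFixtures.preprocess(input))
      println(s"$name -> $result")
    }
  }
}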