Simple assignments and function definitions (#3572)

Implement simple variable assignments and function definitions.

This implements:
- https://www.pivotaltracker.com/story/show/182497122
- https://www.pivotaltracker.com/story/show/182497144 (the code blocks are not created yet, but the function declaration is recognized.)

# Important Notes
- Introduced S-expression-based tests and pretty-printing round-trip testing (see the example below).
- Started writing tests for TypeDef based on the examples in the issue. None of them parse successfully.
- Fixed Number tokenizing.
- Moved most contents of the parser's `main.rs` to `lib.rs` (fixes a warning).
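
For illustration, a round-trip test in the new style checks both the parsed S-expression shape and that the tree prints back to the original source; this example is taken from the parser test file added in this commit:

```rust
#[test]
fn assignment_simple() {
    // Parses the code, compares the AST against the expected S-expression,
    // and asserts that the AST pretty-prints back to the original source.
    test("foo = 23", block![(Assignment (Ident foo) "=" (Number 23))]);
}
```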
Kaz Wesley 2022-07-07 15:31:00 -07:00 committed by GitHub
parent d8dddf40c6
commit 100eeda673
21 changed files with 870 additions and 207 deletions

Cargo.lock (generated)
View File

@ -2035,6 +2035,18 @@ dependencies = [
"derive_more",
]
[[package]]
name = "enso-metamodel-lexpr"
version = "0.1.0"
dependencies = [
"bincode",
"derivative",
"enso-metamodel",
"enso-reflect",
"lexpr",
"serde",
]
[[package]]
name = "enso-optics"
version = "0.2.0"
@ -2048,12 +2060,15 @@ version = "0.1.0"
dependencies = [
"bincode",
"enso-data-structures",
"enso-metamodel",
"enso-metamodel-lexpr",
"enso-parser-syntax-tree-builder",
"enso-parser-syntax-tree-visitor",
"enso-prelude",
"enso-reflect",
"enso-shapely-macros",
"enso-types",
"lexpr",
"serde",
]
@ -3650,7 +3665,7 @@ dependencies = [
"base64 0.13.0",
"bytes 1.1.0",
"http",
"httpdate 0.3.2",
"httpdate 1.0.2",
"language-tags 0.3.2",
"mime 0.3.16",
"percent-encoding 2.1.0",
@ -4075,6 +4090,29 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "lexpr"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ceee0b80e0043f17bf81130471e1b0975179af75fe657af45577d80e2698fe3b"
dependencies = [
"itoa 0.4.8",
"lexpr-macros",
"proc-macro-hack",
"ryu",
]
[[package]]
name = "lexpr-macros"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd627fb38e19c00d8d068618259205f7a91c91aeade5c15bc35dbca037bb1c35"
dependencies = [
"proc-macro-hack",
"proc-macro2",
"quote",
]
[[package]]
name = "libc"
version = "0.2.126"

View File

@ -12,6 +12,7 @@ derive_more = "0.99"
bincode = "1.3"
[features]
default = ["graphviz", "java", "rust"]
graphviz = []
java = []
rust = []

View File

@ -0,0 +1,16 @@
[package]
name = "enso-metamodel-lexpr"
version = "0.1.0"
edition = "2021"
authors = ["Enso Team <contact@enso.org>"]
[dependencies]
lexpr = "0.2.6"
bincode = "1.3"
serde = "1"
enso-metamodel = { path = "../" }
derivative = "2.2"
[dev-dependencies]
serde = { version = "1", features = ["derive"] }
enso-reflect = { path = "../../reflect" }

View File

@ -0,0 +1,277 @@
//! Producing S-expression representations of data based on reflection information.
//!
//! The chosen output format is compact: more so than the pretty-printing supported by `lexpr`
//! (which is what `serde_lexpr` uses to derive an S-expression "format" for data).
//!
//! - A struct is represented as a list of its fields.
//! - No type names are emitted. For variant types, the discriminant is included before the fields.
//! - Named fields are represented with the structure used for Lisp's `alist`s: `(name . value)`.
//! - Field names are prefixed with ':'.
//! - Sequence types like Rust's `Vec<_>` are represented with `lexpr` `Vector`s: `#(element element)`.
//! - An option prints the same way as its contained value in the `Some` case, or as an empty list
//! `()` in the `None` case.
//!
//! # Example
//!
//! ```
//! # use enso_reflect::prelude::*;
//! # use serde::Serialize;
//! use enso_reflect::Reflect;
//! use lexpr::Value;
//!
//! // A type, and a value. We'd like to see the S-expr representation of the value.
//! #[derive(Serialize, Reflect)]
//! struct A {
//! value: u32,
//! }
//! let value = A { value: 36 };
//! // Get `meta` type info for the type.
//! let (graph, rust_to_meta) = enso_metamodel::rust::to_meta(value.reflect_type());
//! let value_ty = rust_to_meta[&value.reflect_type().id];
//! // Use the type info to get a representation of an instance's data.
//! let s_expr = enso_metamodel_lexpr::ToSExpr::new(&graph).value(value_ty, &value);
//! let field_expr = Value::cons(Value::symbol(":value"), Value::Number(36.into()));
//! assert_eq!(s_expr, Value::cons(field_expr, Value::Null));
//! ```
// === Features ===
// === Standard Linter Configuration ===
#![deny(non_ascii_idents)]
#![warn(unsafe_code)]
// === Non-Standard Linter Configuration ===
#![allow(clippy::option_map_unit_fn)]
#![allow(clippy::precedence)]
#![allow(dead_code)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unused_import_braces)]
#![warn(unused_qualifications)]
use derivative::Derivative;
use enso_metamodel::meta::*;
use lexpr::Value;
use std::collections::BTreeMap;
// =============================
// === Meta to S-expressions ===
// =============================
/// Render data to an S-expression representation based on its `meta` model.
#[derive(Derivative)]
#[derivative(Debug)]
pub struct ToSExpr<'g> {
graph: &'g TypeGraph,
#[derivative(Debug = "ignore")]
mappers: BTreeMap<TypeId, Box<dyn Fn(Value) -> Value>>,
}
impl<'g> ToSExpr<'g> {
#[allow(missing_docs)]
pub fn new(graph: &'g TypeGraph) -> Self {
let mappers = Default::default();
Self { graph, mappers }
}
/// Set a transformation to be applied to a type after translating to an S-expression.
pub fn mapper(&mut self, id: TypeId, f: impl Fn(Value) -> Value + 'static) {
self.mappers.insert(id, Box::new(f));
}
/// Given a bincode-serialized input, use its `meta` type info to transcribe it to an
/// S-expression.
pub fn value<T: serde::Serialize>(&self, id: TypeId, input: &T) -> Value {
use bincode::Options;
let bincoder = bincode::DefaultOptions::new().with_fixint_encoding();
let data = bincoder.serialize(input).unwrap();
let mut data = &data[..];
let value = self.value_(id, &mut data);
assert_eq!(data, &[0; 0], "{}", value);
value
}
}
// === Implementation ===
impl<'g> ToSExpr<'g> {
fn value_(&self, id: TypeId, data: &mut &[u8]) -> Value {
match &self.graph[id].data {
Data::Struct(_) => self.struct_(id, data),
Data::Primitive(primitive) => self.primitive(*primitive, data),
}
}
fn struct_(&self, id: TypeId, data: &mut &[u8]) -> Value {
let mut hierarchy = vec![];
let mut child = None;
let discriminants = &self.graph[id].discriminants;
if !discriminants.is_empty() {
let discriminant_index = read_u32(data);
let child_ = discriminants[&(discriminant_index as usize)];
hierarchy.push(child_);
child = Some(child_);
}
hierarchy.push(id);
let mut id_ = id;
while let Some(parent) = self.graph[id_].parent {
hierarchy.push(parent);
id_ = parent;
}
let mut out = vec![];
self.fields(&mut hierarchy, data, &mut out);
assert_eq!(hierarchy, &[]);
let mut value = Value::list(out);
if let Some(id) = child {
if let Some(mapper) = self.mappers.get(&id) {
value = (mapper)(value);
if !value.is_cons() {
value = Value::cons(value, Value::Null);
}
};
let discriminant = self.graph[id].name.to_pascal_case().into_boxed_str();
let discriminant = Value::Symbol(discriminant);
value = Value::cons(discriminant, value);
}
if let Some(mapper) = self.mappers.get(&id) {
value = (mapper)(value);
}
value
}
fn fields(&self, hierarchy: &mut Vec<TypeId>, data: &mut &[u8], out: &mut Vec<Value>) {
let id = match hierarchy.pop() {
Some(id) => id,
None => return,
};
let fields = match &self.graph[id].data {
Data::Struct(fields) => fields,
Data::Primitive(_) => panic!(),
};
if self.graph[id].child_field == Some(0) || fields.is_empty() {
self.fields(hierarchy, data, out);
}
for (i, field) in fields.iter().enumerate() {
if !field.name.is_empty() {
let car = Value::Symbol(format!(":{}", field.name).into_boxed_str());
let cdr = self.value_(field.type_, data);
out.push(Value::cons(car, cdr));
} else {
out.push(self.value_(field.type_, data));
}
if self.graph[id].child_field == Some(i + 1) {
self.fields(hierarchy, data, out);
}
}
}
fn primitive(&self, primitive: Primitive, data: &mut &[u8]) -> Value {
match primitive {
Primitive::U32 => Value::Number(read_u32(data).into()),
Primitive::U64 => Value::Number(read_u64(data).into()),
Primitive::Bool => {
let value = read_u8(data);
let value = match value {
0 => false,
1 => true,
_ => panic!(),
};
Value::Bool(value)
}
Primitive::String => Value::String(read_string(data).into()),
Primitive::Sequence(t0) => {
let len = read_u64(data);
Value::vector((0..len).map(|_| self.value_(t0, data)))
}
Primitive::Option(t0) => match read_u8(data) {
0 => Value::Null,
1 => self.value_(t0, data),
_ => panic!(),
},
Primitive::Result(t0, t1) => {
let mut values = vec![];
match read_u32(data) {
0 => {
values.push(Value::Symbol("Ok".to_owned().into_boxed_str()));
values.push(self.value_(t0, data));
}
1 => {
values.push(Value::Symbol("Err".to_owned().into_boxed_str()));
values.push(self.value_(t1, data));
}
_ => panic!(),
}
Value::list(values)
}
}
}
}
// === Primitive Deserializers ===
fn read_u8(buffer: &mut &[u8]) -> u8 {
let (bytes, rest) = buffer.split_at(1);
*buffer = rest;
bytes[0]
}
fn read_u32(buffer: &mut &[u8]) -> u32 {
let (bytes, rest) = buffer.split_at(4);
*buffer = rest;
let mut data = [0; 4];
data.copy_from_slice(bytes);
u32::from_le_bytes(data)
}
fn read_u64(buffer: &mut &[u8]) -> u64 {
let (bytes, rest) = buffer.split_at(8);
*buffer = rest;
let mut data = [0; 8];
data.copy_from_slice(bytes);
u64::from_le_bytes(data)
}
fn read_string(buffer: &mut &[u8]) -> String {
let len = read_u64(buffer);
let (bytes, rest) = buffer.split_at(len as usize);
*buffer = rest;
String::from_utf8(bytes.to_owned()).unwrap()
}
// =============
// === Tests ===
// =============
#[cfg(test)]
mod test {
use super::*;
#[test]
fn unit_test() {
#[derive(serde::Serialize)]
struct A {
value: u32,
}
let mut graph = TypeGraph::new();
let int_name = TypeName::from_pascal_case("U32");
let int = Type::new(int_name, Data::Primitive(Primitive::U32));
let int = graph.types.insert(int);
let a_name = TypeName::from_pascal_case("A");
let a_field_name = FieldName::from_snake_case("value");
let a_field = Field::named(a_field_name, int);
let a = Type::new(a_name, Data::Struct(vec![a_field]));
let a = graph.types.insert(a);
let a_value = A { value: 36 };
let s_expr = ToSExpr::new(&graph).value(a, &a_value);
let field_expr = Value::cons(Value::symbol(":value"), Value::Number(36.into()));
assert_eq!(s_expr, Value::cons(field_expr, Value::Null));
}
}

View File

@ -45,7 +45,6 @@
// === Features ===
#![feature(map_first_last)]
#![feature(associated_type_defaults)]
#![feature(option_get_or_insert_default)]
// === Standard Linter Configuration ===
#![deny(non_ascii_idents)]

View File

@ -179,12 +179,13 @@ impl Identifier {
/// Render in camelCase.
pub fn to_camel_case(&self) -> String {
let mut camel = String::with_capacity(self.segments_len());
let (head, tail) = self.segments.split_first().unwrap();
camel.push_str(head);
for segment in tail {
let mut chars = segment.chars();
camel.push(chars.next().unwrap().to_ascii_uppercase());
camel.extend(chars);
if let Some((head, tail)) = self.segments.split_first() {
camel.push_str(head);
for segment in tail {
let mut chars = segment.chars();
camel.push(chars.next().unwrap().to_ascii_uppercase());
camel.extend(chars);
}
}
camel
}
@ -225,6 +226,11 @@ impl Identifier {
pub fn append(&mut self, other: Self) {
self.segments.extend(other.segments)
}
/// Return whether this identifier is zero-length.
pub fn is_empty(&self) -> bool {
self.segments.is_empty()
}
}
@ -284,6 +290,10 @@ impl FieldName {
pub fn append(&mut self, other: Self) {
self.0.append(other.0)
}
/// Return whether this identifier is zero-length.
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}

View File

@ -18,8 +18,9 @@ use std::mem::take;
pub fn to_meta(ty: TypeData) -> (meta::TypeGraph, BTreeMap<TypeId, meta::TypeId>) {
let mut to_meta = ToMeta::new();
let root_ = to_meta.run(ty);
to_meta.graph.gc(vec![root_]);
(to_meta.graph, to_meta.rust_to_meta)
let (mut graph, rust_to_meta) = to_meta.finish();
graph.gc(vec![root_]);
(graph, rust_to_meta)
}
#[derive(Debug, Default)]
@ -212,6 +213,11 @@ impl ToMeta {
self.rust_to_meta[&root_rust_id]
}
/// Return results.
pub fn finish(self) -> (meta::TypeGraph, BTreeMap<TypeId, meta::TypeId>) {
(self.graph, self.rust_to_meta)
}
fn generate_subtypes(&mut self, rust_types: &BTreeMap<TypeId, TypeData>) {
let mut parent_ids = BTreeMap::new();
let mut aliases = vec![];

View File

@ -20,5 +20,7 @@ enso-parser-syntax-tree-builder = { path = "src/syntax/tree/builder" }
serde = { version = "1.0", features = ["derive"] }
bincode = "1.3"
[lib]
path = "src/main.rs"
[dev-dependencies]
enso-metamodel = { path = "../metamodel", features = ["rust"] }
enso-metamodel-lexpr = { path = "../metamodel/lexpr" }
lexpr = "0.2.6"

View File

@ -721,12 +721,12 @@ impl<'s> Lexer<'s> {
/// 2. Some parsers could consume input even if it should be qualified as something else. Thus, some
/// parsers should be run first in order to make the token consuming process correct.
const PARSERS: &[for<'r> fn(&'r mut Lexer<'_>)] = &[
|t| t.number(),
|t| t.ident(),
|t| t.operator(),
|t| t.newline(),
|t| t.symbol(),
|t| t.comment(),
|t| t.number(),
|t| t.text(),
];
@ -752,7 +752,7 @@ impl<'s> Lexer<'s> {
}
}
if self.current_char != None {
panic!("Internal error. Lexer did not consume all input.");
panic!("Internal error. Lexer did not consume all input. State: {self:?}");
}
while self.end_block().is_some() {
let block_end = self.marker_token(token::Variant::block_end());
@ -902,6 +902,11 @@ mod tests {
]))
}
#[test]
fn test_numeric_literal() {
test_lexer("10", vec![number_("", "10")]);
}
#[test]
fn test_case_idents() {
test_lexer_many(vec![

lib/rust/parser/src/lib.rs (new file)
View File

@ -0,0 +1,265 @@
//! The Enso parser. Parsing is a multi-stage process:
//!
//! # Lexing.
//! First, the source code is fed to [`lexer::Lexer`], which consumes it and outputs a stream of
//! [`Token`]. Tokens are chunks of the input with a generic description attached, like "operator",
//! or "identifier".
//!
//! # Building macro registry.
//! Macros in Enso are a very powerful mechanism and are used to transform groups of tokens into
//! almost any statement. First, macros need to be discovered and registered. Currently, there is no
//! real macro discovery process, as there is no support for user-defined macros. Instead, there is
//! a set of hardcoded macros defined in the compiler.
//!
//! Each macro defines one or more segments. Every segment starts with a predefined token and can
//! contain any number of other tokens. For example, the macro `if ... then ... else ...` contains
//! three segments. Macros can also accept prefix tokens, a set of tokens on the left of the first
//! segment. A good example is the lambda macro `... -> ...`.
//!
//! In this step, a [`MacroMatchTree`] is built. Basically, it is a map from the possible next
//! segment name to information about which other segments are required and which macro definition
//! applies when those segments are found. For example, let's consider two macros: `if ... then ...`,
//! and `if ... then ... else ...`. In such a case, the macro registry will contain only one entry,
//! "if", and two sets of possible resolution paths: ["then"], and ["then", "else"], each associated
//! with the corresponding macro definition.
//!
//! # Splitting the token stream by the macro segments.
//! The input token stream is being iterated and is being split based on the segments of the
//! registered macros. For example, for the input `if a b then c d else e f`, the token stream will
//! be split into three segments, `a b`, `c d`, and `e f`, which will be associated with the
//! `if ... then ... else ...` macro definition.
//!
//! The splitting process is hierarchical. It means that a new macro can start being resolved during
//! resolution of a parent macro. For example, `if if a then b then c else d` is a correct
//! expression. After finding the first `if` token, the token stream will be split. The next `if`
//! token starts a new token stream splitting. The first `then` token belongs to the nested macro,
//! however, as soon as the resolver sees the second `then` token, it will consider the nested macro
//! to be finished, and will come back to parent macro resolution.
//!
//! # Resolving right-hand-side patterns of macro segments.
//! In the next steps, each macro is analyzed, starting from the most nested ones. For each
//! macro, the [`Pattern`] of the last segment is run to check which tokens belong to that macro,
//! and which tokens should be transferred to the parent macro definition. For example, consider the
//! following code `process (read file) content-> print content`. The `(...)` is a macro with two
//! sections `(` and `)`. Let's mark the token splitting with `[` and `]` characters. The previous
//! macro resolution steps would output such split of the token stream:
//! `process [(read file][) content[-> print content]]`. In this step, the innermost macro will be
//! analyzed first. The pattern of the last segment of the inner macro (`->`) defines that it
//! consumes all tokens, so all the tokens `print content` are left as they are. Now, the resolution
//! moves to the parent macro. Its last segment starts with the `)` token, whose pattern defines
//! that it does not consume any tokens, so all of its current tokens (`content[-> print content]]`)
//! are popped to a parent definition, forming `process [(read file][)] content[-> print content]`.
//!
//! Please note that the root of the expression is considered a special macro as well. This is done
//! to unify the algorithm.
//!
//! # Resolving left-hand-side patterns of macro segments.
//! In this step, each macro is analyzed, starting from the most nested ones. For each macro,
//! the [`Pattern`] of the macro prefix is being run to check which tokens belong to the prefix of
//! the macro (in case the macro defines the prefix). In the example above, the macro `->` defines
//! complex prefix rules: if the token on the left of the arrow used no space, then only a single
//! token will be consumed. As a result of this step, the following token split will occur:
//! `[process [(read file][)] [content-> print content]`, which is exactly what we wanted.
//!
//! # Resolving patterns of macro segments.
//! In this step, all macro segment patterns are being resolved and errors are reported in case it
//! was not possible. If tokens in a segment match the segment pattern, they are sent to the
//! operator precedence resolver for final transformation.
//!
//! # Operator precedence resolution.
//! Each token stream sent to the operator resolver is processed by a modified Shunting Yard
//! algorithm, which handles such situations as multiple operators placed next to each other,
//! multiple identifiers placed next to each other, and also takes spacing into consideration in
//! order to implement spacing-aware precedence rules. After all segments are resolved, the macro
//! is being treated as a single token in one of the segments of the parent macro, and is being
//! processed by the operator precedence resolver as well. In the end, a single [`syntax::Tree`] is
//! produced, containing the parsed expression.
#![recursion_limit = "256"]
// === Features ===
#![allow(incomplete_features)]
#![feature(allocator_api)]
#![feature(test)]
#![feature(specialization)]
#![feature(let_chains)]
#![feature(if_let_guard)]
// === Standard Linter Configuration ===
#![deny(non_ascii_idents)]
#![warn(unsafe_code)]
// === Non-Standard Linter Configuration ===
#![allow(clippy::option_map_unit_fn)]
#![allow(clippy::precedence)]
#![allow(dead_code)]
#![deny(unconditional_recursion)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unused_import_braces)]
#![warn(unused_qualifications)]
use crate::prelude::*;
// ==============
// === Export ===
// ==============
pub mod lexer;
pub mod macros;
pub mod serialization;
pub mod source;
pub mod syntax;
/// Popular utilities, imported by most modules of this crate.
pub mod prelude {
pub use enso_prelude::serde_reexports::*;
pub use enso_prelude::*;
pub use enso_reflect as reflect;
pub use enso_reflect::Reflect;
pub use enso_types::traits::*;
pub use enso_types::unit2::Bytes;
}
// ==============
// === Parser ===
// ==============
/// Enso parser. See the module documentation to learn more about how it works.
#[allow(missing_docs)]
#[derive(Debug)]
pub struct Parser {
pub macros: macros::resolver::SegmentMap<'static>,
}
impl Parser {
/// Constructor.
pub fn new() -> Self {
let macros = macros::built_in::all();
Self { macros }
}
/// Main entry point.
pub fn run<'s>(&self, code: &'s str) -> syntax::Tree<'s> {
let tokens = lexer::run(code);
let mut statements = vec![];
let mut tokens = tokens.into_iter().peekable();
while tokens.peek().is_some() {
let resolver = macros::resolver::Resolver::new_root();
let tree = resolver.run(&self.macros, &mut tokens);
let tree = expression_to_statement(tree);
statements.push(tree);
}
syntax::Tree::block(statements)
}
}
impl Default for Parser {
fn default() -> Self {
Self::new()
}
}
// == Parsing helpers ==
/// Reinterpret an expression in a statement context (i.e. as a top level member of a block).
///
/// In statement context, an expression that has an assignment operator at its top level is
/// interpreted as a variable assignment or method definition.
fn expression_to_statement(tree: syntax::Tree<'_>) -> syntax::Tree<'_> {
use syntax::tree::*;
let tree_ = match &*tree.variant {
Variant::OprSectionBoundary(OprSectionBoundary { ast }) => ast,
_ => &tree,
};
let mut replacement = None;
if let Variant::OprApp(opr_app) = &*tree_.variant {
replacement = expression_to_binding(opr_app);
}
match replacement {
Some(modified) => modified,
None => tree,
}
}
/// If the input is an "=" expression, try to interpret it as either a variable assignment or method
/// definition.
fn expression_to_binding<'a>(app: &syntax::tree::OprApp<'a>) -> Option<syntax::Tree<'a>> {
use syntax::tree::*;
match app {
OprApp { lhs: Some(lhs), opr: Ok(opr), rhs } if opr.code == "=" => {
let mut lhs = lhs;
let mut args = vec![];
while let Variant::App(App { func, arg }) = &*lhs.variant {
lhs = func;
args.push(arg.clone());
}
args.reverse();
if let Some(rhs) = rhs && args.is_empty() {
Some(Tree::assignment(lhs.clone(), opr.clone(), rhs.clone()))
} else if let Variant::Ident(Ident { token }) = &*lhs.variant {
Some(Tree::function(token.clone(), args, opr.clone(), rhs.clone()))
} else {
None
}
}
_ => None,
}
}
// =============
// === Tests ===
// =============
#[cfg(test)]
mod tests {
use super::*;
use enso_parser_syntax_tree_builder::ast_builder;
macro_rules! test_parse {
($input:tt = {$($def:tt)*}) => {
assert_eq!(
Parser::new().run($input),
ast_builder! { $($def)* }
)
};
}
#[test]
fn test_expressions() {
test_parse! {"a" = {a}};
test_parse! {"a b" = {a b}};
test_parse! {"a b c" = {[a b] c}};
}
}
// ==================
// === Benchmarks ===
// ==================
#[cfg(test)]
mod benches {
use super::*;
extern crate test;
use test::Bencher;
#[bench]
fn bench_parsing_type_defs(bencher: &mut Bencher) {
let reps = 1_000;
let str = "type Option a b c\n".repeat(reps);
let parser = Parser::new();
bencher.iter(move || {
parser.run(&str);
});
}
}
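
To illustrate the statement interpretation described in the module documentation and implemented by `expression_to_statement` above, here is a minimal usage sketch (it assumes nothing beyond the public API defined in this file; the `Debug` output itself is not reproduced here):

```rust
use enso_parser::Parser;

fn main() {
    // "foo = 23" parses as an `OprApp` with "=" at the top level, so in
    // statement context it becomes an `Assignment` inside the returned block.
    let tree = Parser::new().run("foo = 23");
    println!("{:#?}", tree);
}
```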

View File

@ -71,7 +71,8 @@ fn type_def_body(matched_segments: NonEmptyVec<MatchedSegment>) -> syntax::Tree
// println!("{:#?}", name);
// println!("\n\n------------- 2");
let params = v.nested().query("param").unwrap();
let no_params = vec![];
let params = v.nested().query("param").unwrap_or(&no_params);
// println!("{:#?}", params);
// println!("\n\n------------- 3");

View File

@ -1,79 +1,4 @@
//! The Enso parser. Parsing is a multi-stage process:
//!
//! # Lexing.
//! First, the source code is feed to [`lexer::Lexer`], which consumes it and outputs a stream of
//! [`Token`]. Tokens are chunks of the input with a generic description attached, like "operator",
//! or "identifier".
//!
//! # Building macro registry.
//! Macros in Enso are a very powerful mechanism and are used to transform group of tokens into
//! almost any statement. First, macros need to be discovered and registered. Currently, there is no
//! real macro discovery process, as there is no support for user-defined macros. Instead, there is
//! a set of hardcoded macros defined in the compiler.
//!
//! Each macro defines one or more segments. Every segment starts with a predefined token and can
//! contain any number of other tokens. For example, the macro `if ... then ... else ...` contains
//! three segments. Macros can also accept prefix tokens, a set of tokens on the left of the first
//! segment. A good example is the lambda macro `... -> ...`.
//!
//! In this step, a [`MacroMatchTree`] is built. Basically, it is a map from the possible next
//! segment name to information of what other segments are required and what is the macro definition
//! in case these segments were found. For example, let's consider two macros: `if ... then ...`,
//! and `if ... then ... else ...`. In such a case, the macro registry will contain only one entry,
//! "if", and two sets of possible resolution paths: ["then"], and ["then", "else"], each associated
//! with the corresponding macro definition.
//!
//! # Splitting the token stream by the macro segments.
//! The input token stream is being iterated and is being split based on the segments of the
//! registered macros. For example, for the input `if a b then c d else e f`, the token stream will
//! be split into three segments, `a b`, `c d`, and `e f`, which will be associated with the
//! `if ... then ... else ...` macro definition.
//!
//! The splitting process is hierarchical. It means that a new macro can start being resolved during
//! resolution of a parent macro. For example, `if if a then b then c else d` is a correct
//! expression. After finding the first `if` token, the token stream will be split. The next `if`
//! token starts a new token stream splitting. The first `then` token belongs to the nested macro,
//! however, as soon as the resolver sees the second `then` token, it will consider the nested macro
//! to be finished, and will come back to parent macro resolution.
//!
//! # Resolving right-hand-side patterns of macro segments.
//! In the next steps, each macro is being analyzed, started from the most nested ones. For each
//! macro, the [`Pattern`] of last segment is being run to check which tokens belong to that macro,
//! and which tokens should be transferred to parent macro definition. For example, consider the
//! following code `process (read file) content-> print content`. The `(...)` is a macro with two
//! sections `(` and `)`. Let's mark the token splitting with `[` and `]` characters. The previous
//! macro resolution steps would output such split of the token stream:
//! `process [(read file][) content[-> print content]]`. In this step, the most inner macro will be
//! analyzed first. The pattern of the last segment of the inner macro (`->`) defines that it
//! consumes all tokens, so all the tokens `print content` are left as they are. Now, the resolution
//! moves to the parent macro. Its last segment starts with the `)` token, which pattern defines
//! that it does not consume any tokens, so all of its current tokens (`content[-> print content]]`)
//! are popped to a parent definition, forming `process [(read file][)] content[-> print content]`.
//!
//! Please note, that root of the expression is considered a special macro as well. It is done for
//! the algorithm unification purposes.
//!
//! # Resolving left-hand-side patterns of macro segments.
//! In this step, each macro is being analyzed, started from the most nested ones. For each macro,
//! the [`Pattern`] of the macro prefix is being run to check which tokens belong to the prefix of
//! the macro (in case the macro defines the prefix). In the example above, the macro `->` defines
//! complex prefix rules: if the token on the left of the arrow used no space, then only a single
//! token will be consumed. As a result of this step, the following token split will occur:
//! `[process [(read file][)] [content-> print content]`, which is exactly what we wanted.
//!
//! # Resolving patterns of macro segments.
//! In this step, all macro segment patterns are being resolved and errors are reported in case it
//! was not possible. If tokens in a segment match the segment pattern, they are sent to the
//! operator precedence resolver for final transformation.
//!
//! # Operator precedence resolution.
//! Each token stream sent to the operator resolver is processed by a modified Shunting Yard
//! algorithm, which handles such situations as multiple operators placed next to each other,
//! multiple identifiers placed next to each other, and also takes spacing into consideration in
//! order to implement spacing-aware precedence rules. After all segments are resolved, the macro
//! is being treated as a single token in one of the segments of the parent macro, and is being
//! processed by the operator precedence resolver as well. In the end, a single [`syntax::Tree`] is
//! produced, containing the parsed expression.
//! Tests for [`enso_parser`].
#![recursion_limit = "256"]
// === Features ===
@ -82,6 +7,7 @@
#![feature(test)]
#![feature(specialization)]
#![feature(let_chains)]
#![feature(if_let_guard)]
// === Standard Linter Configuration ===
#![deny(non_ascii_idents)]
#![warn(unsafe_code)]
@ -98,70 +24,7 @@
#![warn(unused_import_braces)]
#![warn(unused_qualifications)]
use crate::prelude::*;
// ==============
// === Export ===
// ==============
pub mod lexer;
pub mod macros;
pub mod serialization;
pub mod source;
pub mod syntax;
/// Popular utilities, imported by most modules of this crate.
pub mod prelude {
pub use enso_prelude::serde_reexports::*;
pub use enso_prelude::*;
pub use enso_reflect as reflect;
pub use enso_reflect::Reflect;
pub use enso_types::traits::*;
pub use enso_types::unit2::Bytes;
}
// ==============
// === Parser ===
// ==============
/// Enso parser. See the module documentation to learn more about how it works.
#[allow(missing_docs)]
#[derive(Debug)]
pub struct Parser {
pub macros: macros::resolver::SegmentMap<'static>,
}
impl Parser {
/// Constructor.
pub fn new() -> Self {
let macros = macros::built_in::all();
Self { macros }
}
/// Main entry point.
pub fn run<'s>(&self, code: &'s str) -> syntax::Tree<'s> {
let tokens = lexer::run(code);
let mut statements = vec![];
let mut tokens = tokens.into_iter().peekable();
while tokens.peek().is_some() {
let resolver = macros::resolver::Resolver::new_root();
let tree = resolver.run(&self.macros, &mut tokens);
statements.push(tree);
}
syntax::Tree::module(statements)
}
}
impl Default for Parser {
fn default() -> Self {
Self::new()
}
}
use enso_parser::prelude::*;
@ -171,52 +34,7 @@ impl Default for Parser {
fn main() {
init_tracing(TRACE);
let ast = Parser::new().run("type Option (a) b c");
let ast = enso_parser::Parser::new().run("type Option (a) b c");
println!("\n\n==================\n\n");
println!("{:#?}", ast);
}
#[cfg(test)]
mod tests {
use super::*;
use enso_parser_syntax_tree_builder::ast_builder;
macro_rules! test_parse {
($input:tt = {$($def:tt)*}) => {
assert_eq!(
Parser::new().run($input),
ast_builder! { $($def)* }
)
};
}
#[test]
fn test_expressions() {
test_parse! {"a" = {a}};
test_parse! {"a b" = {a b}};
test_parse! {"a b c" = {[a b] c}};
}
}
// ==================
// === Benchmarks ===
// ==================
#[cfg(test)]
mod benches {
use super::*;
extern crate test;
use test::Bencher;
#[bench]
fn bench_parsing_type_defs(bencher: &mut Bencher) {
let reps = 1_000;
let str = "type Option a b c\n".repeat(reps);
let parser = Parser::new();
bencher.iter(move || {
parser.run(&str);
});
}
}

View File

@ -39,6 +39,7 @@ pub(crate) fn serialize_cow<S>(cow: &Cow<'_, str>, ser: S) -> Result<S::Ok, S::E
where S: serde::Serializer {
let s = match cow {
Cow::Borrowed(s) => *s,
Cow::Owned(s) if s.is_empty() => "",
Cow::Owned(_) => panic!(),
};
let begin = s.as_ptr() as u32;

View File

@ -15,7 +15,7 @@ use crate::prelude::*;
pub struct Code<'s> {
#[serde(serialize_with = "crate::serialization::serialize_cow")]
#[serde(deserialize_with = "crate::serialization::deserialize_cow")]
#[reflect(as = "crate::serialization::Code", flatten)]
#[reflect(as = "crate::serialization::Code")]
pub repr: Cow<'s, str>,
}

View File

@ -62,8 +62,9 @@ impl From<&str> for VisibleOffset {
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Reflect, Deserialize)]
#[allow(missing_docs)]
pub struct Offset<'s> {
#[reflect(hide)]
pub visible: VisibleOffset,
#[reflect(flatten)]
#[reflect(flatten, hide)]
pub code: Code<'s>,
}

View File

@ -46,6 +46,7 @@ impl<'s> Item<'s> {
match self {
Item::Token(token) => match token.variant {
token::Variant::Ident(ident) => Tree::ident(token.with_variant(ident)),
token::Variant::Number(number) => Tree::number(token.with_variant(number)),
_ => todo!(),
},
Item::Tree(ast) => ast,

View File

@ -16,6 +16,7 @@ use crate::syntax::token::Token;
// computations for any operator (according to the spec)
fn precedence_of(operator: &str) -> usize {
match operator {
"=" => 1,
"+" => 3,
"-" => 3,
"*" => 7,

View File

@ -68,13 +68,17 @@ macro_rules! with_ast_definition { ($f:ident ($($args:tt)*)) => { $f! { $($args)
pub error: Error,
pub ast: Tree<'s>,
},
Module {
Block {
pub statements: Vec<Tree<'s>>,
},
/// A simple identifier, like `foo` or `bar`.
Ident {
pub token: token::Ident<'s>,
},
/// A numeric literal, like `10`.
Number {
pub token: token::Number<'s>,
},
/// A simple application, like `print "hello"`.
App {
pub func: Tree<'s>,
@ -106,12 +110,22 @@ macro_rules! with_ast_definition { ($f:ident ($($args:tt)*)) => { $f! { $($args)
MultiSegmentApp {
pub segments: NonEmptyVec<MultiSegmentAppSegment<'s>>,
},
TypeDef {
pub keyword: Token<'s>,
pub name: Tree<'s>,
pub params: Vec<Tree<'s>>,
}
},
Assignment {
pub pattern: Tree<'s>,
pub equals: token::Operator<'s>,
pub expr: Tree<'s>,
},
Function {
pub name: token::Ident<'s>,
pub args: Vec<Tree<'s>>,
pub equals: token::Operator<'s>,
pub body: Option<Tree<'s>>,
},
}
}};}

View File

@ -39,7 +39,7 @@ use std::mem;
#[proc_macro]
pub fn ast_builder(tokens: proc_macro::TokenStream) -> proc_macro::TokenStream {
let output = expr(tokens, None);
let output = quote!(crate::syntax::Tree::module(vec![#output]));
let output = quote!(crate::syntax::Tree::block(vec![#output]));
output.into()
}

View File

@ -0,0 +1,206 @@
//! Parse expressions and compare their results to expected values.
// === Non-Standard Linter Configuration ===
#![allow(clippy::option_map_unit_fn)]
#![allow(clippy::precedence)]
#![allow(dead_code)]
#![deny(non_ascii_idents)]
#![deny(unconditional_recursion)]
#![warn(unsafe_code)]
#![warn(missing_copy_implementations)]
#![warn(missing_debug_implementations)]
#![warn(missing_docs)]
#![warn(trivial_casts)]
#![warn(trivial_numeric_casts)]
#![warn(unused_import_braces)]
#![warn(unused_qualifications)]
use lexpr::sexp;
// ===========================
// === Test support macros ===
// ===========================
/// Parses input as a sequence of S-expressions, and wraps it in a `Block`.
macro_rules! block {
( $statements:tt ) => {
sexp![(Block #($statements))]
}
}
// =============
// === Tests ===
// =============
#[test]
fn application() {
test("a b c", block![(App (App (Ident a) (Ident b)) (Ident c))]);
}
#[test]
fn type_definition_bool() {
test("type Bool", block![(TypeDef (Ident type) (Ident Bool) #())]);
}
#[test]
fn type_definition_option() {
test("type Option a", block![(TypeDef (Ident type) (Ident Option) #((Ident a)))]);
}
#[test]
fn assignment_simple() {
test("foo = 23", block![(Assignment (Ident foo) "=" (Number 23))]);
}
#[test]
fn function_inline_simple_args() {
test("foo a = 23", block![(Function foo #((Ident a)) "=" (Number 23))]);
test("foo a b = 23", block![(Function foo #((Ident a) (Ident b)) "=" (Number 23))]);
test("foo a b c = 23", block![(Function foo #((Ident a) (Ident b) (Ident c)) "=" (Number 23))]);
}
#[test]
fn function_block_noargs() {
test("foo =", block![(Function foo #() "=" ())]);
}
#[test]
fn function_block_simple_args() {
test("foo a =", block![(Function foo #((Ident a)) "=" ())]);
test("foo a b =", block![(Function foo #((Ident a) (Ident b)) "=" ())]);
test("foo a b c =", block![(Function foo #((Ident a) (Ident b) (Ident c)) "=" ())]);
}
// ====================
// === Test Support ===
// ====================
use enso_metamodel_lexpr::ToSExpr;
use enso_reflect::Reflect;
use std::collections::HashSet;
/// Given a block of input Enso code, test that:
/// - The given code parses to the AST represented by the given S-expression.
/// - The AST pretty-prints back to the original code.
///
/// The S-expression format is as documented for [`enso_metamodel_lexpr`], with some
/// postprocessing:
/// - For concision, field names are stripped (as if all structs were tuple structs).
/// - Most token types are represented as their contents, rather than as a token struct. For
/// example, a `token::Number` may be represented like: `sexp![10]`, and a `token::Ident` may look
/// like `sexp![foo]`.
fn test(code: &str, expect: lexpr::Value) {
let ast = enso_parser::Parser::new().run(code);
let ast_s_expr = to_s_expr(&ast, code);
assert_eq!(ast_s_expr.to_string(), expect.to_string());
assert_eq!(ast.code(), code);
}
// =====================
// === S-expressions ===
// =====================
/// Produce an S-expression representation of the input AST type.
pub fn to_s_expr<T>(value: &T, code: &str) -> lexpr::Value
where T: serde::Serialize + Reflect {
let (graph, rust_to_meta) = enso_metamodel::rust::to_meta(value.reflect_type());
let ast_ty = rust_to_meta[&value.reflect_type().id];
let base = code.as_bytes().as_ptr() as usize;
let code: Box<str> = Box::from(code);
let mut to_s_expr = ToSExpr::new(&graph);
to_s_expr.mapper(ast_ty, strip_hidden_fields);
let ident_token = rust_to_meta[&enso_parser::syntax::token::variant::Ident::reflect().id];
let operator_token = rust_to_meta[&enso_parser::syntax::token::variant::Operator::reflect().id];
let number_token = rust_to_meta[&enso_parser::syntax::token::variant::Number::reflect().id];
let token_to_str = move |token: lexpr::Value| {
let range = token_code_range(&token, base);
code[range].to_owned().into_boxed_str()
};
let token_to_str_ = token_to_str.clone();
to_s_expr.mapper(ident_token, move |token| lexpr::Value::symbol(token_to_str_(token)));
let token_to_str_ = token_to_str.clone();
to_s_expr.mapper(operator_token, move |token| lexpr::Value::string(token_to_str_(token)));
let token_to_str_ = token_to_str;
to_s_expr.mapper(number_token, move |token| {
lexpr::Value::Number(token_to_str_(token).parse::<u64>().unwrap().into())
});
tuplify(to_s_expr.value(ast_ty, &value))
}
/// Strip certain fields that should be excluded from output.
fn strip_hidden_fields(tree: lexpr::Value) -> lexpr::Value {
let hidden_tree_fields =
[":spanLeftOffsetVisible", ":spanLeftOffsetCodeRepr", ":spanCodeLength"];
let hidden_tree_fields: HashSet<_> = hidden_tree_fields.into_iter().collect();
lexpr::Value::list(tree.to_vec().unwrap().into_iter().filter(|val| match val {
lexpr::Value::Cons(cons) => match cons.car() {
lexpr::Value::Symbol(symbol) => !hidden_tree_fields.contains(symbol.as_ref()),
_ => panic!(),
},
_ => true,
}))
}
/// Given an S-expression representation of a [`Token`] and the base address for `Code` `Cow`s,
/// return the range of the input code the token references.
fn token_code_range(token: &lexpr::Value, base: usize) -> std::ops::Range<usize> {
let code_repr = fields(token).find(|(name, _)| *name == ":codeRepr").unwrap().1;
let mut begin = None;
let mut len = None;
for (name, value) in fields(code_repr) {
match name {
":begin" => begin = Some(value.as_u64().unwrap() as u32),
":len" => len = Some(value.as_u64().unwrap() as u32),
_ => (),
}
}
let begin = begin.unwrap();
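// The serialized `begin` holds only the low 32 bits of a pointer into the source; splice in the
// high bits of `base` and correct for possible 32-bit wrap-around before computing the offset
// relative to `base`.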
let begin = (begin as u64) | (base as u64 & !0xFFFF_FFFF);
let begin = if begin < (base as u64) { begin + 0x1_0000_0000 } else { begin };
let begin = begin as usize - base;
let len = len.unwrap() as usize;
begin..(begin + len)
}
/// Iterate the field `(name, value)` pairs of the S-expression of a struct with named fields.
fn fields(value: &'_ lexpr::Value) -> impl Iterator<Item = (&'_ str, &'_ lexpr::Value)> {
value.list_iter().unwrap().filter_map(|value| match value {
lexpr::Value::Cons(cons) => match cons.car() {
lexpr::Value::Symbol(symbol) => Some((&symbol[..], cons.cdr())),
_ => None,
},
_ => None,
})
}
/// Strip field names from struct representations, so that they are printed more concisely, as if
/// they were tuple-structs.
fn tuplify(value: lexpr::Value) -> lexpr::Value {
let (car, cdr) = match value {
lexpr::Value::Cons(cons) => cons.into_pair(),
lexpr::Value::Vector(mut vector) => {
for value in vector.iter_mut() {
let original = std::mem::replace(value, lexpr::Value::Nil);
*value = tuplify(original);
}
return lexpr::Value::Vector(vector);
}
value => return value,
};
if let lexpr::Value::Symbol(symbol) = &car {
if let Some(':') = symbol.chars().next() {
return tuplify(cdr);
}
}
let car = tuplify(car);
let cdr = tuplify(cdr);
lexpr::Value::Cons(lexpr::Cons::new(car, cdr))
}
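
As a hypothetical illustration of the field stripping described above (this snippet is not part of the commit; it only uses `lexpr` constructors already shown in this changeset), `tuplify` turns an alist-style field such as `(:value . 36)` into the bare value:

```rust
// The single-element list ((:value . 36)) becomes (36) once the field name is stripped.
let named = lexpr::Value::cons(
    lexpr::Value::cons(lexpr::Value::symbol(":value"), lexpr::Value::Number(36.into())),
    lexpr::Value::Null,
);
let expected = lexpr::Value::cons(lexpr::Value::Number(36.into()), lexpr::Value::Null);
assert_eq!(tuplify(named), expected);
```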

View File

@ -10,4 +10,5 @@ enso-metamodel = { path = "../metamodel", features = ["rust"] }
derivative = "2.2"
[features]
default = ["graphviz"]
graphviz = ["enso-metamodel/graphviz"]