diff --git a/Cargo.lock b/Cargo.lock index 23b2b286d92..47ffdc73714 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2035,6 +2035,18 @@ dependencies = [ "derive_more", ] +[[package]] +name = "enso-metamodel-lexpr" +version = "0.1.0" +dependencies = [ + "bincode", + "derivative", + "enso-metamodel", + "enso-reflect", + "lexpr", + "serde", +] + [[package]] name = "enso-optics" version = "0.2.0" @@ -2048,12 +2060,15 @@ version = "0.1.0" dependencies = [ "bincode", "enso-data-structures", + "enso-metamodel", + "enso-metamodel-lexpr", "enso-parser-syntax-tree-builder", "enso-parser-syntax-tree-visitor", "enso-prelude", "enso-reflect", "enso-shapely-macros", "enso-types", + "lexpr", "serde", ] @@ -3650,7 +3665,7 @@ dependencies = [ "base64 0.13.0", "bytes 1.1.0", "http", - "httpdate 0.3.2", + "httpdate 1.0.2", "language-tags 0.3.2", "mime 0.3.16", "percent-encoding 2.1.0", @@ -4075,6 +4090,29 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lexpr" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceee0b80e0043f17bf81130471e1b0975179af75fe657af45577d80e2698fe3b" +dependencies = [ + "itoa 0.4.8", + "lexpr-macros", + "proc-macro-hack", + "ryu", +] + +[[package]] +name = "lexpr-macros" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd627fb38e19c00d8d068618259205f7a91c91aeade5c15bc35dbca037bb1c35" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", +] + [[package]] name = "libc" version = "0.2.126" diff --git a/lib/rust/metamodel/Cargo.toml b/lib/rust/metamodel/Cargo.toml index 9bfe40cfa50..bf945dd7778 100644 --- a/lib/rust/metamodel/Cargo.toml +++ b/lib/rust/metamodel/Cargo.toml @@ -12,6 +12,7 @@ derive_more = "0.99" bincode = "1.3" [features] +default = ["graphviz", "java", "rust"] graphviz = [] java = [] rust = [] diff --git a/lib/rust/metamodel/lexpr/Cargo.toml b/lib/rust/metamodel/lexpr/Cargo.toml new file mode 100644 index 00000000000..786e01d22a4 --- /dev/null +++ b/lib/rust/metamodel/lexpr/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "enso-metamodel-lexpr" +version = "0.1.0" +edition = "2021" +authors = ["Enso Team "] + +[dependencies] +lexpr = "0.2.6" +bincode = "1.3" +serde = "1" +enso-metamodel = { path = "../" } +derivative = "2.2" + +[dev-dependencies] +serde = { version = "1", features = ["derive"] } +enso-reflect = { path = "../../reflect" } diff --git a/lib/rust/metamodel/lexpr/src/lib.rs b/lib/rust/metamodel/lexpr/src/lib.rs new file mode 100644 index 00000000000..ae513f15d78 --- /dev/null +++ b/lib/rust/metamodel/lexpr/src/lib.rs @@ -0,0 +1,277 @@ +//! Producing S-expression representations of data based on reflection information. +//! +//! The chosen output format is compact--more so than the pretty-printing supported by `lexpr` +//! (which is what is used by `serde_lexpr` to derive an S-expression "format" for data). +//! +//! - A struct is represented as a list of its fields. +//! - No type names are emitted. For variant types, the discriminant is included before the fields. +//! - Named fields are represented with the structure used for Lisp's `alist`s: `(name . value)`. +//! - Field names are prefixed with ':'. +//! - Sequence types like Rust's `Vec<_>` are represent with `lexpr` `Vector`s: `#(element element)` +//! - An option prints the same way as its contained value in the `Some` case, or as an empty list +//! 
`()` in the `None` case. +//! +//! # Example +//! +//! ``` +//! # use enso_reflect::prelude::*; +//! # use serde::Serialize; +//! use enso_reflect::Reflect; +//! use lexpr::Value; +//! +//! // A type, and a value. We'd like to see the S-expr representation of the value. +//! #[derive(Serialize, Reflect)] +//! struct A { +//! value: u32, +//! } +//! let value = A { value: 36 }; +//! // Get `meta` type info for the type. +//! let (graph, rust_to_meta) = enso_metamodel::rust::to_meta(value.reflect_type()); +//! let value_ty = rust_to_meta[&value.reflect_type().id]; +//! // Use the type info to get a representation of an instance's data. +//! let s_expr = enso_metamodel_lexpr::ToSExpr::new(&graph).value(value_ty, &value); +//! let field_expr = Value::cons(Value::symbol(":value"), Value::Number(36.into())); +//! assert_eq!(s_expr, Value::cons(field_expr, Value::Null)); +//! ``` + +// === Features === +// === Standard Linter Configuration === +#![deny(non_ascii_idents)] +#![warn(unsafe_code)] +// === Non-Standard Linter Configuration === +#![allow(clippy::option_map_unit_fn)] +#![allow(clippy::precedence)] +#![allow(dead_code)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unused_import_braces)] +#![warn(unused_qualifications)] + +use derivative::Derivative; +use enso_metamodel::meta::*; +use lexpr::Value; +use std::collections::BTreeMap; + + + +// ============================= +// === Meta to S-expressions === +// ============================= + +/// Render data to an S-expression representation based on its `meta` model. +#[derive(Derivative)] +#[derivative(Debug)] +pub struct ToSExpr<'g> { + graph: &'g TypeGraph, + #[derivative(Debug = "ignore")] + mappers: BTreeMap Value>>, +} + +impl<'g> ToSExpr<'g> { + #[allow(missing_docs)] + pub fn new(graph: &'g TypeGraph) -> Self { + let mappers = Default::default(); + Self { graph, mappers } + } + + /// Set a transformation to be applied to a type after translating to an S-expression. + pub fn mapper(&mut self, id: TypeId, f: impl Fn(Value) -> Value + 'static) { + self.mappers.insert(id, Box::new(f)); + } + + /// Given a bincode-serialized input, use its `meta` type info to transcribe it to an + /// S-expression. 
+ pub fn value(&self, id: TypeId, input: &T) -> Value { + use bincode::Options; + let bincoder = bincode::DefaultOptions::new().with_fixint_encoding(); + let data = bincoder.serialize(input).unwrap(); + let mut data = &data[..]; + let value = self.value_(id, &mut data); + assert_eq!(data, &[0; 0], "{}", value); + value + } +} + + +// === Implementation === + +impl<'g> ToSExpr<'g> { + fn value_(&self, id: TypeId, data: &mut &[u8]) -> Value { + match &self.graph[id].data { + Data::Struct(_) => self.struct_(id, data), + Data::Primitive(primitive) => self.primitive(*primitive, data), + } + } + + fn struct_(&self, id: TypeId, data: &mut &[u8]) -> Value { + let mut hierarchy = vec![]; + let mut child = None; + let discriminants = &self.graph[id].discriminants; + if !discriminants.is_empty() { + let discriminant_index = read_u32(data); + let child_ = discriminants[&(discriminant_index as usize)]; + hierarchy.push(child_); + child = Some(child_); + } + hierarchy.push(id); + let mut id_ = id; + while let Some(parent) = self.graph[id_].parent { + hierarchy.push(parent); + id_ = parent; + } + let mut out = vec![]; + self.fields(&mut hierarchy, data, &mut out); + assert_eq!(hierarchy, &[]); + let mut value = Value::list(out); + if let Some(id) = child { + if let Some(mapper) = self.mappers.get(&id) { + value = (mapper)(value); + if !value.is_cons() { + value = Value::cons(value, Value::Null); + } + }; + let discriminant = self.graph[id].name.to_pascal_case().into_boxed_str(); + let discriminant = Value::Symbol(discriminant); + value = Value::cons(discriminant, value); + } + if let Some(mapper) = self.mappers.get(&id) { + value = (mapper)(value); + } + value + } + + fn fields(&self, hierarchy: &mut Vec, data: &mut &[u8], out: &mut Vec) { + let id = match hierarchy.pop() { + Some(id) => id, + None => return, + }; + let fields = match &self.graph[id].data { + Data::Struct(fields) => fields, + Data::Primitive(_) => panic!(), + }; + if self.graph[id].child_field == Some(0) || fields.is_empty() { + self.fields(hierarchy, data, out); + } + for (i, field) in fields.iter().enumerate() { + if !field.name.is_empty() { + let car = Value::Symbol(format!(":{}", field.name).into_boxed_str()); + let cdr = self.value_(field.type_, data); + out.push(Value::cons(car, cdr)); + } else { + out.push(self.value_(field.type_, data)); + } + if self.graph[id].child_field == Some(i + 1) { + self.fields(hierarchy, data, out); + } + } + } + + fn primitive(&self, primitive: Primitive, data: &mut &[u8]) -> Value { + match primitive { + Primitive::U32 => Value::Number(read_u32(data).into()), + Primitive::U64 => Value::Number(read_u64(data).into()), + Primitive::Bool => { + let value = read_u8(data); + let value = match value { + 0 => false, + 1 => true, + _ => panic!(), + }; + Value::Bool(value) + } + Primitive::String => Value::String(read_string(data).into()), + Primitive::Sequence(t0) => { + let len = read_u64(data); + Value::vector((0..len).map(|_| self.value_(t0, data))) + } + Primitive::Option(t0) => match read_u8(data) { + 0 => Value::Null, + 1 => self.value_(t0, data), + _ => panic!(), + }, + Primitive::Result(t0, t1) => { + let mut values = vec![]; + match read_u32(data) { + 0 => { + values.push(Value::Symbol("Ok".to_owned().into_boxed_str())); + values.push(self.value_(t0, data)); + } + 1 => { + values.push(Value::Symbol("Err".to_owned().into_boxed_str())); + values.push(self.value_(t1, data)); + } + _ => panic!(), + } + Value::list(values) + } + } + } +} + + +// === Primitive Deserializers === + +fn read_u8(buffer: &mut 
&[u8]) -> u8 { + let (bytes, rest) = buffer.split_at(1); + *buffer = rest; + bytes[0] +} + +fn read_u32(buffer: &mut &[u8]) -> u32 { + let (bytes, rest) = buffer.split_at(4); + *buffer = rest; + let mut data = [0; 4]; + data.copy_from_slice(bytes); + u32::from_le_bytes(data) +} + +fn read_u64(buffer: &mut &[u8]) -> u64 { + let (bytes, rest) = buffer.split_at(8); + *buffer = rest; + let mut data = [0; 8]; + data.copy_from_slice(bytes); + u64::from_le_bytes(data) +} + +fn read_string(buffer: &mut &[u8]) -> String { + let len = read_u64(buffer); + let (bytes, rest) = buffer.split_at(len as usize); + *buffer = rest; + String::from_utf8(bytes.to_owned()).unwrap() +} + + + +// ============= +// === Tests === +// ============= + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn unit_test() { + #[derive(serde::Serialize)] + struct A { + value: u32, + } + let mut graph = TypeGraph::new(); + let int_name = TypeName::from_pascal_case("U32"); + let int = Type::new(int_name, Data::Primitive(Primitive::U32)); + let int = graph.types.insert(int); + let a_name = TypeName::from_pascal_case("A"); + let a_field_name = FieldName::from_snake_case("value"); + let a_field = Field::named(a_field_name, int); + let a = Type::new(a_name, Data::Struct(vec![a_field])); + let a = graph.types.insert(a); + let a_value = A { value: 36 }; + let s_expr = ToSExpr::new(&graph).value(a, &a_value); + let field_expr = Value::cons(Value::symbol(":value"), Value::Number(36.into())); + assert_eq!(s_expr, Value::cons(field_expr, Value::Null)); + } +} diff --git a/lib/rust/metamodel/src/lib.rs b/lib/rust/metamodel/src/lib.rs index 1e80bcca4b6..9765f016372 100644 --- a/lib/rust/metamodel/src/lib.rs +++ b/lib/rust/metamodel/src/lib.rs @@ -45,7 +45,6 @@ // === Features === #![feature(map_first_last)] -#![feature(associated_type_defaults)] #![feature(option_get_or_insert_default)] // === Standard Linter Configuration === #![deny(non_ascii_idents)] diff --git a/lib/rust/metamodel/src/meta/mod.rs b/lib/rust/metamodel/src/meta/mod.rs index a0130ecaf12..2bdcea375eb 100644 --- a/lib/rust/metamodel/src/meta/mod.rs +++ b/lib/rust/metamodel/src/meta/mod.rs @@ -179,12 +179,13 @@ impl Identifier { /// Render in camelCase. pub fn to_camel_case(&self) -> String { let mut camel = String::with_capacity(self.segments_len()); - let (head, tail) = self.segments.split_first().unwrap(); - camel.push_str(head); - for segment in tail { - let mut chars = segment.chars(); - camel.push(chars.next().unwrap().to_ascii_uppercase()); - camel.extend(chars); + if let Some((head, tail)) = self.segments.split_first() { + camel.push_str(head); + for segment in tail { + let mut chars = segment.chars(); + camel.push(chars.next().unwrap().to_ascii_uppercase()); + camel.extend(chars); + } } camel } @@ -225,6 +226,11 @@ impl Identifier { pub fn append(&mut self, other: Self) { self.segments.extend(other.segments) } + + /// Return whether this identifier is zero-length. + pub fn is_empty(&self) -> bool { + self.segments.is_empty() + } } @@ -284,6 +290,10 @@ impl FieldName { pub fn append(&mut self, other: Self) { self.0.append(other.0) } + /// Return whether this identifier is zero-length. 
+ pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } diff --git a/lib/rust/metamodel/src/rust/to_meta.rs b/lib/rust/metamodel/src/rust/to_meta.rs index 29c21778d58..9f5835969bf 100644 --- a/lib/rust/metamodel/src/rust/to_meta.rs +++ b/lib/rust/metamodel/src/rust/to_meta.rs @@ -18,8 +18,9 @@ use std::mem::take; pub fn to_meta(ty: TypeData) -> (meta::TypeGraph, BTreeMap) { let mut to_meta = ToMeta::new(); let root_ = to_meta.run(ty); - to_meta.graph.gc(vec![root_]); - (to_meta.graph, to_meta.rust_to_meta) + let (mut graph, rust_to_meta) = to_meta.finish(); + graph.gc(vec![root_]); + (graph, rust_to_meta) } #[derive(Debug, Default)] @@ -212,6 +213,11 @@ impl ToMeta { self.rust_to_meta[&root_rust_id] } + /// Return results. + pub fn finish(self) -> (meta::TypeGraph, BTreeMap) { + (self.graph, self.rust_to_meta) + } + fn generate_subtypes(&mut self, rust_types: &BTreeMap) { let mut parent_ids = BTreeMap::new(); let mut aliases = vec![]; diff --git a/lib/rust/parser/Cargo.toml b/lib/rust/parser/Cargo.toml index 91f67619ea6..e752f1e866a 100644 --- a/lib/rust/parser/Cargo.toml +++ b/lib/rust/parser/Cargo.toml @@ -20,5 +20,7 @@ enso-parser-syntax-tree-builder = { path = "src/syntax/tree/builder" } serde = { version = "1.0", features = ["derive"] } bincode = "1.3" -[lib] -path = "src/main.rs" +[dev-dependencies] +enso-metamodel = { path = "../metamodel", features = ["rust"] } +enso-metamodel-lexpr = { path = "../metamodel/lexpr" } +lexpr = "0.2.6" diff --git a/lib/rust/parser/src/lexer.rs b/lib/rust/parser/src/lexer.rs index ef4a68f0505..04c35e8b1f9 100644 --- a/lib/rust/parser/src/lexer.rs +++ b/lib/rust/parser/src/lexer.rs @@ -721,12 +721,12 @@ impl<'s> Lexer<'s> { /// 2. Some parsers could consume input even if it should be qualified as something else. Thus, some /// parsers should be run first in order to make the token consuming process correct. const PARSERS: &[for<'r> fn(&'r mut Lexer<'_>)] = &[ + |t| t.number(), |t| t.ident(), |t| t.operator(), |t| t.newline(), |t| t.symbol(), |t| t.comment(), - |t| t.number(), |t| t.text(), ]; @@ -752,7 +752,7 @@ impl<'s> Lexer<'s> { } } if self.current_char != None { - panic!("Internal error. Lexer did not consume all input."); + panic!("Internal error. Lexer did not consume all input. State: {self:?}"); } while self.end_block().is_some() { let block_end = self.marker_token(token::Variant::block_end()); @@ -902,6 +902,11 @@ mod tests { ])) } + #[test] + fn test_numeric_literal() { + test_lexer("10", vec![number_("", "10")]); + } + #[test] fn test_case_idents() { test_lexer_many(vec![ diff --git a/lib/rust/parser/src/lib.rs b/lib/rust/parser/src/lib.rs new file mode 100644 index 00000000000..3b6eba31174 --- /dev/null +++ b/lib/rust/parser/src/lib.rs @@ -0,0 +1,265 @@ +//! The Enso parser. Parsing is a multi-stage process: +//! +//! # Lexing. +//! First, the source code is feed to [`lexer::Lexer`], which consumes it and outputs a stream of +//! [`Token`]. Tokens are chunks of the input with a generic description attached, like "operator", +//! or "identifier". +//! +//! # Building macro registry. +//! Macros in Enso are a very powerful mechanism and are used to transform group of tokens into +//! almost any statement. First, macros need to be discovered and registered. Currently, there is no +//! real macro discovery process, as there is no support for user-defined macros. Instead, there is +//! a set of hardcoded macros defined in the compiler. +//! +//! Each macro defines one or more segments. 
Every segment starts with a predefined token and can
+//! contain any number of other tokens. For example, the macro `if ... then ... else ...` contains
+//! three segments. Macros can also accept prefix tokens, a set of tokens on the left of the first
+//! segment. A good example is the lambda macro `... -> ...`.
+//!
+//! In this step, a [`MacroMatchTree`] is built. Basically, it is a map from each possible next
+//! segment name to information about which other segments are required and which macro definition
+//! applies if those segments are found. For example, let's consider two macros: `if ... then ...`,
+//! and `if ... then ... else ...`. In such a case, the macro registry will contain only one entry,
+//! "if", and two sets of possible resolution paths: ["then"], and ["then", "else"], each associated
+//! with the corresponding macro definition.
+//!
+//! # Splitting the token stream by the macro segments.
+//! The input token stream is iterated and split based on the segments of the
+//! registered macros. For example, for the input `if a b then c d else e f`, the token stream will
+//! be split into three segments, `a b`, `c d`, and `e f`, which will be associated with the
+//! `if ... then ... else ...` macro definition.
+//!
+//! The splitting process is hierarchical: a new macro can start being resolved during the
+//! resolution of a parent macro. For example, `if if a then b then c else d` is a correct
+//! expression. After finding the first `if` token, the token stream will be split. The next `if`
+//! token starts a new, nested splitting. The first `then` token belongs to the nested macro;
+//! however, as soon as the resolver sees the second `then` token, it considers the nested macro
+//! finished and returns to resolving the parent macro.
+//!
+//! # Resolving right-hand-side patterns of macro segments.
+//! In the next step, each macro is analyzed, starting from the most nested ones. For each
+//! macro, the [`Pattern`] of the last segment is run to check which tokens belong to that macro,
+//! and which should be transferred to the parent macro definition. For example, consider the
+//! following code: `process (read file) content-> print content`. The `(...)` is a macro with two
+//! segments, `(` and `)`. Let's mark the token splitting with `[` and `]` characters. The previous
+//! macro resolution steps would output the following split of the token stream:
+//! `process [(read file][) content[-> print content]]`. In this step, the innermost macro is
+//! analyzed first. The pattern of the last segment of the inner macro (`->`) defines that it
+//! consumes all tokens, so the tokens `print content` are left as they are. Now the resolution
+//! moves to the parent macro. Its last segment starts with the `)` token, whose pattern defines
+//! that it does not consume any tokens, so all of its current tokens (`content[-> print content]]`)
+//! are popped to the parent definition, forming `process [(read file][)] content[-> print content]`.
+//!
+//! Please note that the root of the expression is treated as a special macro as well. This is done
+//! to unify the algorithm.
+//!
+//! # Resolving left-hand-side patterns of macro segments.
+//! In this step, each macro is analyzed, starting from the most nested ones. For each macro,
+//! the [`Pattern`] of the macro prefix is run to check which tokens belong to the prefix of
+//! the macro (in case the macro defines one). 
In the example above, the macro `->` defines +//! complex prefix rules: if the token on the left of the arrow used no space, then only a single +//! token will be consumed. As a result of this step, the following token split will occur: +//! `[process [(read file][)] [content-> print content]`, which is exactly what we wanted. +//! +//! # Resolving patterns of macro segments. +//! In this step, all macro segment patterns are being resolved and errors are reported in case it +//! was not possible. If tokens in a segment match the segment pattern, they are sent to the +//! operator precedence resolver for final transformation. +//! +//! # Operator precedence resolution. +//! Each token stream sent to the operator resolver is processed by a modified Shunting Yard +//! algorithm, which handles such situations as multiple operators placed next to each other, +//! multiple identifiers placed next to each other, and also takes spacing into consideration in +//! order to implement spacing-aware precedence rules. After all segments are resolved, the macro +//! is being treated as a single token in one of the segments of the parent macro, and is being +//! processed by the operator precedence resolver as well. In the end, a single [`syntax::Tree`] is +//! produced, containing the parsed expression. + +#![recursion_limit = "256"] +// === Features === +#![allow(incomplete_features)] +#![feature(allocator_api)] +#![feature(test)] +#![feature(specialization)] +#![feature(let_chains)] +#![feature(if_let_guard)] +// === Standard Linter Configuration === +#![deny(non_ascii_idents)] +#![warn(unsafe_code)] +// === Non-Standard Linter Configuration === +#![allow(clippy::option_map_unit_fn)] +#![allow(clippy::precedence)] +#![allow(dead_code)] +#![deny(unconditional_recursion)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unused_import_braces)] +#![warn(unused_qualifications)] + +use crate::prelude::*; + + +// ============== +// === Export === +// ============== + +pub mod lexer; +pub mod macros; +pub mod serialization; +pub mod source; +pub mod syntax; + + + +/// Popular utilities, imported by most modules of this crate. +pub mod prelude { + pub use enso_prelude::serde_reexports::*; + pub use enso_prelude::*; + pub use enso_reflect as reflect; + pub use enso_reflect::Reflect; + pub use enso_types::traits::*; + pub use enso_types::unit2::Bytes; +} + + + +// ============== +// === Parser === +// ============== + +/// Enso parser. See the module documentation to learn more about how it works. +#[allow(missing_docs)] +#[derive(Debug)] +pub struct Parser { + pub macros: macros::resolver::SegmentMap<'static>, +} + +impl Parser { + /// Constructor. + pub fn new() -> Self { + let macros = macros::built_in::all(); + Self { macros } + } + + /// Main entry point. + pub fn run<'s>(&self, code: &'s str) -> syntax::Tree<'s> { + let tokens = lexer::run(code); + let mut statements = vec![]; + let mut tokens = tokens.into_iter().peekable(); + while tokens.peek().is_some() { + let resolver = macros::resolver::Resolver::new_root(); + let tree = resolver.run(&self.macros, &mut tokens); + let tree = expression_to_statement(tree); + statements.push(tree); + } + syntax::Tree::block(statements) + } +} + +impl Default for Parser { + fn default() -> Self { + Self::new() + } +} + + +// == Parsing helpers == + +/// Reinterpret an expression in a statement context (i.e. as a top level member of a block). 
+/// +/// In statement context, an expression that has an assignment operator at its top level is +/// interpreted as a variable assignment or method definition. +fn expression_to_statement(tree: syntax::Tree<'_>) -> syntax::Tree<'_> { + use syntax::tree::*; + let tree_ = match &*tree.variant { + Variant::OprSectionBoundary(OprSectionBoundary { ast }) => ast, + _ => &tree, + }; + let mut replacement = None; + if let Variant::OprApp(opr_app) = &*tree_.variant { + replacement = expression_to_binding(opr_app); + } + match replacement { + Some(modified) => modified, + None => tree, + } +} + +/// If the input is an "=" expression, try to interpret it as either a variable assignment or method +/// definition. +fn expression_to_binding<'a>(app: &syntax::tree::OprApp<'a>) -> Option> { + use syntax::tree::*; + match app { + OprApp { lhs: Some(lhs), opr: Ok(opr), rhs } if opr.code == "=" => { + let mut lhs = lhs; + let mut args = vec![]; + while let Variant::App(App { func, arg }) = &*lhs.variant { + lhs = func; + args.push(arg.clone()); + } + args.reverse(); + if let Some(rhs) = rhs && args.is_empty() { + Some(Tree::assignment(lhs.clone(), opr.clone(), rhs.clone())) + } else if let Variant::Ident(Ident { token }) = &*lhs.variant { + Some(Tree::function(token.clone(), args, opr.clone(), rhs.clone())) + } else { + None + } + } + _ => None, + } +} + + + +// ============= +// === Tests === +// ============= + +#[cfg(test)] +mod tests { + use super::*; + use enso_parser_syntax_tree_builder::ast_builder; + + macro_rules! test_parse { + ($input:tt = {$($def:tt)*}) => { + assert_eq!( + Parser::new().run($input), + ast_builder! { $($def)* } + ) + }; + } + + #[test] + fn test_expressions() { + test_parse! {"a" = {a}}; + test_parse! {"a b" = {a b}}; + test_parse! {"a b c" = {[a b] c}}; + } +} + + + +// ================== +// === Benchmarks === +// ================== + +#[cfg(test)] +mod benches { + use super::*; + extern crate test; + use test::Bencher; + + #[bench] + fn bench_parsing_type_defs(bencher: &mut Bencher) { + let reps = 1_000; + let str = "type Option a b c\n".repeat(reps); + let parser = Parser::new(); + bencher.iter(move || { + parser.run(&str); + }); + } +} diff --git a/lib/rust/parser/src/macros/built_in.rs b/lib/rust/parser/src/macros/built_in.rs index e68848c0d7a..47d928911d5 100644 --- a/lib/rust/parser/src/macros/built_in.rs +++ b/lib/rust/parser/src/macros/built_in.rs @@ -71,7 +71,8 @@ fn type_def_body(matched_segments: NonEmptyVec) -> syntax::Tree // println!("{:#?}", name); // println!("\n\n------------- 2"); - let params = v.nested().query("param").unwrap(); + let no_params = vec![]; + let params = v.nested().query("param").unwrap_or(&no_params); // println!("{:#?}", params); // println!("\n\n------------- 3"); diff --git a/lib/rust/parser/src/main.rs b/lib/rust/parser/src/main.rs index 825e2eea333..9ae884e98cd 100644 --- a/lib/rust/parser/src/main.rs +++ b/lib/rust/parser/src/main.rs @@ -1,79 +1,4 @@ -//! The Enso parser. Parsing is a multi-stage process: -//! -//! # Lexing. -//! First, the source code is feed to [`lexer::Lexer`], which consumes it and outputs a stream of -//! [`Token`]. Tokens are chunks of the input with a generic description attached, like "operator", -//! or "identifier". -//! -//! # Building macro registry. -//! Macros in Enso are a very powerful mechanism and are used to transform group of tokens into -//! almost any statement. First, macros need to be discovered and registered. Currently, there is no -//! 
real macro discovery process, as there is no support for user-defined macros. Instead, there is -//! a set of hardcoded macros defined in the compiler. -//! -//! Each macro defines one or more segments. Every segment starts with a predefined token and can -//! contain any number of other tokens. For example, the macro `if ... then ... else ...` contains -//! three segments. Macros can also accept prefix tokens, a set of tokens on the left of the first -//! segment. A good example is the lambda macro `... -> ...`. -//! -//! In this step, a [`MacroMatchTree`] is built. Basically, it is a map from the possible next -//! segment name to information of what other segments are required and what is the macro definition -//! in case these segments were found. For example, let's consider two macros: `if ... then ...`, -//! and `if ... then ... else ...`. In such a case, the macro registry will contain only one entry, -//! "if", and two sets of possible resolution paths: ["then"], and ["then", "else"], each associated -//! with the corresponding macro definition. -//! -//! # Splitting the token stream by the macro segments. -//! The input token stream is being iterated and is being split based on the segments of the -//! registered macros. For example, for the input `if a b then c d else e f`, the token stream will -//! be split into three segments, `a b`, `c d`, and `e f`, which will be associated with the -//! `if ... then ... else ...` macro definition. -//! -//! The splitting process is hierarchical. It means that a new macro can start being resolved during -//! resolution of a parent macro. For example, `if if a then b then c else d` is a correct -//! expression. After finding the first `if` token, the token stream will be split. The next `if` -//! token starts a new token stream splitting. The first `then` token belongs to the nested macro, -//! however, as soon as the resolver sees the second `then` token, it will consider the nested macro -//! to be finished, and will come back to parent macro resolution. -//! -//! # Resolving right-hand-side patterns of macro segments. -//! In the next steps, each macro is being analyzed, started from the most nested ones. For each -//! macro, the [`Pattern`] of last segment is being run to check which tokens belong to that macro, -//! and which tokens should be transferred to parent macro definition. For example, consider the -//! following code `process (read file) content-> print content`. The `(...)` is a macro with two -//! sections `(` and `)`. Let's mark the token splitting with `[` and `]` characters. The previous -//! macro resolution steps would output such split of the token stream: -//! `process [(read file][) content[-> print content]]`. In this step, the most inner macro will be -//! analyzed first. The pattern of the last segment of the inner macro (`->`) defines that it -//! consumes all tokens, so all the tokens `print content` are left as they are. Now, the resolution -//! moves to the parent macro. Its last segment starts with the `)` token, which pattern defines -//! that it does not consume any tokens, so all of its current tokens (`content[-> print content]]`) -//! are popped to a parent definition, forming `process [(read file][)] content[-> print content]`. -//! -//! Please note, that root of the expression is considered a special macro as well. It is done for -//! the algorithm unification purposes. -//! -//! # Resolving left-hand-side patterns of macro segments. -//! 
In this step, each macro is being analyzed, started from the most nested ones. For each macro, -//! the [`Pattern`] of the macro prefix is being run to check which tokens belong to the prefix of -//! the macro (in case the macro defines the prefix). In the example above, the macro `->` defines -//! complex prefix rules: if the token on the left of the arrow used no space, then only a single -//! token will be consumed. As a result of this step, the following token split will occur: -//! `[process [(read file][)] [content-> print content]`, which is exactly what we wanted. -//! -//! # Resolving patterns of macro segments. -//! In this step, all macro segment patterns are being resolved and errors are reported in case it -//! was not possible. If tokens in a segment match the segment pattern, they are sent to the -//! operator precedence resolver for final transformation. -//! -//! # Operator precedence resolution. -//! Each token stream sent to the operator resolver is processed by a modified Shunting Yard -//! algorithm, which handles such situations as multiple operators placed next to each other, -//! multiple identifiers placed next to each other, and also takes spacing into consideration in -//! order to implement spacing-aware precedence rules. After all segments are resolved, the macro -//! is being treated as a single token in one of the segments of the parent macro, and is being -//! processed by the operator precedence resolver as well. In the end, a single [`syntax::Tree`] is -//! produced, containing the parsed expression. +//! Tests for [`enso_parser`]. #![recursion_limit = "256"] // === Features === @@ -82,6 +7,7 @@ #![feature(test)] #![feature(specialization)] #![feature(let_chains)] +#![feature(if_let_guard)] // === Standard Linter Configuration === #![deny(non_ascii_idents)] #![warn(unsafe_code)] @@ -98,70 +24,7 @@ #![warn(unused_import_braces)] #![warn(unused_qualifications)] -use crate::prelude::*; - - -// ============== -// === Export === -// ============== - -pub mod lexer; -pub mod macros; -pub mod serialization; -pub mod source; -pub mod syntax; - - - -/// Popular utilities, imported by most modules of this crate. -pub mod prelude { - pub use enso_prelude::serde_reexports::*; - pub use enso_prelude::*; - pub use enso_reflect as reflect; - pub use enso_reflect::Reflect; - pub use enso_types::traits::*; - pub use enso_types::unit2::Bytes; -} - - - -// ============== -// === Parser === -// ============== - -/// Enso parser. See the module documentation to learn more about how it works. -#[allow(missing_docs)] -#[derive(Debug)] -pub struct Parser { - pub macros: macros::resolver::SegmentMap<'static>, -} - -impl Parser { - /// Constructor. - pub fn new() -> Self { - let macros = macros::built_in::all(); - Self { macros } - } - - /// Main entry point. 
- pub fn run<'s>(&self, code: &'s str) -> syntax::Tree<'s> { - let tokens = lexer::run(code); - let mut statements = vec![]; - let mut tokens = tokens.into_iter().peekable(); - while tokens.peek().is_some() { - let resolver = macros::resolver::Resolver::new_root(); - let tree = resolver.run(&self.macros, &mut tokens); - statements.push(tree); - } - syntax::Tree::module(statements) - } -} - -impl Default for Parser { - fn default() -> Self { - Self::new() - } -} +use enso_parser::prelude::*; @@ -171,52 +34,7 @@ impl Default for Parser { fn main() { init_tracing(TRACE); - let ast = Parser::new().run("type Option (a) b c"); + let ast = enso_parser::Parser::new().run("type Option (a) b c"); println!("\n\n==================\n\n"); println!("{:#?}", ast); } - -#[cfg(test)] -mod tests { - use super::*; - use enso_parser_syntax_tree_builder::ast_builder; - - macro_rules! test_parse { - ($input:tt = {$($def:tt)*}) => { - assert_eq!( - Parser::new().run($input), - ast_builder! { $($def)* } - ) - }; - } - - #[test] - fn test_expressions() { - test_parse! {"a" = {a}}; - test_parse! {"a b" = {a b}}; - test_parse! {"a b c" = {[a b] c}}; - } -} - - - -// ================== -// === Benchmarks === -// ================== - -#[cfg(test)] -mod benches { - use super::*; - extern crate test; - use test::Bencher; - - #[bench] - fn bench_parsing_type_defs(bencher: &mut Bencher) { - let reps = 1_000; - let str = "type Option a b c\n".repeat(reps); - let parser = Parser::new(); - bencher.iter(move || { - parser.run(&str); - }); - } -} diff --git a/lib/rust/parser/src/serialization.rs b/lib/rust/parser/src/serialization.rs index c8236466480..02b935f4140 100644 --- a/lib/rust/parser/src/serialization.rs +++ b/lib/rust/parser/src/serialization.rs @@ -39,6 +39,7 @@ pub(crate) fn serialize_cow(cow: &Cow<'_, str>, ser: S) -> Result *s, + Cow::Owned(s) if s.is_empty() => "", Cow::Owned(_) => panic!(), }; let begin = s.as_ptr() as u32; diff --git a/lib/rust/parser/src/source/code.rs b/lib/rust/parser/src/source/code.rs index affbaad7839..af51ab7fead 100644 --- a/lib/rust/parser/src/source/code.rs +++ b/lib/rust/parser/src/source/code.rs @@ -15,7 +15,7 @@ use crate::prelude::*; pub struct Code<'s> { #[serde(serialize_with = "crate::serialization::serialize_cow")] #[serde(deserialize_with = "crate::serialization::deserialize_cow")] - #[reflect(as = "crate::serialization::Code", flatten)] + #[reflect(as = "crate::serialization::Code")] pub repr: Cow<'s, str>, } diff --git a/lib/rust/parser/src/source/span.rs b/lib/rust/parser/src/source/span.rs index 110d2e33050..69e7117e12d 100644 --- a/lib/rust/parser/src/source/span.rs +++ b/lib/rust/parser/src/source/span.rs @@ -62,8 +62,9 @@ impl From<&str> for VisibleOffset { #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Reflect, Deserialize)] #[allow(missing_docs)] pub struct Offset<'s> { + #[reflect(hide)] pub visible: VisibleOffset, - #[reflect(flatten)] + #[reflect(flatten, hide)] pub code: Code<'s>, } diff --git a/lib/rust/parser/src/syntax/item.rs b/lib/rust/parser/src/syntax/item.rs index 8f305ff77f1..42a87e801c5 100644 --- a/lib/rust/parser/src/syntax/item.rs +++ b/lib/rust/parser/src/syntax/item.rs @@ -46,6 +46,7 @@ impl<'s> Item<'s> { match self { Item::Token(token) => match token.variant { token::Variant::Ident(ident) => Tree::ident(token.with_variant(ident)), + token::Variant::Number(number) => Tree::number(token.with_variant(number)), _ => todo!(), }, Item::Tree(ast) => ast, diff --git a/lib/rust/parser/src/syntax/operator.rs 
b/lib/rust/parser/src/syntax/operator.rs index b41001ad9f1..1bc7a5cea32 100644 --- a/lib/rust/parser/src/syntax/operator.rs +++ b/lib/rust/parser/src/syntax/operator.rs @@ -16,6 +16,7 @@ use crate::syntax::token::Token; // computations for any operator (according to the spec) fn precedence_of(operator: &str) -> usize { match operator { + "=" => 1, "+" => 3, "-" => 3, "*" => 7, diff --git a/lib/rust/parser/src/syntax/tree.rs b/lib/rust/parser/src/syntax/tree.rs index ca90c841871..5ad8b746dbb 100644 --- a/lib/rust/parser/src/syntax/tree.rs +++ b/lib/rust/parser/src/syntax/tree.rs @@ -68,13 +68,17 @@ macro_rules! with_ast_definition { ($f:ident ($($args:tt)*)) => { $f! { $($args) pub error: Error, pub ast: Tree<'s>, }, - Module { + Block { pub statements: Vec>, }, /// A simple identifier, like `foo` or `bar`. Ident { pub token: token::Ident<'s>, }, + /// A numeric literal, like `10`. + Number { + pub token: token::Number<'s>, + }, /// A simple application, like `print "hello"`. App { pub func: Tree<'s>, @@ -106,12 +110,22 @@ macro_rules! with_ast_definition { ($f:ident ($($args:tt)*)) => { $f! { $($args) MultiSegmentApp { pub segments: NonEmptyVec>, }, - TypeDef { pub keyword: Token<'s>, pub name: Tree<'s>, pub params: Vec>, - } + }, + Assignment { + pub pattern: Tree<'s>, + pub equals: token::Operator<'s>, + pub expr: Tree<'s>, + }, + Function { + pub name: token::Ident<'s>, + pub args: Vec>, + pub equals: token::Operator<'s>, + pub body: Option>, + }, } }};} diff --git a/lib/rust/parser/src/syntax/tree/builder/src/lib.rs b/lib/rust/parser/src/syntax/tree/builder/src/lib.rs index e4e33524660..11cb2b541ae 100644 --- a/lib/rust/parser/src/syntax/tree/builder/src/lib.rs +++ b/lib/rust/parser/src/syntax/tree/builder/src/lib.rs @@ -39,7 +39,7 @@ use std::mem; #[proc_macro] pub fn ast_builder(tokens: proc_macro::TokenStream) -> proc_macro::TokenStream { let output = expr(tokens, None); - let output = quote!(crate::syntax::Tree::module(vec![#output])); + let output = quote!(crate::syntax::Tree::block(vec![#output])); output.into() } diff --git a/lib/rust/parser/tests/parse.rs b/lib/rust/parser/tests/parse.rs new file mode 100644 index 00000000000..e46f671ff76 --- /dev/null +++ b/lib/rust/parser/tests/parse.rs @@ -0,0 +1,206 @@ +//! Parse expressions and compare their results to expected values. + +// === Non-Standard Linter Configuration === +#![allow(clippy::option_map_unit_fn)] +#![allow(clippy::precedence)] +#![allow(dead_code)] +#![deny(non_ascii_idents)] +#![deny(unconditional_recursion)] +#![warn(unsafe_code)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![warn(trivial_casts)] +#![warn(trivial_numeric_casts)] +#![warn(unused_import_braces)] +#![warn(unused_qualifications)] + +use lexpr::sexp; + + + +// =========================== +// === Test support macros === +// =========================== + +/// Parses input as a sequence of S-expressions, and wraps it in a `Block`. +macro_rules! 
block { + ( $statements:tt ) => { + sexp![(Block #($statements))] + } +} + + + +// ============= +// === Tests === +// ============= + +#[test] +fn application() { + test("a b c", block![(App (App (Ident a) (Ident b)) (Ident c))]); +} + +#[test] +fn type_definition_bool() { + test("type Bool", block![(TypeDef (Ident type) (Ident Bool) #())]); +} + +#[test] +fn type_definition_option() { + test("type Option a", block![(TypeDef (Ident type) (Ident Option) #((Ident a)))]); +} + +#[test] +fn assignment_simple() { + test("foo = 23", block![(Assignment (Ident foo) "=" (Number 23))]); +} + +#[test] +fn function_inline_simple_args() { + test("foo a = 23", block![(Function foo #((Ident a)) "=" (Number 23))]); + test("foo a b = 23", block![(Function foo #((Ident a) (Ident b)) "=" (Number 23))]); + test("foo a b c = 23", block![(Function foo #((Ident a) (Ident b) (Ident c)) "=" (Number 23))]); +} + +#[test] +fn function_block_noargs() { + test("foo =", block![(Function foo #() "=" ())]); +} + +#[test] +fn function_block_simple_args() { + test("foo a =", block![(Function foo #((Ident a)) "=" ())]); + test("foo a b =", block![(Function foo #((Ident a) (Ident b)) "=" ())]); + test("foo a b c =", block![(Function foo #((Ident a) (Ident b) (Ident c)) "=" ())]); +} + + + +// ==================== +// === Test Support === +// ==================== + +use enso_metamodel_lexpr::ToSExpr; +use enso_reflect::Reflect; +use std::collections::HashSet; + +/// Given a block of input Enso code, test that: +/// - The given code parses to the AST represented by the given S-expression. +/// - The AST pretty-prints back to the original code. +/// +/// The S-expression format is as documented for [`enso_metamodel_lexpr`], with some +/// postprocessing: +/// - For concision, field names are stripped (as if all structs were tuple structs). +/// - Most token types are represented as their contents, rather than as a token struct. For +/// example, a `token::Number` may be represented like: `sexp![10]`, and a `token::Ident` may look +/// like `sexp![foo]`. +fn test(code: &str, expect: lexpr::Value) { + let ast = enso_parser::Parser::new().run(code); + let ast_s_expr = to_s_expr(&ast, code); + assert_eq!(ast_s_expr.to_string(), expect.to_string()); + assert_eq!(ast.code(), code); +} + + + +// ===================== +// === S-expressions === +// ===================== + +/// Produce an S-expression representation of the input AST type. 
+pub fn to_s_expr(value: &T, code: &str) -> lexpr::Value +where T: serde::Serialize + Reflect { + let (graph, rust_to_meta) = enso_metamodel::rust::to_meta(value.reflect_type()); + let ast_ty = rust_to_meta[&value.reflect_type().id]; + let base = code.as_bytes().as_ptr() as usize; + let code: Box = Box::from(code); + let mut to_s_expr = ToSExpr::new(&graph); + to_s_expr.mapper(ast_ty, strip_hidden_fields); + let ident_token = rust_to_meta[&enso_parser::syntax::token::variant::Ident::reflect().id]; + let operator_token = rust_to_meta[&enso_parser::syntax::token::variant::Operator::reflect().id]; + let number_token = rust_to_meta[&enso_parser::syntax::token::variant::Number::reflect().id]; + let token_to_str = move |token: lexpr::Value| { + let range = token_code_range(&token, base); + code[range].to_owned().into_boxed_str() + }; + let token_to_str_ = token_to_str.clone(); + to_s_expr.mapper(ident_token, move |token| lexpr::Value::symbol(token_to_str_(token))); + let token_to_str_ = token_to_str.clone(); + to_s_expr.mapper(operator_token, move |token| lexpr::Value::string(token_to_str_(token))); + let token_to_str_ = token_to_str; + to_s_expr.mapper(number_token, move |token| { + lexpr::Value::Number(token_to_str_(token).parse::().unwrap().into()) + }); + tuplify(to_s_expr.value(ast_ty, &value)) +} + +/// Strip certain fields that should be excluded from output. +fn strip_hidden_fields(tree: lexpr::Value) -> lexpr::Value { + let hidden_tree_fields = + [":spanLeftOffsetVisible", ":spanLeftOffsetCodeRepr", ":spanCodeLength"]; + let hidden_tree_fields: HashSet<_> = hidden_tree_fields.into_iter().collect(); + lexpr::Value::list(tree.to_vec().unwrap().into_iter().filter(|val| match val { + lexpr::Value::Cons(cons) => match cons.car() { + lexpr::Value::Symbol(symbol) => !hidden_tree_fields.contains(symbol.as_ref()), + _ => panic!(), + }, + _ => true, + })) +} + +/// Given an S-expression representation of a [`Token`] and the base address for `Code` `Cow`s, +/// return the range of the input code the token references. +fn token_code_range(token: &lexpr::Value, base: usize) -> std::ops::Range { + let code_repr = fields(token).find(|(name, _)| *name == ":codeRepr").unwrap().1; + let mut begin = None; + let mut len = None; + for (name, value) in fields(code_repr) { + match name { + ":begin" => begin = Some(value.as_u64().unwrap() as u32), + ":len" => len = Some(value.as_u64().unwrap() as u32), + _ => (), + } + } + let begin = begin.unwrap(); + let begin = (begin as u64) | (base as u64 & !0xFFFF_FFFF); + let begin = if begin < (base as u64) { begin + 0x1_0000_0000 } else { begin }; + let begin = begin as usize - base; + let len = len.unwrap() as usize; + begin..(begin + len) +} + +/// Iterate the field `(name, value)` pairs of the S-expression of a struct with named fields. +fn fields(value: &'_ lexpr::Value) -> impl Iterator { + value.list_iter().unwrap().filter_map(|value| match value { + lexpr::Value::Cons(cons) => match cons.car() { + lexpr::Value::Symbol(symbol) => Some((&symbol[..], cons.cdr())), + _ => None, + }, + _ => None, + }) +} + +/// Strip field names from struct representations, so that they are printed more concisely, as if +/// they were tuple-structs. 
+fn tuplify(value: lexpr::Value) -> lexpr::Value { + let (car, cdr) = match value { + lexpr::Value::Cons(cons) => cons.into_pair(), + lexpr::Value::Vector(mut vector) => { + for value in vector.iter_mut() { + let original = std::mem::replace(value, lexpr::Value::Nil); + *value = tuplify(original); + } + return lexpr::Value::Vector(vector); + } + value => return value, + }; + if let lexpr::Value::Symbol(symbol) = &car { + if let Some(':') = symbol.chars().next() { + return tuplify(cdr); + } + } + let car = tuplify(car); + let cdr = tuplify(cdr); + lexpr::Value::Cons(lexpr::Cons::new(car, cdr)) +} diff --git a/lib/rust/reflect/Cargo.toml b/lib/rust/reflect/Cargo.toml index 65345a8e052..7f5c6d99537 100644 --- a/lib/rust/reflect/Cargo.toml +++ b/lib/rust/reflect/Cargo.toml @@ -10,4 +10,5 @@ enso-metamodel = { path = "../metamodel", features = ["rust"] } derivative = "2.2" [features] +default = ["graphviz"] graphviz = ["enso-metamodel/graphviz"]