From 3b99e18f94602a02155a97ffd834a8f565681fbd Mon Sep 17 00:00:00 2001 From: Kaz Wesley Date: Wed, 20 Jul 2022 07:53:20 -0700 Subject: [PATCH] Code blocks (#3585) --- Cargo.lock | 21 +- Cargo.toml | 1 - lib/rust/metamodel/lexpr/src/lib.rs | 25 +- lib/rust/parser/Cargo.toml | 3 +- lib/rust/parser/generate-java/run.sh | 15 -- lib/rust/parser/src/lexer.rs | 86 ++++-- lib/rust/parser/src/lib.rs | 141 +++++----- lib/rust/parser/src/macros.rs | 3 +- lib/rust/parser/src/macros/built_in.rs | 4 +- lib/rust/parser/src/macros/resolver.rs | 187 +++++++++++-- lib/rust/parser/src/main.rs | 2 +- lib/rust/parser/src/source/code.rs | 2 +- lib/rust/parser/src/source/span.rs | 5 +- lib/rust/parser/src/syntax/item.rs | 42 ++- lib/rust/parser/src/syntax/operator.rs | 129 +++++---- lib/rust/parser/src/syntax/token.rs | 10 +- lib/rust/parser/src/syntax/tree.rs | 117 +++++++- lib/rust/parser/src/syntax/tree/block.rs | 252 ++++++++++++++++++ .../parser/src/syntax/tree/builder/Cargo.toml | 22 -- .../parser/src/syntax/tree/builder/src/lib.rs | 136 ---------- lib/rust/parser/tests/parse.rs | 249 +++++++++++++---- lib/rust/prelude/src/data/non_empty_vec.rs | 3 +- lib/rust/prelude/src/vec.rs | 79 ++++++ 23 files changed, 1108 insertions(+), 426 deletions(-) delete mode 100755 lib/rust/parser/generate-java/run.sh create mode 100644 lib/rust/parser/src/syntax/tree/block.rs delete mode 100644 lib/rust/parser/src/syntax/tree/builder/Cargo.toml delete mode 100644 lib/rust/parser/src/syntax/tree/builder/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 26e8c0e04c9..608e9be3384 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1734,7 +1734,7 @@ dependencies = [ [[package]] name = "enso-build" version = "0.1.0" -source = "git+https://github.com/enso-org/ci-build?branch=develop#5a55bf5241f55bd314ba04498b34d048dae93a34" +source = "git+https://github.com/enso-org/ci-build?branch=develop#acc5a7dacc223ad69ebfc7651c5ed0e3c0f1c9e5" dependencies = [ "anyhow", "async-compression", @@ -1808,7 +1808,7 @@ dependencies = [ [[package]] name = "enso-build-cli" version = "0.1.0" -source = "git+https://github.com/enso-org/ci-build?branch=develop#5a55bf5241f55bd314ba04498b34d048dae93a34" +source = "git+https://github.com/enso-org/ci-build?branch=develop#acc5a7dacc223ad69ebfc7651c5ed0e3c0f1c9e5" dependencies = [ "anyhow", "byte-unit", @@ -2064,13 +2064,14 @@ dependencies = [ "enso-data-structures", "enso-metamodel", "enso-metamodel-lexpr", - "enso-parser-syntax-tree-builder", "enso-parser-syntax-tree-visitor", "enso-prelude", "enso-reflect", "enso-shapely-macros", "enso-types", "lexpr", + "rand 0.8.5", + "rand_chacha 0.3.1", "serde", ] @@ -2085,16 +2086,6 @@ dependencies = [ "enso-reflect", ] -[[package]] -name = "enso-parser-syntax-tree-builder" -version = "0.1.0" -dependencies = [ - "enso-macro-utils", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "enso-parser-syntax-tree-visitor" version = "0.1.0" @@ -3702,7 +3693,7 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5617e92fc2f2501c3e2bc6ce547cad841adba2bae5b921c7e52510beca6d084c" dependencies = [ - "base64 0.13.0", + "base64 0.11.0", "bytes 1.1.0", "http", "httpdate 1.0.2", @@ -3715,7 +3706,7 @@ dependencies = [ [[package]] name = "ide-ci" version = "0.1.0" -source = "git+https://github.com/enso-org/ci-build?branch=develop#5a55bf5241f55bd314ba04498b34d048dae93a34" +source = "git+https://github.com/enso-org/ci-build?branch=develop#acc5a7dacc223ad69ebfc7651c5ed0e3c0f1c9e5" dependencies = [ "anyhow", "async-compression", diff --git a/Cargo.toml b/Cargo.toml index 2ae03f58fbb..8625520d4f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,6 @@ members = [ "build/rust-scripts", "lib/rust/*", "lib/rust/parser/src/syntax/tree/visitor", - "lib/rust/parser/src/syntax/tree/builder", "lib/rust/parser/generate-java", "lib/rust/profiler/data", "integration-test" diff --git a/lib/rust/metamodel/lexpr/src/lib.rs b/lib/rust/metamodel/lexpr/src/lib.rs index 738cb9375f5..798442ab695 100644 --- a/lib/rust/metamodel/lexpr/src/lib.rs +++ b/lib/rust/metamodel/lexpr/src/lib.rs @@ -55,6 +55,7 @@ use enso_metamodel::meta::*; use derivative::Derivative; use lexpr::Value; use std::collections::BTreeMap; +use std::collections::BTreeSet; @@ -69,13 +70,15 @@ pub struct ToSExpr<'g> { graph: &'g TypeGraph, #[derivative(Debug = "ignore")] mappers: BTreeMap Value>>, + skip: BTreeSet, } impl<'g> ToSExpr<'g> { #[allow(missing_docs)] pub fn new(graph: &'g TypeGraph) -> Self { let mappers = Default::default(); - Self { graph, mappers } + let skip = Default::default(); + Self { graph, mappers, skip } } /// Set a transformation to be applied to a type after translating to an S-expression. @@ -83,6 +86,14 @@ impl<'g> ToSExpr<'g> { self.mappers.insert(id, Box::new(f)); } + /// Omit a type, specified by ID, from the output, wherever it occurs. If it occurs as a field + /// in another struct, that field will be omitted. If the type occurs as a variant of an enum, + /// or as the top-level type passed to [`Self::value`], it will be represented as if it had no + /// fields. + pub fn skip(&mut self, id: TypeId) { + self.skip.insert(id); + } + /// Given a bincode-serialized input, use its `meta` type info to transcribe it to an /// S-expression. pub fn value(&self, id: TypeId, input: &T) -> Value { @@ -126,7 +137,10 @@ impl<'g> ToSExpr<'g> { let mut out = vec![]; self.fields(&mut hierarchy, data, &mut out); assert_eq!(hierarchy, &[]); - let mut value = Value::list(out); + let mut value = match self.skip.contains(&id) { + true => Value::Null, + false => Value::list(out), + }; if let Some(id) = child { if let Some(mapper) = self.mappers.get(&id) { value = (mapper)(value); @@ -157,11 +171,14 @@ impl<'g> ToSExpr<'g> { self.fields(hierarchy, data, out); } for (i, field) in fields.iter().enumerate() { + let skip = self.skip.contains(&field.type_); if !field.name.is_empty() { let car = Value::Symbol(format!(":{}", field.name).into_boxed_str()); let cdr = self.value_(field.type_, data); - out.push(Value::cons(car, cdr)); - } else { + if !skip { + out.push(Value::cons(car, cdr)); + } + } else if !skip { out.push(self.value_(field.type_, data)); } if self.graph[id].child_field == Some(i + 1) { diff --git a/lib/rust/parser/Cargo.toml b/lib/rust/parser/Cargo.toml index e752f1e866a..3a69a3aa05e 100644 --- a/lib/rust/parser/Cargo.toml +++ b/lib/rust/parser/Cargo.toml @@ -16,7 +16,6 @@ enso-data-structures = { path = "../data-structures" } enso-types = { path = "../types", features = ["serde"] } enso-shapely-macros = { path = "../shapely/macros" } enso-parser-syntax-tree-visitor = { path = "src/syntax/tree/visitor" } -enso-parser-syntax-tree-builder = { path = "src/syntax/tree/builder" } serde = { version = "1.0", features = ["derive"] } bincode = "1.3" @@ -24,3 +23,5 @@ bincode = "1.3" enso-metamodel = { path = "../metamodel", features = ["rust"] } enso-metamodel-lexpr = { path = "../metamodel/lexpr" } lexpr = "0.2.6" +rand = "0.8.5" +rand_chacha = "0.3.1" diff --git a/lib/rust/parser/generate-java/run.sh b/lib/rust/parser/generate-java/run.sh deleted file mode 100755 index 1c4809071e4..00000000000 --- a/lib/rust/parser/generate-java/run.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh - -set -e - -echo $0 | grep lib/rust || ( echo This tool must be run from the repo root, as lib/rust/parser/generate-java/run.sh; exit 1 ) - -BASE=target/generated_java -OUT=$BASE/org/enso/syntax2 -LIB=lib/rust/parser/generate-java/java -mkdir -p $OUT -cargo test -p enso-parser-generate-java -cargo run -p enso-parser-generate-java --bin enso-parser-generate-java -- $OUT -cargo run -p enso-parser-generate-java --bin java-tests > $BASE/GeneratedFormatTests.java -javac -classpath "$LIB:$BASE" -d $BASE $BASE/GeneratedFormatTests.java -java -classpath $BASE GeneratedFormatTests diff --git a/lib/rust/parser/src/lexer.rs b/lib/rust/parser/src/lexer.rs index 04c35e8b1f9..1a44cc84a26 100644 --- a/lib/rust/parser/src/lexer.rs +++ b/lib/rust/parser/src/lexer.rs @@ -78,10 +78,12 @@ pattern_impl_for_char_slice!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); pub struct Lexer<'s> { #[deref] #[deref_mut] - pub state: LexerState, - pub input: &'s str, - pub iterator: str::CharIndices<'s>, - pub output: Vec>, + pub state: LexerState, + pub input: &'s str, + pub iterator: str::CharIndices<'s>, + pub output: Vec>, + /// Memory for storing tokens, reused as an optimization. + pub token_storage: VecAllocation>, } /// Internal state of the [`Lexer`]. @@ -101,9 +103,10 @@ impl<'s> Lexer<'s> { pub fn new(input: &'s str) -> Self { let iterator = input.char_indices(); let capacity = input.len() / AVERAGE_TOKEN_LEN; - let output = Vec::with_capacity(capacity * mem::size_of::>()); + let output = Vec::with_capacity(capacity); let state = default(); - Self { input, iterator, output, state }.init() + let token_storage = default(); + Self { input, iterator, output, state, token_storage }.init() } fn init(mut self) -> Self { @@ -677,35 +680,31 @@ impl<'s> Lexer<'s> { fn newline(&mut self) { if let Some(token) = self.line_break() { - let mut newlines = vec![token.with_variant(token::Variant::newline())]; + let mut newlines = self.token_storage.take(); while let Some(token) = self.line_break() { newlines.push(token.with_variant(token::Variant::newline())); } let block_indent = self.last_spaces_visible_offset; - if block_indent > self.current_block_indent { let block_start = self.marker_token(token::Variant::block_start()); self.submit_token(block_start); self.start_block(block_indent); - } else { - while block_indent < self.current_block_indent { - let err = "Lexer internal error. Inconsistent code block hierarchy."; - let parent_block_indent = self.end_block().expect(err); - if block_indent > self.current_block_indent { - // The new line indent is smaller than current block but bigger than the - // // previous one. We are treating the line as belonging to the - // block. The warning should be reported by parser. - self.start_block(parent_block_indent); - break; - } else { - let block_end = self.marker_token(token::Variant::block_end()); - self.submit_token(block_end); - } + } + while block_indent < self.current_block_indent { + let previous_indent = self.block_indent_stack.last().copied().unwrap_or_default(); + if block_indent > previous_indent { + // The new line indent is smaller than current block but bigger than the + // previous one. We are treating the line as belonging to the + // block. The warning should be reported by parser. + break; } + self.end_block(); + let block_end = self.marker_token(token::Variant::block_end()); + self.submit_token(block_end); } - for newline in newlines { - self.submit_token(newline); - } + self.submit_token(token.with_variant(token::Variant::newline())); + newlines.drain(..).for_each(|token| self.submit_token(token)); + self.token_storage.set_from(newlines); } } } @@ -876,12 +875,45 @@ mod tests { ident_(" ", "bar"), block_end_("", ""), ]), + ("foo\n +", vec![ + ident_("", "foo"), + block_start_("", ""), + newline_("", "\n"), + operator_(" ", "+"), + block_end_("", ""), + ]), ]); } #[test] - fn test_case_empty() { - test_lexer("", vec![]); + fn test_case_block_bad_indents() { + #[rustfmt::skip] + test_lexer_many(vec![ + ("\n foo\n bar\nbaz", vec![ + block_start_("", ""), + newline_("", "\n"), ident_(" ", "foo"), + newline_("", "\n"), ident_(" ", "bar"), + block_end_("", ""), + newline_("", "\n"), ident_("", "baz"), + ]), + ("\n foo\n bar\n baz", vec![ + block_start_("", ""), + newline_("", "\n"), ident_(" ", "foo"), + newline_("", "\n"), ident_(" ", "bar"), + newline_("", "\n"), ident_(" ", "baz"), + block_end_("", ""), + ]), + ]); + } + + #[test] + fn test_case_whitespace_only_line() { + test_lexer_many(vec![("foo\n \nbar", vec![ + ident_("", "foo"), + newline_("", "\n"), + newline_(" ", "\n"), + ident_("", "bar"), + ])]); } #[test] diff --git a/lib/rust/parser/src/lib.rs b/lib/rust/parser/src/lib.rs index 3b6eba31174..09930b5aaf4 100644 --- a/lib/rust/parser/src/lib.rs +++ b/lib/rust/parser/src/lib.rs @@ -79,10 +79,12 @@ // === Features === #![allow(incomplete_features)] #![feature(allocator_api)] +#![feature(exact_size_is_empty)] #![feature(test)] #![feature(specialization)] #![feature(let_chains)] #![feature(if_let_guard)] +#![feature(box_patterns)] // === Standard Linter Configuration === #![deny(non_ascii_idents)] #![warn(unsafe_code)] @@ -147,15 +149,8 @@ impl Parser { /// Main entry point. pub fn run<'s>(&self, code: &'s str) -> syntax::Tree<'s> { let tokens = lexer::run(code); - let mut statements = vec![]; - let mut tokens = tokens.into_iter().peekable(); - while tokens.peek().is_some() { - let resolver = macros::resolver::Resolver::new_root(); - let tree = resolver.run(&self.macros, &mut tokens); - let tree = expression_to_statement(tree); - statements.push(tree); - } - syntax::Tree::block(statements) + let resolver = macros::resolver::Resolver::new_root(); + resolver.run(&self.macros, tokens) } } @@ -172,73 +167,41 @@ impl Default for Parser { /// /// In statement context, an expression that has an assignment operator at its top level is /// interpreted as a variable assignment or method definition. -fn expression_to_statement(tree: syntax::Tree<'_>) -> syntax::Tree<'_> { +fn expression_to_statement(mut tree: syntax::Tree<'_>) -> syntax::Tree<'_> { use syntax::tree::*; - let tree_ = match &*tree.variant { + let tree_ = match &mut *tree.variant { Variant::OprSectionBoundary(OprSectionBoundary { ast }) => ast, - _ => &tree, + _ => &mut tree, }; - let mut replacement = None; - if let Variant::OprApp(opr_app) = &*tree_.variant { - replacement = expression_to_binding(opr_app); - } - match replacement { - Some(modified) => modified, - None => tree, - } -} - -/// If the input is an "=" expression, try to interpret it as either a variable assignment or method -/// definition. -fn expression_to_binding<'a>(app: &syntax::tree::OprApp<'a>) -> Option> { - use syntax::tree::*; - match app { - OprApp { lhs: Some(lhs), opr: Ok(opr), rhs } if opr.code == "=" => { - let mut lhs = lhs; - let mut args = vec![]; - while let Variant::App(App { func, arg }) = &*lhs.variant { - lhs = func; - args.push(arg.clone()); - } - args.reverse(); - if let Some(rhs) = rhs && args.is_empty() { - Some(Tree::assignment(lhs.clone(), opr.clone(), rhs.clone())) - } else if let Variant::Ident(Ident { token }) = &*lhs.variant { - Some(Tree::function(token.clone(), args, opr.clone(), rhs.clone())) - } else { - None - } + let opr_app = match &mut *tree_.variant { + Variant::OprApp(opr_app) => opr_app, + _ => return tree, + }; + if let OprApp { lhs: Some(lhs), opr: Ok(opr), rhs } = opr_app && opr.code == "=" { + let mut args = vec![]; + let mut lhs = lhs; + while let Tree { variant: box Variant::App(App { func, arg }), .. } = lhs { + lhs = func; + args.push(arg.clone()); + } + args.reverse(); + if args.is_empty() && let Some(rhs) = rhs && !is_body_block(rhs) { + // If the LHS has no arguments, and there is a RHS, and the RHS is not a body block, + // this is a variable assignment. + return Tree::assignment(mem::take(lhs), mem::take(opr), mem::take(rhs)) + } + if let Variant::Ident(Ident { token }) = &mut *lhs.variant { + // If this is not a variable assignment, and the leftmost leaf of the `App` tree is + // an identifier, this is a function definition. + return Tree::function(mem::take(token), args, mem::take(opr), mem::take(rhs)) } - _ => None, } + tree } - - -// ============= -// === Tests === -// ============= - -#[cfg(test)] -mod tests { - use super::*; - use enso_parser_syntax_tree_builder::ast_builder; - - macro_rules! test_parse { - ($input:tt = {$($def:tt)*}) => { - assert_eq!( - Parser::new().run($input), - ast_builder! { $($def)* } - ) - }; - } - - #[test] - fn test_expressions() { - test_parse! {"a" = {a}}; - test_parse! {"a b" = {a b}}; - test_parse! {"a b c" = {[a b] c}}; - } +/// Return whether the expression is a body block. +fn is_body_block(expression: &syntax::tree::Tree<'_>) -> bool { + matches!(&*expression.variant, syntax::tree::Variant::BodyBlock { .. }) } @@ -262,4 +225,44 @@ mod benches { parser.run(&str); }); } + + #[bench] + fn bench_blocks(bencher: &mut Bencher) { + use rand::prelude::*; + use rand_chacha::ChaCha8Rng; + let lines = 10_000; + let mut str = String::new(); + let mut rng = ChaCha8Rng::seed_from_u64(0); + let mut indent = 0u32; + for _ in 0..lines { + // Indent: + // 1/8 chance of increasing. + // 1/8 chance of decreasing. + // 3/4 chance of leaving unchanged. + match rng.gen_range(0..8) { + 0u32 => indent = indent.saturating_sub(1), + 1 => indent += 1, + _ => (), + } + for _ in 0..indent { + str.push(' '); + } + // 1/4 chance of operator-block line syntax. + if rng.gen_range(0..4) == 0u32 { + str.push_str("* "); + } + str.push('x'); + // Equal chance of the next line being interpreted as a body block or argument block + // line, if it is indented and doesn't match the operator-block syntax. + // The `=` operator is chosen to exercise the expression-to-statement conversion path. + if rng.gen() { + str.push_str(" ="); + } + str.push('\n'); + } + let parser = Parser::new(); + bencher.iter(move || { + parser.run(&str); + }); + } } diff --git a/lib/rust/parser/src/macros.rs b/lib/rust/parser/src/macros.rs index b38c7706f5c..f1943339a23 100644 --- a/lib/rust/parser/src/macros.rs +++ b/lib/rust/parser/src/macros.rs @@ -106,8 +106,7 @@ fn matched_segments_into_multi_segment_app( let segments = matched_segments.mapped(|segment| { let header = segment.header; let tokens = segment.result.tokens(); - let body = (!tokens.is_empty()) - .as_some_from(|| syntax::operator::resolve_operator_precedence(tokens)); + let body = syntax::operator::resolve_operator_precedence_if_non_empty(tokens); syntax::tree::MultiSegmentAppSegment { header, body } }); syntax::Tree::multi_segment_app(segments) diff --git a/lib/rust/parser/src/macros/built_in.rs b/lib/rust/parser/src/macros/built_in.rs index 47d928911d5..f1e0d3b122b 100644 --- a/lib/rust/parser/src/macros/built_in.rs +++ b/lib/rust/parser/src/macros/built_in.rs @@ -67,7 +67,7 @@ fn type_def_body(matched_segments: NonEmptyVec) -> syntax::Tree let mut v = match_tree.view(); let name = &v.query("name").unwrap()[0]; - let name = operator::resolve_operator_precedence(name.clone()); + let name = operator::resolve_operator_precedence_if_non_empty(name.clone()).unwrap(); // println!("{:#?}", name); // println!("\n\n------------- 2"); @@ -78,7 +78,7 @@ fn type_def_body(matched_segments: NonEmptyVec) -> syntax::Tree let params = params .iter() - .map(|tokens| operator::resolve_operator_precedence(tokens.clone())) + .map(|tokens| operator::resolve_operator_precedence_if_non_empty(tokens.clone()).unwrap()) .collect_vec(); // println!("{:#?}", params); syntax::Tree::type_def(segment.header, name, params) diff --git a/lib/rust/parser/src/macros/resolver.rs b/lib/rust/parser/src/macros/resolver.rs index 52d32a47ae2..82b814f7175 100644 --- a/lib/rust/parser/src/macros/resolver.rs +++ b/lib/rust/parser/src/macros/resolver.rs @@ -1,4 +1,26 @@ //! Macro resolver implementation. Refer to the docs of the main parser module to learn more. +//! +//! # Blocks +//! +//! Macro resolution is informed by block structure. +//! +//! Macros can explicitly manipulate blocks: A macro can use [`pattern`]s to match depending on the +//! contents of a child block, and a macro can create any arbitrary block structure in its output. +//! +//! However, there is one rule that makes block structure more primitive than macros: Each of a +//! macro's segments must begin in the top level of the same block. +//! +//! For some invalid inputs, this rule affects how errors are reported. For example: +//! ```Enso +//! if foo +//! then bar +//! ``` +//! This will be parsed as an `if` macro whose condition is an argument block application applying +//! `foo` to `then bar`; the reported error will be an incomplete application of the `if` macro. +//! +//! This is implemented by starting a new macro resolution [`Scope`] at the beginning of every +//! block; the new scope is initialized with only the root macro. Within a scope the state of all +//! macros defined in parent scopes will never be advanced. use crate::prelude::*; @@ -95,11 +117,16 @@ impl<'a> PartiallyMatchedMacro<'a> { body: Rc::new(|v| { // Taking the first segment, hardcoded above. let body = v.pop().0.result; - syntax::operator::resolve_operator_precedence(body.tokens()) + syntax::operator::resolve_operator_precedence_if_non_empty(body.tokens()).unwrap() }), })); Self { current_segment, resolved_segments, possible_next_segments, matched_macro_def } } + + /// Append an item or partially-matched macro to the current segment. + fn push(&mut self, item: impl Into>) { + self.current_segment.body.push(item.into()); + } } @@ -174,8 +201,12 @@ impl<'s> TryAsRef> for ItemOrPartiallyMatchedMacro<'s> /// to learn more about the macro resolution steps. #[derive(Debug)] pub struct Resolver<'s> { - current_macro: PartiallyMatchedMacro<'s>, - macro_stack: Vec>, + current_macro: PartiallyMatchedMacro<'s>, + macro_stack: Vec>, + scopes: Vec>, + lines: Vec>, + newline: Option>, + line_contains_items: bool, } /// Result of the macro resolution step. @@ -186,26 +217,50 @@ enum Step<'s> { MacroStackPop(syntax::Item<'s>), } +/// Information about macro resolution state that is stored while processing a deeper indentation +/// level. +/// +/// See the module docs ([`self`]) for about the interaction between blocks and macros. +#[derive(Debug)] +struct Scope<'s> { + parent_tokens: std::vec::IntoIter>, + macros_start: usize, + outputs_start: usize, + prev_newline: Option>, + prev_macro: PartiallyMatchedMacro<'s>, +} + impl<'s> Resolver<'s> { /// New resolver with a special "root" segment definition allowing parsing arbitrary /// expressions. pub fn new_root() -> Self { let current_macro = PartiallyMatchedMacro::new_root(); let macro_stack = default(); - Self { current_macro, macro_stack } + let scopes = default(); + let lines = default(); + let newline = Some(token::newline("", "")); + let line_contains_items = default(); + Self { current_macro, macro_stack, scopes, lines, newline, line_contains_items } } - fn replace_current_with_parent_macro(&mut self, mut parent_macro: PartiallyMatchedMacro<'s>) { - mem::swap(&mut parent_macro, &mut self.current_macro); - let child_macro = parent_macro; - self.current_macro.current_segment.body.push(child_macro.into()); + fn replace_current_with_parent_macro(&mut self, parent_macro: PartiallyMatchedMacro<'s>) { + let child_macro = mem::replace(&mut self.current_macro, parent_macro); + self.current_macro.push(child_macro); + } + + /// Returns the index of the first element in `self.macro_stack` that is active in the current + /// scope. Any macros before that index are active in some block that contains the current + /// block, so they will not match tokens within this block. + fn macro_scope_start(&self) -> usize { + self.scopes.last().map(|scope| scope.macros_start).unwrap_or_default() } /// Pop the macro stack if the current token is reserved. For example, when matching the /// `if a if b then c then d` expression, the token `then` after the token `c` will be /// considered reserved and the macro resolution of `if b then c` will be popped from the stack. fn pop_macro_stack_if_reserved(&mut self, repr: &str) -> Option> { - let reserved = self.macro_stack.iter().any(|p| p.possible_next_segments.contains_key(repr)); + let macros = &self.macro_stack[self.macro_scope_start()..]; + let reserved = macros.iter().any(|p| p.possible_next_segments.contains_key(repr)); reserved.and_option_from(|| self.macro_stack.pop()) } @@ -213,16 +268,15 @@ impl<'s> Resolver<'s> { pub fn run( mut self, root_macro_map: &SegmentMap<'s>, - tokens: &mut iter::Peekable>>, + tokens: Vec>, ) -> syntax::Tree<'s> { + let mut tokens = tokens.into_iter(); event!(TRACE, "Running macro resolver. Registered macros:\n{:#?}", root_macro_map); let mut opt_item: Option>; macro_rules! next_token { () => {{ opt_item = tokens.next(); - if let Some(token) = opt_item.as_ref() { - event!(TRACE, "New token {:#?}", token); - } + event!(TRACE, "Next token {:#?}", &opt_item); }}; } macro_rules! trace_state { @@ -232,9 +286,61 @@ impl<'s> Resolver<'s> { }; } next_token!(); - while let Some(token) = opt_item && !token.is_newline() { + loop { + while opt_item.is_none() { + if let Some(newline) = self.newline.take() { + let expression = self.line_contains_items.as_some_from(|| self.unwind_stack()); + self.lines.push(syntax::tree::block::Line { newline, expression }); + } + if let Some(parent_tokens) = self.exit_current_scope() { + tokens = parent_tokens; + next_token!(); + continue; + } + break; + } + let token = match opt_item { + Some(token) => token, + None => break, + }; + if let syntax::Item::Token(Token { + variant: token::Variant::Newline(_), + left_offset, + code, + }) = token + { + let new_newline = token::newline(left_offset, code); + let newline = mem::replace(&mut self.newline, Some(new_newline)); + if let Some(newline) = newline { + let expression = self.line_contains_items.as_some_from(|| self.unwind_stack()); + self.lines.push(syntax::tree::block::Line { newline, expression }); + } + next_token!(); + self.line_contains_items = false; + continue; + } + self.line_contains_items = true; let step_result = match token { syntax::Item::Token(token) => self.process_token(root_macro_map, token), + syntax::Item::Block(tokens_) => { + let parent_tokens = mem::replace(&mut tokens, tokens_.into_iter()); + let new_root = PartiallyMatchedMacro::new_root(); + let prev_macro = mem::replace(&mut self.current_macro, new_root); + let macros_start = self.macro_stack.len(); + let outputs_start = self.lines.len(); + let prev_newline = self.newline.take(); + let scope = Scope { + parent_tokens, + macros_start, + outputs_start, + prev_newline, + prev_macro, + }; + self.scopes.push(scope); + next_token!(); + self.line_contains_items = false; + continue; + } _ => Step::NormalToken(token), }; match step_result { @@ -247,20 +353,63 @@ impl<'s> Resolver<'s> { next_token!() } Step::NormalToken(item) => { - self.current_macro.current_segment.body.push(item.into()); + self.current_macro.push(item); trace_state!(); next_token!(); } } } + syntax::tree::block::body_from_lines(self.lines) + } - event!(TRACE, "Finishing resolution. Popping the macro stack."); - while let Some(parent_macro) = self.macro_stack.pop() { - self.replace_current_with_parent_macro(parent_macro); + /// Finish processing the current block and close its macro scope, unless this is the top-level + /// block, which is indicated by returning `None`. + /// + /// This builds a [`syntax::Item::Block`] from the outputs of the current scope, restores the + /// state to resume processing the parent scope, and submits the built block as a token to the + /// newly-current macro (which would have been the macro active when the block began). + /// + /// Returns the remaining tokens of the parent block. + fn exit_current_scope(&mut self) -> Option>> { + let scope = self.scopes.pop()?; + let Scope { parent_tokens, macros_start, outputs_start, prev_newline, prev_macro } = scope; + debug_assert_eq!(macros_start, self.macro_stack.len()); + self.current_macro = prev_macro; + let lines = self.lines.drain(outputs_start..); + let mut out = Vec::with_capacity(lines.len() * 2); + for line in lines { + let syntax::tree::block::Line { newline, expression } = line; + let newline = syntax::Token::from(newline); + let newline = syntax::Item::from(newline); + out.push(newline); + if let Some(expression) = expression { + let expression = syntax::Item::from(expression); + out.push(expression); + } } + let block = syntax::Item::Block(out); + self.current_macro.push(block); + self.line_contains_items = true; + self.newline = prev_newline; + Some(parent_tokens) + } + fn unwind_stack(&mut self) -> syntax::Tree<'s> { + macro_rules! trace_state { + () => { + event!(TRACE, "Current macro:\n{:#?}", self.current_macro); + event!(TRACE, "Parent macros:\n{:#?}", self.macro_stack); + }; + } + event!(TRACE, "Finishing resolution. Popping the macro stack."); + let macros = self.macro_stack.drain(self.macro_scope_start()..).rev(); + for parent_macro in macros { + let child_macro = mem::replace(&mut self.current_macro, parent_macro); + self.current_macro.push(child_macro); + } trace_state!(); - let (tree, rest) = Self::resolve(self.current_macro); + let macro_ = mem::replace(&mut self.current_macro, PartiallyMatchedMacro::new_root()); + let (tree, rest) = Self::resolve(macro_); if !rest.is_empty() { panic!( "Internal error. Not all tokens were consumed by the macro resolver:\n{:#?}", diff --git a/lib/rust/parser/src/main.rs b/lib/rust/parser/src/main.rs index 9ae884e98cd..cbd5928a0ab 100644 --- a/lib/rust/parser/src/main.rs +++ b/lib/rust/parser/src/main.rs @@ -34,7 +34,7 @@ use enso_parser::prelude::*; fn main() { init_tracing(TRACE); - let ast = enso_parser::Parser::new().run("type Option (a) b c"); + let ast = enso_parser::Parser::new().run("foo = 23"); println!("\n\n==================\n\n"); println!("{:#?}", ast); } diff --git a/lib/rust/parser/src/source/code.rs b/lib/rust/parser/src/source/code.rs index af51ab7fead..affbaad7839 100644 --- a/lib/rust/parser/src/source/code.rs +++ b/lib/rust/parser/src/source/code.rs @@ -15,7 +15,7 @@ use crate::prelude::*; pub struct Code<'s> { #[serde(serialize_with = "crate::serialization::serialize_cow")] #[serde(deserialize_with = "crate::serialization::deserialize_cow")] - #[reflect(as = "crate::serialization::Code")] + #[reflect(as = "crate::serialization::Code", flatten)] pub repr: Cow<'s, str>, } diff --git a/lib/rust/parser/src/source/span.rs b/lib/rust/parser/src/source/span.rs index 69e7117e12d..360af96f8f9 100644 --- a/lib/rust/parser/src/source/span.rs +++ b/lib/rust/parser/src/source/span.rs @@ -327,7 +327,10 @@ where T: Builder<'s> { #[inline(always)] fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> { - self.as_mut().map(|t| Builder::add_to_span(t, span)).unwrap_or_default() + match self { + Some(t) => Builder::add_to_span(t, span), + None => span, + } } } diff --git a/lib/rust/parser/src/syntax/item.rs b/lib/rust/parser/src/syntax/item.rs index 42a87e801c5..323a94e579f 100644 --- a/lib/rust/parser/src/syntax/item.rs +++ b/lib/rust/parser/src/syntax/item.rs @@ -14,7 +14,7 @@ use crate::syntax::*; /// Abstraction for [`Token`] and [`Tree`]. Some functions, such as macro resolver need to /// distinguish between two cases and need to handle both incoming tokens and already constructed /// [`Tree`] nodes. This structure provides handy utilities to work with such cases. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] #[allow(missing_docs)] pub enum Item<'s> { Token(Token<'s>), @@ -47,10 +47,20 @@ impl<'s> Item<'s> { Item::Token(token) => match token.variant { token::Variant::Ident(ident) => Tree::ident(token.with_variant(ident)), token::Variant::Number(number) => Tree::number(token.with_variant(number)), - _ => todo!(), + _ => todo!("{token:?}"), }, Item::Tree(ast) => ast, - Item::Block(_) => todo!(), + Item::Block(items) => build_block(items), + } + } + + /// If this item is an [`Item::Tree`], apply the given function to the contained [`Tree`] and + /// return the result. + pub fn map_tree<'t: 's, F>(self, f: F) -> Self + where F: FnOnce(Tree<'s>) -> Tree<'t> { + match self { + Item::Tree(tree) => Item::Tree(f(tree)), + _ => self, } } } @@ -73,6 +83,32 @@ impl<'s> TryAsRef> for Item<'s> { } } +/// Given a sequence of [`Item`]s belonging to one block, create an AST block node, of a type +/// determined by the syntax of the lines in the block. +fn build_block<'s>(items: impl IntoIterator>) -> Tree<'s> { + let mut line = vec![]; + let mut block_builder = tree::block::Builder::new(); + let mut newline = None; + for item in items { + match item { + Item::Token(Token { variant: token::Variant::Newline(_), left_offset, code }) => { + let newline = mem::replace(&mut newline, Some(token::newline(left_offset, code))); + if let Some(newline) = newline { + let line: Vec<_> = line.drain(..).collect(); + let expression = operator::resolve_operator_precedence_if_non_empty(line); + block_builder.push(newline, expression); + } + } + _ => line.push(item), + } + } + if let Some(newline) = newline { + let expression = operator::resolve_operator_precedence_if_non_empty(line); + block_builder.push(newline, expression); + } + block_builder.build() +} + // =========== diff --git a/lib/rust/parser/src/syntax/operator.rs b/lib/rust/parser/src/syntax/operator.rs index 1bc7a5cea32..96316622cd4 100644 --- a/lib/rust/parser/src/syntax/operator.rs +++ b/lib/rust/parser/src/syntax/operator.rs @@ -13,7 +13,7 @@ use crate::syntax::token::Token; // ================== // FIXME: The current implementation hard-codes precedence values and does not support precedence -// computations for any operator (according to the spec) +// computations for any operator (according to the spec) fn precedence_of(operator: &str) -> usize { match operator { "=" => 1, @@ -45,77 +45,93 @@ impl WithPrecedence { /// example, `if cond then.x else.y` is parsed as `if cond then .x else .y`, which after expansion /// translates to `if cond then (\t -> t.x) else (\t -> t.y)`. However, for some macros spacing is /// not needed. For example, `(.x)` is parsed as `(\t -> t.x)`, which is understandable. -fn annotate_tokens_that_need_spacing(items: Vec) -> Vec { - // TODO: It should be possible to make it faster by iterating over mut vec. To be checked. - items - .into_iter() - .map(|item| match item { - syntax::Item::Block(_) => item, - syntax::Item::Token(_) => item, - syntax::Item::Tree(ast) => syntax::Item::Tree(match &*ast.variant { - syntax::tree::Variant::MultiSegmentApp(data) - if !data.segments.first().header.is_symbol() => - ast.with_error("This expression cannot be used in a non-spaced equation."), - _ => ast, - }), - }) - .collect() +fn annotate_tokens_that_need_spacing(item: syntax::Item) -> syntax::Item { + use syntax::tree::Variant::*; + item.map_tree(|ast| match &*ast.variant { + MultiSegmentApp(data) if !data.segments.first().header.is_symbol() => + ast.with_error("This expression cannot be used in a non-spaced equation."), + _ => ast, + }) } -/// Take [`Item`] stream, resolve operators precedence and return the final AST. The precedence -/// resolution algorithm bases on the [Shunting yard algorithm](https://en.wikipedia.org/wiki/Shunting_yard_algorithm). -/// It is extended to handle operator sections. -#[inline(always)] -pub fn resolve_operator_precedence<'s>(items: Vec>) -> syntax::Tree<'s> { +/// If the input sequence is non-empty, return the result of applying +/// [`resolve_operator_precedence`] to it. +pub fn resolve_operator_precedence_if_non_empty( + items: Vec>, +) -> Option> { + match NonEmptyVec::try_from(items) { + Ok(items) => Some(resolve_operator_precedence(items)), + _ => None, + } +} + +/// Take [`Item`] stream, resolve operator precedence and return the final AST. +/// +/// The precedence resolution algorithm is based on the Shunting yard algorithm[1], extended to +/// handle operator sections. +/// [1]: https://en.wikipedia.org/wiki/Shunting_yard_algorithm +pub fn resolve_operator_precedence<'s>(items: NonEmptyVec>) -> syntax::Tree<'s> { type Tokens<'s> = Vec>; let mut flattened: Tokens<'s> = default(); let mut no_space_group: Tokens<'s> = default(); - let processs_no_space_group = |flattened: &mut Tokens<'s>, no_space_group: &mut Tokens<'s>| { - let tokens = mem::take(no_space_group); - if tokens.len() == 1 { + let process_no_space_group = |flattened: &mut Tokens<'s>, no_space_group: &mut Tokens<'s>| { + let tokens = no_space_group.drain(..); + if tokens.len() < 2 { flattened.extend(tokens); } else { - let tokens = annotate_tokens_that_need_spacing(tokens); + let tokens = tokens.map(annotate_tokens_that_need_spacing); let ast = resolve_operator_precedence_internal(tokens); flattened.push(ast.into()); } }; - for item in items { - if item.left_visible_offset().width_in_spaces == 0 || no_space_group.is_empty() { - no_space_group.push(item) - } else if !no_space_group.is_empty() { - processs_no_space_group(&mut flattened, &mut no_space_group); - no_space_group.push(item); - } else { - // FIXME: this is unreachable. - flattened.push(item); + // Returns `true` for an item if that item should not follow any other item in a no-space group + // (i.e. the item has "space" before it). + let starts_new_no_space_group = |item: &syntax::item::Item| { + if item.left_visible_offset().width_in_spaces != 0 { + return true; } + if let syntax::item::Item::Block(_) = item { + return true; + } + false + }; + for item in items { + if starts_new_no_space_group(&item) { + process_no_space_group(&mut flattened, &mut no_space_group); + } + no_space_group.push(item); } - if !no_space_group.is_empty() { - processs_no_space_group(&mut flattened, &mut no_space_group); - } + process_no_space_group(&mut flattened, &mut no_space_group); resolve_operator_precedence_internal(flattened) } -fn resolve_operator_precedence_internal(items: Vec>) -> syntax::Tree<'_> { +fn resolve_operator_precedence_internal<'s>( + items: impl IntoIterator>, +) -> syntax::Tree<'s> { // Reverse-polish notation encoding. + /// Classify an item as an operator-token, or other data; we track this state information + /// because whenever consecutive operators or consecutive non-operators occur, we merge them + /// into one node. + #[derive(PartialEq, Eq)] + enum ItemType { + Ast, + Opr, + } + use ItemType::*; let mut was_section_used = false; let mut output: Vec = default(); let mut operator_stack: Vec> = default(); - let mut last_token_was_ast = false; - let mut last_token_was_opr = false; + let mut prev_type = None; for item in items { - if let syntax::Item::Token(token) = item.clone() - && let token::Variant::Operator(opr) = token.variant { + if let syntax::Item::Token( + Token { variant: token::Variant::Operator(opr), left_offset, code }) = item { // Item is an operator. - let last_token_was_opr_copy = last_token_was_opr; - last_token_was_ast = false; - last_token_was_opr = true; + let prev_type = mem::replace(&mut prev_type, Some(Opr)); - let prec = precedence_of(&token.code); - let opr = Token(token.left_offset, token.code, opr); + let prec = precedence_of(&code); + let opr = Token(left_offset, code, opr); - if last_token_was_opr_copy && let Some(prev_opr) = operator_stack.last_mut() { + if prev_type == Some(Opr) && let Some(prev_opr) = operator_stack.last_mut() { // Error. Multiple operators next to each other. match &mut prev_opr.elem { Err(err) => err.operators.push(opr), @@ -133,37 +149,38 @@ fn resolve_operator_precedence_internal(items: Vec>) -> syntax: // Prev operator in the [`operator_stack`] has a higher precedence. let lhs = output.pop().map(|t| t.to_ast()); if lhs.is_none() { was_section_used = true; } - let ast = syntax::Tree::opr_app(lhs, prev_opr.elem, Some(rhs.to_ast())); + let ast = syntax::tree::apply_operator(lhs, prev_opr.elem, Some(rhs.to_ast())); output.push(ast.into()); } operator_stack.push(WithPrecedence::new(prec, Ok(opr))); } - } else if last_token_was_ast && let Some(lhs) = output.pop() { + } else if prev_type == Some(Ast) && let Some(lhs) = output.pop() { // Multiple non-operators next to each other. let lhs = lhs.to_ast(); let rhs = item.to_ast(); - let ast = syntax::Tree::app(lhs, rhs); + let ast = syntax::tree::apply(lhs, rhs); output.push(ast.into()); } else { // Non-operator that follows previously consumed operator. - last_token_was_ast = true; - last_token_was_opr = false; + prev_type = Some(Ast); output.push(item); } } - let mut opt_rhs = last_token_was_ast.and_option_from(|| output.pop().map(|t| t.to_ast())); + let mut opt_rhs = (prev_type == Some(Ast)).and_option_from(|| output.pop().map(|t| t.to_ast())); while let Some(opr) = operator_stack.pop() { let opt_lhs = output.pop().map(|t| t.to_ast()); if opt_lhs.is_none() || opt_rhs.is_none() { was_section_used = true; } - opt_rhs = Some(syntax::Tree::opr_app(opt_lhs, opr.elem, opt_rhs)); + opt_rhs = Some(syntax::tree::apply_operator(opt_lhs, opr.elem, opt_rhs)); } if !output.is_empty() { panic!("Internal error. Not all tokens were consumed while constructing the expression."); } - // FIXME + // This unwrap is safe because: + // - resolve_operator_precedence only calls this function with non-empty sequences as inputs. + // - Given a non-empty input, we will always have at least one output. let out = opt_rhs.unwrap(); if was_section_used { syntax::Tree::opr_section_boundary(out) diff --git a/lib/rust/parser/src/syntax/token.rs b/lib/rust/parser/src/syntax/token.rs index b8c4e1784c4..343f2bf3e4c 100644 --- a/lib/rust/parser/src/syntax/token.rs +++ b/lib/rust/parser/src/syntax/token.rs @@ -103,7 +103,7 @@ use enso_shapely_macros::tagged_enum; // ============= /// The lexical token definition. See the module docs to learn more about its usage scenarios. -#[derive(Clone, Deref, DerefMut, Eq, PartialEq, Serialize, Reflect, Deserialize)] +#[derive(Clone, Default, Deref, DerefMut, Eq, PartialEq, Serialize, Reflect, Deserialize)] #[allow(missing_docs)] pub struct Token<'s, T = Variant> { #[deref] @@ -248,6 +248,8 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! { $($arg #[allow(missing_docs)] #[tagged_enum(apply_attributes_to = "variants")] #[reflect(inline)] + #[tagged_enum(apply_attributes_to = "variant-types")] + #[derive(Default)] pub enum Variant { Newline, Symbol, @@ -272,6 +274,12 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! { $($arg } }}} +impl Default for Variant { + fn default() -> Self { + Self::Newline(variant::Newline {}) + } +} + macro_rules! generate_token_aliases { ( $(#$enum_meta:tt)* diff --git a/lib/rust/parser/src/syntax/tree.rs b/lib/rust/parser/src/syntax/tree.rs index 5ad8b746dbb..756303f264d 100644 --- a/lib/rust/parser/src/syntax/tree.rs +++ b/lib/rust/parser/src/syntax/tree.rs @@ -9,6 +9,8 @@ use crate::span_builder; use enso_parser_syntax_tree_visitor::Visitor; use enso_shapely_macros::tagged_enum; +pub mod block; + // ============ @@ -53,6 +55,15 @@ impl<'s> AsRef> for Tree<'s> { } } +impl<'s> Default for Tree<'s> { + fn default() -> Self { + Self { + variant: Box::new(Variant::Ident(Ident { token: Default::default() })), + span: Default::default(), + } + } +} + /// Macro providing [`Tree`] type definition. It is used to both define the ast [`Variant`], and to /// define impls for every token type in other modules. #[macro_export] @@ -68,8 +79,28 @@ macro_rules! with_ast_definition { ($f:ident ($($args:tt)*)) => { $f! { $($args) pub error: Error, pub ast: Tree<'s>, }, - Block { - pub statements: Vec>, + /// A sequence of lines introduced by a line ending in an operator. + BodyBlock { + /// The lines of the block. + pub statements: Vec>, + }, + /// A sequence of lines comprising the arguments of a function call. + ArgumentBlockApplication { + /// The expression for the value to which the arguments are to be applied. + pub lhs: Option>, + /// The lines of the block. + pub arguments: Vec>, + }, + /// A sequence of lines comprising a tree of operator expressions. + OperatorBlockApplication { + /// The expression preceding the block; this will be the leftmost-leaf of the binary + /// tree. + pub lhs: Option>, + /// The lines of the block. + pub expressions: Vec>, + /// Lines that appear lexically within the block, but are not syntactically consistent + /// with an operator block. + pub excess: Vec>, }, /// A simple identifier, like `foo` or `bar`. Ident { @@ -115,15 +146,25 @@ macro_rules! with_ast_definition { ($f:ident ($($args:tt)*)) => { $f! { $($args) pub name: Tree<'s>, pub params: Vec>, }, + /// A variable assignment, like `foo = bar 23`. Assignment { + /// The pattern which should be unified with the expression. pub pattern: Tree<'s>, + /// The `=` token. pub equals: token::Operator<'s>, + /// The expression initializing the value(s) in the pattern. pub expr: Tree<'s>, }, + /// A function definition, like `add x y = x + y`. Function { + /// The identifier to which the function should be bound. pub name: token::Ident<'s>, + /// The argument patterns. pub args: Vec>, + /// The `=` token. pub equals: token::Operator<'s>, + /// The body, which will typically be an inline expression or a `BodyBlock` expression. + /// It is an error for this to be empty. pub body: Option>, }, } @@ -135,7 +176,7 @@ macro_rules! generate_variant_constructors { pub enum $enum:ident<'s> { $( $(#$variant_meta:tt)* - $variant:ident $({ $(pub $field:ident : $field_ty:ty),* $(,)? })? + $variant:ident $({$($(#$field_meta:tt)* pub $field:ident : $field_ty:ty),* $(,)? })? ),* $(,)? } ) => { paste! { @@ -212,6 +253,29 @@ impl<'s> span::Builder<'s> for MultipleOperatorError<'s> { } } +/// A sequence of one or more operators. +pub trait NonEmptyOperatorSequence<'s> { + /// Return a reference to the first operator. + fn first_operator(&self) -> &token::Operator<'s>; + /// Return a mutable reference to the first operator. + fn first_operator_mut(&mut self) -> &mut token::Operator<'s>; +} + +impl<'s> NonEmptyOperatorSequence<'s> for OperatorOrError<'s> { + fn first_operator(&self) -> &token::Operator<'s> { + match self { + Ok(opr) => opr, + Err(oprs) => oprs.operators.first(), + } + } + fn first_operator_mut(&mut self) -> &mut token::Operator<'s> { + match self { + Ok(opr) => opr, + Err(oprs) => oprs.operators.first_mut(), + } + } +} + // === MultiSegmentApp === @@ -231,6 +295,53 @@ impl<'s> span::Builder<'s> for MultiSegmentAppSegment<'s> { +// ==================================== +// === Tree-construction operations === +// ==================================== + +/// Join two nodes with a new node appropriate for their types. +/// +/// For most input types, this simply constructs an `App`; however, for some block type operands +/// application has special semantics. +pub fn apply<'s>(func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> { + match &mut *arg.variant { + Variant::ArgumentBlockApplication(block) if block.lhs.is_none() => { + block.lhs = Some(func); + arg + } + Variant::OperatorBlockApplication(block) if block.lhs.is_none() => { + block.lhs = Some(func); + arg + } + _ => Tree::app(func, arg), + } +} + +/// Join two nodes with an operator, in a way appropriate for their types. +/// +/// For most operands this will simply construct an `OprApp`; however, a non-operator block (i.e. an +/// `ArgumentBlock`) is reinterpreted as a `BodyBlock` when it appears in the RHS of an operator +/// expression. +pub fn apply_operator<'s>( + lhs: Option>, + opr: OperatorOrError<'s>, + mut rhs: Option>, +) -> Tree<'s> { + if let Some(rhs_) = rhs.as_mut() { + if let Variant::ArgumentBlockApplication(block) = &mut *rhs_.variant { + if block.lhs.is_none() { + let ArgumentBlockApplication { lhs: _, arguments } = block; + let arguments = mem::take(arguments); + let rhs_ = block::body_from_lines(arguments); + rhs = Some(rhs_); + } + } + } + Tree::opr_app(lhs, opr, rhs) +} + + + // ================ // === Visitors === // ================ diff --git a/lib/rust/parser/src/syntax/tree/block.rs b/lib/rust/parser/src/syntax/tree/block.rs new file mode 100644 index 00000000000..9098f4a7ef8 --- /dev/null +++ b/lib/rust/parser/src/syntax/tree/block.rs @@ -0,0 +1,252 @@ +//! Code blocks. + + + +use crate::syntax::tree::*; + + + +// ============= +// === Lines === +// ============= + +/// A line of code. +#[derive(Debug, Clone, PartialEq, Eq, Visitor, Reflect, Serialize, Deserialize)] +pub struct Line<'s> { + /// Token ending the previous line, if any. + pub newline: token::Newline<'s>, + /// The content of the line, if any. + pub expression: Option>, +} + +impl<'s> Line<'s> { + /// Transform the content of the line with the provided function, if any is present; return the + /// result. + pub fn map_expression(self, f: impl FnOnce(Tree<'s>) -> Tree<'s>) -> Self { + let Self { newline, expression } = self; + let expression = expression.map(f); + Self { newline, expression } + } +} + +impl<'s> From> for Line<'s> { + fn from(newline: token::Newline<'s>) -> Self { + Self { newline, expression: None } + } +} + +impl<'s> span::Builder<'s> for Line<'s> { + fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> { + span.add(&mut self.newline).add(&mut self.expression) + } +} + + + +// ================== +// === Body Block === +// ================== + +/// Build a body block from a sequence of lines; this involves reinterpreting the input expressions +/// in statement context (i.e. expressions at the top-level of the block that involve the `=` +/// operator will be reinterpreted as function/variable bindings). +pub fn body_from_lines<'s>(expressions: impl IntoIterator>) -> Tree<'s> { + use crate::expression_to_statement; + let expressions = expressions.into_iter(); + let statements = expressions.map(|line| line.map_expression(expression_to_statement)); + let statements = statements.collect(); + Tree::body_block(statements) +} + + + +// ====================== +// === Operator Block === +// ====================== + +/// The content of a line in an operator block. +#[derive(Debug, Clone, PartialEq, Eq, Visitor, Reflect, Serialize, Deserialize)] +pub struct OperatorBlockExpression<'s> { + /// The operator at the beginning of the line. + pub operator: OperatorOrError<'s>, + /// The rest of the expression. + pub expression: Tree<'s>, +} + +/// Interpret the given expression as an `OperatorBlockExpression`, if it fits the correct pattern. +fn to_operator_block_expression( + expression_: Tree<'_>, +) -> Result, Tree<'_>> { + let tree_ = match &*expression_.variant { + Variant::OprSectionBoundary(OprSectionBoundary { ast }) => ast, + _ => return Err(expression_), + }; + if let Variant::OprApp(OprApp { lhs: None, opr, rhs: Some(expression) }) = &*tree_.variant { + if expression.span.left_offset.visible.width_in_spaces < 1 { + return Err(expression_); + } + let mut operator = opr.clone(); + operator.first_operator_mut().left_offset = expression_.span.left_offset; + let expression = expression.clone(); + Ok(OperatorBlockExpression { operator, expression }) + } else { + Err(expression_) + } +} + +impl<'s> span::Builder<'s> for OperatorBlockExpression<'s> { + fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> { + span.add(&mut self.operator).add(&mut self.expression) + } +} + + +// === Operator block lines ==== + +/// A line in an operator block. +#[derive(Debug, Clone, PartialEq, Eq, Visitor, Reflect, Serialize, Deserialize)] +pub struct OperatorLine<'s> { + /// Token ending the previous line, if any. + pub newline: token::Newline<'s>, + /// The operator-expression, if any. + pub expression: Option>, +} + +impl<'s> From> for OperatorLine<'s> { + fn from(newline: token::Newline<'s>) -> Self { + Self { newline, expression: None } + } +} + +impl<'s> span::Builder<'s> for OperatorLine<'s> { + fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> { + span.add(&mut self.newline).add(&mut self.expression) + } +} + + + +// ===================== +// === Block Builder === +// ===================== + +/// Builds an AST block type from a sequence of lines. +/// +/// Note that the block type is not fully determined at this stage: We apply context information +/// later (see `apply_operator`) to distinguish the two non-operator block types, `BodyBlock` and +/// `ArgumentBlockApplication`. Here we treat every non-operator block as an argument block, +/// because creating a body block involves re-interpreting the expressions in statement context. +/// +/// The implementation is a state machine. The only top-level transitions are: +/// - `Indeterminate` -> `Operator` +/// - `Indeterminate` -> `NonOperator` +/// +/// The `Operator` state has two substates, and one possible transition: +/// - `body_lines is empty` -> `body_lines is not empty` +#[derive(Debug)] +pub enum Builder<'s> { + /// The builder is in an indeterminate state until a non-empty line has been encountered, which + /// would distinguish an operator-block from a non-operator block. + Indeterminate { + /// The `Newline` token introducing the block, and `Newline` tokens for any empty lines + /// that have been encountered. + empty_lines: Vec>, + }, + /// Building an operator block. If any line doesn't fit the operator-block syntax, that line + /// and all following will be placed in `body_lines`. + Operator { + /// Valid operator-block expressions. + operator_lines: Vec>, + /// Any lines violating the expected operator-block syntax. + body_lines: Vec>, + }, + /// Building a non-operator block (either a body block or an argument block). + NonOperator { + /// The block content. + body_lines: Vec>, + }, +} + +impl<'s> Builder<'s> { + /// Create a new instance, in initial state. + pub fn new() -> Self { + Self::Indeterminate { empty_lines: default() } + } + + /// Create a new instance, in a state appropriate for the given expression. + fn new_with_expression( + empty_lines: impl IntoIterator>, + newline: token::Newline<'s>, + expression: Tree<'s>, + ) -> Self { + let empty_lines = empty_lines.into_iter(); + let new_lines = 1; + match to_operator_block_expression(expression) { + Ok(expression) => { + let expression = Some(expression); + let mut operator_lines = Vec::with_capacity(empty_lines.size_hint().0 + new_lines); + operator_lines.extend(empty_lines.map(block::OperatorLine::from)); + operator_lines.push(OperatorLine { newline, expression }); + Self::Operator { operator_lines, body_lines: default() } + } + Err(expression) => { + let expression = Some(expression); + let mut body_lines = Vec::with_capacity(empty_lines.size_hint().0 + new_lines); + body_lines.extend(empty_lines.map(block::Line::from)); + body_lines.push(Line { newline, expression }); + Self::NonOperator { body_lines } + } + } + } + + /// Apply a new line to the state. + pub fn push(&mut self, newline: token::Newline<'s>, expression: Option>) { + match self { + Builder::Indeterminate { empty_lines } => match expression { + Some(expression) => + *self = Self::new_with_expression(empty_lines.drain(..), newline, expression), + None => empty_lines.push(newline), + }, + Builder::NonOperator { body_lines, .. } => + body_lines.push(Line { newline, expression }), + Builder::Operator { body_lines, .. } if !body_lines.is_empty() => { + body_lines.push(Line { newline, expression }); + } + Builder::Operator { operator_lines, body_lines, .. } + if let Some(expression) = expression => { + match to_operator_block_expression(expression) { + Ok(expression) => { + let expression = Some(expression); + operator_lines.push(OperatorLine { newline, expression }); + } + Err(expression) => { + let expression = Some(expression); + body_lines.push(Line { newline, expression }) + }, + } + } + Builder::Operator { operator_lines, .. } => operator_lines.push(newline.into()), + } + } + + /// Produce an AST node from the state. + pub fn build(self) -> Tree<'s> { + match self { + Builder::Indeterminate { empty_lines } => { + let empty_lines = empty_lines.into_iter(); + let lines = empty_lines.map(Line::from).collect(); + Tree::argument_block_application(None, lines) + } + Builder::Operator { operator_lines, body_lines } => + Tree::operator_block_application(None, operator_lines, body_lines), + Builder::NonOperator { body_lines } => + Tree::argument_block_application(None, body_lines), + } + } +} + +impl<'s> Default for Builder<'s> { + fn default() -> Self { + Self::new() + } +} diff --git a/lib/rust/parser/src/syntax/tree/builder/Cargo.toml b/lib/rust/parser/src/syntax/tree/builder/Cargo.toml deleted file mode 100644 index 4d319fdb204..00000000000 --- a/lib/rust/parser/src/syntax/tree/builder/Cargo.toml +++ /dev/null @@ -1,22 +0,0 @@ -[package] -name = "enso-parser-syntax-tree-builder" -version = "0.1.0" -authors = ["Enso Team "] -edition = "2021" -description = "Enso Parser AST Builder." -readme = "README.md" -homepage = "https://github.com/enso-org/enso" -repository = "https://github.com/enso-org/enso" -license-file = "../../LICENSE" - -[lib] -proc-macro = true - -[dependencies] -proc-macro2 = "1.0" -enso-macro-utils = { path = "../../../../../macro-utils" } -quote = "1.0" - -[dependencies.syn] -version = "1.0" -features = ['extra-traits', 'visit', 'full'] diff --git a/lib/rust/parser/src/syntax/tree/builder/src/lib.rs b/lib/rust/parser/src/syntax/tree/builder/src/lib.rs deleted file mode 100644 index 11cb2b541ae..00000000000 --- a/lib/rust/parser/src/syntax/tree/builder/src/lib.rs +++ /dev/null @@ -1,136 +0,0 @@ -//! Definition of a macro allowing building mock AST structures, mostly useful for testing. - -// === Features === -#![feature(proc_macro_span)] -// === Standard Linter Configuration === -#![deny(non_ascii_idents)] -#![warn(unsafe_code)] -// === Non-Standard Linter Configuration === -#![allow(clippy::option_map_unit_fn)] -#![allow(clippy::precedence)] -#![allow(dead_code)] -#![deny(unconditional_recursion)] -#![warn(missing_copy_implementations)] -#![warn(missing_debug_implementations)] -#![warn(missing_docs)] -#![warn(trivial_casts)] -#![warn(trivial_numeric_casts)] -#![warn(unused_import_braces)] -#![warn(unused_qualifications)] - -use proc_macro2::TokenStream; -use quote::quote; -use std::mem; - - - -/// A macro allowing building mock AST structures, mostly useful for testing. -/// -/// Currently supported syntax: -/// -/// - `a b c` Application of arguments. Arguments are applied in-order, from left to right. Here, -/// this expression would be the same as `[[a b] c]`. -/// -/// - `a [b c] d` Grouping syntax that does not produce AST group expression. Here, `b c` is just -/// the first argument passed to `a`. -/// -/// - `{if} a {then} b {else} c` Multi-segment application. All segments should be enclosed in curly -/// braces. You can also place segments in quotes, like `{"("} a {")"}`. -#[proc_macro] -pub fn ast_builder(tokens: proc_macro::TokenStream) -> proc_macro::TokenStream { - let output = expr(tokens, None); - let output = quote!(crate::syntax::Tree::block(vec![#output])); - output.into() -} - - -struct Segment { - header: TokenStream, - body: TokenStream, -} - -impl Segment { - fn new(header: TokenStream) -> Self { - let body = quote!(); - Self { header, body } - } -} - -fn expr(tokens: proc_macro::TokenStream, parent_spacing: Option) -> TokenStream { - use proc_macro::TokenTree::*; - let mut output = quote! {}; - let mut prefix: Option = None; - let mut segments: Vec = vec![]; - let mut current_segment: Option = None; - let mut last_column: Option = None; - let app_to_output = |output: &mut TokenStream, tok| { - if output.is_empty() { - *output = tok; - } else { - *output = quote! {syntax::Tree::app(#output,#tok)}; - } - }; - let mut inherited_spacing = parent_spacing.unwrap_or(0); - for token in tokens { - let spacing = last_column.map(|t| token.span().start().column - t).unwrap_or(0); - let spacing = spacing + inherited_spacing; - inherited_spacing = 0; - last_column = Some(token.span().end().column); - match &token { - // a b c ... - Ident(ident) => { - let ident = ident.to_string(); - let spacing = " ".repeat(spacing); - app_to_output( - &mut output, - quote! {crate::syntax::Tree::ident(crate::syntax::Token(#spacing, #ident, syntax::token::Variant::new_ident_unchecked(#ident)))}, - ); - } - // {if} a {then} b {else} c - // {"("} a {")"} - Group(group) if group.delimiter() == proc_macro::Delimiter::Brace => { - if let Some(mut current_segment) = mem::take(&mut current_segment) { - current_segment.body = mem::take(&mut output); - segments.push(current_segment); - } else if !output.is_empty() { - prefix = Some(mem::take(&mut output)); - } - let ident = group.stream().to_string(); - let spacing = " ".repeat(spacing); - current_segment = Some(Segment::new( - quote! { Token(#spacing, #ident, syntax::token::Variant::new_ident_unchecked(#ident).into())}, - )); // Token::symbol - } - // a [b c] d - Group(group) if group.delimiter() == proc_macro::Delimiter::Bracket => { - app_to_output(&mut output, expr(group.stream(), Some(spacing))); - } - _ => panic!("Unsupported token {:?}", token), - } - } - if let Some(mut current_segment) = current_segment { - current_segment.body = mem::take(&mut output); - segments.push(current_segment); - let segments: Vec = segments - .into_iter() - .map(|t| { - let header = t.header; - let body = t.body; - let body = if !body.is_empty() { - quote!(Some(syntax::Tree::opr_section_boundary(#body))) - } else { - quote!(None) - }; - quote! { syntax::tree::MultiSegmentAppSegment { header: #header, body: #body } } - }) - .collect(); - let pfx = prefix - .map(|t| quote! {Some(Box::new(syntax::Tree::opr_section_boundary(#t)))}) - .unwrap_or_else(|| quote! {None}); - let segments = quote! {NonEmptyVec::try_from(vec![#(#segments),*]).unwrap()}; - output = quote! { - syntax::Tree::multi_segment_app (#pfx, #segments) - } - } - output -} diff --git a/lib/rust/parser/tests/parse.rs b/lib/rust/parser/tests/parse.rs index e46f671ff76..8d166d41766 100644 --- a/lib/rust/parser/tests/parse.rs +++ b/lib/rust/parser/tests/parse.rs @@ -16,6 +16,7 @@ #![warn(unused_qualifications)] use lexpr::sexp; +use lexpr::Value; @@ -23,10 +24,10 @@ use lexpr::sexp; // === Test support macros === // =========================== -/// Parses input as a sequence of S-expressions, and wraps it in a `Block`. +/// Parses input as a sequence of S-expressions, and wraps it in a `BodyBlock`. macro_rules! block { - ( $statements:tt ) => { - sexp![(Block #($statements))] + ( $($statements:tt)* ) => { + sexp![(BodyBlock #( $( $statements )* ) )] } } @@ -36,18 +37,48 @@ macro_rules! block { // === Tests === // ============= +#[test] +fn nothing() { + test("", block![()]); +} + #[test] fn application() { test("a b c", block![(App (App (Ident a) (Ident b)) (Ident c))]); } #[test] -fn type_definition_bool() { - test("type Bool", block![(TypeDef (Ident type) (Ident Bool) #())]); +fn parentheses_simple() { + let expected = block![ + (MultiSegmentApp #(((Symbol "(") (App (Ident a) (Ident b))) ((Symbol ")") ()))) + ]; + test("(a b)", expected); } #[test] -fn type_definition_option() { +fn section_simple() { + let expected_lhs = block![(OprSectionBoundary (OprApp () (Ok "+") (Ident a)))]; + test("+ a", expected_lhs); + let expected_rhs = block![(OprSectionBoundary (OprApp (Ident a) (Ok "+") ()))]; + test("a +", expected_rhs); +} + +#[test] +fn parentheses_nested() { + #[rustfmt::skip] + let expected = block![ + (MultiSegmentApp #( + ((Symbol "(") + (App (MultiSegmentApp #(((Symbol "(") (App (Ident a) (Ident b))) ((Symbol ")") ()))) + (Ident c))) + ((Symbol ")") ()))) + ]; + test("((a b) c)", expected); +} + +#[test] +fn type_definition() { + test("type Bool", block![(TypeDef (Ident type) (Ident Bool) #())]); test("type Option a", block![(TypeDef (Ident type) (Ident Option) #((Ident a)))]); } @@ -75,6 +106,118 @@ fn function_block_simple_args() { test("foo a b c =", block![(Function foo #((Ident a) (Ident b) (Ident c)) "=" ())]); } +#[test] +fn code_block_body() { + let code = ["main =", " 4"]; + test(&code.join("\n"), block![(Function main #() "=" (BodyBlock #((Number 4))))]); + let code = ["main =", " ", " 4"]; + test(&code.join("\n"), block![(Function main #() "=" (BodyBlock #(() (Number 4))))]); + let code = ["main =", " ", " 4"]; + test(&code.join("\n"), block![(Function main #() "=" (BodyBlock #(() (Number 4))))]); + let code = ["main =", " ", " 4"]; + test(&code.join("\n"), block![(Function main #() "=" (BodyBlock #(() (Number 4))))]); + let code = ["main =", "", " 4"]; + test(&code.join("\n"), block![(Function main #() "=" (BodyBlock #(() (Number 4))))]); + + #[rustfmt::skip] + let code = [ + "main =", + " +4", + " print 23", + ]; + #[rustfmt::skip] + let expect = block![ + (Function main #() "=" (BodyBlock #( + (OprSectionBoundary (OprApp () (Ok "+") (Number 4))) + (App (Ident print) (Number 23))))) + ]; + test(&code.join("\n"), expect); +} + +#[test] +fn code_block_operator() { + let code = ["value = nums", " * each random", " + constant"]; + let expect = block![ + (Assignment (Ident value) "=" + (OperatorBlockApplication (Ident nums) + #(((Ok "*") (App (Ident each) (Ident random))) + ((Ok "+") (Ident constant))) + #())) + ]; + test(&code.join("\n"), expect); +} + +#[test] +fn code_block_argument_list() { + #[rustfmt::skip] + let code = [ + "value = foo", + " bar", + ]; + let expect = block![ + (Assignment (Ident value) "=" (ArgumentBlockApplication (Ident foo) #((Ident bar)))) + ]; + test(&code.join("\n"), expect); + + + #[rustfmt::skip] + let code = [ + "value = foo", + " +1", + " bar", + ]; + #[rustfmt::skip] + let expect = block![ + (Assignment (Ident value) "=" + (ArgumentBlockApplication (Ident foo) #( + (OprSectionBoundary (OprApp () (Ok "+") (Number 1))) + (Ident bar)))) + ]; + test(&code.join("\n"), expect); +} + +#[test] +fn code_block_empty() { + // The first line here should parse as a function with no body expression (which is an error). + // No input would parse as an empty `ArgumentBlock` or `OperatorBlock`, because those types are + // distinguished from a body continuation by the presence of non-empty indented lines. + let code = ["foo =", "bar"]; + test(&code.join("\n"), block![(Function foo #() "=" ()) (Ident bar)]); + // This parses similarly to above; a line with no non-whitespace content does not create a code + // block. + let code = ["foo =", " ", "bar"]; + test(&code.join("\n"), block![(Function foo #() "=" ()) () (Ident bar)]); +} + +#[test] +fn code_block_bad_indents1() { + let code = ["main =", " foo", " bar", " baz"]; + let expected = block![ + (Function main #() "=" (BodyBlock #((Ident foo) (Ident bar) (Ident baz)))) + ]; + test(&code.join("\n"), expected); +} + +#[test] +fn code_block_bad_indents2() { + let code = ["main =", " foo", " bar", "baz"]; + let expected = block![ + (Function main #() "=" (BodyBlock #((Ident foo) (Ident bar)))) + (Ident baz) + ]; + test(&code.join("\n"), expected); +} + +#[test] +fn code_block_with_following_statement() { + let code = ["main =", " foo", "bar"]; + let expected = block![ + (Function main #() "=" (BodyBlock #((Ident foo)))) + (Ident bar) + ]; + test(&code.join("\n"), expected); +} + // ==================== @@ -95,11 +238,11 @@ use std::collections::HashSet; /// - Most token types are represented as their contents, rather than as a token struct. For /// example, a `token::Number` may be represented like: `sexp![10]`, and a `token::Ident` may look /// like `sexp![foo]`. -fn test(code: &str, expect: lexpr::Value) { +fn test(code: &str, expect: Value) { let ast = enso_parser::Parser::new().run(code); let ast_s_expr = to_s_expr(&ast, code); - assert_eq!(ast_s_expr.to_string(), expect.to_string()); - assert_eq!(ast.code(), code); + assert_eq!(ast_s_expr.to_string(), expect.to_string(), "{:?}", &ast); + assert_eq!(ast.code(), code, "{:?}", &ast); } @@ -109,40 +252,61 @@ fn test(code: &str, expect: lexpr::Value) { // ===================== /// Produce an S-expression representation of the input AST type. -pub fn to_s_expr(value: &T, code: &str) -> lexpr::Value +pub fn to_s_expr(value: &T, code: &str) -> Value where T: serde::Serialize + Reflect { + use enso_parser::syntax::token; + use enso_parser::syntax::tree; let (graph, rust_to_meta) = enso_metamodel::rust::to_meta(value.reflect_type()); let ast_ty = rust_to_meta[&value.reflect_type().id]; let base = code.as_bytes().as_ptr() as usize; let code: Box = Box::from(code); let mut to_s_expr = ToSExpr::new(&graph); to_s_expr.mapper(ast_ty, strip_hidden_fields); - let ident_token = rust_to_meta[&enso_parser::syntax::token::variant::Ident::reflect().id]; - let operator_token = rust_to_meta[&enso_parser::syntax::token::variant::Operator::reflect().id]; - let number_token = rust_to_meta[&enso_parser::syntax::token::variant::Number::reflect().id]; - let token_to_str = move |token: lexpr::Value| { + let ident_token = rust_to_meta[&token::variant::Ident::reflect().id]; + let operator_token = rust_to_meta[&token::variant::Operator::reflect().id]; + let symbol_token = rust_to_meta[&token::variant::Symbol::reflect().id]; + let number_token = rust_to_meta[&token::variant::Number::reflect().id]; + let newline_token = rust_to_meta[&token::variant::Newline::reflect().id]; + // TODO: Implement `#[reflect(flag = "enso::concrete")]`, which just attaches user data to the + // type info; then filter by flag here instead of hard-coding these simplifications. + let line = rust_to_meta[&tree::block::Line::reflect().id]; + let operator_line = rust_to_meta[&tree::block::OperatorLine::reflect().id]; + let token_to_str = move |token: Value| { let range = token_code_range(&token, base); code[range].to_owned().into_boxed_str() }; let token_to_str_ = token_to_str.clone(); - to_s_expr.mapper(ident_token, move |token| lexpr::Value::symbol(token_to_str_(token))); + to_s_expr.mapper(ident_token, move |token| Value::symbol(token_to_str_(token))); let token_to_str_ = token_to_str.clone(); - to_s_expr.mapper(operator_token, move |token| lexpr::Value::string(token_to_str_(token))); + to_s_expr.mapper(operator_token, move |token| Value::string(token_to_str_(token))); + let token_to_str_ = token_to_str.clone(); + to_s_expr.mapper(symbol_token, move |token| Value::string(token_to_str_(token))); let token_to_str_ = token_to_str; to_s_expr.mapper(number_token, move |token| { - lexpr::Value::Number(token_to_str_(token).parse::().unwrap().into()) + Value::Number(token_to_str_(token).parse::().unwrap().into()) }); + let into_car = |cons| match cons { + Value::Cons(cons) => cons.into_pair().0, + _ => panic!(), + }; + to_s_expr.mapper(line, into_car); + to_s_expr.mapper(operator_line, into_car); + to_s_expr.skip(newline_token); tuplify(to_s_expr.value(ast_ty, &value)) } /// Strip certain fields that should be excluded from output. -fn strip_hidden_fields(tree: lexpr::Value) -> lexpr::Value { - let hidden_tree_fields = - [":spanLeftOffsetVisible", ":spanLeftOffsetCodeRepr", ":spanCodeLength"]; +fn strip_hidden_fields(tree: Value) -> Value { + let hidden_tree_fields = [ + ":spanLeftOffsetVisible", + ":spanLeftOffsetCodeReprBegin", + ":spanLeftOffsetCodeReprLen", + ":spanCodeLength", + ]; let hidden_tree_fields: HashSet<_> = hidden_tree_fields.into_iter().collect(); - lexpr::Value::list(tree.to_vec().unwrap().into_iter().filter(|val| match val { - lexpr::Value::Cons(cons) => match cons.car() { - lexpr::Value::Symbol(symbol) => !hidden_tree_fields.contains(symbol.as_ref()), + Value::list(tree.to_vec().unwrap().into_iter().filter(|val| match val { + Value::Cons(cons) => match cons.car() { + Value::Symbol(symbol) => !hidden_tree_fields.contains(symbol.as_ref()), _ => panic!(), }, _ => true, @@ -151,30 +315,23 @@ fn strip_hidden_fields(tree: lexpr::Value) -> lexpr::Value { /// Given an S-expression representation of a [`Token`] and the base address for `Code` `Cow`s, /// return the range of the input code the token references. -fn token_code_range(token: &lexpr::Value, base: usize) -> std::ops::Range { - let code_repr = fields(token).find(|(name, _)| *name == ":codeRepr").unwrap().1; - let mut begin = None; - let mut len = None; - for (name, value) in fields(code_repr) { - match name { - ":begin" => begin = Some(value.as_u64().unwrap() as u32), - ":len" => len = Some(value.as_u64().unwrap() as u32), - _ => (), - } - } - let begin = begin.unwrap(); +fn token_code_range(token: &Value, base: usize) -> std::ops::Range { + let get_u32 = + |field| fields(token).find(|(name, _)| *name == field).unwrap().1.as_u64().unwrap() as u32; + let begin = get_u32(":codeReprBegin"); + let len = get_u32(":codeReprLen"); let begin = (begin as u64) | (base as u64 & !0xFFFF_FFFF); let begin = if begin < (base as u64) { begin + 0x1_0000_0000 } else { begin }; let begin = begin as usize - base; - let len = len.unwrap() as usize; + let len = len as usize; begin..(begin + len) } /// Iterate the field `(name, value)` pairs of the S-expression of a struct with named fields. -fn fields(value: &'_ lexpr::Value) -> impl Iterator { +fn fields(value: &'_ Value) -> impl Iterator { value.list_iter().unwrap().filter_map(|value| match value { - lexpr::Value::Cons(cons) => match cons.car() { - lexpr::Value::Symbol(symbol) => Some((&symbol[..], cons.cdr())), + Value::Cons(cons) => match cons.car() { + Value::Symbol(symbol) => Some((&symbol[..], cons.cdr())), _ => None, }, _ => None, @@ -183,24 +340,24 @@ fn fields(value: &'_ lexpr::Value) -> impl Iterator lexpr::Value { +fn tuplify(value: Value) -> Value { let (car, cdr) = match value { - lexpr::Value::Cons(cons) => cons.into_pair(), - lexpr::Value::Vector(mut vector) => { + Value::Cons(cons) => cons.into_pair(), + Value::Vector(mut vector) => { for value in vector.iter_mut() { - let original = std::mem::replace(value, lexpr::Value::Nil); + let original = std::mem::replace(value, Value::Nil); *value = tuplify(original); } - return lexpr::Value::Vector(vector); + return Value::Vector(vector); } value => return value, }; - if let lexpr::Value::Symbol(symbol) = &car { + if let Value::Symbol(symbol) = &car { if let Some(':') = symbol.chars().next() { return tuplify(cdr); } } let car = tuplify(car); let cdr = tuplify(cdr); - lexpr::Value::Cons(lexpr::Cons::new(car, cdr)) + Value::Cons(lexpr::Cons::new(car, cdr)) } diff --git a/lib/rust/prelude/src/data/non_empty_vec.rs b/lib/rust/prelude/src/data/non_empty_vec.rs index c24ca0eae6f..54692809f07 100644 --- a/lib/rust/prelude/src/data/non_empty_vec.rs +++ b/lib/rust/prelude/src/data/non_empty_vec.rs @@ -33,7 +33,8 @@ impl NonEmptyVec { /// let mut vec: NonEmptyVec = NonEmptyVec::new(0, vec![]); /// ``` pub fn new(first: T, rest: Vec) -> NonEmptyVec { - let mut elems = vec![first]; + let mut elems = Vec::with_capacity(1 + rest.len()); + elems.push(first); elems.extend(rest); NonEmptyVec { elems } } diff --git a/lib/rust/prelude/src/vec.rs b/lib/rust/prelude/src/vec.rs index 4efdff21814..d0234c8c10c 100644 --- a/lib/rust/prelude/src/vec.rs +++ b/lib/rust/prelude/src/vec.rs @@ -1,5 +1,6 @@ //! This module defines utilities for working with the [`std::vec::Vec`] type. +use derivative::Derivative; use failure::_core::hint::unreachable_unchecked; @@ -84,6 +85,84 @@ pub trait VecOps: AsMut> + Sized { impl VecOps for Vec {} + + +// ===================== +// === VecAllocation === +// ===================== + +/// Owns a storage allocation for a [`std::vec::Vec`], but no elements. +/// +/// # Usage +/// +/// This data structure implements an optimization when creating temporary vectors. The use case +/// occurs when: +/// - Within some scope, a `Vec` is created, added to, and discarded. +/// - The scope may be entered multiple times. +/// +/// The optimization is to reuse an allocation between entries to the scope. This is sometimes done +/// by storing and reusing the `Vec`, but that pattern is misleading; owning a `Vec` suggests that +/// values may be retained between entries to the scope. This type explicitly has only one logical +/// state (empty). +/// +/// ``` +/// # use enso_prelude::*; +/// #[derive(Default)] +/// struct NumberAdder { +/// // In a more complex struct it would be important to be able to tell what state the object +/// // retains from its fields. +/// temporary_nums: VecAllocation, +/// } +/// +/// impl NumberAdder { +/// /// Add some numbers, with better precision than simply adding `f32` values in a loop. +/// /// (For the sake of example, ignore that this is not a fast or accurate approach.) +/// /// +/// /// Because we reuse an allocation, if this method is called repeatedly it will only have to +/// /// allocate enough space to accommodate the largest single input it processes. Thus, rather +/// /// than performing a number of reallocations that scales linearly in the number of batches +/// /// of input (assuming batch size has some constant geometric mean), it performs a number of +/// /// allocations that scales with the log of the size of the largest batch; the worst case of +/// /// this implementation has the same performance as the best case of an implementation that +/// /// doesn't reuse its allocation. +/// pub fn add_nums(&mut self, inputs: impl IntoIterator) -> f32 { +/// let mut extended_precision = self.temporary_nums.take(); +/// extended_precision.extend(inputs.into_iter().map(f64::from)); +/// let result = extended_precision.drain(..).fold(0.0, f64::add); +/// self.temporary_nums.set_from(extended_precision); +/// result as f32 +/// } +/// } +/// ``` +#[derive(Clone, Debug, Derivative, Eq, PartialEq)] +#[derivative(Default(bound = ""))] +pub struct VecAllocation { + data: Vec, +} + +impl VecAllocation { + /// Create a new, empty allocation. + pub fn new() -> Self { + Self::default() + } + + /// Drop any elements from the given `Vec`, keeping its allocated memory. It can be retrieved + /// later with `take`. + pub fn set_from(&mut self, mut data: Vec) { + data.clear(); + self.data = data; + } + + /// Return a `Vec` containing no elements, whose allocated storage comes from the most recent + /// call to `set_from`, unless `take` has been called since then. Any subsequent call before the + /// next `set_from` would return a newly-created `Vec` with no allocated memory. + pub fn take(&mut self) -> Vec { + std::mem::take(&mut self.data) + } +} + + + // ============= // === Tests === // =============