From 2edd2bd7ff6a58ed3a9cf77e3ddbb38e1adfc8cb Mon Sep 17 00:00:00 2001 From: Kaz Wesley Date: Thu, 19 Oct 2023 05:36:42 -0700 Subject: [PATCH] Ensure all spans have document offsets (#8039) - Validate spans during existing lexer and parser unit tests, and in `enso_parser_debug`. - Fix lost span info causing failures of updated tests. # Important Notes - [x] Output of `parse_all_enso_files.sh` is unchanged since before #7881 (modulo libs changes since then). - When the parser encounters an input with the first line indented, it now creates a sub-block for lines at than indent level, and emits a syntax error (every indented block must have a parent). - When the parser encounters a number with a base but no digits (e.g. `0x`), it now emits a `Number` with `None` in the digits field rather than a 0-length digits token. --- .../src/util/ast/__snapshots__/index.ts.snap | 359 ++++++++++++++++++ app/gui2/src/util/ast/index.ts | 13 +- app/gui2/src/util/ast/opr.ts | 13 +- .../Standard/Base/0.0.0-dev/src/Data/XML.enso | 10 +- .../enso/interpreter/test/JsInteropTest.java | 4 +- .../NestedPatternCompilationBenchmarks.java | 2 +- lib/rust/parser/debug/src/lib.rs | 44 +++ lib/rust/parser/debug/src/main.rs | 2 + lib/rust/parser/debug/tests/metadata/mod.rs | 14 +- lib/rust/parser/debug/tests/parse.rs | 162 +++++--- lib/rust/parser/src/lexer.rs | 208 ++++++---- lib/rust/parser/src/lib.rs | 21 +- lib/rust/parser/src/macros/built_in.rs | 36 +- lib/rust/parser/src/macros/resolver.rs | 36 +- lib/rust/parser/src/source/code.rs | 31 +- lib/rust/parser/src/source/span.rs | 19 +- lib/rust/parser/src/syntax/operator.rs | 4 +- lib/rust/parser/src/syntax/token.rs | 14 +- lib/rust/parser/src/syntax/tree.rs | 24 +- lib/rust/parser/src/syntax/tree/block.rs | 34 +- 20 files changed, 799 insertions(+), 251 deletions(-) diff --git a/app/gui2/src/util/ast/__snapshots__/index.ts.snap b/app/gui2/src/util/ast/__snapshots__/index.ts.snap index c530417614..c5a6e69892 100644 --- a/app/gui2/src/util/ast/__snapshots__/index.ts.snap +++ b/app/gui2/src/util/ast/__snapshots__/index.ts.snap @@ -71,6 +71,127 @@ exports[`Parsing ' foo bar } `; +exports[`Parsing '2 + + 3 + + 4' 1`] = ` +{ + "childrenLengthInCodeParsed": 11, + "statements": [ + { + "expression": { + "childrenLengthInCodeParsed": 11, + "excess": [], + "expressions": [ + { + "expression": { + "expression": { + "base": undefined, + "childrenLengthInCodeParsed": 1, + "fractionalDigits": undefined, + "integer": { + "base": undefined, + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 5, + "type": "Digits", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 5, + }, + "type": "Number", + "whitespaceLengthInCodeParsed": 1, + "whitespaceStartInCodeParsed": 4, + }, + "operator": { + "ok": true, + "value": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 3, + "type": "Operator", + "whitespaceLengthInCodeBuffer": 1, + "whitespaceStartInCodeBuffer": 2, + }, + }, + }, + "newline": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 1, + "type": "Newline", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 1, + }, + }, + { + "expression": { + "expression": { + "base": undefined, + "childrenLengthInCodeParsed": 1, + "fractionalDigits": undefined, + "integer": { + "base": undefined, + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 10, + "type": "Digits", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 10, + }, + "type": "Number", + "whitespaceLengthInCodeParsed": 1, + "whitespaceStartInCodeParsed": 9, + }, + "operator": { + 
"ok": true, + "value": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 8, + "type": "Operator", + "whitespaceLengthInCodeBuffer": 1, + "whitespaceStartInCodeBuffer": 7, + }, + }, + }, + "newline": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 6, + "type": "Newline", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 6, + }, + }, + ], + "lhs": { + "base": undefined, + "childrenLengthInCodeParsed": 1, + "fractionalDigits": undefined, + "integer": { + "base": undefined, + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 0, + "type": "Digits", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 0, + }, + "type": "Number", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, + }, + "type": "OperatorBlockApplication", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, + }, + "newline": { + "lengthInCodeBuffer": 0, + "startInCodeBuffer": 0, + "type": "Newline", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 0, + }, + }, + ], + "type": "BodyBlock", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, +} +`; + exports[`Parsing 'Data.read 2 + 2' 1`] = ` { @@ -199,6 +320,173 @@ exports[`Parsing 'Data.read } `; +exports[`Parsing 'Data.read "File" +2 + 3' 1`] = ` +{ + "childrenLengthInCodeParsed": 22, + "statements": [ + { + "expression": { + "arg": { + "childrenLengthInCodeParsed": 6, + "close": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 15, + "type": "TextEnd", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 15, + }, + "elements": [ + { + "text": { + "lengthInCodeBuffer": 4, + "startInCodeBuffer": 11, + "type": "TextSection", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 11, + }, + "type": "Section", + }, + ], + "newline": undefined, + "open": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 10, + "type": "TextStart", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 10, + }, + "type": "TextLiteral", + "whitespaceLengthInCodeParsed": 1, + "whitespaceStartInCodeParsed": 9, + }, + "childrenLengthInCodeParsed": 16, + "func": { + "childrenLengthInCodeParsed": 9, + "lhs": { + "childrenLengthInCodeParsed": 4, + "token": { + "isFree": false, + "isOperatorLexically": false, + "isTypeOrConstructor": true, + "lengthInCodeBuffer": 4, + "liftLevel": 0, + "startInCodeBuffer": 0, + "type": "Ident", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 0, + }, + "type": "Ident", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, + }, + "opr": { + "ok": true, + "value": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 4, + "type": "Operator", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 4, + }, + }, + "rhs": { + "childrenLengthInCodeParsed": 4, + "token": { + "isFree": false, + "isOperatorLexically": false, + "isTypeOrConstructor": false, + "lengthInCodeBuffer": 4, + "liftLevel": 0, + "startInCodeBuffer": 5, + "type": "Ident", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 5, + }, + "type": "Ident", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 5, + }, + "type": "OprApp", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, + }, + "type": "App", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, + }, + "newline": { + "lengthInCodeBuffer": 0, + "startInCodeBuffer": 0, + "type": "Newline", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 0, 
+ }, + }, + { + "expression": { + "childrenLengthInCodeParsed": 5, + "lhs": { + "base": undefined, + "childrenLengthInCodeParsed": 1, + "fractionalDigits": undefined, + "integer": { + "base": undefined, + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 17, + "type": "Digits", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 17, + }, + "type": "Number", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 17, + }, + "opr": { + "ok": true, + "value": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 19, + "type": "Operator", + "whitespaceLengthInCodeBuffer": 1, + "whitespaceStartInCodeBuffer": 18, + }, + }, + "rhs": { + "base": undefined, + "childrenLengthInCodeParsed": 1, + "fractionalDigits": undefined, + "integer": { + "base": undefined, + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 21, + "type": "Digits", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 21, + }, + "type": "Number", + "whitespaceLengthInCodeParsed": 1, + "whitespaceStartInCodeParsed": 20, + }, + "type": "OprApp", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 17, + }, + "newline": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 16, + "type": "Newline", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 16, + }, + }, + ], + "type": "BodyBlock", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, +} +`; + exports[`Parsing 'Data.read File 2 + 3' 1`] = ` { @@ -350,6 +638,77 @@ exports[`Parsing 'Data.read File } `; +exports[`Parsing 'foo bar +' 1`] = ` +{ + "childrenLengthInCodeParsed": 8, + "statements": [ + { + "expression": { + "arg": { + "childrenLengthInCodeParsed": 3, + "token": { + "isFree": false, + "isOperatorLexically": false, + "isTypeOrConstructor": false, + "lengthInCodeBuffer": 3, + "liftLevel": 0, + "startInCodeBuffer": 4, + "type": "Ident", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 4, + }, + "type": "Ident", + "whitespaceLengthInCodeParsed": 1, + "whitespaceStartInCodeParsed": 3, + }, + "childrenLengthInCodeParsed": 7, + "func": { + "childrenLengthInCodeParsed": 3, + "token": { + "isFree": false, + "isOperatorLexically": false, + "isTypeOrConstructor": false, + "lengthInCodeBuffer": 3, + "liftLevel": 0, + "startInCodeBuffer": 0, + "type": "Ident", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 0, + }, + "type": "Ident", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, + }, + "type": "App", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, + }, + "newline": { + "lengthInCodeBuffer": 0, + "startInCodeBuffer": 0, + "type": "Newline", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 0, + }, + }, + { + "expression": undefined, + "newline": { + "lengthInCodeBuffer": 1, + "startInCodeBuffer": 7, + "type": "Newline", + "whitespaceLengthInCodeBuffer": 0, + "whitespaceStartInCodeBuffer": 7, + }, + }, + ], + "type": "BodyBlock", + "whitespaceLengthInCodeParsed": 0, + "whitespaceStartInCodeParsed": 0, +} +`; + exports[`Parsing 'foo bar=baz' 1`] = ` { "childrenLengthInCodeParsed": 11, diff --git a/app/gui2/src/util/ast/index.ts b/app/gui2/src/util/ast/index.ts index 1ff7bf1134..c33d6e8b9d 100644 --- a/app/gui2/src/util/ast/index.ts +++ b/app/gui2/src/util/ast/index.ts @@ -18,10 +18,10 @@ export function parseEnso(code: string): Tree { export function parseEnsoLine(code: string): Tree { const block = parseEnso(code) assert(block.type === Tree.Type.BodyBlock) - const statemets = 
block.statements[Symbol.iterator]() - const firstLine = statemets.next() + const statements = block.statements[Symbol.iterator]() + const firstLine = statements.next() assert(!firstLine.done) - assert(!!statemets.next().done) + assert(!!statements.next().done) assert(firstLine.value.expression != null) return firstLine.value.expression } @@ -95,14 +95,13 @@ function treePath(obj: LazyObject, pred: (node: Tree) => boolean): Tree[] { if (import.meta.vitest) { const { test, expect } = import.meta.vitest - // Not working cases commented. const parseCases = [ - ' foo bar\n', + 'foo bar\n', 'Data.read\n2 + 2', 'Data.read File\n2 + 3', - // 'Data.read "File"\n2 + 3', + 'Data.read "File"\n2 + 3', 'foo bar=baz', - // '2\n + 3\n + 4', + '2\n + 3\n + 4', ] test.each(parseCases)("Parsing '%s'", (code) => { diff --git a/app/gui2/src/util/ast/opr.ts b/app/gui2/src/util/ast/opr.ts index 9ac9622b55..35d62b9a2d 100644 --- a/app/gui2/src/util/ast/opr.ts +++ b/app/gui2/src/util/ast/opr.ts @@ -3,7 +3,7 @@ import { assert } from '@/util/assert' import { parseEnsoLine, readAstSpan, readTokenSpan } from '@/util/ast' import type { Result } from '@/util/result' -/** An operand of one of the applications inside `GenralOprApp` */ +/** An operand of one of the applications inside `GeneralOprApp` */ export type GeneralOperand = | Operand // A part of `GeneralOprApp`, consisting of lhs and first `statements` of applications. @@ -66,7 +66,7 @@ export class GeneralOprApp { expectedOpr = oprCode } if (matchingOprs === this.apps.length) { - // If all operatros matched, the lhs may be a continuation of this chain. + // If all operators matched, the lhs may be a continuation of this chain. if (this.lhs != null) yield* operandsOfLeftAssocOprChain(this.lhs, code, expectedOpr) else yield null } else { @@ -203,15 +203,14 @@ if (import.meta.vitest) { { code: '2\n * 3\n + 44', result: [ - { type: 'partOfOprBlockApp', repr: '2\n * 3\n + 4', statemets: 1 }, + { type: 'partOfOprBlockApp', repr: '2\n * 3\n + 44', statements: 1 }, { type: 'ast', repr: '44' }, ], }, - // There is a bug in AST spans in some OperatorBlockApplications. Fix this test once fixed { code: '2\n + 3\n * 4\n + 55', result: [ - { type: 'partOfOprBlockApp', repr: '2\n + 3\n * 4\n + 5', statemets: 2 }, + { type: 'partOfOprBlockApp', repr: '2\n + 3\n * 4\n + 55', statements: 2 }, { type: 'ast', repr: '55' }, ], }, @@ -241,7 +240,7 @@ if (import.meta.vitest) { }: { code: string opr?: string - result: { type: string; repr: string; statemets?: number }[] + result: { type: string; repr: string; statements?: number }[] }) => { const ast = parseEnsoLine(code) const actual = operandsOfLeftAssocOprChain(ast, code, opr) @@ -258,7 +257,7 @@ if (import.meta.vitest) { } else { assert(actual?.type == 'partOfOprBlockApp') expect(readAstSpan(actual.ast, code)).toStrictEqual(expected?.repr) - expect(actual.statements).toStrictEqual(expected?.statemets) + expect(actual.statements).toStrictEqual(expected?.statements) } } } diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/XML.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/XML.enso index 8bd8be3662..7b9b93c9e9 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/XML.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/XML.enso @@ -339,14 +339,14 @@ type XML_Element Value (java_element:Element) (~children_cache:(Vector (XML_Element | Text))) type XML_Error - # An error that indicates that the XML data could not be parsed. + ## An error that indicates that the XML data could not be parsed. 
- Arguments: - - line_number: the line on which the parse failed. - - column_number: the column at which the parse failed. + Arguments: + - line_number: the line on which the parse failed. + - column_number: the column at which the parse failed. Parse_Error (line_number : Integer) (column_number : Integer) - # Any other XML-related Java exception. + ## Any other XML-related Java exception. Other (error : Text) ## PRIVATE diff --git a/engine/runtime-with-polyglot/src/test/java/org/enso/interpreter/test/JsInteropTest.java b/engine/runtime-with-polyglot/src/test/java/org/enso/interpreter/test/JsInteropTest.java index 1cb494b046..ce7be024e1 100644 --- a/engine/runtime-with-polyglot/src/test/java/org/enso/interpreter/test/JsInteropTest.java +++ b/engine/runtime-with-polyglot/src/test/java/org/enso/interpreter/test/JsInteropTest.java @@ -29,7 +29,7 @@ public class JsInteropTest extends TestBase { public void testDefaultJSPrint() { var src = """ from Standard.Base import Json - + main = json = Json.parse <| ''' { @@ -38,7 +38,7 @@ public class JsInteropTest extends TestBase { } } json.get "inner" - """; + """; Value res = evalModule(ctx, src); assertEquals("{\"a\":1}", res.toString()); } diff --git a/engine/runtime/src/bench/java/org/enso/interpreter/bench/benchmarks/semantic/NestedPatternCompilationBenchmarks.java b/engine/runtime/src/bench/java/org/enso/interpreter/bench/benchmarks/semantic/NestedPatternCompilationBenchmarks.java index ee5e914cb1..36a7ddf484 100644 --- a/engine/runtime/src/bench/java/org/enso/interpreter/bench/benchmarks/semantic/NestedPatternCompilationBenchmarks.java +++ b/engine/runtime/src/bench/java/org/enso/interpreter/bench/benchmarks/semantic/NestedPatternCompilationBenchmarks.java @@ -65,7 +65,7 @@ public class NestedPatternCompilationBenchmarks { list_of_6 = List.Cons 1 (List.Cons 2 (List.Cons 3 (List.Cons 4 (List.Cons 5 (List.Cons 6 List.Nil))))) - """; + """; } @Benchmark diff --git a/lib/rust/parser/debug/src/lib.rs b/lib/rust/parser/debug/src/lib.rs index 98c0ec59dd..b786a97418 100644 --- a/lib/rust/parser/debug/src/lib.rs +++ b/lib/rust/parser/debug/src/lib.rs @@ -185,3 +185,47 @@ fn tuplify(value: Value) -> Value { let cdr = tuplify(cdr); Value::Cons(lexpr::Cons::new(car, cdr)) } + + + +// ======================== +// === Span consistency === +// ======================== + +/// Check the internal consistency of the `Tree` and `Token` spans from the given root, and validate +/// that every character in the given range is covered exactly once in the token spans. 
+pub fn validate_spans(tree: &enso_parser::syntax::tree::Tree, expected_span: std::ops::Range) { + let mut sum_span = None; + fn concat( + a: &Option>, + b: &std::ops::Range, + ) -> std::ops::Range { + match a { + Some(a) => { + assert_eq!(a.end, b.start); + a.start..b.end + } + None => b.clone(), + } + } + sum_span = Some(concat(&sum_span, &tree.span.left_offset.code.range_utf16())); + tree.visit_items(|item| match item { + enso_parser::syntax::item::Ref::Token(token) => { + if !(token.left_offset.is_empty() && token.code.is_empty()) { + sum_span = Some(concat(&sum_span, &token.left_offset.code.range_utf16())); + sum_span = Some(concat(&sum_span, &token.code.range_utf16())); + } + } + enso_parser::syntax::item::Ref::Tree(tree) => { + let children_span = + concat(&Some(tree.span.left_offset.code.range_utf16()), &tree.span.range_utf16()); + validate_spans(tree, children_span.clone()); + sum_span = Some(concat(&sum_span, &children_span)); + } + }); + if expected_span.is_empty() { + assert!(sum_span.map_or(true, |range| range.is_empty())); + } else { + assert_eq!(sum_span.unwrap(), expected_span); + } +} diff --git a/lib/rust/parser/debug/src/main.rs b/lib/rust/parser/debug/src/main.rs index 6828323afd..519ca7c754 100644 --- a/lib/rust/parser/debug/src/main.rs +++ b/lib/rust/parser/debug/src/main.rs @@ -40,6 +40,8 @@ fn check_file(path: &str, mut code: &str) { code = code_; } let ast = enso_parser::Parser::new().run(code); + let expected_span = 0..(code.encode_utf16().count() as u32); + enso_parser_debug::validate_spans(&ast, expected_span); for (parsed, original) in ast.code().lines().zip(code.lines()) { assert_eq!(parsed, original, "Bug: dropped tokens, while parsing: {path}"); } diff --git a/lib/rust/parser/debug/tests/metadata/mod.rs b/lib/rust/parser/debug/tests/metadata/mod.rs index 4a8416f888..76183c66fb 100644 --- a/lib/rust/parser/debug/tests/metadata/mod.rs +++ b/lib/rust/parser/debug/tests/metadata/mod.rs @@ -10,7 +10,7 @@ parse_json1 = Json.parse 3 main = ## The file contains three different sheets relating to operations of an -online store. + online store. operator2 = Enso_Project.data / 3 ## Read the customers table. operator3 = operator2.read_xlsx 3 @@ -19,10 +19,10 @@ operator4 = operator2.read_xlsx 3 ## Read the orders history. operator5 = operator2.read_xlsx 3 ## Index the items table on `Item ID`. This will allow this data to be -joined to other tables that also contain Item IDs. + joined to other tables that also contain Item IDs. operator7 = operator4.set_index 3 ## Join the item data to the order history, to get information on item -prices in the orders table. + prices in the orders table. operator8 = operator5.join operator7 3 operator1 = operator8.at 3 operator9 = operator8.at 3 @@ -30,20 +30,20 @@ operator9 = operator8.at 3 product1 = operator1 * operator9 operator10 = operator8.set 3 product1 ## Group all orders by the Customer ID, to compute the total value of orders -placed by each client. + placed by each client. operator11 = operator10.group by=3 operator12 = operator11.at 3 ## Compute the lifetime value of each client. operator13 = operator12.sum operator14 = operator13.rename 3 ## Index the customers table by Customer ID. This will allow this table -to be joined to other tables that also contain Customer IDs. + to be joined to other tables that also contain Customer IDs. operator15 = operator3.set_index 3 ## Join the customer data into orders table, to include names in the final -ranking. + ranking. 
operator16 = operator14.join operator15 ## Sort the customers by their lifetime value, with the most valuable -customers at the start of the table. + customers at the start of the table. operator17 = operator16.sort by=3 order=Sort_Order.Descending diff --git a/lib/rust/parser/debug/tests/parse.rs b/lib/rust/parser/debug/tests/parse.rs index f514aa8bb3..71ef1b4fab 100644 --- a/lib/rust/parser/debug/tests/parse.rs +++ b/lib/rust/parser/debug/tests/parse.rs @@ -135,22 +135,17 @@ fn doc_comments() { (Function (Ident id) #((() (Ident x) () ())) "=" (Ident x)))]); #[rustfmt::skip] let lines = vec![ - " ## Test indent handling", - " foo", - ]; - #[rustfmt::skip] - test!(&lines.join("\n"), (Documented (#((Section " Test indent handling")) #(())) (Ident foo))); - #[rustfmt::skip] - let lines = vec![ + "type Foo", " ## Test indent handling", " ", " foo", ]; #[rustfmt::skip] test!(&lines.join("\n"), - (Documented - (#((Section " Test indent handling")) #(() ())) - (Ident foo))); + (TypeDef type Foo #() #( + (Documented + (#((Section " Test indent handling")) #(() ())) + (Ident foo))))); } @@ -329,7 +324,7 @@ fn assignment_simple() { #[test] fn function_inline_simple_args() { - test(" foo a = x", block![(Function (Ident foo) #((() (Ident a) () ())) "=" (Ident x))]); + test("foo a = x", block![(Function (Ident foo) #((() (Ident a) () ())) "=" (Ident x))]); #[rustfmt::skip] test("foo a b = x", block![(Function (Ident foo) #((() (Ident a) () ()) (() (Ident b) () ())) "=" (Ident x))]); @@ -340,7 +335,7 @@ fn function_inline_simple_args() { #((() (Ident a) () ()) (() (Ident b) () ()) (() (Ident c) () ())) "=" (Ident x))], ); - test(" foo _ = x", block![(Function (Ident foo) #((() (Wildcard -1) () ())) "=" (Ident x))]); + test("foo _ = x", block![(Function (Ident foo) #((() (Wildcard -1) () ())) "=" (Ident x))]); } #[test] @@ -578,6 +573,11 @@ fn operator_section_in_operator_block() { test(&code.join("\n"), expected); } +#[test] +fn first_line_indented() { + expect_invalid_node(" a"); +} + // === Binary Operators === @@ -710,24 +710,21 @@ fn unary_operator_at_end_of_expression() { #[test] fn unspaced_operator_sequence() { - let cases = [ - // Add a negated value. - ("x = y+-z", block![ - (Assignment (Ident x) "=" (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" (Ident z))))]), - // Create an operator section that adds a negated value to its input. - ("x = +-z", block![ - (Assignment (Ident x) "=" (OprSectionBoundary 1 - (OprApp () (Ok "+") (UnaryOprApp "-" (Ident z)))))]), - // Create an operator section that adds its input, negated, to a value. - ("x = y+-", block![ - (Assignment (Ident x) "=" (OprSectionBoundary 1 - (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" ()))))]), - // Assign a negative number to x. - ("x=-1", block![(Assignment (Ident x) "=" (UnaryOprApp "-" (Number () "1" ())))]), - // Assign a negated value to x. - ("x=-y", block![(Assignment (Ident x) "=" (UnaryOprApp "-" (Ident y)))]), - ]; - cases.into_iter().for_each(|(code, expected)| test(code, expected)); + // Add a negated value. + test!("x = y+-z", + (Assignment (Ident x) "=" (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" (Ident z))))); + // Create an operator section that adds a negated value to its input. + test!("x = +-z", + (Assignment (Ident x) "=" (OprSectionBoundary 1 + (OprApp () (Ok "+") (UnaryOprApp "-" (Ident z)))))); + // Create an operator section that adds its input, negated, to a value. 
+ test!("x = y+-", + (Assignment (Ident x) "=" (OprSectionBoundary 1 + (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" ()))))); + // Assign a negative number to x. + test!("x=-1", (Assignment (Ident x) "=" (UnaryOprApp "-" (Number () "1" ())))); + // Assign a negated value to x. + test!("x=-y", (Assignment (Ident x) "=" (UnaryOprApp "-" (Ident y)))); } #[test] @@ -891,7 +888,7 @@ fn metadata_raw() { fn metadata_parsing() { let code = metadata::ORDERS_WITH_METADATA; let (meta, code) = enso_parser::metadata::parse(code).unwrap(); - let _ast = enso_parser::Parser::new().run(code); + let _ast = parse(code); let _meta: enso_parser::metadata::Metadata = meta.unwrap(); } @@ -989,8 +986,7 @@ x"#; (Ident x) ]; test(code, expected); - - let code = " x = \"\"\"\n Indented multiline\n x"; + let code = "x = \"\"\"\n Indented multiline\nx"; #[rustfmt::skip] let expected = block![ (Assignment (Ident x) "=" (TextLiteral #((Section "Indented multiline")))) @@ -1153,6 +1149,27 @@ fn case_expression() { test(&code.join("\n"), expected); } +#[test] +fn case_documentation() { + #[rustfmt::skip] + let code = [ + "case a of", + " ## The Some case", + " Some -> x", + " ## The Int case", + " Int -> x", + ]; + #[rustfmt::skip] + let expected = block![ + (CaseOf (Ident a) #( + (((#((Section " The Some case")) #()) () () ())) + ((() (Ident Some) "->" (Ident x))) + (((#((Section " The Int case")) #()) () () ())) + ((() (Ident Int) "->" (Ident x))))) + ]; + test(&code.join("\n"), expected); +} + #[test] fn case_by_type() { macro_rules! test_case { @@ -1247,34 +1264,50 @@ fn tuple_literals() { // === Numeric literals === -#[test] -fn numbers() { - test!("1 . 0", (OprApp (Number () "1" ()) (Ok ".") (Number () "0" ()))); - test!("1 .0", - (App (Number () "1" ()) (OprSectionBoundary 1 (OprApp () (Ok ".") (Number () "0" ()))))); - test!("1. 0", - (OprSectionBoundary 1 (App (OprApp (Number () "1" ()) (Ok ".") ()) (Number () "0" ())))); - test!("0b10101010", (Number "0b" "10101010" ())); - test!("0o122137", (Number "0o" "122137" ())); - test!("0xAE2F14", (Number "0x" "AE2F14" ())); - test!("pi = 3.14", (Assignment (Ident pi) "=" (Number () "3" ("." "14")))); - test!("0.0.x", (OprApp (Number () "0" ("." "0")) (Ok ".") (Ident x))); -} +#[cfg(test)] +mod numbers { + use super::*; -#[test] -// This syntax cannot be used until we remove old-nondecimal number support, which is -// needed for compatibility until the old parser is fully replaced. -#[ignore] -fn new_delimited_numbers() { - test!("100_000", (Number () "100_000" ())); - test!("10_000.99", (Number () "10_000" ("." "99"))); -} + #[test] + fn with_decimal() { + test!("1 . 0", (OprApp (Number () "1" ()) (Ok ".") (Number () "0" ()))); + test!("1 .0", + (App (Number () "1" ()) (OprSectionBoundary 1 (OprApp () (Ok ".") (Number () "0" ()))))); + test!("1. 0", + (OprSectionBoundary 1 (App (OprApp (Number () "1" ()) (Ok ".") ()) (Number () "0" ())))); + test!("pi = 3.14", (Assignment (Ident pi) "=" (Number () "3" ("." "14")))); + test!("0.0.x", (OprApp (Number () "0" ("." 
"0")) (Ok ".") (Ident x))); + } -#[test] -fn old_nondecimal_numbers() { - test!("2_01101101", (Number "2_" "01101101" ())); - test!("-2_01101101", (UnaryOprApp "-" (Number "2_" "01101101" ()))); - test!("16_17ffffffffffffffa", (Number "16_" "17ffffffffffffffa" ())); + #[test] + fn with_base() { + test!("0b10101010", (Number "0b" "10101010" ())); + test!("0o122137", (Number "0o" "122137" ())); + test!("0xAE2F14", (Number "0x" "AE2F14" ())); + } + + #[test] + fn base_only() { + test!("0x", (Number "0x" () ())); + test!("0b", (Number "0b" () ())); + test!("0o", (Number "0o" () ())); + } + + #[test] + // This syntax cannot be used until we remove old-nondecimal number support, which is + // needed for compatibility until the old parser is fully replaced. + #[ignore] + fn new_delimited() { + test!("100_000", (Number () "100_000" ())); + test!("10_000.99", (Number () "10_000" ("." "99"))); + } + + #[test] + fn old_nondecimal() { + test!("2_01101101", (Number "2_" "01101101" ())); + test!("-2_01101101", (UnaryOprApp "-" (Number "2_" "01101101" ()))); + test!("16_17ffffffffffffffa", (Number "16_" "17ffffffffffffffa" ())); + } } @@ -1538,12 +1571,19 @@ fn expect_tree_representing_code(code: &str, ast: &enso_parser::syntax::Tree) { /// example, a `token::Number` may be represented like: `sexp![10]`, and a `token::Ident` may look /// like `sexp![foo]`. fn test(code: &str, expect: lexpr::Value) { - let ast = enso_parser::Parser::new().run(code); + let ast = parse(code); let ast_s_expr = to_s_expr(&ast, code); assert_eq!(ast_s_expr.to_string(), expect.to_string(), "{:?}", &ast); expect_tree_representing_code(code, &ast); } +fn parse(code: &str) -> enso_parser::syntax::tree::Tree { + let ast = enso_parser::Parser::new().run(code); + let expected_span = 0..(code.encode_utf16().count() as u32); + enso_parser_debug::validate_spans(&ast, expected_span); + ast +} + // === Testing inputs containing syntax errors === @@ -1555,7 +1595,7 @@ struct Errors { impl Errors { fn collect(code: &str) -> Self { - let ast = enso_parser::Parser::new().run(code); + let ast = parse(code); expect_tree_representing_code(code, &ast); let errors = core::cell::Cell::new(Errors::default()); ast.map(|tree| match &*tree.variant { diff --git a/lib/rust/parser/src/lexer.rs b/lib/rust/parser/src/lexer.rs index 029e2d1e81..0de9e18bb7 100644 --- a/lib/rust/parser/src/lexer.rs +++ b/lib/rust/parser/src/lexer.rs @@ -657,7 +657,7 @@ impl<'s> Lexer<'s> { match token.code.as_ref() { // Special-case: Split into multiple operators. 
"+-" => { - let (left, right) = token.split_at_(Bytes(1)); + let (left, right) = token.split_at(code::Length::of("+")); let lhs = analyze_operator(&left.code); self.submit_token(left.with_variant(token::Variant::operator(lhs))); // The `-` in this case is not identical to a free `-`: It is only allowed a @@ -886,23 +886,25 @@ impl<'s> Lexer<'s> { if let Some(token) = token { if let Some(base) = base { self.submit_token(token.with_variant(token::Variant::number_base())); - let token = match base { + if let Some(digits) = match base { token::Base::Binary => self.token(|this| this.take_while(is_binary_digit)), token::Base::Octal => self.token(|this| this.take_while(is_octal_digit)), token::Base::Hexadecimal => self.token(|this| this.take_while(is_hexadecimal_digit)), - }; - let joiner = token::OperatorProperties::new() - .with_binary_infix_precedence(u32::MAX) - .as_token_joiner(); - self.submit_token(Token( - Code::empty_without_offset(), - Code::empty_without_offset(), - token::Variant::operator(joiner), - )); - // Every number has a digits-token, even if it's zero-length. - let token = token.unwrap_or_default(); - self.submit_token(token.with_variant(token::Variant::digits(Some(base)))); + } { + // The base and the digits are separate tokens so that they can have separate + // spans. A pseudo-token binds them together tightly so that the parser can + // assemble them into one number node. + let joiner = token::OperatorProperties::new() + .with_binary_infix_precedence(u32::MAX) + .as_token_joiner(); + self.submit_token(Token( + Code::empty(self.current_offset.utf16), + Code::empty(self.current_offset.utf16), + token::Variant::operator(joiner), + )); + self.submit_token(digits.with_variant(token::Variant::digits(Some(base)))); + } } else { self.submit_token(token.with_variant(token::Variant::digits(None))); } @@ -1076,11 +1078,19 @@ impl<'s> Lexer<'s> { } if let Some(indent) = new_indent { if indent <= *block_indent { - self.output.push(Token::from(token::text_end( - Code::empty_without_offset(), - Code::empty_without_offset(), - ))); - self.end_blocks(indent); + let text_end = { + let location = newlines + .first() + .as_ref() + .unwrap() + .left_offset + .code + .position_before(); + let offset = Offset(VisibleOffset(0), location.clone()); + Token(offset, location, token::Variant::text_end()) + }; + self.output.push(text_end); + self.end_blocks(indent, newlines.first().as_ref().unwrap()); self.output.extend(newlines); if self.current_offset == text_start.0 { self.last_spaces_visible_offset = text_start.1.visible; @@ -1152,7 +1162,10 @@ impl<'s> Lexer<'s> { let close_quote_end = self.mark(); self.make_token(text_end, close_quote_end, token::Variant::text_end()) } else { - Token::from(token::text_end(Code::empty_without_offset(), Code::empty_without_offset())) + Token::from(token::text_end( + Code::empty(self.current_offset.utf16), + Code::empty(self.current_offset.utf16), + )) }; self.output.push(end_token); TextEndedAt::End @@ -1327,20 +1340,24 @@ impl<'s> Lexer<'s> { while let Some(token) = self.line_break() { newlines.push(token.with_variant(token::Variant::newline())); } - if !newlines.is_empty() { + if let Some(last) = newlines.last() { let block_indent = self.last_spaces_visible_offset; if block_indent > self.current_block_indent { - let block_start = self.marker_token(token::Variant::block_start()); + let block_start = { + let location = last.left_offset.code.position_before(); + let offset = Offset(VisibleOffset(0), location.clone()); + Token(offset, location, 
token::Variant::block_start()) + }; self.submit_token(block_start); self.start_block(block_indent); } - self.end_blocks(block_indent); + self.end_blocks(block_indent, newlines.first().as_ref().unwrap()); newlines.drain(..).for_each(|token| self.submit_token(token)); } self.token_storage.set_from(newlines); } - fn end_blocks(&mut self, block_indent: VisibleOffset) { + fn end_blocks(&mut self, block_indent: VisibleOffset, newline: &Token<'s>) { while block_indent < self.current_block_indent { let Some(previous_indent) = self.block_indent_stack.last().copied() else { // If the file starts at indent > 0, we treat that as the root indent level @@ -1355,7 +1372,11 @@ impl<'s> Lexer<'s> { break; } self.end_block(); - let block_end = self.marker_token(token::Variant::block_end()); + let block_end = { + let location = newline.left_offset.code.position_before(); + let offset = Offset(VisibleOffset(0), location.clone()); + Token(offset, location, token::Variant::block_end()) + }; self.submit_token(block_end); } } @@ -1385,22 +1406,23 @@ impl<'s> Lexer<'s> { /// Run the lexer. Return non-hierarchical list of tokens (the token groups will be represented /// as start and end tokens). pub fn run(mut self) -> ParseResult>> { + // If the first line is indented, open a block for it. self.spaces_after_lexeme(); - self.current_block_indent = self.last_spaces_visible_offset; - let mut any_parser_matched = true; - while any_parser_matched { - any_parser_matched = false; - for f in PARSERS { - if self.run_and_check_if_progressed(f) { - any_parser_matched = true; - break; - } - } + let first_block_indent = self.last_spaces_visible_offset; + if first_block_indent.width_in_spaces != 0 { + self.submit_token(token::block_start(Code::empty(0), Code::empty(0)).into()); + self.start_block(first_block_indent); + self.submit_token(token::newline(Code::empty(0), Code::empty(0)).into()); } + // Main parsing loop. + while PARSERS.iter().any(|f| self.run_and_check_if_progressed(f)) {} + // If any blocks were still open at EOF, close them. while self.end_block().is_some() { let block_end = self.marker_token(token::Variant::block_end()); self.submit_token(block_end); } + // If the last line ended in whitespace, ensure it is represented; we'll attach it to a + // phantom newline token. if self.last_spaces_visible_offset != VisibleOffset(0) { let left_offset_start = self.current_offset - self.last_spaces_offset; let offset_code = self.input.slice(left_offset_start.utf8..self.current_offset.utf8); @@ -1412,13 +1434,14 @@ impl<'s> Lexer<'s> { let eof = token::variant::Variant::Newline(token::variant::Newline()); self.submit_token(Token(offset, Code::empty(self.current_offset.utf16), eof)); } + // Sanity check. let mut internal_error = self.internal_error.take(); if self.current_char.is_some() { let message = format!("Lexer did not consume all input. State: {self:?}"); internal_error.get_or_insert(message); } + let value = self.output; - trace!("Tokens:\n{:#?}", value); ParseResult { value, internal_error } } } @@ -1491,9 +1514,30 @@ mod tests { } } + /// Lex the input, check the spans for consistency, and return the tokens with the span offsets + /// stripped. 
+ fn lex_and_validate_spans(input: &str) -> Vec { + let result: Vec<_> = run(input).unwrap(); + let mut sum_span = None; + fn concat(a: &Option>, b: &Range) -> Range { + match a { + Some(a) => { + assert_eq!(a.end, b.start); + a.start..b.end + } + None => b.clone(), + } + } + for token in &result { + sum_span = Some(concat(&sum_span, &token.left_offset.code.range_utf16())); + sum_span = Some(concat(&sum_span, &token.code.range_utf16())); + } + assert_eq!(sum_span.unwrap_or_default(), 0..(input.encode_utf16().count() as u32)); + result.into_iter().map(|token| token.without_offsets()).collect() + } + fn test_lexer<'s>(input: &'s str, expected: Vec>) { - let result: Vec<_> = - run(input).unwrap().into_iter().map(|token| token.without_offsets()).collect(); + let result = lex_and_validate_spans(input); let expected: Vec<_> = expected.into_iter().map(|token| token.without_offsets()).collect(); assert_eq!(result, expected); } @@ -1517,23 +1561,21 @@ mod tests { #[test] fn test_case_block() { let newline = newline_(empty(), test_code("\n")); - test_lexer_many(vec![ - ("\n", vec![newline_(empty(), test_code("\n"))]), - ("\n foo\n bar", vec![ - block_start_(empty(), empty()), - newline.clone(), - ident_(" ", "foo"), - newline.clone(), - ident_(" ", "bar"), - block_end_(empty(), empty()), - ]), - ("foo\n +", vec![ - ident_("", "foo"), - block_start_(empty(), empty()), - newline, - operator_(" ", "+"), - block_end_(empty(), empty()), - ]), + test_lexer("\n", vec![newline_(empty(), test_code("\n"))]); + test_lexer("\n foo\n bar", vec![ + block_start_(empty(), empty()), + newline.clone(), + ident_(" ", "foo"), + newline.clone(), + ident_(" ", "bar"), + block_end_(empty(), empty()), + ]); + test_lexer("foo\n +", vec![ + ident_("", "foo"), + block_start_(empty(), empty()), + newline, + operator_(" ", "+"), + block_end_(empty(), empty()), ]); } @@ -1541,21 +1583,29 @@ mod tests { fn test_case_block_bad_indents() { let newline = newline_(empty(), test_code("\n")); #[rustfmt::skip] - test_lexer_many(vec![ - ("\n foo\n bar\nbaz", vec![ - block_start_(empty(), empty()), - newline.clone(), ident_(" ", "foo"), - newline.clone(), ident_(" ", "bar"), - block_end_(empty(), empty()), - newline.clone(), ident_("", "baz"), - ]), - ("\n foo\n bar\n baz", vec![ - block_start_(empty(), empty()), - newline.clone(), ident_(" ", "foo"), - newline.clone(), ident_(" ", "bar"), - newline, ident_(" ", "baz"), - block_end_(empty(), empty()), - ]), + test_lexer(" foo\n bar\nbaz", vec![ + block_start_(empty(), empty()), + newline_(empty(), empty()), + ident_(" ", "foo"), + newline.clone(), ident_(" ", "bar"), + block_end_(empty(), empty()), + newline.clone(), ident_("", "baz"), + ]); + #[rustfmt::skip] + test_lexer("\n foo\n bar\nbaz", vec![ + block_start_(empty(), empty()), + newline.clone(), ident_(" ", "foo"), + newline.clone(), ident_(" ", "bar"), + block_end_(empty(), empty()), + newline.clone(), ident_("", "baz"), + ]); + #[rustfmt::skip] + test_lexer("\n foo\n bar\n baz", vec![ + block_start_(empty(), empty()), + newline.clone(), ident_(" ", "foo"), + newline.clone(), ident_(" ", "bar"), + newline, ident_(" ", "baz"), + block_end_(empty(), empty()), ]); } @@ -1594,12 +1644,10 @@ mod tests { #[test] fn test_case_idents() { - test_lexer_many(vec![ - ("", vec![]), - ("_", vec![wildcard_("", "_")]), - ("_'", vec![wildcard_("", "_'")]), - ("_''", vec![wildcard_("", "_''")]), - ]); + test_lexer("", vec![]); + test_lexer("_", vec![wildcard_("", "_")]); + test_lexer("_'", vec![wildcard_("", "_'")]); + test_lexer("_''", 
vec![wildcard_("", "_''")]); test_lexer_many(lexer_case_idents(&[ "a", "a'", @@ -1629,7 +1677,7 @@ mod tests { #[test] fn test_case_operators() { test_lexer_many(lexer_case_operators(&["+", "-", "=", "==", "===", ":", ","])); - assert_eq!(run("+-").unwrap().len(), 2); + assert_eq!(lex_and_validate_spans("+-").len(), 2); } /// Based on https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt. @@ -1777,6 +1825,12 @@ mod tests { /* 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = */ "������" } } + + #[test] + fn test_doc_comment() { + let code = ["## Foo.", "main = 23"].join("\n"); + lex_and_validate_spans(&code); + } } diff --git a/lib/rust/parser/src/lib.rs b/lib/rust/parser/src/lib.rs index 7cc20e0bb2..6a1f824026 100644 --- a/lib/rust/parser/src/lib.rs +++ b/lib/rust/parser/src/lib.rs @@ -201,7 +201,6 @@ impl Default for Parser { /// interpreted as a variable assignment or method definition. fn expression_to_statement(mut tree: syntax::Tree<'_>) -> syntax::Tree<'_> { use syntax::tree::*; - let mut left_offset = tree.span.left_offset.position_before(); if let Tree { variant: box Variant::Annotated(annotated), .. } = &mut tree { annotated.expression = annotated.expression.take().map(expression_to_statement); return tree; @@ -214,14 +213,22 @@ fn expression_to_statement(mut tree: syntax::Tree<'_>) -> syntax::Tree<'_> { documented.expression = documented.expression.take().map(expression_to_statement); return tree; } - if let Tree { variant: box Variant::TypeAnnotated(annotated), span } = tree { - let colon = annotated.operator; - let type_ = annotated.type_; - let variable = annotated.expression; - let mut tree = Tree::type_signature(variable, colon, type_); - tree.span.left_offset += span.left_offset; + if let Tree { variant: box Variant::TypeAnnotated(annotated), .. } = tree { + let TypeAnnotated { expression, operator, type_ } = annotated; + tree.variant = Box::new(Variant::TypeSignature(TypeSignature { + variable: expression, + operator, + type_, + })); return tree; } + if matches!(&tree, Tree { + variant: box Variant::ArgumentBlockApplication(ArgumentBlockApplication { lhs: None, .. }), + .. 
+ }) { + return tree.with_error("Expected expression before indented block."); + } + let mut left_offset = tree.span.left_offset.position_before(); let tree_ = &mut tree; let opr_app = match tree_ { Tree { variant: box Variant::OprApp(opr_app), span } => { diff --git a/lib/rust/parser/src/macros/built_in.rs b/lib/rust/parser/src/macros/built_in.rs index 40ddedc48a..3c9e44f961 100644 --- a/lib/rust/parser/src/macros/built_in.rs +++ b/lib/rust/parser/src/macros/built_in.rs @@ -82,7 +82,10 @@ fn import_body<'s>( let field = match header.code.as_ref() { "polyglot" => { body = Some( - precedence.resolve(tokens).map(expect_ident).unwrap_or_else(expected_nonempty), + precedence + .resolve(tokens) + .map(expect_ident) + .unwrap_or_else(|| expected_nonempty(header.code.position_after())), ); &mut polyglot } @@ -91,7 +94,7 @@ fn import_body<'s>( precedence .resolve(tokens) .map(expect_qualified) - .unwrap_or_else(expected_nonempty), + .unwrap_or_else(|| expected_nonempty(header.code.position_after())), ); &mut from } @@ -112,14 +115,17 @@ fn import_body<'s>( } "as" => { body = Some( - precedence.resolve(tokens).map(expect_ident).unwrap_or_else(expected_nonempty), + precedence + .resolve(tokens) + .map(expect_ident) + .unwrap_or_else(|| expected_nonempty(header.code.position_after())), ); &mut as_ } "hiding" => { body = Some( sequence_tree(precedence, tokens, expect_ident) - .unwrap_or_else(expected_nonempty), + .unwrap_or_else(|| expected_nonempty(header.code.position_after())), ); &mut hiding } @@ -175,7 +181,7 @@ fn export_body<'s>( precedence .resolve(tokens) .map(expect_qualified) - .unwrap_or_else(expected_nonempty), + .unwrap_or_else(|| expected_nonempty(header.code.position_after())), ); &mut from } @@ -196,14 +202,17 @@ fn export_body<'s>( } "as" => { body = Some( - precedence.resolve(tokens).map(expect_ident).unwrap_or_else(expected_nonempty), + precedence + .resolve(tokens) + .map(expect_ident) + .unwrap_or_else(|| expected_nonempty(header.code.position_after())), ); &mut as_ } "hiding" => { body = Some( sequence_tree(precedence, tokens, expect_ident) - .unwrap_or_else(expected_nonempty), + .unwrap_or_else(|| expected_nonempty(header.code.position_after())), ); &mut hiding } @@ -438,10 +447,9 @@ fn case_body<'s>( _ => initial_case.push(item), } } - if let Some(_first) = initial_case.first() { - // FIXME: Create 0-length span at offset preceding `_first`. - let newline = - syntax::token::newline(Code::empty_without_offset(), Code::empty_without_offset()); + if !initial_case.is_empty() { + let location = of_.code.position_after(); + let newline = syntax::token::newline(location.clone(), location); case_builder.push(syntax::item::Line { newline, items: initial_case }); } block.into_iter().for_each(|line| case_builder.push(line)); @@ -825,10 +833,10 @@ fn expect_qualified(tree: syntax::Tree) -> syntax::Tree { } } -fn expected_nonempty<'s>() -> syntax::Tree<'s> { +fn expected_nonempty(location: Code) -> syntax::Tree { let empty = syntax::Tree::ident(syntax::token::ident( - Code::empty_without_offset(), - Code::empty_without_offset(), + location.clone(), + location, false, 0, false, diff --git a/lib/rust/parser/src/macros/resolver.rs b/lib/rust/parser/src/macros/resolver.rs index 3e8c85060e..26856311b8 100644 --- a/lib/rust/parser/src/macros/resolver.rs +++ b/lib/rust/parser/src/macros/resolver.rs @@ -142,24 +142,14 @@ pub struct Resolver<'s> { impl<'s> Resolver<'s> { /// Create a new resolver, in statement context. 
pub fn new_statement() -> Self { - let scopes = default(); - let open_blocks = vec![syntax::item::Line { - newline: token::newline(Code::empty(0), Code::empty(0)), - items: default(), - }]; - let macro_stack = default(); - let segments = default(); - let items = default(); - let context = Context::Statement; - let precedence = syntax::operator::Precedence::new(); Self { - blocks: scopes, - lines: open_blocks, - macros: macro_stack, - segments, - items, - context, - precedence, + context: Context::Statement, + precedence: syntax::operator::Precedence::new(), + blocks: default(), + lines: default(), + macros: default(), + segments: default(), + items: default(), } } @@ -169,6 +159,10 @@ impl<'s> Resolver<'s> { root_macro_map: &MacroMap, tokens: impl IntoIterator>, ) -> syntax::Tree<'s> { + self.lines.push(syntax::item::Line { + newline: token::newline(Code::empty(0), Code::empty(0)), + items: default(), + }); tokens.into_iter().for_each(|t| self.push(root_macro_map, t)); self.finish_current_line(); let lines = self.lines.drain(..).map(|syntax::item::Line { newline, items }| { @@ -233,9 +227,11 @@ impl<'s> Resolver<'s> { /// Append a token to the state. fn push(&mut self, root_macro_map: &MacroMap, token: Token<'s>) { match token.variant { - token::Variant::Newline(_) => { - self.finish_current_line(); - let newline = token::newline(token.left_offset, token.code); + token::Variant::Newline(newline) => { + if !self.lines.is_empty() { + self.finish_current_line(); + } + let newline = token.with_variant(newline); self.lines.push(syntax::item::Line { newline, items: default() }); self.context = Context::Statement; } diff --git a/lib/rust/parser/src/source/code.rs b/lib/rust/parser/src/source/code.rs index 7be3332fd7..ca8eddbc0a 100644 --- a/lib/rust/parser/src/source/code.rs +++ b/lib/rust/parser/src/source/code.rs @@ -75,21 +75,24 @@ impl<'s> Code<'s> { self.utf16 } - /// Split the UTF-8 code at the given byte offset. - pub fn split_at(&self, offset: usize) -> (Self, Self) { - let (left, right) = self.repr.split_at(offset); - let left_utf16 = left.chars().map(|c| c.len_utf16() as u32).sum(); - let right_utf16 = self.utf16 - left_utf16; + /// Return the start and end of the UTF-16 source code for this element. + pub fn range_utf16(&self) -> Range { + self.offset_utf16..(self.offset_utf16 + self.utf16) + } + + /// Split the code at the given location. + pub fn split_at(&self, split: Length) -> (Self, Self) { + let (left, right) = self.repr.split_at(split.utf8); ( Self { repr: StrRef(left), offset_utf16: self.offset_utf16, - utf16: left_utf16, + utf16: split.utf16, }, Self { repr: StrRef(right), - offset_utf16: self.offset_utf16 + left_utf16, - utf16: right_utf16, + offset_utf16: self.offset_utf16 + split.utf16, + utf16: self.utf16 - split.utf16, }, ) } @@ -209,6 +212,12 @@ pub struct Length { } impl Length { + /// Returns the length of the given input. + #[inline(always)] + pub fn of(s: &str) -> Self { + Self { utf8: s.len(), utf16: s.encode_utf16().count() as u32 } + } + /// Returns true if the code is empty. #[inline(always)] pub fn is_zero(&self) -> bool { @@ -220,6 +229,12 @@ impl Length { pub fn utf8_bytes(&self) -> usize { self.utf8 } + + /// Return the length in UTF-16 code units. 
+ #[inline(always)] + pub fn utf16_len(&self) -> u32 { + self.utf16 + } } impl Add for Length { diff --git a/lib/rust/parser/src/source/span.rs b/lib/rust/parser/src/source/span.rs index cd243c348a..7d2dcea7ca 100644 --- a/lib/rust/parser/src/source/span.rs +++ b/lib/rust/parser/src/source/span.rs @@ -101,7 +101,7 @@ impl<'s> Offset<'s> { /// Return a 0-length `Span` representing the position after the end of this `Span`. pub fn position_after(&self) -> Self { - Self { visible: default(), code: self.code.position_before() } + Self { visible: default(), code: self.code.position_after() } } /// Return this value with its start position removed (set to 0). This can be used to compare @@ -184,6 +184,18 @@ impl<'s> Span<'s> { pub fn add>(self, elem: &mut T) -> Self { Builder::add_to_span(elem, self) } + + /// Return the start and end of the UTF-16 source code for this element. + pub fn range_utf16(&self) -> Range { + let start = self.left_offset.position_after().code.range_utf16().start; + let end = start + self.code_length.utf16_len(); + start..end + } + + /// Return the sum of the whitespace length and the code length. + pub fn length_including_whitespace(&self) -> code::Length { + self.left_offset.code.length() + self.code_length + } } impl<'s> AsRef> for Span<'s> { @@ -204,6 +216,11 @@ where self.left_offset += other.left_offset; self.code_length = other.code_length; } else { + debug_assert_eq!( + self.left_offset.code.position_after().range_utf16().end + + self.code_length.utf16_len(), + other.left_offset.code.position_before().range_utf16().start + ); self.code_length += other.left_offset.code.length() + other.code_length; } } diff --git a/lib/rust/parser/src/syntax/operator.rs b/lib/rust/parser/src/syntax/operator.rs index 61ed2c8df6..6033981a59 100644 --- a/lib/rust/parser/src/syntax/operator.rs +++ b/lib/rust/parser/src/syntax/operator.rs @@ -137,10 +137,10 @@ impl<'s> ExpressionBuilder<'s> { pub fn operand(&mut self, operand: Operand>) { if self.prev_type == Some(ItemType::Ast) { if let Some(Operand { value: syntax::Tree { variant: box - syntax::tree::Variant::TextLiteral(ref mut lhs), .. }, .. }) = self.output.last_mut() + syntax::tree::Variant::TextLiteral(ref mut lhs), span: lhs_span }, .. }) = self.output.last_mut() && !lhs.closed && let box syntax::tree::Variant::TextLiteral(mut rhs) = operand.value.variant { - syntax::tree::join_text_literals(lhs, &mut rhs, operand.value.span); + syntax::tree::join_text_literals(lhs, &mut rhs, lhs_span, operand.value.span); if let syntax::tree::TextLiteral { open: Some(open), newline: None, elements, closed: true, close: None } = lhs && open.code.starts_with('#') { let elements = mem::take(elements); diff --git a/lib/rust/parser/src/syntax/token.rs b/lib/rust/parser/src/syntax/token.rs index c5ee27186d..46d1ee460c 100644 --- a/lib/rust/parser/src/syntax/token.rs +++ b/lib/rust/parser/src/syntax/token.rs @@ -135,19 +135,13 @@ impl<'s, T> Token<'s, T> { /// position, which does not include the [`left_offset`]. It means that `split_at(Bytes(0))` /// will split the token into left offset only and a left-trimmed token. 
#[inline(always)] - pub fn split_at(self, offset: Bytes) -> (Token<'s, ()>, Token<'s, ()>, T) { + pub fn split_at(self, split: code::Length) -> (Token<'s, ()>, Token<'s, ()>) { let left_lexeme_offset = self.left_offset; - let right_lexeme_offset = self.code.position_after(); - let (left_code, right_code) = self.code.split_at(offset.unchecked_raw()); + let right_lexeme_offset = + Code::empty(self.code.position_before().range_utf16().end + split.utf16_len()); + let (left_code, right_code) = self.code.split_at(split); let left = Token(left_lexeme_offset, left_code, ()); let right = Token(right_lexeme_offset, right_code, ()); - (left, right, self.variant) - } - - /// A version of [`split_at`] that discards the associated variant. - #[inline(always)] - pub fn split_at_(self, offset: Bytes) -> (Token<'s, ()>, Token<'s, ()>) { - let (left, right, _) = self.split_at(offset); (left, right) } diff --git a/lib/rust/parser/src/syntax/tree.rs b/lib/rust/parser/src/syntax/tree.rs index 0d0107b706..6bdfbab6cd 100644 --- a/lib/rust/parser/src/syntax/tree.rs +++ b/lib/rust/parser/src/syntax/tree.rs @@ -608,7 +608,7 @@ impl<'s> span::Builder<'s> for ArgumentType<'s> { // === CaseOf === -/// A that may contain a case-expression in a case-of expression. +/// A line that may contain a case-expression in a case-of expression. #[derive(Clone, Debug, Default, Eq, PartialEq, Visitor, Serialize, Reflect, Deserialize)] pub struct CaseLine<'s> { /// The token beginning the line. This will always be present, unless the first case-expression @@ -661,7 +661,10 @@ impl<'s> Case<'s> { impl<'s> span::Builder<'s> for Case<'s> { fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> { - span.add(&mut self.pattern).add(&mut self.arrow).add(&mut self.expression) + span.add(&mut self.documentation) + .add(&mut self.pattern) + .add(&mut self.arrow) + .add(&mut self.expression) } } @@ -755,20 +758,23 @@ impl<'s> span::Builder<'s> for OperatorDelimitedTree<'s> { pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> { match (&mut *func.variant, &mut *arg.variant) { (Variant::Annotated(func_ @ Annotated { argument: None, .. 
}), _) => { + func.span.code_length += arg.span.length_including_whitespace(); func_.argument = maybe_apply(mem::take(&mut func_.argument), arg).into(); func } (Variant::AnnotatedBuiltin(func_), _) => { + func.span.code_length += arg.span.length_including_whitespace(); func_.expression = maybe_apply(mem::take(&mut func_.expression), arg).into(); func } - (Variant::OprApp(OprApp { lhs: Some(_), opr: Ok(_), rhs }), - Variant::ArgumentBlockApplication(ArgumentBlockApplication { lhs: None, arguments })) - if rhs.is_none() => { + (Variant::OprApp(OprApp { lhs: Some(_), opr: Ok(_), rhs: rhs @ None }), + Variant::ArgumentBlockApplication(ArgumentBlockApplication { lhs: None, arguments })) => { + func.span.code_length += arg.span.length_including_whitespace(); *rhs = block::body_from_lines(mem::take(arguments)).into(); func } (_, Variant::ArgumentBlockApplication(block)) if block.lhs.is_none() => { + arg.span.code_length += arg.span.left_offset.code.length() + func.span.code_length; let func_left_offset = func.span.left_offset.take_as_prefix(); let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset); if let Some(first) = block.arguments.first_mut() { @@ -778,6 +784,7 @@ pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> { arg } (_, Variant::OperatorBlockApplication(block)) if block.lhs.is_none() => { + arg.span.code_length += arg.span.left_offset.code.length() + func.span.code_length; let func_left_offset = func.span.left_offset.take_as_prefix(); let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset); if let Some(first) = block.expressions.first_mut() { @@ -822,8 +829,10 @@ fn maybe_apply<'s>(f: Option>, x: Tree<'s>) -> Tree<'s> { pub fn join_text_literals<'s>( lhs: &mut TextLiteral<'s>, rhs: &mut TextLiteral<'s>, + lhs_span: &mut Span<'s>, rhs_span: Span<'s>, ) { + lhs_span.code_length += rhs_span.length_including_whitespace(); match rhs.elements.first_mut() { Some(TextElement::Section { text }) => text.left_offset += rhs_span.left_offset, Some(TextElement::Escape { token }) => token.left_offset += rhs_span.left_offset, @@ -863,6 +872,7 @@ pub fn apply_operator<'s>( Variant::Number(Number { base: None, integer, fractional_digits })) => { func_.integer = mem::take(integer); func_.fractional_digits = mem::take(fractional_digits); + lhs_.span.code_length += rhs_.span.code_length; lhs.take().unwrap() } _ => { @@ -901,6 +911,7 @@ pub fn apply_operator<'s>( { let dot = opr.clone(); let digits = digits.clone(); + lhs.span.code_length += dot.code.length() + rhs.span.code_length; lhs_.fractional_digits = Some(FractionalDigits { dot, digits }); return lhs.clone(); } @@ -912,8 +923,7 @@ pub fn apply_operator<'s>( } let ArgumentBlockApplication { lhs: _, arguments } = block; let arguments = mem::take(arguments); - let rhs_ = block::body_from_lines(arguments); - rhs = Some(rhs_); + *rhs_ = block::body_from_lines(arguments); } } } diff --git a/lib/rust/parser/src/syntax/tree/block.rs b/lib/rust/parser/src/syntax/tree/block.rs index 1bd5691365..28a929501b 100644 --- a/lib/rust/parser/src/syntax/tree/block.rs +++ b/lib/rust/parser/src/syntax/tree/block.rs @@ -88,7 +88,7 @@ where I: Iterator> match line.expression.map(Prefix::try_from) { Some(Ok(prefix)) => { match self.prefixes.last_mut() { - Some(prefix) => prefix.newlines().push(line.newline), + Some(prefix) => prefix.push_newline(line.newline), None => self.newline = Some(line.newline), }; self.prefixes.push(prefix); @@ -96,7 +96,7 @@ where I: Iterator> Some(Err(mut statement)) => { return 
Some(match self.prefixes.last_mut() { Some(prefix) => { - prefix.newlines().push(line.newline); + prefix.push_newline(line.newline); for prefix in self.prefixes.drain(..).rev() { statement = prefix.apply_to(statement); } @@ -108,7 +108,7 @@ where I: Iterator> } None => { match self.prefixes.last_mut() { - Some(prefix) => prefix.newlines().push(line.newline), + Some(prefix) => prefix.push_newline(line.newline), None => return Some(line.newline.into()), }; } @@ -154,23 +154,27 @@ impl<'s> TryFrom> for Prefix<'s> { } impl<'s> Prefix<'s> { - fn newlines(&mut self) -> &mut Vec> { - match self { - Prefix::Annotation { node: Annotated { newlines, .. }, .. } - | Prefix::BuiltinAnnotation { node: AnnotatedBuiltin { newlines, .. }, .. } + fn push_newline(&mut self, newline: token::Newline<'s>) { + let (newlines, span) = match self { + Prefix::Annotation { node: Annotated { newlines, .. }, span } + | Prefix::BuiltinAnnotation { node: AnnotatedBuiltin { newlines, .. }, span } | Prefix::Documentation { node: Documented { documentation: DocComment { newlines, .. }, .. }, - .. - } => newlines, - } + span, + } => (newlines, span), + }; + span.code_length += newline.left_offset.code.length() + newline.code.length(); + newlines.push(newline); } fn apply_to(mut self, expression: Tree<'s>) -> Tree<'s> { - *(match &mut self { - Prefix::Annotation { node, .. } => &mut node.expression, - Prefix::BuiltinAnnotation { node, .. } => &mut node.expression, - Prefix::Documentation { node, .. } => &mut node.expression, - }) = Some(expression); + let (expr, span) = match &mut self { + Prefix::Annotation { node, span } => (&mut node.expression, span), + Prefix::BuiltinAnnotation { node, span } => (&mut node.expression, span), + Prefix::Documentation { node, span } => (&mut node.expression, span), + }; + span.code_length += expression.span.left_offset.code.length() + expression.span.code_length; + *expr = Some(expression); self.into() } }
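
---

The core of the new span validation (both the `enso_parser_debug::validate_spans` helper and the lexer-test variant in this patch) is a single invariant: walking the output in source order, each token's whitespace range and code range must begin exactly where the previous range ended, and the concatenated result must cover the whole input, measured in UTF-16 code units. Below is a minimal standalone sketch of that tiling check; the `spans` values are hypothetical lexer output, and the `concat` function mirrors, but does not depend on, the patch's helper of the same name.

```rust
// Standalone sketch of the span-tiling invariant enforced by this patch:
// adjacent spans must abut exactly (no gaps, no overlaps), and together they
// must cover the full input range in UTF-16 code units. The span data below
// is illustrative, not real parser output.
use std::ops::Range;

/// Concatenate two half-open ranges, asserting that `b` starts where `a` ends.
fn concat(a: &Option<Range<u32>>, b: &Range<u32>) -> Range<u32> {
    match a {
        Some(a) => {
            assert_eq!(a.end, b.start, "gap or overlap between adjacent spans");
            a.start..b.end
        }
        None => b.clone(),
    }
}

fn main() {
    // Hypothetical spans for the 5-code-unit input "2 + 2":
    // digits "2" (0..1), offset " " (1..2), operator "+" (2..3),
    // offset " " (3..4), digits "2" (4..5).
    let spans: [Range<u32>; 5] = [0..1, 1..2, 2..3, 3..4, 4..5];
    let mut sum: Option<Range<u32>> = None;
    for span in &spans {
        sum = Some(concat(&sum, span));
    }
    // Every character of the input is covered exactly once.
    assert_eq!(sum.unwrap(), 0..5);
}
```

This is why the patch replaces `Code::empty_without_offset()` with location-carrying constructors such as `Code::empty(offset)`: even zero-length synthetic tokens (`BlockStart`, `BlockEnd`, `TextEnd`, the number-joiner) need a real document offset, or the concatenation above would find a range that starts at 0 instead of at the previous span's end.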