Ensure all spans have document offsets (#8039)

- Validate spans during existing lexer and parser unit tests, and in `enso_parser_debug`. - Fix lost span info causing failures of updated tests. # Important Notes - [x] Output of `parse_all_enso_files.sh` is unchanged since before #7881 (modulo libs changes since then). - When the parser encounters an input with the first line indented, it now creates a sub-block for lines at that indent level, and emits a syntax error (every indented block must have a parent). - When the parser encounters a number with a base but no digits (e.g. `0x`), it now emits a `Number` with `None` in the digits field rather than a 0-length digits token.
2024-11-22 22:10:15 +03:00 · 2023-10-19 05:36:42 -07:00 · 2023-10-19 05:36:42 -07:00 · 2edd2bd7ff
commit 2edd2bd7ff
parent 24b9a1179e
20 changed files with 799 additions and 251 deletions
--- a/app/gui2/src/util/ast/snapshots/index.ts.snap
+++ b/app/gui2/src/util/ast/snapshots/index.ts.snap
@ -71,6 +71,127 @@ exports[`Parsing ' foo bar
 }
 `;

+exports[`Parsing '2
+ + 3
+ + 4' 1`] = `
+{
+  "childrenLengthInCodeParsed": 11,
+  "statements": [
+    {
+      "expression": {
+        "childrenLengthInCodeParsed": 11,
+        "excess": [],
+        "expressions": [
+          {
+            "expression": {
+              "expression": {
+                "base": undefined,
+                "childrenLengthInCodeParsed": 1,
+                "fractionalDigits": undefined,
+                "integer": {
+                  "base": undefined,
+                  "lengthInCodeBuffer": 1,
+                  "startInCodeBuffer": 5,
+                  "type": "Digits",
+                  "whitespaceLengthInCodeBuffer": 0,
+                  "whitespaceStartInCodeBuffer": 5,
+                },
+                "type": "Number",
+                "whitespaceLengthInCodeParsed": 1,
+                "whitespaceStartInCodeParsed": 4,
+              },
+              "operator": {
+                "ok": true,
+                "value": {
+                  "lengthInCodeBuffer": 1,
+                  "startInCodeBuffer": 3,
+                  "type": "Operator",
+                  "whitespaceLengthInCodeBuffer": 1,
+                  "whitespaceStartInCodeBuffer": 2,
+                },
+              },
+            },
+            "newline": {
+              "lengthInCodeBuffer": 1,
+              "startInCodeBuffer": 1,
+              "type": "Newline",
+              "whitespaceLengthInCodeBuffer": 0,
+              "whitespaceStartInCodeBuffer": 1,
+            },
+          },
+          {
+            "expression": {
+              "expression": {
+                "base": undefined,
+                "childrenLengthInCodeParsed": 1,
+                "fractionalDigits": undefined,
+                "integer": {
+                  "base": undefined,
+                  "lengthInCodeBuffer": 1,
+                  "startInCodeBuffer": 10,
+                  "type": "Digits",
+                  "whitespaceLengthInCodeBuffer": 0,
+                  "whitespaceStartInCodeBuffer": 10,
+                },
+                "type": "Number",
+                "whitespaceLengthInCodeParsed": 1,
+                "whitespaceStartInCodeParsed": 9,
+              },
+              "operator": {
+                "ok": true,
+                "value": {
+                  "lengthInCodeBuffer": 1,
+                  "startInCodeBuffer": 8,
+                  "type": "Operator",
+                  "whitespaceLengthInCodeBuffer": 1,
+                  "whitespaceStartInCodeBuffer": 7,
+                },
+              },
+            },
+            "newline": {
+              "lengthInCodeBuffer": 1,
+              "startInCodeBuffer": 6,
+              "type": "Newline",
+              "whitespaceLengthInCodeBuffer": 0,
+              "whitespaceStartInCodeBuffer": 6,
+            },
+          },
+        ],
+        "lhs": {
+          "base": undefined,
+          "childrenLengthInCodeParsed": 1,
+          "fractionalDigits": undefined,
+          "integer": {
+            "base": undefined,
+            "lengthInCodeBuffer": 1,
+            "startInCodeBuffer": 0,
+            "type": "Digits",
+            "whitespaceLengthInCodeBuffer": 0,
+            "whitespaceStartInCodeBuffer": 0,
+          },
+          "type": "Number",
+          "whitespaceLengthInCodeParsed": 0,
+          "whitespaceStartInCodeParsed": 0,
+        },
+        "type": "OperatorBlockApplication",
+        "whitespaceLengthInCodeParsed": 0,
+        "whitespaceStartInCodeParsed": 0,
+      },
+      "newline": {
+        "lengthInCodeBuffer": 0,
+        "startInCodeBuffer": 0,
+        "type": "Newline",
+        "whitespaceLengthInCodeBuffer": 0,
+        "whitespaceStartInCodeBuffer": 0,
+      },
+    },
+  ],
+  "type": "BodyBlock",
+  "whitespaceLengthInCodeParsed": 0,
+  "whitespaceStartInCodeParsed": 0,
+}
+`;
+
 exports[`Parsing 'Data.read
 2 + 2' 1`] = `
 {
@ -199,6 +320,173 @@ exports[`Parsing 'Data.read
 }
 `;

+exports[`Parsing 'Data.read "File"
+2 + 3' 1`] = `
+{
+  "childrenLengthInCodeParsed": 22,
+  "statements": [
+    {
+      "expression": {
+        "arg": {
+          "childrenLengthInCodeParsed": 6,
+          "close": {
+            "lengthInCodeBuffer": 1,
+            "startInCodeBuffer": 15,
+            "type": "TextEnd",
+            "whitespaceLengthInCodeBuffer": 0,
+            "whitespaceStartInCodeBuffer": 15,
+          },
+          "elements": [
+            {
+              "text": {
+                "lengthInCodeBuffer": 4,
+                "startInCodeBuffer": 11,
+                "type": "TextSection",
+                "whitespaceLengthInCodeBuffer": 0,
+                "whitespaceStartInCodeBuffer": 11,
+              },
+              "type": "Section",
+            },
+          ],
+          "newline": undefined,
+          "open": {
+            "lengthInCodeBuffer": 1,
+            "startInCodeBuffer": 10,
+            "type": "TextStart",
+            "whitespaceLengthInCodeBuffer": 0,
+            "whitespaceStartInCodeBuffer": 10,
+          },
+          "type": "TextLiteral",
+          "whitespaceLengthInCodeParsed": 1,
+          "whitespaceStartInCodeParsed": 9,
+        },
+        "childrenLengthInCodeParsed": 16,
+        "func": {
+          "childrenLengthInCodeParsed": 9,
+          "lhs": {
+            "childrenLengthInCodeParsed": 4,
+            "token": {
+              "isFree": false,
+              "isOperatorLexically": false,
+              "isTypeOrConstructor": true,
+              "lengthInCodeBuffer": 4,
+              "liftLevel": 0,
+              "startInCodeBuffer": 0,
+              "type": "Ident",
+              "whitespaceLengthInCodeBuffer": 0,
+              "whitespaceStartInCodeBuffer": 0,
+            },
+            "type": "Ident",
+            "whitespaceLengthInCodeParsed": 0,
+            "whitespaceStartInCodeParsed": 0,
+          },
+          "opr": {
+            "ok": true,
+            "value": {
+              "lengthInCodeBuffer": 1,
+              "startInCodeBuffer": 4,
+              "type": "Operator",
+              "whitespaceLengthInCodeBuffer": 0,
+              "whitespaceStartInCodeBuffer": 4,
+            },
+          },
+          "rhs": {
+            "childrenLengthInCodeParsed": 4,
+            "token": {
+              "isFree": false,
+              "isOperatorLexically": false,
+              "isTypeOrConstructor": false,
+              "lengthInCodeBuffer": 4,
+              "liftLevel": 0,
+              "startInCodeBuffer": 5,
+              "type": "Ident",
+              "whitespaceLengthInCodeBuffer": 0,
+              "whitespaceStartInCodeBuffer": 5,
+            },
+            "type": "Ident",
+            "whitespaceLengthInCodeParsed": 0,
+            "whitespaceStartInCodeParsed": 5,
+          },
+          "type": "OprApp",
+          "whitespaceLengthInCodeParsed": 0,
+          "whitespaceStartInCodeParsed": 0,
+        },
+        "type": "App",
+        "whitespaceLengthInCodeParsed": 0,
+        "whitespaceStartInCodeParsed": 0,
+      },
+      "newline": {
+        "lengthInCodeBuffer": 0,
+        "startInCodeBuffer": 0,
+        "type": "Newline",
+        "whitespaceLengthInCodeBuffer": 0,
+        "whitespaceStartInCodeBuffer": 0,
+      },
+    },
+    {
+      "expression": {
+        "childrenLengthInCodeParsed": 5,
+        "lhs": {
+          "base": undefined,
+          "childrenLengthInCodeParsed": 1,
+          "fractionalDigits": undefined,
+          "integer": {
+            "base": undefined,
+            "lengthInCodeBuffer": 1,
+            "startInCodeBuffer": 17,
+            "type": "Digits",
+            "whitespaceLengthInCodeBuffer": 0,
+            "whitespaceStartInCodeBuffer": 17,
+          },
+          "type": "Number",
+          "whitespaceLengthInCodeParsed": 0,
+          "whitespaceStartInCodeParsed": 17,
+        },
+        "opr": {
+          "ok": true,
+          "value": {
+            "lengthInCodeBuffer": 1,
+            "startInCodeBuffer": 19,
+            "type": "Operator",
+            "whitespaceLengthInCodeBuffer": 1,
+            "whitespaceStartInCodeBuffer": 18,
+          },
+        },
+        "rhs": {
+          "base": undefined,
+          "childrenLengthInCodeParsed": 1,
+          "fractionalDigits": undefined,
+          "integer": {
+            "base": undefined,
+            "lengthInCodeBuffer": 1,
+            "startInCodeBuffer": 21,
+            "type": "Digits",
+            "whitespaceLengthInCodeBuffer": 0,
+            "whitespaceStartInCodeBuffer": 21,
+          },
+          "type": "Number",
+          "whitespaceLengthInCodeParsed": 1,
+          "whitespaceStartInCodeParsed": 20,
+        },
+        "type": "OprApp",
+        "whitespaceLengthInCodeParsed": 0,
+        "whitespaceStartInCodeParsed": 17,
+      },
+      "newline": {
+        "lengthInCodeBuffer": 1,
+        "startInCodeBuffer": 16,
+        "type": "Newline",
+        "whitespaceLengthInCodeBuffer": 0,
+        "whitespaceStartInCodeBuffer": 16,
+      },
+    },
+  ],
+  "type": "BodyBlock",
+  "whitespaceLengthInCodeParsed": 0,
+  "whitespaceStartInCodeParsed": 0,
+}
+`;
+
 exports[`Parsing 'Data.read File
 2 + 3' 1`] = `
 {
@ -350,6 +638,77 @@ exports[`Parsing 'Data.read File
 }
 `;

+exports[`Parsing 'foo bar
+' 1`] = `
+{
+  "childrenLengthInCodeParsed": 8,
+  "statements": [
+    {
+      "expression": {
+        "arg": {
+          "childrenLengthInCodeParsed": 3,
+          "token": {
+            "isFree": false,
+            "isOperatorLexically": false,
+            "isTypeOrConstructor": false,
+            "lengthInCodeBuffer": 3,
+            "liftLevel": 0,
+            "startInCodeBuffer": 4,
+            "type": "Ident",
+            "whitespaceLengthInCodeBuffer": 0,
+            "whitespaceStartInCodeBuffer": 4,
+          },
+          "type": "Ident",
+          "whitespaceLengthInCodeParsed": 1,
+          "whitespaceStartInCodeParsed": 3,
+        },
+        "childrenLengthInCodeParsed": 7,
+        "func": {
+          "childrenLengthInCodeParsed": 3,
+          "token": {
+            "isFree": false,
+            "isOperatorLexically": false,
+            "isTypeOrConstructor": false,
+            "lengthInCodeBuffer": 3,
+            "liftLevel": 0,
+            "startInCodeBuffer": 0,
+            "type": "Ident",
+            "whitespaceLengthInCodeBuffer": 0,
+            "whitespaceStartInCodeBuffer": 0,
+          },
+          "type": "Ident",
+          "whitespaceLengthInCodeParsed": 0,
+          "whitespaceStartInCodeParsed": 0,
+        },
+        "type": "App",
+        "whitespaceLengthInCodeParsed": 0,
+        "whitespaceStartInCodeParsed": 0,
+      },
+      "newline": {
+        "lengthInCodeBuffer": 0,
+        "startInCodeBuffer": 0,
+        "type": "Newline",
+        "whitespaceLengthInCodeBuffer": 0,
+        "whitespaceStartInCodeBuffer": 0,
+      },
+    },
+    {
+      "expression": undefined,
+      "newline": {
+        "lengthInCodeBuffer": 1,
+        "startInCodeBuffer": 7,
+        "type": "Newline",
+        "whitespaceLengthInCodeBuffer": 0,
+        "whitespaceStartInCodeBuffer": 7,
+      },
+    },
+  ],
+  "type": "BodyBlock",
+  "whitespaceLengthInCodeParsed": 0,
+  "whitespaceStartInCodeParsed": 0,
+}
+`;
+
 exports[`Parsing 'foo bar=baz' 1`] = `
 {
  "childrenLengthInCodeParsed": 11,
--- a/app/gui2/src/util/ast/index.ts
+++ b/app/gui2/src/util/ast/index.ts
@ -18,10 +18,10 @@ export function parseEnso(code: string): Tree {
 export function parseEnsoLine(code: string): Tree {
  const block = parseEnso(code)
  assert(block.type === Tree.Type.BodyBlock)
-  const statemets = block.statements[Symbol.iterator]()
-  const firstLine = statemets.next()
+  const statements = block.statements[Symbol.iterator]()
+  const firstLine = statements.next()
  assert(!firstLine.done)
-  assert(!!statemets.next().done)
+  assert(!!statements.next().done)
  assert(firstLine.value.expression != null)
  return firstLine.value.expression
 }
@ -95,14 +95,13 @@ function treePath(obj: LazyObject, pred: (node: Tree) => boolean): Tree[] {
 if (import.meta.vitest) {
  const { test, expect } = import.meta.vitest

-  // Not working cases commented.
  const parseCases = [
-    ' foo bar\n',
+    'foo bar\n',
    'Data.read\n2 + 2',
    'Data.read File\n2 + 3',
-    // 'Data.read "File"\n2 + 3',
+    'Data.read "File"\n2 + 3',
    'foo bar=baz',
-    // '2\n + 3\n + 4',
+    '2\n + 3\n + 4',
  ]

  test.each(parseCases)("Parsing '%s'", (code) => {
--- a/app/gui2/src/util/ast/opr.ts
+++ b/app/gui2/src/util/ast/opr.ts
@ -3,7 +3,7 @@ import { assert } from '@/util/assert'
 import { parseEnsoLine, readAstSpan, readTokenSpan } from '@/util/ast'
 import type { Result } from '@/util/result'

-/** An operand of one of the applications inside `GenralOprApp` */
+/** An operand of one of the applications inside `GeneralOprApp` */
 export type GeneralOperand =
  | Operand
  // A part of `GeneralOprApp`, consisting of lhs and first `statements` of applications.
@ -66,7 +66,7 @@ export class GeneralOprApp {
      expectedOpr = oprCode
    }
    if (matchingOprs === this.apps.length) {
-      // If all operatros matched, the lhs may be a continuation of this chain.
+      // If all operators matched, the lhs may be a continuation of this chain.
      if (this.lhs != null) yield* operandsOfLeftAssocOprChain(this.lhs, code, expectedOpr)
      else yield null
    } else {
@ -203,15 +203,14 @@ if (import.meta.vitest) {
    {
      code: '2\n * 3\n + 44',
      result: [
-        { type: 'partOfOprBlockApp', repr: '2\n * 3\n + 4', statemets: 1 },
+        { type: 'partOfOprBlockApp', repr: '2\n * 3\n + 44', statements: 1 },
        { type: 'ast', repr: '44' },
      ],
    },
-    // There is a bug in AST spans in some OperatorBlockApplications. Fix this test once fixed
    {
      code: '2\n + 3\n * 4\n + 55',
      result: [
-        { type: 'partOfOprBlockApp', repr: '2\n + 3\n * 4\n + 5', statemets: 2 },
+        { type: 'partOfOprBlockApp', repr: '2\n + 3\n * 4\n + 55', statements: 2 },
        { type: 'ast', repr: '55' },
      ],
    },
@ -241,7 +240,7 @@ if (import.meta.vitest) {
    }: {
      code: string
      opr?: string
-      result: { type: string; repr: string; statemets?: number }[]
+      result: { type: string; repr: string; statements?: number }[]
    }) => {
      const ast = parseEnsoLine(code)
      const actual = operandsOfLeftAssocOprChain(ast, code, opr)
@ -258,7 +257,7 @@ if (import.meta.vitest) {
          } else {
            assert(actual?.type == 'partOfOprBlockApp')
            expect(readAstSpan(actual.ast, code)).toStrictEqual(expected?.repr)
-            expect(actual.statements).toStrictEqual(expected?.statemets)
+            expect(actual.statements).toStrictEqual(expected?.statements)
          }
        }
      }
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/XML.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/XML.enso
@ -339,14 +339,14 @@ type XML_Element
    Value (java_element:Element) (~children_cache:(Vector (XML_Element | Text)))

 type XML_Error
-    # An error that indicates that the XML data could not be parsed.
+    ## An error that indicates that the XML data could not be parsed.

-      Arguments:
-      - line_number: the line on which the parse failed.
-      - column_number: the column at which the parse failed.
+       Arguments:
+       - line_number: the line on which the parse failed.
+       - column_number: the column at which the parse failed.
    Parse_Error (line_number : Integer) (column_number : Integer)

-    # Any other XML-related Java exception.
+    ## Any other XML-related Java exception.
    Other (error : Text)

    ## PRIVATE
--- a/engine/runtime-with-polyglot/src/test/java/org/enso/interpreter/test/JsInteropTest.java
+++ b/engine/runtime-with-polyglot/src/test/java/org/enso/interpreter/test/JsInteropTest.java
@ -29,7 +29,7 @@ public class JsInteropTest extends TestBase {
  public void testDefaultJSPrint() {
    var src = """
      from Standard.Base import Json
-      
+
      main =
        json = Json.parse <| '''
          {
@ -38,7 +38,7 @@ public class JsInteropTest extends TestBase {
            }
          }
        json.get "inner"
-    """;
+      """;
    Value res = evalModule(ctx, src);
    assertEquals("{\"a\":1}", res.toString());
  }
--- a/engine/runtime/src/bench/java/org/enso/interpreter/bench/benchmarks/semantic/NestedPatternCompilationBenchmarks.java
+++ b/engine/runtime/src/bench/java/org/enso/interpreter/bench/benchmarks/semantic/NestedPatternCompilationBenchmarks.java
@ -65,7 +65,7 @@ public class NestedPatternCompilationBenchmarks {

            list_of_6 =
                List.Cons 1 (List.Cons 2 (List.Cons 3 (List.Cons 4 (List.Cons 5 (List.Cons 6 List.Nil)))))
-        """;
+            """;
    }

    @Benchmark
--- a/lib/rust/parser/debug/src/lib.rs
+++ b/lib/rust/parser/debug/src/lib.rs
@ -185,3 +185,47 @@ fn tuplify(value: Value) -> Value {
    let cdr = tuplify(cdr);
    Value::Cons(lexpr::Cons::new(car, cdr))
 }
+
+
+
+// ========================
+// === Span consistency ===
+// ========================
+
+/// Check the internal consistency of the `Tree` and `Token` spans from the given root, and validate
+/// that every character in the given range is covered exactly once in the token spans.
+pub fn validate_spans(tree: &enso_parser::syntax::tree::Tree, expected_span: std::ops::Range<u32>) {
+    let mut sum_span = None;
+    fn concat<T: PartialEq + std::fmt::Debug + Copy>(
+        a: &Option<std::ops::Range<T>>,
+        b: &std::ops::Range<T>,
+    ) -> std::ops::Range<T> {
+        match a {
+            Some(a) => {
+                assert_eq!(a.end, b.start);
+                a.start..b.end
+            }
+            None => b.clone(),
+        }
+    }
+    sum_span = Some(concat(&sum_span, &tree.span.left_offset.code.range_utf16()));
+    tree.visit_items(|item| match item {
+        enso_parser::syntax::item::Ref::Token(token) => {
+            if !(token.left_offset.is_empty() && token.code.is_empty()) {
+                sum_span = Some(concat(&sum_span, &token.left_offset.code.range_utf16()));
+                sum_span = Some(concat(&sum_span, &token.code.range_utf16()));
+            }
+        }
+        enso_parser::syntax::item::Ref::Tree(tree) => {
+            let children_span =
+                concat(&Some(tree.span.left_offset.code.range_utf16()), &tree.span.range_utf16());
+            validate_spans(tree, children_span.clone());
+            sum_span = Some(concat(&sum_span, &children_span));
+        }
+    });
+    if expected_span.is_empty() {
+        assert!(sum_span.map_or(true, |range| range.is_empty()));
+    } else {
+        assert_eq!(sum_span.unwrap(), expected_span);
+    }
+}
--- a/lib/rust/parser/debug/src/main.rs
+++ b/lib/rust/parser/debug/src/main.rs
@ -40,6 +40,8 @@ fn check_file(path: &str, mut code: &str) {
        code = code_;
    }
    let ast = enso_parser::Parser::new().run(code);
+    let expected_span = 0..(code.encode_utf16().count() as u32);
+    enso_parser_debug::validate_spans(&ast, expected_span);
    for (parsed, original) in ast.code().lines().zip(code.lines()) {
        assert_eq!(parsed, original, "Bug: dropped tokens, while parsing: {path}");
    }
--- a/lib/rust/parser/debug/tests/metadata/mod.rs
+++ b/lib/rust/parser/debug/tests/metadata/mod.rs
@ -10,7 +10,7 @@ parse_json1 = Json.parse 3

 main =
 ## The file contains three different sheets relating to operations of an
-online store.
+   online store.
 operator2 = Enso_Project.data / 3
 ## Read the customers table.
 operator3 = operator2.read_xlsx 3
@ -19,10 +19,10 @@ operator4 = operator2.read_xlsx 3
 ## Read the orders history.
 operator5 = operator2.read_xlsx 3
 ## Index the items table on `Item ID`. This will allow this data to be
-joined to other tables that also contain Item IDs.
+   joined to other tables that also contain Item IDs.
 operator7 = operator4.set_index 3
 ## Join the item data to the order history, to get information on item
-prices in the orders table.
+   prices in the orders table.
 operator8 = operator5.join operator7 3
 operator1 = operator8.at 3
 operator9 = operator8.at 3
@ -30,20 +30,20 @@ operator9 = operator8.at 3
 product1 = operator1 * operator9
 operator10 = operator8.set 3 product1
 ## Group all orders by the Customer ID, to compute the total value of orders
-placed by each client.
+   placed by each client.
 operator11 = operator10.group by=3
 operator12 = operator11.at 3
 ## Compute the lifetime value of each client.
 operator13 = operator12.sum
 operator14 = operator13.rename 3
 ## Index the customers table by Customer ID. This will allow this table
-to be joined to other tables that also contain Customer IDs.
+   to be joined to other tables that also contain Customer IDs.
 operator15 = operator3.set_index 3
 ## Join the customer data into orders table, to include names in the final
-ranking.
+   ranking.
 operator16 = operator14.join operator15
 ## Sort the customers by their lifetime value, with the most valuable
-customers at the start of the table.
+   customers at the start of the table.
 operator17 = operator16.sort by=3 order=Sort_Order.Descending


--- a/lib/rust/parser/debug/tests/parse.rs
+++ b/lib/rust/parser/debug/tests/parse.rs
@ -135,22 +135,17 @@ fn doc_comments() {
         (Function (Ident id) #((() (Ident x) () ())) "=" (Ident x)))]);
    #[rustfmt::skip]
    let lines = vec![
-        " ## Test indent handling",
-        " foo",
-    ];
-    #[rustfmt::skip]
-    test!(&lines.join("\n"), (Documented (#((Section " Test indent handling")) #(())) (Ident foo)));
-    #[rustfmt::skip]
-    let lines = vec![
+        "type Foo",
        " ## Test indent handling",
        "  ",
        " foo",
    ];
    #[rustfmt::skip]
    test!(&lines.join("\n"),
-        (Documented
-         (#((Section " Test indent handling")) #(() ()))
-         (Ident foo)));
+        (TypeDef type Foo #() #(
+         (Documented
+          (#((Section " Test indent handling")) #(() ()))
+          (Ident foo)))));
 }


@ -329,7 +324,7 @@ fn assignment_simple() {

 #[test]
 fn function_inline_simple_args() {
-    test(" foo a = x", block![(Function (Ident foo) #((() (Ident a) () ())) "=" (Ident x))]);
+    test("foo a = x", block![(Function (Ident foo) #((() (Ident a) () ())) "=" (Ident x))]);
    #[rustfmt::skip]
    test("foo a b = x",
         block![(Function (Ident foo) #((() (Ident a) () ()) (() (Ident b) () ())) "=" (Ident x))]);
@ -340,7 +335,7 @@ fn function_inline_simple_args() {
             #((() (Ident a) () ()) (() (Ident b) () ()) (() (Ident c) () ()))
             "=" (Ident x))],
    );
-    test(" foo _ = x", block![(Function (Ident foo) #((() (Wildcard -1) () ())) "=" (Ident x))]);
+    test("foo _ = x", block![(Function (Ident foo) #((() (Wildcard -1) () ())) "=" (Ident x))]);
 }

 #[test]
@ -578,6 +573,11 @@ fn operator_section_in_operator_block() {
    test(&code.join("\n"), expected);
 }

+#[test]
+fn first_line_indented() {
+    expect_invalid_node(" a");
+}
+

 // === Binary Operators ===

@ -710,24 +710,21 @@ fn unary_operator_at_end_of_expression() {

 #[test]
 fn unspaced_operator_sequence() {
-    let cases = [
-        // Add a negated value.
-        ("x = y+-z", block![
-            (Assignment (Ident x) "=" (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" (Ident z))))]),
-        // Create an operator section that adds a negated value to its input.
-        ("x = +-z", block![
-            (Assignment (Ident x) "=" (OprSectionBoundary 1
-                (OprApp () (Ok "+") (UnaryOprApp "-" (Ident z)))))]),
-        // Create an operator section that adds its input, negated, to a value.
-        ("x = y+-", block![
-            (Assignment (Ident x) "=" (OprSectionBoundary 1
-                (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" ()))))]),
-        // Assign a negative number to x.
-        ("x=-1", block![(Assignment (Ident x) "=" (UnaryOprApp "-" (Number () "1" ())))]),
-        // Assign a negated value to x.
-        ("x=-y", block![(Assignment (Ident x) "=" (UnaryOprApp "-" (Ident y)))]),
-    ];
-    cases.into_iter().for_each(|(code, expected)| test(code, expected));
+    // Add a negated value.
+    test!("x = y+-z",
+        (Assignment (Ident x) "=" (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" (Ident z)))));
+    // Create an operator section that adds a negated value to its input.
+    test!("x = +-z",
+        (Assignment (Ident x) "=" (OprSectionBoundary 1
+            (OprApp () (Ok "+") (UnaryOprApp "-" (Ident z))))));
+    // Create an operator section that adds its input, negated, to a value.
+    test!("x = y+-",
+        (Assignment (Ident x) "=" (OprSectionBoundary 1
+            (OprApp (Ident y) (Ok "+") (UnaryOprApp "-" ())))));
+    // Assign a negative number to x.
+    test!("x=-1", (Assignment (Ident x) "=" (UnaryOprApp "-" (Number () "1" ()))));
+    // Assign a negated value to x.
+    test!("x=-y", (Assignment (Ident x) "=" (UnaryOprApp "-" (Ident y))));
 }

 #[test]
@ -891,7 +888,7 @@ fn metadata_raw() {
 fn metadata_parsing() {
    let code = metadata::ORDERS_WITH_METADATA;
    let (meta, code) = enso_parser::metadata::parse(code).unwrap();
-    let _ast = enso_parser::Parser::new().run(code);
+    let _ast = parse(code);
    let _meta: enso_parser::metadata::Metadata = meta.unwrap();
 }

@ -989,8 +986,7 @@ x"#;
        (Ident x)
    ];
    test(code, expected);
-
-    let code = "  x = \"\"\"\n    Indented multiline\n  x";
+    let code = "x = \"\"\"\n    Indented multiline\nx";
    #[rustfmt::skip]
    let expected = block![
        (Assignment (Ident x) "=" (TextLiteral #((Section "Indented multiline"))))
@ -1153,6 +1149,27 @@ fn case_expression() {
    test(&code.join("\n"), expected);
 }

+#[test]
+fn case_documentation() {
+    #[rustfmt::skip]
+    let code = [
+        "case a of",
+        "    ## The Some case",
+        "    Some -> x",
+        "    ## The Int case",
+        "    Int -> x",
+    ];
+    #[rustfmt::skip]
+    let expected = block![
+        (CaseOf (Ident a) #(
+            (((#((Section " The Some case")) #()) () () ()))
+            ((() (Ident Some) "->" (Ident x)))
+            (((#((Section " The Int case")) #()) () () ()))
+            ((() (Ident Int) "->" (Ident x)))))
+    ];
+    test(&code.join("\n"), expected);
+}
+
 #[test]
 fn case_by_type() {
    macro_rules! test_case {
@ -1247,34 +1264,50 @@ fn tuple_literals() {

 // === Numeric literals ===

-#[test]
-fn numbers() {
-    test!("1 . 0", (OprApp (Number () "1" ()) (Ok ".") (Number () "0" ())));
-    test!("1 .0",
-        (App (Number () "1" ()) (OprSectionBoundary 1 (OprApp () (Ok ".") (Number () "0" ())))));
-    test!("1. 0",
-        (OprSectionBoundary 1 (App (OprApp (Number () "1" ()) (Ok ".") ()) (Number () "0" ()))));
-    test!("0b10101010", (Number "0b" "10101010" ()));
-    test!("0o122137", (Number "0o" "122137" ()));
-    test!("0xAE2F14", (Number "0x" "AE2F14" ()));
-    test!("pi = 3.14", (Assignment (Ident pi) "=" (Number () "3" ("." "14"))));
-    test!("0.0.x", (OprApp (Number () "0" ("." "0")) (Ok ".") (Ident x)));
-}
+#[cfg(test)]
+mod numbers {
+    use super::*;

-#[test]
-// This syntax cannot be used until we remove old-nondecimal number support, which is
-// needed for compatibility until the old parser is fully replaced.
-#[ignore]
-fn new_delimited_numbers() {
-    test!("100_000", (Number () "100_000" ()));
-    test!("10_000.99", (Number () "10_000" ("." "99")));
-}
+    #[test]
+    fn with_decimal() {
+        test!("1 . 0", (OprApp (Number () "1" ()) (Ok ".") (Number () "0" ())));
+        test!("1 .0",
+            (App (Number () "1" ()) (OprSectionBoundary 1 (OprApp () (Ok ".") (Number () "0" ())))));
+        test!("1. 0",
+            (OprSectionBoundary 1 (App (OprApp (Number () "1" ()) (Ok ".") ()) (Number () "0" ()))));
+        test!("pi = 3.14", (Assignment (Ident pi) "=" (Number () "3" ("." "14"))));
+        test!("0.0.x", (OprApp (Number () "0" ("." "0")) (Ok ".") (Ident x)));
+    }

-#[test]
-fn old_nondecimal_numbers() {
-    test!("2_01101101", (Number "2_" "01101101" ()));
-    test!("-2_01101101", (UnaryOprApp "-" (Number "2_" "01101101" ())));
-    test!("16_17ffffffffffffffa", (Number "16_" "17ffffffffffffffa" ()));
+    #[test]
+    fn with_base() {
+        test!("0b10101010", (Number "0b" "10101010" ()));
+        test!("0o122137", (Number "0o" "122137" ()));
+        test!("0xAE2F14", (Number "0x" "AE2F14" ()));
+    }
+
+    #[test]
+    fn base_only() {
+        test!("0x", (Number "0x" () ()));
+        test!("0b", (Number "0b" () ()));
+        test!("0o", (Number "0o" () ()));
+    }
+
+    #[test]
+    // This syntax cannot be used until we remove old-nondecimal number support, which is
+    // needed for compatibility until the old parser is fully replaced.
+    #[ignore]
+    fn new_delimited() {
+        test!("100_000", (Number () "100_000" ()));
+        test!("10_000.99", (Number () "10_000" ("." "99")));
+    }
+
+    #[test]
+    fn old_nondecimal() {
+        test!("2_01101101", (Number "2_" "01101101" ()));
+        test!("-2_01101101", (UnaryOprApp "-" (Number "2_" "01101101" ())));
+        test!("16_17ffffffffffffffa", (Number "16_" "17ffffffffffffffa" ()));
+    }
 }


@ -1538,12 +1571,19 @@ fn expect_tree_representing_code(code: &str, ast: &enso_parser::syntax::Tree) {
 ///   example, a `token::Number` may be represented like: `sexp![10]`, and a `token::Ident` may look
 ///   like `sexp![foo]`.
 fn test(code: &str, expect: lexpr::Value) {
-    let ast = enso_parser::Parser::new().run(code);
+    let ast = parse(code);
    let ast_s_expr = to_s_expr(&ast, code);
    assert_eq!(ast_s_expr.to_string(), expect.to_string(), "{:?}", &ast);
    expect_tree_representing_code(code, &ast);
 }

+fn parse(code: &str) -> enso_parser::syntax::tree::Tree {
+    let ast = enso_parser::Parser::new().run(code);
+    let expected_span = 0..(code.encode_utf16().count() as u32);
+    enso_parser_debug::validate_spans(&ast, expected_span);
+    ast
+}
+

 // === Testing inputs containing syntax errors ===

@ -1555,7 +1595,7 @@ struct Errors {

 impl Errors {
    fn collect(code: &str) -> Self {
-        let ast = enso_parser::Parser::new().run(code);
+        let ast = parse(code);
        expect_tree_representing_code(code, &ast);
        let errors = core::cell::Cell::new(Errors::default());
        ast.map(|tree| match &*tree.variant {
--- a/lib/rust/parser/src/lexer.rs
+++ b/lib/rust/parser/src/lexer.rs
@ -657,7 +657,7 @@ impl<'s> Lexer<'s> {
            match token.code.as_ref() {
                // Special-case: Split into multiple operators.
                "+-" => {
-                    let (left, right) = token.split_at_(Bytes(1));
+                    let (left, right) = token.split_at(code::Length::of("+"));
                    let lhs = analyze_operator(&left.code);
                    self.submit_token(left.with_variant(token::Variant::operator(lhs)));
                    // The `-` in this case is not identical to a free `-`: It is only allowed a
@ -886,23 +886,25 @@ impl<'s> Lexer<'s> {
        if let Some(token) = token {
            if let Some(base) = base {
                self.submit_token(token.with_variant(token::Variant::number_base()));
-                let token = match base {
+                if let Some(digits) = match base {
                    token::Base::Binary => self.token(|this| this.take_while(is_binary_digit)),
                    token::Base::Octal => self.token(|this| this.take_while(is_octal_digit)),
                    token::Base::Hexadecimal =>
                        self.token(|this| this.take_while(is_hexadecimal_digit)),
-                };
-                let joiner = token::OperatorProperties::new()
-                    .with_binary_infix_precedence(u32::MAX)
-                    .as_token_joiner();
-                self.submit_token(Token(
-                    Code::empty_without_offset(),
-                    Code::empty_without_offset(),
-                    token::Variant::operator(joiner),
-                ));
-                // Every number has a digits-token, even if it's zero-length.
-                let token = token.unwrap_or_default();
-                self.submit_token(token.with_variant(token::Variant::digits(Some(base))));
+                } {
+                    // The base and the digits are separate tokens so that they can have separate
+                    // spans. A pseudo-token binds them together tightly so that the parser can
+                    // assemble them into one number node.
+                    let joiner = token::OperatorProperties::new()
+                        .with_binary_infix_precedence(u32::MAX)
+                        .as_token_joiner();
+                    self.submit_token(Token(
+                        Code::empty(self.current_offset.utf16),
+                        Code::empty(self.current_offset.utf16),
+                        token::Variant::operator(joiner),
+                    ));
+                    self.submit_token(digits.with_variant(token::Variant::digits(Some(base))));
+                }
            } else {
                self.submit_token(token.with_variant(token::Variant::digits(None)));
            }
@ -1076,11 +1078,19 @@ impl<'s> Lexer<'s> {
                }
                if let Some(indent) = new_indent {
                    if indent <= *block_indent {
-                        self.output.push(Token::from(token::text_end(
-                            Code::empty_without_offset(),
-                            Code::empty_without_offset(),
-                        )));
-                        self.end_blocks(indent);
+                        let text_end = {
+                            let location = newlines
+                                .first()
+                                .as_ref()
+                                .unwrap()
+                                .left_offset
+                                .code
+                                .position_before();
+                            let offset = Offset(VisibleOffset(0), location.clone());
+                            Token(offset, location, token::Variant::text_end())
+                        };
+                        self.output.push(text_end);
+                        self.end_blocks(indent, newlines.first().as_ref().unwrap());
                        self.output.extend(newlines);
                        if self.current_offset == text_start.0 {
                            self.last_spaces_visible_offset = text_start.1.visible;
@ -1152,7 +1162,10 @@ impl<'s> Lexer<'s> {
            let close_quote_end = self.mark();
            self.make_token(text_end, close_quote_end, token::Variant::text_end())
        } else {
-            Token::from(token::text_end(Code::empty_without_offset(), Code::empty_without_offset()))
+            Token::from(token::text_end(
+                Code::empty(self.current_offset.utf16),
+                Code::empty(self.current_offset.utf16),
+            ))
        };
        self.output.push(end_token);
        TextEndedAt::End
@ -1327,20 +1340,24 @@ impl<'s> Lexer<'s> {
        while let Some(token) = self.line_break() {
            newlines.push(token.with_variant(token::Variant::newline()));
        }
-        if !newlines.is_empty() {
+        if let Some(last) = newlines.last() {
            let block_indent = self.last_spaces_visible_offset;
            if block_indent > self.current_block_indent {
-                let block_start = self.marker_token(token::Variant::block_start());
+                let block_start = {
+                    let location = last.left_offset.code.position_before();
+                    let offset = Offset(VisibleOffset(0), location.clone());
+                    Token(offset, location, token::Variant::block_start())
+                };
                self.submit_token(block_start);
                self.start_block(block_indent);
            }
-            self.end_blocks(block_indent);
+            self.end_blocks(block_indent, newlines.first().as_ref().unwrap());
            newlines.drain(..).for_each(|token| self.submit_token(token));
        }
        self.token_storage.set_from(newlines);
    }

-    fn end_blocks(&mut self, block_indent: VisibleOffset) {
+    fn end_blocks(&mut self, block_indent: VisibleOffset, newline: &Token<'s>) {
        while block_indent < self.current_block_indent {
            let Some(previous_indent) = self.block_indent_stack.last().copied() else {
                // If the file starts at indent > 0, we treat that as the root indent level
@ -1355,7 +1372,11 @@ impl<'s> Lexer<'s> {
                break;
            }
            self.end_block();
-            let block_end = self.marker_token(token::Variant::block_end());
+            let block_end = {
+                let location = newline.left_offset.code.position_before();
+                let offset = Offset(VisibleOffset(0), location.clone());
+                Token(offset, location, token::Variant::block_end())
+            };
            self.submit_token(block_end);
        }
    }
@ -1385,22 +1406,23 @@ impl<'s> Lexer<'s> {
    /// Run the lexer. Return non-hierarchical list of tokens (the token groups will be represented
    /// as start and end tokens).
    pub fn run(mut self) -> ParseResult<Vec<Token<'s>>> {
+        // If the first line is indented, open a block for it.
        self.spaces_after_lexeme();
-        self.current_block_indent = self.last_spaces_visible_offset;
-        let mut any_parser_matched = true;
-        while any_parser_matched {
-            any_parser_matched = false;
-            for f in PARSERS {
-                if self.run_and_check_if_progressed(f) {
-                    any_parser_matched = true;
-                    break;
-                }
-            }
+        let first_block_indent = self.last_spaces_visible_offset;
+        if first_block_indent.width_in_spaces != 0 {
+            self.submit_token(token::block_start(Code::empty(0), Code::empty(0)).into());
+            self.start_block(first_block_indent);
+            self.submit_token(token::newline(Code::empty(0), Code::empty(0)).into());
        }
+        // Main parsing loop.
+        while PARSERS.iter().any(|f| self.run_and_check_if_progressed(f)) {}
+        // If any blocks were still open at EOF, close them.
        while self.end_block().is_some() {
            let block_end = self.marker_token(token::Variant::block_end());
            self.submit_token(block_end);
        }
+        // If the last line ended in whitespace, ensure it is represented; we'll attach it to a
+        // phantom newline token.
        if self.last_spaces_visible_offset != VisibleOffset(0) {
            let left_offset_start = self.current_offset - self.last_spaces_offset;
            let offset_code = self.input.slice(left_offset_start.utf8..self.current_offset.utf8);
@ -1412,13 +1434,14 @@ impl<'s> Lexer<'s> {
            let eof = token::variant::Variant::Newline(token::variant::Newline());
            self.submit_token(Token(offset, Code::empty(self.current_offset.utf16), eof));
        }
+        // Sanity check.
        let mut internal_error = self.internal_error.take();
        if self.current_char.is_some() {
            let message = format!("Lexer did not consume all input. State: {self:?}");
            internal_error.get_or_insert(message);
        }
+
        let value = self.output;
-        trace!("Tokens:\n{:#?}", value);
        ParseResult { value, internal_error }
    }
 }
@ -1491,9 +1514,30 @@ mod tests {
        }
    }

+    /// Lex the input, check the spans for consistency, and return the tokens with the span offsets
+    /// stripped.
+    ///
+    /// Consistency means: taking each token's left offset followed by its code, in output order,
+    /// every UTF-16 range starts exactly where the previous one ended, and together the ranges
+    /// cover `0..` the length of `input` in UTF-16 code units. Any violation fails an
+    /// `assert_eq!`. The result of `run` is unwrapped.
+    fn lex_and_validate_spans(input: &str) -> Vec<Token> {
+        let result: Vec<_> = run(input).unwrap();
+        let mut sum_span = None;
+        /// Extend the accumulated range `a` with `b`, asserting that `b` starts where `a` ends.
+        /// When nothing has been accumulated yet, the result is `b` itself.
+        fn concat<T: PartialEq + Debug + Copy>(a: &Option<Range<T>>, b: &Range<T>) -> Range<T> {
+            match a {
+                Some(a) => {
+                    assert_eq!(a.end, b.start);
+                    a.start..b.end
+                }
+                None => b.clone(),
+            }
+        }
+        for token in &result {
+            // Each token contributes two adjacent ranges: its left offset, then its code.
+            sum_span = Some(concat(&sum_span, &token.left_offset.code.range_utf16()));
+            sum_span = Some(concat(&sum_span, &token.code.range_utf16()));
+        }
+        // With no tokens, `sum_span` is still `None` and defaults to the empty range `0..0`.
+        assert_eq!(sum_span.unwrap_or_default(), 0..(input.encode_utf16().count() as u32));
+        result.into_iter().map(|token| token.without_offsets()).collect()
+    }
+
    fn test_lexer<'s>(input: &'s str, expected: Vec<Token<'s>>) {
-        let result: Vec<_> =
-            run(input).unwrap().into_iter().map(|token| token.without_offsets()).collect();
+        let result = lex_and_validate_spans(input);
        let expected: Vec<_> = expected.into_iter().map(|token| token.without_offsets()).collect();
        assert_eq!(result, expected);
    }
@ -1517,23 +1561,21 @@ mod tests {
    #[test]
    fn test_case_block() {
        let newline = newline_(empty(), test_code("\n"));
-        test_lexer_many(vec![
-            ("\n", vec![newline_(empty(), test_code("\n"))]),
-            ("\n  foo\n  bar", vec![
-                block_start_(empty(), empty()),
-                newline.clone(),
-                ident_("  ", "foo"),
-                newline.clone(),
-                ident_("  ", "bar"),
-                block_end_(empty(), empty()),
-            ]),
-            ("foo\n    +", vec![
-                ident_("", "foo"),
-                block_start_(empty(), empty()),
-                newline,
-                operator_("    ", "+"),
-                block_end_(empty(), empty()),
-            ]),
+        test_lexer("\n", vec![newline_(empty(), test_code("\n"))]);
+        test_lexer("\n  foo\n  bar", vec![
+            block_start_(empty(), empty()),
+            newline.clone(),
+            ident_("  ", "foo"),
+            newline.clone(),
+            ident_("  ", "bar"),
+            block_end_(empty(), empty()),
+        ]);
+        test_lexer("foo\n    +", vec![
+            ident_("", "foo"),
+            block_start_(empty(), empty()),
+            newline,
+            operator_("    ", "+"),
+            block_end_(empty(), empty()),
        ]);
    }

@ -1541,21 +1583,29 @@ mod tests {
    fn test_case_block_bad_indents() {
        let newline = newline_(empty(), test_code("\n"));
        #[rustfmt::skip]
-        test_lexer_many(vec![
-            ("\n  foo\n bar\nbaz", vec![
-                block_start_(empty(), empty()),
-                newline.clone(), ident_("  ", "foo"),
-                newline.clone(), ident_(" ", "bar"),
-                block_end_(empty(), empty()),
-                newline.clone(), ident_("", "baz"),
-            ]),
-            ("\n  foo\n bar\n  baz", vec![
-                block_start_(empty(), empty()),
-                newline.clone(), ident_("  ", "foo"),
-                newline.clone(), ident_(" ", "bar"),
-                newline, ident_("  ", "baz"),
-                block_end_(empty(), empty()),
-            ]),
+        test_lexer("  foo\n  bar\nbaz", vec![
+            block_start_(empty(), empty()),
+            newline_(empty(), empty()),
+            ident_("  ", "foo"),
+            newline.clone(), ident_("  ", "bar"),
+            block_end_(empty(), empty()),
+            newline.clone(), ident_("", "baz"),
+        ]);
+        #[rustfmt::skip]
+        test_lexer("\n  foo\n bar\nbaz", vec![
+            block_start_(empty(), empty()),
+            newline.clone(), ident_("  ", "foo"),
+            newline.clone(), ident_(" ", "bar"),
+            block_end_(empty(), empty()),
+            newline.clone(), ident_("", "baz"),
+        ]);
+        #[rustfmt::skip]
+        test_lexer("\n  foo\n bar\n  baz", vec![
+            block_start_(empty(), empty()),
+            newline.clone(), ident_("  ", "foo"),
+            newline.clone(), ident_(" ", "bar"),
+            newline, ident_("  ", "baz"),
+            block_end_(empty(), empty()),
        ]);
    }

@ -1594,12 +1644,10 @@ mod tests {

    #[test]
    fn test_case_idents() {
-        test_lexer_many(vec![
-            ("", vec![]),
-            ("_", vec![wildcard_("", "_")]),
-            ("_'", vec![wildcard_("", "_'")]),
-            ("_''", vec![wildcard_("", "_''")]),
-        ]);
+        test_lexer("", vec![]);
+        test_lexer("_", vec![wildcard_("", "_")]);
+        test_lexer("_'", vec![wildcard_("", "_'")]);
+        test_lexer("_''", vec![wildcard_("", "_''")]);
        test_lexer_many(lexer_case_idents(&[
            "a",
            "a'",
@ -1629,7 +1677,7 @@ mod tests {
    #[test]
    fn test_case_operators() {
        test_lexer_many(lexer_case_operators(&["+", "-", "=", "==", "===", ":", ","]));
-        assert_eq!(run("+-").unwrap().len(), 2);
+        assert_eq!(lex_and_validate_spans("+-").len(), 2);
    }

    /// Based on https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt.
@ -1777,6 +1825,12 @@ mod tests {
            /* 5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = */ "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"
        }
    }
+
+    /// Check that the token spans are consistent for a `## Foo.` line followed by `main = 23`.
+    /// The returned tokens are discarded; the assertions are those made inside
+    /// `lex_and_validate_spans`.
+    #[test]
+    fn test_doc_comment() {
+        let code = ["## Foo.", "main = 23"].join("\n");
+        lex_and_validate_spans(&code);
+    }
 }


--- a/lib/rust/parser/src/lib.rs
+++ b/lib/rust/parser/src/lib.rs
@ -201,7 +201,6 @@ impl Default for Parser {
 /// interpreted as a variable assignment or method definition.
 fn expression_to_statement(mut tree: syntax::Tree<'_>) -> syntax::Tree<'_> {
    use syntax::tree::*;
-    let mut left_offset = tree.span.left_offset.position_before();
    if let Tree { variant: box Variant::Annotated(annotated), .. } = &mut tree {
        annotated.expression = annotated.expression.take().map(expression_to_statement);
        return tree;
@ -214,14 +213,22 @@ fn expression_to_statement(mut tree: syntax::Tree<'_>) -> syntax::Tree<'_> {
        documented.expression = documented.expression.take().map(expression_to_statement);
        return tree;
    }
-    if let Tree { variant: box Variant::TypeAnnotated(annotated), span } = tree {
-        let colon = annotated.operator;
-        let type_ = annotated.type_;
-        let variable = annotated.expression;
-        let mut tree = Tree::type_signature(variable, colon, type_);
-        tree.span.left_offset += span.left_offset;
+    if let Tree { variant: box Variant::TypeAnnotated(annotated), .. } = tree {
+        let TypeAnnotated { expression, operator, type_ } = annotated;
+        tree.variant = Box::new(Variant::TypeSignature(TypeSignature {
+            variable: expression,
+            operator,
+            type_,
+        }));
        return tree;
    }
+    if matches!(&tree, Tree {
+        variant: box Variant::ArgumentBlockApplication(ArgumentBlockApplication { lhs: None, .. }),
+        ..
+    }) {
+        return tree.with_error("Expected expression before indented block.");
+    }
+    let mut left_offset = tree.span.left_offset.position_before();
    let tree_ = &mut tree;
    let opr_app = match tree_ {
        Tree { variant: box Variant::OprApp(opr_app), span } => {
--- a/lib/rust/parser/src/macros/built_in.rs
+++ b/lib/rust/parser/src/macros/built_in.rs
@ -82,7 +82,10 @@ fn import_body<'s>(
        let field = match header.code.as_ref() {
            "polyglot" => {
                body = Some(
-                    precedence.resolve(tokens).map(expect_ident).unwrap_or_else(expected_nonempty),
+                    precedence
+                        .resolve(tokens)
+                        .map(expect_ident)
+                        .unwrap_or_else(|| expected_nonempty(header.code.position_after())),
                );
                &mut polyglot
            }
@ -91,7 +94,7 @@ fn import_body<'s>(
                    precedence
                        .resolve(tokens)
                        .map(expect_qualified)
-                        .unwrap_or_else(expected_nonempty),
+                        .unwrap_or_else(|| expected_nonempty(header.code.position_after())),
                );
                &mut from
            }
@ -112,14 +115,17 @@ fn import_body<'s>(
            }
            "as" => {
                body = Some(
-                    precedence.resolve(tokens).map(expect_ident).unwrap_or_else(expected_nonempty),
+                    precedence
+                        .resolve(tokens)
+                        .map(expect_ident)
+                        .unwrap_or_else(|| expected_nonempty(header.code.position_after())),
                );
                &mut as_
            }
            "hiding" => {
                body = Some(
                    sequence_tree(precedence, tokens, expect_ident)
-                        .unwrap_or_else(expected_nonempty),
+                        .unwrap_or_else(|| expected_nonempty(header.code.position_after())),
                );
                &mut hiding
            }
@ -175,7 +181,7 @@ fn export_body<'s>(
                    precedence
                        .resolve(tokens)
                        .map(expect_qualified)
-                        .unwrap_or_else(expected_nonempty),
+                        .unwrap_or_else(|| expected_nonempty(header.code.position_after())),
                );
                &mut from
            }
@ -196,14 +202,17 @@ fn export_body<'s>(
            }
            "as" => {
                body = Some(
-                    precedence.resolve(tokens).map(expect_ident).unwrap_or_else(expected_nonempty),
+                    precedence
+                        .resolve(tokens)
+                        .map(expect_ident)
+                        .unwrap_or_else(|| expected_nonempty(header.code.position_after())),
                );
                &mut as_
            }
            "hiding" => {
                body = Some(
                    sequence_tree(precedence, tokens, expect_ident)
-                        .unwrap_or_else(expected_nonempty),
+                        .unwrap_or_else(|| expected_nonempty(header.code.position_after())),
                );
                &mut hiding
            }
@ -438,10 +447,9 @@ fn case_body<'s>(
            _ => initial_case.push(item),
        }
    }
-    if let Some(_first) = initial_case.first() {
-        // FIXME: Create 0-length span at offset preceding `_first`.
-        let newline =
-            syntax::token::newline(Code::empty_without_offset(), Code::empty_without_offset());
+    if !initial_case.is_empty() {
+        let location = of_.code.position_after();
+        let newline = syntax::token::newline(location.clone(), location);
        case_builder.push(syntax::item::Line { newline, items: initial_case });
    }
    block.into_iter().for_each(|line| case_builder.push(line));
@ -825,10 +833,10 @@ fn expect_qualified(tree: syntax::Tree) -> syntax::Tree {
    }
 }

-fn expected_nonempty<'s>() -> syntax::Tree<'s> {
+fn expected_nonempty(location: Code) -> syntax::Tree {
    let empty = syntax::Tree::ident(syntax::token::ident(
-        Code::empty_without_offset(),
-        Code::empty_without_offset(),
+        location.clone(),
+        location,
        false,
        0,
        false,
--- a/lib/rust/parser/src/macros/resolver.rs
+++ b/lib/rust/parser/src/macros/resolver.rs
@ -142,24 +142,14 @@ pub struct Resolver<'s> {
 impl<'s> Resolver<'s> {
    /// Create a new resolver, in statement context.
    pub fn new_statement() -> Self {
-        let scopes = default();
-        let open_blocks = vec![syntax::item::Line {
-            newline: token::newline(Code::empty(0), Code::empty(0)),
-            items:   default(),
-        }];
-        let macro_stack = default();
-        let segments = default();
-        let items = default();
-        let context = Context::Statement;
-        let precedence = syntax::operator::Precedence::new();
        Self {
-            blocks: scopes,
-            lines: open_blocks,
-            macros: macro_stack,
-            segments,
-            items,
-            context,
-            precedence,
+            context:    Context::Statement,
+            precedence: syntax::operator::Precedence::new(),
+            blocks:     default(),
+            lines:      default(),
+            macros:     default(),
+            segments:   default(),
+            items:      default(),
        }
    }

@ -169,6 +159,10 @@ impl<'s> Resolver<'s> {
        root_macro_map: &MacroMap,
        tokens: impl IntoIterator<Item = Token<'s>>,
    ) -> syntax::Tree<'s> {
+        self.lines.push(syntax::item::Line {
+            newline: token::newline(Code::empty(0), Code::empty(0)),
+            items:   default(),
+        });
        tokens.into_iter().for_each(|t| self.push(root_macro_map, t));
        self.finish_current_line();
        let lines = self.lines.drain(..).map(|syntax::item::Line { newline, items }| {
@ -233,9 +227,11 @@ impl<'s> Resolver<'s> {
    /// Append a token to the state.
    fn push(&mut self, root_macro_map: &MacroMap, token: Token<'s>) {
        match token.variant {
-            token::Variant::Newline(_) => {
-                self.finish_current_line();
-                let newline = token::newline(token.left_offset, token.code);
+            token::Variant::Newline(newline) => {
+                if !self.lines.is_empty() {
+                    self.finish_current_line();
+                }
+                let newline = token.with_variant(newline);
                self.lines.push(syntax::item::Line { newline, items: default() });
                self.context = Context::Statement;
            }
--- a/lib/rust/parser/src/source/code.rs
+++ b/lib/rust/parser/src/source/code.rs
@ -75,21 +75,24 @@ impl<'s> Code<'s> {
        self.utf16
    }

-    /// Split the UTF-8 code at the given byte offset.
-    pub fn split_at(&self, offset: usize) -> (Self, Self) {
-        let (left, right) = self.repr.split_at(offset);
-        let left_utf16 = left.chars().map(|c| c.len_utf16() as u32).sum();
-        let right_utf16 = self.utf16 - left_utf16;
+    /// Return the start and end of the UTF-16 source code for this element.
+    ///
+    /// The range begins at `offset_utf16` and its length is `utf16`.
+    pub fn range_utf16(&self) -> Range<u32> {
+        self.offset_utf16..(self.offset_utf16 + self.utf16)
+    }
+
+    /// Split the code at the given location.
+    pub fn split_at(&self, split: Length) -> (Self, Self) {
+        let (left, right) = self.repr.split_at(split.utf8);
        (
            Self {
                repr:         StrRef(left),
                offset_utf16: self.offset_utf16,
-                utf16:        left_utf16,
+                utf16:        split.utf16,
            },
            Self {
                repr:         StrRef(right),
-                offset_utf16: self.offset_utf16 + left_utf16,
-                utf16:        right_utf16,
+                offset_utf16: self.offset_utf16 + split.utf16,
+                utf16:        self.utf16 - split.utf16,
            },
        )
    }
@ -209,6 +212,12 @@ pub struct Length {
 }

 impl Length {
+    /// Returns the length of the given input.
+    ///
+    /// Both measures are recorded: `utf8` is the length of `s` in bytes, and `utf16` is its
+    /// length in UTF-16 code units.
+    #[inline(always)]
+    pub fn of(s: &str) -> Self {
+        Self { utf8: s.len(), utf16: s.encode_utf16().count() as u32 }
+    }
+
    /// Returns true if the code is empty.
    #[inline(always)]
    pub fn is_zero(&self) -> bool {
@ -220,6 +229,12 @@ impl Length {
    pub fn utf8_bytes(&self) -> usize {
        self.utf8
    }
+
+    /// Return the length in UTF-16 code units.
+    #[inline(always)]
+    pub fn utf16_len(&self) -> u32 {
+        self.utf16
+    }
 }

 impl Add for Length {
--- a/lib/rust/parser/src/source/span.rs
+++ b/lib/rust/parser/src/source/span.rs
@ -101,7 +101,7 @@ impl<'s> Offset<'s> {

    /// Return a 0-length `Span` representing the position after the end of this `Span`.
    pub fn position_after(&self) -> Self {
-        Self { visible: default(), code: self.code.position_before() }
+        Self { visible: default(), code: self.code.position_after() }
    }

    /// Return this value with its start position removed (set to 0). This can be used to compare
@ -184,6 +184,18 @@ impl<'s> Span<'s> {
    pub fn add<T: Builder<'s>>(self, elem: &mut T) -> Self {
        Builder::add_to_span(elem, self)
    }
+
+    /// Return the start and end of the UTF-16 source code for this element.
+    ///
+    /// The range starts at the position after `left_offset` and extends for `code_length`; the
+    /// left offset itself is not part of the returned range.
+    pub fn range_utf16(&self) -> Range<u32> {
+        let start = self.left_offset.position_after().code.range_utf16().start;
+        let end = start + self.code_length.utf16_len();
+        start..end
+    }
+
+    /// Return the sum of the whitespace length and the code length, i.e. the length of the
+    /// `left_offset` code plus `code_length`.
+    pub fn length_including_whitespace(&self) -> code::Length {
+        self.left_offset.code.length() + self.code_length
+    }
 }

 impl<'s> AsRef<Span<'s>> for Span<'s> {
@ -204,6 +216,11 @@ where
            self.left_offset += other.left_offset;
            self.code_length = other.code_length;
        } else {
+            debug_assert_eq!(
+                self.left_offset.code.position_after().range_utf16().end
+                    + self.code_length.utf16_len(),
+                other.left_offset.code.position_before().range_utf16().start
+            );
            self.code_length += other.left_offset.code.length() + other.code_length;
        }
    }
--- a/lib/rust/parser/src/syntax/operator.rs
+++ b/lib/rust/parser/src/syntax/operator.rs
@ -137,10 +137,10 @@ impl<'s> ExpressionBuilder<'s> {
    pub fn operand(&mut self, operand: Operand<syntax::Tree<'s>>) {
        if self.prev_type == Some(ItemType::Ast) {
            if let Some(Operand { value: syntax::Tree { variant: box
-                    syntax::tree::Variant::TextLiteral(ref mut lhs), .. }, .. }) = self.output.last_mut()
+                    syntax::tree::Variant::TextLiteral(ref mut lhs), span: lhs_span }, .. }) = self.output.last_mut()
                    && !lhs.closed
                    && let box syntax::tree::Variant::TextLiteral(mut rhs) = operand.value.variant {
-                syntax::tree::join_text_literals(lhs, &mut rhs, operand.value.span);
+                syntax::tree::join_text_literals(lhs, &mut rhs, lhs_span, operand.value.span);
                if let syntax::tree::TextLiteral { open: Some(open), newline: None, elements, closed: true, close: None } = lhs
                    && open.code.starts_with('#') {
                    let elements = mem::take(elements);
--- a/lib/rust/parser/src/syntax/token.rs
+++ b/lib/rust/parser/src/syntax/token.rs
@ -135,19 +135,13 @@ impl<'s, T> Token<'s, T> {
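-    /// position, which does not include the [`left_offset`]. It means that `split_at(Bytes(0))`
-    /// will split the token into left offset only and a left-trimmed token.
+    /// position, which does not include the [`left_offset`]. It means that
+    /// `split_at(code::Length::of(""))` will split the token into left offset only and a
+    /// left-trimmed token.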
    #[inline(always)]
-    pub fn split_at(self, offset: Bytes) -> (Token<'s, ()>, Token<'s, ()>, T) {
+    pub fn split_at(self, split: code::Length) -> (Token<'s, ()>, Token<'s, ()>) {
        let left_lexeme_offset = self.left_offset;
-        let right_lexeme_offset = self.code.position_after();
-        let (left_code, right_code) = self.code.split_at(offset.unchecked_raw());
+        let right_lexeme_offset =
+            Code::empty(self.code.position_before().range_utf16().end + split.utf16_len());
+        let (left_code, right_code) = self.code.split_at(split);
        let left = Token(left_lexeme_offset, left_code, ());
        let right = Token(right_lexeme_offset, right_code, ());
-        (left, right, self.variant)
-    }
-
-    /// A version of [`split_at`] that discards the associated variant.
-    #[inline(always)]
-    pub fn split_at_(self, offset: Bytes) -> (Token<'s, ()>, Token<'s, ()>) {
-        let (left, right, _) = self.split_at(offset);
        (left, right)
    }

--- a/lib/rust/parser/src/syntax/tree.rs
+++ b/lib/rust/parser/src/syntax/tree.rs
@ -608,7 +608,7 @@ impl<'s> span::Builder<'s> for ArgumentType<'s> {

 // === CaseOf ===

-/// A that may contain a case-expression in a case-of expression.
+/// A line that may contain a case-expression in a case-of expression.
 #[derive(Clone, Debug, Default, Eq, PartialEq, Visitor, Serialize, Reflect, Deserialize)]
 pub struct CaseLine<'s> {
    /// The token beginning the line. This will always be present, unless the first case-expression
@ -661,7 +661,10 @@ impl<'s> Case<'s> {

 impl<'s> span::Builder<'s> for Case<'s> {
    fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> {
-        span.add(&mut self.pattern).add(&mut self.arrow).add(&mut self.expression)
+        span.add(&mut self.documentation)
+            .add(&mut self.pattern)
+            .add(&mut self.arrow)
+            .add(&mut self.expression)
    }
 }

@ -755,20 +758,23 @@ impl<'s> span::Builder<'s> for OperatorDelimitedTree<'s> {
 pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> {
    match (&mut *func.variant, &mut *arg.variant) {
        (Variant::Annotated(func_ @ Annotated { argument: None, .. }), _) => {
+            func.span.code_length += arg.span.length_including_whitespace();
            func_.argument = maybe_apply(mem::take(&mut func_.argument), arg).into();
            func
        }
        (Variant::AnnotatedBuiltin(func_), _) => {
+            func.span.code_length += arg.span.length_including_whitespace();
            func_.expression = maybe_apply(mem::take(&mut func_.expression), arg).into();
            func
        }
-        (Variant::OprApp(OprApp { lhs: Some(_), opr: Ok(_), rhs }),
-                Variant::ArgumentBlockApplication(ArgumentBlockApplication { lhs: None, arguments }))
-        if rhs.is_none() => {
+        (Variant::OprApp(OprApp { lhs: Some(_), opr: Ok(_), rhs: rhs @ None }),
+                Variant::ArgumentBlockApplication(ArgumentBlockApplication { lhs: None, arguments })) => {
+            func.span.code_length += arg.span.length_including_whitespace();
            *rhs = block::body_from_lines(mem::take(arguments)).into();
            func
        }
        (_, Variant::ArgumentBlockApplication(block)) if block.lhs.is_none() => {
+            arg.span.code_length += arg.span.left_offset.code.length() + func.span.code_length;
            let func_left_offset = func.span.left_offset.take_as_prefix();
            let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset);
            if let Some(first) = block.arguments.first_mut() {
@ -778,6 +784,7 @@ pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> {
            arg
        }
        (_, Variant::OperatorBlockApplication(block)) if block.lhs.is_none() => {
+            arg.span.code_length += arg.span.left_offset.code.length() + func.span.code_length;
            let func_left_offset = func.span.left_offset.take_as_prefix();
            let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset);
            if let Some(first) = block.expressions.first_mut() {
@ -822,8 +829,10 @@ fn maybe_apply<'s>(f: Option<Tree<'s>>, x: Tree<'s>) -> Tree<'s> {
 pub fn join_text_literals<'s>(
    lhs: &mut TextLiteral<'s>,
    rhs: &mut TextLiteral<'s>,
+    lhs_span: &mut Span<'s>,
    rhs_span: Span<'s>,
 ) {
+    lhs_span.code_length += rhs_span.length_including_whitespace();
    match rhs.elements.first_mut() {
        Some(TextElement::Section { text }) => text.left_offset += rhs_span.left_offset,
        Some(TextElement::Escape { token }) => token.left_offset += rhs_span.left_offset,
@ -863,6 +872,7 @@ pub fn apply_operator<'s>(
                Variant::Number(Number { base: None, integer, fractional_digits })) => {
                func_.integer = mem::take(integer);
                func_.fractional_digits = mem::take(fractional_digits);
+                lhs_.span.code_length += rhs_.span.code_length;
                lhs.take().unwrap()
            }
            _ => {
@ -901,6 +911,7 @@ pub fn apply_operator<'s>(
    {
        let dot = opr.clone();
        let digits = digits.clone();
+        lhs.span.code_length += dot.code.length() + rhs.span.code_length;
        lhs_.fractional_digits = Some(FractionalDigits { dot, digits });
        return lhs.clone();
    }
@ -912,8 +923,7 @@ pub fn apply_operator<'s>(
                }
                let ArgumentBlockApplication { lhs: _, arguments } = block;
                let arguments = mem::take(arguments);
-                let rhs_ = block::body_from_lines(arguments);
-                rhs = Some(rhs_);
+                *rhs_ = block::body_from_lines(arguments);
            }
        }
    }
--- a/lib/rust/parser/src/syntax/tree/block.rs
+++ b/lib/rust/parser/src/syntax/tree/block.rs
@ -88,7 +88,7 @@ where I: Iterator<Item = Line<'s>>
            match line.expression.map(Prefix::try_from) {
                Some(Ok(prefix)) => {
                    match self.prefixes.last_mut() {
-                        Some(prefix) => prefix.newlines().push(line.newline),
+                        Some(prefix) => prefix.push_newline(line.newline),
                        None => self.newline = Some(line.newline),
                    };
                    self.prefixes.push(prefix);
@ -96,7 +96,7 @@ where I: Iterator<Item = Line<'s>>
                Some(Err(mut statement)) => {
                    return Some(match self.prefixes.last_mut() {
                        Some(prefix) => {
-                            prefix.newlines().push(line.newline);
+                            prefix.push_newline(line.newline);
                            for prefix in self.prefixes.drain(..).rev() {
                                statement = prefix.apply_to(statement);
                            }
@ -108,7 +108,7 @@ where I: Iterator<Item = Line<'s>>
                }
                None => {
                    match self.prefixes.last_mut() {
-                        Some(prefix) => prefix.newlines().push(line.newline),
+                        Some(prefix) => prefix.push_newline(line.newline),
                        None => return Some(line.newline.into()),
                    };
                }
@ -154,23 +154,27 @@ impl<'s> TryFrom<Tree<'s>> for Prefix<'s> {
 }

 impl<'s> Prefix<'s> {
-    fn newlines(&mut self) -> &mut Vec<token::Newline<'s>> {
-        match self {
-            Prefix::Annotation { node: Annotated { newlines, .. }, .. }
-            | Prefix::BuiltinAnnotation { node: AnnotatedBuiltin { newlines, .. }, .. }
+    fn push_newline(&mut self, newline: token::Newline<'s>) {
+        let (newlines, span) = match self {
+            Prefix::Annotation { node: Annotated { newlines, .. }, span }
+            | Prefix::BuiltinAnnotation { node: AnnotatedBuiltin { newlines, .. }, span }
            | Prefix::Documentation {
                node: Documented { documentation: DocComment { newlines, .. }, .. },
-                ..
-            } => newlines,
-        }
+                span,
+            } => (newlines, span),
+        };
+        span.code_length += newline.left_offset.code.length() + newline.code.length();
+        newlines.push(newline);
    }

    fn apply_to(mut self, expression: Tree<'s>) -> Tree<'s> {
-        *(match &mut self {
-            Prefix::Annotation { node, .. } => &mut node.expression,
-            Prefix::BuiltinAnnotation { node, .. } => &mut node.expression,
-            Prefix::Documentation { node, .. } => &mut node.expression,
-        }) = Some(expression);
+        let (expr, span) = match &mut self {
+            Prefix::Annotation { node, span } => (&mut node.expression, span),
+            Prefix::BuiltinAnnotation { node, span } => (&mut node.expression, span),
+            Prefix::Documentation { node, span } => (&mut node.expression, span),
+        };
+        span.code_length += expression.span.left_offset.code.length() + expression.span.code_length;
+        *expr = Some(expression);
        self.into()
    }
 }