line:col positions in parser (#8203)

Add `line:column` information to source code references produced by the parser. This information will be used by GUI2 as part of the solution to #8134.

# Important Notes
- `parse_all_enso_files.sh` has been used to ensure this doesn't affect tree structures.
- `parse_all_enso_files.sh` now checks emitted locations for consistency, and has been used to verify that all line:col references match the values found by an independent scan of the source up to the given UTF-8 position.
This commit is contained in:
Kaz Wesley 2023-11-08 08:53:39 -08:00 committed by GitHub
parent f21e09bb65
commit ce042569b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 1211 additions and 275 deletions

View File

@ -64,7 +64,7 @@ debug-assertions = false
[profile.test]
opt-level = 0
lto = false
debug = 1
debug = 2
debug-assertions = true
[profile.integration-test]

View File

@ -20,14 +20,14 @@ const RENAME = new Map([
['type', 'typeNode'],
// Rename source references to reflect our usage:
// - In `Tree`s:
['spanLeftOffsetCodeOffsetUtf16', 'whitespaceStartInCodeParsed'],
['spanLeftOffsetCodeUtf16', 'whitespaceLengthInCodeParsed'],
['spanLeftOffsetCodeStartUtf16', 'whitespaceStartInCodeParsed'],
['spanLeftOffsetCodeLenUtf16', 'whitespaceLengthInCodeParsed'],
['spanCodeLengthUtf16', 'childrenLengthInCodeParsed'],
// - In `Token`s:
['leftOffsetCodeOffsetUtf16', 'whitespaceStartInCodeBuffer'],
['leftOffsetCodeUtf16', 'whitespaceLengthInCodeBuffer'],
['codeUtf16', 'lengthInCodeBuffer'],
['codeOffsetUtf16', 'startInCodeBuffer'],
['leftOffsetCodeStartUtf16', 'whitespaceStartInCodeBuffer'],
['leftOffsetCodeLenUtf16', 'whitespaceLengthInCodeBuffer'],
['codeLenUtf16', 'lengthInCodeBuffer'],
['codeStartUtf16', 'startInCodeBuffer'],
])
export function mapIdent(ident: string): string {

View File

@ -29,5 +29,5 @@ pub fn main() {
use std::io::Read;
let mut input = String::new();
std::io::stdin().read_to_string(&mut input).unwrap();
println!("{:#?}", enso_parser::lexer::run(&input));
println!("{:#?}", enso_parser::lexer::debug::lex_and_validate_spans(&input));
}

View File

@ -23,6 +23,7 @@
#![warn(unused_qualifications)]
use enso_metamodel_lexpr::ToSExpr;
use enso_parser::source::code::debug::LocationCheck;
use enso_reflect::Reflect;
use lexpr::Value;
use std::collections::HashSet;
@ -122,10 +123,18 @@ fn strip_hidden_fields(tree: Value) -> Value {
":spanLeftOffsetVisible",
":spanLeftOffsetCodeReprBegin",
":spanLeftOffsetCodeReprLen",
":spanLeftOffsetCodeUtf16",
":spanLeftOffsetCodeOffsetUtf16",
":spanLeftOffsetCodeLenUtf8",
":spanLeftOffsetCodeLenUtf16",
":spanLeftOffsetCodeLenNewlines",
":spanLeftOffsetCodeLenLineChars16",
":spanLeftOffsetCodeStartUtf8",
":spanLeftOffsetCodeStartUtf16",
":spanLeftOffsetCodeStartLine",
":spanLeftOffsetCodeStartCol16",
":spanCodeLengthUtf8",
":spanCodeLengthUtf16",
":spanCodeLengthNewlines",
":spanCodeLengthLineChars16",
];
let hidden_tree_fields: HashSet<_> = hidden_tree_fields.into_iter().collect();
Value::list(tree.to_vec().unwrap().into_iter().filter(|val| match val {
@ -194,7 +203,11 @@ fn tuplify(value: Value) -> Value {
/// Check the internal consistency of the `Tree` and `Token` spans from the given root, and validate
/// that every character in the given range is covered exactly once in the token spans.
pub fn validate_spans(tree: &enso_parser::syntax::tree::Tree, expected_span: std::ops::Range<u32>) {
pub fn validate_spans(
tree: &enso_parser::syntax::tree::Tree,
expected_span: std::ops::Range<u32>,
locations: &mut LocationCheck,
) {
let mut sum_span = None;
fn concat<T: PartialEq + std::fmt::Debug + Copy>(
a: &Option<std::ops::Range<T>>,
@ -208,24 +221,33 @@ pub fn validate_spans(tree: &enso_parser::syntax::tree::Tree, expected_span: std
None => b.clone(),
}
}
sum_span = Some(concat(&sum_span, &tree.span.left_offset.code.range_utf16()));
sum_span = Some(concat(&sum_span, &tree.span.left_offset.code.range()));
tree.visit_items(|item| match item {
enso_parser::syntax::item::Ref::Token(token) => {
if !(token.left_offset.is_empty() && token.code.is_empty()) {
sum_span = Some(concat(&sum_span, &token.left_offset.code.range_utf16()));
sum_span = Some(concat(&sum_span, &token.code.range_utf16()));
sum_span = Some(concat(&sum_span, &token.left_offset.code.range()));
sum_span = Some(concat(&sum_span, &token.code.range()));
}
let left_offset = token.left_offset.code.range();
let code = token.code.range();
locations.extend(&[left_offset.start, left_offset.end, code.start, code.end]);
}
enso_parser::syntax::item::Ref::Tree(tree) => {
let children_span =
concat(&Some(tree.span.left_offset.code.range_utf16()), &tree.span.range_utf16());
validate_spans(tree, children_span.clone());
concat(&Some(tree.span.left_offset.code.range()), &tree.span.range());
let children_span_ = children_span.start.utf16..children_span.end.utf16;
validate_spans(tree, children_span_, locations);
sum_span = Some(concat(&sum_span, &children_span));
let left_offset = tree.span.left_offset.code.range();
let code = tree.span.range();
locations.extend(&[left_offset.start, left_offset.end, code.start, code.end]);
}
});
if expected_span.is_empty() {
assert!(sum_span.map_or(true, |range| range.is_empty()));
} else {
assert_eq!(sum_span.unwrap(), expected_span);
let sum_span = sum_span.unwrap_or_default();
let sum_span = sum_span.start.utf16..sum_span.end.utf16;
assert_eq!(sum_span, expected_span);
}
}

View File

@ -41,10 +41,12 @@ fn check_file(path: &str, mut code: &str) {
}
let ast = enso_parser::Parser::new().run(code);
let expected_span = 0..(code.encode_utf16().count() as u32);
enso_parser_debug::validate_spans(&ast, expected_span);
let mut locations = enso_parser::source::code::debug::LocationCheck::new();
enso_parser_debug::validate_spans(&ast, expected_span, &mut locations);
for (parsed, original) in ast.code().lines().zip(code.lines()) {
assert_eq!(parsed, original, "Bug: dropped tokens, while parsing: {path}");
}
locations.check(code);
let s_expr = enso_parser_debug::to_s_expr(&ast, code);
println!("{s_expr}");
}

View File

@ -482,6 +482,13 @@ fn dot_operator_blocks() {
#[test]
fn code_block_argument_list() {
#[rustfmt::skip]
let code = [
"foo",
" bar",
];
test!(&code.join("\n"), (ArgumentBlockApplication (Ident foo) #((Ident bar))));
#[rustfmt::skip]
let code = [
"value = foo",
@ -492,7 +499,6 @@ fn code_block_argument_list() {
];
test(&code.join("\n"), expect);
#[rustfmt::skip]
let code = [
"value = foo",
@ -1012,28 +1018,19 @@ x"#;
#[test]
fn interpolated_literals_in_inline_text() {
#[rustfmt::skip]
let cases = [
(r#"'Simple case.'"#, block![(TextLiteral #((Section "Simple case.")))]),
(r#"'With a `splice`.'"#, block![(TextLiteral
#((Section "With a ")
(Splice (Ident splice))
(Section ".")))]),
(r#"'` SpliceWithLeadingWhitespace`'"#, block![(TextLiteral
#((Splice (Ident SpliceWithLeadingWhitespace))))]),
(r#"'String with \n escape'"#, block![
(TextLiteral
#((Section "String with ") (Escape '\n') (Section " escape")))]),
(r#"'\x0Aescape'"#, block![
(TextLiteral #((Escape '\n') (Section "escape")))]),
(r#"'\u000Aescape'"#, block![
(TextLiteral #((Escape '\n') (Section "escape")))]),
(r#"'\u{0000A}escape'"#, block![
(TextLiteral #((Escape '\n') (Section "escape")))]),
(r#"'\U0000000Aescape'"#, block![
(TextLiteral #((Escape '\n') (Section "escape")))]),
];
cases.into_iter().for_each(|(code, expected)| test(code, expected));
test!(r#"'Simple case.'"#, (TextLiteral #((Section "Simple case."))));
test!(r#"'With a `splice`.'"#, (TextLiteral
#((Section "With a ")
(Splice (Ident splice))
(Section "."))));
test!(r#"'` SpliceWithLeadingWhitespace`'"#,
(TextLiteral #((Splice (Ident SpliceWithLeadingWhitespace)))));
test!(r#"'String with \n escape'"#,
(TextLiteral #((Section "String with ") (Escape '\n') (Section " escape"))));
test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
}
#[test]
@ -1580,7 +1577,9 @@ fn test(code: &str, expect: lexpr::Value) {
fn parse(code: &str) -> enso_parser::syntax::tree::Tree {
let ast = enso_parser::Parser::new().run(code);
let expected_span = 0..(code.encode_utf16().count() as u32);
enso_parser_debug::validate_spans(&ast, expected_span);
let mut locations = enso_parser::source::code::debug::LocationCheck::new();
enso_parser_debug::validate_spans(&ast, expected_span, &mut locations);
locations.check(code);
ast
}

View File

@ -15,7 +15,12 @@
set -e
cargo build -p enso-parser-debug --bin enso-parser-debug
cargo build -p enso-parser-debug --bin lexer
ENSO_FILES=$(find distribution/ test/ -name '*.enso' -print | sort)
for x in $ENSO_FILES; do echo -n "$x "; target/rust/debug/enso-parser-debug <$x; done
for x in $ENSO_FILES; do
echo -n "$x "
target/rust/debug/lexer <$x >/dev/null
target/rust/debug/enso-parser-debug <$x
done

View File

@ -19,6 +19,9 @@ const CODE_GETTER: &str = "codeRepr";
const WHITESPACE_GETTER: &str = "getWhitespace";
const TREE_BEGIN: &str = "fieldSpanLeftOffsetCodeReprBegin";
const TREE_LEN: &str = "fieldSpanLeftOffsetCodeReprLen";
const TREE_WHITESPACE: &str = "fieldSpanLeftOffsetCodeLenUtf16";
const TOKEN_WHITESPACE: &str = "fieldLeftOffsetCodeLenUtf16";
const TOKEN_CODE_LENGTH: &str = "fieldCodeLenUtf16";
/// Derive deserialization for all types in the typegraph.
pub fn derive(graph: &mut TypeGraph, tree: ClassId, token: ClassId) {
@ -151,16 +154,16 @@ fn start_whitespace() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'sta
|MaterializerInput { message }| format!("{message}.position()")
}
fn start_code_tree() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
|MaterializerInput { message }| format!("{message}.advance(fieldSpanLeftOffsetCodeUtf16)")
|MaterializerInput { message }| format!("{message}.advance({TREE_WHITESPACE})")
}
fn end_code_tree() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
|MaterializerInput { message }| format!("{message}.position()")
}
fn start_code_token() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
|MaterializerInput { message }| format!("{message}.advance(fieldLeftOffsetCodeUtf16)")
|MaterializerInput { message }| format!("{message}.advance({TOKEN_WHITESPACE})")
}
fn end_code_token() -> impl for<'a> Fn(MaterializerInput<'a>) -> String + 'static {
|MaterializerInput { message }| format!("{message}.advance(fieldCodeUtf16)")
|MaterializerInput { message }| format!("{message}.advance({TOKEN_CODE_LENGTH})")
}

View File

@ -8,10 +8,11 @@ use crate::prelude::*;
use crate::source::*;
use crate::syntax::*;
use crate::source::code::Length;
use crate::source::code::Location;
use std::str;
// =================
// === Constants ===
// =================
@ -89,35 +90,13 @@ pub struct Lexer<'s> {
token_storage: VecAllocation<Token<'s>>,
}
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
struct StrOffset {
utf8: Bytes,
utf16: u32,
}
impl Sub for StrOffset {
type Output = Self;
fn sub(self, rhs: Self) -> Self::Output {
Self { utf8: self.utf8 - rhs.utf8, utf16: self.utf16 - rhs.utf16 }
}
}
impl Add for StrOffset {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self { utf8: self.utf8 + rhs.utf8, utf16: self.utf16 + rhs.utf16 }
}
}
/// Internal state of the [`Lexer`].
#[derive(Debug, Default)]
#[allow(missing_docs)]
pub struct LexerState {
current_char: Option<char>,
current_offset: StrOffset,
last_spaces_offset: StrOffset,
current_offset: Location,
last_spaces_offset: Location,
last_spaces_visible_offset: VisibleOffset,
current_block_indent: VisibleOffset,
block_indent_stack: Vec<VisibleOffset>,
@ -139,7 +118,11 @@ enum State {
},
}
type Mark<'s> = (StrOffset, Offset<'s>);
#[derive(Debug, Clone)]
struct Mark<'s> {
location: Location,
offset: Offset<'s>,
}
impl<'s> Lexer<'s> {
/// Constructor.
@ -163,17 +146,24 @@ impl<'s> Lexer<'s> {
fn next_input_char(&mut self) -> bool {
let next = self.iterator.next();
if let Some((current_offset, current_char)) = next {
self.current_offset = StrOffset {
utf8: Bytes(current_offset),
utf16: self.current_offset.utf16
+ self.current_char.map_or(0, |c| c.len_utf16() as u32),
let prev = self.current_offset;
let char_len16 = self.current_char.map_or(0, |c| c.len_utf16() as u32);
self.current_offset = Location {
utf8: u32_from(current_offset),
utf16: prev.utf16 + char_len16,
line: prev.line,
col16: prev.col16 + char_len16,
};
self.current_char = Some(current_char);
true
} else if let Some(c) = self.current_char {
self.current_offset = StrOffset {
utf8: Bytes(self.input.len()),
utf16: self.current_offset.utf16 + c.len_utf16() as u32,
let prev = self.current_offset;
let char_len16 = c.len_utf16() as u32;
self.current_offset = Location {
utf8: u32_from(self.input.len()),
utf16: prev.utf16 + char_len16,
line: prev.line,
col16: prev.col16 + char_len16,
};
self.current_char = None;
true
@ -182,34 +172,28 @@ impl<'s> Lexer<'s> {
}
}
/// Run the provided function and compute how much input it consumed.
#[inline(always)]
fn run_and_get_offset<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> (T, StrOffset) {
let start_offset = self.current_offset;
let out = f(self);
let len = self.current_offset - start_offset;
(out, len)
}
/// Run the provided function and check if it consumed any input.
#[inline(always)]
fn run_and_check_if_progressed(&mut self, f: impl FnOnce(&mut Self)) -> bool {
self.run_and_get_offset(f).1.utf8.is_positive()
let start = self.current_offset;
f(self);
let end = self.current_offset;
end != start
}
/// Consume spaces after parsing a [`Token`] and update the internal spacing info.
#[inline(always)]
fn spaces_after_lexeme(&mut self) {
(self.last_spaces_visible_offset, self.last_spaces_offset) =
self.run_and_get_offset(|this| this.spaces());
self.last_spaces_offset = self.current_offset;
self.last_spaces_visible_offset = self.spaces();
}
/// Consume spaces after parsing a [`Token`] and update the internal spacing info. Doesn't
/// consume more than the specified [`VisibleOffset`] of spaces.
#[inline(always)]
fn spaces_after_lexeme_with_limit(&mut self, limit: VisibleOffset) {
(self.last_spaces_visible_offset, self.last_spaces_offset) =
self.run_and_get_offset(|this| this.spaces_with_limit(limit));
self.last_spaces_offset = self.current_offset;
self.last_spaces_visible_offset = self.spaces_with_limit(limit);
}
/// Run the provided function. If it consumed any chars, return the [`Token`] containing the
@ -217,21 +201,19 @@ impl<'s> Lexer<'s> {
#[inline(always)]
fn token<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> Option<Token<'s, T>> {
let start = self.current_offset;
let (elem, len) = self.run_and_get_offset(f);
len.utf8.is_positive().as_some_from(|| {
let end = start + len;
let left_offset_start = start - self.last_spaces_offset;
let (offset_code, code) = self
.input
.slice(left_offset_start.utf8..end.utf8)
.split_at(self.last_spaces_offset.utf8.unchecked_raw());
let elem = f(self);
let end = self.current_offset;
(end != start).as_some_from(|| {
let left_offset_start = self.last_spaces_offset;
let (offset_code, code) = self.input
[usize_from(left_offset_start.utf8)..usize_from(end.utf8)]
.split_at(usize_from(start.utf8 - left_offset_start.utf8));
let visible_offset = self.last_spaces_visible_offset;
let offset = Offset(
visible_offset,
Code::from_str_at_offset(offset_code, left_offset_start.utf16),
);
let offset =
Offset(visible_offset, Code::from_str_at_location(offset_code, left_offset_start));
self.spaces_after_lexeme();
Token(offset, Code::from_str_at_offset(code, start.utf16), elem)
debug_assert_eq!(left_offset_start + Length::of(offset_code), start);
Token(offset, Code::from_str_at_location(code, start), elem)
})
}
@ -240,9 +222,9 @@ impl<'s> Lexer<'s> {
#[inline(always)]
fn marker_token<T>(&mut self, elem: T) -> Token<'s, T> {
let visible_offset = VisibleOffset(0);
let start = self.current_offset - self.last_spaces_offset;
let offset = Offset(visible_offset, Code::empty(start.utf16));
Token(offset, Code::empty(start.utf16), elem)
let start = self.last_spaces_offset;
let offset = Offset(visible_offset, Code::empty(start));
Token(offset, Code::empty(start), elem)
}
/// Push the [`token`] to the result stream.
@ -657,7 +639,7 @@ impl<'s> Lexer<'s> {
match token.code.as_ref() {
// Special-case: Split into multiple operators.
"+-" => {
let (left, right) = token.split_at(code::Length::of("+"));
let (left, right) = token.split_at(Length::of("+"));
let lhs = analyze_operator(&left.code);
self.submit_token(left.with_variant(token::Variant::operator(lhs)));
// The `-` in this case is not identical to a free `-`: It is only allowed a
@ -886,6 +868,7 @@ impl<'s> Lexer<'s> {
if let Some(token) = token {
if let Some(base) = base {
self.submit_token(token.with_variant(token::Variant::number_base()));
let after_base = self.current_offset;
if let Some(digits) = match base {
token::Base::Binary => self.token(|this| this.take_while(is_binary_digit)),
token::Base::Octal => self.token(|this| this.take_while(is_octal_digit)),
@ -899,8 +882,8 @@ impl<'s> Lexer<'s> {
.with_binary_infix_precedence(u32::MAX)
.as_token_joiner();
self.submit_token(Token(
Code::empty(self.current_offset.utf16),
Code::empty(self.current_offset.utf16),
Code::empty(after_base),
Code::empty(after_base),
token::Variant::operator(joiner),
));
self.submit_token(digits.with_variant(token::Variant::digits(Some(base))));
@ -937,12 +920,12 @@ impl<'s> Lexer<'s> {
};
let indent = self.current_block_indent;
let open_quote_start = self.mark();
self.last_spaces_visible_offset = VisibleOffset(0);
self.last_spaces_offset = default();
self.take_next();
self.last_spaces_visible_offset = VisibleOffset(0);
self.last_spaces_offset = self.current_offset;
// At least two quote characters.
if let Some(char) = self.current_char && char == quote_char {
let close_quote_start = self.mark();
let close_quote_start = self.mark_without_whitespace();
self.take_next();
let mut multiline = false;
// If more than two quote characters: Start a multiline quote.
@ -955,7 +938,7 @@ impl<'s> Lexer<'s> {
return;
} else {
// Exactly two quote characters: Open and shut case.
let close_quote_end = self.mark();
let close_quote_end = self.mark_without_whitespace();
let token = self.make_token(open_quote_start, close_quote_start.clone(),
token::Variant::text_start());
self.output.push(token);
@ -965,7 +948,7 @@ impl<'s> Lexer<'s> {
}
} else {
// One quote followed by non-quote character: Inline quote.
let open_quote_end = self.mark();
let open_quote_end = self.mark_without_whitespace();
let token = self.make_token(open_quote_start, open_quote_end,
token::Variant::text_start());
self.output.push(token);
@ -980,7 +963,7 @@ impl<'s> Lexer<'s> {
block_indent: VisibleOffset,
text_type: TextType,
) {
let open_quote_end = self.mark();
let open_quote_end = self.mark_without_whitespace();
let token = self.make_token(open_quote_start, open_quote_end, token::Variant::text_start());
self.output.push(token);
let mut initial_indent = None;
@ -1002,9 +985,9 @@ impl<'s> Lexer<'s> {
}
fn end_splice(&mut self, state: State) {
let splice_quote_start = self.mark();
let splice_quote_start = self.mark_without_whitespace();
self.take_next();
let splice_quote_end = self.mark();
let splice_quote_end = self.mark_without_whitespace();
let token =
self.make_token(splice_quote_start, splice_quote_end, token::Variant::close_symbol());
self.output.push(token);
@ -1038,8 +1021,8 @@ impl<'s> Lexer<'s> {
let mut newlines = vec![];
let mut new_indent = None;
loop {
let mut before_newline = self.mark();
if before_newline.0 == text_start.0 {
let mut before_newline = self.mark_without_whitespace();
if before_newline.location == text_start.location {
before_newline = text_start.clone();
}
let mut newline = self.take_1('\r');
@ -1059,7 +1042,8 @@ impl<'s> Lexer<'s> {
} else {
before_newline = text_start;
}
let newline_end = self.mark();
self.advance_line_pos();
let newline_end = self.mark_without_whitespace();
let token =
self.make_token(before_newline, newline_end, token::Variant::newline());
newlines.push(token);
@ -1092,12 +1076,9 @@ impl<'s> Lexer<'s> {
self.output.push(text_end);
self.end_blocks(indent, newlines.first().as_ref().unwrap());
self.output.extend(newlines);
if self.current_offset == text_start.0 {
self.last_spaces_visible_offset = text_start.1.visible;
self.last_spaces_offset = StrOffset {
utf8: text_start.1.code.len(),
utf16: text_start.1.code.len_utf16(),
};
if self.current_offset == text_start.location {
self.last_spaces_visible_offset = text_start.offset.visible;
self.last_spaces_offset = text_start.offset.code.range().start;
}
return TextEndedAt::End;
}
@ -1109,7 +1090,7 @@ impl<'s> Lexer<'s> {
}
}
if interpolate && char == '\\' {
let mut backslash_start = self.mark();
let mut backslash_start = self.mark_without_whitespace();
self.take_next();
if let Some(char) = self.current_char {
let token = self.make_token(
@ -1122,13 +1103,15 @@ impl<'s> Lexer<'s> {
} else {
self.output.push(token);
}
self.last_spaces_offset = self.current_offset;
text_start = self.text_escape(backslash_start, char);
continue;
}
self.last_spaces_offset = self.current_offset;
continue;
}
if interpolate && char == '`' {
let mut splice_quote_start = self.mark();
let mut splice_quote_start = self.mark_without_whitespace();
let token = self.make_token(
text_start.clone(),
splice_quote_start.clone(),
@ -1140,7 +1123,7 @@ impl<'s> Lexer<'s> {
self.output.push(token);
}
self.take_next();
let splice_quote_end = self.mark();
let splice_quote_end = self.mark_without_whitespace();
let token = self.make_token(
splice_quote_start,
splice_quote_end,
@ -1148,23 +1131,24 @@ impl<'s> Lexer<'s> {
);
self.output.push(token);
self.stack.push(state);
self.last_spaces_offset = self.current_offset;
return TextEndedAt::Splice;
}
self.take_next();
}
let text_end = self.mark();
let text_end = self.mark_without_whitespace();
let token = self.make_token(text_start, text_end.clone(), token::Variant::text_section());
if !(token.code.is_empty() && token.left_offset.code.is_empty()) {
self.output.push(token);
}
let end_token = if self.current_char == closing_char {
self.take_next();
let close_quote_end = self.mark();
let close_quote_end = self.mark_without_whitespace();
self.make_token(text_end, close_quote_end, token::Variant::text_end())
} else {
Token::from(token::text_end(
Code::empty(self.current_offset.utf16),
Code::empty(self.current_offset.utf16),
Code::empty(self.current_offset),
Code::empty(self.current_offset),
))
};
self.output.push(end_token);
@ -1197,7 +1181,7 @@ impl<'s> Lexer<'s> {
if delimited && self.current_char == Some('}') {
self.take_next();
}
let sequence_end = self.mark();
let sequence_end = self.mark_without_whitespace();
let token = self.make_token(
backslash_start,
sequence_end.clone(),
@ -1223,7 +1207,7 @@ impl<'s> Lexer<'s> {
_ => None,
};
self.take_next();
let escape_end = self.mark();
let escape_end = self.mark_without_whitespace();
let token = self.make_token(
backslash_start,
escape_end.clone(),
@ -1236,23 +1220,30 @@ impl<'s> Lexer<'s> {
fn mark(&mut self) -> Mark<'s> {
let start = self.current_offset;
let left_offset_start = start - self.last_spaces_offset;
let offset_code = self.input.slice(left_offset_start.utf8..start.utf8);
let visible_offset = self.last_spaces_visible_offset;
self.last_spaces_visible_offset = VisibleOffset(0);
self.last_spaces_offset = default();
(
start,
Offset(visible_offset, Code::from_str_at_offset(offset_code, left_offset_start.utf16)),
)
let visible_offset = mem::take(&mut self.last_spaces_visible_offset);
let left_offset_start = mem::replace(&mut self.last_spaces_offset, start);
let offset_code = &self.input[usize_from(left_offset_start.utf8)..usize_from(start.utf8)];
Mark {
location: start,
offset: Offset(
visible_offset,
Code::from_str_at_location(offset_code, left_offset_start),
),
}
}
fn mark_without_whitespace(&mut self) -> Mark<'s> {
let start = self.current_offset;
self.last_spaces_offset = start;
self.mark()
}
fn make_token(&self, from: Mark<'s>, to: Mark<'s>, variant: token::Variant) -> Token<'s> {
let (start, offset) = from;
let end = to.0;
let start8 = start.utf8.unchecked_raw();
let end8 = end.utf8.unchecked_raw();
Token(offset, Code::from_str_at_offset(&self.input[start8..end8], start.utf16), variant)
let Mark { location: start, offset } = from;
let end = to.location;
let start8 = usize_from(start.utf8);
let end8 = usize_from(end.utf8);
Token(offset, Code::from_str_at_location(&self.input[start8..end8], start), variant)
}
}
@ -1319,15 +1310,26 @@ impl<'s> Lexer<'s> {
// =============
impl<'s> Lexer<'s> {
#[allow(clippy::collapsible_if)]
fn line_break(&mut self) -> Option<Token<'s, ()>> {
self.token(|this| {
if !this.take_1('\n') {
if this.take_1('\r') {
this.take_1('\n');
}
let token = self.token(|this| {
let matched = if this.take_1('\n') {
true
} else if this.take_1('\r') {
this.take_1('\n');
true
} else {
false
};
if matched {
this.advance_line_pos()
}
})
});
token
}
fn advance_line_pos(&mut self) {
self.current_offset.line += 1;
self.current_offset.col16 = 0;
}
fn newlines(&mut self) {
@ -1340,11 +1342,11 @@ impl<'s> Lexer<'s> {
while let Some(token) = self.line_break() {
newlines.push(token.with_variant(token::Variant::newline()));
}
if let Some(last) = newlines.last() {
if let Some(first) = newlines.first() {
let block_indent = self.last_spaces_visible_offset;
if block_indent > self.current_block_indent {
let block_start = {
let location = last.left_offset.code.position_before();
let location = first.left_offset.code.position_before();
let offset = Offset(VisibleOffset(0), location.clone());
Token(offset, location, token::Variant::block_start())
};
@ -1410,9 +1412,10 @@ impl<'s> Lexer<'s> {
self.spaces_after_lexeme();
let first_block_indent = self.last_spaces_visible_offset;
if first_block_indent.width_in_spaces != 0 {
self.submit_token(token::block_start(Code::empty(0), Code::empty(0)).into());
let start = Location::default();
self.submit_token(token::block_start(Code::empty(start), Code::empty(start)).into());
self.start_block(first_block_indent);
self.submit_token(token::newline(Code::empty(0), Code::empty(0)).into());
self.submit_token(token::newline(Code::empty(start), Code::empty(start)).into());
}
// Main parsing loop.
while PARSERS.iter().any(|f| self.run_and_check_if_progressed(f)) {}
@ -1424,15 +1427,14 @@ impl<'s> Lexer<'s> {
// If the last line ended in whitespace, ensure it is represented; we'll attach it to a
// phantom newline token.
if self.last_spaces_visible_offset != VisibleOffset(0) {
let left_offset_start = self.current_offset - self.last_spaces_offset;
let offset_code = self.input.slice(left_offset_start.utf8..self.current_offset.utf8);
let left_offset_start = self.last_spaces_offset;
let offset_code = &self.input
[usize_from(left_offset_start.utf8)..usize_from(self.current_offset.utf8)];
let visible_offset = self.last_spaces_visible_offset;
let offset = Offset(
visible_offset,
Code::from_str_at_offset(offset_code, left_offset_start.utf16),
);
let offset =
Offset(visible_offset, Code::from_str_at_location(offset_code, left_offset_start));
let eof = token::variant::Variant::Newline(token::variant::Newline());
self.submit_token(Token(offset, Code::empty(self.current_offset.utf16), eof));
self.submit_token(Token(offset, Code::empty(self.current_offset), eof));
}
// Sanity check.
let mut internal_error = self.internal_error.take();
@ -1464,7 +1466,7 @@ pub mod test {
pub use token::*;
fn test_code(code: &str) -> Code {
Code::from_str_without_offset(code)
Code::from_str_without_location(code)
}
/// Constructor.
@ -1488,36 +1490,28 @@ pub mod test {
/// Constructor.
pub fn operator_<'s>(left_offset: &'s str, code: &'s str) -> Token<'s> {
let variant = token::Variant::operator(analyze_operator(code));
let variant = Variant::operator(analyze_operator(code));
let left_offset = test_code(left_offset);
let code = test_code(code);
Token(left_offset, code, variant)
}
}
#[cfg(test)]
mod tests {
use super::test::*;
fn usize_from(x: u32) -> usize {
usize::try_from(x).unwrap()
}
fn u32_from(x: usize) -> u32 {
u32::try_from(x).unwrap()
}
/// Testing/debugging helpers.
pub mod debug {
use super::*;
fn empty<'a>() -> Code<'a> {
Code::empty_without_offset()
}
fn test_code(code: &str) -> Code {
Code::from_str_without_offset(code)
}
fn test_lexer_many<'s>(inputs: Vec<(&'s str, Vec<Token<'s>>)>) {
for (input, output) in inputs {
test_lexer(input, output)
}
}
/// Lex the input, check the spans for consistency, and return the tokens with the span offsets
/// stripped.
fn lex_and_validate_spans(input: &str) -> Vec<Token> {
let result: Vec<_> = run(input).unwrap();
/// Lex the input and check the spans for consistency.
pub fn lex_and_validate_spans(input: &str) -> Vec<Token> {
let tokens: Vec<_> = run(input).unwrap();
let mut sum_span = None;
fn concat<T: PartialEq + Debug + Copy>(a: &Option<Range<T>>, b: &Range<T>) -> Range<T> {
match a {
@ -1528,16 +1522,48 @@ mod tests {
None => b.clone(),
}
}
for token in &result {
sum_span = Some(concat(&sum_span, &token.left_offset.code.range_utf16()));
sum_span = Some(concat(&sum_span, &token.code.range_utf16()));
let mut locations = code::debug::LocationCheck::new();
for token in &tokens {
let left_offset = token.left_offset.code.range();
let code = token.code.range();
sum_span = Some(concat(&sum_span, &left_offset));
sum_span = Some(concat(&sum_span, &code));
locations.extend(&[left_offset.start, left_offset.end, code.start, code.end]);
}
let sum_span = sum_span.unwrap_or_default();
let sum_span = sum_span.start.utf16..sum_span.end.utf16;
assert_eq!(sum_span, 0..(input.encode_utf16().count() as u32));
locations.check(input);
tokens
}
}
#[cfg(test)]
mod tests {
use super::debug::*;
use super::test::*;
use super::*;
fn strip_offsets<'s>(tokens: impl IntoIterator<Item = Token<'s>>) -> Vec<Token<'s>> {
tokens.into_iter().map(|token| token.without_offsets()).collect()
}
fn empty<'a>() -> Code<'a> {
Code::empty_without_location()
}
fn test_code(code: &str) -> Code {
Code::from_str_without_location(code)
}
fn test_lexer_many<'s>(inputs: Vec<(&'s str, Vec<Token<'s>>)>) {
for (input, output) in inputs {
test_lexer(input, output)
}
assert_eq!(sum_span.unwrap_or_default(), 0..(input.encode_utf16().count() as u32));
result.into_iter().map(|token| token.without_offsets()).collect()
}
fn test_lexer<'s>(input: &'s str, expected: Vec<Token<'s>>) {
let result = lex_and_validate_spans(input);
let result = strip_offsets(lex_and_validate_spans(input));
let expected: Vec<_> = expected.into_iter().map(|token| token.without_offsets()).collect();
assert_eq!(result, expected);
}
@ -1831,6 +1857,39 @@ mod tests {
let code = ["## Foo.", "main = 23"].join("\n");
lex_and_validate_spans(&code);
}
#[test]
fn test_comment() {
let code = ["# comment", "main = 23"].join("\n");
lex_and_validate_spans(&code);
}
#[test]
fn test_text() {
lex_and_validate_spans("f 'foo' 'bar'");
lex_and_validate_spans(r#"'String with \' escape'"#);
lex_and_validate_spans("'String with `splice`.'");
lex_and_validate_spans(&["## a", "", " b"].join("\n"));
}
#[test]
fn test_indented_doc_after_blank_line() {
let code = ["type Redshift_Error_Mapper", "", " A"].join("\n");
lex_and_validate_spans(&code);
}
#[test]
fn test_based_numbers() {
lex_and_validate_spans("0x23");
lex_and_validate_spans("2_010101");
}
#[test]
fn test_line_endings() {
lex_and_validate_spans("Windows\r\n...");
lex_and_validate_spans("Linux\n...");
lex_and_validate_spans("Classic Mac OS\r...");
}
}

View File

@ -159,8 +159,9 @@ impl<'s> Resolver<'s> {
root_macro_map: &MacroMap,
tokens: impl IntoIterator<Item = Token<'s>>,
) -> syntax::Tree<'s> {
let start = crate::source::code::Location::default();
self.lines.push(syntax::item::Line {
newline: token::newline(Code::empty(0), Code::empty(0)),
newline: token::newline(Code::empty(start), Code::empty(start)),
items: default(),
});
tokens.into_iter().for_each(|t| self.push(root_macro_map, t));

View File

@ -13,98 +13,133 @@ use crate::prelude::*;
#[derive(Debug, Clone, Default, Eq, PartialEq, Deref)]
pub struct StrRef<'s>(pub &'s str);
/// A code representation. It can either be a borrowed source code or a modified owned one.
/// Identifies a location in source code.
///
/// All offsets are relative to the start of the document. Multiple unit
/// systems are carried in parallel so that consumers can use whichever is
/// convenient (UTF-8 offsets for byte-oriented tooling, UTF-16 and
/// line/column for editor-style references).
#[derive(
    Copy,
    Clone,
    Debug,
    Default,
    Eq,
    PartialEq,
    Serialize,
    Reflect,
    Deserialize,
    PartialOrd,
    Ord
)]
pub struct Location {
    /// Offset from the beginning, in UTF-8 code units (bytes).
    #[reflect(hide)]
    pub utf8: u32,
    /// Offset from the beginning, in UTF-16 code units (two-byte words).
    #[reflect(hide)]
    pub utf16: u32,
    /// Line number, starting from 0. The recognized line terminators are CR, LF, or CRLF.
    #[reflect(hide)]
    pub line: u32,
    /// Offset from start of line, in UTF-16 code units.
    #[reflect(hide)]
    pub col16: u32,
}
impl Add<Length> for Location {
    type Output = Self;

    /// Advance this location by the given span length.
    fn add(self, rhs: Length) -> Self::Output {
        // If the span contains newlines, `line_chars16` is already measured
        // from the start of the span's final line, so the current column is
        // discarded; otherwise the column advances within the current line.
        let column_base = if rhs.newlines == 0 { self.col16 } else { 0 };
        Self {
            utf8: self.utf8 + rhs.utf8,
            utf16: self.utf16 + rhs.utf16,
            line: self.line + rhs.newlines,
            col16: column_base + rhs.line_chars16,
        }
    }
}
/// A code representation.
#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Reflect, Deserialize, Deref)]
#[allow(missing_docs)]
pub struct Code<'s> {
/// The borrowed string data.
#[serde(serialize_with = "crate::serialization::serialize_cow")]
#[serde(deserialize_with = "crate::serialization::deserialize_cow")]
#[reflect(as = "crate::serialization::Code", flatten, hide)]
#[deref]
pub repr: StrRef<'s>,
#[reflect(hide)]
offset_utf16: u32,
#[reflect(hide)]
utf16: u32,
pub repr: StrRef<'s>,
#[reflect(flatten)]
start: Location,
/// The length of the source code.
#[reflect(flatten)]
pub len: Length,
}
impl<'s> Code<'s> {
/// Return a code reference from the given source and offset within the document.
#[inline(always)]
pub fn from_str_at_offset(repr: &'s str, offset_utf16: u32) -> Self {
let utf16 = repr.chars().map(|c| c.len_utf16() as u32).sum();
pub fn from_str_at_location(repr: &'s str, location: Location) -> Self {
let len = Length::of(repr);
let repr = StrRef(repr);
Self { repr, offset_utf16, utf16 }
Self { repr, start: location, len }
}
/// Return a code reference at the beginning of the document. This can be used in testing, when
/// accurate code references are not needed.
#[inline(always)]
pub fn from_str_without_offset(repr: &'s str) -> Self {
Self::from_str_at_offset(repr, 0)
pub fn from_str_without_location(repr: &'s str) -> Self {
Self::from_str_at_location(repr, default())
}
/// Return a copy of this value, and set this value to a 0-length value following the returned
/// value.
#[inline(always)]
pub fn take_as_prefix(&mut self) -> Self {
let end = self.offset_utf16 + self.utf16;
let end = self.start + self.len;
Self {
repr: mem::take(&mut self.repr),
offset_utf16: mem::replace(&mut self.offset_utf16, end),
utf16: mem::take(&mut self.utf16),
repr: mem::take(&mut self.repr),
start: mem::replace(&mut self.start, end),
len: mem::take(&mut self.len),
}
}
/// Return a 0-length `Code` located immediately before the start of this `Code`.
pub fn position_before(&self) -> Self {
Self { repr: default(), offset_utf16: self.offset_utf16, utf16: default() }
Self { repr: default(), start: self.start, len: default() }
}
/// Return a 0-length `Code` located immediately after the end of this `Code`.
pub fn position_after(&self) -> Self {
Self {
repr: default(),
offset_utf16: self.offset_utf16 + self.utf16,
utf16: default(),
}
}
/// Return the length in UTF-16 code units.
pub fn len_utf16(&self) -> u32 {
self.utf16
Self { repr: default(), start: self.start + self.len, len: default() }
}
/// Return the start and end of the UTF-16 source code for this element.
pub fn range_utf16(&self) -> Range<u32> {
self.offset_utf16..(self.offset_utf16 + self.utf16)
pub fn range(&self) -> Range<Location> {
self.start..(self.start + self.len)
}
/// Split the code at the given location.
pub fn split_at(&self, split: Length) -> (Self, Self) {
let (left, right) = self.repr.split_at(split.utf8);
(
Self {
repr: StrRef(left),
offset_utf16: self.offset_utf16,
utf16: split.utf16,
},
Self {
repr: StrRef(right),
offset_utf16: self.offset_utf16 + split.utf16,
utf16: self.utf16 - split.utf16,
},
)
let (left, right) = self.repr.split_at(usize::try_from(split.utf8).unwrap());
let right_len = Length {
utf8: self.len.utf8 - split.utf8,
utf16: self.len.utf16 - split.utf16,
newlines: self.len.newlines - split.newlines,
line_chars16: self.len.line_chars16
- if split.newlines == 0 { split.line_chars16 } else { 0 },
};
(Self { repr: StrRef(left), start: self.start, len: split }, Self {
repr: StrRef(right),
start: self.start + split,
len: right_len,
})
}
/// Return a reference to an empty string, not associated with any location in the document.
pub fn empty_without_offset() -> Self {
Self { repr: StrRef(""), offset_utf16: 0, utf16: 0 }
pub fn empty_without_location() -> Self {
Self { repr: StrRef(""), start: default(), len: default() }
}
/// Return a reference to an empty string.
pub fn empty(offset: u32) -> Self {
Self { repr: StrRef(""), offset_utf16: offset, utf16: 0 }
pub fn empty(location: Location) -> Self {
Self { repr: StrRef(""), start: location, len: default() }
}
/// Length of the code in bytes.
@ -116,7 +151,7 @@ impl<'s> Code<'s> {
/// Length of the code.
#[inline(always)]
pub fn length(&self) -> Length {
Length { utf8: self.repr.len(), utf16: self.utf16 }
self.len
}
/// True if the code is the empty string.
@ -127,8 +162,8 @@ impl<'s> Code<'s> {
/// Return this value with its start position removed (set to 0). This can be used to compare
/// values ignoring offsets.
pub fn without_offset(&self) -> Self {
Self { repr: self.repr.clone(), offset_utf16: default(), utf16: self.utf16 }
pub fn without_location(&self) -> Self {
Self { repr: self.repr.clone(), start: default(), len: self.len }
}
}
@ -172,10 +207,10 @@ impl<'s> AddAssign<&Code<'s>> for Code<'s> {
match (self.is_empty(), other.is_empty()) {
(false, true) => (),
(true, true) => {
// The span builder works by starting with `Span::empty_without_offset()`, and
// appending to the right side. In order to ensure every span has an offset: When
// The span builder works by starting with `Span::empty_without_location()`, and
// appending to the right side. In order to ensure every span has a location: When
// the LHS is empty, take the location from the RHS even if the RHS is also empty.
self.offset_utf16 = other.offset_utf16;
self.start = other.start;
}
(true, false) => {
*self = other.clone();
@ -193,7 +228,7 @@ impl<'s> AddAssign<&Code<'s>> for Code<'s> {
// Concatenating two UTF-8 strings always yields a valid UTF-8 string.
self.repr = StrRef(std::str::from_utf8_unchecked(joined));
}
self.utf16 += other.utf16;
self.len += other.len;
}
}
}
@ -205,17 +240,41 @@ impl<'s> AddAssign<&Code<'s>> for Code<'s> {
/// The length of a [`Code`] object.
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Reflect, Deserialize)]
pub struct Length {
/// An offset, in UTF-8 code units (bytes).
#[reflect(skip)]
#[serde(skip)]
utf8: usize,
utf16: u32,
pub utf8: u32,
/// An offset, in UTF-16 code units (two-byte words).
pub utf16: u32,
/// A difference in line numbers.
#[reflect(hide)]
pub newlines: u32,
/// If `newlines` is 0, this is the difference in UTF-16 code-unit positions within a line; if
/// `newlines` is nonzero, this is the position within the line ending the range.
pub line_chars16: u32,
}
impl Length {
/// Returns the length of the given input.
#[inline(always)]
pub fn of(s: &str) -> Self {
Self { utf8: s.len(), utf16: s.encode_utf16().count() as u32 }
let mut utf16 = 0;
let mut newlines = 0;
let mut line_chars16 = 0;
let mut prev = None;
for c in s.chars() {
let char_len16 = c.len_utf16() as u32;
utf16 += char_len16;
line_chars16 += char_len16;
if c == '\r' || c == '\n' {
line_chars16 = 0;
}
if c == '\r' || (c == '\n' && prev != Some('\r')) {
newlines += 1;
}
prev = Some(c);
}
Self { utf8: u32::try_from(s.len()).unwrap(), utf16, newlines, line_chars16 }
}
/// Returns true if the code is empty.
@ -226,7 +285,7 @@ impl Length {
/// Return the length in UTF-8 code units (bytes).
#[inline(always)]
pub fn utf8_bytes(&self) -> usize {
pub fn utf8_bytes(&self) -> u32 {
self.utf8
}
@ -242,8 +301,13 @@ impl Add for Length {
#[inline(always)]
fn add(self, rhs: Self) -> Self::Output {
let Self { utf8, utf16 } = self;
Self { utf8: utf8 + rhs.utf8, utf16: utf16 + rhs.utf16 }
let Self { utf8, utf16, newlines, line_chars16 } = self;
Self {
utf8: utf8 + rhs.utf8,
utf16: utf16 + rhs.utf16,
newlines: newlines + rhs.newlines,
line_chars16: if rhs.newlines == 0 { line_chars16 } else { 0 } + rhs.line_chars16,
}
}
}
@ -259,3 +323,67 @@ impl Display for Length {
write!(f, "{}", self.utf8)
}
}
// ====================
// === Test support ===
// ====================
/// Testing/debugging helpers.
pub mod debug {
    use super::*;
    use std::collections::BTreeMap;

    /// Checks consistency of observed `Location`s. Compares `line:col` values against values found
    /// in an independent scan of the input source code.
    ///
    /// Locations are keyed by their UTF-8 offset; `check` walks the input once
    /// and verifies every recorded location against the position computed by
    /// the scan.
    #[derive(Debug, Default)]
    pub struct LocationCheck {
        // Map from UTF-8 byte offset to the full location recorded there.
        locations: BTreeMap<u32, Location>,
    }

    impl LocationCheck {
        /// Create a new empty checker.
        pub fn new() -> Self {
            Self::default()
        }

        /// Add the location to the collection waiting to be checked.
        pub fn add(&mut self, location: Location) {
            self.locations.insert(location.utf8, location);
        }

        /// Add multiple locations to the collection waiting to be checked.
        pub fn extend(&mut self, locations: &[Location]) {
            self.locations.extend(locations.iter().map(|loc| (loc.utf8, *loc)));
        }

        /// Check all previously-added locations for consistency with the input.
        ///
        /// Panics (via `assert_eq!`) if any recorded location disagrees with
        /// the scan, or if any recorded offset never coincides with a char
        /// boundary of `input` (or its end).
        pub fn check(mut self, input: &str) {
            let mut pos = Location::default();
            let mut prev = None;
            for (i, c) in input.char_indices() {
                // `pos` describes the position *before* consuming `c`: the
                // UTF-8 offset is set before the comparison, while the other
                // fields are advanced only after it.
                pos.utf8 = i as u32;
                if let Some(loc) = self.locations.remove(&(i as u32)) {
                    assert_eq!(loc, pos);
                }
                let char_len = c.len_utf16() as u32;
                pos.utf16 += char_len;
                pos.col16 += char_len;
                // Any line terminator resets the column.
                if c == '\r' || c == '\n' {
                    pos.col16 = 0;
                }
                // CRLF counts as a single terminator: the LF of a CRLF pair
                // must not increment the line number a second time.
                if c == '\r' || (c == '\n' && prev != Some('\r')) {
                    pos.line += 1;
                }
                prev = Some(c);
            }
            // A location exactly at the end of the input is also valid.
            if let Some(loc) = self.locations.remove(&(input.len() as u32)) {
                pos.utf8 = input.len() as u32;
                assert_eq!(loc, pos);
            }
            // Anything left over was recorded at an offset that never matched
            // a char boundary (or end of input) — report all of them at once.
            let non_char_boundary_locations: Vec<_> = self.locations.values().cloned().collect();
            assert_eq!(&non_char_boundary_locations, &[]);
        }
    }
}

View File

@ -6,7 +6,7 @@ use crate::source::*;
use crate::syntax::*;
use crate::lexer;
use crate::source::code::Location;
/// Common traits.
@ -107,7 +107,7 @@ impl<'s> Offset<'s> {
/// Return this value with its start position removed (set to 0). This can be used to compare
/// spans ignoring offsets.
pub fn without_offset(&self) -> Self {
Self { visible: self.visible, code: self.code.without_offset() }
Self { visible: self.visible, code: self.code.without_location() }
}
}
@ -161,7 +161,7 @@ pub struct Span<'s> {
impl<'s> Span<'s> {
/// Constructor.
pub fn empty_without_offset() -> Self {
Self { left_offset: Code::empty_without_offset().into(), code_length: default() }
Self { left_offset: Code::empty_without_location().into(), code_length: default() }
}
/// Check whether the span is empty.
@ -185,10 +185,10 @@ impl<'s> Span<'s> {
Builder::add_to_span(elem, self)
}
/// Return the start and end of the UTF-16 source code for this element.
pub fn range_utf16(&self) -> Range<u32> {
let start = self.left_offset.position_after().code.range_utf16().start;
let end = start + self.code_length.utf16_len();
/// Return the start and end of the source code for this element.
pub fn range(&self) -> Range<Location> {
let start = self.left_offset.position_after().code.range().start;
let end = start + self.code_length;
start..end
}
@ -217,11 +217,11 @@ where
self.code_length = other.code_length;
} else {
debug_assert_eq!(
self.left_offset.code.position_after().range_utf16().end
+ self.code_length.utf16_len(),
other.left_offset.code.position_before().range_utf16().start
self.left_offset.code.position_after().range().end + self.code_length,
other.left_offset.code.position_before().range().start
);
self.code_length += other.left_offset.code.length() + other.code_length;
self.code_length += other.left_offset.code.length();
self.code_length += other.code_length;
}
}
}
@ -399,7 +399,7 @@ where T: Builder<'s>
{
#[inline(always)]
fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> {
self.iter_mut().fold(span, |sum, new_span| Builder::add_to_span(new_span, sum))
self.as_mut_slice().add_to_span(span)
}
}

View File

@ -137,8 +137,7 @@ impl<'s, T> Token<'s, T> {
#[inline(always)]
pub fn split_at(self, split: code::Length) -> (Token<'s, ()>, Token<'s, ()>) {
let left_lexeme_offset = self.left_offset;
let right_lexeme_offset =
Code::empty(self.code.position_before().range_utf16().end + split.utf16_len());
let right_lexeme_offset = Code::empty(self.code.position_before().range().end + split);
let (left_code, right_code) = self.code.split_at(split);
let left = Token(left_lexeme_offset, left_code, ());
let right = Token(right_lexeme_offset, right_code, ());
@ -170,7 +169,7 @@ impl<'s, V: Clone> Token<'s, V> {
pub fn without_offsets(&self) -> Self {
Self {
left_offset: self.left_offset.without_offset(),
code: self.code.without_offset(),
code: self.code.without_location(),
variant: self.variant.clone(),
}
}

View File

@ -774,7 +774,8 @@ pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> {
func
}
(_, Variant::ArgumentBlockApplication(block)) if block.lhs.is_none() => {
arg.span.code_length += arg.span.left_offset.code.length() + func.span.code_length;
let code = func.span.code_length + arg.span.left_offset.code.length() + arg.span.code_length;
arg.span.code_length = code;
let func_left_offset = func.span.left_offset.take_as_prefix();
let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset);
if let Some(first) = block.arguments.first_mut() {
@ -784,7 +785,8 @@ pub fn apply<'s>(mut func: Tree<'s>, mut arg: Tree<'s>) -> Tree<'s> {
arg
}
(_, Variant::OperatorBlockApplication(block)) if block.lhs.is_none() => {
arg.span.code_length += arg.span.left_offset.code.length() + func.span.code_length;
let code = func.span.code_length + arg.span.left_offset.code.length() + arg.span.code_length;
arg.span.code_length = code;
let func_left_offset = func.span.left_offset.take_as_prefix();
let arg_left_offset = mem::replace(&mut arg.span.left_offset, func_left_offset);
if let Some(first) = block.expressions.first_mut() {