Parser: Newline normalization in text literals (#3903)

- Newlines in text literals are now normalized to `\n` when producing IR representation.
- Re-enabled tests that were dependent on the old behavior.
2024-12-23 05:41:32 +03:00 · 2022-11-27 01:40:44 -08:00 · 2022-11-27 01:40:44 -08:00 · 336bbf505c
commit 336bbf505c
parent 1b52ae239f
9 changed files with 84 additions and 78 deletions
--- a/engine/runtime/src/main/java/org/enso/compiler/TreeToIr.java
+++ b/engine/runtime/src/main/java/org/enso/compiler/TreeToIr.java
@ -1006,6 +1006,7 @@ final class TreeToIr {
            sb.appendCodePoint(val);
          }
        }
+        case TextElement.Newline n -> sb.append('\n');
        default -> throw new UnhandledEntity(t, "buildTextConstant");
      }
    }
--- a/lib/rust/parser/debug/tests/parse.rs
+++ b/lib/rust/parser/debug/tests/parse.rs
@ -125,9 +125,9 @@ fn doc_comments() {
    #[rustfmt::skip]
    test(&lines.join("\n"), block![
        (Documented
-         (#((Section " The Identity Function") (Section "\n")
-           (Section "\n")
-           (Section "Arguments:") (Section "\n")
+         (#((Section " The Identity Function") (Newline)
+           (Newline)
+           (Section "Arguments:") (Newline)
           (Section "- x: value to do nothing to"))
         #(()))
         (Function (Ident id) #((() (Ident x) () ())) "=" (Ident x)))]);
@ -919,10 +919,10 @@ x"#;
    #[rustfmt::skip]
    let expected = block![
        (TextLiteral
-         #((Section "part of the string") (Section "\n")
-           (Section "   3-spaces indented line, part of the Text Block") (Section "\n")
-           (Section "this does not end the string -> '''") (Section "\n")
-           (Section "\n")
+         #((Section "part of the string") (Newline)
+           (Section "   3-spaces indented line, part of the Text Block") (Newline)
+           (Section "this does not end the string -> '''") (Newline)
+           (Newline)
           (Section "`also` part of the string")))
        ()
        (Ident x)
@ -959,7 +959,7 @@ x"#;
        (Assignment (Ident foo) "=" (App (Ident bar) (TextLiteral #((Section "baz"))))));
    test!("'''\n \\t'", (TextLiteral #((Escape '\t') (Section "'"))));
    test!("'''\n x\n \\t'",
-        (TextLiteral #((Section "x") (Section "\n") (Escape '\t') (Section "'"))));
+        (TextLiteral #((Section "x") (Newline) (Escape '\t') (Section "'"))));
 }

 #[test]
@ -1003,7 +1003,7 @@ fn interpolated_literals_in_multiline_text() {
    #[rustfmt::skip]
    let expected = block![
        (TextLiteral
-         #((Section "text with a ") (Splice (Ident splice)) (Section "\n")
+         #((Section "text with a ") (Splice (Ident splice)) (Newline)
           (Section "and some ") (Escape '\n') (Section "escapes") (Escape '\'')))];
    test(code, expected);
 }
--- a/lib/rust/parser/src/lexer.rs
+++ b/lib/rust/parser/src/lexer.rs
@ -1008,7 +1008,7 @@ impl<'s> Lexer<'s> {
                    }
                    let newlines = newlines
                        .into_iter()
-                        .map(|token| token.with_variant(token::Variant::text_section()));
+                        .map(|token| token.with_variant(token::Variant::text_newline()));
                    self.output.extend(newlines);
                    continue;
                }
--- a/lib/rust/parser/src/macros/resolver.rs
+++ b/lib/rust/parser/src/macros/resolver.rs
@ -480,7 +480,7 @@ impl<'s> Resolver<'s> {
                    syntax::Item::Tree(tree) => body.push_str(&tree.code()),
                }
            }
-            let header0 = syntax::Tree::from(header0).with_error("Invalid macro invocation.");
+            let header0 = syntax::tree::to_ast(header0).with_error("Invalid macro invocation.");
            (header0, items)
        }
    }
--- a/lib/rust/parser/src/syntax/item.rs
+++ b/lib/rust/parser/src/syntax/item.rs
@ -50,15 +50,6 @@ impl<'s> Item<'s> {
        }
    }

-    /// Convert this item to a [`Tree`].
-    pub fn to_ast(self) -> Tree<'s> {
-        match self {
-            Item::Token(token) => token.into(),
-            Item::Tree(ast) => ast,
-            Item::Block(items) => build_block(items),
-        }
-    }
-
    /// If this item is an [`Item::Tree`], apply the given function to the contained [`Tree`] and
    /// return the result.
    pub fn map_tree<'t: 's, F>(self, f: F) -> Self
@ -90,7 +81,7 @@ impl<'s> TryAsRef<Item<'s>> for Item<'s> {

 /// Given a sequence of [`Line`]s belonging to one block, create an AST block node, of a type
 /// determined by the syntax of the lines in the block.
-fn build_block<'s>(lines: impl IntoIterator<Item = Line<'s>>) -> Tree<'s> {
+pub fn build_block<'s>(lines: impl IntoIterator<Item = Line<'s>>) -> Tree<'s> {
    let mut block_builder = tree::block::Builder::new();
    let mut precedence = operator::Precedence::new();
    for Line { newline, items } in lines {
--- a/lib/rust/parser/src/syntax/operator.rs
+++ b/lib/rust/parser/src/syntax/operator.rs
@ -65,9 +65,10 @@ impl<'s> Precedence<'s> {
                code,
            }) => self.nospace_builder.operator(Token(left_offset, code, opr)),
            syntax::Item::Token(token) =>
-                self.nospace_builder.operand(syntax::Tree::from(token).into()),
+                self.nospace_builder.operand(syntax::tree::to_ast(token).into()),
            syntax::Item::Tree(tree) => self.nospace_builder.operand(tree.into()),
-            syntax::Item::Block(_) => self.nospace_builder.operand(item.to_ast().into()),
+            syntax::Item::Block(lines) =>
+                self.nospace_builder.operand(syntax::item::build_block(lines).into()),
        }
    }

--- a/lib/rust/parser/src/syntax/token.rs
+++ b/lib/rust/parser/src/syntax/token.rs
@ -291,6 +291,7 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! { $($arg
            pub value: Option<char>,
        },
        TextInitialNewline,
+        TextNewline,
        Invalid,
    }
 }}}
--- a/lib/rust/parser/src/syntax/tree.rs
+++ b/lib/rust/parser/src/syntax/tree.rs
@ -505,6 +505,12 @@ pub enum TextElement<'s> {
        /// The escape sequence.
        token: token::TextEscape<'s>,
    },
+    /// A logical newline.
+    Newline {
+        /// The newline token. The semantics of a logical newline are independent of the specific
+        /// characters in the input, which are generally platform-dependent.
+        newline: token::Newline<'s>,
+    },
    /// An interpolated section within a text literal.
    Splice {
        /// The opening ` character.
@ -523,6 +529,7 @@ impl<'s> span::Builder<'s> for TextElement<'s> {
            TextElement::Escape { token } => span.add(token),
            TextElement::Splice { open, expression, close } =>
                span.add(open).add(expression).add(close),
+            TextElement::Newline { newline } => span.add(newline),
        }
    }
 }
@ -856,6 +863,7 @@ pub fn join_text_literals<'s>(
        Some(TextElement::Section { text }) => text.left_offset += rhs_span.left_offset,
        Some(TextElement::Escape { token }) => token.left_offset += rhs_span.left_offset,
        Some(TextElement::Splice { open, .. }) => open.left_offset += rhs_span.left_offset,
+        Some(TextElement::Newline { newline }) => newline.left_offset += rhs_span.left_offset,
        None => (),
    }
    if let Some(newline) = rhs.newline.take() {
@ -939,52 +947,56 @@ pub fn apply_unary_operator<'s>(opr: token::Operator<'s>, rhs: Option<Tree<'s>>)
    }
 }

-impl<'s> From<Token<'s>> for Tree<'s> {
-    fn from(token: Token<'s>) -> Self {
-        match token.variant {
-            token::Variant::Ident(ident) => token.with_variant(ident).into(),
-            token::Variant::Digits(number) =>
-                Tree::number(None, Some(token.with_variant(number)), None),
-            token::Variant::NumberBase(base) =>
-                Tree::number(Some(token.with_variant(base)), None, None),
-            token::Variant::TextStart(open) =>
-                Tree::text_literal(Some(token.with_variant(open)), default(), default(), default(), default()),
-            token::Variant::TextSection(section) => {
-                let section = TextElement::Section { text: token.with_variant(section) };
-                Tree::text_literal(default(), default(), vec![section], default(), default())
-            }
-            token::Variant::TextEscape(escape) => {
-                let token = token.with_variant(escape);
-                let section = TextElement::Escape { token };
-                Tree::text_literal(default(), default(), vec![section], default(), default())
-            }
-            token::Variant::TextEnd(_) if token.code.is_empty() =>
-                Tree::text_literal(default(), default(), default(), default(), true),
-            token::Variant::TextEnd(close) =>
-                Tree::text_literal(default(), default(), default(), Some(token.with_variant(close)), true),
-            token::Variant::TextInitialNewline(_) =>
-                Tree::text_literal(default(), Some(token::newline(token.left_offset, token.code)), default(), default(), default()),
-            token::Variant::Wildcard(wildcard) => Tree::wildcard(token.with_variant(wildcard), default()),
-            token::Variant::AutoScope(t) => Tree::auto_scope(token.with_variant(t)),
-            token::Variant::OpenSymbol(s) =>
-                Tree::group(Some(token.with_variant(s)), default(), default()).with_error("Unmatched delimiter"),
-            token::Variant::CloseSymbol(s) =>
-                Tree::group(default(), default(), Some(token.with_variant(s))).with_error("Unmatched delimiter"),
-            // These should be unreachable: They are handled when assembling items into blocks,
-            // before parsing proper.
-            token::Variant::Newline(_)
-            | token::Variant::BlockStart(_)
-            | token::Variant::BlockEnd(_)
-            // This should be unreachable: `resolve_operator_precedence` doesn't calls `to_ast` for
-            // operators.
-            | token::Variant::Operator(_)
-            // Map an error case in the lexer to an error in the AST.
-            | token::Variant::Invalid(_) => {
-                let message = format!("Unexpected token: {token:?}");
-                let ident = token::variant::Ident(false, 0, false, false, false);
-                let value = Tree::ident(token.with_variant(ident));
-                Tree::with_error(value, message)
-            }
+/// Create an AST node for a token.
+pub fn to_ast(token: Token) -> Tree {
+    match token.variant {
+        token::Variant::Ident(ident) => token.with_variant(ident).into(),
+        token::Variant::Digits(number) =>
+            Tree::number(None, Some(token.with_variant(number)), None),
+        token::Variant::NumberBase(base) =>
+            Tree::number(Some(token.with_variant(base)), None, None),
+        token::Variant::TextStart(open) =>
+            Tree::text_literal(Some(token.with_variant(open)), default(), default(), default(), default()),
+        token::Variant::TextSection(section) => {
+            let section = TextElement::Section { text: token.with_variant(section) };
+            Tree::text_literal(default(), default(), vec![section], default(), default())
+        }
+        token::Variant::TextEscape(escape) => {
+            let token = token.with_variant(escape);
+            let section = TextElement::Escape { token };
+            Tree::text_literal(default(), default(), vec![section], default(), default())
+        }
+        token::Variant::TextEnd(_) if token.code.is_empty() =>
+            Tree::text_literal(default(), default(), default(), default(), true),
+        token::Variant::TextEnd(close) =>
+            Tree::text_literal(default(), default(), default(), Some(token.with_variant(close)), true),
+        token::Variant::TextInitialNewline(_) =>
+            Tree::text_literal(default(), Some(token::newline(token.left_offset, token.code)), default(), default(), default()),
+        token::Variant::TextNewline(_) => {
+            let newline = token::newline(token.left_offset, token.code);
+            let newline = TextElement::Newline { newline };
+            Tree::text_literal(default(), default(), vec![newline], default(), default())
+        }
+        token::Variant::Wildcard(wildcard) => Tree::wildcard(token.with_variant(wildcard), default()),
+        token::Variant::AutoScope(t) => Tree::auto_scope(token.with_variant(t)),
+        token::Variant::OpenSymbol(s) =>
+            Tree::group(Some(token.with_variant(s)), default(), default()).with_error("Unmatched delimiter"),
+        token::Variant::CloseSymbol(s) =>
+            Tree::group(default(), default(), Some(token.with_variant(s))).with_error("Unmatched delimiter"),
+        // These should be unreachable: They are handled when assembling items into blocks,
+        // before parsing proper.
+        token::Variant::Newline(_)
+        | token::Variant::BlockStart(_)
+        | token::Variant::BlockEnd(_)
+        // This should be unreachable: `resolve_operator_precedence` doesn't call `to_ast` for
+        // operators.
+        | token::Variant::Operator(_)
+        // Map an error case in the lexer to an error in the AST.
+        | token::Variant::Invalid(_) => {
+            let message = format!("Unexpected token: {token:?}");
+            let ident = token::variant::Ident(false, 0, false, false, false);
+            let value = Tree::ident(token.with_variant(ident));
+            Tree::with_error(value, message)
        }
    }
 }
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@ -246,8 +246,8 @@ spec =
        Test.specify "should escape special characters when debug-printing text" <|
            text_1 = '''
                foo
-                bar\tbaz
-            (text_1.replace '\r' "").to_text.should_equal "'foo\nbar\tbaz'"
+                bar\r\tbaz
+            text_1.to_text.should_equal "'foo\nbar\r\tbaz'"
            text_2 = '\n\t\a\b\f\r\v\e\''
            text_2.to_text.should_equal "'\n\t\a\b\f\r\v\e\''"

@ -830,8 +830,8 @@ spec =
            long_text = """
                Hello from a long text. EOL
                SOL Hmm...
-            (long_text.replace '\r' "") . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
-            (long_text.replace '\r' "") . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false
+            long_text . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
+            long_text . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false

        Test.specify "should check for starts_with using Unicode normalization" <|
            "Hello".starts_with "He" . should_be_true
@ -917,8 +917,8 @@ spec =
            long_text = """
                EOL
                SOL Hmm...
-            (long_text.replace '\r' "") . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
-            (long_text.replace '\r' "") . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false
+            long_text . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
+            long_text . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false

            "aaazzz" . starts_with "a|b" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive) . should_be_true
            "bbbzzz" . starts_with "a|b" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive) . should_be_true
@ -997,8 +997,8 @@ spec =
            long_text = """
                Hnnnn EOL
                SOL
-            (long_text.replace '\r' "") . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=True) . should_be_true
-            (long_text.replace '\r' "") . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=False) . should_be_false
+            long_text . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=True) . should_be_true
+            long_text . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=False) . should_be_false

            "zzzaaa" . ends_with "a|b" Regex_Matcher.Regex_Matcher_Data . should_be_true
            "zzzbbb" . ends_with "a|b" Regex_Matcher.Regex_Matcher_Data . should_be_true
@ -1604,7 +1604,7 @@ spec =
            text = """
                Foo
                bar
-            r4 = text.replace "(\n|\r)" ""  matcher=(Regex_Matcher.Regex_Matcher_Data multiline=True)
+            r4 = text.replace '\n' ""  matcher=(Regex_Matcher.Regex_Matcher_Data multiline=True)
            r4 . should_equal "Foobar"

            r5 = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" matcher=(Regex_Matcher.Regex_Matcher_Data comments=True)