Text literals: Accept unpaired-surrogate escape codes. (#9731)

Unpaired surrogates are not allowed by Unicode, but they occur in practice
because many systems accept them; for example, they may be present in Windows
filenames, which are nominally UTF-16 but are not required to be well-formed.

Programs written in Enso should be able to work with them, if only because they
represent edge cases that should be tested when converting encodings and at
other system boundaries.
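
For background, here is a minimal Rust sketch of the constraint this change
works around (not code from this commit): `char` only admits Unicode scalar
values, so an unpaired surrogate cannot be stored in a `char` at all, which is
why the escape tokens need a wider representation.

    fn main() {
        // An ordinary scalar value converts to `char`.
        assert_eq!(char::from_u32(0x0041), Some('A'));
        // Surrogate code points (U+D800..=U+DFFF) are rejected by `char`,
        // so an escape such as `\uD800` cannot be carried as an `Option<char>`.
        assert_eq!(char::from_u32(0xD800), None);
        // Values above U+10FFFF are not code points at all.
        assert_eq!(char::from_u32(0x110000), None);
    }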

- Generalize the lexer's representation of escapes in interpreted text, so that
  it is not tied to the Unicode scalar values that Rust's `char` and `str` can
  represent (see the sketch after this list).
- Move some doc-comment code from the parser to test utilities.
- Simplify token serialization.
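
As a rough standalone sketch of the representation and serialization points
above (simplified here; the real `Codepoint` type and its serialization
attributes are in the token-definition diff below), a single `u32` can carry a
scalar value, an unpaired surrogate, or a "none" sentinel, so no custom
`Option<char>` serializer is needed:

    // Simplified stand-in for the real `Codepoint` newtype.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct Codepoint(u32);

    impl Codepoint {
        const NONE: Self = Codepoint(0xFFFF_FFFF);

        fn from_u32(value: u32) -> Self {
            // Accept anything `char` accepts, plus the surrogate range.
            if char::from_u32(value).is_some() || (0xD800..=0xDFFF).contains(&value) {
                Codepoint(value)
            } else {
                Self::NONE
            }
        }

        fn to_char(self) -> Option<char> {
            // Surrogates and the sentinel have no `char` form.
            char::from_u32(self.0)
        }
    }

    fn main() {
        assert_eq!(Codepoint::from_u32(0x41).to_char(), Some('A'));
        // An unpaired surrogate is preserved, even though it is not a valid `char`.
        assert_eq!(Codepoint::from_u32(0xD800), Codepoint(0xD800));
        assert_eq!(Codepoint::from_u32(0xD800).to_char(), None);
        // Out-of-range values collapse to the sentinel, matching the 0xFFFF_FFFF
        // encoding the old hand-written serializer used for `None`.
        assert_eq!(Codepoint::from_u32(0x11_0000), Codepoint::NONE);
    }
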
Kaz Wesley 2024-04-18 09:21:05 -04:00 committed by GitHub
parent a444934806
commit 0de490be24
6 changed files with 150 additions and 67 deletions

@@ -1049,15 +1049,15 @@ fn inline_text_literals() {
test!(r#""Non-escape: \n""#, (TextLiteral #((Section "Non-escape: \\n"))));
test!(r#""Non-escape: \""#, (TextLiteral #((Section "Non-escape: \\"))));
test!(r#"'String with \' escape'"#,
(TextLiteral #((Section "String with ") (Escape '\'') (Section " escape"))));
(TextLiteral #((Section "String with ") (Escape 0x27) (Section " escape"))));
test!(r#"'\u0915\u094D\u0937\u093F'"#, (TextLiteral
#((Escape '\u{0915}') (Escape '\u{094D}') (Escape '\u{0937}') (Escape '\u{093F}'))));
test!(r#"('\n')"#, (Group (TextLiteral #((Escape '\n')))));
#((Escape 0x0915) (Escape 0x094D) (Escape 0x0937) (Escape 0x093F))));
test!(r#"('\n')"#, (Group (TextLiteral #((Escape 0x0A)))));
test!(r#"`"#, (Invalid));
test!(r#"(")")"#, (Group (TextLiteral #((Section ")")))));
test!(r#"'\x'"#, (TextLiteral #((Escape ()))));
test!(r#"'\u'"#, (TextLiteral #((Escape ()))));
test!(r#"'\U'"#, (TextLiteral #((Escape ()))));
test!(r#"'\x'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
test!(r#"'\u'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
test!(r#"'\U'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
}
#[test]
@@ -1100,7 +1100,7 @@ x"#;
];
test(code, expected);
let code = "'''\n \\nEscape at start\n";
test!(code, (TextLiteral #((Escape '\n') (Section "Escape at start"))) ());
test!(code, (TextLiteral #((Escape 0x0A) (Section "Escape at start"))) ());
let code = "x =\n x = '''\n x\nx";
#[rustfmt::skip]
let expected = block![
@@ -1111,9 +1111,9 @@ x"#;
test(code, expected);
test!("foo = bar '''\n baz",
(Assignment (Ident foo) "=" (App (Ident bar) (TextLiteral #((Section "baz"))))));
test!("'''\n \\t'", (TextLiteral #((Escape '\t') (Section "'"))));
test!("'''\n \\t'", (TextLiteral #((Escape 0x09) (Section "'"))));
test!("'''\n x\n \\t'",
(TextLiteral #((Section "x") (Newline) (Escape '\t') (Section "'"))));
(TextLiteral #((Section "x") (Newline) (Escape 0x09) (Section "'"))));
}
#[test]
@@ -1126,11 +1126,11 @@ fn interpolated_literals_in_inline_text() {
test!(r#"'` SpliceWithLeadingWhitespace`'"#,
(TextLiteral #((Splice (Ident SpliceWithLeadingWhitespace)))));
test!(r#"'String with \n escape'"#,
(TextLiteral #((Section "String with ") (Escape '\n') (Section " escape"))));
test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
(TextLiteral #((Section "String with ") (Escape 0x0A) (Section " escape"))));
test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
}
#[test]
@@ -1149,7 +1149,7 @@ fn interpolated_literals_in_multiline_text() {
let expected = block![
(TextLiteral
#((Section "text with a ") (Splice (Ident splice)) (Newline)
(Section "and some ") (Escape '\n') (Section "escapes") (Escape '\'')))];
(Section "and some ") (Escape 0x0A) (Section "escapes") (Escape 0x27)))];
test(code, expected);
}

@@ -12,6 +12,8 @@
use enso_doc_parser::*;
use enso_parser::prelude::*;
use enso_parser::syntax::tree::DocComment;
use enso_parser::syntax::tree::TextElement;
// ====================================
@@ -59,7 +61,37 @@ fn extract_docs(_filename: &str, mut code: &str) -> Vec<String> {
}
_ => {}
});
docs.take().into_iter().map(|node| node.content()).collect()
docs.take().iter().map(content).collect()
}
/// Return the contents of the comment, with leading whitespace, the `##` token, and following
/// empty lines removed; newlines will be normalized.
pub fn content(node: &DocComment) -> String {
let mut buf = String::new();
for element in &node.elements {
match element {
TextElement::Section { text } => buf.push_str(&text.code.repr),
TextElement::Newline { .. } => buf.push('\n'),
TextElement::Escape {
token:
token @ enso_parser::syntax::token::TextEscape {
variant: enso_parser::syntax::token::variant::TextEscape { value },
..
},
} => {
if let Some(c) = value.to_char() {
buf.push(c);
} else {
// Invalid escape character, or unpaired surrogate that can't be represented in
// a Rust string.
buf.push_str(**token.code)
}
}
// Unreachable.
TextElement::Splice { .. } => continue,
}
}
buf
}
/// Lex the given documentation, and return the sequence of tokens.

@@ -10,6 +10,7 @@ use crate::syntax::*;
use crate::source::code::Length;
use crate::source::code::Location;
use crate::syntax::token::Codepoint;
use std::str;
@@ -1210,7 +1211,7 @@ impl<'s> Lexer<'s> {
let token = self.make_token(
backslash_start,
sequence_end.clone(),
token::Variant::text_escape(value.and_then(char::from_u32)),
token::Variant::text_escape(value.map(Codepoint::from_u32).unwrap_or_default()),
);
self.output.push(token);
sequence_end
@@ -1236,7 +1237,7 @@ impl<'s> Lexer<'s> {
let token = self.make_token(
backslash_start,
escape_end.clone(),
token::Variant::text_escape(value),
token::Variant::text_escape(value.map(Codepoint::from_char).unwrap_or_default()),
);
self.output.push(token);
escape_end
@@ -1906,6 +1907,43 @@ mod tests {
lex_and_validate_spans(&["## a", "", " b"].join("\n"));
}
fn text_escape_(code: &str, codepoint: Option<u32>) -> Token {
let codepoint = match codepoint {
Some(value) => {
let codepoint = Codepoint::from_u32(value);
assert!(!codepoint.is_none());
codepoint
}
None => Codepoint::none(),
};
text_escape(test_code(""), test_code(code), codepoint).into()
}
#[test]
fn test_text_escapes() {
// Valid Unicode codepoints.
test_lexer("'\\0\\u0\\u{10}\\u{10FFFF}'", vec![
text_start(test_code(""), test_code("'")).into(),
text_escape_("\\0", Some(0)),
text_escape_("\\u0", Some(0)),
text_escape_("\\u{10}", Some(0x10)),
text_escape_("\\u{10FFFF}", Some(0x10_FFFF)),
text_end(test_code(""), test_code("'")).into(),
]);
// Invalid Unicode, but allowed in Enso strings.
test_lexer("'\\uD800'", vec![
text_start(test_code(""), test_code("'")).into(),
text_escape_("\\uD800", Some(0xD800)),
text_end(test_code(""), test_code("'")).into(),
]);
// Invalid and disallowed.
test_lexer("'\\u{110000}'", vec![
text_start(test_code(""), test_code("'")).into(),
text_escape_("\\u{110000}", None),
text_end(test_code(""), test_code("'")).into(),
]);
}
#[test]
fn test_indented_doc_after_blank_line() {
let code = ["type Redshift_Error_Mapper", "", " A"].join("\n");

@@ -58,29 +58,6 @@ where D: serde::Deserializer<'de> {
// ==============
// === Tokens ===
// ==============
pub(crate) fn serialize_optional_char<S>(c: &Option<char>, s: S) -> Result<S::Ok, S::Error>
where S: serde::Serializer {
let value = c.map(|c| c as u32).unwrap_or(0xFFFF_FFFF);
s.serialize_u32(value)
}
pub(crate) fn deserialize_optional_char<'c, 'de, D>(
deserializer: D,
) -> Result<Option<char>, D::Error>
where D: serde::Deserializer<'de> {
let value = deserializer.deserialize_u32(DeserializeU32)?;
Ok(match value {
0xFFFF_FFFF => None,
x => Some(char::try_from(x).unwrap()),
})
}
// =============
// === Error ===
// =============

@@ -282,10 +282,7 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! { $($arg
TextEnd,
TextSection,
TextEscape {
#[serde(serialize_with = "crate::serialization::serialize_optional_char")]
#[serde(deserialize_with = "crate::serialization::deserialize_optional_char")]
#[reflect(as = "char")]
pub value: Option<char>,
pub value: Codepoint,
},
TextInitialNewline,
TextNewline,
@@ -594,6 +591,67 @@ pub enum Base {
}
// === Text literals ===
/// Represents any of:
/// - A valid Unicode codepoint (i.e. a `char`).
/// - A value that does not constitute a legal codepoint according to the Unicode standard, but is
/// allowed in Enso strings and can be included in Enso text as an escape sequence. This includes
/// unpaired surrogates.
/// - A value representing the absence of a valid Unicode codepoint; this is included in the
/// `Codepoint` type rather than using `Option<Codepoint>` in order to simplify defining efficient
/// serialization for optional codepoints.
#[derive(Clone, Copy, PartialEq, Eq, Serialize, Reflect, Deserialize, Debug)]
#[reflect(transparent)]
pub struct Codepoint(#[reflect(as = "char")] u32);
impl Default for Codepoint {
fn default() -> Self {
Codepoint::none()
}
}
impl Codepoint {
/// Cast a `char` to a `Codepoint`; this is a widening conversion and will never result in
/// `Codepoint::none`.
pub const fn from_char(value: char) -> Self {
Codepoint(value as u32)
}
fn is_allowed_invalid_codepoint(value: u32) -> bool {
let unpaired_surrogates = 0xD800..=0xDFFF;
unpaired_surrogates.contains(&value)
}
/// Create either a valid `Codepoint` or `Codepoint::none` from the given value.
pub fn from_u32(value: u32) -> Self {
if let Some(c) = char::from_u32(value) {
Self::from_char(c)
} else if Self::is_allowed_invalid_codepoint(value) {
Codepoint(value)
} else {
Codepoint::none()
}
}
/// Return the representation of an unspecified or out-of-range codepoint.
pub const fn none() -> Self {
Codepoint(0xFFFF_FFFF)
}
/// Return true if this value is `Codepoint::none`.
pub const fn is_none(self) -> bool {
self.0 == Self::none().0
}
/// Return the value as a `char`, if it is a valid unicode Codepoint (and not
/// `Codepoint::none` or an unpaired surrogate).
pub const fn to_char(self) -> Option<char> {
char::from_u32(self.0)
}
}
// === Macro-based implementation ===
macro_rules! generate_token_aliases {

@@ -497,28 +497,6 @@ pub struct DocComment<'s> {
pub newlines: Vec<token::Newline<'s>>,
}
impl<'s> DocComment<'s> {
/// Return the contents of the comment, with leading whitespace, the `##` token, and following
/// empty lines removed; newlines will be normalized.
pub fn content(&self) -> String {
let mut buf = String::new();
for element in &self.elements {
match element {
TextElement::Section { text } => buf.push_str(&text.code.repr),
TextElement::Newline { .. } => buf.push('\n'),
TextElement::Escape { token } if let Some(c) = token.value => {
buf.push(c);
}
// Invalid escape character, ignore it.
TextElement::Escape { .. } => (),
// Unreachable.
TextElement::Splice { .. } => continue,
}
}
buf
}
}
impl<'s> span::Builder<'s> for DocComment<'s> {
fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> {
span.add(&mut self.open).add(&mut self.elements).add(&mut self.newlines)