Text literals: Accept unpaired-surrogate escape codes. (#9731)

Unpaired surrogates are not allowed by Unicode, but they occur in practice
because many systems accept them; for example, they may be present in Windows
filenames, which are nominally UTF-16 but are not required to be well-formed.

Programs written in Enso should be able to work with them, if only because they
represent edge cases that should be tested when converting encodings and at
other system boundaries.
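
For background, here is a minimal Rust sketch of the constraint this change
works around (not code from this commit): `char` only admits Unicode scalar
values, so an unpaired surrogate cannot be stored in a `char` at all, which is
why the escape tokens need a wider representation.

    fn main() {
        // An ordinary scalar value converts to `char`.
        assert_eq!(char::from_u32(0x0041), Some('A'));
        // Surrogate code points (U+D800..=U+DFFF) are rejected by `char`,
        // so an escape such as `\uD800` cannot be carried as an `Option<char>`.
        assert_eq!(char::from_u32(0xD800), None);
        // Values above U+10FFFF are not code points at all.
        assert_eq!(char::from_u32(0x110000), None);
    }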

- Generalize the lexer's representation of escapes in interpreted text, so that
  it is not tied to the Unicode scalar values that Rust's `char` and `str` can
  represent (see the sketch after this list).
- Move some doc-comment code from the parser to test utilities.
- Simplify token serialization.
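
As a rough standalone sketch of the representation and serialization points
above (simplified here; the real `Codepoint` type and its serialization
attributes are in the token-definition diff below), a single `u32` can carry a
scalar value, an unpaired surrogate, or a "none" sentinel, so no custom
`Option<char>` serializer is needed:

    // Simplified stand-in for the real `Codepoint` newtype.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct Codepoint(u32);

    impl Codepoint {
        const NONE: Self = Codepoint(0xFFFF_FFFF);

        fn from_u32(value: u32) -> Self {
            // Accept anything `char` accepts, plus the surrogate range.
            if char::from_u32(value).is_some() || (0xD800..=0xDFFF).contains(&value) {
                Codepoint(value)
            } else {
                Self::NONE
            }
        }

        fn to_char(self) -> Option<char> {
            // Surrogates and the sentinel have no `char` form.
            char::from_u32(self.0)
        }
    }

    fn main() {
        assert_eq!(Codepoint::from_u32(0x41).to_char(), Some('A'));
        // An unpaired surrogate is preserved, even though it is not a valid `char`.
        assert_eq!(Codepoint::from_u32(0xD800), Codepoint(0xD800));
        assert_eq!(Codepoint::from_u32(0xD800).to_char(), None);
        // Out-of-range values collapse to the sentinel, matching the 0xFFFF_FFFF
        // encoding the old hand-written serializer used for `None`.
        assert_eq!(Codepoint::from_u32(0x11_0000), Codepoint::NONE);
    }
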
Kaz Wesley 2024-04-18 09:21:05 -04:00 committed by GitHub
parent a444934806
commit 0de490be24
6 changed files with 150 additions and 67 deletions

@@ -1049,15 +1049,15 @@ fn inline_text_literals() {
test!(r#""Non-escape: \n""#, (TextLiteral #((Section "Non-escape: \\n"))));
test!(r#""Non-escape: \""#, (TextLiteral #((Section "Non-escape: \\"))));
test!(r#"'String with \' escape'"#,
(TextLiteral #((Section "String with ") (Escape '\'') (Section " escape"))));
(TextLiteral #((Section "String with ") (Escape 0x27) (Section " escape"))));
test!(r#"'\u0915\u094D\u0937\u093F'"#, (TextLiteral
#((Escape '\u{0915}') (Escape '\u{094D}') (Escape '\u{0937}') (Escape '\u{093F}'))));
test!(r#"('\n')"#, (Group (TextLiteral #((Escape '\n')))));
#((Escape 0x0915) (Escape 0x094D) (Escape 0x0937) (Escape 0x093F))));
test!(r#"('\n')"#, (Group (TextLiteral #((Escape 0x0A)))));
test!(r#"`"#, (Invalid));
test!(r#"(")")"#, (Group (TextLiteral #((Section ")")))));
test!(r#"'\x'"#, (TextLiteral #((Escape ()))));
test!(r#"'\u'"#, (TextLiteral #((Escape ()))));
test!(r#"'\U'"#, (TextLiteral #((Escape ()))));
test!(r#"'\x'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
test!(r#"'\u'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
test!(r#"'\U'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
}
#[test]
@@ -1100,7 +1100,7 @@ x"#;
];
test(code, expected);
let code = "'''\n \\nEscape at start\n";
test!(code, (TextLiteral #((Escape '\n') (Section "Escape at start"))) ());
test!(code, (TextLiteral #((Escape 0x0A) (Section "Escape at start"))) ());
let code = "x =\n x = '''\n x\nx";
#[rustfmt::skip]
let expected = block![
@@ -1111,9 +1111,9 @@ x"#;
test(code, expected);
test!("foo = bar '''\n baz",
(Assignment (Ident foo) "=" (App (Ident bar) (TextLiteral #((Section "baz"))))));
test!("'''\n \\t'", (TextLiteral #((Escape '\t') (Section "'"))));
test!("'''\n \\t'", (TextLiteral #((Escape 0x09) (Section "'"))));
test!("'''\n x\n \\t'",
(TextLiteral #((Section "x") (Newline) (Escape '\t') (Section "'"))));
(TextLiteral #((Section "x") (Newline) (Escape 0x09) (Section "'"))));
}
#[test]
@@ -1126,11 +1126,11 @@ fn interpolated_literals_in_inline_text() {
test!(r#"'` SpliceWithLeadingWhitespace`'"#,
(TextLiteral #((Splice (Ident SpliceWithLeadingWhitespace)))));
test!(r#"'String with \n escape'"#,
(TextLiteral #((Section "String with ") (Escape '\n') (Section " escape"))));
test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
(TextLiteral #((Section "String with ") (Escape 0x0A) (Section " escape"))));
test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
}
#[test]
@@ -1149,7 +1149,7 @@ fn interpolated_literals_in_multiline_text() {
let expected = block![
(TextLiteral
#((Section "text with a ") (Splice (Ident splice)) (Newline)
(Section "and some ") (Escape '\n') (Section "escapes") (Escape '\'')))];
(Section "and some ") (Escape 0x0A) (Section "escapes") (Escape 0x27)))];
test(code, expected);
}

@@ -12,6 +12,8 @@
use enso_doc_parser::*;
use enso_parser::prelude::*;
use enso_parser::syntax::tree::DocComment;
use enso_parser::syntax::tree::TextElement;
// ====================================
@@ -59,7 +61,37 @@ fn extract_docs(_filename: &str, mut code: &str) -> Vec<String> {
}
_ => {}
});
docs.take().into_iter().map(|node| node.content()).collect()
docs.take().iter().map(content).collect()
}
/// Return the contents of the comment, with leading whitespace, the `##` token, and following
/// empty lines removed; newlines will be normalized.
pub fn content(node: &DocComment) -> String {
let mut buf = String::new();
for element in &node.elements {
match element {
TextElement::Section { text } => buf.push_str(&text.code.repr),
TextElement::Newline { .. } => buf.push('\n'),
TextElement::Escape {
token:
token @ enso_parser::syntax::token::TextEscape {
variant: enso_parser::syntax::token::variant::TextEscape { value },
..
},
} => {
if let Some(c) = value.to_char() {
buf.push(c);
} else {
// Invalid escape character, or unpaired surrogate that can't be represented in
// a Rust string.
buf.push_str(**token.code)
}
}
// Unreachable.
TextElement::Splice { .. } => continue,
}
}
buf
}
/// Lex the given documentation, and return the sequence of tokens.

@@ -10,6 +10,7 @@ use crate::syntax::*;
use crate::source::code::Length;
use crate::source::code::Location;
use crate::syntax::token::Codepoint;
use std::str;
@@ -1210,7 +1211,7 @@ impl<'s> Lexer<'s> {
let token = self.make_token(
backslash_start,
sequence_end.clone(),
token::Variant::text_escape(value.and_then(char::from_u32)),
token::Variant::text_escape(value.map(Codepoint::from_u32).unwrap_or_default()),
);
self.output.push(token);
sequence_end
@@ -1236,7 +1237,7 @@ impl<'s> Lexer<'s> {
let token = self.make_token(
backslash_start,
escape_end.clone(),
token::Variant::text_escape(value),
token::Variant::text_escape(value.map(Codepoint::from_char).unwrap_or_default()),
);
self.output.push(token);
escape_end
@@ -1906,6 +1907,43 @@ mod tests {
lex_and_validate_spans(&["## a", "", " b"].join("\n"));
}
fn text_escape_(code: &str, codepoint: Option<u32>) -> Token {
let codepoint = match codepoint {
Some(value) => {
let codepoint = Codepoint::from_u32(value);
assert!(!codepoint.is_none());
codepoint
}
None => Codepoint::none(),
};
text_escape(test_code(""), test_code(code), codepoint).into()
}
#[test]
fn test_text_escapes() {
// Valid Unicode codepoints.
test_lexer("'\\0\\u0\\u{10}\\u{10FFFF}'", vec![
text_start(test_code(""), test_code("'")).into(),
text_escape_("\\0", Some(0)),
text_escape_("\\u0", Some(0)),
text_escape_("\\u{10}", Some(0x10)),
text_escape_("\\u{10FFFF}", Some(0x10_FFFF)),
text_end(test_code(""), test_code("'")).into(),
]);
// Invalid Unicode, but allowed in Enso strings.
test_lexer("'\\uD800'", vec![
text_start(test_code(""), test_code("'")).into(),
text_escape_("\\uD800", Some(0xD800)),
text_end(test_code(""), test_code("'")).into(),
]);
// Invalid and disallowed.
test_lexer("'\\u{110000}'", vec![
text_start(test_code(""), test_code("'")).into(),
text_escape_("\\u{110000}", None),
text_end(test_code(""), test_code("'")).into(),
]);
}
#[test]
fn test_indented_doc_after_blank_line() {
let code = ["type Redshift_Error_Mapper", "", " A"].join("\n");

@@ -58,29 +58,6 @@ where D: serde::Deserializer<'de> {
// ==============
// === Tokens ===
// ==============
pub(crate) fn serialize_optional_char<S>(c: &Option<char>, s: S) -> Result<S::Ok, S::Error>
where S: serde::Serializer {
let value = c.map(|c| c as u32).unwrap_or(0xFFFF_FFFF);
s.serialize_u32(value)
}
pub(crate) fn deserialize_optional_char<'c, 'de, D>(
deserializer: D,
) -> Result<Option<char>, D::Error>
where D: serde::Deserializer<'de> {
let value = deserializer.deserialize_u32(DeserializeU32)?;
Ok(match value {
0xFFFF_FFFF => None,
x => Some(char::try_from(x).unwrap()),
})
}
// =============
// === Error ===
// =============

@@ -282,10 +282,7 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! { $($arg
TextEnd,
TextSection,
TextEscape {
#[serde(serialize_with = "crate::serialization::serialize_optional_char")]
#[serde(deserialize_with = "crate::serialization::deserialize_optional_char")]
#[reflect(as = "char")]
pub value: Option<char>,
pub value: Codepoint,
},
TextInitialNewline,
TextNewline,
@@ -594,6 +591,67 @@ pub enum Base {
}
// === Text literals ===
/// Represents any of:
/// - A valid Unicode codepoint (i.e. a `char`).
/// - A value that does not constitute a legal codepoint according to the Unicode standard, but is
/// allowed in Enso strings and can be included in Enso text as an escape sequence. This includes
/// unpaired surrogates.
/// - A value representing the absence of a valid Unicode codepoint; this is included in the
/// `Codepoint` type rather than using `Option<Codepoint>` in order to simplify defining efficient
/// serialization for optional codepoints.
#[derive(Clone, Copy, PartialEq, Eq, Serialize, Reflect, Deserialize, Debug)]
#[reflect(transparent)]
pub struct Codepoint(#[reflect(as = "char")] u32);
impl Default for Codepoint {
fn default() -> Self {
Codepoint::none()
}
}
impl Codepoint {
/// Cast a `char` to a `Codepoint`; this is a widening conversion and will never result in
/// `Codepoint::none`.
pub const fn from_char(value: char) -> Self {
Codepoint(value as u32)
}
fn is_allowed_invalid_codepoint(value: u32) -> bool {
let unpaired_surrogates = 0xD800..=0xDFFF;
unpaired_surrogates.contains(&value)
}
/// Create either a valid `Codepoint` or `Codepoint::none` from the given value.
pub fn from_u32(value: u32) -> Self {
if let Some(c) = char::from_u32(value) {
Self::from_char(c)
} else if Self::is_allowed_invalid_codepoint(value) {
Codepoint(value)
} else {
Codepoint::none()
}
}
/// Return the representation of an unspecified or out-of-range codepoint.
pub const fn none() -> Self {
Codepoint(0xFFFF_FFFF)
}
/// Return true if this value is `Codepoint::none`.
pub const fn is_none(self) -> bool {
self.0 == Self::none().0
}
/// Return the value as a `char`, if it is a valid unicode Codepoint (and not
/// `Codepoint::none` or an unpaired surrogate).
pub const fn to_char(self) -> Option<char> {
char::from_u32(self.0)
}
}
// === Macro-based implementation ===
macro_rules! generate_token_aliases {

@@ -497,28 +497,6 @@ pub struct DocComment<'s> {
pub newlines: Vec<token::Newline<'s>>,
}
impl<'s> DocComment<'s> {
/// Return the contents of the comment, with leading whitespace, the `##` token, and following
/// empty lines removed; newlines will be normalized.
pub fn content(&self) -> String {
let mut buf = String::new();
for element in &self.elements {
match element {
TextElement::Section { text } => buf.push_str(&text.code.repr),
TextElement::Newline { .. } => buf.push('\n'),
TextElement::Escape { token } if let Some(c) = token.value => {
buf.push(c);
}
// Invalid escape character, ignore it.
TextElement::Escape { .. } => (),
// Unreachable.
TextElement::Splice { .. } => continue,
}
}
buf
}
}
impl<'s> span::Builder<'s> for DocComment<'s> {
fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> {
span.add(&mut self.open).add(&mut self.elements).add(&mut self.newlines)