mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 22:10:15 +03:00
Text literals: Accept unpaired-surrogate escape codes. (#9731)
* Text literals: Accept unpaired-surrogate escape codes.

  Unpaired surrogates are not allowed by Unicode, but they occur in practice because many systems accept them; for example, they may be present in filenames on Windows (which are otherwise constrained to UTF-16). Programs written in Enso should be able to work with them, if only because they represent edge cases that should be tested when converting encodings and at other system boundaries.

  - Generalize the representation of interpreted-text escapes in the lexer, so that we are not tied to the strict Unicode of Rust's `str`.
  - Move some doc-comment code from the parser to test utilities.
  - Simplify token serialization.
This commit is contained in:
parent
a444934806
commit
0de490be24
@ -1049,15 +1049,15 @@ fn inline_text_literals() {
|
||||
test!(r#""Non-escape: \n""#, (TextLiteral #((Section "Non-escape: \\n"))));
|
||||
test!(r#""Non-escape: \""#, (TextLiteral #((Section "Non-escape: \\"))));
|
||||
test!(r#"'String with \' escape'"#,
|
||||
(TextLiteral #((Section "String with ") (Escape '\'') (Section " escape"))));
|
||||
(TextLiteral #((Section "String with ") (Escape 0x27) (Section " escape"))));
|
||||
test!(r#"'\u0915\u094D\u0937\u093F'"#, (TextLiteral
|
||||
#((Escape '\u{0915}') (Escape '\u{094D}') (Escape '\u{0937}') (Escape '\u{093F}'))));
|
||||
test!(r#"('\n')"#, (Group (TextLiteral #((Escape '\n')))));
|
||||
#((Escape 0x0915) (Escape 0x094D) (Escape 0x0937) (Escape 0x093F))));
|
||||
test!(r#"('\n')"#, (Group (TextLiteral #((Escape 0x0A)))));
|
||||
test!(r#"`"#, (Invalid));
|
||||
test!(r#"(")")"#, (Group (TextLiteral #((Section ")")))));
|
||||
test!(r#"'\x'"#, (TextLiteral #((Escape ()))));
|
||||
test!(r#"'\u'"#, (TextLiteral #((Escape ()))));
|
||||
test!(r#"'\U'"#, (TextLiteral #((Escape ()))));
|
||||
test!(r#"'\x'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
|
||||
test!(r#"'\u'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
|
||||
test!(r#"'\U'"#, (TextLiteral #((Escape 0xFFFFFFFFu32))));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -1100,7 +1100,7 @@ x"#;
|
||||
];
|
||||
test(code, expected);
|
||||
let code = "'''\n \\nEscape at start\n";
|
||||
test!(code, (TextLiteral #((Escape '\n') (Section "Escape at start"))) ());
|
||||
test!(code, (TextLiteral #((Escape 0x0A) (Section "Escape at start"))) ());
|
||||
let code = "x =\n x = '''\n x\nx";
|
||||
#[rustfmt::skip]
|
||||
let expected = block![
|
||||
@ -1111,9 +1111,9 @@ x"#;
|
||||
test(code, expected);
|
||||
test!("foo = bar '''\n baz",
|
||||
(Assignment (Ident foo) "=" (App (Ident bar) (TextLiteral #((Section "baz"))))));
|
||||
test!("'''\n \\t'", (TextLiteral #((Escape '\t') (Section "'"))));
|
||||
test!("'''\n \\t'", (TextLiteral #((Escape 0x09) (Section "'"))));
|
||||
test!("'''\n x\n \\t'",
|
||||
(TextLiteral #((Section "x") (Newline) (Escape '\t') (Section "'"))));
|
||||
(TextLiteral #((Section "x") (Newline) (Escape 0x09) (Section "'"))));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -1126,11 +1126,11 @@ fn interpolated_literals_in_inline_text() {
|
||||
test!(r#"'` SpliceWithLeadingWhitespace`'"#,
|
||||
(TextLiteral #((Splice (Ident SpliceWithLeadingWhitespace)))));
|
||||
test!(r#"'String with \n escape'"#,
|
||||
(TextLiteral #((Section "String with ") (Escape '\n') (Section " escape"))));
|
||||
test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
|
||||
test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
|
||||
test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
|
||||
test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape '\n') (Section "escape"))));
|
||||
(TextLiteral #((Section "String with ") (Escape 0x0A) (Section " escape"))));
|
||||
test!(r#"'\x0Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
|
||||
test!(r#"'\u000Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
|
||||
test!(r#"'\u{0000A}escape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
|
||||
test!(r#"'\U0000000Aescape'"#, (TextLiteral #((Escape 0x0A) (Section "escape"))));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -1149,7 +1149,7 @@ fn interpolated_literals_in_multiline_text() {
|
||||
let expected = block![
|
||||
(TextLiteral
|
||||
#((Section "text with a ") (Splice (Ident splice)) (Newline)
|
||||
(Section "and some ") (Escape '\n') (Section "escapes") (Escape '\'')))];
|
||||
(Section "and some ") (Escape 0x0A) (Section "escapes") (Escape 0x27)))];
|
||||
test(code, expected);
|
||||
}
|
||||
|
||||
|
@ -12,6 +12,8 @@
|
||||
use enso_doc_parser::*;
|
||||
use enso_parser::prelude::*;
|
||||
|
||||
use enso_parser::syntax::tree::DocComment;
|
||||
use enso_parser::syntax::tree::TextElement;
|
||||
|
||||
|
||||
// ====================================
|
||||
@ -59,7 +61,37 @@ fn extract_docs(_filename: &str, mut code: &str) -> Vec<String> {
|
||||
}
|
||||
_ => {}
|
||||
});
|
||||
docs.take().into_iter().map(|node| node.content()).collect()
|
||||
docs.take().iter().map(content).collect()
|
||||
}
|
||||
|
||||
/// Return the contents of the comment, with leading whitespace, the `##` token, and following
|
||||
/// empty lines removed; newlines will be normalized.
|
||||
pub fn content(node: &DocComment) -> String {
|
||||
let mut buf = String::new();
|
||||
for element in &node.elements {
|
||||
match element {
|
||||
TextElement::Section { text } => buf.push_str(&text.code.repr),
|
||||
TextElement::Newline { .. } => buf.push('\n'),
|
||||
TextElement::Escape {
|
||||
token:
|
||||
token @ enso_parser::syntax::token::TextEscape {
|
||||
variant: enso_parser::syntax::token::variant::TextEscape { value },
|
||||
..
|
||||
},
|
||||
} => {
|
||||
if let Some(c) = value.to_char() {
|
||||
buf.push(c);
|
||||
} else {
|
||||
// Invalid escape character, or unpaired surrogate that can't be represented in
|
||||
// a Rust string.
|
||||
buf.push_str(**token.code)
|
||||
}
|
||||
}
|
||||
// Unreachable.
|
||||
TextElement::Splice { .. } => continue,
|
||||
}
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
/// Lex the given documentation, and return the sequence of tokens.
|
||||
|
@ -10,6 +10,7 @@ use crate::syntax::*;
|
||||
|
||||
use crate::source::code::Length;
|
||||
use crate::source::code::Location;
|
||||
use crate::syntax::token::Codepoint;
|
||||
|
||||
use std::str;
|
||||
|
||||
@ -1210,7 +1211,7 @@ impl<'s> Lexer<'s> {
|
||||
let token = self.make_token(
|
||||
backslash_start,
|
||||
sequence_end.clone(),
|
||||
token::Variant::text_escape(value.and_then(char::from_u32)),
|
||||
token::Variant::text_escape(value.map(Codepoint::from_u32).unwrap_or_default()),
|
||||
);
|
||||
self.output.push(token);
|
||||
sequence_end
|
||||
@ -1236,7 +1237,7 @@ impl<'s> Lexer<'s> {
|
||||
let token = self.make_token(
|
||||
backslash_start,
|
||||
escape_end.clone(),
|
||||
token::Variant::text_escape(value),
|
||||
token::Variant::text_escape(value.map(Codepoint::from_char).unwrap_or_default()),
|
||||
);
|
||||
self.output.push(token);
|
||||
escape_end
|
||||
@ -1906,6 +1907,43 @@ mod tests {
|
||||
lex_and_validate_spans(&["## a", "", " b"].join("\n"));
|
||||
}
|
||||
|
||||
fn text_escape_(code: &str, codepoint: Option<u32>) -> Token {
|
||||
let codepoint = match codepoint {
|
||||
Some(value) => {
|
||||
let codepoint = Codepoint::from_u32(value);
|
||||
assert!(!codepoint.is_none());
|
||||
codepoint
|
||||
}
|
||||
None => Codepoint::none(),
|
||||
};
|
||||
text_escape(test_code(""), test_code(code), codepoint).into()
|
||||
}
|
||||
|
||||
#[test]
fn test_text_escapes() {
    // Valid Unicode codepoints.
    let valid_unicode: Vec<Token> = vec![
        text_start(test_code(""), test_code("'")).into(),
        text_escape_("\\0", Some(0)),
        text_escape_("\\u0", Some(0)),
        text_escape_("\\u{10}", Some(0x10)),
        text_escape_("\\u{10FFFF}", Some(0x10_FFFF)),
        text_end(test_code(""), test_code("'")).into(),
    ];
    test_lexer("'\\0\\u0\\u{10}\\u{10FFFF}'", valid_unicode);

    // Invalid Unicode, but allowed in Enso strings: an unpaired surrogate keeps its value.
    let unpaired_surrogate: Vec<Token> = vec![
        text_start(test_code(""), test_code("'")).into(),
        text_escape_("\\uD800", Some(0xD800)),
        text_end(test_code(""), test_code("'")).into(),
    ];
    test_lexer("'\\uD800'", unpaired_surrogate);

    // Invalid and disallowed: a value past the end of the Unicode range has no codepoint.
    let out_of_range: Vec<Token> = vec![
        text_start(test_code(""), test_code("'")).into(),
        text_escape_("\\u{110000}", None),
        text_end(test_code(""), test_code("'")).into(),
    ];
    test_lexer("'\\u{110000}'", out_of_range);
}
|
||||
|
||||
#[test]
|
||||
fn test_indented_doc_after_blank_line() {
|
||||
let code = ["type Redshift_Error_Mapper", "", " A"].join("\n");
|
||||
|
@ -58,29 +58,6 @@ where D: serde::Deserializer<'de> {
|
||||
|
||||
|
||||
|
||||
// ==============
|
||||
// === Tokens ===
|
||||
// ==============
|
||||
|
||||
pub(crate) fn serialize_optional_char<S>(c: &Option<char>, s: S) -> Result<S::Ok, S::Error>
|
||||
where S: serde::Serializer {
|
||||
let value = c.map(|c| c as u32).unwrap_or(0xFFFF_FFFF);
|
||||
s.serialize_u32(value)
|
||||
}
|
||||
|
||||
pub(crate) fn deserialize_optional_char<'c, 'de, D>(
|
||||
deserializer: D,
|
||||
) -> Result<Option<char>, D::Error>
|
||||
where D: serde::Deserializer<'de> {
|
||||
let value = deserializer.deserialize_u32(DeserializeU32)?;
|
||||
Ok(match value {
|
||||
0xFFFF_FFFF => None,
|
||||
x => Some(char::try_from(x).unwrap()),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
|
||||
// =============
|
||||
// === Error ===
|
||||
// =============
|
||||
|
@ -282,10 +282,7 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! { $($arg
|
||||
TextEnd,
|
||||
TextSection,
|
||||
TextEscape {
|
||||
#[serde(serialize_with = "crate::serialization::serialize_optional_char")]
|
||||
#[serde(deserialize_with = "crate::serialization::deserialize_optional_char")]
|
||||
#[reflect(as = "char")]
|
||||
pub value: Option<char>,
|
||||
pub value: Codepoint,
|
||||
},
|
||||
TextInitialNewline,
|
||||
TextNewline,
|
||||
@ -594,6 +591,67 @@ pub enum Base {
|
||||
}
|
||||
|
||||
|
||||
// === Text literals ===
|
||||
|
||||
/// Represents any of:
|
||||
/// - A valid Unicode codepoint (i.e. a `char`).
|
||||
/// - A value that does not constitute a legal codepoint according to the Unicode standard, but is
|
||||
/// allowed in Enso strings and can be included in Enso text as an escape sequence. This includes
|
||||
/// unpaired surrogates.
|
||||
/// - A value representing the absence of a valid Unicode codepoint; this is included in the
|
||||
/// `Codepoint` type rather than using `Option<Codepoint>` in order to simplify defining efficient
|
||||
/// serialization for optional codepoints.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Serialize, Reflect, Deserialize, Debug)]
|
||||
#[reflect(transparent)]
|
||||
pub struct Codepoint(#[reflect(as = "char")] u32);
|
||||
|
||||
impl Default for Codepoint {
|
||||
fn default() -> Self {
|
||||
Codepoint::none()
|
||||
}
|
||||
}
|
||||
|
||||
impl Codepoint {
|
||||
/// Cast a `char` to a `Codepoint`; this is a widening conversion and will never result in
|
||||
/// `Codepoint::none`.
|
||||
pub const fn from_char(value: char) -> Self {
|
||||
Codepoint(value as u32)
|
||||
}
|
||||
|
||||
fn is_allowed_invalid_codepoint(value: u32) -> bool {
|
||||
let unpaired_surrogates = 0xD800..=0xDFFF;
|
||||
unpaired_surrogates.contains(&value)
|
||||
}
|
||||
|
||||
/// Create either a valid `Codepoint` or `Codepoint::none` from the given value.
|
||||
pub fn from_u32(value: u32) -> Self {
|
||||
if let Some(c) = char::from_u32(value) {
|
||||
Self::from_char(c)
|
||||
} else if Self::is_allowed_invalid_codepoint(value) {
|
||||
Codepoint(value)
|
||||
} else {
|
||||
Codepoint::none()
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the representation of an unspecified or out-of-range codepoint.
|
||||
pub const fn none() -> Self {
|
||||
Codepoint(0xFFFF_FFFF)
|
||||
}
|
||||
|
||||
/// Return true if this value is `Codepoint::none`.
|
||||
pub const fn is_none(self) -> bool {
|
||||
self.0 == Self::none().0
|
||||
}
|
||||
|
||||
/// Return the value as a `char`, if it is a valid unicode Codepoint (and not
|
||||
/// `Codepoint::none` or an unpaired surrogate).
|
||||
pub const fn to_char(self) -> Option<char> {
|
||||
char::from_u32(self.0)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// === Macro-based implementation ===
|
||||
|
||||
macro_rules! generate_token_aliases {
|
||||
|
@ -497,28 +497,6 @@ pub struct DocComment<'s> {
|
||||
pub newlines: Vec<token::Newline<'s>>,
|
||||
}
|
||||
|
||||
impl<'s> DocComment<'s> {
|
||||
/// Return the contents of the comment, with leading whitespace, the `##` token, and following
|
||||
/// empty lines removed; newlines will be normalized.
|
||||
pub fn content(&self) -> String {
|
||||
let mut buf = String::new();
|
||||
for element in &self.elements {
|
||||
match element {
|
||||
TextElement::Section { text } => buf.push_str(&text.code.repr),
|
||||
TextElement::Newline { .. } => buf.push('\n'),
|
||||
TextElement::Escape { token } if let Some(c) = token.value => {
|
||||
buf.push(c);
|
||||
}
|
||||
// Invalid escape character, ignore it.
|
||||
TextElement::Escape { .. } => (),
|
||||
// Unreachable.
|
||||
TextElement::Splice { .. } => continue,
|
||||
}
|
||||
}
|
||||
buf
|
||||
}
|
||||
}
|
||||
|
||||
impl<'s> span::Builder<'s> for DocComment<'s> {
|
||||
fn add_to_span(&mut self, span: Span<'s>) -> Span<'s> {
|
||||
span.add(&mut self.open).add(&mut self.elements).add(&mut self.newlines)
|
||||
|
Loading…
Reference in New Issue
Block a user