Parser: Newline normalization in text literals (#3903)

- Newlines in text literals are now normalized to `\n` when producing the IR representation.
- Re-enabled tests that were dependent on the old behavior.
This commit is contained in:
Kaz Wesley 2022-11-27 01:40:44 -08:00 committed by GitHub
parent 1b52ae239f
commit 336bbf505c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 84 additions and 78 deletions

View File

@ -1006,6 +1006,7 @@ final class TreeToIr {
sb.appendCodePoint(val);
}
}
case TextElement.Newline n -> sb.append('\n');
default -> throw new UnhandledEntity(t, "buildTextConstant");
}
}

View File

@ -125,9 +125,9 @@ fn doc_comments() {
#[rustfmt::skip]
test(&lines.join("\n"), block![
(Documented
(#((Section " The Identity Function") (Section "\n")
(Section "\n")
(Section "Arguments:") (Section "\n")
(#((Section " The Identity Function") (Newline)
(Newline)
(Section "Arguments:") (Newline)
(Section "- x: value to do nothing to"))
#(()))
(Function (Ident id) #((() (Ident x) () ())) "=" (Ident x)))]);
@ -919,10 +919,10 @@ x"#;
#[rustfmt::skip]
let expected = block![
(TextLiteral
#((Section "part of the string") (Section "\n")
(Section " 3-spaces indented line, part of the Text Block") (Section "\n")
(Section "this does not end the string -> '''") (Section "\n")
(Section "\n")
#((Section "part of the string") (Newline)
(Section " 3-spaces indented line, part of the Text Block") (Newline)
(Section "this does not end the string -> '''") (Newline)
(Newline)
(Section "`also` part of the string")))
()
(Ident x)
@ -959,7 +959,7 @@ x"#;
(Assignment (Ident foo) "=" (App (Ident bar) (TextLiteral #((Section "baz"))))));
test!("'''\n \\t'", (TextLiteral #((Escape '\t') (Section "'"))));
test!("'''\n x\n \\t'",
(TextLiteral #((Section "x") (Section "\n") (Escape '\t') (Section "'"))));
(TextLiteral #((Section "x") (Newline) (Escape '\t') (Section "'"))));
}
#[test]
@ -1003,7 +1003,7 @@ fn interpolated_literals_in_multiline_text() {
#[rustfmt::skip]
let expected = block![
(TextLiteral
#((Section "text with a ") (Splice (Ident splice)) (Section "\n")
#((Section "text with a ") (Splice (Ident splice)) (Newline)
(Section "and some ") (Escape '\n') (Section "escapes") (Escape '\'')))];
test(code, expected);
}

View File

@ -1008,7 +1008,7 @@ impl<'s> Lexer<'s> {
}
let newlines = newlines
.into_iter()
.map(|token| token.with_variant(token::Variant::text_section()));
.map(|token| token.with_variant(token::Variant::text_newline()));
self.output.extend(newlines);
continue;
}

View File

@ -480,7 +480,7 @@ impl<'s> Resolver<'s> {
syntax::Item::Tree(tree) => body.push_str(&tree.code()),
}
}
let header0 = syntax::Tree::from(header0).with_error("Invalid macro invocation.");
let header0 = syntax::tree::to_ast(header0).with_error("Invalid macro invocation.");
(header0, items)
}
}

View File

@ -50,15 +50,6 @@ impl<'s> Item<'s> {
}
}
/// Convert this item to a [`Tree`].
pub fn to_ast(self) -> Tree<'s> {
match self {
Item::Token(token) => token.into(),
Item::Tree(ast) => ast,
Item::Block(items) => build_block(items),
}
}
/// If this item is an [`Item::Tree`], apply the given function to the contained [`Tree`] and
/// return the result.
pub fn map_tree<'t: 's, F>(self, f: F) -> Self
@ -90,7 +81,7 @@ impl<'s> TryAsRef<Item<'s>> for Item<'s> {
/// Given a sequence of [`Line`]s belonging to one block, create an AST block node, of a type
/// determined by the syntax of the lines in the block.
fn build_block<'s>(lines: impl IntoIterator<Item = Line<'s>>) -> Tree<'s> {
pub fn build_block<'s>(lines: impl IntoIterator<Item = Line<'s>>) -> Tree<'s> {
let mut block_builder = tree::block::Builder::new();
let mut precedence = operator::Precedence::new();
for Line { newline, items } in lines {

View File

@ -65,9 +65,10 @@ impl<'s> Precedence<'s> {
code,
}) => self.nospace_builder.operator(Token(left_offset, code, opr)),
syntax::Item::Token(token) =>
self.nospace_builder.operand(syntax::Tree::from(token).into()),
self.nospace_builder.operand(syntax::tree::to_ast(token).into()),
syntax::Item::Tree(tree) => self.nospace_builder.operand(tree.into()),
syntax::Item::Block(_) => self.nospace_builder.operand(item.to_ast().into()),
syntax::Item::Block(lines) =>
self.nospace_builder.operand(syntax::item::build_block(lines).into()),
}
}

View File

@ -291,6 +291,7 @@ macro_rules! with_token_definition { ($f:ident ($($args:tt)*)) => { $f! { $($arg
pub value: Option<char>,
},
TextInitialNewline,
TextNewline,
Invalid,
}
}}}

View File

@ -505,6 +505,12 @@ pub enum TextElement<'s> {
/// The escape sequence.
token: token::TextEscape<'s>,
},
/// A logical newline.
Newline {
/// The newline token. The semantics of a logical newline are independent of the specific
/// characters in the input, which are generally platform-dependent.
newline: token::Newline<'s>,
},
/// An interpolated section within a text literal.
Splice {
/// The opening ` character.
@ -523,6 +529,7 @@ impl<'s> span::Builder<'s> for TextElement<'s> {
TextElement::Escape { token } => span.add(token),
TextElement::Splice { open, expression, close } =>
span.add(open).add(expression).add(close),
TextElement::Newline { newline } => span.add(newline),
}
}
}
@ -856,6 +863,7 @@ pub fn join_text_literals<'s>(
Some(TextElement::Section { text }) => text.left_offset += rhs_span.left_offset,
Some(TextElement::Escape { token }) => token.left_offset += rhs_span.left_offset,
Some(TextElement::Splice { open, .. }) => open.left_offset += rhs_span.left_offset,
Some(TextElement::Newline { newline }) => newline.left_offset += rhs_span.left_offset,
None => (),
}
if let Some(newline) = rhs.newline.take() {
@ -939,52 +947,56 @@ pub fn apply_unary_operator<'s>(opr: token::Operator<'s>, rhs: Option<Tree<'s>>)
}
}
impl<'s> From<Token<'s>> for Tree<'s> {
fn from(token: Token<'s>) -> Self {
match token.variant {
token::Variant::Ident(ident) => token.with_variant(ident).into(),
token::Variant::Digits(number) =>
Tree::number(None, Some(token.with_variant(number)), None),
token::Variant::NumberBase(base) =>
Tree::number(Some(token.with_variant(base)), None, None),
token::Variant::TextStart(open) =>
Tree::text_literal(Some(token.with_variant(open)), default(), default(), default(), default()),
token::Variant::TextSection(section) => {
let section = TextElement::Section { text: token.with_variant(section) };
Tree::text_literal(default(), default(), vec![section], default(), default())
}
token::Variant::TextEscape(escape) => {
let token = token.with_variant(escape);
let section = TextElement::Escape { token };
Tree::text_literal(default(), default(), vec![section], default(), default())
}
token::Variant::TextEnd(_) if token.code.is_empty() =>
Tree::text_literal(default(), default(), default(), default(), true),
token::Variant::TextEnd(close) =>
Tree::text_literal(default(), default(), default(), Some(token.with_variant(close)), true),
token::Variant::TextInitialNewline(_) =>
Tree::text_literal(default(), Some(token::newline(token.left_offset, token.code)), default(), default(), default()),
token::Variant::Wildcard(wildcard) => Tree::wildcard(token.with_variant(wildcard), default()),
token::Variant::AutoScope(t) => Tree::auto_scope(token.with_variant(t)),
token::Variant::OpenSymbol(s) =>
Tree::group(Some(token.with_variant(s)), default(), default()).with_error("Unmatched delimiter"),
token::Variant::CloseSymbol(s) =>
Tree::group(default(), default(), Some(token.with_variant(s))).with_error("Unmatched delimiter"),
// These should be unreachable: They are handled when assembling items into blocks,
// before parsing proper.
token::Variant::Newline(_)
| token::Variant::BlockStart(_)
| token::Variant::BlockEnd(_)
// This should be unreachable: `resolve_operator_precedence` doesn't call `to_ast` for
// operators.
| token::Variant::Operator(_)
// Map an error case in the lexer to an error in the AST.
| token::Variant::Invalid(_) => {
let message = format!("Unexpected token: {token:?}");
let ident = token::variant::Ident(false, 0, false, false, false);
let value = Tree::ident(token.with_variant(ident));
Tree::with_error(value, message)
}
/// Create an AST node for a token.
///
/// Each token variant is mapped to a single-token [`Tree`]:
/// - identifiers are converted via `Into`;
/// - `Digits` and `NumberBase` each produce a `Tree::number` with only their own slot filled;
/// - the text-related tokens each produce a `Tree::text_literal` with exactly one part
///   populated (opening quote, initial newline, one element, or closing quote);
/// - wildcards and auto-scope tokens produce their dedicated nodes;
/// - a lone open or close symbol produces a group carrying an "Unmatched delimiter" error;
/// - any variant that is not expected here produces an identifier node wrapped in an
///   "Unexpected token" error.
///
/// NOTE(review): the partial text literals are presumably merged later (see
/// `join_text_literals`) -- confirm against the callers.
pub fn to_ast(token: Token) -> Tree {
match token.variant {
token::Variant::Ident(ident) => token.with_variant(ident).into(),
token::Variant::Digits(number) =>
Tree::number(None, Some(token.with_variant(number)), None),
token::Variant::NumberBase(base) =>
Tree::number(Some(token.with_variant(base)), None, None),
token::Variant::TextStart(open) =>
Tree::text_literal(Some(token.with_variant(open)), default(), default(), default(), default()),
token::Variant::TextSection(section) => {
let section = TextElement::Section { text: token.with_variant(section) };
Tree::text_literal(default(), default(), vec![section], default(), default())
}
token::Variant::TextEscape(escape) => {
let token = token.with_variant(escape);
let section = TextElement::Escape { token };
Tree::text_literal(default(), default(), vec![section], default(), default())
}
// A `TextEnd` token with empty code yields a literal without a closing-quote token; this
// guarded arm must stay ahead of the general `TextEnd` arm below. Both pass `true` as the
// final argument.
token::Variant::TextEnd(_) if token.code.is_empty() =>
Tree::text_literal(default(), default(), default(), default(), true),
token::Variant::TextEnd(close) =>
Tree::text_literal(default(), default(), default(), Some(token.with_variant(close)), true),
token::Variant::TextInitialNewline(_) =>
Tree::text_literal(default(), Some(token::newline(token.left_offset, token.code)), default(), default(), default()),
// A newline inside a text literal becomes a dedicated `TextElement::Newline` element,
// built from the token's offset and code, rather than an ordinary text section.
token::Variant::TextNewline(_) => {
let newline = token::newline(token.left_offset, token.code);
let newline = TextElement::Newline { newline };
Tree::text_literal(default(), default(), vec![newline], default(), default())
}
token::Variant::Wildcard(wildcard) => Tree::wildcard(token.with_variant(wildcard), default()),
token::Variant::AutoScope(t) => Tree::auto_scope(token.with_variant(t)),
token::Variant::OpenSymbol(s) =>
Tree::group(Some(token.with_variant(s)), default(), default()).with_error("Unmatched delimiter"),
token::Variant::CloseSymbol(s) =>
Tree::group(default(), default(), Some(token.with_variant(s))).with_error("Unmatched delimiter"),
// These should be unreachable: They are handled when assembling items into blocks,
// before parsing proper.
token::Variant::Newline(_)
| token::Variant::BlockStart(_)
| token::Variant::BlockEnd(_)
// This should be unreachable: `resolve_operator_precedence` doesn't call `to_ast` for
// operators.
| token::Variant::Operator(_)
// Map an error case in the lexer to an error in the AST.
| token::Variant::Invalid(_) => {
// The message is formatted before `token` is consumed by `with_variant` below.
let message = format!("Unexpected token: {token:?}");
// Placeholder identifier variant used as the value of the error node.
// NOTE(review): the meaning of the individual `Ident` fields is not visible here.
let ident = token::variant::Ident(false, 0, false, false, false);
let value = Tree::ident(token.with_variant(ident));
Tree::with_error(value, message)
}
}
}

View File

@ -246,8 +246,8 @@ spec =
Test.specify "should escape special characters when debug-printing text" <|
text_1 = '''
foo
bar\tbaz
(text_1.replace '\r' "").to_text.should_equal "'foo\nbar\tbaz'"
bar\r\tbaz
text_1.to_text.should_equal "'foo\nbar\r\tbaz'"
text_2 = '\n\t\a\b\f\r\v\e\''
text_2.to_text.should_equal "'\n\t\a\b\f\r\v\e\''"
@ -830,8 +830,8 @@ spec =
long_text = """
Hello from a long text. EOL
SOL Hmm...
(long_text.replace '\r' "") . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
(long_text.replace '\r' "") . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false
long_text . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
long_text . contains "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false
Test.specify "should check for starts_with using Unicode normalization" <|
"Hello".starts_with "He" . should_be_true
@ -917,8 +917,8 @@ spec =
long_text = """
EOL
SOL Hmm...
(long_text.replace '\r' "") . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
(long_text.replace '\r' "") . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false
long_text . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=True) . should_be_true
long_text . starts_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive dot_matches_newline=False) . should_be_false
"aaazzz" . starts_with "a|b" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive) . should_be_true
"bbbzzz" . starts_with "a|b" (Regex_Matcher.Regex_Matcher_Data case_sensitivity=Case_Sensitivity.Sensitive) . should_be_true
@ -997,8 +997,8 @@ spec =
long_text = """
Hnnnn EOL
SOL
(long_text.replace '\r' "") . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=True) . should_be_true
(long_text.replace '\r' "") . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=False) . should_be_false
long_text . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=True) . should_be_true
long_text . ends_with "EOL.SOL" (Regex_Matcher.Regex_Matcher_Data dot_matches_newline=False) . should_be_false
"zzzaaa" . ends_with "a|b" Regex_Matcher.Regex_Matcher_Data . should_be_true
"zzzbbb" . ends_with "a|b" Regex_Matcher.Regex_Matcher_Data . should_be_true
@ -1604,7 +1604,7 @@ spec =
text = """
Foo
bar
r4 = text.replace "(\n|\r)" "" matcher=(Regex_Matcher.Regex_Matcher_Data multiline=True)
r4 = text.replace '\n' "" matcher=(Regex_Matcher.Regex_Matcher_Data multiline=True)
r4 . should_equal "Foobar"
r5 = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" matcher=(Regex_Matcher.Regex_Matcher_Data comments=True)