From 3dbf4c62b00c2deef36e283c33dcbd214d073313 Mon Sep 17 00:00:00 2001 From: Linus Groh Date: Wed, 28 Oct 2020 09:29:11 +0000 Subject: [PATCH] LibJS: Use GenericLexer for Token::string_value() This is, and I can't stress this enough, a lot better than all the manual bounds checking and indexing that was going on before. Also fixes a small bug where "\u{}" wouldn't get rejected as invalid unicode escape sequence. --- Libraries/LibJS/Tests/string-escapes.js | 17 ++ Libraries/LibJS/Token.cpp | 248 +++++++++--------------- 2 files changed, 114 insertions(+), 151 deletions(-) diff --git a/Libraries/LibJS/Tests/string-escapes.js b/Libraries/LibJS/Tests/string-escapes.js index d8c75e9055f..6d3072effce 100644 --- a/Libraries/LibJS/Tests/string-escapes.js +++ b/Libraries/LibJS/Tests/string-escapes.js @@ -4,6 +4,11 @@ test("hex escapes", () => { expect(`\x55`).toBe("U"); expect(`\X55`).toBe("X55"); expect("\xff").toBe(String.fromCharCode(0xff)); + expect("'\\x'").not.toEval(); + expect("'\\x1'").not.toEval(); + expect("'\\xz'").not.toEval(); + expect("'\\xzz'").not.toEval(); + expect("'\\x🐞'").not.toEval(); }); test("unicode escapes", () => { @@ -12,6 +17,18 @@ test("unicode escapes", () => { expect("\u{1f41e}").toBe("🐞"); expect(`\u{1f41e}`).toBe("🐞"); expect("\u00ff").toBe(String.fromCharCode(0xff)); + expect("'\\u'").not.toEval(); + expect("'\\u1'").not.toEval(); + expect("'\\uf'").not.toEval(); + expect("'\\u123'").not.toEval(); + expect("'\\u123z'").not.toEval(); + expect("'\\uz'").not.toEval(); + expect("'\\uzz'").not.toEval(); + expect("'\\uzzzz'").not.toEval(); + expect("'\\u{'").not.toEval(); + expect("'\\u{}'").not.toEval(); + expect("'\\u{z}'").not.toEval(); + expect("'\\u🐞'").not.toEval(); }); describe("octal escapes", () => { diff --git a/Libraries/LibJS/Token.cpp b/Libraries/LibJS/Token.cpp index 2826f7fbf67..638d4b7931f 100644 --- a/Libraries/LibJS/Token.cpp +++ b/Libraries/LibJS/Token.cpp @@ -27,8 +27,8 @@ #include "Token.h" #include +#include #include -#include #include namespace JS { @@ -104,20 +104,9 @@ static u32 hex2int(char x) String Token::string_value(StringValueStatus& status) const { ASSERT(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString); + auto is_template = type() == TokenType::TemplateLiteralString; - auto offset = is_template ? 0 : 1; - - size_t i; - - auto lookahead = [&](T fn, size_t distance = 1) -> bool { - if (i + distance >= m_value.length() - offset) - return false; - return fn(m_value[i + distance]); - }; - - auto is_octal_digit = [](char c) { - return c >= '0' && c <= '7'; - }; + GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2)); auto encoding_failure = [&status](StringValueStatus parse_status) -> String { status = parse_status; @@ -125,144 +114,101 @@ String Token::string_value(StringValueStatus& status) const }; StringBuilder builder; - for (i = offset; i < m_value.length() - offset; ++i) { - if (m_value[i] == '\\' && i + 1 < m_value.length() - offset) { - i++; - switch (m_value[i]) { - case 'b': - builder.append('\b'); - break; - case 'f': - builder.append('\f'); - break; - case 'n': - builder.append('\n'); - break; - case 'r': - builder.append('\r'); - break; - case 't': - builder.append('\t'); - break; - case 'v': - builder.append('\v'); - break; - case '\'': - builder.append('\''); - break; - case '"': - builder.append('"'); - break; - case '\\': - builder.append('\\'); - break; - case '\n': - break; - case '\r': - break; - case 'x': { - if (i + 2 >= m_value.length() - offset) - return encoding_failure(StringValueStatus::MalformedHexEscape); - - auto digit1 = m_value[++i]; - auto digit2 = m_value[++i]; - if (!isxdigit(digit1) || !isxdigit(digit2)) - return encoding_failure(StringValueStatus::MalformedHexEscape); - builder.append_code_point(hex2int(digit1) * 16 + hex2int(digit2)); - break; - } - case 'u': { - if (i + 1 >= m_value.length() - offset) - return encoding_failure(StringValueStatus::MalformedUnicodeEscape); - u32 code_point = m_value[++i]; - - if (code_point == '{') { - code_point = 0; - while (true) { - if (i + 1 >= m_value.length() - offset) - return encoding_failure(StringValueStatus::MalformedUnicodeEscape); - - auto ch = m_value[++i]; - if (ch == '}') - break; - if (!isxdigit(ch)) - return encoding_failure(StringValueStatus::MalformedUnicodeEscape); - - auto new_code_point = (code_point << 4u) | hex2int(ch); - if (new_code_point < code_point) - return encoding_failure(StringValueStatus::UnicodeEscapeOverflow); - code_point = new_code_point; - } - } else { - if (i + 3 >= m_value.length() - offset || !isxdigit(code_point)) - return encoding_failure(StringValueStatus::MalformedUnicodeEscape); - - code_point = hex2int(code_point); - for (int j = 0; j < 3; ++j) { - auto ch = m_value[++i]; - if (!isxdigit(ch)) - return encoding_failure(StringValueStatus::MalformedUnicodeEscape); - code_point = (code_point << 4u) | hex2int(ch); - } - } - - builder.append_code_point(code_point); - break; - } - default: - if (i + 2 < m_value.length() - offset) { - auto three_chars_view = m_value.substring_view(i, 3); - if (three_chars_view == LINE_SEPARATOR || three_chars_view == PARAGRAPH_SEPARATOR) { - // line continuation with LS or PS - i += 2; - break; - } - } - if (is_template && (m_value[i] == '$' || m_value[i] == '`')) { - builder.append(m_value[i]); - break; - } - if (m_value[i] == '0' && !lookahead(isdigit)) { - builder.append((char)0); - break; - } - - // In non-strict mode LegacyOctalEscapeSequence is allowed in strings: - // https://tc39.es/ecma262/#sec-additional-syntax-string-literals - String octal_str; - - // OctalDigit [lookahead ∉ OctalDigit] - if (is_octal_digit(m_value[i]) && !lookahead(is_octal_digit)) { - status = StringValueStatus::LegacyOctalEscapeSequence; - octal_str = String(&m_value[i], 1); - } - // ZeroToThree OctalDigit [lookahead ∉ OctalDigit] - else if (m_value[i] >= '0' && m_value[i] <= '3' && lookahead(is_octal_digit) && !lookahead(is_octal_digit, 2)) { - status = StringValueStatus::LegacyOctalEscapeSequence; - octal_str = String(m_value.substring_view(i, 2)); - i++; - } - // FourToSeven OctalDigit - else if (m_value[i] >= '4' && m_value[i] <= '7' && lookahead(is_octal_digit)) { - status = StringValueStatus::LegacyOctalEscapeSequence; - octal_str = String(m_value.substring_view(i, 2)); - i++; - } - // ZeroToThree OctalDigit OctalDigit - else if (m_value[i] >= '0' && m_value[i] <= '3' && lookahead(is_octal_digit) && lookahead(is_octal_digit, 2)) { - status = StringValueStatus::LegacyOctalEscapeSequence; - octal_str = String(m_value.substring_view(i, 3)); - i += 2; - } - - if (status == StringValueStatus::LegacyOctalEscapeSequence) - builder.append_code_point(strtoul(octal_str.characters(), nullptr, 8)); - else - builder.append(m_value[i]); - } - } else { - builder.append(m_value[i]); + while (!lexer.is_eof()) { + // No escape, consume one char and continue + if (!lexer.next_is('\\')) { + builder.append(lexer.consume()); + continue; } + + lexer.ignore(); + ASSERT(!lexer.is_eof()); + + // Line continuation + if (lexer.next_is('\n') || lexer.next_is('\r')) { + lexer.ignore(); + continue; + } + // Line continuation + if (lexer.next_is(LINE_SEPARATOR) || lexer.next_is(PARAGRAPH_SEPARATOR)) { + lexer.ignore(3); + continue; + } + // Null-byte escape + if (lexer.next_is('0') && !isdigit(lexer.peek(1))) { + lexer.ignore(); + builder.append('\0'); + continue; + } + // Hex escape + if (lexer.next_is('x')) { + lexer.ignore(); + if (!isxdigit(lexer.peek()) || !isxdigit(lexer.peek(1))) + return encoding_failure(StringValueStatus::MalformedHexEscape); + auto code_point = hex2int(lexer.consume()) * 16 + hex2int(lexer.consume()); + ASSERT(code_point <= 255); + builder.append_code_point(code_point); + continue; + } + // Unicode escape + if (lexer.next_is('u')) { + lexer.ignore(); + u32 code_point = 0; + if (lexer.next_is('{')) { + lexer.ignore(); + while (true) { + if (!lexer.next_is(isxdigit)) + return encoding_failure(StringValueStatus::MalformedUnicodeEscape); + auto new_code_point = (code_point << 4u) | hex2int(lexer.consume()); + if (new_code_point < code_point) + return encoding_failure(StringValueStatus::UnicodeEscapeOverflow); + code_point = new_code_point; + if (lexer.next_is('}')) + break; + } + lexer.ignore(); + } else { + for (int j = 0; j < 4; ++j) { + if (!lexer.next_is(isxdigit)) + return encoding_failure(StringValueStatus::MalformedUnicodeEscape); + code_point = (code_point << 4u) | hex2int(lexer.consume()); + } + } + builder.append_code_point(code_point); + continue; + } + + // In non-strict mode LegacyOctalEscapeSequence is allowed in strings: + // https://tc39.es/ecma262/#sec-additional-syntax-string-literals + String octal_str; + + auto is_octal_digit = [](char ch) { return ch >= '0' && ch <= '7'; }; + auto is_zero_to_three = [](char ch) { return ch >= '0' && ch <= '3'; }; + auto is_four_to_seven = [](char ch) { return ch >= '4' && ch <= '7'; }; + + // OctalDigit [lookahead ∉ OctalDigit] + if (is_octal_digit(lexer.peek()) && !is_octal_digit(lexer.peek(1))) + octal_str = lexer.consume(1); + // ZeroToThree OctalDigit [lookahead ∉ OctalDigit] + else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && !is_octal_digit(lexer.peek(2))) + octal_str = lexer.consume(2); + // FourToSeven OctalDigit + else if (is_four_to_seven(lexer.peek()) && is_octal_digit(lexer.peek(1))) + octal_str = lexer.consume(2); + // ZeroToThree OctalDigit OctalDigit + else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && is_octal_digit(lexer.peek(2))) + octal_str = lexer.consume(3); + + if (!octal_str.is_null()) { + status = StringValueStatus::LegacyOctalEscapeSequence; + auto code_point = strtoul(octal_str.characters(), nullptr, 8); + ASSERT(code_point <= 255); + builder.append_code_point(code_point); + continue; + } + + lexer.retreat(); + builder.append(lexer.consume_escaped_character('\\', "b\bf\fn\nr\rt\tv\v")); } return builder.to_string(); }