LibRegex: Refactor parsing 'CharacterEscape' out of 'AtomEscape'

The ECMA262 spec has this as a separate production, and we need it to be
split up for a future commit.
This commit is contained in:
Ali Mohammad Pur 2022-07-20 23:19:43 +04:30 committed by Linus Groh
parent b908f9f6ef
commit 7734914909
Notes: sideshowbarker 2024-07-17 08:44:47 +09:00
2 changed files with 136 additions and 123 deletions

View File

@ -1427,6 +1427,137 @@ bool ECMA262Parser::parse_invalid_braced_quantifier()
}
bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& compares, size_t& match_length_minimum, ParseFlags flags)
{
// CharacterEscape > ControlEscape
if (try_skip("f"sv)) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\f' });
return true;
}
if (try_skip("n"sv)) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\n' });
return true;
}
if (try_skip("r"sv)) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\r' });
return true;
}
if (try_skip("t"sv)) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\t' });
return true;
}
if (try_skip("v"sv)) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\v' });
return true;
}
// CharacterEscape > ControlLetter
if (try_skip("c"sv)) {
for (auto c : s_alphabetic_characters) {
if (try_skip({ &c, 1 })) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)(c % 32) });
return true;
}
}
if (flags.unicode) {
set_error(Error::InvalidPattern);
return false;
}
if (m_should_use_browser_extended_grammar) {
back(1 + (done() ? 0 : 1));
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\\' });
match_length_minimum += 1;
return true;
}
// Allow '\c' in non-unicode mode, just matches 'c'.
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'c' });
return true;
}
// '\0'
if (try_skip("0"sv)) {
if (!lookahead_any(s_decimal_characters)) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)0 });
return true;
}
back();
}
// LegacyOctalEscapeSequence
if (m_should_use_browser_extended_grammar) {
if (!flags.unicode) {
if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)escape.value() });
match_length_minimum += 1;
return true;
}
}
}
// HexEscape
if (try_skip("x"sv)) {
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() });
return true;
}
if (!flags.unicode) {
// '\x' is allowed in non-unicode mode, just matches 'x'.
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'x' });
return true;
}
set_error(Error::InvalidPattern);
return false;
}
if (try_skip("u"sv)) {
if (auto code_point = consume_escaped_code_point(flags.unicode); code_point.has_value()) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)code_point.value() });
return true;
}
return false;
}
// IdentityEscape
for (auto ch : identity_escape_characters(flags.unicode, m_should_use_browser_extended_grammar)) {
if (try_skip({ &ch, 1 })) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)ch });
return true;
}
}
if (flags.unicode) {
if (try_skip("/"sv)) {
match_length_minimum += 1;
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'/' });
return true;
}
}
return false;
}
bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags)
{
if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
if (auto escape = escape_str.to_uint(); escape.has_value()) {
@ -1453,132 +1584,12 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
back(escape_str.length());
}
// CharacterEscape > ControlEscape
if (try_skip("f"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
Vector<CompareTypeAndValuePair> escape_compares;
if (parse_character_escape(escape_compares, match_length_minimum, flags)) {
stack.insert_bytecode_compare_values(move(escape_compares));
return true;
}
if (try_skip("n"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
return true;
}
if (try_skip("r"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
return true;
}
if (try_skip("t"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
return true;
}
if (try_skip("v"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
return true;
}
// CharacterEscape > ControlLetter
if (try_skip("c"sv)) {
for (auto c : s_alphabetic_characters) {
if (try_skip({ &c, 1 })) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
return true;
}
}
if (flags.unicode) {
set_error(Error::InvalidPattern);
return false;
}
if (m_should_use_browser_extended_grammar) {
back(1 + !done());
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
match_length_minimum += 1;
return true;
}
// Allow '\c' in non-unicode mode, just matches 'c'.
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
return true;
}
// '\0'
if (try_skip("0"sv)) {
if (!lookahead_any(s_decimal_characters)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
return true;
}
back();
}
// LegacyOctalEscapeSequence
if (m_should_use_browser_extended_grammar) {
if (!flags.unicode) {
if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } });
match_length_minimum += 1;
return true;
}
}
}
// HexEscape
if (try_skip("x"sv)) {
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
return true;
}
if (!flags.unicode) {
// '\x' is allowed in non-unicode mode, just matches 'x'.
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
return true;
}
set_error(Error::InvalidPattern);
return false;
}
if (try_skip("u"sv)) {
if (auto code_point = consume_escaped_code_point(flags.unicode); code_point.has_value()) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
return true;
}
return false;
}
// IdentityEscape
for (auto ch : identity_escape_characters(flags.unicode, m_should_use_browser_extended_grammar)) {
if (try_skip({ &ch, 1 })) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
return true;
}
}
if (flags.unicode) {
if (try_skip("/"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
return true;
}
}
if (flags.named && try_skip("k"sv)) {
auto name = read_capture_group_specifier(true);
if (name.is_empty()) {

View File

@ -255,6 +255,8 @@ private:
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, ParseFlags);
bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
bool parse_character_escape(Vector<CompareTypeAndValuePair>&, size_t&, ParseFlags);
// Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags);
bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);