LibRegex: Fix ECMA-262 parsing of invalid identity escapes

* Only alphabetic (A-Z, a-z) characters may be escaped with \c. The loop
  currently parsing \c includes code points between the upper/lower case
  groups.
* In Unicode mode, all invalid identity escapes should cause a parser
  error, even in browser-extended mode.
* Avoid an infinite loop when parsing the pattern "\c" on its own.
This commit is contained in:
Timothy Flynn 2021-08-11 09:39:10 -04:00 committed by Linus Groh
parent 51b3fb5532
commit e887314472
Notes: sideshowbarker 2024-07-18 05:40:56 +09:00
2 changed files with 35 additions and 11 deletions

View File

@ -525,6 +525,18 @@ TEST_CASE(ECMA262_parse)
{ "\\\\p{1}", regex::Error::NoError, ECMAScriptFlags::Unicode },
{ "\\\\p{AsCiI}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\\\p{ASCII}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\c", regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "\\c", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "[\\c]", regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "[\\c]", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\c`", regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "\\c`", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "[\\c`]", regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "[\\c`]", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\A", regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "\\A", regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
{ "[\\A]", regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
{ "[\\A]", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
};
for (auto& test : tests) {
@ -579,6 +591,7 @@ TEST_CASE(ECMA262_match)
{ "\\05", "\5", true, ECMAScriptFlags::BrowserExtended },
{ "\\455", "\45""5", true, ECMAScriptFlags::BrowserExtended },
{ "\\314", "\314", true, ECMAScriptFlags::BrowserExtended },
{ "\\c", "\\c", true, ECMAScriptFlags::BrowserExtended },
{ "\\cf", "\06", true, ECMAScriptFlags::BrowserExtended },
{ "\\c1", "\\c1", true, ECMAScriptFlags::BrowserExtended },
{ "[\\c1]", "\x11", true, ECMAScriptFlags::BrowserExtended },

View File

@ -17,6 +17,7 @@
namespace regex {
static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
ALWAYS_INLINE bool Parser::set_error(Error error)
{
@ -1400,7 +1401,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
// CharacterEscape > ControlLetter
if (try_skip("c")) {
for (auto c = 'A'; c <= 'z'; ++c) {
for (auto c : s_alphabetic_characters) {
if (try_skip({ &c, 1 })) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
@ -1408,18 +1409,18 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
}
}
if (m_should_use_browser_extended_grammar) {
back(2);
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
match_length_minimum += 1;
return true;
}
if (unicode) {
set_error(Error::InvalidPattern);
return false;
}
if (m_should_use_browser_extended_grammar) {
back(1 + !done());
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
match_length_minimum += 1;
return true;
}
// Allow '\c' in non-unicode mode, just matches 'c'.
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
@ -1768,10 +1769,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
// CharacterEscape > ControlLetter
if (try_skip("c")) {
for (auto c = 'A'; c <= 'z'; ++c) {
if (try_skip({ &c, 1 }))
for (auto c : s_alphabetic_characters) {
if (try_skip({ &c, 1 })) {
return { CharClassRangeElement { .code_point = (u32)(c % 32), .is_character_class = false } };
}
}
if (unicode) {
set_error(Error::InvalidPattern);
return {};
}
if (m_should_use_browser_extended_grammar) {
for (auto c = '0'; c <= '9'; ++c) {
if (try_skip({ &c, 1 }))
@ -1780,7 +1788,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
if (try_skip("_"))
return { CharClassRangeElement { .code_point = (u32)('_' % 32), .is_character_class = false } };
back(2);
back(1 + !done());
return { CharClassRangeElement { .code_point = '\\', .is_character_class = false } };
}
}
@ -1856,6 +1864,9 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
// Any unrecognised escape is allowed in non-unicode mode.
return { CharClassRangeElement { .code_point = (u32)skip(), .is_character_class = false } };
}
set_error(Error::InvalidPattern);
return {};
}
if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))