LibRegex: Allow references to capture groups that aren't parsed yet

This only applies to the ECMA262 parser.
This behaviour is an ECMA262-specific quirk, such references always
generate zero-length matches (even on subsequent passes).
Also adds a test in LibJS's test suite.

Fixes #6039.
This commit is contained in:
AnotherTest 2021-04-01 18:30:47 +04:30 committed by Andreas Kling
parent 804ab79995
commit 6bbb26fdaf
Notes: sideshowbarker 2024-07-18 20:55:08 +09:00
6 changed files with 80 additions and 6 deletions

View File

@ -66,3 +66,15 @@ test("not matching", () => {
expect(res).toBe(null); expect(res).toBe(null);
}); });
// Backreference to a group not yet parsed: #6039
test("Future group backreference, #6039", () => {
let re = /(\3)(\1)(a)/;
let result = re.exec("cat");
expect(result.length).toBe(4);
expect(result[0]).toBe("a");
expect(result[1]).toBe("");
expect(result[2]).toBe("");
expect(result[3]).toBe("a");
expect(result.index).toBe(1);
});

View File

@ -399,6 +399,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
size_t string_position = state.string_position; size_t string_position = state.string_position;
bool inverse_matched { false }; bool inverse_matched { false };
bool had_zero_length_match { false };
size_t offset { state.instruction_position + 3 }; size_t offset { state.instruction_position + 3 };
for (size_t i = 0; i < arguments_count(); ++i) { for (size_t i = 0; i < arguments_count(); ++i) {
@ -454,7 +455,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
if (input.view.length() - state.string_position < length) if (input.view.length() - state.string_position < length)
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length)) if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match))
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
} else if (compare_type == CharacterCompareType::CharClass) { } else if (compare_type == CharacterCompareType::CharClass) {
@ -488,7 +489,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
if (input.view.length() - state.string_position < str.length()) if (input.view.length() - state.string_position < str.length())
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
if (!compare_string(input, state, str.characters_without_null_termination(), str.length())) if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
} else if (compare_type == CharacterCompareType::NamedReference) { } else if (compare_type == CharacterCompareType::NamedReference) {
@ -506,7 +507,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
if (input.view.length() - state.string_position < str.length()) if (input.view.length() - state.string_position < str.length())
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
if (!compare_string(input, state, str.characters_without_null_termination(), str.length())) if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
} else { } else {
@ -519,7 +520,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
if (current_inversion_state() && !inverse_matched) if (current_inversion_state() && !inverse_matched)
++state.string_position; ++state.string_position;
if (string_position == state.string_position || state.string_position > input.view.length()) if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
return ExecutionResult::Continue; return ExecutionResult::Continue;
@ -542,7 +543,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchSt
} }
} }
ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match)
{ {
if (input.view.is_u8_view()) { if (input.view.is_u8_view()) {
auto str_view1 = StringView(str, length); auto str_view1 = StringView(str, length);
@ -558,6 +559,8 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, Match
if (str_view1 == str_view2) { if (str_view1 == str_view2) {
state.string_position += length; state.string_position += length;
if (length == 0)
had_zero_length_match = true;
return true; return true;
} }
} }

View File

@ -768,7 +768,7 @@ public:
private: private:
ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length); ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match);
ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
}; };

View File

@ -93,6 +93,7 @@ public:
void set_source(const StringView source) { m_source = source; } void set_source(const StringView source) { m_source = source; }
bool try_skip(char); bool try_skip(char);
char skip(); char skip();
const auto& source() const { return m_source; }
private: private:
ALWAYS_INLINE char peek(size_t offset = 0) const; ALWAYS_INLINE char peek(size_t offset = 0) const;

View File

@ -1114,12 +1114,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
{ {
if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); !escape_str.is_empty()) { if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); !escape_str.is_empty()) {
if (auto escape = escape_str.to_uint(); escape.has_value()) { if (auto escape = escape_str.to_uint(); escape.has_value()) {
// See if this is a "back"-reference (we've already parsed the group it refers to)
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value()); auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
if (maybe_length.has_value()) { if (maybe_length.has_value()) {
match_length_minimum += maybe_length.value(); match_length_minimum += maybe_length.value();
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } }); stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
return true; return true;
} }
// It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
// This refers to a future group, and it will _always_ be matching an empty string
// So just match nothing and move on.
return true;
}
if (!m_should_use_browser_extended_grammar) { if (!m_should_use_browser_extended_grammar) {
set_error(Error::InvalidNumber); set_error(Error::InvalidNumber);
return false; return false;
@ -1729,4 +1736,47 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
return true; return true;
} }
size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
{
if (m_total_number_of_capturing_parenthesis.has_value())
return m_total_number_of_capturing_parenthesis.value();
GenericLexer lexer { m_parser_state.lexer.source() };
size_t count = 0;
while (!lexer.is_eof()) {
switch (lexer.peek()) {
case '\\':
lexer.consume(2);
continue;
case '[':
while (!lexer.is_eof()) {
if (lexer.consume_specific('\\'))
lexer.consume();
else if (lexer.consume_specific(']'))
break;
lexer.consume();
}
break;
case '(':
if (lexer.consume_specific('?')) {
// non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<'
if (!lexer.consume_specific('<'))
break;
if (lexer.next_is(is_any_of("=!")))
break;
++count;
} else {
++count;
}
break;
}
lexer.consume();
}
m_total_number_of_capturing_parenthesis = count;
return count;
}
} }

View File

@ -210,6 +210,14 @@ private:
bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length); bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
Optional<u8> parse_legacy_octal_escape(); Optional<u8> parse_legacy_octal_escape();
size_t ensure_total_number_of_capturing_parenthesis();
// ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
// always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
// parse it twice, so we'll just do so when it's actually needed.
// Most patterns should have no need to ever populate this field.
Optional<size_t> m_total_number_of_capturing_parenthesis;
// Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag. // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
bool m_should_use_browser_extended_grammar { false }; bool m_should_use_browser_extended_grammar { false };
}; };