From 6bbb26fdaf19e95c8082819d836f46cd7d1f7a86 Mon Sep 17 00:00:00 2001 From: AnotherTest Date: Thu, 1 Apr 2021 18:30:47 +0430 Subject: [PATCH] LibRegex: Allow references to capture groups that aren't parsed yet This only applies to the ECMA262 parser. This behaviour is an ECMA262-specific quirk, such references always generate zero-length matches (even on subsequent passes). Also adds a test in LibJS's test suite. Fixes #6039. --- .../builtins/RegExp/RegExp.prototype.exec.js | 12 +++++ Userland/Libraries/LibRegex/RegexByteCode.cpp | 13 +++-- Userland/Libraries/LibRegex/RegexByteCode.h | 2 +- Userland/Libraries/LibRegex/RegexLexer.h | 1 + Userland/Libraries/LibRegex/RegexParser.cpp | 50 +++++++++++++++++++ Userland/Libraries/LibRegex/RegexParser.h | 8 +++ 6 files changed, 80 insertions(+), 6 deletions(-) diff --git a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js index 5747ae487e2..3cdd5dd8e33 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js +++ b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js @@ -66,3 +66,15 @@ test("not matching", () => { expect(res).toBe(null); }); + +// Backreference to a group not yet parsed: #6039 +test("Future group backreference, #6039", () => { + let re = /(\3)(\1)(a)/; + let result = re.exec("cat"); + expect(result.length).toBe(4); + expect(result[0]).toBe("a"); + expect(result[1]).toBe(""); + expect(result[2]).toBe(""); + expect(result[3]).toBe("a"); + expect(result.index).toBe(1); +}); diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index d9b91a6ec40..00578fb211b 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -399,6 +399,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M size_t string_position = state.string_position; bool inverse_matched { false }; + bool had_zero_length_match { false }; size_t offset { state.instruction_position + 3 }; for (size_t i = 0; i < arguments_count(); ++i) { @@ -454,7 +455,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M if (input.view.length() - state.string_position < length) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length)) + if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else if (compare_type == CharacterCompareType::CharClass) { @@ -488,7 +489,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M if (input.view.length() - state.string_position < str.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str.characters_without_null_termination(), str.length())) + if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else if (compare_type == CharacterCompareType::NamedReference) { @@ -506,7 +507,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M if (input.view.length() - state.string_position < str.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str.characters_without_null_termination(), str.length())) + if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else { @@ -519,7 +520,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M if (current_inversion_state() && !inverse_matched) ++state.string_position; - if (string_position == state.string_position || state.string_position > input.view.length()) + if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Continue; @@ -542,7 +543,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchSt } } -ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) +ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match) { if (input.view.is_u8_view()) { auto str_view1 = StringView(str, length); @@ -558,6 +559,8 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, Match if (str_view1 == str_view2) { state.string_position += length; + if (length == 0) + had_zero_length_match = true; return true; } } diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index 1909e36a91c..d8021adf205 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -768,7 +768,7 @@ public: private: ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched); - ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length); + ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match); ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); }; diff --git a/Userland/Libraries/LibRegex/RegexLexer.h b/Userland/Libraries/LibRegex/RegexLexer.h index 959fa50c33f..d6076cee8d1 100644 --- a/Userland/Libraries/LibRegex/RegexLexer.h +++ b/Userland/Libraries/LibRegex/RegexLexer.h @@ -93,6 +93,7 @@ public: void set_source(const StringView source) { m_source = source; } bool try_skip(char); char skip(); + const auto& source() const { return m_source; } private: ALWAYS_INLINE char peek(size_t offset = 0) const; diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 3bf96475d26..fe00adfd97b 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1114,12 +1114,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini { if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); !escape_str.is_empty()) { if (auto escape = escape_str.to_uint(); escape.has_value()) { + // See if this is a "back"-reference (we've already parsed the group it refers to) auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value()); if (maybe_length.has_value()) { match_length_minimum += maybe_length.value(); stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } }); return true; } + // It's not a pattern seen before, so we have to see if it's a valid reference to a future group. + if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) { + // This refers to a future group, and it will _always_ be matching an empty string + // So just match nothing and move on. + return true; + } if (!m_should_use_browser_extended_grammar) { set_error(Error::InvalidNumber); return false; @@ -1729,4 +1736,47 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi return true; } + +size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis() +{ + if (m_total_number_of_capturing_parenthesis.has_value()) + return m_total_number_of_capturing_parenthesis.value(); + + GenericLexer lexer { m_parser_state.lexer.source() }; + size_t count = 0; + while (!lexer.is_eof()) { + switch (lexer.peek()) { + case '\\': + lexer.consume(2); + continue; + case '[': + while (!lexer.is_eof()) { + if (lexer.consume_specific('\\')) + lexer.consume(); + else if (lexer.consume_specific(']')) + break; + lexer.consume(); + } + break; + case '(': + if (lexer.consume_specific('?')) { + // non-capturing group '(?:', lookaround '(?<='/'(? parse_legacy_octal_escape(); + size_t ensure_total_number_of_capturing_parenthesis(); + + // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs + // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always + // parse it twice, so we'll just do so when it's actually needed. + // Most patterns should have no need to ever populate this field. + Optional m_total_number_of_capturing_parenthesis; + // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag. bool m_should_use_browser_extended_grammar { false }; };