diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index ea8707ccf7f..bcbfa545f97 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -569,6 +569,14 @@ TEST_CASE(ECMA262_parse) { "}"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, { "}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, { "\\}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode }, + { "a{9007199254740991}"sv }, // 2^53 - 1 + { "a{9007199254740991,}"sv }, + { "a{9007199254740991,9007199254740991}"sv }, + { "a{9007199254740992}"sv, regex::Error::InvalidBraceContent }, + { "a{9007199254740992,}"sv, regex::Error::InvalidBraceContent }, + { "a{9007199254740991,9007199254740992}"sv, regex::Error::InvalidBraceContent }, + { "a{9007199254740992,9007199254740991}"sv, regex::Error::InvalidBraceContent }, + { "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent }, }; for (auto& test : tests) { @@ -619,6 +627,14 @@ TEST_CASE(ECMA262_match) { "\\^"sv, "^"sv }, { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode }, { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]{15}"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode }, + { "(a{2}){3}"sv, "aaaaaa"sv }, + { "(a{2}){3}"sv, "aaaabaa"sv, false }, + { "(a{2}){4}"sv, "aaaaaaaa"sv }, + { "(a{2}){4}"sv, "aaaaaabaa"sv, false }, + { "(a{3}){2}"sv, "aaaaaa"sv }, + { "(a{3}){2}"sv, "aaaabaa"sv, false }, + { "(a{4}){2}"sv, "aaaaaaaa"sv }, + { "(a{4}){2}"sv, "aaaaaabaa"sv, false }, // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended }, { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended }, diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 146a0bbd1dc..0d411ba34c9 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -175,6 +175,9 @@ void ByteCode::ensure_opcodes_initialized() case OpCodeId::SaveRightNamedCaptureGroup: s_opcodes[i] = make(); break; + case OpCodeId::Repeat: + s_opcodes[i] = make(); + break; } } s_opcodes_initialized = true; @@ -850,4 +853,23 @@ Vector const OpCode_Compare::variable_arguments_to_string(Optional 0); + + if (id() >= state.repetition_marks.size()) + state.repetition_marks.resize(id() + 1); + auto& repetition_mark = state.repetition_marks.at(id()); + + if (repetition_mark == count() - 1) { + repetition_mark = 0; + } else { + state.instruction_position -= offset() + size(); + ++repetition_mark; + } + + return ExecutionResult::Continue; +} + } diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index 475ce61bd6a..926737fa8b3 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -40,6 +40,7 @@ using ByteCodeValueType = u64; __ENUMERATE_OPCODE(Restore) \ __ENUMERATE_OPCODE(GoBack) \ __ENUMERATE_OPCODE(ClearCaptureGroup) \ + __ENUMERATE_OPCODE(Repeat) \ __ENUMERATE_OPCODE(Exit) // clang-format off @@ -331,10 +332,11 @@ public: // LABEL _END = alterantive_bytecode.size } - static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional maximum, bool greedy = true) + template + static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional maximum, size_t repetition_mark_id, bool greedy = true) requires(IsIntegral) { ByteCode new_bytecode; - new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum); + new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id); if (maximum.has_value()) { auto jump_kind = static_cast(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump); @@ -343,7 +345,7 @@ public: new_bytecode.empend(jump_kind); new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label - for (size_t i = 0; i < diff; ++i) { + for (T i = 0; i < diff; ++i) { new_bytecode.extend(bytecode_to_repeat); new_bytecode.empend(jump_kind); new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label @@ -359,10 +361,28 @@ public: bytecode_to_repeat = move(new_bytecode); } - void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n) + template + void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id) requires(IsIntegral) { - for (size_t i = 0; i < n; ++i) + // LABEL _LOOP + // REGEXP + // REPEAT _LOOP N-1 + // REGEXP + if (n == 0) + return; + + // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the + // REPEAT instruction forcing another loop. + extend(bytecode_to_repeat); + + if (n > 1) { + empend(static_cast(OpCodeId::Repeat)); + empend(bytecode_to_repeat.size()); + empend(static_cast(n - 1)); + empend(repetition_mark_id); + extend(bytecode_to_repeat); + } } static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy) @@ -672,6 +692,21 @@ private: ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched); }; +class OpCode_Repeat : public OpCode { +public: + ExecutionResult execute(MatchInput const& input, MatchState& state) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Repeat; } + ALWAYS_INLINE size_t size() const override { return 4; } + ALWAYS_INLINE size_t offset() const { return argument(0); } + ALWAYS_INLINE u64 count() const { return argument(1); } + ALWAYS_INLINE size_t id() const { return argument(2); } + String const arguments_string() const override + { + auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0; + return String::formatted("offset={} count={} id={} rep={}, sp: {}", offset(), count() + 1, id(), reps + 1, state().string_position); + } +}; + template bool is(OpCode const&); diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index dc0b7bb52a3..0fdb27f1476 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -505,6 +505,7 @@ struct MatchState { size_t fork_at_position { 0 }; Vector matches; Vector> capture_group_matches; + Vector repetition_marks; }; } diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index 4fed9ec6919..12f63c016ed 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -206,6 +206,7 @@ RegexResult Matcher::match(Vector const& views, Optiona state.string_position = view_index; state.string_position_in_code_units = view_index; state.instruction_position = 0; + state.repetition_marks.clear(); auto success = execute(input, state, temp_operations); // This success is acceptable only if it doesn't read anything from the input (input length is 0). @@ -238,6 +239,7 @@ RegexResult Matcher::match(Vector const& views, Optiona state.string_position = view_index; state.string_position_in_code_units = view_index; state.instruction_position = 0; + state.repetition_marks.clear(); auto success = execute(input, state, operations); if (!success.has_value()) diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index db08d836fcc..f18b0179ffb 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -17,6 +17,7 @@ namespace regex { static constexpr size_t s_maximum_repetition_count = 1024 * 1024; +static constexpr u64 s_ecma262_maximum_repetition_count = (1ull << 53) - 1; static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv; static constexpr auto s_decimal_characters = "0123456789"sv; @@ -419,7 +420,8 @@ bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_ if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count)) return set_error(Error::InvalidBraceContent); - ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, true); + auto repetition_mark_id = m_parser_state.repetition_mark_count++; + ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, repetition_mark_id, true); match_length_minimum += re_match_length_minimum * min_limit; } else { match_length_minimum += re_match_length_minimum; @@ -564,15 +566,17 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco if (match(TokenType::Comma)) { consume(); } else { + auto repetition_mark_id = m_parser_state.repetition_mark_count++; + ByteCode bytecode; - bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum); + bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id); bytecode_to_repeat = move(bytecode); consume(TokenType::RightCurly, Error::MismatchingBrace); return !has_error(); } - Optional maybe_maximum {}; + Optional maybe_maximum {}; number_builder.clear(); while (match(TokenType::Char)) { number_builder.append(consume().value()); @@ -585,7 +589,8 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco maybe_maximum = value.value(); } - ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum); + auto repetition_mark_id = m_parser_state.repetition_mark_count++; + ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, repetition_mark_id); consume(TokenType::RightCurly, Error::MismatchingBrace); return !has_error(); @@ -1141,7 +1146,7 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim } repetition_mark { Repetition::None }; bool ungreedy = false; - Optional repeat_min, repeat_max; + Optional repeat_min, repeat_max; if (match(TokenType::Asterisk)) { consume(); @@ -1182,10 +1187,12 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim ByteCode::transform_bytecode_repetition_zero_or_one(stack, !ungreedy); match_length_minimum = 0; break; - case Repetition::Explicit: - ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, !ungreedy); + case Repetition::Explicit: { + auto repetition_mark_id = m_parser_state.repetition_mark_count++; + ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, repetition_mark_id, !ungreedy); match_length_minimum *= repeat_min.value(); break; + } case Repetition::None: VERIFY_NOT_REACHED(); } @@ -1193,7 +1200,7 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim return true; } -bool ECMA262Parser::parse_interval_quantifier(Optional& repeat_min, Optional& repeat_max) +bool ECMA262Parser::parse_interval_quantifier(Optional& repeat_min, Optional& repeat_max) { VERIFY(match(TokenType::LeftCurly)); consume(); @@ -1202,7 +1209,7 @@ bool ECMA262Parser::parse_interval_quantifier(Optional& repeat_min, Opti auto low_bound_string = read_digits_as_string(); chars_consumed += low_bound_string.length(); - auto low_bound = low_bound_string.to_uint(); + auto low_bound = low_bound_string.to_uint(); if (!low_bound.has_value()) { if (!m_should_use_browser_extended_grammar && done()) @@ -1218,7 +1225,7 @@ bool ECMA262Parser::parse_interval_quantifier(Optional& repeat_min, Opti consume(); ++chars_consumed; auto high_bound_string = read_digits_as_string(); - auto high_bound = high_bound_string.to_uint(); + auto high_bound = high_bound_string.to_uint(); if (high_bound.has_value()) { repeat_max = high_bound.value(); chars_consumed += high_bound_string.length(); @@ -1243,6 +1250,9 @@ bool ECMA262Parser::parse_interval_quantifier(Optional& repeat_min, Opti set_error(Error::InvalidBraceContent); } + if ((*repeat_min > s_ecma262_maximum_repetition_count) || (repeat_max.has_value() && (*repeat_max > s_ecma262_maximum_repetition_count))) + return set_error(Error::InvalidBraceContent); + return true; } diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index c1f54be69c9..d9825bcd788 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -102,6 +102,7 @@ protected: size_t capture_groups_count { 0 }; size_t named_capture_groups_count { 0 }; size_t match_length_minimum { 0 }; + size_t repetition_mark_count { 0 }; AllOptions regex_options; HashMap capture_group_minimum_lengths; HashMap named_capture_groups; @@ -232,7 +233,7 @@ private: bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named); bool parse_atom(ByteCode&, size_t&, bool unicode, bool named); bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named); - bool parse_interval_quantifier(Optional& repeat_min, Optional& repeat_max); + bool parse_interval_quantifier(Optional& repeat_min, Optional& repeat_max); bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named); bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named); bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);