From f1ce998d73979b80352675d6d3170399b2311913 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sat, 14 Aug 2021 16:28:54 -0400 Subject: [PATCH] LibRegex+LibJS: Combine named and unnamed capture groups in MatchState Combining these into one list helps reduce the size of MatchState, and as a result, reduces the amount of memory consumed during execution of very large regex matches. Doing this also allows us to remove a few regex byte code instructions: ClearNamedCaptureGroup, SaveLeftNamedCaptureGroup, and NamedReference. Named groups now behave the same as unnamed groups for these operations. Note that SaveRightNamedCaptureGroup still exists to cache the matched group name. This also removes the recursion level from the MatchState, as it can exist as a local variable in Matcher::execute instead. --- Tests/LibRegex/Regex.cpp | 12 ++- .../LibJS/Runtime/RegExpPrototype.cpp | 22 +++-- Userland/Libraries/LibRegex/RegexByteCode.cpp | 80 +++---------------- Userland/Libraries/LibRegex/RegexByteCode.h | 69 +--------------- Userland/Libraries/LibRegex/RegexMatch.h | 15 +++- Userland/Libraries/LibRegex/RegexMatcher.cpp | 22 +---- Userland/Libraries/LibRegex/RegexMatcher.h | 1 - Userland/Libraries/LibRegex/RegexParser.cpp | 39 +++------ Userland/Libraries/LibRegex/RegexParser.h | 11 ++- 9 files changed, 69 insertions(+), 202 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 9529ce790ce..ea8707ccf7f 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -422,9 +422,11 @@ TEST_CASE(named_capture_group) EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true); EXPECT_EQ(result.count, 2u); EXPECT_EQ(result.matches.at(0).view, "Opacity=255"); - EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255"); + EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255"); + EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "Test"); EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0"); - EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0"); + EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0"); + EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "Test"); } TEST_CASE(ecma262_named_capture_group_with_dollar_sign) @@ -443,9 +445,11 @@ TEST_CASE(ecma262_named_capture_group_with_dollar_sign) EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true); EXPECT_EQ(result.count, 2u); EXPECT_EQ(result.matches.at(0).view, "Opacity=255"); - EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("$Test$").view, "255"); + EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255"); + EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "$Test$"); EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0"); - EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("$Test$").view, "0"); + EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0"); + EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "$Test$"); } TEST_CASE(a_star) diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp index 7a5c4fad999..ca3f4593b5a 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -263,13 +263,16 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege return {}; } - auto* array = Array::create(global_object, result.n_capture_groups + 1); + auto* array = Array::create(global_object, result.n_named_capture_groups + 1); if (vm.exception()) return {}; Vector> indices { Match::create(match) }; HashMap group_names; + bool has_groups = result.n_named_capture_groups != 0; + Object* groups_object = has_groups ? Object::create(global_object, nullptr) : nullptr; + for (size_t i = 0; i < result.n_capture_groups; ++i) { auto capture_value = js_undefined(); auto& capture = result.capture_group_matches[0][i + 1]; @@ -280,22 +283,15 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege indices.append(Match::create(capture)); } array->create_data_property_or_throw(i + 1, capture_value); - } - bool has_groups = result.n_named_capture_groups > 0; - Value groups = js_undefined(); - - if (has_groups) { - auto groups_object = Object::create(global_object, nullptr); - - for (auto& entry : result.named_capture_group_matches[0]) { - groups_object->create_data_property_or_throw(entry.key, js_string(vm, entry.value.view.u16_view())); - group_names.set(entry.key, Match::create(entry.value)); + if (capture.capture_group_name.has_value()) { + auto group_name = capture.capture_group_name->to_string(); + groups_object->create_data_property_or_throw(group_name, js_string(vm, capture.view.u16_view())); + group_names.set(move(group_name), Match::create(capture)); } - - groups = move(groups_object); } + Value groups = has_groups ? groups_object : js_undefined(); array->create_data_property_or_throw(vm.names.groups, groups); if (has_indices) { diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 7cad14ba92b..3d9b1d3882f 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -166,18 +166,12 @@ void ByteCode::ensure_opcodes_initialized() case OpCodeId::ClearCaptureGroup: s_opcodes[i] = make(); break; - case OpCodeId::ClearNamedCaptureGroup: - s_opcodes[i] = make(); - break; case OpCodeId::SaveLeftCaptureGroup: s_opcodes[i] = make(); break; case OpCodeId::SaveRightCaptureGroup: s_opcodes[i] = make(); break; - case OpCodeId::SaveLeftNamedCaptureGroup: - s_opcodes[i] = make(); - break; case OpCodeId::SaveRightNamedCaptureGroup: s_opcodes[i] = make(); break; @@ -378,52 +372,26 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput c return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const -{ - if (input.match_index < state.capture_group_matches.size()) { - auto& group = state.named_capture_group_matches[input.match_index]; - if (auto it = group.find(name()); it != group.end()) - it->value.reset(); - } - return ExecutionResult::Continue; -} - -ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const -{ - if (input.match_index >= state.named_capture_group_matches.size()) { - state.named_capture_group_matches.ensure_capacity(input.match_index); - auto capacity = state.named_capture_group_matches.capacity(); - for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i) - state.named_capture_group_matches.empend(); - } - state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position; - return ExecutionResult::Continue; -} - ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const { - StringView capture_group_name = name(); + auto& match = state.capture_group_matches.at(input.match_index).at(id()); + auto start_position = match.left_column; + if (state.string_position < start_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; - if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) { - auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column; - auto length = state.string_position - start_position; + auto length = state.string_position - start_position; - auto& map = state.named_capture_group_matches.at(input.match_index); + if (start_position < match.column) + return ExecutionResult::Continue; - if constexpr (REGEX_DEBUG) { - VERIFY(start_position + length <= input.view.length()); - dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length)); - } + VERIFY(start_position + length <= input.view.length()); - VERIFY(start_position + length <= input.view.length()); - auto view = input.view.substring_view(start_position, length); - if (input.regex_options & AllFlags::StringCopyMatches) { - map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string - } else { - map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string - } + auto view = input.view.substring_view(start_position, length); + + if (input.regex_options & AllFlags::StringCopyMatches) { + match = { view.to_string(), name(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string } else { - warnln("Didn't find corresponding capture group match for name={}, match_index={}", capture_group_name.to_string(), input.match_index); + match = { view, name(), input.line, start_position, input.global_offset + start_position }; // take view to original string } return ExecutionResult::Continue; @@ -543,24 +511,6 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (!compare_string(input, state, str, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; - } else if (compare_type == CharacterCompareType::NamedReference) { - auto ptr = (char const*)m_bytecode->at(offset++); - auto length = (size_t)m_bytecode->at(offset++); - StringView name { ptr, length }; - - auto group = state.named_capture_group_matches.at(input.match_index).get(name); - if (!group.has_value()) - return ExecutionResult::Failed_ExecuteLowPrioForks; - - auto str = group.value().view; - - // We want to compare a string that is definitely longer than the available string - if (input.view.length() < state.string_position + str.length()) - return ExecutionResult::Failed_ExecuteLowPrioForks; - - if (!compare_string(input, state, str, had_zero_length_match)) - return ExecutionResult::Failed_ExecuteLowPrioForks; - } else if (compare_type == CharacterCompareType::Property) { auto property = static_cast(m_bytecode->at(offset++)); compare_property(input, state, property, current_inversion_state(), inverse_matched); @@ -869,10 +819,6 @@ Vector const OpCode_Compare::variable_arguments_to_string(Optionalat(offset++); - auto length = m_bytecode->at(offset++); - result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length })); } else if (compare_type == CharacterCompareType::Reference) { auto ref = m_bytecode->at(offset++); result.empend(String::formatted("number={}", ref)); diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index c1c7af235bf..ab305785e2b 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -32,7 +32,6 @@ using ByteCodeValueType = u64; __ENUMERATE_OPCODE(FailForks) \ __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ - __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \ __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ __ENUMERATE_OPCODE(CheckBegin) \ __ENUMERATE_OPCODE(CheckEnd) \ @@ -41,7 +40,6 @@ using ByteCodeValueType = u64; __ENUMERATE_OPCODE(Restore) \ __ENUMERATE_OPCODE(GoBack) \ __ENUMERATE_OPCODE(ClearCaptureGroup) \ - __ENUMERATE_OPCODE(ClearNamedCaptureGroup) \ __ENUMERATE_OPCODE(Exit) // clang-format off @@ -65,7 +63,6 @@ enum class OpCodeId : ByteCodeValueType { __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \ - __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \ __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \ @@ -159,7 +156,6 @@ public: VERIFY(value.type != CharacterCompareType::RangeExpressionDummy); VERIFY(value.type != CharacterCompareType::Undefined); VERIFY(value.type != CharacterCompareType::String); - VERIFY(value.type != CharacterCompareType::NamedReference); arguments.append((ByteCodeValueType)value.type); if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse) @@ -187,13 +183,6 @@ public: empend(index); } - void insert_bytecode_clear_named_capture_group(StringView name) - { - empend(static_cast(OpCodeId::ClearNamedCaptureGroup)); - empend(reinterpret_cast(name.characters_without_null_termination())); - empend(name.length()); - } - void insert_bytecode_compare_string(StringView view) { ByteCode bytecode; @@ -212,49 +201,24 @@ public: extend(move(bytecode)); } - void insert_bytecode_compare_named_reference(StringView name) - { - ByteCode bytecode; - - bytecode.empend(static_cast(OpCodeId::Compare)); - bytecode.empend(static_cast(1)); // number of arguments - - ByteCode arguments; - - arguments.empend(static_cast(CharacterCompareType::NamedReference)); - arguments.empend(reinterpret_cast(name.characters_without_null_termination())); - arguments.empend(name.length()); - - bytecode.empend(arguments.size()); // size of arguments - bytecode.extend(move(arguments)); - - extend(move(bytecode)); - } - void insert_bytecode_group_capture_left(size_t capture_groups_count) { empend(static_cast(OpCodeId::SaveLeftCaptureGroup)); empend(capture_groups_count); } - void insert_bytecode_group_capture_left(StringView const& name) - { - empend(static_cast(OpCodeId::SaveLeftNamedCaptureGroup)); - empend(reinterpret_cast(name.characters_without_null_termination())); - empend(name.length()); - } - void insert_bytecode_group_capture_right(size_t capture_groups_count) { empend(static_cast(OpCodeId::SaveRightCaptureGroup)); empend(capture_groups_count); } - void insert_bytecode_group_capture_right(StringView const& name) + void insert_bytecode_group_capture_right(size_t capture_groups_count, StringView const& name) { empend(static_cast(OpCodeId::SaveRightNamedCaptureGroup)); empend(reinterpret_cast(name.characters_without_null_termination())); empend(name.length()); + empend(capture_groups_count); } enum class LookAroundType { @@ -655,19 +619,6 @@ public: String const arguments_string() const override { return String::formatted("id={}", id()); } }; -class OpCode_ClearNamedCaptureGroup final : public OpCode { -public: - ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; - ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearNamedCaptureGroup; } - ALWAYS_INLINE size_t size() const override { return 3; } - ALWAYS_INLINE StringView name() const { return { reinterpret_cast(argument(0)), length() }; } - ALWAYS_INLINE size_t length() const { return argument(1); } - String const arguments_string() const override - { - return String::formatted("name={}, length={}", name(), length()); - } -}; - class OpCode_SaveLeftCaptureGroup final : public OpCode { public: ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; @@ -686,26 +637,14 @@ public: String const arguments_string() const override { return String::formatted("id={}", id()); } }; -class OpCode_SaveLeftNamedCaptureGroup final : public OpCode { -public: - ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; - ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; } - ALWAYS_INLINE size_t size() const override { return 3; } - ALWAYS_INLINE StringView name() const { return { reinterpret_cast(argument(0)), length() }; } - ALWAYS_INLINE size_t length() const { return argument(1); } - String const arguments_string() const override - { - return String::formatted("name={}, length={}", name(), length()); - } -}; - class OpCode_SaveRightNamedCaptureGroup final : public OpCode { public: ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override; ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; } - ALWAYS_INLINE size_t size() const override { return 3; } + ALWAYS_INLINE size_t size() const override { return 4; } ALWAYS_INLINE StringView name() const { return { reinterpret_cast(argument(0)), length() }; } ALWAYS_INLINE size_t length() const { return argument(1); } + ALWAYS_INLINE size_t id() const { return argument(2); } String const arguments_string() const override { return String::formatted("name={}, length={}", name(), length()); diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 599e68451ac..98c324b359d 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -442,11 +442,20 @@ public: } Match(String const string_, size_t const line_, size_t const column_, size_t const global_offset_) - : string(string_) + : string(move(string_)) , view(string.value().view()) , line(line_) , column(column_) , global_offset(global_offset_) + { + } + + Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_) + : view(view_) + , capture_group_name(capture_group_name_) + , line(line_) + , column(column_) + , global_offset(global_offset_) , left_column(column_) { } @@ -454,6 +463,7 @@ public: void reset() { view = view.typed_null_view(); + capture_group_name.clear(); line = 0; column = 0; global_offset = 0; @@ -461,6 +471,7 @@ public: } RegexStringView view { nullptr }; + Optional capture_group_name {}; size_t line { 0 }; size_t column { 0 }; size_t global_offset { 0 }; @@ -494,8 +505,6 @@ struct MatchState { size_t fork_at_position { 0 }; Vector matches; Vector> capture_group_matches; - Vector> named_capture_group_matches; - size_t recursion_level { 0 }; }; struct MatchOutput { diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index c312ca3c81c..0d41c9af429 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -149,10 +149,7 @@ RegexResult Matcher::match(Vector const& views, Optiona if (c_match_preallocation_count) { state.matches.ensure_capacity(c_match_preallocation_count); state.capture_group_matches.ensure_capacity(c_match_preallocation_count); - state.named_capture_group_matches.ensure_capacity(c_match_preallocation_count); - auto& capture_groups_count = m_pattern->parser_result.capture_groups_count; - auto& named_capture_groups_count = m_pattern->parser_result.named_capture_groups_count; for (size_t j = 0; j < c_match_preallocation_count; ++j) { state.matches.empend(); @@ -160,9 +157,6 @@ RegexResult Matcher::match(Vector const& views, Optiona state.capture_group_matches.at(j).ensure_capacity(capture_groups_count); for (size_t k = 0; k < capture_groups_count; ++k) state.capture_group_matches.at(j).unchecked_append({}); - - state.named_capture_group_matches.unchecked_append({}); - state.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count); } } @@ -315,15 +309,9 @@ RegexResult Matcher::match(Vector const& views, Optiona matches.template remove_all_matching([](auto& match) { return match.view.is_null(); }); } - output_copy.named_capture_group_matches = state.named_capture_group_matches; - // Make sure there are as many capture matches as there are actual matches. - if (output_copy.named_capture_group_matches.size() < match_count) - output_copy.named_capture_group_matches.resize(match_count); - output_copy.matches = state.matches; } else { output_copy.capture_group_matches.clear_with_capacity(); - output_copy.named_capture_group_matches.clear_with_capacity(); } return { @@ -331,7 +319,6 @@ RegexResult Matcher::match(Vector const& views, Optiona match_count, move(output_copy.matches), move(output_copy.capture_group_matches), - move(output_copy.named_capture_group_matches), output.operations, m_pattern->parser_result.capture_groups_count, m_pattern->parser_result.named_capture_groups_count, @@ -399,9 +386,8 @@ private: template Optional Matcher::execute(MatchInput const& input, MatchState& state, MatchOutput& output) const { - state.recursion_level = 0; - BumpAllocatedLinkedList states_to_try_next; + size_t recursion_level = 0; auto& bytecode = m_pattern->parser_result.bytecode; @@ -410,7 +396,7 @@ Optional Matcher::execute(MatchInput const& input, MatchState& sta auto& opcode = bytecode.get_opcode(state); #if REGEX_DEBUG - s_regex_dbg.print_opcode("VM", opcode, state, state.recursion_level, false); + s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false); #endif ExecutionResult result; @@ -435,7 +421,7 @@ Optional Matcher::execute(MatchInput const& input, MatchState& sta case ExecutionResult::Fork_PrioHigh: states_to_try_next.append(state); state.instruction_position = state.fork_at_position; - ++state.recursion_level; + ++recursion_level; continue; case ExecutionResult::Continue: continue; @@ -454,7 +440,7 @@ Optional Matcher::execute(MatchInput const& input, MatchState& sta return false; } state = states_to_try_next.take_last(); - ++state.recursion_level; + ++recursion_level; continue; } } diff --git a/Userland/Libraries/LibRegex/RegexMatcher.h b/Userland/Libraries/LibRegex/RegexMatcher.h index 8e1ce64e6db..4a1e64b8f4c 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.h +++ b/Userland/Libraries/LibRegex/RegexMatcher.h @@ -31,7 +31,6 @@ struct RegexResult final { size_t count { 0 }; Vector matches; Vector> capture_group_matches; - Vector> named_capture_group_matches; size_t n_operations { 0 }; size_t n_capture_groups { 0 }; size_t n_named_capture_groups { 0 }; diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 84768cf121c..db08d836fcc 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -150,7 +150,6 @@ ALWAYS_INLINE void Parser::reset() m_parser_state.capture_group_minimum_lengths.clear(); m_parser_state.capture_groups_count = 0; m_parser_state.named_capture_groups_count = 0; - m_parser_state.named_capture_group_minimum_lengths.clear(); m_parser_state.named_capture_groups.clear(); } @@ -780,12 +779,8 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si } } - if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { - if (capture_group_name.has_value()) - bytecode.insert_bytecode_group_capture_left(capture_group_name.value()); - else - bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count); - } + if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) + bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count); ByteCode capture_group_bytecode; @@ -814,12 +809,12 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { if (capture_group_name.has_value()) { - bytecode.insert_bytecode_group_capture_right(capture_group_name.value()); + bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count, capture_group_name.value()); ++m_parser_state.named_capture_groups_count; } else { bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count); - ++m_parser_state.capture_groups_count; } + ++m_parser_state.capture_groups_count; } should_parse_repetition_symbol = true; break; @@ -1564,14 +1559,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini set_error(Error::InvalidNameForCaptureGroup); return false; } - auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name); - if (!maybe_length.has_value()) { + auto maybe_capture_group = m_parser_state.named_capture_groups.get(name); + if (!maybe_capture_group.has_value()) { set_error(Error::InvalidNameForCaptureGroup); return false; } - match_length_minimum += maybe_length.value(); + match_length_minimum += maybe_capture_group->minimum_length; - stack.insert_bytecode_compare_named_reference(name); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group->group_index } }); return true; } @@ -2121,15 +2116,8 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi m_capture_groups_in_scope.last().empend(identifier); }; auto clear_all_capture_groups_in_scope = [&] { - for (auto& entry : m_capture_groups_in_scope.last()) { - entry.visit( - [&](size_t index) { - stack.insert_bytecode_clear_capture_group(index); - }, - [&](String const& name) { - stack.insert_bytecode_clear_named_capture_group(name); - }); - } + for (auto& index : m_capture_groups_in_scope.last()) + stack.insert_bytecode_clear_capture_group(index); }; if (match(TokenType::Questionmark)) { @@ -2172,21 +2160,18 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi clear_all_capture_groups_in_scope(); exit_capture_group_scope(); - register_capture_group_in_current_scope(name); register_capture_group_in_current_scope(group_index); consume(TokenType::RightParen, Error::MismatchingParen); - stack.insert_bytecode_group_capture_left(name); stack.insert_bytecode_group_capture_left(group_index); stack.extend(move(capture_group_bytecode)); - stack.insert_bytecode_group_capture_right(name); - stack.insert_bytecode_group_capture_right(group_index); + stack.insert_bytecode_group_capture_right(group_index, name); match_length_minimum += length; - m_parser_state.named_capture_group_minimum_lengths.set(name, length); m_parser_state.capture_group_minimum_lengths.set(group_index, length); + m_parser_state.named_capture_groups.set(name, { group_index, length }); return true; } diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 25a96771891..c1f54be69c9 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -88,6 +88,11 @@ protected: ALWAYS_INLINE bool done() const; ALWAYS_INLINE bool set_error(Error error); + struct NamedCaptureGroup { + size_t group_index { 0 }; + size_t minimum_length { 0 }; + }; + struct ParserState { Lexer& lexer; Token current_token; @@ -99,8 +104,7 @@ protected: size_t match_length_minimum { 0 }; AllOptions regex_options; HashMap capture_group_minimum_lengths; - HashMap named_capture_group_minimum_lengths; - HashMap named_capture_groups; + HashMap named_capture_groups; explicit ParserState(Lexer& lexer) : lexer(lexer) @@ -258,8 +262,7 @@ private: // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it, // by requiring that (...)+ only contain the matches for the last iteration. // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed. - using CaptureGroup = Variant; - Vector> m_capture_groups_in_scope; + Vector> m_capture_groups_in_scope; }; using PosixExtended = PosixExtendedParser;