mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-12-29 14:14:45 +03:00
LibRegex+LibJS: Combine named and unnamed capture groups in MatchState
Combining these into one list helps reduce the size of MatchState, and as a result, reduces the amount of memory consumed during execution of very large regex matches. Doing this also allows us to remove a few regex byte code instructions: ClearNamedCaptureGroup, SaveLeftNamedCaptureGroup, and NamedReference. Named groups now behave the same as unnamed groups for these operations. Note that SaveRightNamedCaptureGroup still exists to cache the matched group name. This also removes the recursion level from the MatchState, as it can exist as a local variable in Matcher::execute instead.
This commit is contained in:
parent
fea181bde3
commit
f1ce998d73
Notes:
sideshowbarker
2024-07-18 05:40:26 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/f1ce998d739 Pull-request: https://github.com/SerenityOS/serenity/pull/9367 Reviewed-by: https://github.com/linusg ✅
@ -422,9 +422,11 @@ TEST_CASE(named_capture_group)
|
||||
EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
|
||||
EXPECT_EQ(result.count, 2u);
|
||||
EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
|
||||
EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255");
|
||||
EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
|
||||
EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "Test");
|
||||
EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
|
||||
EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0");
|
||||
EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
|
||||
EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "Test");
|
||||
}
|
||||
|
||||
TEST_CASE(ecma262_named_capture_group_with_dollar_sign)
|
||||
@ -443,9 +445,11 @@ TEST_CASE(ecma262_named_capture_group_with_dollar_sign)
|
||||
EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true);
|
||||
EXPECT_EQ(result.count, 2u);
|
||||
EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
|
||||
EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("$Test$").view, "255");
|
||||
EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
|
||||
EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "$Test$");
|
||||
EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
|
||||
EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("$Test$").view, "0");
|
||||
EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
|
||||
EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "$Test$");
|
||||
}
|
||||
|
||||
TEST_CASE(a_star)
|
||||
|
@ -263,13 +263,16 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
|
||||
return {};
|
||||
}
|
||||
|
||||
auto* array = Array::create(global_object, result.n_capture_groups + 1);
|
||||
auto* array = Array::create(global_object, result.n_named_capture_groups + 1);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
|
||||
Vector<Optional<Match>> indices { Match::create(match) };
|
||||
HashMap<String, Match> group_names;
|
||||
|
||||
bool has_groups = result.n_named_capture_groups != 0;
|
||||
Object* groups_object = has_groups ? Object::create(global_object, nullptr) : nullptr;
|
||||
|
||||
for (size_t i = 0; i < result.n_capture_groups; ++i) {
|
||||
auto capture_value = js_undefined();
|
||||
auto& capture = result.capture_group_matches[0][i + 1];
|
||||
@ -280,22 +283,15 @@ static Value regexp_builtin_exec(GlobalObject& global_object, RegExpObject& rege
|
||||
indices.append(Match::create(capture));
|
||||
}
|
||||
array->create_data_property_or_throw(i + 1, capture_value);
|
||||
}
|
||||
|
||||
bool has_groups = result.n_named_capture_groups > 0;
|
||||
Value groups = js_undefined();
|
||||
|
||||
if (has_groups) {
|
||||
auto groups_object = Object::create(global_object, nullptr);
|
||||
|
||||
for (auto& entry : result.named_capture_group_matches[0]) {
|
||||
groups_object->create_data_property_or_throw(entry.key, js_string(vm, entry.value.view.u16_view()));
|
||||
group_names.set(entry.key, Match::create(entry.value));
|
||||
if (capture.capture_group_name.has_value()) {
|
||||
auto group_name = capture.capture_group_name->to_string();
|
||||
groups_object->create_data_property_or_throw(group_name, js_string(vm, capture.view.u16_view()));
|
||||
group_names.set(move(group_name), Match::create(capture));
|
||||
}
|
||||
|
||||
groups = move(groups_object);
|
||||
}
|
||||
|
||||
Value groups = has_groups ? groups_object : js_undefined();
|
||||
array->create_data_property_or_throw(vm.names.groups, groups);
|
||||
|
||||
if (has_indices) {
|
||||
|
@ -166,18 +166,12 @@ void ByteCode::ensure_opcodes_initialized()
|
||||
case OpCodeId::ClearCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_ClearCaptureGroup>();
|
||||
break;
|
||||
case OpCodeId::ClearNamedCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>();
|
||||
break;
|
||||
case OpCodeId::SaveLeftCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>();
|
||||
break;
|
||||
case OpCodeId::SaveRightCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_SaveRightCaptureGroup>();
|
||||
break;
|
||||
case OpCodeId::SaveLeftNamedCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_SaveLeftNamedCaptureGroup>();
|
||||
break;
|
||||
case OpCodeId::SaveRightNamedCaptureGroup:
|
||||
s_opcodes[i] = make<OpCode_SaveRightNamedCaptureGroup>();
|
||||
break;
|
||||
@ -378,52 +372,26 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput c
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (input.match_index < state.capture_group_matches.size()) {
|
||||
auto& group = state.named_capture_group_matches[input.match_index];
|
||||
if (auto it = group.find(name()); it != group.end())
|
||||
it->value.reset();
|
||||
}
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (input.match_index >= state.named_capture_group_matches.size()) {
|
||||
state.named_capture_group_matches.ensure_capacity(input.match_index);
|
||||
auto capacity = state.named_capture_group_matches.capacity();
|
||||
for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i)
|
||||
state.named_capture_group_matches.empend();
|
||||
}
|
||||
state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
StringView capture_group_name = name();
|
||||
auto& match = state.capture_group_matches.at(input.match_index).at(id());
|
||||
auto start_position = match.left_column;
|
||||
if (state.string_position < start_position)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
|
||||
auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
|
||||
auto length = state.string_position - start_position;
|
||||
auto length = state.string_position - start_position;
|
||||
|
||||
auto& map = state.named_capture_group_matches.at(input.match_index);
|
||||
if (start_position < match.column)
|
||||
return ExecutionResult::Continue;
|
||||
|
||||
if constexpr (REGEX_DEBUG) {
|
||||
VERIFY(start_position + length <= input.view.length());
|
||||
dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length));
|
||||
}
|
||||
VERIFY(start_position + length <= input.view.length());
|
||||
|
||||
VERIFY(start_position + length <= input.view.length());
|
||||
auto view = input.view.substring_view(start_position, length);
|
||||
if (input.regex_options & AllFlags::StringCopyMatches) {
|
||||
map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
|
||||
} else {
|
||||
map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
|
||||
}
|
||||
auto view = input.view.substring_view(start_position, length);
|
||||
|
||||
if (input.regex_options & AllFlags::StringCopyMatches) {
|
||||
match = { view.to_string(), name(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
|
||||
} else {
|
||||
warnln("Didn't find corresponding capture group match for name={}, match_index={}", capture_group_name.to_string(), input.match_index);
|
||||
match = { view, name(), input.line, start_position, input.global_offset + start_position }; // take view to original string
|
||||
}
|
||||
|
||||
return ExecutionResult::Continue;
|
||||
@ -543,24 +511,6 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
if (!compare_string(input, state, str, had_zero_length_match))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
auto ptr = (char const*)m_bytecode->at(offset++);
|
||||
auto length = (size_t)m_bytecode->at(offset++);
|
||||
StringView name { ptr, length };
|
||||
|
||||
auto group = state.named_capture_group_matches.at(input.match_index).get(name);
|
||||
if (!group.has_value())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto str = group.value().view;
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() < state.string_position + str.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str, had_zero_length_match))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::Property) {
|
||||
auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
|
||||
compare_property(input, state, property, current_inversion_state(), inverse_matched);
|
||||
@ -869,10 +819,6 @@ Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<Match
|
||||
buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
|
||||
}
|
||||
}
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
auto ptr = (char const*)m_bytecode->at(offset++);
|
||||
auto length = m_bytecode->at(offset++);
|
||||
result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length }));
|
||||
} else if (compare_type == CharacterCompareType::Reference) {
|
||||
auto ref = m_bytecode->at(offset++);
|
||||
result.empend(String::formatted("number={}", ref));
|
||||
|
@ -32,7 +32,6 @@ using ByteCodeValueType = u64;
|
||||
__ENUMERATE_OPCODE(FailForks) \
|
||||
__ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveRightCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(CheckBegin) \
|
||||
__ENUMERATE_OPCODE(CheckEnd) \
|
||||
@ -41,7 +40,6 @@ using ByteCodeValueType = u64;
|
||||
__ENUMERATE_OPCODE(Restore) \
|
||||
__ENUMERATE_OPCODE(GoBack) \
|
||||
__ENUMERATE_OPCODE(ClearCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(ClearNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(Exit)
|
||||
|
||||
// clang-format off
|
||||
@ -65,7 +63,6 @@ enum class OpCodeId : ByteCodeValueType {
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
|
||||
@ -159,7 +156,6 @@ public:
|
||||
VERIFY(value.type != CharacterCompareType::RangeExpressionDummy);
|
||||
VERIFY(value.type != CharacterCompareType::Undefined);
|
||||
VERIFY(value.type != CharacterCompareType::String);
|
||||
VERIFY(value.type != CharacterCompareType::NamedReference);
|
||||
|
||||
arguments.append((ByteCodeValueType)value.type);
|
||||
if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
|
||||
@ -187,13 +183,6 @@ public:
|
||||
empend(index);
|
||||
}
|
||||
|
||||
void insert_bytecode_clear_named_capture_group(StringView name)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::ClearNamedCaptureGroup));
|
||||
empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
|
||||
empend(name.length());
|
||||
}
|
||||
|
||||
void insert_bytecode_compare_string(StringView view)
|
||||
{
|
||||
ByteCode bytecode;
|
||||
@ -212,49 +201,24 @@ public:
|
||||
extend(move(bytecode));
|
||||
}
|
||||
|
||||
void insert_bytecode_compare_named_reference(StringView name)
|
||||
{
|
||||
ByteCode bytecode;
|
||||
|
||||
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
|
||||
bytecode.empend(static_cast<u64>(1)); // number of arguments
|
||||
|
||||
ByteCode arguments;
|
||||
|
||||
arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::NamedReference));
|
||||
arguments.empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
|
||||
arguments.empend(name.length());
|
||||
|
||||
bytecode.empend(arguments.size()); // size of arguments
|
||||
bytecode.extend(move(arguments));
|
||||
|
||||
extend(move(bytecode));
|
||||
}
|
||||
|
||||
void insert_bytecode_group_capture_left(size_t capture_groups_count)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
|
||||
empend(capture_groups_count);
|
||||
}
|
||||
|
||||
void insert_bytecode_group_capture_left(StringView const& name)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup));
|
||||
empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
|
||||
empend(name.length());
|
||||
}
|
||||
|
||||
void insert_bytecode_group_capture_right(size_t capture_groups_count)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
|
||||
empend(capture_groups_count);
|
||||
}
|
||||
|
||||
void insert_bytecode_group_capture_right(StringView const& name)
|
||||
void insert_bytecode_group_capture_right(size_t capture_groups_count, StringView const& name)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
|
||||
empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
|
||||
empend(name.length());
|
||||
empend(capture_groups_count);
|
||||
}
|
||||
|
||||
enum class LookAroundType {
|
||||
@ -655,19 +619,6 @@ public:
|
||||
String const arguments_string() const override { return String::formatted("id={}", id()); }
|
||||
};
|
||||
|
||||
class OpCode_ClearNamedCaptureGroup final : public OpCode {
|
||||
public:
|
||||
ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearNamedCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 3; }
|
||||
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
|
||||
ALWAYS_INLINE size_t length() const { return argument(1); }
|
||||
String const arguments_string() const override
|
||||
{
|
||||
return String::formatted("name={}, length={}", name(), length());
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_SaveLeftCaptureGroup final : public OpCode {
|
||||
public:
|
||||
ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override;
|
||||
@ -686,26 +637,14 @@ public:
|
||||
String const arguments_string() const override { return String::formatted("id={}", id()); }
|
||||
};
|
||||
|
||||
class OpCode_SaveLeftNamedCaptureGroup final : public OpCode {
|
||||
public:
|
||||
ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 3; }
|
||||
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
|
||||
ALWAYS_INLINE size_t length() const { return argument(1); }
|
||||
String const arguments_string() const override
|
||||
{
|
||||
return String::formatted("name={}, length={}", name(), length());
|
||||
}
|
||||
};
|
||||
|
||||
class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
|
||||
public:
|
||||
ExecutionResult execute(MatchInput const& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
|
||||
ALWAYS_INLINE size_t size() const override { return 3; }
|
||||
ALWAYS_INLINE size_t size() const override { return 4; }
|
||||
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
|
||||
ALWAYS_INLINE size_t length() const { return argument(1); }
|
||||
ALWAYS_INLINE size_t id() const { return argument(2); }
|
||||
String const arguments_string() const override
|
||||
{
|
||||
return String::formatted("name={}, length={}", name(), length());
|
||||
|
@ -442,11 +442,20 @@ public:
|
||||
}
|
||||
|
||||
Match(String const string_, size_t const line_, size_t const column_, size_t const global_offset_)
|
||||
: string(string_)
|
||||
: string(move(string_))
|
||||
, view(string.value().view())
|
||||
, line(line_)
|
||||
, column(column_)
|
||||
, global_offset(global_offset_)
|
||||
{
|
||||
}
|
||||
|
||||
Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
|
||||
: view(view_)
|
||||
, capture_group_name(capture_group_name_)
|
||||
, line(line_)
|
||||
, column(column_)
|
||||
, global_offset(global_offset_)
|
||||
, left_column(column_)
|
||||
{
|
||||
}
|
||||
@ -454,6 +463,7 @@ public:
|
||||
void reset()
|
||||
{
|
||||
view = view.typed_null_view();
|
||||
capture_group_name.clear();
|
||||
line = 0;
|
||||
column = 0;
|
||||
global_offset = 0;
|
||||
@ -461,6 +471,7 @@ public:
|
||||
}
|
||||
|
||||
RegexStringView view { nullptr };
|
||||
Optional<StringView> capture_group_name {};
|
||||
size_t line { 0 };
|
||||
size_t column { 0 };
|
||||
size_t global_offset { 0 };
|
||||
@ -494,8 +505,6 @@ struct MatchState {
|
||||
size_t fork_at_position { 0 };
|
||||
Vector<Match> matches;
|
||||
Vector<Vector<Match>> capture_group_matches;
|
||||
Vector<HashMap<String, Match>> named_capture_group_matches;
|
||||
size_t recursion_level { 0 };
|
||||
};
|
||||
|
||||
struct MatchOutput {
|
||||
|
@ -149,10 +149,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
||||
if (c_match_preallocation_count) {
|
||||
state.matches.ensure_capacity(c_match_preallocation_count);
|
||||
state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
state.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
|
||||
auto& capture_groups_count = m_pattern->parser_result.capture_groups_count;
|
||||
auto& named_capture_groups_count = m_pattern->parser_result.named_capture_groups_count;
|
||||
|
||||
for (size_t j = 0; j < c_match_preallocation_count; ++j) {
|
||||
state.matches.empend();
|
||||
@ -160,9 +157,6 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
||||
state.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
|
||||
for (size_t k = 0; k < capture_groups_count; ++k)
|
||||
state.capture_group_matches.at(j).unchecked_append({});
|
||||
|
||||
state.named_capture_group_matches.unchecked_append({});
|
||||
state.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
|
||||
}
|
||||
}
|
||||
|
||||
@ -315,15 +309,9 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
||||
matches.template remove_all_matching([](auto& match) { return match.view.is_null(); });
|
||||
}
|
||||
|
||||
output_copy.named_capture_group_matches = state.named_capture_group_matches;
|
||||
// Make sure there are as many capture matches as there are actual matches.
|
||||
if (output_copy.named_capture_group_matches.size() < match_count)
|
||||
output_copy.named_capture_group_matches.resize(match_count);
|
||||
|
||||
output_copy.matches = state.matches;
|
||||
} else {
|
||||
output_copy.capture_group_matches.clear_with_capacity();
|
||||
output_copy.named_capture_group_matches.clear_with_capacity();
|
||||
}
|
||||
|
||||
return {
|
||||
@ -331,7 +319,6 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
||||
match_count,
|
||||
move(output_copy.matches),
|
||||
move(output_copy.capture_group_matches),
|
||||
move(output_copy.named_capture_group_matches),
|
||||
output.operations,
|
||||
m_pattern->parser_result.capture_groups_count,
|
||||
m_pattern->parser_result.named_capture_groups_count,
|
||||
@ -399,9 +386,8 @@ private:
|
||||
template<class Parser>
|
||||
Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& state, MatchOutput& output) const
|
||||
{
|
||||
state.recursion_level = 0;
|
||||
|
||||
BumpAllocatedLinkedList<MatchState> states_to_try_next;
|
||||
size_t recursion_level = 0;
|
||||
|
||||
auto& bytecode = m_pattern->parser_result.bytecode;
|
||||
|
||||
@ -410,7 +396,7 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
|
||||
auto& opcode = bytecode.get_opcode(state);
|
||||
|
||||
#if REGEX_DEBUG
|
||||
s_regex_dbg.print_opcode("VM", opcode, state, state.recursion_level, false);
|
||||
s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false);
|
||||
#endif
|
||||
|
||||
ExecutionResult result;
|
||||
@ -435,7 +421,7 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
|
||||
case ExecutionResult::Fork_PrioHigh:
|
||||
states_to_try_next.append(state);
|
||||
state.instruction_position = state.fork_at_position;
|
||||
++state.recursion_level;
|
||||
++recursion_level;
|
||||
continue;
|
||||
case ExecutionResult::Continue:
|
||||
continue;
|
||||
@ -454,7 +440,7 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
|
||||
return false;
|
||||
}
|
||||
state = states_to_try_next.take_last();
|
||||
++state.recursion_level;
|
||||
++recursion_level;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -31,7 +31,6 @@ struct RegexResult final {
|
||||
size_t count { 0 };
|
||||
Vector<Match> matches;
|
||||
Vector<Vector<Match>> capture_group_matches;
|
||||
Vector<HashMap<String, Match>> named_capture_group_matches;
|
||||
size_t n_operations { 0 };
|
||||
size_t n_capture_groups { 0 };
|
||||
size_t n_named_capture_groups { 0 };
|
||||
|
@ -150,7 +150,6 @@ ALWAYS_INLINE void Parser::reset()
|
||||
m_parser_state.capture_group_minimum_lengths.clear();
|
||||
m_parser_state.capture_groups_count = 0;
|
||||
m_parser_state.named_capture_groups_count = 0;
|
||||
m_parser_state.named_capture_group_minimum_lengths.clear();
|
||||
m_parser_state.named_capture_groups.clear();
|
||||
}
|
||||
|
||||
@ -780,12 +779,8 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si
|
||||
}
|
||||
}
|
||||
|
||||
if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
|
||||
if (capture_group_name.has_value())
|
||||
bytecode.insert_bytecode_group_capture_left(capture_group_name.value());
|
||||
else
|
||||
bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
|
||||
}
|
||||
if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group))
|
||||
bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
|
||||
|
||||
ByteCode capture_group_bytecode;
|
||||
|
||||
@ -814,12 +809,12 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si
|
||||
|
||||
if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
|
||||
if (capture_group_name.has_value()) {
|
||||
bytecode.insert_bytecode_group_capture_right(capture_group_name.value());
|
||||
bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count, capture_group_name.value());
|
||||
++m_parser_state.named_capture_groups_count;
|
||||
} else {
|
||||
bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
|
||||
++m_parser_state.capture_groups_count;
|
||||
}
|
||||
++m_parser_state.capture_groups_count;
|
||||
}
|
||||
should_parse_repetition_symbol = true;
|
||||
break;
|
||||
@ -1564,14 +1559,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return false;
|
||||
}
|
||||
auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
|
||||
if (!maybe_length.has_value()) {
|
||||
auto maybe_capture_group = m_parser_state.named_capture_groups.get(name);
|
||||
if (!maybe_capture_group.has_value()) {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return false;
|
||||
}
|
||||
match_length_minimum += maybe_length.value();
|
||||
match_length_minimum += maybe_capture_group->minimum_length;
|
||||
|
||||
stack.insert_bytecode_compare_named_reference(name);
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group->group_index } });
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2121,15 +2116,8 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
||||
m_capture_groups_in_scope.last().empend(identifier);
|
||||
};
|
||||
auto clear_all_capture_groups_in_scope = [&] {
|
||||
for (auto& entry : m_capture_groups_in_scope.last()) {
|
||||
entry.visit(
|
||||
[&](size_t index) {
|
||||
stack.insert_bytecode_clear_capture_group(index);
|
||||
},
|
||||
[&](String const& name) {
|
||||
stack.insert_bytecode_clear_named_capture_group(name);
|
||||
});
|
||||
}
|
||||
for (auto& index : m_capture_groups_in_scope.last())
|
||||
stack.insert_bytecode_clear_capture_group(index);
|
||||
};
|
||||
|
||||
if (match(TokenType::Questionmark)) {
|
||||
@ -2172,21 +2160,18 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
||||
clear_all_capture_groups_in_scope();
|
||||
exit_capture_group_scope();
|
||||
|
||||
register_capture_group_in_current_scope(name);
|
||||
register_capture_group_in_current_scope(group_index);
|
||||
|
||||
consume(TokenType::RightParen, Error::MismatchingParen);
|
||||
|
||||
stack.insert_bytecode_group_capture_left(name);
|
||||
stack.insert_bytecode_group_capture_left(group_index);
|
||||
stack.extend(move(capture_group_bytecode));
|
||||
stack.insert_bytecode_group_capture_right(name);
|
||||
stack.insert_bytecode_group_capture_right(group_index);
|
||||
stack.insert_bytecode_group_capture_right(group_index, name);
|
||||
|
||||
match_length_minimum += length;
|
||||
|
||||
m_parser_state.named_capture_group_minimum_lengths.set(name, length);
|
||||
m_parser_state.capture_group_minimum_lengths.set(group_index, length);
|
||||
m_parser_state.named_capture_groups.set(name, { group_index, length });
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -88,6 +88,11 @@ protected:
|
||||
ALWAYS_INLINE bool done() const;
|
||||
ALWAYS_INLINE bool set_error(Error error);
|
||||
|
||||
struct NamedCaptureGroup {
|
||||
size_t group_index { 0 };
|
||||
size_t minimum_length { 0 };
|
||||
};
|
||||
|
||||
struct ParserState {
|
||||
Lexer& lexer;
|
||||
Token current_token;
|
||||
@ -99,8 +104,7 @@ protected:
|
||||
size_t match_length_minimum { 0 };
|
||||
AllOptions regex_options;
|
||||
HashMap<int, size_t> capture_group_minimum_lengths;
|
||||
HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
|
||||
HashMap<size_t, FlyString> named_capture_groups;
|
||||
HashMap<FlyString, NamedCaptureGroup> named_capture_groups;
|
||||
|
||||
explicit ParserState(Lexer& lexer)
|
||||
: lexer(lexer)
|
||||
@ -258,8 +262,7 @@ private:
|
||||
// ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
|
||||
// by requiring that (...)+ only contain the matches for the last iteration.
|
||||
// To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
|
||||
using CaptureGroup = Variant<size_t, String>;
|
||||
Vector<Vector<CaptureGroup>> m_capture_groups_in_scope;
|
||||
Vector<Vector<size_t>> m_capture_groups_in_scope;
|
||||
};
|
||||
|
||||
using PosixExtended = PosixExtendedParser;
|
||||
|
Loading…
Reference in New Issue
Block a user