LibRegex: Make Fork{Jump,Stay} non-recursive

This makes very fork-heavy expressions (like `(aa)*`) not run out of
stack space when matching very long strings.
This commit is contained in:
Ali Mohammad Pur 2021-07-31 16:03:45 +04:30 committed by Ali Mohammad Pur
parent a08870cc19
commit 5f342e4fa9
Notes: sideshowbarker 2024-07-18 07:35:17 +09:00
3 changed files with 29 additions and 52 deletions

View File

@ -469,6 +469,7 @@ struct MatchState {
Vector<Match> matches; Vector<Match> matches;
Vector<Vector<Match>> capture_group_matches; Vector<Vector<Match>> capture_group_matches;
Vector<HashMap<String, Match>> named_capture_group_matches; Vector<HashMap<String, Match>> named_capture_group_matches;
size_t recursion_level { 0 };
}; };
struct MatchOutput { struct MatchOutput {

View File

@ -7,11 +7,12 @@
#include <AK/Debug.h> #include <AK/Debug.h>
#include <AK/String.h> #include <AK/String.h>
#include <AK/StringBuilder.h> #include <AK/StringBuilder.h>
#include <LibRegex/RegexMatcher.h>
#include <LibRegex/RegexParser.h>
#if REGEX_DEBUG #if REGEX_DEBUG
# include <LibRegex/RegexDebug.h> # include <LibRegex/RegexDebug.h>
#endif #endif
#include <LibRegex/RegexMatcher.h>
#include <LibRegex/RegexParser.h>
namespace regex { namespace regex {
@ -210,10 +211,10 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional
state.string_position = view_index; state.string_position = view_index;
state.instruction_position = 0; state.instruction_position = 0;
auto success = execute(input, state, temp_output, 0); auto success = execute(input, state, temp_output);
// This success is acceptable only if it doesn't read anything from the input (input length is 0). // This success is acceptable only if it doesn't read anything from the input (input length is 0).
if (state.string_position <= view_index) { if (state.string_position <= view_index) {
if (success.value()) { if (success.has_value() && success.value()) {
output = move(temp_output); output = move(temp_output);
if (!match_count) { if (!match_count) {
// Nothing was *actually* matched, so append an empty match. // Nothing was *actually* matched, so append an empty match.
@ -241,7 +242,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional
state.string_position = view_index; state.string_position = view_index;
state.instruction_position = 0; state.instruction_position = 0;
auto success = execute(input, state, output, 0); auto success = execute(input, state, output);
if (!success.has_value()) if (!success.has_value())
return { false, 0, {}, {}, {}, output.operations }; return { false, 0, {}, {}, {}, output.operations };
@ -334,14 +335,11 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional
} }
template<class Parser> template<class Parser>
Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& state, MatchOutput& output, size_t recursion_level) const Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& state, MatchOutput& output) const
{ {
if (recursion_level > c_max_recursion) state.recursion_level = 0;
return false;
Vector<MatchState, 64> reversed_fork_low_prio_states; Vector<MatchState, 64> states_to_try_next;
MatchState fork_high_prio_state;
Optional<bool> success;
auto& bytecode = m_pattern->parser_result.bytecode; auto& bytecode = m_pattern->parser_result.bytecode;
@ -350,7 +348,7 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
auto& opcode = bytecode.get_opcode(state); auto& opcode = bytecode.get_opcode(state);
#if REGEX_DEBUG #if REGEX_DEBUG
s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false); s_regex_dbg.print_opcode("VM", opcode, state, state.recursion_level, false);
#endif #endif
ExecutionResult result; ExecutionResult result;
@ -369,33 +367,33 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
switch (result) { switch (result) {
case ExecutionResult::Fork_PrioLow: case ExecutionResult::Fork_PrioLow:
reversed_fork_low_prio_states.append(state); states_to_try_next.append(state);
states_to_try_next.last().instruction_position = state.fork_at_position;
continue; continue;
case ExecutionResult::Fork_PrioHigh: case ExecutionResult::Fork_PrioHigh:
fork_high_prio_state = state; states_to_try_next.append(state);
fork_high_prio_state.instruction_position = fork_high_prio_state.fork_at_position; state.instruction_position = state.fork_at_position;
success = execute(input, fork_high_prio_state, output, ++recursion_level); ++state.recursion_level;
if (!success.has_value())
return {};
if (success.value()) {
state = fork_high_prio_state;
return true;
}
continue; continue;
case ExecutionResult::Continue: case ExecutionResult::Continue:
continue; continue;
case ExecutionResult::Succeeded: case ExecutionResult::Succeeded:
return true; return true;
case ExecutionResult::Failed: case ExecutionResult::Failed:
if (!states_to_try_next.is_empty()) {
state = states_to_try_next.take_last();
continue;
}
return false; return false;
case ExecutionResult::Failed_ExecuteLowPrioForks: { case ExecutionResult::Failed_ExecuteLowPrioForks: {
Vector<MatchState> fork_low_prio_states; if (states_to_try_next.is_empty()) {
fork_low_prio_states.ensure_capacity(reversed_fork_low_prio_states.size()); if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful))
for (ssize_t i = reversed_fork_low_prio_states.size() - 1; i >= 0; i--) return {};
fork_low_prio_states.unchecked_append(move(reversed_fork_low_prio_states[i])); return false;
return execute_low_prio_forks(input, state, output, move(fork_low_prio_states), recursion_level + 1); }
state = states_to_try_next.take_last();
++state.recursion_level;
continue;
} }
} }
} }
@ -403,27 +401,6 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
VERIFY_NOT_REACHED(); VERIFY_NOT_REACHED();
} }
template<class Parser>
ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(MatchInput const& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const
{
for (auto& state : states) {
state.instruction_position = state.fork_at_position;
dbgln_if(REGEX_DEBUG, "Forkstay... ip = {}, sp = {}", state.instruction_position, state.string_position);
auto success = execute(input, state, output, recursion_level);
if (!success.has_value())
return {};
if (success.value()) {
dbgln_if(REGEX_DEBUG, "Forkstay succeeded... ip = {}, sp = {}", state.instruction_position, state.string_position);
original_state = state;
return true;
}
}
original_state.string_position = 0;
return false;
}
template class Matcher<PosixBasicParser>; template class Matcher<PosixBasicParser>;
template class Regex<PosixBasicParser>; template class Regex<PosixBasicParser>;

View File

@ -65,8 +65,7 @@ public:
} }
private: private:
Optional<bool> execute(MatchInput const& input, MatchState& state, MatchOutput& output, size_t recursion_level) const; Optional<bool> execute(MatchInput const& input, MatchState& state, MatchOutput& output) const;
ALWAYS_INLINE Optional<bool> execute_low_prio_forks(MatchInput const& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const;
Regex<Parser> const* m_pattern; Regex<Parser> const* m_pattern;
typename ParserTraits<Parser>::OptionsType const m_regex_options; typename ParserTraits<Parser>::OptionsType const m_regex_options;