LibRegex: Avoid rewriting a+ as a* as part of atomic rewriting

The initial `ForkStay` is only needed if the looping block has a
following block, if there's no following block or the following block
does not attempt to match anything, we should not insert the ForkStay,
otherwise we would be rewriting `a+` as `a*` by allowing the 'end' to be
executed.
Fixes #10952.
This commit is contained in:
Ali Mohammad Pur 2021-11-18 08:00:29 +03:30 committed by Andreas Kling
parent f10036b7c5
commit 387df06385
Notes: sideshowbarker 2024-07-18 01:01:24 +09:00
2 changed files with 34 additions and 17 deletions

View File

@ -895,6 +895,7 @@ TEST_CASE(optimizer_atomic_groups)
// Fork -> ForkReplace // Fork -> ForkReplace
Tuple { "a*b"sv, "aaaaa"sv, false }, Tuple { "a*b"sv, "aaaaa"sv, false },
Tuple { "a+b"sv, "aaaaa"sv, false }, Tuple { "a+b"sv, "aaaaa"sv, false },
Tuple { "\\\\(\\d+)"sv, "\\\\"sv, false }, // Rewrite bug turning a+ to a*, see #10952.
// Alternative fuse // Alternative fuse
Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true }, Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true },
Tuple { "(a|a)"sv, "a"sv, true }, Tuple { "(a|a)"sv, "a"sv, true },

View File

@ -103,7 +103,12 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks()
return block_boundaries; return block_boundaries;
} }
static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block) enum class AtomicRewritePreconditionResult {
SatisfiedWithProperHeader,
SatisfiedWithEmptyHeader,
NotSatisfied,
};
static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block)
{ {
Vector<Vector<CompareTypeAndValuePair>> repeated_values; Vector<Vector<CompareTypeAndValuePair>> repeated_values;
HashTable<size_t> active_capture_groups; HashTable<size_t> active_capture_groups;
@ -114,21 +119,21 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode
case OpCodeId::Compare: { case OpCodeId::Compare: {
auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares(); auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; })) if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
return false; return AtomicRewritePreconditionResult::NotSatisfied;
repeated_values.append(move(compares)); repeated_values.append(move(compares));
break; break;
} }
case OpCodeId::CheckBegin: case OpCodeId::CheckBegin:
case OpCodeId::CheckEnd: case OpCodeId::CheckEnd:
if (repeated_values.is_empty()) if (repeated_values.is_empty())
return true; return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
break; break;
case OpCodeId::CheckBoundary: case OpCodeId::CheckBoundary:
// FIXME: What should we do with these? for now, let's fail. // FIXME: What should we do with these? for now, let's fail.
return false; return AtomicRewritePreconditionResult::NotSatisfied;
case OpCodeId::Restore: case OpCodeId::Restore:
case OpCodeId::GoBack: case OpCodeId::GoBack:
return false; return AtomicRewritePreconditionResult::NotSatisfied;
case OpCodeId::SaveRightCaptureGroup: case OpCodeId::SaveRightCaptureGroup:
active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id()); active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
break; break;
@ -144,6 +149,7 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode
dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size()); dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());
dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size()); dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size());
bool following_block_has_at_least_one_compare = false;
// Find the first compare in the following block, it must NOT match any of the values in `repeated_values'. // Find the first compare in the following block, it must NOT match any of the values in `repeated_values'.
for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) { for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
auto& opcode = bytecode.get_opcode(state); auto& opcode = bytecode.get_opcode(state);
@ -156,6 +162,7 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode
active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id()); active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
break; break;
case OpCodeId::Compare: { case OpCodeId::Compare: {
following_block_has_at_least_one_compare = true;
// We found a compare, let's see what it has. // We found a compare, let's see what it has.
auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares(); auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
if (compares.is_empty()) if (compares.is_empty())
@ -165,27 +172,27 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode
return compare.type == CharacterCompareType::AnyChar return compare.type == CharacterCompareType::AnyChar
|| (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value)); || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value));
})) }))
return false; return AtomicRewritePreconditionResult::NotSatisfied;
for (auto& repeated_value : repeated_values) { for (auto& repeated_value : repeated_values) {
// FIXME: This is too naive! // FIXME: This is too naive!
if (any_of(repeated_value, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; })) if (any_of(repeated_value, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
return false; return AtomicRewritePreconditionResult::NotSatisfied;
for (auto& repeated_compare : repeated_value) { for (auto& repeated_compare : repeated_value) {
// FIXME: This is too naive! it will miss _tons_ of cases since it doesn't check ranges! // FIXME: This is too naive! it will miss _tons_ of cases since it doesn't check ranges!
if (any_of(compares, [&](auto& compare) { return compare.type == repeated_compare.type && compare.value == repeated_compare.value; })) if (any_of(compares, [&](auto& compare) { return compare.type == repeated_compare.type && compare.value == repeated_compare.value; }))
return false; return AtomicRewritePreconditionResult::NotSatisfied;
} }
} }
return true; return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
} }
case OpCodeId::CheckBegin: case OpCodeId::CheckBegin:
case OpCodeId::CheckEnd: case OpCodeId::CheckEnd:
return true; // Nothing can match the end! return AtomicRewritePreconditionResult::SatisfiedWithProperHeader; // Nothing can match the end!
case OpCodeId::CheckBoundary: case OpCodeId::CheckBoundary:
// FIXME: What should we do with these? For now, consider them a failure. // FIXME: What should we do with these? For now, consider them a failure.
return false; return AtomicRewritePreconditionResult::NotSatisfied;
default: default:
break; break;
} }
@ -193,7 +200,9 @@ static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode
state.instruction_position += opcode.size(); state.instruction_position += opcode.size();
} }
return true; if (following_block_has_at_least_one_compare)
return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
} }
template<typename Parser> template<typename Parser>
@ -213,7 +222,7 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
// ------------------------- // -------------------------
// bb1 | RE1 // bb1 | RE1
// can be rewritten as: // can be rewritten as:
// loop.hdr | ForkStay bb1 // loop.hdr | ForkStay bb1 (if RE1 matches _something_, empty otherwise)
// ------------------------- // -------------------------
// bb0 | RE0 // bb0 | RE0
// | ForkReplaceX bb0 // | ForkReplaceX bb0
@ -240,8 +249,9 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
// bb2 | RE1 // bb2 | RE1
enum class AlternateForm { enum class AlternateForm {
DirectLoopWithoutHeader, // loop without proper header, a block forking to itself. i.e. the first form. DirectLoopWithoutHeader, // loop without proper header, a block forking to itself. i.e. the first form.
DirectLoopWithHeader, // loop with proper header, i.e. the second form. DirectLoopWithoutHeaderAndEmptyFollow, // loop without proper header, a block forking to itself. i.e. the first form but with RE1 being empty.
DirectLoopWithHeader, // loop with proper header, i.e. the second form.
}; };
struct CandidateBlock { struct CandidateBlock {
Block forking_block; Block forking_block;
@ -298,10 +308,15 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
break; break;
} }
if (block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block)) { auto precondition = block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block);
if (precondition == AtomicRewritePreconditionResult::SatisfiedWithProperHeader) {
candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader }); candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
break; break;
} }
if (precondition == AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader) {
candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeaderAndEmptyFollow });
break;
}
} }
} }
// Check if the last instruction in the last block is a direct jump to this block // Check if the last instruction in the last block is a direct jump to this block
@ -316,7 +331,8 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
Optional<Block> block_following_fork_fallback; Optional<Block> block_following_fork_fallback;
if (i + 2 < basic_blocks.size()) if (i + 2 < basic_blocks.size())
block_following_fork_fallback = basic_blocks[i + 2]; block_following_fork_fallback = basic_blocks[i + 2];
if (!block_following_fork_fallback.has_value() || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback)) { if (!block_following_fork_fallback.has_value()
|| block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback) != AtomicRewritePreconditionResult::NotSatisfied) {
candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader }); candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader });
break; break;
} }