mirror of
https://github.com/mawww/kakoune.git
synced 2024-12-21 18:41:29 +03:00
Regex: Support forward and backward matching code in the same CompiledRegex
There is no need to have two separate regexes to handle forward and backward matching: passing RegexCompileFlags::Backward adds support for backward matching to the regex. For a backward-only regex, pass RegexCompileFlags::NoForward as well to disable generation of the forward matching code.
This commit is contained in:
parent
e9e3dc862c
commit
413f880e9e
@ -693,6 +693,12 @@ void paste_all(Context& context, NormalParams params)
|
||||
selections = std::move(result);
|
||||
}
|
||||
|
||||
// Translate a match direction into regex compile flags: a forward search
// needs no extra flag, any other direction asks for backward matching code
// and opts out of generating the forward matching code.
constexpr RegexCompileFlags direction_flags(MatchDirection direction)
{
    if (direction != MatchDirection::Forward)
        return RegexCompileFlags::Backward | RegexCompileFlags::NoForward;
    return RegexCompileFlags::None;
}
|
||||
|
||||
template<MatchDirection direction = MatchDirection::Forward, typename T>
|
||||
void regex_prompt(Context& context, String prompt, String default_regex, T func)
|
||||
{
|
||||
@ -725,7 +731,7 @@ void regex_prompt(Context& context, String prompt, String default_regex, T func)
|
||||
context.push_jump();
|
||||
|
||||
if (not str.empty() or event == PromptEvent::Validate)
|
||||
func(Regex{str.empty() ? default_regex : str, RegexCompileFlags::None, direction}, event, context);
|
||||
func(Regex{str.empty() ? default_regex : str, direction_flags(direction)}, event, context);
|
||||
}
|
||||
catch (regex_error& err)
|
||||
{
|
||||
@ -795,7 +801,7 @@ void search_next(Context& context, NormalParams params)
|
||||
StringView str = context.main_sel_register_value(reg);
|
||||
if (not str.empty())
|
||||
{
|
||||
Regex regex{str, RegexCompileFlags::None, direction};
|
||||
Regex regex{str, direction_flags(direction)};
|
||||
auto& selections = context.selections();
|
||||
bool main_wrapped = false;
|
||||
do {
|
||||
|
@ -3,8 +3,8 @@
|
||||
namespace Kakoune
|
||||
{
|
||||
|
||||
Regex::Regex(StringView re, RegexCompileFlags flags, MatchDirection direction)
|
||||
: m_impl{new CompiledRegex{compile_regex(re, flags, direction)}},
|
||||
Regex::Regex(StringView re, RegexCompileFlags flags)
|
||||
: m_impl{new CompiledRegex{compile_regex(re, flags)}},
|
||||
m_str{re.str()}
|
||||
{}
|
||||
|
||||
|
@ -13,8 +13,7 @@ class Regex
|
||||
public:
|
||||
Regex() = default;
|
||||
|
||||
explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None,
|
||||
MatchDirection direction = MatchDirection::Forward);
|
||||
explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None);
|
||||
bool empty() const { return m_str.empty(); }
|
||||
bool operator==(const Regex& other) const { return m_str == other.m_str; }
|
||||
bool operator!=(const Regex& other) const { return m_str != other.m_str; }
|
||||
|
@ -618,26 +618,43 @@ constexpr RegexParser::ControlEscape RegexParser::control_escapes[];
|
||||
|
||||
struct RegexCompiler
|
||||
{
|
||||
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags, MatchDirection direction)
|
||||
: m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward}
|
||||
RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags)
|
||||
: m_parsed_regex{parsed_regex}, m_flags(flags)
|
||||
{
|
||||
kak_assert(not (flags & RegexCompileFlags::NoForward) or flags & RegexCompileFlags::Backward);
|
||||
// Approximation of the number of instructions generated
|
||||
m_program.instructions.reserve(CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1);
|
||||
m_program.start_desc = compute_start_desc();
|
||||
m_program.instructions.reserve((CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1)
|
||||
* (((flags & RegexCompileFlags::Backward) and
|
||||
not (flags & RegexCompileFlags::NoForward)) ? 2 : 1));
|
||||
|
||||
if (not (flags & RegexCompileFlags::NoForward))
|
||||
{
|
||||
m_program.forward_start_desc = compute_start_desc(true);
|
||||
write_search_prefix();
|
||||
compile_node(0);
|
||||
compile_node(0, true);
|
||||
push_inst(CompiledRegex::Match);
|
||||
}
|
||||
|
||||
if (flags & RegexCompileFlags::Backward)
|
||||
{
|
||||
m_program.first_backward_inst = m_program.instructions.size();
|
||||
m_program.backward_start_desc = compute_start_desc(false);
|
||||
write_search_prefix();
|
||||
compile_node(0, false);
|
||||
push_inst(CompiledRegex::Match);
|
||||
}
|
||||
else
|
||||
m_program.first_backward_inst = -1;
|
||||
|
||||
m_program.character_classes = std::move(m_parsed_regex.character_classes);
|
||||
m_program.save_count = m_parsed_regex.capture_count * 2;
|
||||
m_program.direction = direction;
|
||||
}
|
||||
|
||||
CompiledRegex get_compiled_regex() { return std::move(m_program); }
|
||||
|
||||
private:
|
||||
|
||||
uint32_t compile_node_inner(ParsedRegex::NodeIndex index)
|
||||
uint32_t compile_node_inner(ParsedRegex::NodeIndex index, bool forward)
|
||||
{
|
||||
auto& node = get_node(index);
|
||||
|
||||
@ -647,7 +664,7 @@ private:
|
||||
const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and
|
||||
(node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs)));
|
||||
if (save)
|
||||
push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 0 : 1));
|
||||
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 0 : 1));
|
||||
|
||||
Vector<uint32_t> goto_inner_end_offsets;
|
||||
switch (node.op)
|
||||
@ -669,13 +686,13 @@ private:
|
||||
break;
|
||||
case ParsedRegex::Sequence:
|
||||
{
|
||||
if (m_forward)
|
||||
if (forward)
|
||||
for_each_child(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
|
||||
compile_node(child); return true;
|
||||
compile_node(child, true); return true;
|
||||
});
|
||||
else
|
||||
for_each_child_reverse(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) {
|
||||
compile_node(child); return true;
|
||||
compile_node(child, false); return true;
|
||||
});
|
||||
break;
|
||||
}
|
||||
@ -690,7 +707,7 @@ private:
|
||||
|
||||
for_each_child(m_parsed_regex, index,
|
||||
[&, end = node.children_end](ParsedRegex::NodeIndex child) {
|
||||
auto node = compile_node(child);
|
||||
auto node = compile_node(child, forward);
|
||||
if (child != index+1)
|
||||
m_program.instructions[split_pos++].param = node;
|
||||
if (get_node(child).children_end != end)
|
||||
@ -703,39 +720,39 @@ private:
|
||||
break;
|
||||
}
|
||||
case ParsedRegex::LookAhead:
|
||||
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
||||
push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
||||
: CompiledRegex::LookAhead)
|
||||
: (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
||||
: CompiledRegex::LookBehind),
|
||||
push_lookaround(index, false, ignore_case));
|
||||
break;
|
||||
case ParsedRegex::NegativeLookAhead:
|
||||
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
||||
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
||||
: CompiledRegex::NegativeLookAhead)
|
||||
: (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
||||
: CompiledRegex::NegativeLookBehind),
|
||||
push_lookaround(index, false, ignore_case));
|
||||
break;
|
||||
case ParsedRegex::LookBehind:
|
||||
push_inst(m_forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
||||
push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase
|
||||
: CompiledRegex::LookBehind)
|
||||
: (ignore_case ? CompiledRegex::LookAhead_IgnoreCase
|
||||
: CompiledRegex::LookAhead),
|
||||
push_lookaround(index, true, ignore_case));
|
||||
break;
|
||||
case ParsedRegex::NegativeLookBehind:
|
||||
push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
||||
push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase
|
||||
: CompiledRegex::NegativeLookBehind)
|
||||
: (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase
|
||||
: CompiledRegex::NegativeLookAhead),
|
||||
push_lookaround(index, true, ignore_case));
|
||||
break;
|
||||
case ParsedRegex::LineStart:
|
||||
push_inst(m_forward ? CompiledRegex::LineStart
|
||||
push_inst(forward ? CompiledRegex::LineStart
|
||||
: CompiledRegex::LineEnd);
|
||||
break;
|
||||
case ParsedRegex::LineEnd:
|
||||
push_inst(m_forward ? CompiledRegex::LineEnd
|
||||
push_inst(forward ? CompiledRegex::LineEnd
|
||||
: CompiledRegex::LineStart);
|
||||
break;
|
||||
case ParsedRegex::WordBoundary:
|
||||
@ -745,11 +762,11 @@ private:
|
||||
push_inst(CompiledRegex::NotWordBoundary);
|
||||
break;
|
||||
case ParsedRegex::SubjectBegin:
|
||||
push_inst(m_forward ? CompiledRegex::SubjectBegin
|
||||
push_inst(forward ? CompiledRegex::SubjectBegin
|
||||
: CompiledRegex::SubjectEnd);
|
||||
break;
|
||||
case ParsedRegex::SubjectEnd:
|
||||
push_inst(m_forward ? CompiledRegex::SubjectEnd
|
||||
push_inst(forward ? CompiledRegex::SubjectEnd
|
||||
: CompiledRegex::SubjectBegin);
|
||||
break;
|
||||
case ParsedRegex::ResetStart:
|
||||
@ -761,12 +778,12 @@ private:
|
||||
m_program.instructions[offset].param = m_program.instructions.size();
|
||||
|
||||
if (save)
|
||||
push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 1 : 0));
|
||||
push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 1 : 0));
|
||||
|
||||
return start_pos;
|
||||
}
|
||||
|
||||
uint32_t compile_node(ParsedRegex::NodeIndex index)
|
||||
uint32_t compile_node(ParsedRegex::NodeIndex index, bool forward)
|
||||
{
|
||||
auto& node = get_node(index);
|
||||
|
||||
@ -784,10 +801,10 @@ private:
|
||||
goto_ends.push_back(split_pos);
|
||||
}
|
||||
|
||||
auto inner_pos = compile_node_inner(index);
|
||||
auto inner_pos = compile_node_inner(index, forward);
|
||||
// Write the node multiple times when we have a min count quantifier
|
||||
for (int i = 1; i < quantifier.min; ++i)
|
||||
inner_pos = compile_node_inner(index);
|
||||
inner_pos = compile_node_inner(index, forward);
|
||||
|
||||
if (quantifier.allows_infinite_repeat())
|
||||
push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
|
||||
@ -801,7 +818,7 @@ private:
|
||||
auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
|
||||
: CompiledRegex::Split_PrioritizeChild);
|
||||
goto_ends.push_back(split_pos);
|
||||
compile_node_inner(index);
|
||||
compile_node_inner(index, forward);
|
||||
}
|
||||
|
||||
for (auto offset : goto_ends)
|
||||
@ -813,11 +830,11 @@ private:
|
||||
// Add a set of prefix instructions used in the search use case
|
||||
void write_search_prefix()
|
||||
{
|
||||
kak_assert(m_program.instructions.empty());
|
||||
push_inst(CompiledRegex::Split_PrioritizeChild, CompiledRegex::search_prefix_size);
|
||||
const uint32_t first_inst = m_program.instructions.size();
|
||||
push_inst(CompiledRegex::Split_PrioritizeChild, first_inst + CompiledRegex::search_prefix_size);
|
||||
push_inst(CompiledRegex::FindNextStart);
|
||||
push_inst(CompiledRegex::Split_PrioritizeParent, 1);
|
||||
kak_assert(m_program.instructions.size() == CompiledRegex::search_prefix_size);
|
||||
push_inst(CompiledRegex::Split_PrioritizeParent, first_inst + 1);
|
||||
kak_assert(m_program.instructions.size() == first_inst + CompiledRegex::search_prefix_size);
|
||||
}
|
||||
|
||||
uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
|
||||
@ -862,7 +879,7 @@ private:
|
||||
// returns true if the node did not consume the char, hence a following node in
|
||||
// sequence would be still relevant for the parent node start chars computation.
|
||||
bool compute_start_desc(ParsedRegex::NodeIndex index,
|
||||
CompiledRegex::StartDesc& start_desc) const
|
||||
CompiledRegex::StartDesc& start_desc, bool forward) const
|
||||
{
|
||||
auto& node = get_node(index);
|
||||
switch (node.op)
|
||||
@ -924,9 +941,9 @@ private:
|
||||
{
|
||||
bool did_not_consume = false;
|
||||
auto does_not_consume = [&, this](auto child) {
|
||||
return this->compute_start_desc(child, start_desc);
|
||||
return this->compute_start_desc(child, start_desc, forward);
|
||||
};
|
||||
if (m_forward)
|
||||
if (forward)
|
||||
did_not_consume = for_each_child(m_parsed_regex, index, does_not_consume);
|
||||
else
|
||||
did_not_consume = for_each_child_reverse(m_parsed_regex, index, does_not_consume);
|
||||
@ -937,7 +954,7 @@ private:
|
||||
{
|
||||
bool all_consumed = not node.quantifier.allows_none();
|
||||
for_each_child(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) {
|
||||
if (compute_start_desc(child, start_desc))
|
||||
if (compute_start_desc(child, start_desc, forward))
|
||||
all_consumed = false;
|
||||
return true;
|
||||
});
|
||||
@ -960,10 +977,10 @@ private:
|
||||
}
|
||||
|
||||
[[gnu::noinline]]
|
||||
std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc() const
|
||||
std::unique_ptr<CompiledRegex::StartDesc> compute_start_desc(bool forward) const
|
||||
{
|
||||
CompiledRegex::StartDesc start_desc{};
|
||||
if (compute_start_desc(0, start_desc) or
|
||||
if (compute_start_desc(0, start_desc, forward) or
|
||||
not contains(start_desc.map, false))
|
||||
return nullptr;
|
||||
|
||||
@ -978,7 +995,6 @@ private:
|
||||
CompiledRegex m_program;
|
||||
RegexCompileFlags m_flags;
|
||||
ParsedRegex& m_parsed_regex;
|
||||
const bool m_forward;
|
||||
};
|
||||
|
||||
void dump_regex(const CompiledRegex& program)
|
||||
@ -1079,9 +1095,9 @@ void dump_regex(const CompiledRegex& program)
|
||||
}
|
||||
}
|
||||
|
||||
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction)
|
||||
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags)
|
||||
{
|
||||
return RegexCompiler{RegexParser::parse(re), flags, direction}.get_compiled_regex();
|
||||
return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
|
||||
}
|
||||
|
||||
bool is_character_class(const CharacterClass& character_class, Codepoint cp)
|
||||
@ -1120,7 +1136,8 @@ struct TestVM : CompiledRegex, ThreadedRegexVM<const char*, dir>
|
||||
using VMType = ThreadedRegexVM<const char*, dir>;
|
||||
|
||||
TestVM(StringView re, bool dump = false)
|
||||
: CompiledRegex{compile_regex(re, RegexCompileFlags::None, dir)},
|
||||
: CompiledRegex{compile_regex(re, dir == MatchDirection::Forward ?
|
||||
RegexCompileFlags::None : RegexCompileFlags::Backward)},
|
||||
VMType{(const CompiledRegex&)*this}
|
||||
{ if (dump) dump_regex(*this); }
|
||||
|
||||
|
@ -98,8 +98,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||
Vector<Instruction, MemoryDomain::Regex> instructions;
|
||||
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
|
||||
Vector<Codepoint, MemoryDomain::Regex> lookarounds;
|
||||
MatchDirection direction;
|
||||
size_t save_count;
|
||||
uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
|
||||
uint32_t save_count;
|
||||
|
||||
struct StartDesc
|
||||
{
|
||||
@ -108,18 +108,21 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
|
||||
bool map[count+1];
|
||||
};
|
||||
|
||||
std::unique_ptr<StartDesc> start_desc;
|
||||
std::unique_ptr<StartDesc> forward_start_desc;
|
||||
std::unique_ptr<StartDesc> backward_start_desc;
|
||||
};
|
||||
|
||||
// Bit flags controlling how a regex is compiled. Every flag must occupy its
// own bit so that flags can be combined and tested independently.
enum class RegexCompileFlags
{
    None      = 0,
    // Do not emit Save instructions for sub-captures (capture 0 is still saved)
    NoSubs    = 1 << 0,
    Optimize  = 1 << 1,
    // Generate backward matching code. This used to share bit 1 with
    // Optimize, which made the two flags indistinguishable.
    Backward  = 1 << 2,
    // Skip generation of the forward matching code; only valid together
    // with Backward (asserted by the compiler)
    NoForward = 1 << 3,
};
|
||||
// Overload selected for RegexCompileFlags that returns true.
// NOTE(review): presumably this is what enables the | and & operators used on
// these flags elsewhere; confirm against the with_bit_ops/Meta::Type helpers.
constexpr bool with_bit_ops(Meta::Type<RegexCompileFlags>) { return true; }
|
||||
|
||||
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction = MatchDirection::Forward);
|
||||
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags);
|
||||
|
||||
enum class RegexExecFlags
|
||||
{
|
||||
@ -145,7 +148,8 @@ public:
|
||||
ThreadedRegexVM(const CompiledRegex& program)
|
||||
: m_program{program}
|
||||
{
|
||||
kak_assert(m_program and direction == m_program.direction);
|
||||
kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or
|
||||
(direction == MatchDirection::Backward and program.first_backward_inst != -1));
|
||||
}
|
||||
|
||||
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
|
||||
@ -183,20 +187,30 @@ public:
|
||||
|
||||
const bool search = (flags & RegexExecFlags::Search);
|
||||
Utf8It start{m_begin};
|
||||
if (m_program.start_desc)
|
||||
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
|
||||
: m_program.backward_start_desc;
|
||||
if (start_desc)
|
||||
{
|
||||
if (search)
|
||||
{
|
||||
to_next_start(start, m_end, *m_program.start_desc);
|
||||
to_next_start(start, m_end, *start_desc);
|
||||
if (start == m_end) // If start_desc is not null, it means we consume at least one char
|
||||
return false;
|
||||
}
|
||||
else if (start != m_end and
|
||||
not m_program.start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)])
|
||||
not start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)])
|
||||
return false;
|
||||
}
|
||||
|
||||
return exec_program(start, Thread{&m_program.instructions[search ? 0 : CompiledRegex::search_prefix_size], nullptr});
|
||||
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
|
||||
if (direction == MatchDirection::Forward)
|
||||
instructions = instructions.subrange(0, m_program.first_backward_inst);
|
||||
else
|
||||
instructions = instructions.subrange(m_program.first_backward_inst);
|
||||
if (not search)
|
||||
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
|
||||
|
||||
return exec_program(start, instructions);
|
||||
}
|
||||
|
||||
ArrayView<const Iterator> captures() const
|
||||
@ -397,10 +411,13 @@ private:
|
||||
return StepResult::Failed;
|
||||
}
|
||||
|
||||
bool exec_program(Utf8It pos, Thread init_thread)
|
||||
bool exec_program(Utf8It pos, ConstArrayView<CompiledRegex::Instruction> instructions)
|
||||
{
|
||||
ExecState state;
|
||||
state.current_threads.push_back(init_thread);
|
||||
state.current_threads.push_back({instructions.begin(), nullptr});
|
||||
|
||||
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
|
||||
: m_program.backward_start_desc;
|
||||
|
||||
bool found_match = false;
|
||||
while (true) // Iterate on all codepoints and once at the end
|
||||
@ -408,7 +425,7 @@ private:
|
||||
if (++state.step == 0)
|
||||
{
|
||||
// We wrapped, avoid potential collision on inst.last_step by resetting them
|
||||
for (auto& inst : m_program.instructions)
|
||||
for (auto& inst : instructions)
|
||||
inst.last_step = 0;
|
||||
state.step = 1; // step 0 is never valid
|
||||
}
|
||||
@ -470,8 +487,8 @@ private:
|
||||
std::reverse(state.current_threads.begin(), state.current_threads.end());
|
||||
++pos;
|
||||
|
||||
if (find_next_start and m_program.start_desc)
|
||||
to_next_start(pos, m_end, *m_program.start_desc);
|
||||
if (find_next_start and start_desc)
|
||||
to_next_start(pos, m_end, *start_desc);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user