From 413f880e9e74e01b1b03692a04554a355fae2519 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Fri, 1 Dec 2017 19:57:02 +0800 Subject: [PATCH] Regex: Support forward and backward matching code in the same CompiledRegex No need to have two separate regexes to handle forward and backward matching, just passing RegexCompileFlags::Backward will add support for backward matching to the regex. For backward only regex, pass RegexCompileFlags::NoForward as well to disable generation of forward matching code. --- src/normal.cc | 10 +++- src/regex.cc | 4 +- src/regex.hh | 3 +- src/regex_impl.cc | 131 ++++++++++++++++++++++++++-------------------- src/regex_impl.hh | 47 +++++++++++------ 5 files changed, 117 insertions(+), 78 deletions(-) diff --git a/src/normal.cc b/src/normal.cc index 3332cbdb4..224319b70 100644 --- a/src/normal.cc +++ b/src/normal.cc @@ -693,6 +693,12 @@ void paste_all(Context& context, NormalParams params) selections = std::move(result); } +constexpr RegexCompileFlags direction_flags(MatchDirection direction) +{ + return (direction == MatchDirection::Forward) ? + RegexCompileFlags::None : RegexCompileFlags::Backward | RegexCompileFlags::NoForward; +} + template void regex_prompt(Context& context, String prompt, String default_regex, T func) { @@ -725,7 +731,7 @@ void regex_prompt(Context& context, String prompt, String default_regex, T func) context.push_jump(); if (not str.empty() or event == PromptEvent::Validate) - func(Regex{str.empty() ? default_regex : str, RegexCompileFlags::None, direction}, event, context); + func(Regex{str.empty() ? 
default_regex : str, direction_flags(direction)}, event, context); } catch (regex_error& err) { @@ -795,7 +801,7 @@ void search_next(Context& context, NormalParams params) StringView str = context.main_sel_register_value(reg); if (not str.empty()) { - Regex regex{str, RegexCompileFlags::None, direction}; + Regex regex{str, direction_flags(direction)}; auto& selections = context.selections(); bool main_wrapped = false; do { diff --git a/src/regex.cc b/src/regex.cc index e3170d6ea..3967cebc4 100644 --- a/src/regex.cc +++ b/src/regex.cc @@ -3,8 +3,8 @@ namespace Kakoune { -Regex::Regex(StringView re, RegexCompileFlags flags, MatchDirection direction) - : m_impl{new CompiledRegex{compile_regex(re, flags, direction)}}, +Regex::Regex(StringView re, RegexCompileFlags flags) + : m_impl{new CompiledRegex{compile_regex(re, flags)}}, m_str{re.str()} {} diff --git a/src/regex.hh b/src/regex.hh index 4c7b8ecb8..11926d644 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -13,8 +13,7 @@ class Regex public: Regex() = default; - explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None, - MatchDirection direction = MatchDirection::Forward); + explicit Regex(StringView re, RegexCompileFlags flags = RegexCompileFlags::None); bool empty() const { return m_str.empty(); } bool operator==(const Regex& other) const { return m_str == other.m_str; } bool operator!=(const Regex& other) const { return m_str != other.m_str; } diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 91d1816a3..c8ce28c8d 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -618,26 +618,43 @@ constexpr RegexParser::ControlEscape RegexParser::control_escapes[]; struct RegexCompiler { - RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags, MatchDirection direction) - : m_parsed_regex{parsed_regex}, m_flags(flags), m_forward{direction == MatchDirection::Forward} + RegexCompiler(ParsedRegex&& parsed_regex, RegexCompileFlags flags) + : m_parsed_regex{parsed_regex}, m_flags(flags) 
{ + kak_assert(not (flags & RegexCompileFlags::NoForward) or flags & RegexCompileFlags::Backward); // Approximation of the number of instructions generated - m_program.instructions.reserve(CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1); - m_program.start_desc = compute_start_desc(); + m_program.instructions.reserve((CompiledRegex::search_prefix_size + parsed_regex.nodes.size() + 1) + * (((flags & RegexCompileFlags::Backward) and + not (flags & RegexCompileFlags::NoForward)) ? 2 : 1)); + + if (not (flags & RegexCompileFlags::NoForward)) + { + m_program.forward_start_desc = compute_start_desc(true); + write_search_prefix(); + compile_node(0, true); + push_inst(CompiledRegex::Match); + } + + if (flags & RegexCompileFlags::Backward) + { + m_program.first_backward_inst = m_program.instructions.size(); + m_program.backward_start_desc = compute_start_desc(false); + write_search_prefix(); + compile_node(0, false); + push_inst(CompiledRegex::Match); + } + else + m_program.first_backward_inst = -1; - write_search_prefix(); - compile_node(0); - push_inst(CompiledRegex::Match); m_program.character_classes = std::move(m_parsed_regex.character_classes); m_program.save_count = m_parsed_regex.capture_count * 2; - m_program.direction = direction; } CompiledRegex get_compiled_regex() { return std::move(m_program); } private: - uint32_t compile_node_inner(ParsedRegex::NodeIndex index) + uint32_t compile_node_inner(ParsedRegex::NodeIndex index, bool forward) { auto& node = get_node(index); @@ -647,7 +664,7 @@ private: const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and (node.value == 0 or (node.value != -1 and not (m_flags & RegexCompileFlags::NoSubs))); if (save) - push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 0 : 1)); + push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 
0 : 1)); Vector goto_inner_end_offsets; switch (node.op) @@ -669,13 +686,13 @@ private: break; case ParsedRegex::Sequence: { - if (m_forward) + if (forward) for_each_child(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { - compile_node(child); return true; + compile_node(child, true); return true; }); else for_each_child_reverse(m_parsed_regex, index, [this](ParsedRegex::NodeIndex child) { - compile_node(child); return true; + compile_node(child, false); return true; }); break; } @@ -690,7 +707,7 @@ private: for_each_child(m_parsed_regex, index, [&, end = node.children_end](ParsedRegex::NodeIndex child) { - auto node = compile_node(child); + auto node = compile_node(child, forward); if (child != index+1) m_program.instructions[split_pos++].param = node; if (get_node(child).children_end != end) @@ -703,40 +720,40 @@ private: break; } case ParsedRegex::LookAhead: - push_inst(m_forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase - : CompiledRegex::LookAhead) - : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase - : CompiledRegex::LookBehind), + push_inst(forward ? (ignore_case ? CompiledRegex::LookAhead_IgnoreCase + : CompiledRegex::LookAhead) + : (ignore_case ? CompiledRegex::LookBehind_IgnoreCase + : CompiledRegex::LookBehind), push_lookaround(index, false, ignore_case)); break; case ParsedRegex::NegativeLookAhead: - push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase - : CompiledRegex::NegativeLookAhead) - : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase - : CompiledRegex::NegativeLookBehind), + push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase + : CompiledRegex::NegativeLookAhead) + : (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase + : CompiledRegex::NegativeLookBehind), push_lookaround(index, false, ignore_case)); break; case ParsedRegex::LookBehind: - push_inst(m_forward ? (ignore_case ? 
CompiledRegex::LookBehind_IgnoreCase - : CompiledRegex::LookBehind) - : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase - : CompiledRegex::LookAhead), + push_inst(forward ? (ignore_case ? CompiledRegex::LookBehind_IgnoreCase + : CompiledRegex::LookBehind) + : (ignore_case ? CompiledRegex::LookAhead_IgnoreCase + : CompiledRegex::LookAhead), push_lookaround(index, true, ignore_case)); break; case ParsedRegex::NegativeLookBehind: - push_inst(m_forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase - : CompiledRegex::NegativeLookBehind) - : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase - : CompiledRegex::NegativeLookAhead), + push_inst(forward ? (ignore_case ? CompiledRegex::NegativeLookBehind_IgnoreCase + : CompiledRegex::NegativeLookBehind) + : (ignore_case ? CompiledRegex::NegativeLookAhead_IgnoreCase + : CompiledRegex::NegativeLookAhead), push_lookaround(index, true, ignore_case)); break; case ParsedRegex::LineStart: - push_inst(m_forward ? CompiledRegex::LineStart - : CompiledRegex::LineEnd); + push_inst(forward ? CompiledRegex::LineStart + : CompiledRegex::LineEnd); break; case ParsedRegex::LineEnd: - push_inst(m_forward ? CompiledRegex::LineEnd - : CompiledRegex::LineStart); + push_inst(forward ? CompiledRegex::LineEnd + : CompiledRegex::LineStart); break; case ParsedRegex::WordBoundary: push_inst(CompiledRegex::WordBoundary); @@ -745,12 +762,12 @@ private: push_inst(CompiledRegex::NotWordBoundary); break; case ParsedRegex::SubjectBegin: - push_inst(m_forward ? CompiledRegex::SubjectBegin - : CompiledRegex::SubjectEnd); + push_inst(forward ? CompiledRegex::SubjectBegin + : CompiledRegex::SubjectEnd); break; case ParsedRegex::SubjectEnd: - push_inst(m_forward ? CompiledRegex::SubjectEnd - : CompiledRegex::SubjectBegin); + push_inst(forward ? 
CompiledRegex::SubjectEnd + : CompiledRegex::SubjectBegin); break; case ParsedRegex::ResetStart: push_inst(CompiledRegex::Save, 0); @@ -761,12 +778,12 @@ private: m_program.instructions[offset].param = m_program.instructions.size(); if (save) - push_inst(CompiledRegex::Save, node.value * 2 + (m_forward ? 1 : 0)); + push_inst(CompiledRegex::Save, node.value * 2 + (forward ? 1 : 0)); return start_pos; } - uint32_t compile_node(ParsedRegex::NodeIndex index) + uint32_t compile_node(ParsedRegex::NodeIndex index, bool forward) { auto& node = get_node(index); @@ -784,10 +801,10 @@ private: goto_ends.push_back(split_pos); } - auto inner_pos = compile_node_inner(index); + auto inner_pos = compile_node_inner(index, forward); // Write the node multiple times when we have a min count quantifier for (int i = 1; i < quantifier.min; ++i) - inner_pos = compile_node_inner(index); + inner_pos = compile_node_inner(index, forward); if (quantifier.allows_infinite_repeat()) push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild @@ -801,7 +818,7 @@ private: auto split_pos = push_inst(quantifier.greedy ? 
CompiledRegex::Split_PrioritizeParent : CompiledRegex::Split_PrioritizeChild); goto_ends.push_back(split_pos); - compile_node_inner(index); + compile_node_inner(index, forward); } for (auto offset : goto_ends) @@ -813,11 +830,11 @@ private: // Add an set of instruction prefix used in the search use case void write_search_prefix() { - kak_assert(m_program.instructions.empty()); - push_inst(CompiledRegex::Split_PrioritizeChild, CompiledRegex::search_prefix_size); + const uint32_t first_inst = m_program.instructions.size(); + push_inst(CompiledRegex::Split_PrioritizeChild, first_inst + CompiledRegex::search_prefix_size); push_inst(CompiledRegex::FindNextStart); - push_inst(CompiledRegex::Split_PrioritizeParent, 1); - kak_assert(m_program.instructions.size() == CompiledRegex::search_prefix_size); + push_inst(CompiledRegex::Split_PrioritizeParent, first_inst + 1); + kak_assert(m_program.instructions.size() == first_inst + CompiledRegex::search_prefix_size); } uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0) @@ -862,7 +879,7 @@ private: // returns true if the node did not consume the char, hence a following node in // sequence would be still relevant for the parent node start chars computation. 
bool compute_start_desc(ParsedRegex::NodeIndex index, - CompiledRegex::StartDesc& start_desc) const + CompiledRegex::StartDesc& start_desc, bool forward) const { auto& node = get_node(index); switch (node.op) @@ -924,9 +941,9 @@ private: { bool did_not_consume = false; auto does_not_consume = [&, this](auto child) { - return this->compute_start_desc(child, start_desc); + return this->compute_start_desc(child, start_desc, forward); }; - if (m_forward) + if (forward) did_not_consume = for_each_child(m_parsed_regex, index, does_not_consume); else did_not_consume = for_each_child_reverse(m_parsed_regex, index, does_not_consume); @@ -937,7 +954,7 @@ private: { bool all_consumed = not node.quantifier.allows_none(); for_each_child(m_parsed_regex, index, [&](ParsedRegex::NodeIndex child) { - if (compute_start_desc(child, start_desc)) + if (compute_start_desc(child, start_desc, forward)) all_consumed = false; return true; }); @@ -960,10 +977,10 @@ private: } [[gnu::noinline]] - std::unique_ptr compute_start_desc() const + std::unique_ptr compute_start_desc(bool forward) const { CompiledRegex::StartDesc start_desc{}; - if (compute_start_desc(0, start_desc) or + if (compute_start_desc(0, start_desc, forward) or not contains(start_desc.map, false)) return nullptr; @@ -978,7 +995,6 @@ private: CompiledRegex m_program; RegexCompileFlags m_flags; ParsedRegex& m_parsed_regex; - const bool m_forward; }; void dump_regex(const CompiledRegex& program) @@ -1079,9 +1095,9 @@ void dump_regex(const CompiledRegex& program) } } -CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction) +CompiledRegex compile_regex(StringView re, RegexCompileFlags flags) { - return RegexCompiler{RegexParser::parse(re), flags, direction}.get_compiled_regex(); + return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex(); } bool is_character_class(const CharacterClass& character_class, Codepoint cp) @@ -1120,7 +1136,8 @@ struct TestVM : CompiledRegex, 
ThreadedRegexVM using VMType = ThreadedRegexVM; TestVM(StringView re, bool dump = false) - : CompiledRegex{compile_regex(re, RegexCompileFlags::None, dir)}, + : CompiledRegex{compile_regex(re, dir == MatchDirection::Forward ? + RegexCompileFlags::None : RegexCompileFlags::Backward)}, VMType{(const CompiledRegex&)*this} { if (dump) dump_regex(*this); } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 2615d7b00..4e01b009b 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -98,8 +98,8 @@ struct CompiledRegex : RefCountable, UseMemoryDomain Vector instructions; Vector character_classes; Vector lookarounds; - MatchDirection direction; - size_t save_count; + uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward + uint32_t save_count; struct StartDesc { @@ -108,18 +108,21 @@ struct CompiledRegex : RefCountable, UseMemoryDomain bool map[count+1]; }; - std::unique_ptr start_desc; + std::unique_ptr forward_start_desc; + std::unique_ptr backward_start_desc; }; enum class RegexCompileFlags { None = 0, NoSubs = 1 << 0, - Optimize = 1 << 1 + Optimize = 1 << 1, + Backward = 1 << 2, /* also generate backward matching code; own bit so it never aliases Optimize */ + NoForward = 1 << 3, /* skip forward matching code; only valid together with Backward */ }; constexpr bool with_bit_ops(Meta::Type) { return true; } -CompiledRegex compile_regex(StringView re, RegexCompileFlags flags, MatchDirection direction = MatchDirection::Forward); +CompiledRegex compile_regex(StringView re, RegexCompileFlags flags); enum class RegexExecFlags { @@ -145,7 +148,8 @@ public: ThreadedRegexVM(const CompiledRegex& program) : m_program{program} { - kak_assert(m_program and direction == m_program.direction); + kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or + (direction == MatchDirection::Backward and program.first_backward_inst != -1)); } ThreadedRegexVM(const ThreadedRegexVM&) = delete; @@ -183,20 +187,30 @@ public: const bool search = (flags & RegexExecFlags::Search); Utf8It start{m_begin}; - if (m_program.start_desc) + const auto& 
start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc + : m_program.backward_start_desc; + if (start_desc) { if (search) { - to_next_start(start, m_end, *m_program.start_desc); + to_next_start(start, m_end, *start_desc); if (start == m_end) // If start_desc is not null, it means we consume at least one char return false; } else if (start != m_end and - not m_program.start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)]) + not start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)]) return false; } - return exec_program(start, Thread{&m_program.instructions[search ? 0 : CompiledRegex::search_prefix_size], nullptr}); + ConstArrayView instructions{m_program.instructions}; + if (direction == MatchDirection::Forward) + instructions = instructions.subrange(0, m_program.first_backward_inst); + else + instructions = instructions.subrange(m_program.first_backward_inst); + if (not search) + instructions = instructions.subrange(CompiledRegex::search_prefix_size); + + return exec_program(start, instructions); } ArrayView captures() const @@ -397,10 +411,13 @@ private: return StepResult::Failed; } - bool exec_program(Utf8It pos, Thread init_thread) + bool exec_program(Utf8It pos, ConstArrayView instructions) { ExecState state; - state.current_threads.push_back(init_thread); + state.current_threads.push_back({instructions.begin(), nullptr}); + + const auto& start_desc = direction == MatchDirection::Forward ? 
m_program.forward_start_desc + : m_program.backward_start_desc; bool found_match = false; while (true) // Iterate on all codepoints and once at the end @@ -408,7 +425,7 @@ private: if (++state.step == 0) { // We wrapped, avoid potential collision on inst.last_step by resetting them - for (auto& inst : m_program.instructions) + for (auto& inst : instructions) inst.last_step = 0; state.step = 1; // step 0 is never valid } @@ -470,8 +487,8 @@ private: std::reverse(state.current_threads.begin(), state.current_threads.end()); ++pos; - if (find_next_start and m_program.start_desc) - to_next_start(pos, m_end, *m_program.start_desc); + if (find_next_start and start_desc) + to_next_start(pos, m_end, *start_desc); } }