From d652ec9ce1f6bebeeadec92348d1f756f03cff32 Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Wed, 10 Oct 2018 22:35:38 +1100 Subject: [PATCH] Cleanup regex lookarounds implementation and reject incompatible regex Fixes #2487 --- src/regex_impl.cc | 27 ++++++++++++++++++--------- src/regex_impl.hh | 36 ++++++++++++++++++++++++++---------- src/utils.hh | 6 ++++++ 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 0511c0db8..f5913f0fb 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -8,6 +8,7 @@ #include "utf8_iterator.hh" #include "string_utils.hh" #include "vector.hh" +#include "utils.hh" #include @@ -602,12 +603,17 @@ private: void validate_lookaround(NodeIndex index) { + using Lookaround = CompiledRegex::Lookaround; ForEachChild<>::apply(m_parsed_regex, index, [this](NodeIndex child_index) { auto& child = get_node(child_index); if (child.op != ParsedRegex::Literal and child.op != ParsedRegex::Class and child.op != ParsedRegex::CharacterType and child.op != ParsedRegex::AnyChar and child.op != ParsedRegex::AnyCharExceptNewLine) parse_error("Lookaround can only contain literals, any chars or character classes"); + if (child.op == ParsedRegex::Literal and + to_underlying(Lookaround::OpBegin) <= child.value and + child.value < to_underlying(Lookaround::OpEnd)) + parse_error("Lookaround does not support literals codepoint between 0xF0000 and 0xFFFFD"); if (child.quantifier.type != ParsedRegex::Quantifier::One) parse_error("Quantifiers cannot be used in lookarounds"); return true; @@ -877,20 +883,22 @@ private: template uint32_t push_lookaround(ParsedRegex::NodeIndex index, bool ignore_case) { + using Lookaround = CompiledRegex::Lookaround; + const uint32_t res = m_program.lookarounds.size(); auto write_matcher = [this, ignore_case](ParsedRegex::NodeIndex child) { auto& character = get_node(child); if (character.op == ParsedRegex::Literal) - m_program.lookarounds.push_back(ignore_case ? to_lower(character.value) - : character.value); + m_program.lookarounds.push_back( + static_cast(ignore_case ? to_lower(character.value) : character.value)); else if (character.op == ParsedRegex::AnyChar) - m_program.lookarounds.push_back(0xF000); + m_program.lookarounds.push_back(Lookaround::AnyChar); else if (character.op == ParsedRegex::AnyCharExceptNewLine) - m_program.lookarounds.push_back(0xF001); + m_program.lookarounds.push_back(Lookaround::AnyCharExceptNewLine); else if (character.op == ParsedRegex::Class) - m_program.lookarounds.push_back(0xF0001 + character.value); + m_program.lookarounds.push_back(static_cast(to_underlying(Lookaround::CharacterClass) + character.value)); else if (character.op == ParsedRegex::CharacterType) - m_program.lookarounds.push_back(0xF8000 | character.value); + m_program.lookarounds.push_back(static_cast(to_underlying(Lookaround::CharacterType) | character.value)); else kak_assert(false); return true; @@ -898,7 +906,7 @@ private: ForEachChild::apply(m_parsed_regex, index, write_matcher); - m_program.lookarounds.push_back((Codepoint)-1); + m_program.lookarounds.push_back(Lookaround::EndOfLookaround); return res; } @@ -1121,8 +1129,9 @@ String dump_regex(const CompiledRegex& program) name = "negative look behind (ignore case)"; String str; - for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it) - utf8::dump(std::back_inserter(str), *it); + for (auto it = program.lookarounds.begin() + inst.param; + *it != CompiledRegex::Lookaround::EndOfLookaround; ++it) + utf8::dump(std::back_inserter(str), to_underlying(*it)); res += format("{} ({})\n", name, str); break; } diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 77b674f59..b0d7feb5a 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -8,6 +8,7 @@ #include "utf8.hh" #include "utf8_iterator.hh" #include "vector.hh" +#include "utils.hh" namespace Kakoune { @@ -82,6 +83,17 @@ struct CompiledRegex : RefCountable, UseMemoryDomain NegativeLookBehind_IgnoreCase, }; + enum class Lookaround : Codepoint + { + OpBegin = 0xF0000, + AnyChar = 0xF0000, + AnyCharExceptNewLine = 0xF0001, + CharacterClass = 0xF0002, + CharacterType = 0xF8000, + OpEnd = 0xFFFFF, + EndOfLookaround = static_cast(-1) + }; + struct Instruction { Op op; @@ -98,7 +110,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain Vector instructions; Vector character_classes; - Vector lookarounds; + Vector lookarounds; uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward uint32_t save_count; @@ -522,8 +534,10 @@ private: template bool lookaround(uint32_t index, EffectiveIt pos, const ExecConfig& config) const { + using Lookaround = CompiledRegex::Lookaround; + const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin); - for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it) + for (auto it = m_program.lookarounds.begin() + index; *it != Lookaround::EndOfLookaround; ++it) { if (pos == end) return false; @@ -531,25 +545,27 @@ private: if (ignore_case) cp = to_lower(cp); - const Codepoint ref = *it; - if (ref == 0xF000) + const Lookaround op = *it; + if (op == Lookaround::AnyChar) {} // any character matches - else if (ref == 0xF001) + else if (op == Lookaround::AnyCharExceptNewLine) { if (cp == '\n') return false; } - else if (ref > 0xF0000 and ref < 0xF8000) + else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType) { - if (not is_character_class(m_program.character_classes[ref - 0xF0001], cp)) + auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass); + if (not is_character_class(m_program.character_classes[index], cp)) return false; } - else if (ref >= 0xF8000 and ref <= 0xFFFFD) + else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd) { - if (not is_ctype((CharacterType)(ref & 0xFF), cp)) + auto ctype = static_cast(to_underlying(op) & 0xFF); + if (not is_ctype(ctype, cp)) return false; } - else if (ref != cp) + else if (static_cast(op) != cp) return false; (look_direction == MatchDirection::Forward) ? ++pos : --pos; diff --git a/src/utils.hh b/src/utils.hh index f21be9c16..ec6965a8b 100644 --- a/src/utils.hh +++ b/src/utils.hh @@ -148,6 +148,12 @@ bool skip_while_reverse(Iterator& it, const BeginIterator& begin, T condition) return condition(*it); } +template +auto to_underlying(E value) +{ + return static_cast>(value); +} + } #endif // utils_hh_INCLUDED