From ca71d8997d6144fd7bc770ff458d469a989d576c Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Fri, 5 Aug 2022 20:29:43 +1000 Subject: [PATCH] Reuse existing character classes when possible in regex --- src/regex_impl.cc | 13 +++++++++++-- src/regex_impl.hh | 8 +++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 256afcc1c..40701e482 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -119,6 +119,7 @@ struct Children Index operator*() const { return m_pos; } bool operator!=(Sentinel) const { return m_pos != m_end; } + private: Index find_prev(Index parent, Index pos) const { Index child = parent+1; @@ -544,8 +545,10 @@ private: character_class.ranges.empty()) return add_node(ParsedRegex::CharType, (Codepoint)character_class.ctypes); - auto class_id = m_parsed_regex.character_classes.size(); - m_parsed_regex.character_classes.push_back(std::move(character_class)); + auto it = std::find(m_parsed_regex.character_classes.begin(), m_parsed_regex.character_classes.end(), character_class); + auto class_id = it - m_parsed_regex.character_classes.begin(); + if (it == m_parsed_regex.character_classes.end()) + m_parsed_regex.character_classes.push_back(std::move(character_class)); return add_node(ParsedRegex::CharClass, class_id); } @@ -1536,6 +1539,12 @@ auto test_regex = UnitTest{[]{ kak_assert(vm.exec("\t\n\v\f\r")); } + { + TestVM<> vm{R"([\t-\r]\h+[\t-\r])"}; + kak_assert(vm.character_classes.size() == 1); + kak_assert(vm.exec("\n \f")); + } + { TestVM<> vm{R"([^\x00-\x7F]+)"}; kak_assert(not vm.exec("ascii")); diff --git a/src/regex_impl.hh b/src/regex_impl.hh index ca859592d..73d697904 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -33,12 +33,18 @@ constexpr bool with_bit_ops(Meta::Type) { return true; } struct CharacterClass { - struct Range { Codepoint min, max; }; + struct Range + { + Codepoint min, max; + friend bool operator==(const Range&, const Range&) = default; + }; Vector ranges; CharacterType ctypes = CharacterType::None; bool negative = false; bool ignore_case = false; + + friend bool operator==(const CharacterClass&, const CharacterClass&) = default; }; bool is_character_class(const CharacterClass& character_class, Codepoint cp);