LibRegex: Add support for the Basic POSIX regular expressions

This implements the internal regex stuff for #8506.
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/54d89609de5 Pull-request: https://github.com/SerenityOS/serenity/pull/8606
2025-01-07 19:57:45 +03:00 · 2021-07-10 13:18:10 +04:30 · 2021-07-10 13:18:10 +04:30 · 54d89609de · 2024-07-18 09:26:14 +09:00
commit 54d89609de
parent addfa1e82e
3 changed files with 351 additions and 109 deletions
--- a/Userland/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp
@ -378,6 +378,9 @@ ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const Match
    return false;
 }

+// Explicitly instantiate the matcher and regex templates for the Basic POSIX parser in this translation unit.
+template class Matcher<PosixBasicParser>;
+template class Regex<PosixBasicParser>;
+
 template class Matcher<PosixExtendedParser>;
 template class Regex<PosixExtendedParser>;

--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@ -173,114 +173,12 @@ ALWAYS_INLINE bool Parser::match_ordinary_characters()
 }

 // =============================
-// PosixExtended Parser
+// Abstract Posix Parser
 // =============================

-bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
+ALWAYS_INLINE bool AbstractPosixParser::parse_bracket_expression(Vector<CompareTypeAndValuePair>& values, size_t& match_length_minimum)
 {
-    return parse_root(stack, match_length_minimum);
-}
-
-ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
-{
-    auto type = m_parser_state.current_token.type();
-    return (type == TokenType::Asterisk
-        || type == TokenType::Plus
-        || type == TokenType::Questionmark
-        || type == TokenType::LeftCurly);
-}
-
-ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
-{
-    if (match(TokenType::LeftCurly)) {
-        consume();
-
-        StringBuilder number_builder;
-
-        while (match(TokenType::Char)) {
-            number_builder.append(consume().value());
-        }
-
-        auto maybe_minimum = number_builder.build().to_uint();
-        if (!maybe_minimum.has_value())
-            return set_error(Error::InvalidBraceContent);
-
-        auto minimum = maybe_minimum.value();
-        match_length_minimum *= minimum;
-
-        if (match(TokenType::Comma)) {
-            consume();
-        } else {
-            ByteCode bytecode;
-            bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
-            bytecode_to_repeat = move(bytecode);
-
-            consume(TokenType::RightCurly, Error::MismatchingBrace);
-            return !has_error();
-        }
-
-        Optional<size_t> maybe_maximum {};
-        number_builder.clear();
-        while (match(TokenType::Char)) {
-            number_builder.append(consume().value());
-        }
-        if (!number_builder.is_empty()) {
-            auto value = number_builder.build().to_uint();
-            if (!value.has_value() || minimum > value.value())
-                return set_error(Error::InvalidBraceContent);
-
-            maybe_maximum = value.value();
-        }
-
-        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
-
-        consume(TokenType::RightCurly, Error::MismatchingBrace);
-        return !has_error();
-
-    } else if (match(TokenType::Plus)) {
-        consume();
-
-        bool nongreedy = match(TokenType::Questionmark);
-        if (nongreedy)
-            consume();
-
-        // Note: don't touch match_length_minimum, it's already correct
-        ByteCode::transform_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy);
-        return !has_error();
-
-    } else if (match(TokenType::Asterisk)) {
-        consume();
-        match_length_minimum = 0;
-
-        bool nongreedy = match(TokenType::Questionmark);
-        if (nongreedy)
-            consume();
-
-        ByteCode::transform_bytecode_repetition_any(bytecode_to_repeat, !nongreedy);
-
-        return !has_error();
-
-    } else if (match(TokenType::Questionmark)) {
-        consume();
-        match_length_minimum = 0;
-
-        bool nongreedy = match(TokenType::Questionmark);
-        if (nongreedy)
-            consume();
-
-        ByteCode::transform_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy);
-        return !has_error();
-    }
-
-    return false;
-}
-
-ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
-{
-    Vector<CompareTypeAndValuePair> values;
-
-    for (;;) {
-
+    for (; !done();) {
        if (match(TokenType::HyphenMinus)) {
            consume();

@ -410,6 +308,296 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack
    if (values.first().type == CharacterCompareType::Inverse)
        match_length_minimum = 0;

+    return true;
+}
+
+// =============================
+// PosixBasic Parser
+// =============================
+
+// Parser entry point: delegates to parse_root() and returns its result.
+bool PosixBasicParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
+{
+    return parse_root(stack, match_length_minimum);
+}
+
+bool PosixBasicParser::parse_root(ByteCode& bytecode, size_t& match_length_minimum)
+{
+    // basic_reg_exp : L_ANCHOR? RE_expression R_ANCHOR?
+
+    // If the current token is of type `anchor`, consume it and emit the `check` opcode.
+    auto emit_anchor_if_present = [&](TokenType anchor, OpCodeId check) {
+        if (!match(anchor))
+            return;
+        consume();
+        bytecode.empend((ByteCodeValueType)check);
+    };
+
+    emit_anchor_if_present(TokenType::Circumflex, OpCodeId::CheckBegin);
+
+    if (!parse_re_expression(bytecode, match_length_minimum))
+        return false;
+
+    emit_anchor_if_present(TokenType::Dollar, OpCodeId::CheckEnd);
+
+    return !has_error();
+}
+
+bool PosixBasicParser::parse_re_expression(ByteCode& bytecode, size_t& match_length_minimum)
+{
+    // RE_expression : RE_expression? simple_RE
+    // Keep appending simple REs until the input is exhausted or one of them fails to parse;
+    // all the work happens in the loop condition.
+    while (!done() && parse_simple_re(bytecode, match_length_minimum)) {
+    }
+
+    // A failed simple RE only counts as a failure here if it also recorded an error.
+    return !has_error();
+}
+
+// Parses one nondupl_RE followed by an optional duplication symbol ('*' or "\{m\}", "\{m,\}", "\{m,n\}")
+// and appends the resulting bytecode to `bytecode`. `match_length_minimum` is increased by the minimum
+// number of characters the parsed piece can match. Returns false if nothing could be parsed or an
+// error was recorded.
+bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_minimum)
+{
+    // simple_RE : nondupl_RE RE_dupl_symbol?
+    ByteCode simple_re_bytecode;
+    size_t re_match_length_minimum = 0;
+    if (!parse_nonduplicating_re(simple_re_bytecode, re_match_length_minimum))
+        return false;
+
+    // RE_dupl_symbol : '*' | Back_open_brace DUP_COUNT (',' DUP_COUNT?)? Back_close_brace
+    if (match(TokenType::Asterisk)) {
+        consume();
+        // Zero repetitions are allowed, so nothing is added to the minimum length.
+        ByteCode::transform_bytecode_repetition_any(simple_re_bytecode, true);
+    } else if (try_skip("\\{")) {
+        // Reads a run of single-digit Char tokens as a decimal number.
+        // Yields an empty Optional when no digit was consumed at all.
+        auto read_number = [&]() -> Optional<size_t> {
+            size_t value = 0;
+            bool saw_digit = false;
+            while (match(TokenType::Char)) {
+                auto c = m_parser_state.current_token.value().substring_view(0, 1);
+                auto c_value = c.to_uint();
+                if (!c_value.has_value())
+                    break;
+                value *= 10;
+                value += *c_value;
+                saw_digit = true;
+                consume();
+            }
+            if (!saw_digit)
+                return {};
+            return value;
+        };
+
+        auto min_limit = read_number();
+        if (!min_limit.has_value())
+            return set_error(Error::InvalidRepetitionMarker);
+
+        // An absent upper bound (no comma, or a comma with no number) is passed on as an empty Optional.
+        Optional<size_t> max_limit;
+        if (match(TokenType::Comma)) {
+            consume();
+            max_limit = read_number();
+            // An upper bound below the lower bound is rejected, the same way the extended parser does it.
+            if (max_limit.has_value() && *max_limit < *min_limit)
+                return set_error(Error::InvalidBraceContent);
+        }
+
+        if (!try_skip("\\}"))
+            return set_error(Error::MismatchingBrace);
+
+        ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, *min_limit, max_limit, true);
+        match_length_minimum += re_match_length_minimum * *min_limit;
+    } else {
+        match_length_minimum += re_match_length_minimum;
+    }
+
+    bytecode.extend(move(simple_re_bytecode));
+    return true;
+}
+
+bool PosixBasicParser::parse_nonduplicating_re(ByteCode& bytecode, size_t& match_length_minimum)
+{
+    // nondupl_RE : one_char_or_coll_elem_RE | Back_open_paren RE_expression Back_close_paren | BACKREF
+
+    // Group: "\(" RE_expression "\)"
+    if (try_skip("\\(")) {
+        // Groups are numbered in the order in which their opening "\(" is seen.
+        auto group_number = ++m_capture_group;
+        bool is_addressable = group_number <= number_of_addressable_capture_groups;
+
+        ByteCode group_bytecode;
+        size_t group_minimum_length = 0;
+        if (!parse_re_expression(group_bytecode, group_minimum_length))
+            return false;
+
+        if (!try_skip("\\)"))
+            return set_error(Error::MismatchingParen);
+
+        match_length_minimum += group_minimum_length;
+
+        // Groups beyond the addressable range are matched but not captured.
+        if (!is_addressable) {
+            bytecode.extend(group_bytecode);
+            return true;
+        }
+
+        m_capture_group_minimum_lengths[group_number - 1] = group_minimum_length;
+        m_capture_group_seen[group_number - 1] = true;
+        bytecode.insert_bytecode_group_capture_left(group_number);
+        bytecode.extend(group_bytecode);
+        bytecode.insert_bytecode_group_capture_right(group_number);
+        return true;
+    }
+
+    // Back-reference: "\1" through "\9"
+    for (size_t group_number = 1; group_number < 10; ++group_number) {
+        char const backref_name[2] { '\\', static_cast<char>('0' + group_number) };
+        if (!try_skip({ backref_name, 2 }))
+            continue;
+
+        // Only groups that have already been closed may be referenced.
+        if (!m_capture_group_seen[group_number - 1])
+            return set_error(Error::InvalidNumber);
+
+        match_length_minimum += m_capture_group_minimum_lengths[group_number - 1];
+        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)group_number } });
+        return true;
+    }
+
+    return parse_one_char_or_collation_element(bytecode, match_length_minimum);
+}
+
+// Parses a single-character element: '.', an ordinary character, an escaped character, a trailing '$'
+// anchor, or (as the fallback) a bracket expression. Appends the matching bytecode to `bytecode` and
+// adds the element's minimum match length to `match_length_minimum`.
+// Returns false if the current token cannot start such an element or an error was recorded.
+bool PosixBasicParser::parse_one_char_or_collation_element(ByteCode& bytecode, size_t& match_length_minimum)
+{
+    // one_char_or_coll_elem_RE : ORD_CHAR | QUOTED_CHAR | '.' | bracket_expression
+    if (match(TokenType::Period)) {
+        consume();
+        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
+        match_length_minimum += 1;
+        return true;
+    }
+
+    // '$' is an anchor only when it is the last token of the pattern; anywhere else it is a literal.
+    // This has to be decided here: this function would otherwise consume the '$' as an ordinary
+    // character, and the trailing-anchor check in parse_root() would never get to see it.
+    if (match(TokenType::Dollar)) {
+        auto ch = consume().value()[0];
+        if (done()) {
+            bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
+            return true;
+        }
+        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
+        match_length_minimum += 1;
+        return true;
+    }
+
+    // None of these are special in BRE.
+    if (match(TokenType::Char) || match(TokenType::Questionmark) || match(TokenType::Plus) || match(TokenType::HyphenMinus)
+        || match(TokenType::Circumflex) || match(TokenType::RightCurly) || match(TokenType::Comma) || match(TokenType::Colon)
+        || match(TokenType::EqualSign) || match(TokenType::LeftCurly) || match(TokenType::LeftParen)
+        || match(TokenType::Pipe) || match(TokenType::Slash) || match(TokenType::RightBracket) || match(TokenType::RightParen)) {
+
+        auto ch = consume().value()[0];
+        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
+        match_length_minimum += 1;
+        return true;
+    }
+
+    if (match(TokenType::EscapeSequence)) {
+        // Escaped parens and braces delimit groups and intervals; leave them for the callers.
+        if (m_parser_state.current_token.value().is_one_of("\\)"sv, "\\}"sv, "\\("sv, "\\{"sv))
+            return false;
+        // Any other escape matches the escaped character itself (the byte after the backslash).
+        auto ch = consume().value()[1];
+        bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
+        match_length_minimum += 1;
+        return true;
+    }
+
+    // NOTE(review): no LeftBracket token is matched or consumed before delegating, and no RightBracket
+    // is consumed afterwards - confirm that the shared bracket parser expects to be positioned on the '['.
+    Vector<CompareTypeAndValuePair> values;
+    size_t bracket_minimum_length = 0;
+    if (!AbstractPosixParser::parse_bracket_expression(values, bracket_minimum_length))
+        return false;
+
+    bytecode.insert_bytecode_compare_values(move(values));
+    match_length_minimum += bracket_minimum_length;
+    return !has_error();
+}
+
+// =============================
+// PosixExtended Parser
+// =============================
+
+// Parser entry point: delegates to parse_root() and returns its result.
+bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
+{
+    return parse_root(stack, match_length_minimum);
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
+{
+    // True when the current token is one of the types parse_repetition_symbol() handles:
+    // Asterisk, Plus, Questionmark or LeftCurly.
+    switch (m_parser_state.current_token.type()) {
+    case TokenType::Asterisk:
+    case TokenType::Plus:
+    case TokenType::Questionmark:
+    case TokenType::LeftCurly:
+        return true;
+    default:
+        return false;
+    }
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
+{
+    // Consumes a Questionmark that follows a repetition symbol, if there is one.
+    // Returns whether one was consumed, i.e. whether the repetition is non-greedy.
+    auto consume_nongreedy_marker = [&] {
+        if (!match(TokenType::Questionmark))
+            return false;
+        consume();
+        return true;
+    };
+
+    if (match(TokenType::Plus)) {
+        consume();
+        bool greedy = !consume_nongreedy_marker();
+
+        // Note: don't touch match_length_minimum, it's already correct
+        ByteCode::transform_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
+        return !has_error();
+    }
+
+    if (match(TokenType::Asterisk)) {
+        consume();
+        match_length_minimum = 0;
+        bool greedy = !consume_nongreedy_marker();
+
+        ByteCode::transform_bytecode_repetition_any(bytecode_to_repeat, greedy);
+        return !has_error();
+    }
+
+    if (match(TokenType::Questionmark)) {
+        consume();
+        match_length_minimum = 0;
+        bool greedy = !consume_nongreedy_marker();
+
+        ByteCode::transform_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy);
+        return !has_error();
+    }
+
+    // Anything other than an opening brace is not a repetition symbol.
+    if (!match(TokenType::LeftCurly))
+        return false;
+    consume();
+
+    // Appends the values of all consecutive Char tokens to `builder`, consuming them.
+    auto append_char_tokens = [&](StringBuilder& builder) {
+        while (match(TokenType::Char))
+            builder.append(consume().value());
+    };
+
+    // The lower bound is mandatory.
+    StringBuilder minimum_builder;
+    append_char_tokens(minimum_builder);
+    auto maybe_minimum = minimum_builder.build().to_uint();
+    if (!maybe_minimum.has_value())
+        return set_error(Error::InvalidBraceContent);
+
+    auto minimum = maybe_minimum.value();
+    match_length_minimum *= minimum;
+
+    // No comma: repeat exactly `minimum` times.
+    if (!match(TokenType::Comma)) {
+        ByteCode repeated;
+        repeated.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
+        bytecode_to_repeat = move(repeated);
+
+        consume(TokenType::RightCurly, Error::MismatchingBrace);
+        return !has_error();
+    }
+    consume();
+
+    // The upper bound is optional; when given it must not be below the lower bound.
+    Optional<size_t> maybe_maximum {};
+    StringBuilder maximum_builder;
+    append_char_tokens(maximum_builder);
+    if (!maximum_builder.is_empty()) {
+        auto value = maximum_builder.build().to_uint();
+        if (!value.has_value() || minimum > value.value())
+            return set_error(Error::InvalidBraceContent);
+
+        maybe_maximum = value.value();
+    }
+
+    ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
+
+    consume(TokenType::RightCurly, Error::MismatchingBrace);
+    return !has_error();
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
+{
+    Vector<CompareTypeAndValuePair> values;
+    if (!AbstractPosixParser::parse_bracket_expression(values, match_length_minimum))
+        return false;
+
    stack.insert_bytecode_compare_values(move(values));

    return !has_error();
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@ -19,6 +19,7 @@
 namespace regex {

 class PosixExtendedParser;
+class PosixBasicParser;
 class ECMA262Parser;

 template<typename T>
@ -34,6 +35,10 @@ template<>
 struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
 };

+// The Basic POSIX parser uses the same option type (PosixOptions) as the extended one.
+template<>
+struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
+};
+
 template<>
 struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
 };
@ -112,18 +117,62 @@ protected:
    ParserState m_parser_state;
 };

-class PosixExtendedParser final : public Parser {
-public:
-    explicit PosixExtendedParser(Lexer& lexer)
+// Common base of PosixBasicParser and PosixExtendedParser; holds the bracket-expression parsing they share.
+// The constructors are protected, so it can only be constructed through a derived parser.
+class AbstractPosixParser : public Parser {
+protected:
+    explicit AbstractPosixParser(Lexer& lexer)
        : Parser(lexer)
    {
    }

-    PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
+    AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
        : Parser(lexer, regex_options.value_or({}))
    {
    }

+    // Parses a bracket expression into the given vector of compare entries; the size_t is the
+    // caller's minimum match length, which the parser adjusts.
+    ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
+};
+
+// Parser for POSIX Basic Regular Expressions (BRE).
+class PosixBasicParser final : public AbstractPosixParser {
+public:
+    explicit PosixBasicParser(Lexer& lexer)
+        : AbstractPosixParser(lexer)
+    {
+    }
+
+    // An empty `regex_options` falls back to default-constructed options.
+    PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
+        : AbstractPosixParser(lexer, regex_options.value_or({}))
+    {
+    }
+
+    ~PosixBasicParser() = default;
+
+private:
+    bool parse_internal(ByteCode&, size_t&) override;
+
+    // One method per BRE grammar production. Each appends to the ByteCode and updates the
+    // minimum match length passed as the second argument.
+    bool parse_root(ByteCode&, size_t&);
+    bool parse_re_expression(ByteCode&, size_t&);
+    bool parse_simple_re(ByteCode&, size_t&);
+    bool parse_nonduplicating_re(ByteCode&, size_t&);
+    bool parse_one_char_or_collation_element(ByteCode&, size_t&);
+
+    // Number of "\(" groups opened so far; group numbers are 1-based.
+    size_t m_capture_group { 0 };
+    // Only groups 1 through 9 can be captured and back-referenced ("\1" .. "\9").
+    constexpr static size_t number_of_addressable_capture_groups = 9;
+    // Per-group bookkeeping, indexed by (group number - 1): the group's minimum match length,
+    // and whether the group has been fully parsed (and may therefore be back-referenced).
+    size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
+    bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
+};
+
+class PosixExtendedParser final : public AbstractPosixParser {
+public:
+    explicit PosixExtendedParser(Lexer& lexer)
+        : AbstractPosixParser(lexer)
+    {
+    }
+
+    PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
+        : AbstractPosixParser(lexer, regex_options.value_or({}))
+    {
+    }
+
    ~PosixExtendedParser() = default;

 private:
@ -197,9 +246,11 @@ private:
 };

 using PosixExtended = PosixExtendedParser;
+using PosixBasic = PosixBasicParser;
 using ECMA262 = ECMA262Parser;

 }

 using regex::ECMA262;
+using regex::PosixBasic;
 using regex::PosixExtended;