Regex: abandon bytecode and just use a simple list of instructions

Makes the code simpler.
2024-12-19 17:31:44 +03:00 · 2017-10-07 18:51:32 +08:00 · 2017-10-07 18:51:32 +08:00 · 732b8bc2a4
commit 732b8bc2a4
parent 6434bca325
2 changed files with 129 additions and 179 deletions
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@ -505,7 +505,7 @@ struct RegexCompiler
        : m_parsed_regex{parsed_regex}, m_forward{direction == MatchDirection::Forward}
    {
        compile_node(m_parsed_regex.ast);
-        push_op(CompiledRegex::Match);
+        push_inst(CompiledRegex::Match);
        m_program.matchers = m_parsed_regex.matchers;
        m_program.save_count = m_parsed_regex.capture_count * 2;
        m_program.direction = direction;
@ -515,34 +515,30 @@ struct RegexCompiler
    CompiledRegex get_compiled_regex() { return std::move(m_program); }

 private:
-    using Offset = CompiledRegex::Offset;

-    Offset compile_node_inner(const ParsedRegex::AstNodePtr& node)
+    uint32_t compile_node_inner(const ParsedRegex::AstNodePtr& node)
    {
-        const auto start_pos = m_program.bytecode.size();
+        const auto start_pos = m_program.instructions.size();

        const Codepoint capture = (node->op == ParsedRegex::Alternation or node->op == ParsedRegex::Sequence) ? node->value : -1;
        if (capture != -1)
-        {
-            push_op(CompiledRegex::Save);
-            push_byte(capture * 2 + (m_forward ? 0 : 1));
-        }
+            push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 0 : 1));

-        Vector<Offset> goto_inner_end_offsets;
+        Vector<uint32_t> goto_inner_end_offsets;
        switch (node->op)
        {
            case ParsedRegex::Literal:
-                push_op(node->ignore_case ? CompiledRegex::LiteralIgnoreCase
-                                          : CompiledRegex::Literal);
-                push_codepoint(node->ignore_case ? to_lower(node->value)
-                                                 : node->value);
+                if (node->ignore_case)
+                    push_inst(CompiledRegex::LiteralIgnoreCase, to_lower(node->value));
+                else
+                    push_inst(CompiledRegex::Literal, node->value);
                break;
            case ParsedRegex::AnyChar:
-                push_op(CompiledRegex::AnyChar);
+                push_inst(CompiledRegex::AnyChar);
                break;
            case ParsedRegex::Matcher:
-                push_op(CompiledRegex::Matcher);
-                push_byte(node->value);
+                push_inst(CompiledRegex::Matcher, node->value);
+                break;
            case ParsedRegex::Sequence:
            {
                if (m_forward)
@ -558,82 +554,77 @@ private:
                auto& children = node->children;
                kak_assert(children.size() == 2);

-                push_op(CompiledRegex::Split_PrioritizeParent);
-                auto offset = alloc_offset();
+                auto split_pos = push_inst(CompiledRegex::Split_PrioritizeParent);

                compile_node(children[m_forward ? 0 : 1]);
-                push_op(CompiledRegex::Jump);
-                goto_inner_end_offsets.push_back(alloc_offset());
+                auto left_pos = push_inst(CompiledRegex::Jump);
+                goto_inner_end_offsets.push_back(left_pos);

                auto right_pos = compile_node(children[m_forward ? 1 : 0]);
-                set_offset(offset, right_pos);
+                m_program.instructions[split_pos].param = right_pos;

                break;
            }
            case ParsedRegex::LookAhead:
-                push_op(m_forward ? CompiledRegex::LookAhead
-                                  : CompiledRegex::LookBehind);
-                push_string(node->children, false);
+                push_inst(m_forward ? CompiledRegex::LookAhead
+                                    : CompiledRegex::LookBehind,
+                          push_lookaround(node->children, false));
                break;
            case ParsedRegex::NegativeLookAhead:
-                push_op(m_forward ? CompiledRegex::NegativeLookAhead
-                                  : CompiledRegex::NegativeLookBehind);
-                push_string(node->children, false);
+                push_inst(m_forward ? CompiledRegex::NegativeLookAhead
+                                    : CompiledRegex::NegativeLookBehind,
+                          push_lookaround(node->children, false));
                break;
            case ParsedRegex::LookBehind:
-                push_op(m_forward ? CompiledRegex::LookBehind
-                                  : CompiledRegex::LookAhead);
-                push_string(node->children, true);
+                push_inst(m_forward ? CompiledRegex::LookBehind
+                                    : CompiledRegex::LookAhead,
+                          push_lookaround(node->children, true));
                break;
            case ParsedRegex::NegativeLookBehind:
-                push_op(m_forward ? CompiledRegex::NegativeLookBehind
-                                  : CompiledRegex::NegativeLookAhead);
-                push_string(node->children, true);
+                push_inst(m_forward ? CompiledRegex::NegativeLookBehind
+                                    : CompiledRegex::NegativeLookAhead,
+                          push_lookaround(node->children, true));
                break;
            case ParsedRegex::LineStart:
-                push_op(m_forward ? CompiledRegex::LineStart
-                                  : CompiledRegex::LineEnd);
+                push_inst(m_forward ? CompiledRegex::LineStart
+                                    : CompiledRegex::LineEnd);
                break;
            case ParsedRegex::LineEnd:
-                push_op(m_forward ? CompiledRegex::LineEnd
-                                  : CompiledRegex::LineStart);
+                push_inst(m_forward ? CompiledRegex::LineEnd
+                                    : CompiledRegex::LineStart);
                break;
            case ParsedRegex::WordBoundary:
-                push_op(CompiledRegex::WordBoundary);
+                push_inst(CompiledRegex::WordBoundary);
                break;
            case ParsedRegex::NotWordBoundary:
-                push_op(CompiledRegex::NotWordBoundary);
+                push_inst(CompiledRegex::NotWordBoundary);
                break;
            case ParsedRegex::SubjectBegin:
-                push_op(m_forward ? CompiledRegex::SubjectBegin
-                                  : CompiledRegex::SubjectEnd);
+                push_inst(m_forward ? CompiledRegex::SubjectBegin
+                                    : CompiledRegex::SubjectEnd);
                break;
            case ParsedRegex::SubjectEnd:
-                push_op(m_forward ? CompiledRegex::SubjectEnd
-                                  : CompiledRegex::SubjectBegin);
+                push_inst(m_forward ? CompiledRegex::SubjectEnd
+                                    : CompiledRegex::SubjectBegin);
                break;
            case ParsedRegex::ResetStart:
-                push_op(CompiledRegex::Save);
-                push_byte(0);
+                push_inst(CompiledRegex::Save, 0);
                break;
        }

        for (auto& offset : goto_inner_end_offsets)
-            set_offset(offset, m_program.bytecode.size());
+            m_program.instructions[offset].param = m_program.instructions.size();

        if (capture != -1)
-        {
-            push_op(CompiledRegex::Save);
-            push_byte(capture * 2 + (m_forward ? 1 : 0));
-        }
+            push_inst(CompiledRegex::Save, capture * 2 + (m_forward ? 1 : 0));

        return start_pos;
    }

-    Offset compile_node(const ParsedRegex::AstNodePtr& node)
+    uint32_t compile_node(const ParsedRegex::AstNodePtr& node)
    {
-        Offset pos = m_program.bytecode.size();
-        Vector<Offset> goto_end_offsets;
+        uint32_t pos = m_program.instructions.size();
+        Vector<uint32_t> goto_ends;

        auto& quantifier = node->quantifier;

@ -641,9 +632,9 @@ private:

        if (quantifier.allows_none())
        {
-            push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
-                                      : CompiledRegex::Split_PrioritizeChild);
-            goto_end_offsets.push_back(alloc_offset());
+            auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
+                                                         : CompiledRegex::Split_PrioritizeChild);
+            goto_ends.push_back(split_pos);
        }

        auto inner_pos = compile_node_inner(node);
@ -652,66 +643,45 @@ private:
            inner_pos = compile_node_inner(node);

        if (quantifier.allows_infinite_repeat())
-        {
-            push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
-                                      : CompiledRegex::Split_PrioritizeParent);
-            set_offset(alloc_offset(), inner_pos);
-        }
+            push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeChild
+                                        : CompiledRegex::Split_PrioritizeParent,
+                      inner_pos);
+
        // Write the node as an optional match for the min -> max counts
        else for (int i = std::max(1, quantifier.min); // STILL UGLY !
                  i < quantifier.max; ++i)
        {
-            push_op(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
-                                      : CompiledRegex::Split_PrioritizeChild);
-            goto_end_offsets.push_back(alloc_offset());
+            auto split_pos = push_inst(quantifier.greedy ? CompiledRegex::Split_PrioritizeParent
+                                                         : CompiledRegex::Split_PrioritizeChild);
+            goto_ends.push_back(split_pos);
            compile_node_inner(node);
        }

-        for (auto offset : goto_end_offsets)
-            set_offset(offset, m_program.bytecode.size());
+        for (auto offset : goto_ends)
+            m_program.instructions[offset].param = m_program.instructions.size();

        return pos;
    }

-    Offset alloc_offset()
+    uint32_t push_inst(CompiledRegex::Op op, uint32_t param = 0)
    {
-        auto pos = m_program.bytecode.size();
-        m_program.bytecode.resize(pos + sizeof(Offset));
-        return pos;
+        uint32_t res = m_program.instructions.size();
+        m_program.instructions.push_back({ op, param });
+        return res;
    }

-    void set_offset(Offset pos, Offset value)
+    uint32_t push_lookaround(const Vector<ParsedRegex::AstNodePtr>& literals, bool reversed = false)
    {
-        memcpy(&m_program.bytecode[pos], &value, sizeof(Offset));
-    }
-
-    void push_op(CompiledRegex::Op op)
-    {
-        m_program.bytecode.push_back(op);
-    }
-
-    void push_byte(char byte)
-    {
-        m_program.bytecode.push_back(byte);
-    }
-
-    void push_codepoint(Codepoint cp)
-    {
-        utf8::dump(std::back_inserter(m_program.bytecode), cp);
-    }
-
-    void push_string(const Vector<ParsedRegex::AstNodePtr>& codepoints, bool reversed = false)
-    {
-        if (codepoints.size() > 127)
-            throw runtime_error{"Too long literal string"};
-
-        push_byte(codepoints.size());
+        uint32_t res = m_program.lookarounds.size();
        if (reversed)
-            for (auto& cp : codepoints | reverse()) 
-                push_codepoint(cp->value);
+            for (auto& literal : literals | reverse()) 
+                m_program.lookarounds.push_back(literal->value);
        else
-            for (auto& cp : codepoints) 
-                push_codepoint(cp->value);
+            for (auto& literal : literals) 
+                m_program.lookarounds.push_back(literal->value);
+
+        m_program.lookarounds.push_back((Codepoint)-1);
+        return res;
    }

    // Fills accepted and rejected according to which chars can start the given node,
@ -804,40 +774,35 @@ private:

 void dump_regex(const CompiledRegex& program)
 {
-    for (auto pos = program.bytecode.data(), end = program.bytecode.data() + program.bytecode.size();
-         pos < end; )
+    for (auto& inst : program.instructions)
    {
-        printf("%4zd    ", pos - program.bytecode.data());
-        const auto op = (CompiledRegex::Op)*pos++;
-        switch (op)
+        switch (inst.op)
        {
            case CompiledRegex::Literal:
-                printf("literal %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
+                printf("literal %lc\n", inst.param);
                break;
            case CompiledRegex::LiteralIgnoreCase:
-                printf("literal (ignore case) %lc\n", utf8::read_codepoint(pos, (const char*)nullptr));
+                printf("literal (ignore case) %lc\n", inst.param);
                break;
            case CompiledRegex::AnyChar:
                printf("any char\n");
                break;
            case CompiledRegex::Jump:
-                printf("jump %u\n", *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
-                pos += sizeof(CompiledRegex::Offset);
+                printf("jump %u\n", inst.param);
                break;
            case CompiledRegex::Split_PrioritizeParent:
            case CompiledRegex::Split_PrioritizeChild:
            {
                printf("split (prioritize %s) %u\n",
-                       op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
-                       *reinterpret_cast<const CompiledRegex::Offset*>(&*pos));
-                pos += sizeof(CompiledRegex::Offset);
+                       inst.op == CompiledRegex::Split_PrioritizeParent ? "parent" : "child",
+                       inst.param);
                break;
            }
            case CompiledRegex::Save:
-                printf("save %d\n", *pos++);
+                printf("save %d\n", inst.param);
                break;
            case CompiledRegex::Matcher:
-                printf("matcher %d\n", *pos++);
+                printf("matcher %d\n", inst.param);
                break;
            case CompiledRegex::LineStart:
                printf("line start\n");
@ -862,20 +827,20 @@ void dump_regex(const CompiledRegex& program)
            case CompiledRegex::LookBehind:
            case CompiledRegex::NegativeLookBehind:
            {
-                int count = *pos++;
-                StringView str{pos, pos + count};
                const char* name = nullptr;
-                if (op == CompiledRegex::LookAhead)
+                if (inst.op == CompiledRegex::LookAhead)
                    name = "look ahead";
-                if (op == CompiledRegex::NegativeLookAhead)
+                if (inst.op == CompiledRegex::NegativeLookAhead)
                    name = "negative look ahead";
-                if (op == CompiledRegex::LookBehind)
+                if (inst.op == CompiledRegex::LookBehind)
                    name = "look behind";
-                if (op == CompiledRegex::NegativeLookBehind)
+                if (inst.op == CompiledRegex::NegativeLookBehind)
                    name = "negative look behind";

-                printf("%s (%s)\n", name, (const char*)str.zstr());
-                pos += count;
+                String str;
+                for (auto it = program.lookarounds.begin() + inst.param; *it != -1; ++it)
+                    utf8::dump(std::back_inserter(str), *it);
+                printf("%s (%s)\n", name, str.c_str());
                break;
            }
            case CompiledRegex::Match:
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@ -45,11 +45,17 @@ struct CompiledRegex : RefCountable
        NegativeLookBehind,
    };

-    using Offset = unsigned;
-    explicit operator bool() const { return not bytecode.empty(); }
+    struct Instruction
+    {
+        Op op;
+        uint32_t param;
+    };

-    Vector<char> bytecode;
+    explicit operator bool() const { return not instructions.empty(); }
+
+    Vector<Instruction> instructions;
    Vector<std::function<bool (Codepoint)>> matchers;
+    Vector<Codepoint> lookarounds;
    MatchDirection direction;
    size_t save_count;

@ -123,7 +129,7 @@ public:
            return false;

        Vector<Thread> current_threads, next_threads;
-        std::unique_ptr<bool[]> inst_processed{new bool[m_program.bytecode.size()]};
+        std::unique_ptr<bool[]> processed_inst{new bool[m_program.instructions.size()]};

        const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
        Utf8It start{m_begin};
@ -134,7 +140,7 @@ public:
            to_next_start(start, m_end, start_chars);

        if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
-                      current_threads, next_threads, inst_processed.get()))
+                      current_threads, next_threads, processed_inst.get()))
            return true;

        if (not (flags & RegexExecFlags::Search))
@ -144,7 +150,7 @@ public:
        {
            to_next_start(++start, m_end, start_chars);
            if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
-                          current_threads, next_threads, inst_processed.get()))
+                          current_threads, next_threads, processed_inst.get()))
                return true;
        }
        while (start != m_end);
@ -200,7 +206,7 @@ private:

    struct Thread
    {
-        const char* inst;
+        uint32_t inst;
        Saves* saves;
    };

@ -209,58 +215,49 @@ private:
    enum class StepResult { Consumed, Matched, Failed };

    // Steps a thread until it consumes the current character, matches, or fails
-    StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads, bool* inst_processed)
+    StepResult step(const Utf8It& pos, Thread& thread, Vector<Thread>& threads, bool* processed_inst)
    {
-        const auto prog_start = m_program.bytecode.data();
-        const auto prog_end = prog_start + m_program.bytecode.size();
        while (true)
        {
-            // If we have hit this instruction on this character, in this thread or another, do not try again
-            const auto inst_offset = thread.inst - prog_start;
-            if (inst_processed[inst_offset])
+            if (processed_inst[thread.inst])
                return StepResult::Failed;
-            inst_processed[inst_offset] = true;
+            processed_inst[thread.inst] = true;
+
+            auto& inst = m_program.instructions[thread.inst++];

            const Codepoint cp = pos == m_end ? 0 : *pos;
-            const CompiledRegex::Op op = (CompiledRegex::Op)*thread.inst++;
-            switch (op)
+            switch (inst.op)
            {
                case CompiledRegex::Literal:
-                    if (utf8::read_codepoint(thread.inst, prog_end) == cp)
+                    if (inst.param == cp)
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::LiteralIgnoreCase:
-                    if (utf8::read_codepoint(thread.inst, prog_end) == to_lower(cp))
+                    if (inst.param == to_lower(cp))
                        return StepResult::Consumed;
                    return StepResult::Failed;
                case CompiledRegex::AnyChar:
                    return StepResult::Consumed;
                case CompiledRegex::Jump:
-                    thread.inst = prog_start + get_offset(thread.inst);
+                    thread.inst = inst.param;
                    break;
                case CompiledRegex::Split_PrioritizeParent:
                {
-                    auto parent = thread.inst + sizeof(CompiledRegex::Offset);
-                    auto child = prog_start + get_offset(thread.inst);
-                    thread.inst = parent;
                    if (thread.saves)
                        ++thread.saves->refcount;
-                    threads.push_back({child, thread.saves});
+                    threads.push_back({inst.param, thread.saves});
                    break;
                }
                case CompiledRegex::Split_PrioritizeChild:
                {
-                    auto parent = thread.inst + sizeof(CompiledRegex::Offset);
-                    auto child = prog_start + get_offset(thread.inst);
-                    thread.inst = child;
                    if (thread.saves)
                        ++thread.saves->refcount;
-                    threads.push_back({parent, thread.saves});
+                    threads.push_back({thread.inst, thread.saves});
+                    thread.inst = inst.param;
                    break;
                }
                case CompiledRegex::Save:
                {
-                    const size_t index = *thread.inst++;
                    if (thread.saves == nullptr)
                        break;
                    if (thread.saves->refcount > 1)
@ -268,15 +265,12 @@ private:
                        --thread.saves->refcount;
                        thread.saves = new_saves<true>(thread.saves->pos);
                    }
-                    thread.saves->pos[index] = get_base(pos);
+                    thread.saves->pos[inst.param] = get_base(pos);
                    break;
                }
                case CompiledRegex::Matcher:
-                {
-                    const int matcher_id = *thread.inst++;
-                    return m_program.matchers[matcher_id](cp) ?
+                    return m_program.matchers[inst.param](cp) ?
                        StepResult::Consumed : StepResult::Failed;
-                }
                case CompiledRegex::LineStart:
                    if (not is_line_start(pos))
                        return StepResult::Failed;
@ -304,27 +298,25 @@ private:
                case CompiledRegex::LookAhead:
                case CompiledRegex::NegativeLookAhead:
                {
-                    int count = *thread.inst++;
-                    for (auto it = pos; count and it != m_end; ++it, --count)
-                        if (*it != utf8::read(thread.inst))
+                    auto ref = m_program.lookarounds.begin() + inst.param;
+                    for (auto it = pos; *ref != -1 and it != m_end; ++it, ++ref)
+                        if (*it != *ref)
                            break;
-                    if ((op == CompiledRegex::LookAhead and count != 0) or
-                        (op == CompiledRegex::NegativeLookAhead and count == 0))
+                    if ((inst.op == CompiledRegex::LookAhead and *ref != -1) or
+                        (inst.op == CompiledRegex::NegativeLookAhead and *ref == -1))
                        return StepResult::Failed;
-                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
                    break;
                }
                case CompiledRegex::LookBehind:
                case CompiledRegex::NegativeLookBehind:
                {
-                    int count = *thread.inst++;
-                    for (auto it = pos-1; count and it >= m_begin; --it, --count)
-                        if (*it != utf8::read(thread.inst))
+                    auto ref = m_program.lookarounds.begin() + inst.param;
+                    for (auto it = pos-1; *ref != -1 and it >= m_begin; --it, ++ref)
+                        if (*it != *ref)
                            break;
-                    if ((op == CompiledRegex::LookBehind and count != 0) or
-                        (op == CompiledRegex::NegativeLookBehind and count == 0))
+                    if ((inst.op == CompiledRegex::LookBehind and *ref != -1) or
+                        (inst.op == CompiledRegex::NegativeLookBehind and *ref == -1))
                        return StepResult::Failed;
-                    thread.inst = utf8::advance(thread.inst, prog_end, CharCount{count - 1});
                    break;
                }
                case CompiledRegex::Match:
@ -334,20 +326,20 @@ private:
        return StepResult::Failed;
    }

-    bool exec_from(const Utf8It& start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads, bool* inst_processed)
+    bool exec_from(const Utf8It& start, Saves* initial_saves, Vector<Thread>& current_threads, Vector<Thread>& next_threads, bool* processed_inst)
    {
-        current_threads.push_back({m_program.bytecode.data(), initial_saves});
+        current_threads.push_back({0, initial_saves});
        next_threads.clear();

        bool found_match = false;
        for (Utf8It pos = start; pos != m_end; ++pos)
        {
-            memset(inst_processed, 0, m_program.bytecode.size() * sizeof(bool));
+            memset(processed_inst, 0, sizeof(bool) * m_program.instructions.size());
            while (not current_threads.empty())
            {
                auto thread = current_threads.back();
                current_threads.pop_back();
-                switch (step(pos, thread, current_threads, inst_processed))
+                switch (step(pos, thread, current_threads, processed_inst))
                {
                case StepResult::Matched:
                    if (not (m_flags & RegexExecFlags::Search) or // We are not at end, this is not a full match
@ -385,13 +377,13 @@ private:
        if (found_match)
            return true;

-        memset(inst_processed, 0, m_program.bytecode.size() * sizeof(bool));
+        memset(processed_inst, 0, sizeof(bool) * m_program.instructions.size());
        // Step remaining threads to see if they match without consuming anything else
        while (not current_threads.empty())
        {
            auto thread = current_threads.back();
            current_threads.pop_back();
-            if (step(m_end, thread, current_threads, inst_processed) == StepResult::Matched)
+            if (step(m_end, thread, current_threads, processed_inst) == StepResult::Matched)
            {
                release_saves(m_captures);
                m_captures = thread.saves;
@ -411,13 +403,6 @@ private:
            ++start;
    }

-    static CompiledRegex::Offset get_offset(const char* ptr)
-    {
-        CompiledRegex::Offset res;
-        memcpy(&res, ptr, sizeof(CompiledRegex::Offset));
-        return res;
-    }
-
    bool is_line_start(const Utf8It& pos) const
    {
        return (pos == m_begin and not (m_flags & RegexExecFlags::NotBeginOfLine)) or