1
1
mirror of https://github.com/mawww/kakoune.git synced 2024-08-17 00:30:26 +03:00

Store instruction pointers directly in ThreadedRegexVM::Thread

The previous tradeoff of having a very small Thread struct is not
necessary anymore as we do not memcpy Threads on swap_next since
d708b77186.

This requires offsets to be used instead of indices for jump/split
ops.
This commit is contained in:
Maxime Coste 2024-06-15 14:03:10 +10:00
parent c84942c2ac
commit c4684d0d84
2 changed files with 42 additions and 31 deletions

View File

@ -706,7 +706,7 @@ private:
{ {
auto& node = get_node(index); auto& node = get_node(index);
const uint32_t start_pos = (uint32_t)m_program.instructions.size(); const OpIndex start_pos = op_count();
const bool ignore_case = node.ignore_case; const bool ignore_case = node.ignore_case;
const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and const bool save = (node.op == ParsedRegex::Alternation or node.op == ParsedRegex::Sequence) and
@ -746,14 +746,14 @@ private:
if (child != index+1) if (child != index+1)
push_inst(CompiledRegex::Split); push_inst(CompiledRegex::Split);
} }
auto split_pos = m_program.instructions.size(); auto split_pos = op_count();
const auto end = node.children_end; const auto end = node.children_end;
for (auto child : Children<>{m_parsed_regex, index}) for (auto child : Children<>{m_parsed_regex, index})
{ {
auto node = compile_node<direction>(child); auto node = compile_node<direction>(child);
if (child != index+1) if (child != index+1)
m_program.instructions[--split_pos].param.split = CompiledRegex::Param::Split{.target = node, .prioritize_parent = true}; m_program.instructions[--split_pos].param.split = CompiledRegex::Param::Split{.offset = offset(node, split_pos), .prioritize_parent = true};
if (get_node(child).children_end != end) if (get_node(child).children_end != end)
{ {
auto jump = push_inst(CompiledRegex::Jump); auto jump = push_inst(CompiledRegex::Jump);
@ -801,8 +801,8 @@ private:
break; break;
} }
for (auto& offset : goto_inner_end_offsets) for (auto& index : goto_inner_end_offsets)
m_program.instructions[offset].param.jump_target = m_program.instructions.size(); m_program.instructions[index].param.jump_offset = offset(op_count(), index);
if (save) if (save)
push_inst(CompiledRegex::Save, {.save_index=int16_t(node.value * 2 + (forward ? 1 : 0))}); push_inst(CompiledRegex::Save, {.save_index=int16_t(node.value * 2 + (forward ? 1 : 0))});
@ -810,19 +810,29 @@ private:
return start_pos; return start_pos;
} }
// Number of instructions currently stored in m_program.instructions,
// converted to the OpIndex type used to refer to instructions.
OpIndex op_count() const
{
    const auto emitted = m_program.instructions.size();
    return static_cast<OpIndex>(emitted);
}
// Relative distance going from instruction index `from` to instruction
// index `to` (negative when `to` precedes `from`), converted to OpIndex.
// Callers store the result in jump/split params so that targets are
// expressed relative to the instruction holding them.
static OpIndex offset(OpIndex to, OpIndex from)
{
    const auto distance = to - from;
    return static_cast<OpIndex>(distance);
}
template<RegexMode direction> template<RegexMode direction>
OpIndex compile_node(ParsedRegex::NodeIndex index) OpIndex compile_node(ParsedRegex::NodeIndex index)
{ {
auto& node = get_node(index); auto& node = get_node(index);
const OpIndex start_pos = (OpIndex)m_program.instructions.size(); const OpIndex start_pos = op_count();
Vector<OpIndex> goto_ends; Vector<OpIndex> goto_ends;
auto& quantifier = node.quantifier; auto& quantifier = node.quantifier;
if (quantifier.allows_none()) if (quantifier.allows_none())
{ {
auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}}); auto split_pos = push_inst(CompiledRegex::Split, {.split={.offset=0, .prioritize_parent=quantifier.greedy}});
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
} }
@ -832,18 +842,18 @@ private:
inner_pos = compile_node_inner<direction>(index); inner_pos = compile_node_inner<direction>(index);
if (quantifier.allows_infinite_repeat()) if (quantifier.allows_infinite_repeat())
push_inst(CompiledRegex::Split, {.split = {.target=inner_pos, .prioritize_parent=not quantifier.greedy}}); push_inst(CompiledRegex::Split, {.split = {.offset=offset(inner_pos, op_count()), .prioritize_parent=not quantifier.greedy}});
// Write the node as an optional match for the min -> max counts // Write the node as an optional match for the min -> max counts
else for (int i = std::max((int16_t)1, quantifier.min); // STILL UGLY ! else for (int i = std::max((int16_t)1, quantifier.min); // STILL UGLY !
i < quantifier.max; ++i) i < quantifier.max; ++i)
{ {
auto split_pos = push_inst(CompiledRegex::Split, {.split={.target=0, .prioritize_parent=quantifier.greedy}}); auto split_pos = push_inst(CompiledRegex::Split, {.split={.offset=0, .prioritize_parent=quantifier.greedy}});
goto_ends.push_back(split_pos); goto_ends.push_back(split_pos);
compile_node_inner<direction>(index); compile_node_inner<direction>(index);
} }
for (auto offset : goto_ends) for (auto index : goto_ends)
m_program.instructions[offset].param.split.target = m_program.instructions.size(); m_program.instructions[index].param.split.offset = offset(op_count(), index);
return start_pos; return start_pos;
} }
@ -851,7 +861,7 @@ private:
OpIndex push_inst(CompiledRegex::Op op, CompiledRegex::Param param = {}) OpIndex push_inst(CompiledRegex::Op op, CompiledRegex::Param param = {})
{ {
constexpr auto max_instructions = std::numeric_limits<OpIndex>::max(); constexpr auto max_instructions = std::numeric_limits<OpIndex>::max();
const auto res = m_program.instructions.size(); const auto res = op_count();
if (res >= max_instructions) if (res >= max_instructions)
throw regex_error(format("regex compiled to more than {} instructions", max_instructions)); throw regex_error(format("regex compiled to more than {} instructions", max_instructions));
m_program.instructions.push_back({ op, 0, param }); m_program.instructions.push_back({ op, 0, param });
@ -1031,7 +1041,7 @@ private:
{ {
auto& inst = m_program.instructions[i]; auto& inst = m_program.instructions[i];
if (is_jump(inst.op)) if (is_jump(inst.op))
m_program.instructions[inst.param.jump_target].last_step = 0xffff; // tag as jump target m_program.instructions[i + inst.param.jump_offset].last_step = 0xffff; // tag as jump target
} }
for (auto block_begin = begin; block_begin < end; ) for (auto block_begin = begin; block_begin < end; )
@ -1071,11 +1081,11 @@ private:
String dump_regex(const CompiledRegex& program) String dump_regex(const CompiledRegex& program)
{ {
String res; String res;
int count = 0; int index = 0;
for (auto& inst : program.instructions) for (auto& inst : program.instructions)
{ {
char buf[20]; char buf[20];
format_to(buf, " {:03} ", count++); format_to(buf, " {:03} ", index);
res += buf; res += buf;
switch (inst.op) switch (inst.op)
{ {
@ -1095,13 +1105,13 @@ String dump_regex(const CompiledRegex& program)
res += format("character type {}\n", to_underlying(inst.param.character_type)); res += format("character type {}\n", to_underlying(inst.param.character_type));
break; break;
case CompiledRegex::Jump: case CompiledRegex::Jump:
res += format("jump {}\n", inst.param.jump_target); res += format("jump {} ({:03})\n", inst.param.jump_offset, index + inst.param.jump_offset);
break; break;
case CompiledRegex::Split: case CompiledRegex::Split:
{ {
res += format("split (prioritize {}) {}\n", res += format("split (prioritize {}) {} ({:03})\n",
(inst.param.split.prioritize_parent) ? "parent" : "child", (inst.param.split.prioritize_parent) ? "parent" : "child",
inst.param.split.target); inst.param.split.offset, index + inst.param.split.offset);
break; break;
} }
case CompiledRegex::Save: case CompiledRegex::Save:
@ -1135,6 +1145,7 @@ String dump_regex(const CompiledRegex& program)
case CompiledRegex::Match: case CompiledRegex::Match:
res += "match\n"; res += "match\n";
} }
++index;
} }
auto dump_start_desc = [&](const CompiledRegex::StartDesc& desc, StringView name) { auto dump_start_desc = [&](const CompiledRegex::StartDesc& desc, StringView name) {
res += name + " start desc: ["; res += name + " start desc: [";

View File

@ -106,11 +106,11 @@ struct CompiledRegex : UseMemoryDomain<MemoryDomain::Regex>
} literal; } literal;
int16_t character_class_index; int16_t character_class_index;
CharacterType character_type; CharacterType character_type;
int16_t jump_target; int16_t jump_offset;
int16_t save_index; int16_t save_index;
struct Split struct Split
{ {
int16_t target; int16_t offset;
bool prioritize_parent : 1; bool prioritize_parent : 1;
} split; } split;
bool line_start; bool line_start;
@ -351,10 +351,10 @@ private:
--saves.refcount; --saves.refcount;
}; };
struct alignas(int32_t) Thread struct Thread
{ {
int16_t inst; const CompiledRegex::Instruction* inst;
int16_t saves; int saves;
}; };
using StartDesc = CompiledRegex::StartDesc; using StartDesc = CompiledRegex::StartDesc;
@ -370,7 +370,7 @@ private:
// Steps a thread until it consumes the current character, matches or fail // Steps a thread until it consumes the current character, matches or fail
[[gnu::always_inline]] [[gnu::always_inline]]
void step_thread(const CompiledRegex::Instruction* instructions, const Iterator& pos, Codepoint cp, void step_thread(const Iterator& pos, Codepoint cp,
uint16_t current_step, Thread thread, const ExecConfig& config) uint16_t current_step, Thread thread, const ExecConfig& config)
{ {
auto failed = [this, &thread]() { auto failed = [this, &thread]() {
@ -382,7 +382,7 @@ private:
while (true) while (true)
{ {
auto& inst = instructions[thread.inst++]; auto& inst = *thread.inst++;
// if this instruction was already executed for this step in another thread, // if this instruction was already executed for this step in another thread,
// then this thread is redundant and can be dropped // then this thread is redundant and can be dropped
if (inst.last_step == current_step) if (inst.last_step == current_step)
@ -424,11 +424,11 @@ private:
return failed(); return failed();
return is_ctype(inst.param.character_type, cp) ? consumed() : failed(); return is_ctype(inst.param.character_type, cp) ? consumed() : failed();
case CompiledRegex::Jump: case CompiledRegex::Jump:
thread.inst = inst.param.jump_target; thread.inst = &inst + inst.param.jump_offset;
break; break;
case CompiledRegex::Split: case CompiledRegex::Split:
if (auto target = inst.param.split.target; if (auto* target = &inst + inst.param.split.offset;
instructions[target].last_step != current_step) target->last_step != current_step)
{ {
if (thread.saves >= 0) if (thread.saves >= 0)
++m_saves[thread.saves].refcount; ++m_saves[thread.saves].refcount;
@ -478,7 +478,8 @@ private:
m_captures = -1; m_captures = -1;
m_threads.ensure_initial_capacity(); m_threads.ensure_initial_capacity();
const int16_t first_inst = forward ? 0 : m_program.first_backward_inst; ConstArrayView<CompiledRegex::Instruction> insts{m_program.instructions};
const auto* first_inst = insts.begin() + (forward ? 0 : m_program.first_backward_inst);
m_threads.push_current({first_inst, -1}); m_threads.push_current({first_inst, -1});
const auto* start_desc = (forward ? m_program.forward_start_desc : m_program.backward_start_desc).get(); const auto* start_desc = (forward ? m_program.forward_start_desc : m_program.backward_start_desc).get();
@ -486,7 +487,6 @@ private:
constexpr bool search = mode & RegexMode::Search; constexpr bool search = mode & RegexMode::Search;
constexpr bool any_match = mode & RegexMode::AnyMatch; constexpr bool any_match = mode & RegexMode::AnyMatch;
ConstArrayView<CompiledRegex::Instruction> insts{m_program.instructions};
uint16_t current_step = -1; uint16_t current_step = -1;
m_found_match = false; m_found_match = false;
while (true) // Iterate on all codepoints and once at the end while (true) // Iterate on all codepoints and once at the end
@ -506,7 +506,7 @@ private:
Codepoint cp = codepoint(next, config); Codepoint cp = codepoint(next, config);
while (not m_threads.current_is_empty()) while (not m_threads.current_is_empty())
step_thread(insts.pointer(), pos, cp, current_step, m_threads.pop_current(), config); step_thread(pos, cp, current_step, m_threads.pop_current(), config);
if (pos == config.end or if (pos == config.end or
(m_threads.next_is_empty() and (not search or m_found_match)) or (m_threads.next_is_empty() and (not search or m_found_match)) or