2017-09-17 11:50:53 +03:00
|
|
|
#ifndef regex_impl_hh_INCLUDED
|
|
|
|
#define regex_impl_hh_INCLUDED
|
|
|
|
|
2017-10-07 07:46:27 +03:00
|
|
|
#include "exception.hh"
|
2017-10-07 05:43:21 +03:00
|
|
|
#include "flags.hh"
|
|
|
|
#include "ref_ptr.hh"
|
2017-10-02 09:59:04 +03:00
|
|
|
#include "unicode.hh"
|
|
|
|
#include "utf8.hh"
|
|
|
|
#include "utf8_iterator.hh"
|
|
|
|
#include "vector.hh"
|
|
|
|
|
2017-09-26 10:44:30 +03:00
|
|
|
namespace Kakoune
|
|
|
|
{
|
|
|
|
|
2017-10-09 09:04:14 +03:00
|
|
|
struct regex_error : runtime_error
|
|
|
|
{
|
|
|
|
using runtime_error::runtime_error;
|
|
|
|
};
|
|
|
|
|
2017-10-07 07:46:27 +03:00
|
|
|
// Direction in which a ThreadedRegexVM walks its subject. It is a template
// parameter of the VM: Forward iterates with the plain utf8 iterator, Backward
// with a reverse iterator over it.
enum class MatchDirection
{
    Forward,
    Backward
};
|
|
|
|
|
2017-11-25 13:14:15 +03:00
|
|
|
// Bit flags naming the character types the engine knows about. Each positive
// type occupies one of the four low bits and its negated counterpart the bit
// four positions higher, so a set of types always fits the one byte
// underlying type.
enum class CharacterType : unsigned char
{
    None                    = 0,
    Whitespace              = 1 << 0,
    HorizontalWhitespace    = 1 << 1,
    Word                    = 1 << 2,
    Digit                   = 1 << 3,
    NotWhitespace           = 1 << 4,
    NotHorizontalWhitespace = 1 << 5,
    NotWord                 = 1 << 6,
    NotDigit                = 1 << 7
};
|
|
|
|
// Overload picked for Meta::Type<CharacterType>; it unconditionally yields true.
// NOTE(review): presumably this is what opts CharacterType into the bitwise
// operators provided by flags.hh -- confirm there.
constexpr bool with_bit_ops(Meta::Type<CharacterType>)
{
    return true;
}
|
|
|
|
|
|
|
|
struct CharacterClass
|
|
|
|
{
|
|
|
|
struct Range { Codepoint min, max; };
|
|
|
|
|
|
|
|
Vector<Range, MemoryDomain::Regex> ranges;
|
|
|
|
CharacterType ctypes = CharacterType::None;
|
|
|
|
bool negative = false;
|
|
|
|
bool ignore_case = false;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Membership predicate for a CharacterClass; defined outside this header.
// ThreadedRegexVM treats a true result as `cp` being accepted by
// `character_class`.
bool is_character_class(const CharacterClass& character_class, Codepoint cp);
|
|
|
|
// Character type predicate; defined outside this header. ThreadedRegexVM
// treats a true result as `cp` being accepted by the type flags in `ctype`.
bool is_ctype(CharacterType ctype, Codepoint cp);
|
|
|
|
|
2017-10-15 04:23:57 +03:00
|
|
|
// Program produced by compile_regex() and executed by ThreadedRegexVM.
//
// It holds the instruction stream, the side tables the instructions index
// into (character classes and lookaround strings), and optional start
// descriptors used to skip positions where no match can begin.
struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
{
    // Opcodes, as interpreted by ThreadedRegexVM::step().
    enum Op : char
    {
        Match,                  // the thread reports a match
        FindNextStart,          // search thread: asks the VM to look for the next start position
        Literal,                // consumes a codepoint equal to param
        Literal_IgnoreCase,     // consumes a codepoint whose to_lower() equals param
        AnyChar,                // consumes unconditionally
        Class,                  // consumes a codepoint accepted by character_classes[param]
        CharacterType,          // consumes a codepoint accepted by is_ctype() for the flags in param
        Jump,                   // continues at instruction param
        Split_PrioritizeParent, // forks a thread at param, the current thread carries on first
        Split_PrioritizeChild,  // the current thread goes to param, a fork resumes after the split
        Save,                   // stores the current position in save slot param
        LineStart,
        LineEnd,
        WordBoundary,
        NotWordBoundary,
        SubjectBegin,
        SubjectEnd,
        LookAhead,              // param is the start index of a sequence in lookarounds
        NegativeLookAhead,
        LookBehind,
        NegativeLookBehind,
        LookAhead_IgnoreCase,
        NegativeLookAhead_IgnoreCase,
        LookBehind_IgnoreCase,
        NegativeLookBehind_IgnoreCase,
    };

    struct Instruction
    {
        Op op;
        // Those mutables are used during execution
        mutable bool scheduled;     // set while a thread waiting on this instruction is queued for the next step
        mutable uint16_t last_step; // last VM step that executed this instruction
        uint32_t param;             // operand, its meaning depends on op
    };
    // The layout is relied upon to stay compact.
    static_assert(sizeof(Instruction) == 8, "");

    // Number of leading instructions skipped by ThreadedRegexVM::exec() when
    // RegexExecFlags::Search is not requested.
    static constexpr uint16_t search_prefix_size = 3;

    // A regex is considered valid when it contains at least one instruction.
    explicit operator bool() const { return not instructions.empty(); }

    Vector<Instruction, MemoryDomain::Regex> instructions;
    Vector<CharacterClass, MemoryDomain::Regex> character_classes;
    // Lookaround sequences, each terminated by -1. Elements are decoded by
    // ThreadedRegexVM::lookaround().
    Vector<Codepoint, MemoryDomain::Regex> lookarounds;
    uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
    uint32_t save_count;          // number of save slots allocated per thread

    // Table of possible first codepoints of a match.
    struct StartDesc : UseMemoryDomain<MemoryDomain::Regex>
    {
        static constexpr size_t count = 256;
        // Slot shared by every codepoint that is >= count.
        static constexpr Codepoint other = 256;
        // map[min(cp, other)] is false when no match can start on cp.
        bool map[count+1];
    };

    // Null when no start descriptor is available for that direction.
    std::unique_ptr<StartDesc> forward_start_desc;
    std::unique_ptr<StartDesc> backward_start_desc;
};
|
|
|
|
|
2017-10-23 12:29:03 +03:00
|
|
|
// Bit flags given to compile_regex(). Every flag owns a distinct bit so that
// any combination can be or-ed together and tested individually.
// (Backward used to share bit 1 with Optimize, which made the two
// indistinguishable once combined.)
enum class RegexCompileFlags
{
    None      = 0,
    NoSubs    = 1 << 0,
    Optimize  = 1 << 1,
    Backward  = 1 << 2,
    NoForward = 1 << 3,
};
|
|
|
|
// Overload picked for Meta::Type<RegexCompileFlags>; it unconditionally yields true.
// NOTE(review): presumably this is what opts RegexCompileFlags into the
// bitwise operators provided by flags.hh -- confirm there.
constexpr bool with_bit_ops(Meta::Type<RegexCompileFlags>)
{
    return true;
}
|
|
|
|
|
2017-12-01 14:57:02 +03:00
|
|
|
// Builds a CompiledRegex from the pattern `re` according to `flags`; defined
// outside this header.
// NOTE(review): presumably reports invalid patterns by throwing regex_error --
// confirm in the definition.
CompiledRegex compile_regex(StringView re, RegexCompileFlags flags);
|
2017-10-02 09:59:04 +03:00
|
|
|
|
2017-10-02 17:34:57 +03:00
|
|
|
// Bit flags controlling one ThreadedRegexVM::exec() call.
enum class RegexExecFlags
{
    None           = 0,
    Search         = 1 << 0, // keep the search prefix; a match may end before the end of the range
    NotBeginOfLine = 1 << 1, // the subject begin does not count as a line start
    NotEndOfLine   = 1 << 2, // the subject end does not count as a line end
    NotBeginOfWord = 1 << 3, // the subject begin does not count as a word boundary
    NotEndOfWord   = 1 << 4, // the subject end does not count as a word boundary
    NotInitialNull = 1 << 5, // reject a match that ends at the starting position
    AnyMatch       = 1 << 6, // stop as soon as one match has been found
    NoSaves        = 1 << 7, // ignore Save instructions, no captures are recorded
};
|
|
|
|
|
|
|
|
// Overload picked for Meta::Type<RegexExecFlags>; it unconditionally yields
// true. ThreadedRegexVM applies &, | and ~ directly to RegexExecFlags values.
// NOTE(review): presumably this overload is what makes those operators
// available (see flags.hh) -- confirm there.
constexpr bool with_bit_ops(Meta::Type<RegexExecFlags>)
{
    return true;
}
|
|
|
|
|
2017-10-07 07:46:27 +03:00
|
|
|
template<typename Iterator, MatchDirection direction>
|
2017-10-06 14:30:46 +03:00
|
|
|
class ThreadedRegexVM
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
2017-10-06 14:30:46 +03:00
|
|
|
public:
|
2017-10-02 09:59:04 +03:00
|
|
|
ThreadedRegexVM(const CompiledRegex& program)
|
2017-10-07 07:46:27 +03:00
|
|
|
: m_program{program}
|
2017-10-09 16:56:48 +03:00
|
|
|
{
|
2017-12-01 14:57:02 +03:00
|
|
|
kak_assert((direction == MatchDirection::Forward and program.first_backward_inst != 0) or
|
|
|
|
(direction == MatchDirection::Backward and program.first_backward_inst != -1));
|
2017-10-09 16:56:48 +03:00
|
|
|
}
|
2017-10-02 09:59:04 +03:00
|
|
|
|
2017-10-04 15:11:15 +03:00
|
|
|
ThreadedRegexVM(const ThreadedRegexVM&) = delete;
|
2017-10-06 14:30:46 +03:00
|
|
|
ThreadedRegexVM& operator=(const ThreadedRegexVM&) = delete;
|
2017-10-04 15:11:15 +03:00
|
|
|
|
2017-10-04 06:14:24 +03:00
|
|
|
~ThreadedRegexVM()
|
|
|
|
{
|
|
|
|
for (auto* saves : m_saves)
|
|
|
|
{
|
|
|
|
for (size_t i = m_program.save_count-1; i > 0; --i)
|
|
|
|
saves->pos[i].~Iterator();
|
|
|
|
saves->~Saves();
|
2017-10-15 04:23:57 +03:00
|
|
|
operator delete(saves);
|
2017-10-04 06:14:24 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-04 21:48:10 +03:00
|
|
|
bool exec(Iterator begin, Iterator end,
|
|
|
|
Iterator subject_begin, Iterator subject_end,
|
|
|
|
RegexExecFlags flags)
|
2017-10-06 14:30:46 +03:00
|
|
|
{
|
2017-10-11 14:24:01 +03:00
|
|
|
if (flags & RegexExecFlags::NotInitialNull and begin == end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
constexpr bool forward = direction == MatchDirection::Forward;
|
|
|
|
|
2018-03-04 21:48:10 +03:00
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
if (not forward) // Flip line begin/end flags as we flipped the instructions on compilation.
|
|
|
|
flags = (RegexExecFlags)(flags & ~(RegexExecFlags::NotEndOfLine | RegexExecFlags::NotBeginOfLine)) |
|
2017-10-11 14:24:01 +03:00
|
|
|
((flags & RegexExecFlags::NotEndOfLine) ? RegexExecFlags::NotBeginOfLine : RegexExecFlags::None) |
|
|
|
|
((flags & RegexExecFlags::NotBeginOfLine) ? RegexExecFlags::NotEndOfLine : RegexExecFlags::None);
|
2017-10-06 14:30:46 +03:00
|
|
|
|
2017-10-20 10:17:02 +03:00
|
|
|
const bool search = (flags & RegexExecFlags::Search);
|
2018-04-21 05:44:54 +03:00
|
|
|
|
|
|
|
ConstArrayView<CompiledRegex::Instruction> instructions{m_program.instructions};
|
|
|
|
if (direction == MatchDirection::Forward)
|
|
|
|
instructions = instructions.subrange(0, m_program.first_backward_inst);
|
|
|
|
else
|
|
|
|
instructions = instructions.subrange(m_program.first_backward_inst);
|
|
|
|
if (not search)
|
|
|
|
instructions = instructions.subrange(CompiledRegex::search_prefix_size);
|
|
|
|
|
|
|
|
|
|
|
|
const ExecConfig config{
|
|
|
|
EffectiveIt{Utf8It{forward ? begin : end, subject_begin, subject_end}},
|
|
|
|
EffectiveIt{Utf8It{forward ? end : begin, subject_begin, subject_end}},
|
|
|
|
EffectiveIt{Utf8It{forward ? subject_begin : subject_end, subject_begin, subject_end}},
|
|
|
|
EffectiveIt{Utf8It{forward ? subject_end : subject_begin, subject_begin, subject_end}},
|
|
|
|
flags,
|
|
|
|
instructions
|
|
|
|
};
|
|
|
|
|
|
|
|
EffectiveIt start{config.begin};
|
2017-12-01 14:57:02 +03:00
|
|
|
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
|
|
|
|
: m_program.backward_start_desc;
|
|
|
|
if (start_desc)
|
2017-12-01 10:03:03 +03:00
|
|
|
{
|
|
|
|
if (search)
|
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
to_next_start(start, config.end, *start_desc);
|
|
|
|
if (start == config.end) // If start_desc is not null, it means we consume at least one char
|
2017-12-01 10:03:03 +03:00
|
|
|
return false;
|
|
|
|
}
|
2018-04-21 05:44:54 +03:00
|
|
|
else if (start != config.end and
|
2017-12-01 14:57:02 +03:00
|
|
|
not start_desc->map[std::min(*start, CompiledRegex::StartDesc::other)])
|
2017-12-01 10:03:03 +03:00
|
|
|
return false;
|
|
|
|
}
|
2017-10-06 14:30:46 +03:00
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
return exec_program(std::move(start), config);
|
2017-10-06 14:30:46 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
ArrayView<const Iterator> captures() const
|
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
if (m_captures >= 0)
|
|
|
|
return { m_saves[m_captures]->pos, m_program.save_count };
|
2017-10-06 14:30:46 +03:00
|
|
|
return {};
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2017-10-03 05:54:43 +03:00
|
|
|
struct Saves
|
|
|
|
{
|
2017-10-13 16:58:38 +03:00
|
|
|
union // ref count when in use, next_free when in free list
|
|
|
|
{
|
|
|
|
int refcount;
|
2018-04-27 01:18:04 +03:00
|
|
|
int16_t next_free;
|
2017-10-13 16:58:38 +03:00
|
|
|
};
|
2017-10-04 06:14:24 +03:00
|
|
|
Iterator pos[1];
|
2017-10-03 05:54:43 +03:00
|
|
|
};
|
|
|
|
|
2017-10-04 14:49:16 +03:00
|
|
|
template<bool copy>
|
2018-04-27 01:18:04 +03:00
|
|
|
int16_t new_saves(Iterator* pos)
|
2017-10-03 05:54:43 +03:00
|
|
|
{
|
2017-10-04 14:49:16 +03:00
|
|
|
kak_assert(not copy or pos != nullptr);
|
|
|
|
const auto count = m_program.save_count;
|
2018-04-27 01:18:04 +03:00
|
|
|
if (m_first_free >= 0)
|
2017-10-03 05:54:43 +03:00
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
const int16_t res = m_first_free;
|
|
|
|
Saves* save = m_saves[res];
|
|
|
|
m_first_free = save->next_free;
|
|
|
|
save->refcount = 1;
|
2017-10-04 14:49:16 +03:00
|
|
|
if (copy)
|
2018-04-27 01:18:04 +03:00
|
|
|
std::copy(pos, pos + count, save->pos);
|
2017-10-04 14:49:16 +03:00
|
|
|
else
|
2018-04-27 01:18:04 +03:00
|
|
|
std::fill(save->pos, save->pos + count, Iterator{});
|
2017-10-04 14:49:16 +03:00
|
|
|
|
2017-10-03 13:23:31 +03:00
|
|
|
return res;
|
2017-10-03 05:54:43 +03:00
|
|
|
}
|
|
|
|
|
2017-10-15 04:23:57 +03:00
|
|
|
void* ptr = operator new (sizeof(Saves) + (count-1) * sizeof(Iterator));
|
2017-10-13 16:58:38 +03:00
|
|
|
Saves* saves = new (ptr) Saves{{1}, {copy ? pos[0] : Iterator{}}};
|
2017-10-04 14:49:16 +03:00
|
|
|
for (size_t i = 1; i < count; ++i)
|
|
|
|
new (&saves->pos[i]) Iterator{copy ? pos[i] : Iterator{}};
|
|
|
|
m_saves.push_back(saves);
|
2018-04-27 01:18:04 +03:00
|
|
|
return static_cast<int16_t>(m_saves.size() - 1);
|
2017-10-03 05:54:43 +03:00
|
|
|
}
|
|
|
|
|
2018-04-27 01:18:04 +03:00
|
|
|
void release_saves(int16_t saves)
|
2017-10-04 05:49:40 +03:00
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
if (saves >= 0 and --m_saves[saves]->refcount == 0)
|
2017-10-13 16:58:38 +03:00
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
m_saves[saves]->next_free = m_first_free;
|
2017-10-13 16:58:38 +03:00
|
|
|
m_first_free = saves;
|
|
|
|
}
|
2017-10-04 05:49:40 +03:00
|
|
|
};
|
|
|
|
|
2017-10-02 09:59:04 +03:00
|
|
|
struct Thread
|
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
int16_t inst;
|
|
|
|
int16_t saves;
|
2017-10-02 09:59:04 +03:00
|
|
|
};
|
|
|
|
|
2018-03-04 21:48:10 +03:00
|
|
|
using Utf8It = utf8::iterator<Iterator>;
|
|
|
|
using EffectiveIt = std::conditional_t<direction == MatchDirection::Forward,
|
|
|
|
Utf8It, std::reverse_iterator<Utf8It>>;
|
2017-10-04 07:16:52 +03:00
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
struct ExecConfig
|
2017-10-11 05:24:05 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
const EffectiveIt begin;
|
|
|
|
const EffectiveIt end;
|
|
|
|
const EffectiveIt subject_begin;
|
|
|
|
const EffectiveIt subject_end;
|
|
|
|
const RegexExecFlags flags;
|
|
|
|
ConstArrayView<CompiledRegex::Instruction> instructions;
|
2017-10-11 05:24:05 +03:00
|
|
|
};
|
|
|
|
|
2017-10-20 10:17:02 +03:00
|
|
|
enum class StepResult { Consumed, Matched, Failed, FindNextStart };
|
2017-10-07 11:36:53 +03:00
|
|
|
|
|
|
|
// Steps a thread until it consumes the current character, matches or fail
|
2018-04-21 05:44:54 +03:00
|
|
|
StepResult step(EffectiveIt& pos, uint16_t current_step, Thread& thread, const ExecConfig& config)
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
const bool no_saves = (config.flags & RegexExecFlags::NoSaves);
|
2017-11-11 10:15:13 +03:00
|
|
|
auto* instructions = m_program.instructions.data();
|
2017-10-02 09:59:04 +03:00
|
|
|
while (true)
|
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
auto& inst = instructions[thread.inst++];
|
2018-02-24 09:21:15 +03:00
|
|
|
// if this instruction was already executed for this step in another thread,
|
|
|
|
// then this thread is redundant and can be dropped
|
2018-04-21 05:44:54 +03:00
|
|
|
if (inst.last_step == current_step)
|
2017-10-07 14:08:14 +03:00
|
|
|
return StepResult::Failed;
|
2018-04-21 05:44:54 +03:00
|
|
|
inst.last_step = current_step;
|
2017-10-07 09:25:14 +03:00
|
|
|
|
2017-10-07 13:51:32 +03:00
|
|
|
switch (inst.op)
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
|
|
|
case CompiledRegex::Literal:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos != config.end and inst.param == *pos)
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Consumed;
|
|
|
|
return StepResult::Failed;
|
2017-10-10 06:21:21 +03:00
|
|
|
case CompiledRegex::Literal_IgnoreCase:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos != config.end and inst.param == to_lower(*pos))
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Consumed;
|
|
|
|
return StepResult::Failed;
|
|
|
|
case CompiledRegex::AnyChar:
|
|
|
|
return StepResult::Consumed;
|
|
|
|
case CompiledRegex::Jump:
|
2018-04-27 01:18:04 +03:00
|
|
|
thread.inst = static_cast<int16_t>(inst.param);
|
2017-10-02 09:59:04 +03:00
|
|
|
break;
|
|
|
|
case CompiledRegex::Split_PrioritizeParent:
|
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
if (thread.saves >= 0)
|
|
|
|
++m_saves[thread.saves]->refcount;
|
|
|
|
m_current_threads.push_back({static_cast<int16_t>(inst.param), thread.saves});
|
2017-10-02 09:59:04 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case CompiledRegex::Split_PrioritizeChild:
|
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
if (thread.saves >= 0)
|
|
|
|
++m_saves[thread.saves]->refcount;
|
2018-04-21 05:44:54 +03:00
|
|
|
m_current_threads.push_back({thread.inst, thread.saves});
|
2018-04-27 01:18:04 +03:00
|
|
|
thread.inst = static_cast<uint16_t>(inst.param);
|
2017-10-02 09:59:04 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case CompiledRegex::Save:
|
|
|
|
{
|
2017-10-20 11:55:38 +03:00
|
|
|
if (no_saves)
|
2017-10-03 14:07:44 +03:00
|
|
|
break;
|
2018-04-27 01:18:04 +03:00
|
|
|
if (thread.saves < 0)
|
2017-10-20 11:55:38 +03:00
|
|
|
thread.saves = new_saves<false>(nullptr);
|
2018-04-27 01:18:04 +03:00
|
|
|
else if (m_saves[thread.saves]->refcount > 1)
|
2017-10-03 05:54:43 +03:00
|
|
|
{
|
2018-04-27 01:18:04 +03:00
|
|
|
--m_saves[thread.saves]->refcount;
|
|
|
|
thread.saves = new_saves<true>(m_saves[thread.saves]->pos);
|
2017-10-03 05:54:43 +03:00
|
|
|
}
|
2018-04-27 01:18:04 +03:00
|
|
|
m_saves[thread.saves]->pos[inst.param] = get_base(pos);
|
2017-10-02 09:59:04 +03:00
|
|
|
break;
|
|
|
|
}
|
2017-11-25 13:14:15 +03:00
|
|
|
case CompiledRegex::Class:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos == config.end)
|
2017-10-07 14:58:10 +03:00
|
|
|
return StepResult::Failed;
|
2017-11-25 13:14:15 +03:00
|
|
|
return is_character_class(m_program.character_classes[inst.param], *pos) ?
|
2017-10-02 09:59:04 +03:00
|
|
|
StepResult::Consumed : StepResult::Failed;
|
2017-11-25 13:14:15 +03:00
|
|
|
case CompiledRegex::CharacterType:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos == config.end)
|
2017-11-25 13:14:15 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
return is_ctype((CharacterType)inst.param, *pos) ?
|
|
|
|
StepResult::Consumed : StepResult::Failed;;
|
2017-10-02 09:59:04 +03:00
|
|
|
case CompiledRegex::LineStart:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (not is_line_start(pos, config))
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LineEnd:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (not is_line_end(pos, config))
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::WordBoundary:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (not is_word_boundary(pos, config))
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::NotWordBoundary:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (is_word_boundary(pos, config))
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::SubjectBegin:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos != config.subject_begin)
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::SubjectEnd:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos != config.subject_end)
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LookAhead:
|
|
|
|
case CompiledRegex::NegativeLookAhead:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (lookaround<MatchDirection::Forward, false>(inst.param, pos, config) !=
|
2017-10-10 06:21:21 +03:00
|
|
|
(inst.op == CompiledRegex::LookAhead))
|
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LookAhead_IgnoreCase:
|
|
|
|
case CompiledRegex::NegativeLookAhead_IgnoreCase:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (lookaround<MatchDirection::Forward, true>(inst.param, pos, config) !=
|
2017-10-10 06:21:21 +03:00
|
|
|
(inst.op == CompiledRegex::LookAhead_IgnoreCase))
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LookBehind:
|
|
|
|
case CompiledRegex::NegativeLookBehind:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (lookaround<MatchDirection::Backward, false>(inst.param, pos, config) !=
|
2017-10-10 06:21:21 +03:00
|
|
|
(inst.op == CompiledRegex::LookBehind))
|
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
|
|
|
case CompiledRegex::LookBehind_IgnoreCase:
|
|
|
|
case CompiledRegex::NegativeLookBehind_IgnoreCase:
|
2018-04-21 05:44:54 +03:00
|
|
|
if (lookaround<MatchDirection::Backward, true>(inst.param, pos, config) !=
|
2017-10-10 06:21:21 +03:00
|
|
|
(inst.op == CompiledRegex::LookBehind_IgnoreCase))
|
2017-10-02 09:59:04 +03:00
|
|
|
return StepResult::Failed;
|
|
|
|
break;
|
2017-10-20 10:17:02 +03:00
|
|
|
case CompiledRegex::FindNextStart:
|
2018-04-21 05:44:54 +03:00
|
|
|
kak_assert(m_current_threads.empty()); // search thread should by construction be the lower priority one
|
|
|
|
if (m_next_threads.empty())
|
2017-10-20 10:17:02 +03:00
|
|
|
return StepResult::FindNextStart;
|
|
|
|
return StepResult::Consumed;
|
2017-10-02 09:59:04 +03:00
|
|
|
case CompiledRegex::Match:
|
|
|
|
return StepResult::Matched;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return StepResult::Failed;
|
|
|
|
}
|
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
bool exec_program(EffectiveIt pos, const ExecConfig& config)
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
kak_assert(m_current_threads.empty() and m_next_threads.empty());
|
|
|
|
release_saves(m_captures);
|
2018-04-27 01:18:04 +03:00
|
|
|
m_captures = -1;
|
|
|
|
m_current_threads.push_back({static_cast<int16_t>(&config.instructions[0] - &m_program.instructions[0]), -1});
|
2017-12-01 14:57:02 +03:00
|
|
|
|
|
|
|
const auto& start_desc = direction == MatchDirection::Forward ? m_program.forward_start_desc
|
|
|
|
: m_program.backward_start_desc;
|
2017-10-03 14:07:44 +03:00
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
uint16_t current_step = -1;
|
2017-10-07 14:27:06 +03:00
|
|
|
bool found_match = false;
|
2017-10-07 14:58:10 +03:00
|
|
|
while (true) // Iterate on all codepoints and once at the end
|
2017-10-07 14:27:06 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
if (++current_step == 0)
|
2017-10-14 07:58:42 +03:00
|
|
|
{
|
|
|
|
// We wrapped, avoid potential collision on inst.last_step by resetting them
|
2018-04-21 05:44:54 +03:00
|
|
|
for (auto& inst : config.instructions)
|
2017-10-14 07:58:42 +03:00
|
|
|
inst.last_step = 0;
|
2018-04-21 05:44:54 +03:00
|
|
|
current_step = 1; // step 0 is never valid
|
2017-10-14 07:58:42 +03:00
|
|
|
}
|
|
|
|
|
2017-10-20 10:17:02 +03:00
|
|
|
bool find_next_start = false;
|
2018-04-21 05:44:54 +03:00
|
|
|
while (not m_current_threads.empty())
|
2017-10-07 14:58:10 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
auto thread = m_current_threads.back();
|
|
|
|
m_current_threads.pop_back();
|
|
|
|
switch (step(pos, current_step, thread, config))
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
2017-10-03 13:00:52 +03:00
|
|
|
case StepResult::Matched:
|
2018-04-21 05:44:54 +03:00
|
|
|
if ((pos != config.end and not (config.flags & RegexExecFlags::Search)) or
|
|
|
|
(config.flags & RegexExecFlags::NotInitialNull and pos == config.begin))
|
2017-10-02 11:24:38 +03:00
|
|
|
{
|
2017-10-03 13:00:52 +03:00
|
|
|
release_saves(thread.saves);
|
2017-10-02 17:34:57 +03:00
|
|
|
continue;
|
2017-10-02 11:24:38 +03:00
|
|
|
}
|
2017-10-02 09:59:04 +03:00
|
|
|
|
2017-10-04 15:11:15 +03:00
|
|
|
release_saves(m_captures);
|
2017-10-04 06:28:58 +03:00
|
|
|
m_captures = thread.saves;
|
2017-10-02 09:59:04 +03:00
|
|
|
found_match = true;
|
2017-10-20 11:55:38 +03:00
|
|
|
|
|
|
|
// remove this and lower priority threads
|
2018-04-21 05:44:54 +03:00
|
|
|
for (auto& t : m_current_threads)
|
2017-10-20 11:55:38 +03:00
|
|
|
release_saves(t.saves);
|
2018-04-21 05:44:54 +03:00
|
|
|
m_current_threads.clear();
|
2017-10-03 13:00:52 +03:00
|
|
|
break;
|
|
|
|
case StepResult::Failed:
|
|
|
|
release_saves(thread.saves);
|
|
|
|
break;
|
|
|
|
case StepResult::Consumed:
|
2018-04-27 01:18:04 +03:00
|
|
|
if (m_program.instructions[thread.inst].scheduled)
|
2017-10-07 14:58:10 +03:00
|
|
|
{
|
2017-10-03 13:00:52 +03:00
|
|
|
release_saves(thread.saves);
|
2017-10-07 14:58:10 +03:00
|
|
|
continue;
|
|
|
|
}
|
2018-04-27 01:18:04 +03:00
|
|
|
m_program.instructions[thread.inst].scheduled = true;
|
2018-04-21 05:44:54 +03:00
|
|
|
m_next_threads.push_back(thread);
|
2017-10-03 13:00:52 +03:00
|
|
|
break;
|
2017-10-20 10:17:02 +03:00
|
|
|
case StepResult::FindNextStart:
|
2018-04-21 05:44:54 +03:00
|
|
|
m_next_threads.push_back(thread);
|
2017-10-20 10:17:02 +03:00
|
|
|
find_next_start = true;
|
|
|
|
break;
|
2017-10-02 11:24:38 +03:00
|
|
|
}
|
2017-10-02 09:59:04 +03:00
|
|
|
}
|
2018-04-21 05:44:54 +03:00
|
|
|
kak_assert(m_current_threads.empty());
|
|
|
|
for (auto& thread : m_next_threads)
|
2018-04-27 01:18:04 +03:00
|
|
|
m_program.instructions[thread.inst].scheduled = false;
|
2017-10-11 05:24:05 +03:00
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos == config.end or m_next_threads.empty() or
|
|
|
|
(found_match and (config.flags & RegexExecFlags::AnyMatch)))
|
2017-10-20 11:55:38 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
for (auto& t : m_next_threads)
|
2017-10-20 11:55:38 +03:00
|
|
|
release_saves(t.saves);
|
2018-04-21 05:44:54 +03:00
|
|
|
m_next_threads.clear();
|
2017-10-02 09:59:04 +03:00
|
|
|
return found_match;
|
2017-10-20 11:55:38 +03:00
|
|
|
}
|
2017-10-03 13:00:52 +03:00
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
std::swap(m_current_threads, m_next_threads);
|
|
|
|
std::reverse(m_current_threads.begin(), m_current_threads.end());
|
2017-10-07 14:27:06 +03:00
|
|
|
++pos;
|
2017-10-20 10:17:02 +03:00
|
|
|
|
2017-12-01 14:57:02 +03:00
|
|
|
if (find_next_start and start_desc)
|
2018-04-21 05:44:54 +03:00
|
|
|
to_next_start(pos, config.end, *start_desc);
|
2017-10-02 09:59:04 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-04 21:48:10 +03:00
|
|
|
void to_next_start(EffectiveIt& start, const EffectiveIt& end,
|
2017-12-01 09:46:18 +03:00
|
|
|
const CompiledRegex::StartDesc& start_desc)
|
2017-10-06 08:40:27 +03:00
|
|
|
{
|
2017-10-20 06:49:19 +03:00
|
|
|
while (start != end and *start >= 0 and
|
2017-12-01 09:46:18 +03:00
|
|
|
not start_desc.map[std::min(*start, CompiledRegex::StartDesc::other)])
|
2017-10-06 08:40:27 +03:00
|
|
|
++start;
|
|
|
|
}
|
|
|
|
|
2017-10-10 06:21:21 +03:00
|
|
|
template<MatchDirection look_direction, bool ignore_case>
|
2018-04-21 05:44:54 +03:00
|
|
|
bool lookaround(uint32_t index, EffectiveIt pos, const ExecConfig& config) const
|
2017-10-09 06:12:42 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
const auto end = (look_direction == MatchDirection::Forward ? config.subject_end : config.subject_begin);
|
2017-10-09 06:12:42 +03:00
|
|
|
for (auto it = m_program.lookarounds.begin() + index; *it != -1; ++it)
|
|
|
|
{
|
2018-03-04 21:48:10 +03:00
|
|
|
if (pos == end)
|
2017-10-09 06:12:42 +03:00
|
|
|
return false;
|
2017-10-23 12:00:42 +03:00
|
|
|
Codepoint cp = (look_direction == MatchDirection::Forward ? *pos : *(pos-1));
|
2017-10-10 06:21:21 +03:00
|
|
|
if (ignore_case)
|
|
|
|
cp = to_lower(cp);
|
|
|
|
|
2017-10-23 12:00:42 +03:00
|
|
|
const Codepoint ref = *it;
|
2017-10-09 06:12:42 +03:00
|
|
|
if (ref == 0xF000)
|
|
|
|
{} // any character matches
|
2017-11-25 13:14:15 +03:00
|
|
|
else if (ref > 0xF0000 and ref < 0xF8000)
|
|
|
|
{
|
|
|
|
if (not is_character_class(m_program.character_classes[ref - 0xF0001], cp))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
else if (ref >= 0xF8000 and ref <= 0xFFFFD)
|
2017-10-09 06:20:05 +03:00
|
|
|
{
|
2017-11-25 13:14:15 +03:00
|
|
|
if (not is_ctype((CharacterType)(ref & 0xFF), cp))
|
2017-10-09 06:20:05 +03:00
|
|
|
return false;
|
|
|
|
}
|
2017-10-09 06:12:42 +03:00
|
|
|
else if (ref != cp)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
(look_direction == MatchDirection::Forward) ? ++pos : --pos;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
static bool is_line_start(const EffectiveIt& pos, const ExecConfig& config)
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos == config.subject_begin)
|
|
|
|
return not (config.flags & RegexExecFlags::NotBeginOfLine);
|
2017-10-08 04:22:24 +03:00
|
|
|
return *(pos-1) == '\n';
|
2017-10-02 09:59:04 +03:00
|
|
|
}
|
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
static bool is_line_end(const EffectiveIt& pos, const ExecConfig& config)
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos == config.subject_end)
|
|
|
|
return not (config.flags & RegexExecFlags::NotEndOfLine);
|
2017-10-08 04:22:24 +03:00
|
|
|
return *pos == '\n';
|
2017-10-02 09:59:04 +03:00
|
|
|
}
|
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
static bool is_word_boundary(const EffectiveIt& pos, const ExecConfig& config)
|
2017-10-02 09:59:04 +03:00
|
|
|
{
|
2018-04-21 05:44:54 +03:00
|
|
|
if (pos == config.subject_begin)
|
|
|
|
return not (config.flags & RegexExecFlags::NotBeginOfWord);
|
|
|
|
if (pos == config.subject_end)
|
|
|
|
return not (config.flags & RegexExecFlags::NotEndOfWord);
|
2017-10-08 04:22:24 +03:00
|
|
|
return is_word(*(pos-1)) != is_word(*pos);
|
2017-10-02 09:59:04 +03:00
|
|
|
}
|
|
|
|
|
2018-03-04 21:48:10 +03:00
|
|
|
static const Iterator& get_base(const Utf8It& it) { return it.base(); }
|
|
|
|
static Iterator get_base(const std::reverse_iterator<Utf8It>& it) { return it.base().base(); }
|
2017-10-07 07:46:27 +03:00
|
|
|
|
2017-10-02 09:59:04 +03:00
|
|
|
const CompiledRegex& m_program;
|
|
|
|
|
2018-04-21 05:44:54 +03:00
|
|
|
Vector<Thread, MemoryDomain::Regex> m_current_threads;
|
|
|
|
Vector<Thread, MemoryDomain::Regex> m_next_threads;
|
2017-10-02 09:59:04 +03:00
|
|
|
|
2017-10-15 04:23:57 +03:00
|
|
|
Vector<Saves*, MemoryDomain::Regex> m_saves;
|
2018-04-27 01:18:04 +03:00
|
|
|
int16_t m_first_free = -1;
|
|
|
|
int16_t m_captures = -1;
|
2017-10-02 09:59:04 +03:00
|
|
|
};
|
|
|
|
|
2017-09-26 10:44:30 +03:00
|
|
|
}
|
|
|
|
|
2017-09-17 11:50:53 +03:00
|
|
|
#endif // regex_impl_hh_INCLUDED
|