kakoune/src/ranked_match.cc

#include "ranked_match.hh"

#include "utf8_iterator.hh"
#include "unit_tests.hh"

namespace Kakoune
{

// Build a bitmask summarising which characters occur in str.
//
// Bit layout produced by this function:
//   bits 0-25  : 'a'..'z'
//   bits 26-51 : 'A'..'Z'
//   bit  53    : '_'
//   bit  54    : '-'
//   bit  63    : anything else (presumably including the individual bytes
//                of multi-byte UTF-8 sequences, as str is walked element
//                by element)
//
// The result can be compared against another mask with matches().
UsedLetters used_letters(StringView str)
{
    UsedLetters res = 0;
    for (auto c : str)
    {
        // The constant being shifted is 1uLL, not 1uL: unsigned long is only
        // 32 bits wide on ILP32 and LLP64 targets, where shifting it by 32 or
        // more is undefined behaviour. unsigned long long always has at
        // least 64 bits, which covers the highest bit used here (63).
        if (c >= 'a' and c <= 'z')
            res |= 1uLL << (c - 'a');
        else if (c >= 'A' and c <= 'Z')
            res |= 1uLL << (c - 'A' + 26);
        else if (c == '_')
            res |= 1uLL << 53;
        else if (c == '-')
            res |= 1uLL << 54;
        else
            res |= 1uLL << 63;
    }
    return res;
}

// Tell whether every bit set in query is also set in letters, i.e. whether
// query is a subset of letters when both are viewed as bit sets.
bool matches(UsedLetters query, UsedLetters letters)
{
    const UsedLetters shared = query & letters;
    return shared == query;
}

// Iterator over a const char* range; in this file it is dereferenced to
// obtain Codepoint values and advanced with ++ / +1.
using Utf8It = utf8::iterator<const char*>;

// Count how many word-boundary characters of candidate can be matched, in
// order, against characters of query.
//
// A character of candidate is a word boundary when it is the first one,
// when it is alphanumeric and follows a non-alphanumeric one (snake_case),
// or when it is uppercase and follows a lowercase one (camelCase).
//
// For each boundary character, the remaining part of query is searched for
// the first character that matches it; query characters skipped over during
// that search are not considered again. Matching is smart-case: a lowercase
// query character matches regardless of the candidate's case, any other
// query character must match exactly.
static int count_word_boundaries_match(StringView candidate, StringView query)
{
    int count = 0;
    Utf8It query_it{query.begin(), query};
    Codepoint prev = 0;
    for (Utf8It it{candidate.begin(), candidate}; it != candidate.end(); ++it)
    {
        const Codepoint c = *it;
        // Wide-character classification is used throughout: the narrow
        // islower/isupper have undefined behaviour for values that do not
        // fit in an unsigned char, which any non-ASCII codepoint can exceed.
        const bool is_word_boundary = prev == 0 or
                                      (!iswalnum(prev) and iswalnum(c)) or
                                      (iswlower(prev) and iswupper(c));
        prev = c;

        if (not is_word_boundary)
            continue;

        const Codepoint lc = to_lower(c);
        for (auto qit = query_it; qit != query.end(); ++qit)
        {
            const Codepoint qc = *qit;
            if (qc == (iswlower(qc) ? lc : c))
            {
                ++count;
                // Resume the next search right after the matched character.
                query_it = qit+1;
                break;
            }
        }
        // The whole query has been consumed, nothing more can match.
        if (query_it == query.end())
            break;
    }
    return count;
}

// Smart-case comparison of a query character against a candidate character:
// a lowercase query character matches the candidate case-insensitively,
// any other query character must be identical to the candidate character.
static bool smartcase_eq(Codepoint query, Codepoint candidate)
{
    // iswlower rather than islower: the narrow classifier has undefined
    // behaviour for values not representable as unsigned char, and a
    // Codepoint can be far outside that range.
    return query == (iswlower(query) ? to_lower(candidate) : candidate);
}

// Check whether subseq is a (smart-case) subsequence of str, i.e. whether
// all characters of subseq appear in str in the same order, possibly with
// other characters in between.
//
// index_sum is reset to 0 on entry and, on success, holds the sum of the
// character indices in str at which the characters of subseq were found.
// When false is returned it holds the partial sum accumulated so far.
static bool subsequence_match_smart_case(StringView str, StringView subseq, int& index_sum)
{
    index_sum = 0;
    auto it = str.begin();
    int index = 0;
    for (auto subseq_it = subseq.begin(); subseq_it != subseq.end();
         subseq_it = utf8::next(subseq_it, subseq.end()))
    {
        if (it == str.end())
            return false;
        const Codepoint c = utf8::codepoint(subseq_it, subseq.end());
        // read_codepoint is relied upon to advance it past the codepoint it
        // returns. It must be bounded by the end of str, the string it
        // points into; bounding it with the end of subseq (an unrelated
        // buffer) would give no protection against reading past str.
        while (not smartcase_eq(c, utf8::read_codepoint(it, str.end())))
        {
            ++index;
            if (it == str.end())
                return false;
        }
        // Record the index of the match, then step past it.
        index_sum += index++;
    }
    return true;
}

// Shared constructor implementation.
//
// candidate: the string being ranked.
// query:     the string typed by the user.
// func:      nullary predicate used as a pre-filter; it is only invoked for
//            a non-empty query and before the subsequence check.
//
// m_candidate is only assigned when the candidate is accepted; an empty
// candidate, or a query longer than the candidate, is rejected outright.
// An empty query accepts the candidate without computing any ranking data.
template<typename TestFunc>
RankedMatch::RankedMatch(StringView candidate, StringView query, TestFunc func)
{
    if (candidate.empty() or query.length() > candidate.length())
        return;

    if (query.empty())
    {
        m_candidate = candidate;
        return;
    }

    // Short-circuit matters: the subsequence scan only runs when the
    // pre-filter accepted the pair.
    if (not func() or
        not subsequence_match_smart_case(candidate, query, m_match_index_sum))
        return;

    m_candidate = candidate;

    // Ranking criteria consumed by operator<.
    m_first_char_match = smartcase_eq(query[0], candidate[0]);
    m_word_boundary_match_count = count_word_boundaries_match(candidate, query);
    m_only_word_boundary = m_word_boundary_match_count == query.length();
    m_prefix = std::equal(query.begin(), query.end(), candidate.begin(), smartcase_eq);
}

// Construct a match using letter bitmasks as a pre-filter: the candidate is
// only examined further when the lowercased query letters are all present
// in the lowercased candidate letters, and the query letters selected by
// upper_mask are all present in the candidate as well.
RankedMatch::RankedMatch(StringView candidate, UsedLetters candidate_letters,
                         StringView query, UsedLetters query_letters)
    : RankedMatch{candidate, query,
                  [query_letters, candidate_letters] {
                      return matches(to_lower(query_letters), to_lower(candidate_letters)) and
                             matches(query_letters & upper_mask, candidate_letters & upper_mask);
                  }}
{
}


// Construct a match without any pre-filter: the predicate handed to the
// shared constructor always accepts.
RankedMatch::RankedMatch(StringView candidate, StringView query)
    : RankedMatch{candidate, query, []() { return true; }} {}

// Ordering used to rank matches: returns true when *this should be ranked
// before other. Both operands must be valid matches.
//
// Criteria, in decreasing priority:
//   1. matches where the query is a prefix of the candidate come first;
//   2. then matches whose first character matches the query's;
//   3. then matches made only of word-boundary characters (and, among
//      those, the ones with more word-boundary matches);
//   4. then matches with more word-boundary matches;
//   5. then matches with a smaller sum of match indices;
//   6. finally a codepoint-wise comparison of the candidates.
bool RankedMatch::operator<(const RankedMatch& other) const
{
    kak_assert((bool)*this and (bool)other);

    if (m_prefix != other.m_prefix)
        return m_prefix;

    if (m_first_char_match != other.m_first_char_match)
        return m_first_char_match;

    if (m_only_word_boundary and other.m_only_word_boundary)
    {
        if (m_word_boundary_match_count != other.m_word_boundary_match_count)
            return m_word_boundary_match_count > other.m_word_boundary_match_count;
    }
    else if (m_only_word_boundary or other.m_only_word_boundary)
        return  m_only_word_boundary;

    if (m_word_boundary_match_count != other.m_word_boundary_match_count)
        return m_word_boundary_match_count > other.m_word_boundary_match_count;

    if (m_match_index_sum != other.m_match_index_sum)
        return m_match_index_sum < other.m_match_index_sum;

    Utf8It it1{m_candidate.begin(), m_candidate};
    Utf8It it2{other.m_candidate.begin(), other.m_candidate};
    for (; it1 != m_candidate.end() and it2 != other.m_candidate.end(); ++it1, ++it2)
    {
        const auto cp1 = *it1, cp2 = *it2;
        if (cp1 != cp2)
        {
            // iswlower rather than islower: the narrow classifier has
            // undefined behaviour for values outside the unsigned char
            // range, which non-ASCII codepoints exceed.
            const bool low1 = iswlower(cp1), low2 = iswlower(cp2);
            // Same case class: plain codepoint order. Otherwise *this comes
            // first exactly when other holds the lowercase character.
            return low1 == low2 ? cp1 < cp2 : low2;
        }
    }

    // No differing codepoint was found: the candidates are equal or one is
    // a prefix of the other. As in a lexicographic comparison, the shorter
    // one sorts first, so that two distinct candidates never tie.
    return it1 == m_candidate.end() and it2 != other.m_candidate.end();
}

// Checks for count_word_boundaries_match on snake_case and camelCase
// candidates.
UnitTest test_ranked_match{[] {
    // 'r', 'a' and 't' each start a word of the snake_case candidate.
    kak_assert(count_word_boundaries_match("run_all_tests", "rat") == 3);
    // The leading word ("run") matches nothing in the query; the other two do.
    kak_assert(count_word_boundaries_match("run_all_tests", "at") == 2);
    // Lowercase query characters match uppercase camelCase boundaries.
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "wm") == 2);
    // 'o' is not at a word boundary, so only 'c', 'b' and 'm' are counted.
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cobm") == 3);
    // Every query character matches a boundary, uppercase ones exactly.
    kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cWBM") == 4);
}};

// Checks that applying to_lower to a letter mask built from mixed-case text
// gives the same mask as the lowercase-only text.
// NOTE(review): to_lower(UsedLetters) is defined outside this file; this
// presumably folds the uppercase bits onto the lowercase ones - confirm.
UnitTest test_used_letters{[]()
{
    kak_assert(used_letters("abcd") == to_lower(used_letters("abcdABCD")));
}};

}
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`#include "ranked_match.hh"`

Make word insert completion work better with unicode char 2015-10-30 16:57:46 +03:00			`#include "utf8_iterator.hh"`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`#include "unit_tests.hh"`

Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`namespace Kakoune`
			`{`

Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00			`UsedLetters used_letters(StringView str)`
			`{`
			`UsedLetters res = 0;`
			`for (auto c : str)`
			`{`
			`if (c >= 'a' and c <= 'z')`
			`res \|= 1uL << (c - 'a');`
			`else if (c >= 'A' and c <= 'Z')`
			`res \|= 1uL << (c - 'A' + 26);`
			`else if (c == '_')`
			`res \|= 1uL << 53;`
			`else if (c == '-')`
			`res \|= 1uL << 54;`
			`else`
			`res \|= 1uL << 63;`
			`}`
			`return res;`
			`}`

			`bool matches(UsedLetters query, UsedLetters letters)`
			`{`
			`return (query & letters) == query;`
			`}`

Make word insert completion work better with unicode char 2015-10-30 16:57:46 +03:00			`using Utf8It = utf8::iterator<const char*>;`

Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`static int count_word_boundaries_match(StringView candidate, StringView query)`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`{`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`int count = 0;`
Fix count_word_boundaries_match 2016-02-18 02:05:08 +03:00			`Utf8It query_it{query.begin(), query};`
Make word insert completion work better with unicode char 2015-10-30 16:57:46 +03:00			`Codepoint prev = 0;`
			`for (Utf8It it{candidate.begin(), candidate}; it != candidate.end(); ++it)`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`{`
Make word insert completion work better with unicode char 2015-10-30 16:57:46 +03:00			`const Codepoint c = *it;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`const bool is_word_boundary = prev == 0 or`
Make word insert completion work better with unicode char 2015-10-30 16:57:46 +03:00			`(!iswalnum(prev) and iswalnum(c)) or`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`(islower(prev) and isupper(c));`
			`prev = c;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`if (not is_word_boundary)`
			`continue;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 03:21:20 +03:00			`const Codepoint lc = to_lower(c);`
Fix count_word_boundaries_match 2016-02-18 02:05:08 +03:00			`for (auto qit = query_it; qit != query.end(); ++qit)`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`{`
Make word insert completion work better with unicode char 2015-10-30 16:57:46 +03:00			`const Codepoint qc = *qit;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`if (qc == (islower(qc) ? lc : c))`
			`{`
			`++count;`
Fix count_word_boundaries_match 2016-02-18 02:05:08 +03:00			`query_it = qit+1;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`break;`
			`}`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`}`
Fix count_word_boundaries_match 2016-02-18 02:05:08 +03:00			`if (query_it == query.end())`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`break;`
			`}`
			`return count;`
			`}`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00
Make word insert completion work better with unicode char 2015-10-30 16:57:46 +03:00			`static bool smartcase_eq(Codepoint query, Codepoint candidate)`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`{`
Fix to_lower/to_upper handling to correctly support non unicode chars require a proper unicode locale setup on the system Fixes #94 2015-11-11 03:21:20 +03:00			`return query == (islower(query) ? to_lower(candidate) : candidate);`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`}`

Take subsequence matches index when sorting RankedMatch 2016-02-29 02:05:51 +03:00			`static bool subsequence_match_smart_case(StringView str, StringView subseq, int& index_sum)`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`{`
Take subsequence matches index when sorting RankedMatch 2016-02-29 02:05:51 +03:00			`index_sum = 0;`
Tweak implementation of subsequence_match_smart_case Remove use of utf8 iterators and use directly utf8 functions 2016-03-25 02:45:56 +03:00			`auto it = str.begin();`
Take subsequence matches index when sorting RankedMatch 2016-02-29 02:05:51 +03:00			`int index = 0;`
Tweak implementation of subsequence_match_smart_case Remove use of utf8 iterators and use directly utf8 functions 2016-03-25 02:45:56 +03:00			`for (auto subseq_it = subseq.begin(); subseq_it != subseq.end();`
			`subseq_it = utf8::next(subseq_it, subseq.end()))`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`{`
			`if (it == str.end())`
			`return false;`
Tweak implementation of subsequence_match_smart_case Remove use of utf8 iterators and use directly utf8 functions 2016-03-25 02:45:56 +03:00			`const Codepoint c = utf8::codepoint(subseq_it, subseq.end());`
			`while (not smartcase_eq(c, utf8::read_codepoint(it, subseq.end())))`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`{`
Take subsequence matches index when sorting RankedMatch 2016-02-29 02:05:51 +03:00			`++index;`
Tweak implementation of subsequence_match_smart_case Remove use of utf8 iterators and use directly utf8 functions 2016-03-25 02:45:56 +03:00			`if (it == str.end())`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`return false;`
			`}`
Take subsequence matches index when sorting RankedMatch 2016-02-29 02:05:51 +03:00			`index_sum += index++;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`}`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`return true;`
Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`}`

Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00			`template<typename TestFunc>`
			`RankedMatch::RankedMatch(StringView candidate, StringView query, TestFunc func)`
Move more logic into RankedMatch 2015-10-28 00:25:18 +03:00			`{`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`if (candidate.empty() or query.length() > candidate.length())`
			`return;`

			`if (query.empty())`
Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00			`m_candidate = candidate;`
			`else if (func() and subsequence_match_smart_case(candidate, query, m_match_index_sum))`
Move more logic into RankedMatch 2015-10-28 00:25:18 +03:00			`{`
			`m_candidate = candidate;`
Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00
			`m_first_char_match = smartcase_eq(query[0], candidate[0]);`
			`m_word_boundary_match_count = count_word_boundaries_match(candidate, query);`
			`m_only_word_boundary = m_word_boundary_match_count == query.length();`
			`m_prefix = std::equal(query.begin(), query.end(), candidate.begin(), smartcase_eq);`
Move more logic into RankedMatch 2015-10-28 00:25:18 +03:00			`}`
Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00			`}`
Move more logic into RankedMatch 2015-10-28 00:25:18 +03:00
Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00			`RankedMatch::RankedMatch(StringView candidate, UsedLetters candidate_letters,`
			`StringView query, UsedLetters query_letters)`
			`: RankedMatch{candidate, query, [&] {`
			`return matches(to_lower(query_letters), to_lower(candidate_letters)) and`
			`matches(query_letters & upper_mask, candidate_letters & upper_mask);`
			`}} {}`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00

Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00			`RankedMatch::RankedMatch(StringView candidate, StringView query)`
			`: RankedMatch{candidate, query, [] { return true; }}`
			`{`
Move more logic into RankedMatch 2015-10-28 00:25:18 +03:00			`}`

			`bool RankedMatch::operator<(const RankedMatch& other) const`
			`{`
Fix uninitialized value in RankedMatch 2016-03-25 01:04:56 +03:00			`kak_assert((bool)*this and (bool)other);`

Tweak RankedMatch ordering, give priority to prefix matches 2016-03-02 16:30:54 +03:00			`if (m_prefix != other.m_prefix)`
			`return m_prefix;`

Take subsequence matches index when sorting RankedMatch 2016-02-29 02:05:51 +03:00			`if (m_first_char_match != other.m_first_char_match)`
			`return m_first_char_match;`

Tweak RankedMatch::operator< 2016-02-23 02:07:29 +03:00			`if (m_only_word_boundary and other.m_only_word_boundary)`
			`{`
			`if (m_word_boundary_match_count != other.m_word_boundary_match_count)`
			`return m_word_boundary_match_count > other.m_word_boundary_match_count;`
			`}`
			`else if (m_only_word_boundary or other.m_only_word_boundary)`
			`return m_only_word_boundary;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00
			`if (m_word_boundary_match_count != other.m_word_boundary_match_count)`
			`return m_word_boundary_match_count > other.m_word_boundary_match_count;`

Take subsequence matches index when sorting RankedMatch 2016-02-29 02:05:51 +03:00			`if (m_match_index_sum != other.m_match_index_sum)`
			`return m_match_index_sum < other.m_match_index_sum;`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00
Use manual lexicographic comparison in RankedMatch::operator< 2016-03-28 16:44:49 +03:00			`for (Utf8It it1{m_candidate.begin(), m_candidate}, it2{other.m_candidate.begin(), other.m_candidate};`
			`it1 != m_candidate.end() and it2 != other.m_candidate.end(); ++it1, ++it2)`
			`{`
			`const auto cp1 = *it1, cp2 = *it2;`
			`if (cp1 != cp2)`
			`{`
			`const bool low1 = islower(cp1), low2 = islower(cp2);`
			`return low1 == low2 ? cp1 < cp2 : low2;`
			`}`
			`}`

			`return false;`
Move more logic into RankedMatch 2015-10-28 00:25:18 +03:00			`}`

Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`UnitTest test_ranked_match{[] {`
			`kak_assert(count_word_boundaries_match("run_all_tests", "rat") == 3);`
Fix count_word_boundaries_match 2016-02-18 02:05:08 +03:00			`kak_assert(count_word_boundaries_match("run_all_tests", "at") == 2);`
			`kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "wm") == 2);`
			`kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cobm") == 3);`
			`kak_assert(count_word_boundaries_match("countWordBoundariesMatch", "cWBM") == 4);`
Use an heuristic based match ranking algorithm inspired by what YouCompleteMe does 2015-10-29 16:36:30 +03:00			`}};`

Move UsedLetters with RankedMatch 2016-03-25 23:35:57 +03:00			`UnitTest test_used_letters{[]()`
			`{`
			`kak_assert(used_letters("abcd") == to_lower(used_letters("abcdABCD")));`
			`}};`

Extract WordDB::RankedWord as RankedMatch in its own file 2015-10-22 21:49:08 +03:00			`}`