kakoune/src/utf8.hh

#ifndef utf8_hh_INCLUDED
#define utf8_hh_INCLUDED

#include "assert.hh"
#include "unicode.hh"
#include "units.hh"

#include <cstddef>

namespace Kakoune
{

namespace utf8
{

// reads the byte pointed by it, moves it one step forward and
// returns the byte that was read
template<typename Iterator>
[[gnu::always_inline]]
inline char read(Iterator& it)
{
    const char byte = *it;
    ++it;
    return byte;
}

// returns an iterator to the first byte of the character following the
// one pointed by it, without ever going past end
template<typename Iterator>
Iterator next(Iterator it, const Iterator& end)
{
    if (it == end)
        return it;

    // a byte whose high bit is clear is a whole character by itself
    if (not (read(it) & 0x80))
        return it;

    // otherwise step over the continuation bytes (10xxxxxx) that follow
    for (; it != end; ++it)
    {
        if ((*it & 0xC0) != 0x80)
            break;
    }
    return it;
}

// returns its parameter when it already points to a character first
// byte (or equals end), otherwise returns the first byte of the next
// character, stopping at end
template<typename Iterator>
Iterator finish(Iterator it, const Iterator& end)
{
    for (; it != end; ++it)
    {
        // continuation bytes look like 10xxxxxx
        const bool is_continuation = (*it & 0xC0) == 0x80;
        if (not is_continuation)
            break;
    }
    return it;
}

// returns an iterator to the first byte of the character located
// before it, without ever going before begin
template<typename Iterator>
Iterator previous(Iterator it, const Iterator& begin)
{
    while (it != begin)
    {
        --it;
        // anything that is not a continuation byte (10xxxxxx) starts
        // a character, so we can stop there
        if ((*it & 0xC0) != 0x80)
            break;
    }
    return it;
}

// returns an iterator to the first byte of the dth character after
// the one pointed by it, or before it when d is negative.
// end is the bound in the direction of travel: movement stops as soon
// as it is reached, and for a negative d it is handed to
// utf8::previous as the begin of the sequence.
template<typename Iterator>
Iterator advance(Iterator it, const Iterator& end, CharCount d)
{
    const bool backward = d < 0;
    if (not backward)
    {
        while (it != end and d--)
            it = utf8::next(it, end);
        return it;
    }

    while (it != end and d++)
        it = utf8::previous(it, end);
    return it;
}

// tells whether c is the first byte of a character (single byte or
// multibyte), that is any byte that is not a 10xxxxxx continuation byte
inline bool is_character_start(char c)
{
    const bool is_continuation = (c & 0xC0) == 0x80;
    return not is_continuation;
}

// returns how many characters lie between begin and end, counted as
// the number of character first bytes found in that range
template<typename Iterator>
CharCount distance(Iterator begin, const Iterator& end)
{
    CharCount count = 0;
    for (; begin != end; ++begin)
    {
        if (is_character_start(*begin))
            ++count;
    }
    return count;
}

// returns an iterator to the first byte of the character that it
// points into, walking backward at most down to begin.
// it is dereferenced unless it equals begin.
template<typename Iterator>
Iterator character_start(Iterator it, const Iterator& begin)
{
    for (; it != begin; --it)
    {
        if (is_character_start(*it))
            break;
    }
    return it;
}

// policies invoked by the decoding functions when they meet invalid
// or truncated utf8; each one receives a codepoint and returns the
// codepoint to use
namespace InvalidPolicy
{

// fails a kak_assert, then hands the codepoint back unchanged
struct Assert
{
    Codepoint operator()(Codepoint cp) const
    {
        kak_assert(false);
        return cp;
    }
};

// silently hands the codepoint back unchanged
struct Pass
{
    Codepoint operator()(Codepoint cp) const
    {
        return cp;
    }
};

}

// decodes the character whose first byte is pointed by it and returns
// its codepoint, leaving it just after the last byte consumed.
// Whenever the sequence stops early at end, or the first byte is not a
// valid leading byte, the result comes from InvalidPolicy, which is
// given -1 (nothing to read), the leading byte, or the bits decoded
// so far.
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
         typename Iterator>
Codepoint read_codepoint(Iterator& it, const Iterator& end)
{
    if (it == end)
        return InvalidPolicy{}(-1);

    // consumes one byte and keeps the 6 payload bits of a 10xxxxxx byte
    auto tail = [&it] { return read(it) & 0x3F; };

    // According to rfc3629, UTF-8 allows only up to 4 bytes.
    // (21 bits codepoint)
    unsigned char lead = read(it);
    if ((lead & 0x80) == 0) // 0xxxxxxx
        return lead;

    // every remaining form needs at least one more byte
    if (it == end)
        return InvalidPolicy{}(lead);

    if ((lead & 0xE0) == 0xC0) // 110xxxxx
        return ((lead & 0x1F) << 6) | tail();

    if ((lead & 0xF0) == 0xE0) // 1110xxxx
    {
        Codepoint cp = ((lead & 0x0F) << 12) | (tail() << 6);
        if (it == end)
            return InvalidPolicy{}(cp);
        return cp | tail();
    }

    if ((lead & 0xF8) == 0xF0) // 11110xxx
    {
        // bit 3 of lead is known to be clear here, so masking with
        // 0x0F keeps exactly the 3 payload bits
        Codepoint cp = ((lead & 0x0F) << 18) | (tail() << 12);
        if (it == end)
            return InvalidPolicy{}(cp);
        cp |= tail() << 6;
        if (it == end)
            return InvalidPolicy{}(cp);
        return cp | tail();
    }

    // continuation byte or 11111xxx used as a leading byte
    return InvalidPolicy{}(lead);
}

// returns the codepoint of the character whose first byte is pointed
// by it, without modifying the caller's iterator (it is taken by copy).
// InvalidPolicy is forwarded to read_codepoint so that the policy
// requested by the caller is the one applied to invalid input.
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
         typename Iterator>
Codepoint codepoint(Iterator it, const Iterator& end)
{
    return read_codepoint<InvalidPolicy>(it, end);
}

// returns the length in bytes of the utf8 sequence introduced by the
// leading byte byte. When byte is not a valid leading byte,
// InvalidPolicy is invoked with it and 1 is returned.
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
ByteCount codepoint_size(char byte)
{
    // work on the unsigned value so that the policy receives the byte
    // itself (0x80..0xFF) rather than a sign extended codepoint on
    // platforms where char is signed, matching what read_codepoint
    // passes to its policy
    const unsigned char lead = static_cast<unsigned char>(byte);

    if (not (lead & 0x80)) // 0xxxxxxx
        return 1;
    else if ((lead & 0xE0) == 0xC0) // 110xxxxx
        return 2;
    else if ((lead & 0xF0) == 0xE0) // 1110xxxx
        return 3;
    else if ((lead & 0xF8) == 0xF0) // 11110xxx
        return 4;
    else
    {
        // the value returned by the policy is not used here
        InvalidPolicy{}(lead);
        return 1;
    }
}

// thrown when a codepoint is above 0x10FFFF and so cannot be encoded
struct invalid_codepoint
{
};

// returns the number of bytes needed to encode cp in utf8,
// throws invalid_codepoint when cp is above 0x10FFFF
inline ByteCount codepoint_size(Codepoint cp)
{
    if (cp <= 0x7F)
        return 1;
    if (cp <= 0x7FF)
        return 2;
    if (cp <= 0xFFFF)
        return 3;
    if (cp <= 0x10FFFF)
        return 4;

    throw invalid_codepoint{};
}

// writes the utf8 encoding of cp through the output iterator it, one
// byte per assignment; throws invalid_codepoint, before writing
// anything, when cp is above 0x10FFFF
template<typename OutputIterator>
void dump(OutputIterator&& it, Codepoint cp)
{
    if (cp > 0x10FFFF)
        throw invalid_codepoint{};

    if (cp <= 0x7F) // 0xxxxxxx
    {
        *it++ = cp;
        return;
    }

    if (cp <= 0x7FF) // 110xxxxx 10xxxxxx
    {
        *it++ = 0xC0 | (cp >> 6);
        *it++ = 0x80 | (cp & 0x3F);
        return;
    }

    if (cp <= 0xFFFF) // 1110xxxx 10xxxxxx 10xxxxxx
    {
        *it++ = 0xE0 | (cp >> 12);
        *it++ = 0x80 | ((cp >> 6) & 0x3F);
        *it++ = 0x80 | (cp & 0x3F);
        return;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    *it++ = 0xF0 | (cp >> 18);
    *it++ = 0x80 | ((cp >> 12) & 0x3F);
    *it++ = 0x80 | ((cp >> 6)  & 0x3F);
    *it++ = 0x80 | (cp & 0x3F);
}

}

}

#endif // utf8_hh_INCLUDED
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`#ifndef utf8_hh_INCLUDED`
			`#define utf8_hh_INCLUDED`

sort includes directives 2013-04-09 22:05:40 +04:00			`#include "assert.hh"`
add a unicode.hh header for Codepoint related functions, s/utf8::Codepoint/Codepoint/ 2012-10-09 21:15:05 +04:00			`#include "unicode.hh"`
utf8: use CharCount instead of size_t 2012-10-27 15:26:40 +04:00			`#include "units.hh"`
sort includes directives 2013-04-09 22:05:40 +04:00
			`#include <cstddef>`
utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 16:29:37 +04:00
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`namespace Kakoune`
			`{`

			`namespace utf8`
			`{`

Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`template<typename Iterator>`
			`[[gnu::always_inline]]`
			`inline char read(Iterator& it) { char c = *it; ++it; return c; }`

add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`// returns an iterator to next character first byte`
			`template<typename Iterator>`
Avoid unneeded iterator copies in utf8.hh 2015-09-23 21:48:15 +03:00			`Iterator next(Iterator it, const Iterator& end)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`if (it != end and read(it) & 0x80)`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`while (it != end and (*(it) & 0xC0) == 0x80)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`++it;`
			`return it;`
			`}`

			`// returns it's parameter if it points to a character first byte,`
			`// or else returns next character first byte`
			`template<typename Iterator>`
Avoid unneeded iterator copies in utf8.hh 2015-09-23 21:48:15 +03:00			`Iterator finish(Iterator it, const Iterator& end)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`while (it != end and (*(it) & 0xC0) == 0x80)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`++it;`
			`return it;`
			`}`

			`// returns an iterator to the previous character first byte`
			`template<typename Iterator>`
Avoid unneeded iterator copies in utf8.hh 2015-09-23 21:48:15 +03:00			`Iterator previous(Iterator it, const Iterator& begin)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`while (it != begin and (*(--it) & 0xC0) == 0x80)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`;`
			`return it;`
			`}`

			`// returns an iterator pointing to the first byte of the`
			`// dth character after (or before if d < 0) the character`
			`// pointed by it`
utf8: use CharCount instead of size_t 2012-10-27 15:26:40 +04:00			`template<typename Iterator>`
Avoid unneeded iterator copies in utf8.hh 2015-09-23 21:48:15 +03:00			`Iterator advance(Iterator it, const Iterator& end, CharCount d)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
			`if (d < 0)`
			`{`
use ByteCount instead of CharCount when we are really counting bytes (that is most of the time when we are not concerned with displaying) 2012-10-11 02:41:48 +04:00			`while (it != end and d++)`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`it = utf8::previous(it, end);`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`}`
			`else`
			`{`
use ByteCount instead of CharCount when we are really counting bytes (that is most of the time when we are not concerned with displaying) 2012-10-11 02:41:48 +04:00			`while (it != end and d--)`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`it = utf8::next(it, end);`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`}`
			`return it;`
			`}`

Minor additional cleanup in utf8.hh 2015-09-24 00:09:37 +03:00			`// return true if it points to the first byte of a (either single or`
			`// multibyte) character`
			`inline bool is_character_start(char c)`
			`{`
			`return (c & 0xC0) != 0x80;`
			`}`

add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`// returns the character count between begin and end`
			`template<typename Iterator>`
Avoid unneeded iterator copies in utf8.hh 2015-09-23 21:48:15 +03:00			`CharCount distance(Iterator begin, const Iterator& end)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
utf8: use CharCount instead of size_t 2012-10-27 15:26:40 +04:00			`CharCount dist = 0;`
Minor additional cleanup in utf8.hh 2015-09-24 00:09:37 +03:00
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`while (begin != end)`
			`{`
Minor additional cleanup in utf8.hh 2015-09-24 00:09:37 +03:00			`if (is_character_start(*begin++))`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`++dist;`
			`}`
Return something in utf8::distance, thanks again gcc for letting this work 2012-10-11 02:39:17 +04:00			`return dist;`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`}`

Add utf8::character_start function 2013-02-26 17:05:51 +04:00			`// returns an iterator to the first byte of the character it is into`
			`template<typename Iterator>`
Avoid unneeded iterator copies in utf8.hh 2015-09-23 21:48:15 +03:00			`Iterator character_start(Iterator it, const Iterator& begin)`
Add utf8::character_start function 2013-02-26 17:05:51 +04:00			`{`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`while (it != begin and not is_character_start(*it))`
Add utf8::character_start function 2013-02-26 17:05:51 +04:00			`--it;`
			`return it;`
			`}`

utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`namespace InvalidPolicy`
utf8::codepoint: configurable invalid byte policy 2012-10-13 20:31:29 +04:00			`{`

utf8: replace InvalidBytePolicy::Throw with InvalidBytePolicy::Assert 2012-10-17 19:01:51 +04:00			`struct Assert`
utf8::codepoint: configurable invalid byte policy 2012-10-13 20:31:29 +04:00			`{`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; }`
utf8::codepoint: configurable invalid byte policy 2012-10-13 20:31:29 +04:00			`};`

			`struct Pass`
			`{`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`Codepoint operator()(Codepoint cp) const { return cp; }`
utf8::codepoint: configurable invalid byte policy 2012-10-13 20:31:29 +04:00			`};`

			`}`

add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`// returns the codepoint of the character whose first byte`
			`// is pointed by it`
Use Pass as default policy for invalid utf8 avoid asserting on that 2014-10-13 22:54:40 +04:00			`template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,`
utf8::codepoint: configurable invalid byte policy 2012-10-13 20:31:29 +04:00			`typename Iterator>`
Add utf8::read_codepoint that both gets the codepoint and advance iterator 2015-09-25 01:00:47 +03:00			`Codepoint read_codepoint(Iterator& it, const Iterator& end)`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`if (it == end)`
			`return InvalidPolicy{}(-1);`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`// According to rfc3629, UTF-8 allows only up to 4 bytes.`
			`// (21 bits codepoint)`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`unsigned char byte = read(it);`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`if (not (byte & 0x80)) // 0xxxxxxx`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`return byte;`

			`if (it == end)`
			`return InvalidPolicy{}(byte);`

			`if ((byte & 0xE0) == 0xC0) // 110xxxxx`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`return ((byte & 0x1F) << 6) \| (read(it) & 0x3F);`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00
			`if ((byte & 0xF0) == 0xE0) // 1110xxxx`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`Codepoint cp = ((byte & 0x0F) << 12) \| ((read(it) & 0x3F) << 6);`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`if (it == end)`
			`return InvalidPolicy{}(cp);`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`return cp \| (read(it) & 0x3F);`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`}`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00
			`if ((byte & 0xF8) == 0xF0) // 11110xxx`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`{`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`Codepoint cp = ((byte & 0x0F) << 18) \| ((read(it) & 0x3F) << 12);`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`if (it == end)`
			`return InvalidPolicy{}(cp);`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`cp \|= (read(it) & 0x3F) << 6;`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`if (it == end)`
			`return InvalidPolicy{}(cp);`
Avoid (*it++) pattern in utf8.hh 2015-09-25 15:19:21 +03:00			`return cp \| (read(it) & 0x3F);`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`}`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`return InvalidPolicy{}(byte);`
add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`}`

Add utf8::read_codepoint that both gets the codepoint and advance iterator 2015-09-25 01:00:47 +03:00			`template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,`
			`typename Iterator>`
			`Codepoint codepoint(Iterator it, const Iterator& end)`
			`{`
			`return read_codepoint(it, end);`
			`}`

Use Pass as default policy for invalid utf8 avoid asserting on that 2014-10-13 22:54:40 +04:00			`template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>`
utf8: use end of sequence iterators for more security 2014-07-03 00:14:01 +04:00			`ByteCount codepoint_size(char byte)`
Add utf8::codepoint_size function 2013-05-30 20:49:50 +04:00			`{`
			`if (not (byte & 0x80)) // 0xxxxxxx`
			`return 1;`
			`else if ((byte & 0xE0) == 0xC0) // 110xxxxx`
			`return 2;`
			`else if ((byte & 0xF0) == 0xE0) // 1110xxxx`
			`return 3;`
			`else if ((byte & 0xF8) == 0xF0) // 11110xxx`
			`return 4;`
			`else`
			`{`
			`InvalidPolicy{}(byte);`
Use Pass as default policy for invalid utf8 avoid asserting on that 2014-10-13 22:54:40 +04:00			`return 1;`
Add utf8::codepoint_size function 2013-05-30 20:49:50 +04:00			`}`
			`}`

utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 16:29:37 +04:00			`struct invalid_codepoint{};`

More string usage cleanup 2016-02-05 12:13:07 +03:00			`inline ByteCount codepoint_size(Codepoint cp)`
			`{`
			`if (cp <= 0x7F)`
			`return 1;`
			`else if (cp <= 0x7FF)`
			`return 2;`
			`else if (cp <= 0xFFFF)`
			`return 3;`
			`else if (cp <= 0x10FFFF)`
			`return 4;`
			`else`
			`throw invalid_codepoint{};`
			`}`

utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 16:29:37 +04:00			`template<typename OutputIterator>`
utf8::dump uses a copy of the output iterator instead of a reference 2013-02-28 00:36:28 +04:00			`void dump(OutputIterator&& it, Codepoint cp)`
utf8: add dump(OutputIterator& it, Codepoint cp) 2012-10-09 16:29:37 +04:00			`{`
			`if (cp <= 0x7F)`
			`*it++ = cp;`
			`else if (cp <= 0x7FF)`
			`{`
			`*it++ = 0xC0 \| (cp >> 6);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else if (cp <= 0xFFFF)`
			`{`
			`*it++ = 0xE0 \| (cp >> 12);`
			`*it++ = 0x80 \| ((cp >> 6) & 0x3F);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else if (cp <= 0x10FFFF)`
			`{`
			`*it++ = 0xF0 \| (cp >> 18);`
			`*it++ = 0x80 \| ((cp >> 12) & 0x3F);`
			`*it++ = 0x80 \| ((cp >> 6) & 0x3F);`
			`*it++ = 0x80 \| (cp & 0x3F);`
			`}`
			`else`
			`throw invalid_codepoint{};`
			`}`

add utf8 helpers in utf8.hh 2012-10-08 16:25:05 +04:00			`}`

			`}`

			`#endif // utf8_hh_INCLUDED`