diff --git a/gen-wcwidth.py b/gen-wcwidth.py index 8617657d5..a9c6c13da 100755 --- a/gen-wcwidth.py +++ b/gen-wcwidth.py @@ -512,7 +512,7 @@ def add(p: Callable, comment: str, chars_: Union[Set[int], FrozenSet[int]], ret: p('\t}') p('\treturn 1;\n}') - p('static bool\nis_emoji_presentation_base(uint32_t code) {') + p('static inline bool\nis_emoji_presentation_base(uint32_t code) {') p('\tswitch(code) {') for spec in get_ranges(list(emoji_presentation_bases)): write_case(spec, p) diff --git a/kitty/data-types.c b/kitty/data-types.c index 8829b7fc7..2ff656c1e 100644 --- a/kitty/data-types.c +++ b/kitty/data-types.c @@ -13,8 +13,8 @@ #endif #include "data-types.h" #include "control-codes.h" -#include "unicode-data.h" #include "wcwidth-std.h" +#include "wcswidth.h" #include "modes.h" #include #include @@ -150,115 +150,6 @@ close_tty(PyObject *self UNUSED, PyObject *args) { #undef TTY_ARGS -static inline bool is_flag_pair(char_type a, char_type b) { - return is_flag_codepoint(a) && is_flag_codepoint(b); -} - - -static PyObject* -wcswidth_impl(PyObject UNUSED *self, PyObject *str) { - if (PyUnicode_READY(str) != 0) return NULL; - int kind = PyUnicode_KIND(str); - void *data = PyUnicode_DATA(str); - Py_ssize_t len = PyUnicode_GET_LENGTH(str), i; - unsigned long ans = 0; - char_type prev_ch = 0; - int prev_width = 0; - typedef enum {NORMAL, IN_CSI, FLAG_PAIR_STARTED, IN_ST_TERMINATED} WCSState; - WCSState state = NORMAL; - for (i = 0; i < len; i++) { - char_type ch = PyUnicode_READ(kind, data, i); - switch(state) { - case IN_CSI: { - if (0x40 <= ch && ch <= 0x7e) state = NORMAL; - } continue; - case IN_ST_TERMINATED: { - if (ch == 0x9c) state = NORMAL; - else if (ch == 0x1b && i + 1 < len && PyUnicode_READ(kind, data, i + 1) == '\\') { i++; state = NORMAL; } - } continue; - - case FLAG_PAIR_STARTED: { - state = NORMAL; - if (is_flag_pair(prev_ch, ch)) break; - } /* fallthrough */ - - case NORMAL: { - switch(ch) { - case 0x1b: { - prev_width = 0; - if (i + 1 < len) { - switch (PyUnicode_READ(kind, data, i + 1)) { - case '[': - state = IN_CSI; i++; continue; - case 'P': - case ']': - case 'X': - case '^': - case '_': - state = IN_ST_TERMINATED; i++; continue; - case 'D': - case 'E': - case 'H': - case 'M': - case 'N': - case 'O': - case 'Z': - case '6': - case '7': - case '8': - case '9': - case '=': - case '>': - case 'F': - case 'c': - case 'l': - case 'm': - case 'n': - case 'o': - case '|': - case '}': - case '~': - i++; continue; - } - } - } break; - - case 0xfe0f: { - if (is_emoji_presentation_base(prev_ch) && prev_width == 1) { - ans += 1; - prev_width = 2; - } else prev_width = 0; - } break; - - case 0xfe0e: { - if (is_emoji_presentation_base(prev_ch) && prev_width == 2) { - ans -= 1; - prev_width = 1; - } else prev_width = 0; - } break; - - default: { - if (is_flag_codepoint(ch)) state = FLAG_PAIR_STARTED; - int w = wcwidth_std(ch); - switch(w) { - case -1: - case 0: - prev_width = 0; break; - case 2: - prev_width = 2; break; - default: - prev_width = 1; break; - } - ans += prev_width; - } break; - } break; // switch(ch) - } break; // case NORMAL - } // switch(state) - prev_ch = ch; - } - return PyLong_FromUnsignedLong(ans); -} - static PyObject* wcwidth_wrap(PyObject UNUSED *self, PyObject *chr) { return PyLong_FromLong(wcwidth_std(PyLong_AsLong(chr))); @@ -267,7 +158,7 @@ wcwidth_wrap(PyObject UNUSED *self, PyObject *chr) { static PyMethodDef module_methods[] = { {"wcwidth", (PyCFunction)wcwidth_wrap, METH_O, ""}, - {"wcswidth", (PyCFunction)wcswidth_impl, METH_O, ""}, + {"wcswidth", (PyCFunction)wcswidth_std, METH_O, ""}, {"open_tty", open_tty, METH_VARARGS, ""}, {"normal_tty", normal_tty, METH_VARARGS, ""}, {"raw_tty", raw_tty, METH_VARARGS, ""}, diff --git a/kitty/wcswidth.c b/kitty/wcswidth.c new file mode 100644 index 000000000..940cc1a97 --- /dev/null +++ b/kitty/wcswidth.c @@ -0,0 +1,135 @@ +/* + * wcswidth.c + * Copyright (C) 2020 Kovid Goyal + * + * Distributed under terms of the GPL3 license. + */ + +#include "wcwidth-std.h" +#include "wcswidth.h" +#include "unicode-data.h" + +void +initialize_wcs_state(WCSState *state) { + zero_at_ptr(state); +} + +static inline bool +is_flag_pair(char_type a, char_type b) { + return is_flag_codepoint(a) && is_flag_codepoint(b); +} + +int +wcswidth_step(WCSState *state, const char_type ch) { + int ans = 0; + switch (state->parser_state) { + case IN_CSI: { + state->prev_width = 0; + if (0x40 <= ch && ch <= 0x7e) state->parser_state = NORMAL; + } break; + case IN_ST_TERMINATED: { + state->prev_width = 0; + if (ch == 0x9c || (ch == '\\' && state->prev_ch == 0x1b)) state->parser_state = NORMAL; + } break; + + case FLAG_PAIR_STARTED: { + state->parser_state = NORMAL; + if (is_flag_pair(state->prev_ch, ch)) break; + } /* fallthrough */ + + case NORMAL: { + switch(ch) { + case 0x1b: { + state->prev_width = 0; + state->parser_state = IN_ESC; + } break; + case 0xfe0f: { + if (is_emoji_presentation_base(state->prev_ch) && state->prev_width == 1) { + ans += 1; + state->prev_width = 2; + } else state->prev_width = 0; + } break; + + case 0xfe0e: { + if (is_emoji_presentation_base(state->prev_ch) && state->prev_width == 2) { + ans -= 1; + state->prev_width = 1; + } else state->prev_width = 0; + } break; + + default: { + if (is_flag_codepoint(ch)) state->parser_state = FLAG_PAIR_STARTED; + int w = wcwidth_std(ch); + switch(w) { + case -1: + case 0: + state->prev_width = 0; break; + case 2: + state->prev_width = 2; break; + default: + state->prev_width = 1; break; + } + ans += state->prev_width; + } break; + } break; // switch(ch) + } break; // case NORMAL + + case IN_ESC: + switch (ch) { + case '[': + state->parser_state = IN_CSI; break; + case 'P': + case ']': + case 'X': + case '^': + case '_': + state->parser_state = IN_ST_TERMINATED; break; + case 'D': + case 'E': + case 'H': + case 'M': + case 'N': + case 'O': + case 'Z': + case '6': + case '7': + case '8': + case '9': + case '=': + case '>': + case 'F': + case 'c': + case 'l': + case 'm': + case 'n': + case 'o': + case '|': + case '}': + case '~': + break; + default: + state->prev_ch = 0x1b; + state->prev_width = 0; + state->parser_state = NORMAL; + return wcswidth_step(state, ch); + } break; + } + state->prev_ch = ch; + return ans; +} + +PyObject * +wcswidth_std(PyObject UNUSED *self, PyObject *str) { + if (PyUnicode_READY(str) != 0) return NULL; + int kind = PyUnicode_KIND(str); + void *data = PyUnicode_DATA(str); + Py_ssize_t len = PyUnicode_GET_LENGTH(str), i; + WCSState state; + initialize_wcs_state(&state); + size_t ans = 0; + for (i = 0; i < len; i++) { + char_type ch = PyUnicode_READ(kind, data, i); + ans += wcswidth_step(&state, ch); + } + return PyLong_FromSize_t(ans); +} diff --git a/kitty/wcswidth.h b/kitty/wcswidth.h new file mode 100644 index 000000000..05bea98bb --- /dev/null +++ b/kitty/wcswidth.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2020 Kovid Goyal + * + * Distributed under terms of the GPL3 license. + */ + +#pragma once + +#include "data-types.h" + +typedef enum {NORMAL, IN_ESC, IN_CSI, FLAG_PAIR_STARTED, IN_ST_TERMINATED} WCSParserState; + +typedef struct { + char_type prev_ch; + int prev_width; + WCSParserState parser_state; +} WCSState; + + +void initialize_wcs_state(WCSState *state); +int wcswidth_step(WCSState *state, const char_type ch); +PyObject * wcswidth_std(PyObject UNUSED *self, PyObject *str); diff --git a/kitty/wcwidth-std.h b/kitty/wcwidth-std.h index e2d379461..32c0629a0 100644 --- a/kitty/wcwidth-std.h +++ b/kitty/wcwidth-std.h @@ -2848,7 +2848,7 @@ wcwidth_std(int32_t code) { } return 1; } -static bool +static inline bool is_emoji_presentation_base(uint32_t code) { switch(code) { case 0x23: diff --git a/kitty_tests/datatypes.py b/kitty_tests/datatypes.py index b0d09238d..68c744bdc 100644 --- a/kitty_tests/datatypes.py +++ b/kitty_tests/datatypes.py @@ -357,6 +357,7 @@ def w(x): self.ae(wcswidth('a\033[2mb'), 2) self.ae(wcswidth('\033a\033[2mb'), 2) self.ae(wcswidth('a\033]8;id=moo;https://foo\033\\a'), 2) + self.ae(wcswidth('a\033x'), 2) self.ae(tuple(map(w, 'a1\0コニチ ✔')), (1, 1, 0, 2, 2, 2, 1, 1)) self.ae(wcswidth('\u2716\u2716\ufe0f\U0001f337'), 5) self.ae(wcswidth('\u25b6\ufe0f'), 2)