mirror of
https://github.com/kovidgoyal/kitty.git
synced 2024-11-11 01:28:19 +03:00
Simplify utf8 parser func
Also show a replacement character (U+FFFD) for incomplete UTF-8 sequences that are interrupted by an ESC character
This commit is contained in:
parent
72e73f2f81
commit
0ed1c6f840
@ -674,10 +674,28 @@ ensure_cursor_not_on_wide_char_trailer_for_insert(Screen *self, text_loop_state
|
||||
static void
|
||||
draw_text_loop(Screen *self, const uint32_t *chars, size_t num_chars, text_loop_state *s) {
|
||||
init_text_loop_line(self, s);
|
||||
if (chars[0] < 0x7f || !is_combining_char(chars[0])) ensure_cursor_not_on_wide_char_trailer_for_insert(self, s);
|
||||
if ((' ' >= chars[0] && chars[0] < 0x7f) || !is_combining_char(chars[0])) ensure_cursor_not_on_wide_char_trailer_for_insert(self, s);
|
||||
for (size_t i = 0; i < num_chars; i++) {
|
||||
uint32_t ch = chars[i];
|
||||
if (ch < ' ') continue;
|
||||
if (ch < ' ') {
|
||||
switch (ch) {
|
||||
case BEL:
|
||||
screen_bell(self); break;
|
||||
case BS:
|
||||
screen_backspace(self); break;
|
||||
case HT:
|
||||
screen_tab(self); break;
|
||||
case LF:
|
||||
case VT:
|
||||
case FF:
|
||||
screen_linefeed(self); init_text_loop_line(self, s); break;
|
||||
case CR:
|
||||
screen_carriage_return(self); break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
int char_width = 1;
|
||||
if (ch > 0x7f) { // not printable ASCII
|
||||
if (is_ignored_char(ch)) continue;
|
||||
|
@ -34,27 +34,26 @@ find_either_of_two_bytes(const uint8_t *haystack, const size_t sz, const uint8_t
|
||||
|
||||
// UTF-8 {{{
|
||||
|
||||
static unsigned
|
||||
static bool
|
||||
utf8_decode_to_sentinel_scalar(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) {
|
||||
unsigned num_consumed = 0, num_output = 0;
|
||||
while (num_consumed < src_sz && num_output < arraysz(d->output)) {
|
||||
const uint8_t ch = src[num_consumed++];
|
||||
if (ch < ' ') {
|
||||
d->output_sz = 0; d->num_consumed = 0;
|
||||
while (d->num_consumed < src_sz && d->output_sz < arraysz(d->output)) {
|
||||
const uint8_t ch = src[d->num_consumed++];
|
||||
if (ch == sentinel) {
|
||||
if (d->state.cur != UTF8_ACCEPT) d->output[d->output_sz++] = 0xfffd;
|
||||
zero_at_ptr(&d->state);
|
||||
if (num_output) { d->output_chars_callback(d->callback_data, d->output, num_output); num_output = 0; }
|
||||
d->control_byte_callback(d->callback_data, ch);
|
||||
if (ch == sentinel) break;
|
||||
return true;
|
||||
} else {
|
||||
switch(decode_utf8(&d->state.cur, &d->state.codep, ch)) {
|
||||
case UTF8_ACCEPT:
|
||||
d->output[num_output++] = d->state.codep;
|
||||
d->output[d->output_sz++] = d->state.codep;
|
||||
break;
|
||||
case UTF8_REJECT: {
|
||||
const bool prev_was_accept = d->state.prev == UTF8_ACCEPT;
|
||||
zero_at_ptr(&d->state);
|
||||
d->output[num_output++] = 0xfffd;
|
||||
if (!prev_was_accept) {
|
||||
num_consumed--;
|
||||
d->output[d->output_sz++] = 0xfffd;
|
||||
if (!prev_was_accept && d->num_consumed) {
|
||||
d->num_consumed--;
|
||||
continue; // so that prev is correct
|
||||
}
|
||||
} break;
|
||||
@ -62,13 +61,12 @@ utf8_decode_to_sentinel_scalar(UTF8Decoder *d, const uint8_t *src, const size_t
|
||||
}
|
||||
d->state.prev = d->state.cur;
|
||||
}
|
||||
if (num_output) d->output_chars_callback(d->callback_data, d->output, num_output);
|
||||
return num_consumed;
|
||||
return false;
|
||||
}
|
||||
|
||||
static unsigned (*utf8_decode_to_sentinel_impl)(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) = utf8_decode_to_sentinel_scalar;
|
||||
static bool (*utf8_decode_to_sentinel_impl)(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) = utf8_decode_to_sentinel_scalar;
|
||||
|
||||
unsigned
|
||||
bool
|
||||
utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) {
|
||||
return utf8_decode_to_sentinel_impl(d, src, src_sz, sentinel);
|
||||
}
|
||||
@ -76,22 +74,6 @@ utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz,
|
||||
// }}}
|
||||
|
||||
// Boilerplate {{{
|
||||
static void
|
||||
test_control_byte_callback(void *l, uint8_t ch) {
|
||||
if (!PyErr_Occurred()) {
|
||||
RAII_PyObject(c, PyLong_FromUnsignedLong((unsigned long)ch));
|
||||
if (c) PyList_Append((PyObject*)l, c);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
test_output_chars_callback(void *l, const uint32_t *chars, unsigned sz) {
|
||||
if (!PyErr_Occurred()) {
|
||||
RAII_PyObject(c, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, chars, (Py_ssize_t)sz));
|
||||
if (c) PyList_Append((PyObject*)l, c);
|
||||
}
|
||||
}
|
||||
|
||||
static PyObject*
|
||||
test_utf8_decode_to_sentinel(PyObject *self UNUSED, PyObject *args) {
|
||||
const uint8_t *src; Py_ssize_t src_sz;
|
||||
@ -99,24 +81,20 @@ test_utf8_decode_to_sentinel(PyObject *self UNUSED, PyObject *args) {
|
||||
static UTF8Decoder d = {0};
|
||||
unsigned char sentinel = 0x1b;
|
||||
if (!PyArg_ParseTuple(args, "s#|iB", &src, &src_sz, &which_function, &sentinel)) return NULL;
|
||||
RAII_PyObject(ans, PyList_New(0));
|
||||
d.callback_data = ans;
|
||||
d.control_byte_callback = test_control_byte_callback;
|
||||
d.output_chars_callback = test_output_chars_callback;
|
||||
unsigned long consumed;
|
||||
bool found_sentinel = false;
|
||||
switch(which_function) {
|
||||
case -1:
|
||||
zero_at_ptr(&d); Py_RETURN_NONE;
|
||||
case 1:
|
||||
consumed = utf8_decode_to_sentinel_scalar(&d, src, src_sz, sentinel); break;
|
||||
found_sentinel = utf8_decode_to_sentinel_scalar(&d, src, src_sz, sentinel); break;
|
||||
case 2:
|
||||
consumed = utf8_decode_to_sentinel_128(&d, src, src_sz, sentinel); break;
|
||||
found_sentinel = utf8_decode_to_sentinel_128(&d, src, src_sz, sentinel); break;
|
||||
case 3:
|
||||
consumed = utf8_decode_to_sentinel_256(&d, src, src_sz, sentinel); break;
|
||||
found_sentinel = utf8_decode_to_sentinel_256(&d, src, src_sz, sentinel); break;
|
||||
default:
|
||||
consumed = utf8_decode_to_sentinel(&d, src, src_sz, sentinel); break;
|
||||
found_sentinel = utf8_decode_to_sentinel(&d, src, src_sz, sentinel); break;
|
||||
}
|
||||
return Py_BuildValue("kO", consumed, ans);
|
||||
return Py_BuildValue("ON", found_sentinel ? Py_True : Py_False, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, d.output, d.output_sz));
|
||||
}
|
||||
// }}}
|
||||
|
||||
|
@ -17,14 +17,12 @@ typedef void (*output_chars_callback)(void *data, const uint32_t *chars, unsigne
|
||||
|
||||
typedef struct UTF8Decoder {
|
||||
alignas(512/8) uint32_t output[512/8]; // we can process at most 512 bits of input (AVX512) so we get at most 64 chars of output
|
||||
struct { uint32_t cur, prev, codep; } state;
|
||||
unsigned output_sz, num_consumed;
|
||||
|
||||
void *callback_data;
|
||||
control_byte_callback control_byte_callback;
|
||||
output_chars_callback output_chars_callback;
|
||||
struct { uint32_t cur, prev, codep; } state;
|
||||
} UTF8Decoder;
|
||||
// Reset the decoder's incremental parse state (the cur/prev/codep triple in
// self->state) by passing it to zero_at_ptr. No other member of the decoder
// is touched here.
static inline void
utf8_decoder_reset(UTF8Decoder *self) {
    zero_at_ptr(&self->state);
}
|
||||
unsigned utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel);
|
||||
bool utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel);
|
||||
|
||||
// Pass a PyModule PyObject* as the argument. Must be called once at application startup
|
||||
bool init_simd(void* module);
|
||||
|
@ -74,15 +74,6 @@ _report_params(PyObject *dump_callback, id_type window_id, const char *name, int
|
||||
Py_XDECREF(PyObject_CallFunction(dump_callback, "Kss", window_id, name, buf)); PyErr_Clear();
|
||||
}
|
||||
|
||||
static void
|
||||
_report_draw(PyObject *dump_callback, id_type window_id, const uint32_t *chars, unsigned num) {
|
||||
RAII_PyObject(s, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, chars, num));
|
||||
if (s) {
|
||||
RAII_PyObject(t, PyObject_CallFunction(dump_callback, "KsO", window_id, "draw", s));
|
||||
if (t == NULL) PyErr_Clear();
|
||||
}
|
||||
}
|
||||
|
||||
#define DUMP_UNUSED
|
||||
|
||||
#define REPORT_ERROR(...) _report_error(self->dump_callback, self->window_id, __VA_ARGS__);
|
||||
@ -100,7 +91,24 @@ _report_draw(PyObject *dump_callback, id_type window_id, const uint32_t *chars,
|
||||
#define REPORT_COMMAND(...) GET_MACRO(__VA_ARGS__, REPORT_COMMAND3, REPORT_COMMAND2, REPORT_COMMAND1, SENTINEL)(__VA_ARGS__)
|
||||
#define REPORT_VA_COMMAND(...) Py_XDECREF(PyObject_CallFunction(self->dump_callback, __VA_ARGS__)); PyErr_Clear();
|
||||
|
||||
#define REPORT_DRAW(chars, num) _report_draw(self->dump_callback, self->window_id, chars, num);
|
||||
// Dump-mode reporting for a run of decoded characters.
// For each of the num code points in chars:
//   - BEL, BS, HT, LF/VT/FF and CR are reported through REPORT_COMMAND as the
//     screen_* handler named in the matching case below;
//   - any other code point below ' ' is not reported;
//   - everything else is reported to self->dump_callback as
//     (window_id, "draw", <single character>), clearing any Python error the
//     callback raises.
// Expects a variable named self with dump_callback and window_id members in
// scope at the call site.
// The macro expands to a single statement (do/while(0)), so it must be
// followed by a semicolon and is safe in an unbraced if/else. Both arguments
// are evaluated exactly once, and the internal names carry a report_draw_
// prefix so they cannot capture identifiers used in the arguments.
#define REPORT_DRAW(chars, num) do { \
    const uint32_t *report_draw_chars_ = (chars); \
    const unsigned report_draw_num_ = (num); \
    for (unsigned report_draw_i_ = 0; report_draw_i_ < report_draw_num_; report_draw_i_++) { \
        const uint32_t report_draw_ch_ = report_draw_chars_[report_draw_i_]; \
        switch (report_draw_ch_) { \
            case BEL: REPORT_COMMAND(screen_bell); break; \
            case BS: REPORT_COMMAND(screen_backspace); break; \
            case HT: REPORT_COMMAND(screen_tab); break; \
            case LF: case VT: case FF: REPORT_COMMAND(screen_linefeed); break; \
            case CR: REPORT_COMMAND(screen_carriage_return); break; \
            default: \
                if (report_draw_ch_ >= ' ') { \
                    RAII_PyObject(report_draw_ret_, PyObject_CallFunction(self->dump_callback, "KsC", self->window_id, "draw", (int)report_draw_ch_)); \
                    if (report_draw_ret_ == NULL) PyErr_Clear(); \
                } \
                break; \
        } \
    } \
} while (0)
|
||||
|
||||
|
||||
#define REPORT_PARAMS(name, params, num, is_group, region) _report_params(self->dump_callback, self->window_id, name, params, num_params, is_group, region)
|
||||
|
||||
@ -117,7 +125,7 @@ _report_draw(PyObject *dump_callback, id_type window_id, const uint32_t *chars,
|
||||
#define REPORT_ERROR(...) log_error(ERROR_PREFIX " " __VA_ARGS__);
|
||||
#define REPORT_COMMAND(...)
|
||||
#define REPORT_VA_COMMAND(...)
|
||||
#define REPORT_DRAW(chars, num)
|
||||
#define REPORT_DRAW(...)
|
||||
#define REPORT_PARAMS(...)
|
||||
#define REPORT_OSC(name, string)
|
||||
#define REPORT_OSC2(name, code, string)
|
||||
@ -219,46 +227,21 @@ reset_csi(ParsedCSI *csi) {
|
||||
// Normal mode {{{
|
||||
|
||||
static void
|
||||
dispatch_single_byte_control(void *s, uint8_t ch) {
|
||||
#define CALL_SCREEN_HANDLER(name) REPORT_COMMAND(name); name(self->screen); break;
|
||||
PS *self = s;
|
||||
switch(ch) {
|
||||
case BEL:
|
||||
CALL_SCREEN_HANDLER(screen_bell);
|
||||
case BS:
|
||||
CALL_SCREEN_HANDLER(screen_backspace);
|
||||
case HT:
|
||||
CALL_SCREEN_HANDLER(screen_tab);
|
||||
case LF:
|
||||
case VT:
|
||||
case FF:
|
||||
CALL_SCREEN_HANDLER(screen_linefeed);
|
||||
case CR:
|
||||
CALL_SCREEN_HANDLER(screen_carriage_return);
|
||||
case SI:
|
||||
REPORT_ERROR("Ignoring request to change charset as we only support UTF-8"); break;
|
||||
case SO:
|
||||
REPORT_ERROR("Ignoring request to change charset as we only support UTF-8"); break;
|
||||
case ESC:
|
||||
SET_STATE(ESC); break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#undef CALL_SCREEN_HANDLER
|
||||
}
|
||||
|
||||
static void
|
||||
dispatch_output_chars(void *s, const uint32_t *chars, unsigned sz) {
|
||||
PS *self = s;
|
||||
REPORT_DRAW(chars, sz);
|
||||
screen_draw_text(self->screen, chars, sz);
|
||||
dispatch_single_byte_control(PS *self, uint32_t ch) {
|
||||
screen_draw_text(self->screen, &ch, 1);
|
||||
}
|
||||
|
||||
static void
|
||||
consume_normal(PS *self) {
|
||||
do {
|
||||
self->read.pos += utf8_decode_to_sentinel(&self->utf8_decoder, self->buf + self->read.pos, self->read.sz - self->read.pos, ESC);
|
||||
} while (self->read.pos < self->read.sz && self->vte_state == VTE_NORMAL);
|
||||
const bool sentinel_found = utf8_decode_to_sentinel(&self->utf8_decoder, self->buf + self->read.pos, self->read.sz - self->read.pos, ESC);
|
||||
self->read.pos += self->utf8_decoder.num_consumed;
|
||||
if (self->utf8_decoder.output_sz) {
|
||||
REPORT_DRAW(self->utf8_decoder.output, self->utf8_decoder.output_sz);
|
||||
screen_draw_text(self->screen, self->utf8_decoder.output, self->utf8_decoder.output_sz);
|
||||
}
|
||||
if (sentinel_found) { SET_STATE(ESC); break; }
|
||||
} while (self->read.pos < self->read.sz);
|
||||
}
|
||||
// }}}
|
||||
|
||||
@ -1555,10 +1538,6 @@ run_worker(void *p, ParseData *pd, bool flush) {
|
||||
pd->input_read = true;
|
||||
self->dump_callback = pd->dump_callback; self->now = pd->now;
|
||||
self->screen = p;
|
||||
// these are here as they need to be specialized to dump/non dump versions
|
||||
self->utf8_decoder.control_byte_callback = dispatch_single_byte_control;
|
||||
self->utf8_decoder.output_chars_callback = dispatch_output_chars;
|
||||
self->utf8_decoder.callback_data = self;
|
||||
do {
|
||||
end_with_lock; {
|
||||
do_parse_vt(self);
|
||||
|
@ -175,7 +175,7 @@ class TestParser(BaseTest):
|
||||
pb(c1_controls, c1_controls)
|
||||
self.assertFalse(str(s.line(1)) + str(s.line(2)) + str(s.line(3)))
|
||||
pb('😀'.encode()[:-1])
|
||||
pb('\x1b\x1b%a', ('Unknown char after ESC: 0x1b',), ('draw', '%a'))
|
||||
pb('\x1b\x1b%a', '\ufffd', ('Unknown char after ESC: 0x1b',), ('draw', '%a'))
|
||||
|
||||
def test_utf8_parsing(self):
|
||||
s = self.create_screen()
|
||||
@ -515,7 +515,7 @@ class TestParser(BaseTest):
|
||||
s.set_pending_activated_at(0.00001)
|
||||
pb(']8;;\x07', ('set_active_hyperlink', None, None))
|
||||
pb('😀'.encode()[:-1])
|
||||
pb('\033[?2026h', ('screen_start_pending_mode',),)
|
||||
pb('\033[?2026h', '\ufffd', ('screen_start_pending_mode',),)
|
||||
pb('😀'.encode()[-1:])
|
||||
pb('\033[?2026l', '\ufffd', ('screen_stop_pending_mode',),)
|
||||
pb('a', ('draw', 'a'))
|
||||
|
Loading…
Reference in New Issue
Block a user