Simplify the UTF-8 parser function

Also show a replacement character (U+FFFD) for incomplete UTF-8 sequences interrupted by an ESC character
This commit is contained in:
Kovid Goyal 2023-11-20 16:06:24 +05:30
parent 72e73f2f81
commit 0ed1c6f840
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 74 additions and 101 deletions

View File

@ -674,10 +674,28 @@ ensure_cursor_not_on_wide_char_trailer_for_insert(Screen *self, text_loop_state
static void
draw_text_loop(Screen *self, const uint32_t *chars, size_t num_chars, text_loop_state *s) {
init_text_loop_line(self, s);
if (chars[0] < 0x7f || !is_combining_char(chars[0])) ensure_cursor_not_on_wide_char_trailer_for_insert(self, s);
if ((' ' >= chars[0] && chars[0] < 0x7f) || !is_combining_char(chars[0])) ensure_cursor_not_on_wide_char_trailer_for_insert(self, s);
for (size_t i = 0; i < num_chars; i++) {
uint32_t ch = chars[i];
if (ch < ' ') continue;
if (ch < ' ') {
switch (ch) {
case BEL:
screen_bell(self); break;
case BS:
screen_backspace(self); break;
case HT:
screen_tab(self); break;
case LF:
case VT:
case FF:
screen_linefeed(self); init_text_loop_line(self, s); break;
case CR:
screen_carriage_return(self); break;
default:
break;
}
continue;
}
int char_width = 1;
if (ch > 0x7f) { // not printable ASCII
if (is_ignored_char(ch)) continue;

View File

@ -34,27 +34,26 @@ find_either_of_two_bytes(const uint8_t *haystack, const size_t sz, const uint8_t
// UTF-8 {{{
static unsigned
static bool
utf8_decode_to_sentinel_scalar(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) {
unsigned num_consumed = 0, num_output = 0;
while (num_consumed < src_sz && num_output < arraysz(d->output)) {
const uint8_t ch = src[num_consumed++];
if (ch < ' ') {
d->output_sz = 0; d->num_consumed = 0;
while (d->num_consumed < src_sz && d->output_sz < arraysz(d->output)) {
const uint8_t ch = src[d->num_consumed++];
if (ch == sentinel) {
if (d->state.cur != UTF8_ACCEPT) d->output[d->output_sz++] = 0xfffd;
zero_at_ptr(&d->state);
if (num_output) { d->output_chars_callback(d->callback_data, d->output, num_output); num_output = 0; }
d->control_byte_callback(d->callback_data, ch);
if (ch == sentinel) break;
return true;
} else {
switch(decode_utf8(&d->state.cur, &d->state.codep, ch)) {
case UTF8_ACCEPT:
d->output[num_output++] = d->state.codep;
d->output[d->output_sz++] = d->state.codep;
break;
case UTF8_REJECT: {
const bool prev_was_accept = d->state.prev == UTF8_ACCEPT;
zero_at_ptr(&d->state);
d->output[num_output++] = 0xfffd;
if (!prev_was_accept) {
num_consumed--;
d->output[d->output_sz++] = 0xfffd;
if (!prev_was_accept && d->num_consumed) {
d->num_consumed--;
continue; // so that prev is correct
}
} break;
@ -62,13 +61,12 @@ utf8_decode_to_sentinel_scalar(UTF8Decoder *d, const uint8_t *src, const size_t
}
d->state.prev = d->state.cur;
}
if (num_output) d->output_chars_callback(d->callback_data, d->output, num_output);
return num_consumed;
return false;
}
static unsigned (*utf8_decode_to_sentinel_impl)(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) = utf8_decode_to_sentinel_scalar;
static bool (*utf8_decode_to_sentinel_impl)(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) = utf8_decode_to_sentinel_scalar;
unsigned
bool
utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel) {
return utf8_decode_to_sentinel_impl(d, src, src_sz, sentinel);
}
@ -76,22 +74,6 @@ utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz,
// }}}
// Boilerplate {{{
// Test-only control-byte callback: records the byte `ch` as a Python int
// appended to the Python list passed through the opaque pointer `l`.
static void
test_control_byte_callback(void *l, uint8_t ch) {
// Do nothing if a Python exception is already pending.
if (!PyErr_Occurred()) {
// NOTE(review): RAII_PyObject is a project macro; presumably it releases
// the reference automatically at scope exit -- confirm against its definition.
RAII_PyObject(c, PyLong_FromUnsignedLong((unsigned long)ch));
// The return value of PyList_Append is not checked here.
if (c) PyList_Append((PyObject*)l, c);
}
}
// Test-only output callback: converts `sz` 32-bit code points starting at
// `chars` into a Python str and appends it to the Python list passed through
// the opaque pointer `l`.
static void
test_output_chars_callback(void *l, const uint32_t *chars, unsigned sz) {
// Do nothing if a Python exception is already pending.
if (!PyErr_Occurred()) {
// NOTE(review): RAII_PyObject is a project macro; presumably it releases
// the reference automatically at scope exit -- confirm against its definition.
RAII_PyObject(c, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, chars, (Py_ssize_t)sz));
// The return value of PyList_Append is not checked here.
if (c) PyList_Append((PyObject*)l, c);
}
}
static PyObject*
test_utf8_decode_to_sentinel(PyObject *self UNUSED, PyObject *args) {
const uint8_t *src; Py_ssize_t src_sz;
@ -99,24 +81,20 @@ test_utf8_decode_to_sentinel(PyObject *self UNUSED, PyObject *args) {
static UTF8Decoder d = {0};
unsigned char sentinel = 0x1b;
if (!PyArg_ParseTuple(args, "s#|iB", &src, &src_sz, &which_function, &sentinel)) return NULL;
RAII_PyObject(ans, PyList_New(0));
d.callback_data = ans;
d.control_byte_callback = test_control_byte_callback;
d.output_chars_callback = test_output_chars_callback;
unsigned long consumed;
bool found_sentinel = false;
switch(which_function) {
case -1:
zero_at_ptr(&d); Py_RETURN_NONE;
case 1:
consumed = utf8_decode_to_sentinel_scalar(&d, src, src_sz, sentinel); break;
found_sentinel = utf8_decode_to_sentinel_scalar(&d, src, src_sz, sentinel); break;
case 2:
consumed = utf8_decode_to_sentinel_128(&d, src, src_sz, sentinel); break;
found_sentinel = utf8_decode_to_sentinel_128(&d, src, src_sz, sentinel); break;
case 3:
consumed = utf8_decode_to_sentinel_256(&d, src, src_sz, sentinel); break;
found_sentinel = utf8_decode_to_sentinel_256(&d, src, src_sz, sentinel); break;
default:
consumed = utf8_decode_to_sentinel(&d, src, src_sz, sentinel); break;
found_sentinel = utf8_decode_to_sentinel(&d, src, src_sz, sentinel); break;
}
return Py_BuildValue("kO", consumed, ans);
return Py_BuildValue("ON", found_sentinel ? Py_True : Py_False, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, d.output, d.output_sz));
}
// }}}

View File

@ -17,14 +17,12 @@ typedef void (*output_chars_callback)(void *data, const uint32_t *chars, unsigne
typedef struct UTF8Decoder {
alignas(512/8) uint32_t output[512/8]; // we can process at most 512 bits of input (AVX512) so we get at most 64 chars of output
struct { uint32_t cur, prev, codep; } state;
unsigned output_sz, num_consumed;
void *callback_data;
control_byte_callback control_byte_callback;
output_chars_callback output_chars_callback;
struct { uint32_t cur, prev, codep; } state;
} UTF8Decoder;
static inline void utf8_decoder_reset(UTF8Decoder *self) { zero_at_ptr(&self->state); }
unsigned utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel);
bool utf8_decode_to_sentinel(UTF8Decoder *d, const uint8_t *src, const size_t src_sz, const uint8_t sentinel);
// Pass a PyModule PyObject* as the argument. Must be called once at application startup
bool init_simd(void* module);

View File

@ -74,15 +74,6 @@ _report_params(PyObject *dump_callback, id_type window_id, const char *name, int
Py_XDECREF(PyObject_CallFunction(dump_callback, "Kss", window_id, name, buf)); PyErr_Clear();
}
// Report a run of drawn text to the dump callback.
// Builds a Python str from `num` 32-bit code points at `chars` and calls
// dump_callback(window_id, "draw", <str>).
static void
_report_draw(PyObject *dump_callback, id_type window_id, const uint32_t *chars, unsigned num) {
RAII_PyObject(s, PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, chars, num));
// If the string could not be created the callback is not invoked and any
// pending Python error is left untouched by this function.
if (s) {
RAII_PyObject(t, PyObject_CallFunction(dump_callback, "KsO", window_id, "draw", s));
// A failing callback is swallowed: the Python error is cleared.
if (t == NULL) PyErr_Clear();
}
}
#define DUMP_UNUSED
#define REPORT_ERROR(...) _report_error(self->dump_callback, self->window_id, __VA_ARGS__);
@ -100,7 +91,24 @@ _report_draw(PyObject *dump_callback, id_type window_id, const uint32_t *chars,
#define REPORT_COMMAND(...) GET_MACRO(__VA_ARGS__, REPORT_COMMAND3, REPORT_COMMAND2, REPORT_COMMAND1, SENTINEL)(__VA_ARGS__)
#define REPORT_VA_COMMAND(...) Py_XDECREF(PyObject_CallFunction(self->dump_callback, __VA_ARGS__)); PyErr_Clear();
#define REPORT_DRAW(chars, num) _report_draw(self->dump_callback, self->window_id, chars, num);
// REPORT_DRAW(chars, num): dump-mode reporting of decoded output, one code
// point at a time. For each of the `num` entries of `chars`:
//   - BEL, BS, HT, LF/VT/FF and CR are reported through REPORT_COMMAND as
//     screen_bell, screen_backspace, screen_tab, screen_linefeed and
//     screen_carriage_return respectively;
//   - any other value >= ' ' is passed to self->dump_callback as a "draw"
//     event carrying that single character (a failing call has its Python
//     error cleared);
//   - any other value below ' ' is not reported.
// Requires a variable named `self` in the expanding scope. Both macro
// arguments are re-evaluated on every loop iteration, so pass expressions
// without side effects. The expansion declares locals named `i` and `ch`.
#define REPORT_DRAW(chars, num) { \
for (unsigned i = 0; i < num; i++) { \
uint32_t ch = chars[i]; \
switch(ch) { \
case BEL: REPORT_COMMAND(screen_bell); break; \
case BS: REPORT_COMMAND(screen_backspace); break; \
case HT: REPORT_COMMAND(screen_tab); break; \
case LF: case VT: case FF: REPORT_COMMAND(screen_linefeed); break; \
case CR: REPORT_COMMAND(screen_carriage_return); break; \
default: \
if (ch >= ' ') { \
RAII_PyObject(t, PyObject_CallFunction(self->dump_callback, "KsC", self->window_id, "draw", ch)); \
if (t == NULL) PyErr_Clear(); \
} \
} \
} \
}
#define REPORT_PARAMS(name, params, num, is_group, region) _report_params(self->dump_callback, self->window_id, name, params, num_params, is_group, region)
@ -117,7 +125,7 @@ _report_draw(PyObject *dump_callback, id_type window_id, const uint32_t *chars,
#define REPORT_ERROR(...) log_error(ERROR_PREFIX " " __VA_ARGS__);
#define REPORT_COMMAND(...)
#define REPORT_VA_COMMAND(...)
#define REPORT_DRAW(chars, num)
#define REPORT_DRAW(...)
#define REPORT_PARAMS(...)
#define REPORT_OSC(name, string)
#define REPORT_OSC2(name, code, string)
@ -219,46 +227,21 @@ reset_csi(ParsedCSI *csi) {
// Normal mode {{{
static void
dispatch_single_byte_control(void *s, uint8_t ch) {
#define CALL_SCREEN_HANDLER(name) REPORT_COMMAND(name); name(self->screen); break;
PS *self = s;
switch(ch) {
case BEL:
CALL_SCREEN_HANDLER(screen_bell);
case BS:
CALL_SCREEN_HANDLER(screen_backspace);
case HT:
CALL_SCREEN_HANDLER(screen_tab);
case LF:
case VT:
case FF:
CALL_SCREEN_HANDLER(screen_linefeed);
case CR:
CALL_SCREEN_HANDLER(screen_carriage_return);
case SI:
REPORT_ERROR("Ignoring request to change charset as we only support UTF-8"); break;
case SO:
REPORT_ERROR("Ignoring request to change charset as we only support UTF-8"); break;
case ESC:
SET_STATE(ESC); break;
default:
break;
}
#undef CALL_SCREEN_HANDLER
}
static void
dispatch_output_chars(void *s, const uint32_t *chars, unsigned sz) {
PS *self = s;
REPORT_DRAW(chars, sz);
screen_draw_text(self->screen, chars, sz);
dispatch_single_byte_control(PS *self, uint32_t ch) {
screen_draw_text(self->screen, &ch, 1);
}
static void
consume_normal(PS *self) {
do {
self->read.pos += utf8_decode_to_sentinel(&self->utf8_decoder, self->buf + self->read.pos, self->read.sz - self->read.pos, ESC);
} while (self->read.pos < self->read.sz && self->vte_state == VTE_NORMAL);
const bool sentinel_found = utf8_decode_to_sentinel(&self->utf8_decoder, self->buf + self->read.pos, self->read.sz - self->read.pos, ESC);
self->read.pos += self->utf8_decoder.num_consumed;
if (self->utf8_decoder.output_sz) {
REPORT_DRAW(self->utf8_decoder.output, self->utf8_decoder.output_sz);
screen_draw_text(self->screen, self->utf8_decoder.output, self->utf8_decoder.output_sz);
}
if (sentinel_found) { SET_STATE(ESC); break; }
} while (self->read.pos < self->read.sz);
}
// }}}
@ -1555,10 +1538,6 @@ run_worker(void *p, ParseData *pd, bool flush) {
pd->input_read = true;
self->dump_callback = pd->dump_callback; self->now = pd->now;
self->screen = p;
// these are here as they need to be specialized to dump/non dump versions
self->utf8_decoder.control_byte_callback = dispatch_single_byte_control;
self->utf8_decoder.output_chars_callback = dispatch_output_chars;
self->utf8_decoder.callback_data = self;
do {
end_with_lock; {
do_parse_vt(self);

View File

@ -175,7 +175,7 @@ class TestParser(BaseTest):
pb(c1_controls, c1_controls)
self.assertFalse(str(s.line(1)) + str(s.line(2)) + str(s.line(3)))
pb('😀'.encode()[:-1])
pb('\x1b\x1b%a', ('Unknown char after ESC: 0x1b',), ('draw', '%a'))
pb('\x1b\x1b%a', '\ufffd', ('Unknown char after ESC: 0x1b',), ('draw', '%a'))
def test_utf8_parsing(self):
s = self.create_screen()
@ -515,7 +515,7 @@ class TestParser(BaseTest):
s.set_pending_activated_at(0.00001)
pb(']8;;\x07', ('set_active_hyperlink', None, None))
pb('😀'.encode()[:-1])
pb('\033[?2026h', ('screen_start_pending_mode',),)
pb('\033[?2026h', '\ufffd', ('screen_start_pending_mode',),)
pb('😀'.encode()[-1:])
pb('\033[?2026l', '\ufffd', ('screen_stop_pending_mode',),)
pb('a', ('draw', 'a'))