diff --git a/kitty/vt-parser.c b/kitty/vt-parser.c index ebf89a39e..b9fdb03bc 100644 --- a/kitty/vt-parser.c +++ b/kitty/vt-parser.c @@ -20,7 +20,7 @@ // Macros {{{ -#define SET_STATE(state) self->vte_state = state; self->parser_buf_pos = 0; self->utf8_state = UTF8_ACCEPT; +#define SET_STATE(x) self->vte_state = VTE_##x; self->parser_buf_pos = 0; self->utf8.state = UTF8_ACCEPT; self->utf8.prev = UTF8_ACCEPT; #define IS_DIGIT \ case '0': \ @@ -145,7 +145,7 @@ typedef struct PS { id_type window_id; unsigned parser_buf_pos; - UTF8State utf8_state; + struct { UTF8State prev, state; } utf8; VTEState vte_state; // this is used only during dispatch of a single byte, its present here just to avoid adding an extra parameter to accumulate_osc() @@ -173,15 +173,21 @@ typedef struct PS { static void draw_byte(PS *self, uint8_t b) { uint32_t ch; - switch (decode_utf8(&self->utf8_state, &ch, b)) { + switch (decode_utf8(&self->utf8.state, &ch, b)) { case UTF8_ACCEPT: REPORT_DRAW(ch); screen_draw(self->screen, ch, true); break; - case UTF8_REJECT: - self->utf8_state = UTF8_ACCEPT; - break; + case UTF8_REJECT: { + bool prev_was_accept = self->utf8.prev == UTF8_ACCEPT; + self->utf8.state = UTF8_ACCEPT; self->utf8.prev = UTF8_ACCEPT; + ch = 0xfffd; // unicode replacement char + REPORT_DRAW(ch); + screen_draw(self->screen, ch, true); + if (!prev_was_accept) draw_byte(self, b); + } break; } + self->utf8.prev = self->utf8.state; } static void @@ -206,7 +212,7 @@ dispatch_normal_mode_byte(PS *self) { case SO: REPORT_ERROR("Ignoring request to change charset as we only support UTF-8"); break; case ESC: - SET_STATE(VTE_ESC); break; + SET_STATE(ESC); break; case NUL: case DEL: break; // no-op @@ -237,23 +243,23 @@ screen_nel(Screen *screen) { screen_carriage_return(screen); screen_linefeed(scr static void dispatch_esc_mode_byte(PS *self) { -#define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(VTE_NORMAL); -#define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(VTE_NORMAL); -#define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(VTE_NORMAL); +#define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(NORMAL); +#define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(NORMAL); +#define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(NORMAL); uint8_t ch = self->input_data[self->input_pos++]; switch(self->parser_buf_pos) { case 0: switch (ch) { case ESC_DCS: - SET_STATE(VTE_DCS); break; + SET_STATE(DCS); break; case ESC_OSC: - SET_STATE(VTE_OSC); break; + SET_STATE(OSC); break; case ESC_CSI: - SET_STATE(VTE_CSI); break; + SET_STATE(CSI); break; case ESC_APC: - SET_STATE(VTE_APC); break; + SET_STATE(APC); break; case ESC_PM: - SET_STATE(VTE_PM); break; + SET_STATE(PM); break; case ESC_RIS: CALL_ED(screen_reset); break; case ESC_IND: @@ -277,7 +283,7 @@ dispatch_esc_mode_byte(PS *self) { break; default: REPORT_ERROR("%s0x%x", "Unknown char after ESC: ", ch); - SET_STATE(VTE_NORMAL); break; + SET_STATE(NORMAL); break; } break; default: @@ -325,7 +331,7 @@ dispatch_esc_mode_byte(PS *self) { default: REPORT_ERROR("Unhandled charset related escape code: 0x%x 0x%x", self->parser_buf[0], ch); break; } - SET_STATE(VTE_NORMAL); + SET_STATE(NORMAL); break; } #undef CALL_ED @@ -552,7 +558,7 @@ END_ALLOW_CASE_RANGE if (self->parser_buf_pos > 0 && self->parser_buf[self->parser_buf_pos-1] == ESC) { if (ch == '\\') { self->parser_buf_pos--; return true; } REPORT_ERROR("DCS sequence contained ESC without trailing \\ at pos: %u ignoring the sequence", self->parser_buf_pos); - SET_STATE(VTE_ESC); return false; + SET_STATE(ESC); return false; } if (self->parser_buf_pos >= PARSER_BUF_SZ - 1) { REPORT_ERROR("DCS sequence too long, truncating."); @@ -682,7 +688,7 @@ accumulate_csi(PS *self) { #define ENSURE_SPACE \ if (self->parser_buf_pos > PARSER_BUF_SZ - 1) { \ REPORT_ERROR("CSI sequence too long, ignoring"); \ - SET_STATE(VTE_NORMAL); \ + SET_STATE(NORMAL); \ return false; \ } @@ -701,7 +707,7 @@ accumulate_csi(PS *self) { case '=': if (self->parser_buf_pos != 0) { REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch); - SET_STATE(VTE_NORMAL); + SET_STATE(NORMAL); return false; } ENSURE_SPACE; @@ -733,11 +739,11 @@ END_ALLOW_CASE_RANGE break; case NUL: case DEL: - SET_STATE(VTE_NORMAL); + SET_STATE(NORMAL); break; // no-op default: REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch); - SET_STATE(VTE_NORMAL); + SET_STATE(NORMAL); return false; } @@ -1345,25 +1351,25 @@ accumulate_oth(PS *self) { dispatch##_esc_mode_byte(self); \ break; \ case VTE_CSI: \ - if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \ + if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(NORMAL); watch_for_pending; } \ break; \ case VTE_OSC: \ if (accumulate_osc(self)) { \ dispatch##_osc(self); \ if (self->extended_osc_code) { \ self->input_pos--; \ - if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(VTE_NORMAL); } \ - } else { SET_STATE(VTE_NORMAL); } \ + if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(NORMAL); } \ + } else { SET_STATE(NORMAL); } \ } \ break; \ case VTE_APC: \ - if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(VTE_NORMAL); } \ + if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(NORMAL); } \ break; \ case VTE_PM: \ - if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(VTE_NORMAL); } \ + if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(NORMAL); } \ break; \ case VTE_DCS: \ - if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \ + if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(NORMAL); watch_for_pending; } \ if (self->vte_state == ESC) { self->input_pos--; dispatch##_esc_mode_byte(self); } \ break; \ case VTE_NORMAL: \ @@ -1389,7 +1395,7 @@ pending_normal_mode_byte(PS *self) { uint8_t ch = self->input_data[self->input_pos++]; switch(ch) { case ESC: - SET_STATE(VTE_ESC); break; + SET_STATE(ESC); break; default: ensure_pending_space(self, 1); self->pending_mode.buf[self->pending_mode.used++] = ch; @@ -1405,20 +1411,20 @@ pending_esc_mode_byte(PS *self) { self->pending_mode.buf[self->pending_mode.used++] = ESC; self->pending_mode.buf[self->pending_mode.used++] = self->parser_buf[self->parser_buf_pos - 1]; self->pending_mode.buf[self->pending_mode.used++] = ch; - SET_STATE(VTE_NORMAL); + SET_STATE(NORMAL); return; } switch (ch) { case ESC_DCS: - SET_STATE(VTE_DCS); break; + SET_STATE(DCS); break; case ESC_OSC: - SET_STATE(VTE_OSC); break; + SET_STATE(OSC); break; case ESC_CSI: - SET_STATE(VTE_CSI); break; + SET_STATE(CSI); break; case ESC_APC: - SET_STATE(VTE_APC); break; + SET_STATE(APC); break; case ESC_PM: - SET_STATE(VTE_PM); break; + SET_STATE(PM); break; IS_ESCAPED_CHAR: self->parser_buf[self->parser_buf_pos++] = ch; break; @@ -1426,7 +1432,7 @@ pending_esc_mode_byte(PS *self) { ensure_pending_space(self, 2); self->pending_mode.buf[self->pending_mode.used++] = ESC; self->pending_mode.buf[self->pending_mode.used++] = ch; - SET_STATE(VTE_NORMAL); break; + SET_STATE(NORMAL); break; } } @@ -1651,7 +1657,8 @@ free_vt_parser(Parser* self) { static void reset(PS *self) { self->vte_state = VTE_NORMAL; - self->utf8_state = UTF8_ACCEPT; + self->utf8.state = UTF8_ACCEPT; + self->utf8.prev = UTF8_ACCEPT; self->parser_buf_pos = 0; self->pending_mode.activated_at = 0; diff --git a/kitty_tests/parser.py b/kitty_tests/parser.py index 1336b9a6e..2fa1309c9 100644 --- a/kitty_tests/parser.py +++ b/kitty_tests/parser.py @@ -84,6 +84,24 @@ class TestParser(BaseTest): c1_controls = '\x84\x85\x88\x8d\x8e\x8f\x90\x96\x97\x98\x9a\x9b\x9c\x9d\x9e\x9f' pb(c1_controls, c1_controls) self.assertFalse(str(s.line(1)) + str(s.line(2)) + str(s.line(3))) + pb('😀'.encode()[:-1]) + pb('\x1b\x1b%a', ('Unknown char after ESC: 0x1b',), ('draw', '%a')) + + def test_utf8_parsing(self): + s = self.create_screen() + pb = partial(self.parse_bytes_dump, s) + pb(b'"\xbf"', '"\ufffd"') + pb(b'"\x80"', '"\ufffd"') + pb(b'"\x80\xbf"', '"\ufffd\ufffd"') + pb(b'"\x80\xbf\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xc0 "', '"\ufffd "') + pb(b'"\xfe"', '"\ufffd"') + pb(b'"\xff"', '"\ufffd"') + pb(b'"\xff\xfe"', '"\ufffd\ufffd"') + pb(b'"\xfe\xfe\xff\xff"', '"\ufffd\ufffd\ufffd\ufffd"') + pb(b'"\xef\xbf"', '"\ufffd"') + pb(b'"\xe0\xa0"', '"\ufffd"') + pb(b'"\xf0\x9f\x98"', '"\ufffd"') def test_esc_codes(self): s = self.create_screen() @@ -395,6 +413,11 @@ class TestParser(BaseTest): pb('\033[?2026h\033', ('screen_set_mode', 2026, 1),) s.set_pending_activated_at(0.00001) pb(']8;;\x07', ('set_active_hyperlink', None, None)) + pb('😀'.encode()[:-1]) + pb('\033[?2026h', ('screen_set_mode', 2026, 1),) + pb('😀'.encode()[-1:]) + pb('\033[?2026l', '\ufffd', ('screen_reset_mode', 2026, 1),) + pb('a', ('draw', 'a')) def test_oth_codes(self): s = self.create_screen()