Represent malformed UTF-8 with the replacement character

This commit is contained in:
Kovid Goyal 2023-10-30 13:02:24 +05:30
parent 8a83014f51
commit 969bd05fc5
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 67 additions and 37 deletions

View File

@ -20,7 +20,7 @@
// Macros {{{ // Macros {{{
#define SET_STATE(state) self->vte_state = state; self->parser_buf_pos = 0; self->utf8_state = UTF8_ACCEPT; #define SET_STATE(x) self->vte_state = VTE_##x; self->parser_buf_pos = 0; self->utf8.state = UTF8_ACCEPT; self->utf8.prev = UTF8_ACCEPT;
#define IS_DIGIT \ #define IS_DIGIT \
case '0': \ case '0': \
@ -145,7 +145,7 @@ typedef struct PS {
id_type window_id; id_type window_id;
unsigned parser_buf_pos; unsigned parser_buf_pos;
UTF8State utf8_state; struct { UTF8State prev, state; } utf8;
VTEState vte_state; VTEState vte_state;
// this is used only during dispatch of a single byte, its present here just to avoid adding an extra parameter to accumulate_osc() // this is used only during dispatch of a single byte, its present here just to avoid adding an extra parameter to accumulate_osc()
@ -173,15 +173,21 @@ typedef struct PS {
static void static void
draw_byte(PS *self, uint8_t b) { draw_byte(PS *self, uint8_t b) {
uint32_t ch; uint32_t ch;
switch (decode_utf8(&self->utf8_state, &ch, b)) { switch (decode_utf8(&self->utf8.state, &ch, b)) {
case UTF8_ACCEPT: case UTF8_ACCEPT:
REPORT_DRAW(ch); REPORT_DRAW(ch);
screen_draw(self->screen, ch, true); screen_draw(self->screen, ch, true);
break; break;
case UTF8_REJECT: case UTF8_REJECT: {
self->utf8_state = UTF8_ACCEPT; bool prev_was_accept = self->utf8.prev == UTF8_ACCEPT;
break; self->utf8.state = UTF8_ACCEPT; self->utf8.prev = UTF8_ACCEPT;
ch = 0xfffd; // unicode replacement char
REPORT_DRAW(ch);
screen_draw(self->screen, ch, true);
if (!prev_was_accept) draw_byte(self, b);
} break;
} }
self->utf8.prev = self->utf8.state;
} }
static void static void
@ -206,7 +212,7 @@ dispatch_normal_mode_byte(PS *self) {
case SO: case SO:
REPORT_ERROR("Ignoring request to change charset as we only support UTF-8"); break; REPORT_ERROR("Ignoring request to change charset as we only support UTF-8"); break;
case ESC: case ESC:
SET_STATE(VTE_ESC); break; SET_STATE(ESC); break;
case NUL: case NUL:
case DEL: case DEL:
break; // no-op break; // no-op
@ -237,23 +243,23 @@ screen_nel(Screen *screen) { screen_carriage_return(screen); screen_linefeed(scr
static void static void
dispatch_esc_mode_byte(PS *self) { dispatch_esc_mode_byte(PS *self) {
#define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(VTE_NORMAL); #define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(NORMAL);
#define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(VTE_NORMAL); #define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(NORMAL);
#define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(VTE_NORMAL); #define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(NORMAL);
uint8_t ch = self->input_data[self->input_pos++]; uint8_t ch = self->input_data[self->input_pos++];
switch(self->parser_buf_pos) { switch(self->parser_buf_pos) {
case 0: case 0:
switch (ch) { switch (ch) {
case ESC_DCS: case ESC_DCS:
SET_STATE(VTE_DCS); break; SET_STATE(DCS); break;
case ESC_OSC: case ESC_OSC:
SET_STATE(VTE_OSC); break; SET_STATE(OSC); break;
case ESC_CSI: case ESC_CSI:
SET_STATE(VTE_CSI); break; SET_STATE(CSI); break;
case ESC_APC: case ESC_APC:
SET_STATE(VTE_APC); break; SET_STATE(APC); break;
case ESC_PM: case ESC_PM:
SET_STATE(VTE_PM); break; SET_STATE(PM); break;
case ESC_RIS: case ESC_RIS:
CALL_ED(screen_reset); break; CALL_ED(screen_reset); break;
case ESC_IND: case ESC_IND:
@ -277,7 +283,7 @@ dispatch_esc_mode_byte(PS *self) {
break; break;
default: default:
REPORT_ERROR("%s0x%x", "Unknown char after ESC: ", ch); REPORT_ERROR("%s0x%x", "Unknown char after ESC: ", ch);
SET_STATE(VTE_NORMAL); break; SET_STATE(NORMAL); break;
} }
break; break;
default: default:
@ -325,7 +331,7 @@ dispatch_esc_mode_byte(PS *self) {
default: default:
REPORT_ERROR("Unhandled charset related escape code: 0x%x 0x%x", self->parser_buf[0], ch); break; REPORT_ERROR("Unhandled charset related escape code: 0x%x 0x%x", self->parser_buf[0], ch); break;
} }
SET_STATE(VTE_NORMAL); SET_STATE(NORMAL);
break; break;
} }
#undef CALL_ED #undef CALL_ED
@ -552,7 +558,7 @@ END_ALLOW_CASE_RANGE
if (self->parser_buf_pos > 0 && self->parser_buf[self->parser_buf_pos-1] == ESC) { if (self->parser_buf_pos > 0 && self->parser_buf[self->parser_buf_pos-1] == ESC) {
if (ch == '\\') { self->parser_buf_pos--; return true; } if (ch == '\\') { self->parser_buf_pos--; return true; }
REPORT_ERROR("DCS sequence contained ESC without trailing \\ at pos: %u ignoring the sequence", self->parser_buf_pos); REPORT_ERROR("DCS sequence contained ESC without trailing \\ at pos: %u ignoring the sequence", self->parser_buf_pos);
SET_STATE(VTE_ESC); return false; SET_STATE(ESC); return false;
} }
if (self->parser_buf_pos >= PARSER_BUF_SZ - 1) { if (self->parser_buf_pos >= PARSER_BUF_SZ - 1) {
REPORT_ERROR("DCS sequence too long, truncating."); REPORT_ERROR("DCS sequence too long, truncating.");
@ -682,7 +688,7 @@ accumulate_csi(PS *self) {
#define ENSURE_SPACE \ #define ENSURE_SPACE \
if (self->parser_buf_pos > PARSER_BUF_SZ - 1) { \ if (self->parser_buf_pos > PARSER_BUF_SZ - 1) { \
REPORT_ERROR("CSI sequence too long, ignoring"); \ REPORT_ERROR("CSI sequence too long, ignoring"); \
SET_STATE(VTE_NORMAL); \ SET_STATE(NORMAL); \
return false; \ return false; \
} }
@ -701,7 +707,7 @@ accumulate_csi(PS *self) {
case '=': case '=':
if (self->parser_buf_pos != 0) { if (self->parser_buf_pos != 0) {
REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch); REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch);
SET_STATE(VTE_NORMAL); SET_STATE(NORMAL);
return false; return false;
} }
ENSURE_SPACE; ENSURE_SPACE;
@ -733,11 +739,11 @@ END_ALLOW_CASE_RANGE
break; break;
case NUL: case NUL:
case DEL: case DEL:
SET_STATE(VTE_NORMAL); SET_STATE(NORMAL);
break; // no-op break; // no-op
default: default:
REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch); REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch);
SET_STATE(VTE_NORMAL); SET_STATE(NORMAL);
return false; return false;
} }
@ -1345,25 +1351,25 @@ accumulate_oth(PS *self) {
dispatch##_esc_mode_byte(self); \ dispatch##_esc_mode_byte(self); \
break; \ break; \
case VTE_CSI: \ case VTE_CSI: \
if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \ if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(NORMAL); watch_for_pending; } \
break; \ break; \
case VTE_OSC: \ case VTE_OSC: \
if (accumulate_osc(self)) { \ if (accumulate_osc(self)) { \
dispatch##_osc(self); \ dispatch##_osc(self); \
if (self->extended_osc_code) { \ if (self->extended_osc_code) { \
self->input_pos--; \ self->input_pos--; \
if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(VTE_NORMAL); } \ if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(NORMAL); } \
} else { SET_STATE(VTE_NORMAL); } \ } else { SET_STATE(NORMAL); } \
} \ } \
break; \ break; \
case VTE_APC: \ case VTE_APC: \
if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(VTE_NORMAL); } \ if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(NORMAL); } \
break; \ break; \
case VTE_PM: \ case VTE_PM: \
if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(VTE_NORMAL); } \ if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(NORMAL); } \
break; \ break; \
case VTE_DCS: \ case VTE_DCS: \
if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \ if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(NORMAL); watch_for_pending; } \
if (self->vte_state == ESC) { self->input_pos--; dispatch##_esc_mode_byte(self); } \ if (self->vte_state == ESC) { self->input_pos--; dispatch##_esc_mode_byte(self); } \
break; \ break; \
case VTE_NORMAL: \ case VTE_NORMAL: \
@ -1389,7 +1395,7 @@ pending_normal_mode_byte(PS *self) {
uint8_t ch = self->input_data[self->input_pos++]; uint8_t ch = self->input_data[self->input_pos++];
switch(ch) { switch(ch) {
case ESC: case ESC:
SET_STATE(VTE_ESC); break; SET_STATE(ESC); break;
default: default:
ensure_pending_space(self, 1); ensure_pending_space(self, 1);
self->pending_mode.buf[self->pending_mode.used++] = ch; self->pending_mode.buf[self->pending_mode.used++] = ch;
@ -1405,20 +1411,20 @@ pending_esc_mode_byte(PS *self) {
self->pending_mode.buf[self->pending_mode.used++] = ESC; self->pending_mode.buf[self->pending_mode.used++] = ESC;
self->pending_mode.buf[self->pending_mode.used++] = self->parser_buf[self->parser_buf_pos - 1]; self->pending_mode.buf[self->pending_mode.used++] = self->parser_buf[self->parser_buf_pos - 1];
self->pending_mode.buf[self->pending_mode.used++] = ch; self->pending_mode.buf[self->pending_mode.used++] = ch;
SET_STATE(VTE_NORMAL); SET_STATE(NORMAL);
return; return;
} }
switch (ch) { switch (ch) {
case ESC_DCS: case ESC_DCS:
SET_STATE(VTE_DCS); break; SET_STATE(DCS); break;
case ESC_OSC: case ESC_OSC:
SET_STATE(VTE_OSC); break; SET_STATE(OSC); break;
case ESC_CSI: case ESC_CSI:
SET_STATE(VTE_CSI); break; SET_STATE(CSI); break;
case ESC_APC: case ESC_APC:
SET_STATE(VTE_APC); break; SET_STATE(APC); break;
case ESC_PM: case ESC_PM:
SET_STATE(VTE_PM); break; SET_STATE(PM); break;
IS_ESCAPED_CHAR: IS_ESCAPED_CHAR:
self->parser_buf[self->parser_buf_pos++] = ch; self->parser_buf[self->parser_buf_pos++] = ch;
break; break;
@ -1426,7 +1432,7 @@ pending_esc_mode_byte(PS *self) {
ensure_pending_space(self, 2); ensure_pending_space(self, 2);
self->pending_mode.buf[self->pending_mode.used++] = ESC; self->pending_mode.buf[self->pending_mode.used++] = ESC;
self->pending_mode.buf[self->pending_mode.used++] = ch; self->pending_mode.buf[self->pending_mode.used++] = ch;
SET_STATE(VTE_NORMAL); break; SET_STATE(NORMAL); break;
} }
} }
@ -1651,7 +1657,8 @@ free_vt_parser(Parser* self) {
static void static void
reset(PS *self) { reset(PS *self) {
self->vte_state = VTE_NORMAL; self->vte_state = VTE_NORMAL;
self->utf8_state = UTF8_ACCEPT; self->utf8.state = UTF8_ACCEPT;
self->utf8.prev = UTF8_ACCEPT;
self->parser_buf_pos = 0; self->parser_buf_pos = 0;
self->pending_mode.activated_at = 0; self->pending_mode.activated_at = 0;

View File

@ -84,6 +84,24 @@ class TestParser(BaseTest):
c1_controls = '\x84\x85\x88\x8d\x8e\x8f\x90\x96\x97\x98\x9a\x9b\x9c\x9d\x9e\x9f' c1_controls = '\x84\x85\x88\x8d\x8e\x8f\x90\x96\x97\x98\x9a\x9b\x9c\x9d\x9e\x9f'
pb(c1_controls, c1_controls) pb(c1_controls, c1_controls)
self.assertFalse(str(s.line(1)) + str(s.line(2)) + str(s.line(3))) self.assertFalse(str(s.line(1)) + str(s.line(2)) + str(s.line(3)))
pb('😀'.encode()[:-1])
pb('\x1b\x1b%a', ('Unknown char after ESC: 0x1b',), ('draw', '%a'))
def test_utf8_parsing(self):
s = self.create_screen()
pb = partial(self.parse_bytes_dump, s)
pb(b'"\xbf"', '"\ufffd"')
pb(b'"\x80"', '"\ufffd"')
pb(b'"\x80\xbf"', '"\ufffd\ufffd"')
pb(b'"\x80\xbf\x80"', '"\ufffd\ufffd\ufffd"')
pb(b'"\xc0 "', '"\ufffd "')
pb(b'"\xfe"', '"\ufffd"')
pb(b'"\xff"', '"\ufffd"')
pb(b'"\xff\xfe"', '"\ufffd\ufffd"')
pb(b'"\xfe\xfe\xff\xff"', '"\ufffd\ufffd\ufffd\ufffd"')
pb(b'"\xef\xbf"', '"\ufffd"')
pb(b'"\xe0\xa0"', '"\ufffd"')
pb(b'"\xf0\x9f\x98"', '"\ufffd"')
def test_esc_codes(self): def test_esc_codes(self):
s = self.create_screen() s = self.create_screen()
@ -395,6 +413,11 @@ class TestParser(BaseTest):
pb('\033[?2026h\033', ('screen_set_mode', 2026, 1),) pb('\033[?2026h\033', ('screen_set_mode', 2026, 1),)
s.set_pending_activated_at(0.00001) s.set_pending_activated_at(0.00001)
pb(']8;;\x07', ('set_active_hyperlink', None, None)) pb(']8;;\x07', ('set_active_hyperlink', None, None))
pb('😀'.encode()[:-1])
pb('\033[?2026h', ('screen_set_mode', 2026, 1),)
pb('😀'.encode()[-1:])
pb('\033[?2026l', '\ufffd', ('screen_reset_mode', 2026, 1),)
pb('a', ('draw', 'a'))
def test_oth_codes(self): def test_oth_codes(self):
s = self.create_screen() s = self.create_screen()