mirror of
https://github.com/wez/wezterm.git
synced 2024-11-22 22:42:48 +03:00
vtparse: Add utf-8 support
This commit is contained in:
parent
2361d88c2a
commit
192eaeb3ff
@ -5,6 +5,7 @@ version = "0.1.0"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
utf8parse = "0.1"
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "0.6"
|
||||
|
@ -36,7 +36,6 @@ macro_rules! sparse_table {
|
||||
|
||||
fn apply_anywhere(anywhere: &StateMap, mut map: StateMap) -> StateMap {
|
||||
for (k, v) in anywhere {
|
||||
assert!(!map.contains_key(k));
|
||||
map.insert(*k, *v);
|
||||
}
|
||||
map
|
||||
@ -79,6 +78,13 @@ fn build_tables() -> Tables {
|
||||
r(0x19) => (Execute, Ground),
|
||||
0x1c..=0x1f => (Execute, Ground),
|
||||
0x20..=0x7f => (Print, Ground),
|
||||
// The following three ranges allow for
|
||||
// UTF-8 multibyte sequences to be recognized
|
||||
// and emitted as byte sequences in the ground
|
||||
// state.
|
||||
0xc2..=0xdf => (Utf8, Utf8Sequence),
|
||||
0xe0..=0xef => (Utf8, Utf8Sequence),
|
||||
0xf0..=0xf4 => (Utf8, Utf8Sequence),
|
||||
},
|
||||
),
|
||||
);
|
||||
@ -308,6 +314,10 @@ fn build_tables() -> Tables {
|
||||
r(0x19) => (Ignore, OscString),
|
||||
0x1c..=0x1f => (Ignore, OscString),
|
||||
0x20..=0x7f => (OscPut, OscString),
|
||||
// This extended range allows for UTF-8 characters
|
||||
// to be embedded in OSC parameters. It is not
|
||||
// part of the base state machine.
|
||||
0x80..=0xff => (OscPut, OscString),
|
||||
},
|
||||
),
|
||||
);
|
||||
|
@ -17,6 +17,7 @@ pub enum Action {
|
||||
OscStart = 12,
|
||||
OscPut = 13,
|
||||
OscEnd = 14,
|
||||
Utf8 = 15,
|
||||
}
|
||||
|
||||
impl Action {
|
||||
@ -46,6 +47,7 @@ pub enum State {
|
||||
OscString = 12,
|
||||
SosPmApcString = 13,
|
||||
Anywhere = 14,
|
||||
Utf8Sequence = 15,
|
||||
}
|
||||
|
||||
impl State {
|
||||
|
@ -1,5 +1,6 @@
|
||||
//! An implementation of the state machine described by
|
||||
//! https://vt100.net/emu/dec_ansi_parser
|
||||
use utf8parse::Parser as Utf8Parser;
|
||||
mod enums;
|
||||
use crate::enums::*;
|
||||
|
||||
@ -38,7 +39,7 @@ pub trait VTActor {
|
||||
/// ignore 7F. The VT320 introduced ISO Latin-1, which has 96 characters in its supplemental
|
||||
/// set, so emulators with a VT320 compatibility mode need to treat 7F as a printable
|
||||
/// character.
|
||||
fn print_byte(&mut self, b: u8);
|
||||
fn print(&mut self, b: char);
|
||||
|
||||
/// The C0 or C1 control function should be executed, which may have any one of a variety of
|
||||
/// effects, including changing the cursor position, suspending or resuming communications or
|
||||
@ -82,7 +83,7 @@ pub trait VTActor {
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
pub enum VTAction {
|
||||
PrintByte(u8),
|
||||
Print(char),
|
||||
ExecuteC0orC1(u8),
|
||||
DcsHook {
|
||||
params: Vec<i64>,
|
||||
@ -127,8 +128,8 @@ impl CollectingVTActor {
|
||||
}
|
||||
|
||||
impl VTActor for CollectingVTActor {
|
||||
fn print_byte(&mut self, b: u8) {
|
||||
self.actions.push(VTAction::PrintByte(b));
|
||||
fn print(&mut self, b: char) {
|
||||
self.actions.push(VTAction::Print(b));
|
||||
}
|
||||
|
||||
fn execute_c0_or_c1(&mut self, control: u8) {
|
||||
@ -197,7 +198,6 @@ const MAX_INTERMEDIATES: usize = 2;
|
||||
const MAX_OSC: usize = 16;
|
||||
const MAX_PARAMS: usize = 16;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct VTParser {
|
||||
state: State,
|
||||
|
||||
@ -214,6 +214,8 @@ pub struct VTParser {
|
||||
num_params: usize,
|
||||
current_param: Option<i64>,
|
||||
params_full: bool,
|
||||
|
||||
utf8_parser: Utf8Parser,
|
||||
}
|
||||
|
||||
impl VTParser {
|
||||
@ -237,6 +239,8 @@ impl VTParser {
|
||||
num_params: 0,
|
||||
params_full: false,
|
||||
current_param: None,
|
||||
|
||||
utf8_parser: Utf8Parser::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -253,7 +257,7 @@ impl VTParser {
|
||||
eprintln!("action {:?} {}", action, param);
|
||||
match action {
|
||||
Action::None | Action::Ignore => {}
|
||||
Action::Print => actor.print_byte(param),
|
||||
Action::Print => actor.print(param as char),
|
||||
Action::Execute => actor.execute_c0_or_c1(param),
|
||||
Action::Clear => {
|
||||
self.num_intermediates = 0;
|
||||
@ -273,7 +277,6 @@ impl VTParser {
|
||||
}
|
||||
}
|
||||
Action::Param => {
|
||||
dbg!(&self);
|
||||
if self.params_full {
|
||||
return;
|
||||
}
|
||||
@ -328,7 +331,6 @@ impl VTParser {
|
||||
self.osc_full = false;
|
||||
}
|
||||
Action::OscPut => {
|
||||
dbg!(&self);
|
||||
if param == b';' {
|
||||
match self.osc_num_params {
|
||||
MAX_OSC => {
|
||||
@ -365,11 +367,53 @@ impl VTParser {
|
||||
actor.osc_dispatch(¶ms[0..limit]);
|
||||
}
|
||||
}
|
||||
Action::Utf8 => self.next_utf8(actor, param),
|
||||
}
|
||||
}
|
||||
|
||||
// Process a utf-8 multi-byte sequence.
|
||||
// The state tables emit Action::Utf8 to initiate a multi-byte
|
||||
// sequence, and once we're in the utf-8 state we'll defer to
|
||||
// this method for each byte until the Decode struct is signalled
|
||||
// that we're done.
|
||||
// We use the REPLACEMENT_CHARACTER for invalid sequences.
|
||||
// We return to the ground state after each codepoint, successful
|
||||
// or otherwise.
|
||||
fn next_utf8(&mut self, actor: &mut dyn VTActor, byte: u8) {
|
||||
struct Decoder<'a> {
|
||||
state: &'a mut State,
|
||||
actor: &'a mut dyn VTActor,
|
||||
}
|
||||
|
||||
impl<'a> utf8parse::Receiver for Decoder<'a> {
|
||||
fn codepoint(&mut self, c: char) {
|
||||
self.actor.print(c);
|
||||
*self.state = State::Ground;
|
||||
}
|
||||
|
||||
fn invalid_sequence(&mut self) {
|
||||
self.codepoint(std::char::REPLACEMENT_CHARACTER);
|
||||
}
|
||||
}
|
||||
|
||||
let mut decoder = Decoder {
|
||||
state: &mut self.state,
|
||||
actor,
|
||||
};
|
||||
self.utf8_parser.advance(&mut decoder, byte);
|
||||
}
|
||||
|
||||
pub fn parse(&mut self, bytes: &[u8], actor: &mut dyn VTActor) {
|
||||
for b in bytes {
|
||||
// While in utf-8 parsing mode, co-opt the vt state
|
||||
// table and instead use the utf-8 state table from the
|
||||
// parser. It will drop us back into the Ground state
|
||||
// after each recognized (or invalid) codepoint.
|
||||
if self.state == State::Utf8Sequence {
|
||||
self.next_utf8(actor, *b);
|
||||
continue;
|
||||
}
|
||||
|
||||
let (action, state) = lookup(self.state, *b);
|
||||
|
||||
if state != self.state {
|
||||
@ -401,8 +445,8 @@ mod test {
|
||||
assert_eq!(
|
||||
parse_as_vec(b"yo\x07\x1b[32mwoot\x1b[0mdone"),
|
||||
vec![
|
||||
VTAction::PrintByte(b'y'),
|
||||
VTAction::PrintByte(b'o'),
|
||||
VTAction::Print('y'),
|
||||
VTAction::Print('o'),
|
||||
VTAction::ExecuteC0orC1(0x07,),
|
||||
VTAction::CsiDispatch {
|
||||
params: vec![32],
|
||||
@ -410,20 +454,20 @@ mod test {
|
||||
ignored_excess_intermediates: false,
|
||||
byte: b'm',
|
||||
},
|
||||
VTAction::PrintByte(b'w',),
|
||||
VTAction::PrintByte(b'o',),
|
||||
VTAction::PrintByte(b'o',),
|
||||
VTAction::PrintByte(b't',),
|
||||
VTAction::Print('w',),
|
||||
VTAction::Print('o',),
|
||||
VTAction::Print('o',),
|
||||
VTAction::Print('t',),
|
||||
VTAction::CsiDispatch {
|
||||
params: vec![0],
|
||||
intermediates: vec![],
|
||||
ignored_excess_intermediates: false,
|
||||
byte: b'm',
|
||||
},
|
||||
VTAction::PrintByte(b'd',),
|
||||
VTAction::PrintByte(b'o',),
|
||||
VTAction::PrintByte(b'n',),
|
||||
VTAction::PrintByte(b'e',),
|
||||
VTAction::Print('d',),
|
||||
VTAction::Print('o',),
|
||||
VTAction::Print('n',),
|
||||
VTAction::Print('e',),
|
||||
]
|
||||
);
|
||||
}
|
||||
@ -432,7 +476,7 @@ mod test {
|
||||
fn test_print() {
|
||||
assert_eq!(
|
||||
parse_as_vec(b"yo"),
|
||||
vec![VTAction::PrintByte(b'y'), VTAction::PrintByte(b'o')]
|
||||
vec![VTAction::Print('y'), VTAction::Print('o')]
|
||||
);
|
||||
}
|
||||
|
||||
@ -571,4 +615,20 @@ mod test {
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn osc_utf8() {
|
||||
assert_eq!(
|
||||
parse_as_vec("\x1b]\u{af}\x07".as_bytes()),
|
||||
vec![VTAction::OscDispatch(vec!["\u{af}".as_bytes().to_vec()])]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn print_utf8() {
|
||||
assert_eq!(
|
||||
parse_as_vec("\u{af}".as_bytes()),
|
||||
vec![VTAction::Print('\u{af}')]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user