mirror of
https://github.com/wez/wezterm.git
synced 2024-11-25 21:07:39 +03:00
vtparse: document things
This commit is contained in:
parent
d67d4aab57
commit
ed5dcd71da
@ -3,6 +3,12 @@ authors = ["Wez Furlong <wez@wezfurlong.org>"]
|
||||
name = "vtparse"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
repository = "https://github.com/wez/wezterm"
|
||||
description = "Low level escape sequence parser"
|
||||
license = "MIT"
|
||||
documentation = "https://docs.rs/vtparse"
|
||||
keywords = ["terminal", "escape", "ansi", "sequence", "parser"]
|
||||
readme = "README.md"
|
||||
|
||||
[dependencies]
|
||||
utf8parse = "0.1"
|
||||
|
20
vtparse/README.md
Normal file
20
vtparse/README.md
Normal file
@ -0,0 +1,20 @@
|
||||
# vtparse
|
||||
|
||||
This is an implementation of a parser for escape and control sequences.
|
||||
It is based on the [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser).
|
||||
|
||||
It has been modified slightly to support UTF-8 sequences.
|
||||
|
||||
`vtparse` is the lowest level parser; it categorizes the basic
|
||||
types of sequences but does not ascribe any semantic meaning
|
||||
to them.
|
||||
|
||||
You may wish to look at `termwiz::escape::parser::Parser` in the
|
||||
[termwiz](https://docs.rs/termwiz) crate if you're looking for semantic
|
||||
parsing.
|
||||
|
||||
## Comparison with the `vte` crate
|
||||
|
||||
`vtparse` has support for dynamically sized OSC buffers, which makes
|
||||
it suitable for processing large escape sequences, such as those
|
||||
used by the `iTerm2` image protocol.
|
@ -335,7 +335,7 @@ fn pack(action: Action, state: State) -> u8 {
|
||||
|
||||
fn write_tables(dest_path: std::path::PathBuf, tables: &Tables) -> std::io::Result<()> {
|
||||
let mut f = std::fs::File::create(&dest_path)?;
|
||||
writeln!(f, "pub static TRANSITIONS: [[u8; 256]; 14] = [")?;
|
||||
writeln!(f, "static TRANSITIONS: [[u8; 256]; 14] = [")?;
|
||||
for state_num in State::Ground as u8..State::Anywhere as u8 {
|
||||
let this_state = State::from_u8(state_num);
|
||||
writeln!(f, " // State: {:?}", this_state)?;
|
||||
@ -365,7 +365,7 @@ fn event_table(
|
||||
f: &mut std::fs::File,
|
||||
table: &HashMap<State, Action>,
|
||||
) -> std::io::Result<()> {
|
||||
writeln!(f, "pub static {}: [Action; 14] = [", label)?;
|
||||
writeln!(f, "static {}: [Action; 14] = [", label)?;
|
||||
for state_num in State::Ground as u8..State::Anywhere as u8 {
|
||||
let this_state = State::from_u8(state_num);
|
||||
let action = table.get(&this_state).cloned().unwrap_or(Action::None);
|
||||
|
@ -1,5 +1,15 @@
|
||||
//! An implementation of the state machine described by
|
||||
//! https://vt100.net/emu/dec_ansi_parser
|
||||
//! [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser), modified to support UTF-8.
|
||||
//!
|
||||
//! This is sufficient to broadly categorize ANSI/ECMA-48 escape sequences that are
|
||||
//! commonly used in terminal emulators. It does not ascribe semantic meaning to
|
||||
//! those escape sequences; for example, if you wish to parse the SGR sequence
|
||||
//! that makes text bold, you will need to know which codes correspond to bold
|
||||
//! in your implementation of `VTActor`.
|
||||
//!
|
||||
//! You may wish to use `termwiz::escape::parser::Parser` in the
|
||||
//! [termwiz](https://docs.rs/termwiz/) crate if you don't want to have to research
|
||||
//! all those possible escape sequences for yourself.
|
||||
use utf8parse::Parser as Utf8Parser;
|
||||
mod enums;
|
||||
use crate::enums::*;
|
||||
@ -26,10 +36,40 @@ fn lookup_exit(state: State) -> Action {
|
||||
unsafe { *EXIT.get_unchecked(state as usize) }
|
||||
}
|
||||
|
||||
/// Terminology: an intermediate is a character in the range 0x20-0x2f
|
||||
/// `VTActor` is a trait that allows the host application to process
|
||||
/// the different kinds of sequence as they are parsed from the input
|
||||
/// stream.
|
||||
///
|
||||
/// The functions defined by this trait correspond to the actions defined
|
||||
/// in the [state machine](https://vt100.net/emu/dec_ansi_parser).
|
||||
///
|
||||
/// ## Terminology:
|
||||
/// An intermediate is a character in the range 0x20-0x2f that
|
||||
/// occurs before the final character in an escape sequence.
|
||||
///
|
||||
/// `ignored_excess_intermediates` is a boolean that is set in the case
|
||||
/// where there were more than two intermediate characters; no standard
|
||||
/// defines any codes with more than two. Intermediates after
|
||||
/// the second will set this flag and are discarded.
|
||||
///
|
||||
/// `params` in most of the functions of this trait are decimal integer parameters in escape
|
||||
/// sequences. They are separated by semicolon characters. An omitted parameter is returned in
|
||||
/// this interface as a zero, which represents the default value for that parameter.
|
||||
///
|
||||
/// Other jargon used here is defined in
|
||||
/// [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf).
|
||||
pub trait VTActor {
|
||||
/// The current code should be mapped to a glyph according to the character set mappings and
|
||||
/// shift states in effect, and that glyph should be displayed. GL characters (20 to 7F) are
|
||||
/// shift states in effect, and that glyph should be displayed.
|
||||
///
|
||||
/// If the input was UTF-8 then it will have been mapped to a unicode code point. Invalid
|
||||
/// sequences are represented here using the unicode REPLACEMENT_CHARACTER.
|
||||
///
|
||||
/// Otherwise the parameter will be a 7-bit printable value and may be subject to mapping
|
||||
/// depending on other state maintained by the embedding application.
|
||||
///
|
||||
/// ## Some commentary from the state machine documentation:
|
||||
/// GL characters (20 to 7F) are
|
||||
/// printed. 20 (SP) and 7F (DEL) are included in this area, although both codes have special
|
||||
/// behaviour. If a 94-character set is mapped into GL, 20 will cause a space to be displayed,
|
||||
/// and 7F will be ignored. When a 96-character set is mapped into GL, both 20 and 7F may cause
|
||||
@ -44,12 +84,18 @@ pub trait VTActor {
|
||||
/// The C0 or C1 control function should be executed, which may have any one of a variety of
|
||||
/// effects, including changing the cursor position, suspending or resuming communications or
|
||||
/// changing the shift states in effect.
|
||||
///
|
||||
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
|
||||
/// for more information on C0 and C1 control functions.
|
||||
fn execute_c0_or_c1(&mut self, control: u8);
|
||||
|
||||
/// Invoked when a final character arrives in the first part of a device control string. It
|
||||
/// determines the control function from the private marker, intermediate character(s) and
|
||||
/// final character, and executes it, passing in the parameter list. It also selects a handler
|
||||
/// function for the rest of the characters in the control string.
|
||||
///
|
||||
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
|
||||
/// for more information on device control strings.
|
||||
fn dcs_hook(
|
||||
&mut self,
|
||||
params: &[i64],
|
||||
@ -60,10 +106,24 @@ pub trait VTActor {
|
||||
/// This action passes characters from the data string part of a device control string to a
|
||||
/// handler that has previously been selected by the dcs_hook action. C0 controls are also
|
||||
/// passed to the handler.
|
||||
///
|
||||
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
|
||||
/// for more information on device control strings.
|
||||
fn dcs_put(&mut self, byte: u8);
|
||||
|
||||
/// When a device control string is terminated by ST, CAN, SUB or ESC, this action calls the
|
||||
/// previously selected handler function with an “end of data” parameter. This allows the
|
||||
/// handler to finish neatly.
|
||||
///
|
||||
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
|
||||
/// for more information on device control strings.
|
||||
fn dcs_unhook(&mut self);
|
||||
|
||||
/// The final character of an escape sequence has arrived, so determine the control function
|
||||
/// to be executed from the intermediate character(s) and final character, and execute it.
|
||||
///
|
||||
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
|
||||
/// for more information on escape sequences.
|
||||
fn esc_dispatch(
|
||||
&mut self,
|
||||
params: &[i64],
|
||||
@ -71,6 +131,13 @@ pub trait VTActor {
|
||||
ignored_excess_intermediates: bool,
|
||||
byte: u8,
|
||||
);
|
||||
|
||||
/// The final character of a control sequence (one introduced by CSI) has arrived, so determine the control function to be executed from the
|
||||
/// private marker, intermediate character(s) and final character, and execute it, passing in
|
||||
/// the parameter list.
|
||||
///
|
||||
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
|
||||
/// for more information on control functions.
|
||||
fn csi_dispatch(
|
||||
&mut self,
|
||||
params: &[i64],
|
||||
@ -78,9 +145,18 @@ pub trait VTActor {
|
||||
ignored_excess_intermediates: bool,
|
||||
byte: u8,
|
||||
);
|
||||
|
||||
/// Called when an OSC string is terminated by ST, CAN, SUB or ESC.
|
||||
///
|
||||
/// `params` is an array of byte strings (which may also be valid utf-8)
|
||||
/// that were passed as semicolon separated parameters to the operating
|
||||
/// system command.
|
||||
fn osc_dispatch(&mut self, params: &[&[u8]]);
|
||||
}
|
||||
|
||||
/// `VTAction` is an alternative way to work with the parser; rather
|
||||
/// than implementing the VTActor trait you can use `CollectingVTActor`
|
||||
/// to capture the sequence of events into a `Vec<VTAction>`.
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
pub enum VTAction {
|
||||
Print(char),
|
||||
@ -107,6 +183,10 @@ pub enum VTAction {
|
||||
OscDispatch(Vec<Vec<u8>>),
|
||||
}
|
||||
|
||||
/// This is an implementation of `VTActor` that captures the events
|
||||
/// into an internal vector.
|
||||
/// It can be iterated via `into_iter` or have the internal
|
||||
/// vector extracted via `into_vec`.
|
||||
#[derive(Default)]
|
||||
pub struct CollectingVTActor {
|
||||
actions: Vec<VTAction>,
|
||||
@ -198,6 +278,7 @@ const MAX_INTERMEDIATES: usize = 2;
|
||||
const MAX_OSC: usize = 16;
|
||||
const MAX_PARAMS: usize = 16;
|
||||
|
||||
/// The virtual terminal parser. It works together with an implementation of `VTActor`.
|
||||
pub struct VTParser {
|
||||
state: State,
|
||||
|
||||
@ -402,6 +483,8 @@ impl VTParser {
|
||||
self.utf8_parser.advance(&mut decoder, byte);
|
||||
}
|
||||
|
||||
/// Parse a single byte. This may result in a call to one of the
|
||||
/// methods on the provided `actor`.
|
||||
#[inline(always)]
|
||||
pub fn parse_byte(&mut self, byte: u8, actor: &mut dyn VTActor) {
|
||||
// While in utf-8 parsing mode, co-opt the vt state
|
||||
@ -425,6 +508,9 @@ impl VTParser {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a sequence of bytes. The sequence need not be complete.
|
||||
/// This may result in some number of calls to the methods on the
|
||||
/// provided `actor`.
|
||||
pub fn parse(&mut self, bytes: &[u8], actor: &mut dyn VTActor) {
|
||||
for b in bytes {
|
||||
self.parse_byte(*b, actor);
|
||||
|
Loading…
Reference in New Issue
Block a user