1
1
mirror of https://github.com/wez/wezterm.git synced 2024-11-29 10:04:49 +03:00

vtparse: document things

This commit is contained in:
Wez Furlong 2019-06-29 18:58:07 -07:00
parent d67d4aab57
commit ed5dcd71da
4 changed files with 117 additions and 5 deletions

View File

@ -3,6 +3,12 @@ authors = ["Wez Furlong <wez@wezfurlong.org>"]
name = "vtparse"
version = "0.1.0"
edition = "2018"
repository = "https://github.com/wez/wezterm"
description = "Low level escape sequence parser"
license = "MIT"
documentation = "https://docs.rs/termwiz"
keywords = ["terminal", "escape", "ansi", "sequence", "parser"]
readme = "README.md"
[dependencies]
utf8parse = "0.1"

20
vtparse/README.md Normal file
View File

@ -0,0 +1,20 @@
# vtparse
This is an implementation of a parser for escape and control sequences.
It is based on the [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser).
It has been modified slightly to support UTF-8 sequences.
`vtparse` is the lowest level parser; it categorizes the basic
types of sequences but does not ascribe any semantic meaning
to them.
You may wish to look at `termwiz::escape::parser::Parser` in the
[termwiz](https://docs.rs/termwiz) crate if you're looking for semantic
parsing.
## Comparison with the `vte` crate
`vtparse` has support for dynamically sized OSC buffers, which makes
it suitable for processing large escape sequences, such as those
used by the `iTerm2` image protocol.

View File

@ -335,7 +335,7 @@ fn pack(action: Action, state: State) -> u8 {
fn write_tables(dest_path: std::path::PathBuf, tables: &Tables) -> std::io::Result<()> {
let mut f = std::fs::File::create(&dest_path)?;
writeln!(f, "pub static TRANSITIONS: [[u8; 256]; 14] = [")?;
writeln!(f, "static TRANSITIONS: [[u8; 256]; 14] = [")?;
for state_num in State::Ground as u8..State::Anywhere as u8 {
let this_state = State::from_u8(state_num);
writeln!(f, " // State: {:?}", this_state)?;
@ -365,7 +365,7 @@ fn event_table(
f: &mut std::fs::File,
table: &HashMap<State, Action>,
) -> std::io::Result<()> {
writeln!(f, "pub static {}: [Action; 14] = [", label)?;
writeln!(f, "static {}: [Action; 14] = [", label)?;
for state_num in State::Ground as u8..State::Anywhere as u8 {
let this_state = State::from_u8(state_num);
let action = table.get(&this_state).cloned().unwrap_or(Action::None);

View File

@ -1,5 +1,15 @@
//! An implementation of the state machine described by
//! https://vt100.net/emu/dec_ansi_parser
//! [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser), modified to support UTF-8.
//!
//! This is sufficient to broadly categorize ANSI/ECMA-48 escape sequences that are
//! commonly used in terminal emulators. It does not ascribe semantic meaning to
//! those escape sequences; for example, if you wish to parse the SGR sequence
//! that makes text bold, you will need to know which codes correspond to bold
//! in your implementation of `VTActor`.
//!
//! You may wish to use `termwiz::escape::parser::Parser` in the
//! [termwiz](https://docs.rs/termwiz/) crate if you don't want to have to research
//! all those possible escape sequences for yourself.
use utf8parse::Parser as Utf8Parser;
mod enums;
use crate::enums::*;
@ -26,10 +36,40 @@ fn lookup_exit(state: State) -> Action {
unsafe { *EXIT.get_unchecked(state as usize) }
}
/// Terminology: an intermediate is a character in the range 0x20-0x2f
/// `VTActor` is a trait that allows the host application to process
/// the different kinds of sequence as they are parsed from the input
/// stream.
///
/// The functions defined by this trait correspond to the actions defined
/// in the [state machine](https://vt100.net/emu/dec_ansi_parser).
///
/// ## Terminology:
/// An intermediate is a character in the range 0x20-0x2f that
/// occurs before the final character in an escape sequence.
///
/// `ignored_excess_intermediates` is a boolean that is set in the case
/// where there were more than two intermediate characters; no standard
/// defines any codes with more than two. Intermediates after
/// the second will set this flag and are discarded.
///
/// `params` in most of the functions of this trait are decimal integer parameters in escape
/// sequences. They are separated by semicolon characters. An omitted parameter is returned in
/// this interface as a zero, which represents the default value for that parameter.
///
/// Other jargon used here is defined in
/// [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf).
pub trait VTActor {
/// The current code should be mapped to a glyph according to the character set mappings and
/// shift states in effect, and that glyph should be displayed. GL characters (20 to 7F) are
/// shift states in effect, and that glyph should be displayed.
///
/// If the input was UTF-8 then it will have been mapped to a unicode code point. Invalid
/// sequences are represented here using the unicode REPLACEMENT_CHARACTER.
///
/// Otherwise the parameter will be a 7-bit printable value and may be subject to mapping
/// depending on other state maintained by the embedding application.
///
/// ## Some commentary from the state machine documentation:
/// GL characters (20 to 7F) are
/// printed. 20 (SP) and 7F (DEL) are included in this area, although both codes have special
/// behaviour. If a 94-character set is mapped into GL, 20 will cause a space to be displayed,
/// and 7F will be ignored. When a 96-character set is mapped into GL, both 20 and 7F may cause
@ -44,12 +84,18 @@ pub trait VTActor {
/// The C0 or C1 control function should be executed, which may have any one of a variety of
/// effects, including changing the cursor position, suspending or resuming communications or
/// changing the shift states in effect.
///
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
/// for more information on C0 and C1 control functions.
fn execute_c0_or_c1(&mut self, control: u8);
/// invoked when a final character arrives in the first part of a device control string. It
/// determines the control function from the private marker, intermediate character(s) and
/// final character, and executes it, passing in the parameter list. It also selects a handler
/// function for the rest of the characters in the control string.
///
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
/// for more information on device control strings.
fn dcs_hook(
&mut self,
params: &[i64],
@ -60,10 +106,24 @@ pub trait VTActor {
/// This action passes characters from the data string part of a device control string to a
/// handler that has previously been selected by the dcs_hook action. C0 controls are also
/// passed to the handler.
///
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
/// for more information on device control strings.
fn dcs_put(&mut self, byte: u8);
/// When a device control string is terminated by ST, CAN, SUB or ESC, this action calls the
/// previously selected handler function with an “end of data” parameter. This allows the
/// handler to finish neatly.
///
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
/// for more information on device control strings.
fn dcs_unhook(&mut self);
/// The final character of an escape sequence has arrived, so determine the control function
/// to be executed from the intermediate character(s) and final character, and execute it.
///
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
/// for more information on escape sequences.
fn esc_dispatch(
&mut self,
params: &[i64],
@ -71,6 +131,13 @@ pub trait VTActor {
ignored_excess_intermediates: bool,
byte: u8,
);
/// A final character of a Control Sequence Initiator has arrived, so determine the control function to be executed from
/// private marker, intermediate character(s) and final character, and execute it, passing in
/// the parameter list.
///
/// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
/// for more information on control functions.
fn csi_dispatch(
&mut self,
params: &[i64],
@ -78,9 +145,18 @@ pub trait VTActor {
ignored_excess_intermediates: bool,
byte: u8,
);
/// Called when an OSC string is terminated by ST, CAN, SUB or ESC.
///
/// `params` is an array of byte strings (which may also be valid utf-8)
/// that were passed as semicolon separated parameters to the operating
/// system command.
fn osc_dispatch(&mut self, params: &[&[u8]]);
}
/// `VTAction` is an alternative way to work with the parser; rather
/// than implementing the VTActor trait you can use `CollectingVTActor`
/// to capture the sequence of events into a `Vec<VTAction>`.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum VTAction {
Print(char),
@ -107,6 +183,10 @@ pub enum VTAction {
OscDispatch(Vec<Vec<u8>>),
}
/// This is an implementation of `VTActor` that captures the events
/// into an internal vector.
/// It can be iterated via `into_iter` or have the internal
/// vector extracted via `into_vec`.
#[derive(Default)]
pub struct CollectingVTActor {
actions: Vec<VTAction>,
@ -198,6 +278,7 @@ const MAX_INTERMEDIATES: usize = 2;
const MAX_OSC: usize = 16;
const MAX_PARAMS: usize = 16;
/// The virtual terminal parser. It works together with an implementation of `VTActor`.
pub struct VTParser {
state: State,
@ -402,6 +483,8 @@ impl VTParser {
self.utf8_parser.advance(&mut decoder, byte);
}
/// Parse a single byte. This may result in a call to one of the
/// methods on the provided `actor`.
#[inline(always)]
pub fn parse_byte(&mut self, byte: u8, actor: &mut dyn VTActor) {
// While in utf-8 parsing mode, co-opt the vt state
@ -425,6 +508,9 @@ impl VTParser {
}
}
/// Parse a sequence of bytes. The sequence need not be complete.
/// This may result in some number of calls to the methods on the
/// provided `actor`.
pub fn parse(&mut self, bytes: &[u8], actor: &mut dyn VTActor) {
for b in bytes {
self.parse_byte(*b, actor);