vtparse: document things

2024-11-25 21:07:39 +03:00 · 2019-06-29 18:58:07 -07:00 · 2019-06-29 18:58:07 -07:00 · ed5dcd71da
commit ed5dcd71da
parent d67d4aab57
4 changed files with 117 additions and 5 deletions
--- a/vtparse/Cargo.toml
+++ b/vtparse/Cargo.toml
@ -3,6 +3,12 @@ authors = ["Wez Furlong <wez@wezfurlong.org>"]
 name = "vtparse"
 version = "0.1.0"
 edition = "2018"
+repository = "https://github.com/wez/wezterm"
+description = "Low level escape sequence parser"
+license = "MIT"
+documentation = "https://docs.rs/termwiz"
+keywords = ["terminal", "escape", "ansi", "sequence", "parser"]
+readme = "README.md"

 [dependencies]
 utf8parse = "0.1"
--- a/vtparse/README.md
+++ b/vtparse/README.md
@ -0,0 +1,20 @@
+# vtparse
+
+This is an implementation of a parser for escape and control sequences.
+It is based on the [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser).
+
+It has been modified slightly to support UTF-8 sequences.
+
+`vtparse` is the lowest level parser; it categorizes the basic
+types of sequences but does not ascribe any semantic meaning
+to them.
+
+You may wish to look at `termwiz::escape::parser::Parser` in the
+[termwiz](https://docs.rs/termwiz) crate if you're looking for semantic
+parsing.
+
+## Comparison with the `vte` crate
+
+`vtparse` has support for dynamically sized OSC buffers, which makes
+it suitable for processing large escape sequences, such as those
+used by the `iTerm2` image protocol.
--- a/vtparse/build.rs
+++ b/vtparse/build.rs
@ -335,7 +335,7 @@ fn pack(action: Action, state: State) -> u8 {

 fn write_tables(dest_path: std::path::PathBuf, tables: &Tables) -> std::io::Result<()> {
    let mut f = std::fs::File::create(&dest_path)?;
-    writeln!(f, "pub static TRANSITIONS: [[u8; 256]; 14] = [")?;
+    writeln!(f, "static TRANSITIONS: [[u8; 256]; 14] = [")?;
    for state_num in State::Ground as u8..State::Anywhere as u8 {
        let this_state = State::from_u8(state_num);
        writeln!(f, "  // State: {:?}", this_state)?;
@ -365,7 +365,7 @@ fn event_table(
    f: &mut std::fs::File,
    table: &HashMap<State, Action>,
 ) -> std::io::Result<()> {
-    writeln!(f, "pub static {}: [Action; 14] = [", label)?;
+    writeln!(f, "static {}: [Action; 14] = [", label)?;
    for state_num in State::Ground as u8..State::Anywhere as u8 {
        let this_state = State::from_u8(state_num);
        let action = table.get(&this_state).cloned().unwrap_or(Action::None);
--- a/vtparse/src/lib.rs
+++ b/vtparse/src/lib.rs
@ -1,5 +1,15 @@
 //! An implementation of the state machine described by
-//! https://vt100.net/emu/dec_ansi_parser
+//! [DEC ANSI Parser](https://vt100.net/emu/dec_ansi_parser), modified to support UTF-8.
+//!
+//! This is sufficient to broadly categorize ANSI/ECMA-48 escape sequences that are
+//! commonly used in terminal emulators.  It does not ascribe semantic meaning to
+//! those escape sequences; for example, if you wish to parse the SGR sequence
+//! that makes text bold, you will need to know which codes correspond to bold
+//! in your implementation of `VTActor`.
+//!
+//! You may wish to use `termwiz::escape::parser::Parser` in the
+//! [termwiz](https://docs.rs/termwiz/) crate if you don't want to have to research
+//! all those possible escape sequences for yourself.
 use utf8parse::Parser as Utf8Parser;
 mod enums;
 use crate::enums::*;
@ -26,10 +36,40 @@ fn lookup_exit(state: State) -> Action {
    unsafe { *EXIT.get_unchecked(state as usize) }
 }

-/// Terminology: an intermediate is a character in the range 0x20-0x2f
+/// `VTActor` is a trait that allows the host application to process
+/// the different kinds of sequence as they are parsed from the input
+/// stream.
+///
+/// The functions defined by this trait correspond to the actions defined
+/// in the [state machine](https://vt100.net/emu/dec_ansi_parser).
+///
+/// ## Terminology:
+/// An intermediate is a character in the range 0x20-0x2f that
+/// occurs before the final character in an escape sequence.
+///
+/// `ignored_excess_intermediates` is a boolean that is set in the case
+/// where there were more than two intermediate characters; no standard
+/// defines any codes with more than two.  Intermediates after
+/// the second will set this flag and are discarded.
+///
+/// `params` in most of the functions of this trait are decimal integer parameters in escape
+/// sequences.  They are separated by semicolon characters.  An omitted parameter is returned in
+/// this interface as a zero, which represents the default value for that parameter.
+///
+/// Other jargon used here is defined in
+/// [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf).
 pub trait VTActor {
    /// The current code should be mapped to a glyph according to the character set mappings and
-    /// shift states in effect, and that glyph should be displayed.  GL characters (20 to 7F) are
+    /// shift states in effect, and that glyph should be displayed.
+    ///
+    /// If the input was UTF-8 then it will have been mapped to a unicode code point.  Invalid
+    /// sequences are represented here using the unicode REPLACEMENT_CHARACTER.
+    ///
+    /// Otherwise the parameter will be a 7-bit printable value and may be subject to mapping
+    /// depending on other state maintained by the embedding application.
+    ///
+    /// ## Some commentary from the state machine documentation:
+    /// GL characters (20 to 7F) are
    /// printed. 20 (SP) and 7F (DEL) are included in this area, although both codes have special
    /// behaviour. If a 94-character set is mapped into GL, 20 will cause a space to be displayed,
    /// and 7F will be ignored. When a 96-character set is mapped into GL, both 20 and 7F may cause
@ -44,12 +84,18 @@ pub trait VTActor {
    /// The C0 or C1 control function should be executed, which may have any one of a variety of
    /// effects, including changing the cursor position, suspending or resuming communications or
    /// changing the shift states in effect.
+    ///
+    /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
+    /// for more information on C0 and C1 control functions.
    fn execute_c0_or_c1(&mut self, control: u8);

    /// invoked when a final character arrives in the first part of a device control string. It
    /// determines the control function from the private marker, intermediate character(s) and
    /// final character, and executes it, passing in the parameter list. It also selects a handler
    /// function for the rest of the characters in the control string.
+    ///
+    /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
+    /// for more information on device control strings.
    fn dcs_hook(
        &mut self,
        params: &[i64],
@ -60,10 +106,24 @@ pub trait VTActor {
    /// This action passes characters from the data string part of a device control string to a
    /// handler that has previously been selected by the dcs_hook action. C0 controls are also
    /// passed to the handler.
+    ///
+    /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
+    /// for more information on device control strings.
    fn dcs_put(&mut self, byte: u8);

+    /// When a device control string is terminated by ST, CAN, SUB or ESC, this action calls the
+    /// previously selected handler function with an “end of data” parameter. This allows the
+    /// handler to finish neatly.
+    ///
+    /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
+    /// for more information on device control strings.
    fn dcs_unhook(&mut self);

+    /// The final character of an escape sequence has arrived, so determine the control function
+    /// to be executed from the intermediate character(s) and final character, and execute it.
+    ///
+    /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
+    /// for more information on escape sequences.
    fn esc_dispatch(
        &mut self,
        params: &[i64],
@ -71,6 +131,13 @@ pub trait VTActor {
        ignored_excess_intermediates: bool,
        byte: u8,
    );
+
+    /// A final character of a Control Sequence Initiator has arrived, so determine the control function to be executed from
+    /// private marker, intermediate character(s) and final character, and execute it, passing in
+    /// the parameter list.
+    ///
+    /// See [ECMA-48](http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-48,%202nd%20Edition,%20August%201979.pdf)
+    /// for more information on control functions.
    fn csi_dispatch(
        &mut self,
        params: &[i64],
@ -78,9 +145,18 @@ pub trait VTActor {
        ignored_excess_intermediates: bool,
        byte: u8,
    );
+
+    /// Called when an OSC string is terminated by ST, CAN, SUB or ESC.
+    ///
+    /// `params` is an array of byte strings (which may also be valid utf-8)
+    /// that were passed as semicolon separated parameters to the operating
+    /// system command.
    fn osc_dispatch(&mut self, params: &[&[u8]]);
 }

+/// `VTAction` is an alternative way to work with the parser; rather
+/// than implementing the VTActor trait you can use `CollectingVTActor`
+/// to capture the sequence of events into a `Vec<VTAction>`.
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub enum VTAction {
    Print(char),
@ -107,6 +183,10 @@ pub enum VTAction {
    OscDispatch(Vec<Vec<u8>>),
 }

+/// This is an implementation of `VTActor` that captures the events
+/// into an internal vector.
+/// It can be iterated via `into_iter` or have the internal
+/// vector extracted via `into_vec`.
 #[derive(Default)]
 pub struct CollectingVTActor {
    actions: Vec<VTAction>,
@ -198,6 +278,7 @@ const MAX_INTERMEDIATES: usize = 2;
 const MAX_OSC: usize = 16;
 const MAX_PARAMS: usize = 16;

+/// The virtual terminal parser.  It works together with an implementation of `VTActor`.
 pub struct VTParser {
    state: State,

@ -402,6 +483,8 @@ impl VTParser {
        self.utf8_parser.advance(&mut decoder, byte);
    }

+    /// Parse a single byte.  This may result in a call to one of the
+    /// methods on the provided `actor`.
    #[inline(always)]
    pub fn parse_byte(&mut self, byte: u8, actor: &mut dyn VTActor) {
        // While in utf-8 parsing mode, co-opt the vt state
@ -425,6 +508,9 @@ impl VTParser {
        }
    }

+    /// Parse a sequence of bytes.  The sequence need not be complete.
+    /// This may result in some number of calls to the methods on the
+    /// provided `actor`.
    pub fn parse(&mut self, bytes: &[u8], actor: &mut dyn VTActor) {
        for b in bytes {
            self.parse_byte(*b, actor);