wezterm: improve shaping of emoji

This is one of those massive time sinks that I almost regret... As part of recent changes to dust-off the allsorts shaper, I noticed that the harfbuzz shaper wasn't shaping as well as the allsorts one. This commit: * Adds emoji-test.txt, a text file you can `cat` to see how well the emoji are shaped and rendered. * Fixes (or at least, improves) the column width calculation for combining sequences such as "deaf man" which was previously calculated at 3 cells in width when it should have just been 2 cells wide, which resulted in a weird "prismatic" effect during rendering where the glyph would be rendered with an extra RHS portion of the glyph across 3 cells. * Improved/simplified the clustering logic used to compute fallbacks. Previously we could end up with some wonky/disjoint sequence of undefined glyphs which wouldn't be successfully resolved from a fallback font. We now make a better effort to consolidate runs of undefined glyphs for fallback. * For sequences such as "woman with veil: dark skin tone" that occupy a single cell, the shaper may return 3 clusters with 3 glyphs in the case that the font doesn't fully support this grapheme. At render time we'd just take the last glyph from that sequence and render it, resulting in eg: a female symbol in this particular case. It is generally a bit more useful to show the first glyph in the sequence (eg: person with veil) rather than the gender or skin tone, so the renderer now checks for this kind of overlapping sequence and renders only the first glyph from the sequence.
2024-11-22 04:56:12 +03:00 · 2020-11-23 13:45:38 -08:00 · 2020-11-23 13:45:38 -08:00 · a063d20cf0
commit a063d20cf0
parent dc453f2eed
12 changed files with 4682 additions and 109 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3873,9 +3873,9 @@ dependencies = [

 [[package]]
 name = "unicode-segmentation"
-version = "1.6.0"
+version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
+checksum = "db8716a166f290ff49dabc18b44aa407cb7c6dbe1aa0971b44b8a24b0ca35aae"

 [[package]]
 name = "unicode-width"
--- a/mux/Cargo.toml
+++ b/mux/Cargo.toml
@ -31,6 +31,6 @@ termwiz = { path = "../termwiz" }
 textwrap = "0.12"
 thiserror = "1.0"
 tmux-cc = { path = "../tmux-cc" }
-unicode-segmentation = "1.6"
+unicode-segmentation = "1.7"
 url = "2"
 wezterm-term = { path = "../term", features=["use_serde"] }
--- a/term/Cargo.toml
+++ b/term/Cargo.toml
@ -24,7 +24,7 @@ ordered-float = "1.0"
 palette = "0.5"
 serde = {version="1.0", features = ["rc"]}
 sha2 = "0.9"
-unicode-segmentation = "1.6"
+unicode-segmentation = "1.7"
 unicode-width = "0.1"
 url = "2"

--- a/termwiz/Cargo.toml
+++ b/termwiz/Cargo.toml
@ -27,7 +27,7 @@ regex = "1"
 semver = "0.9"
 serde = {version="1.0", features = ["rc", "derive"], optional=true}
 terminfo = "0.7"
-unicode-segmentation = "1.6"
+unicode-segmentation = "1.7"
 unicode-width = "0.1"
 xi-unicode = "0.2"
 vtparse = { version="0.3", path="../vtparse" }
--- a/termwiz/src/cell.rs
+++ b/termwiz/src/cell.rs
@ -530,13 +530,27 @@ pub fn grapheme_column_width(s: &str) -> usize {
    // the desired value.
    // Let's check for emoji-ness for ourselves first
    use xi_unicode::EmojiExt;
+    let mut emoji = false;
    for c in s.chars() {
        if c.is_emoji_modifier_base() || c.is_emoji_modifier() {
            // treat modifier sequences as double wide
            return 2;
        }
+        if c.is_emoji() {
+            emoji = true;
+        }
+    }
+    let width = UnicodeWidthStr::width(s);
+    if emoji {
+        // For sequences such as "deaf man", UnicodeWidthStr::width()
+        // returns 3 because of the widths of the component glyphs,
+        // rather than 2 for a single double width grapheme.
+        // If we saw any emoji within the characters then we assume
+        // that it can be a maximum of 2 cells in width.
+        width.min(2)
+    } else {
+        width
    }
-    UnicodeWidthStr::width(s)
 }

 /// Models a change in the attributes of a cell in a stream of changes.
@ -627,5 +641,16 @@ mod test {
            "width of {} should be 2",
            women_holding_hands_dark_skin_tone_medium_light_skin_tone
        );
+
+        let deaf_man = "\u{1F9CF}\u{200D}\u{2642}\u{FE0F}";
+        eprintln!("deaf_man chars");
+        for c in deaf_man.chars() {
+            eprintln!("char: {:?}", c);
+            use xi_unicode::EmojiExt;
+            eprintln!("xi emoji: {}", c.is_emoji());
+            eprintln!("xi emoji_mod: {}", c.is_emoji_modifier());
+            eprintln!("xi emoji_mod_base: {}", c.is_emoji_modifier_base());
+        }
+        assert_eq!(unicode_column_width(deaf_man), 2);
    }
 }
--- a/termwiz/src/cellcluster.rs
+++ b/termwiz/src/cellcluster.rs
@ -21,6 +21,7 @@ impl CellCluster {

        for (cell_idx, c) in iter {
            let cell_str = c.str();
+            let normalized_attr = c.attrs().clone().set_wrapped(false).clone();

            last_cluster = match last_cluster.take() {
                None => {
@ -28,10 +29,10 @@ impl CellCluster {
                    Some(CellCluster::new(c.attrs().clone(), cell_str, cell_idx))
                }
                Some(mut last) => {
-                    if last.attrs != *c.attrs() {
+                    if last.attrs != normalized_attr {
                        // Flush pending cluster and start a new one
                        clusters.push(last);
-                        Some(CellCluster::new(c.attrs().clone(), cell_str, cell_idx))
+                        Some(CellCluster::new(normalized_attr, cell_str, cell_idx))
                    } else {
                        // Add to current cluster
                        last.add(cell_str, cell_idx);
--- a/test-data/emoji-test.txt
+++ b/test-data/emoji-test.txt
--- a/wezterm-font/Cargo.toml
+++ b/wezterm-font/Cargo.toml
@ -20,7 +20,7 @@ mux = { path = "../mux" }
 termwiz = { path = "../termwiz" }
 thiserror = "1.0"
 tinyvec = "1.1" # Note: constrained by the allsorts crate
-unicode-segmentation = "1.6"
+unicode-segmentation = "1.7"
 unicode-general-category = "0.3"
 walkdir = "2"
 wezterm-term = { path = "../term", features=["use_serde"] }
--- a/wezterm-font/src/hbwrap.rs
+++ b/wezterm-font/src/hbwrap.rs
@ -222,6 +222,12 @@ impl Buffer {
        }
    }

+    pub fn set_cluster_level(&mut self, level: hb_buffer_cluster_level_t) {
+        unsafe {
+            hb_buffer_set_cluster_level(self.buf, level);
+        }
+    }
+
    pub fn set_direction(&mut self, direction: hb_direction_t) {
        unsafe {
            hb_buffer_set_direction(self.buf, direction);
--- a/wezterm-font/src/shaper/harfbuzz.rs
+++ b/wezterm-font/src/shaper/harfbuzz.rs
@ -5,17 +5,31 @@ use crate::shaper::{FallbackIdx, FontMetrics, FontShaper, GlyphInfo};
 use crate::units::*;
 use anyhow::anyhow;
 use config::configuration;
-use log::{debug, error};
+use log::error;
 use std::cell::{RefCell, RefMut};
 use termwiz::cell::unicode_column_width;
 use thiserror::Error;
+use unicode_segmentation::UnicodeSegmentation;

-fn make_glyphinfo(
-    text: &str,
-    font_idx: usize,
-    info: &harfbuzz::hb_glyph_info_t,
-    pos: &harfbuzz::hb_glyph_position_t,
-) -> GlyphInfo {
+#[derive(Clone)]
+struct Info<'a> {
+    cluster: usize,
+    len: usize,
+    codepoint: harfbuzz::hb_codepoint_t,
+    pos: &'a harfbuzz::hb_glyph_position_t,
+}
+
+impl<'a> std::fmt::Debug for Info<'a> {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        fmt.debug_struct("Info")
+            .field("cluster", &self.cluster)
+            .field("len", &self.len)
+            .field("codepoint", &self.codepoint)
+            .finish()
+    }
+}
+
+fn make_glyphinfo(text: &str, font_idx: usize, info: &Info) -> GlyphInfo {
    let num_cells = unicode_column_width(text) as u8;
    GlyphInfo {
        #[cfg(debug_assertions)]
@ -23,11 +37,11 @@ fn make_glyphinfo(
        num_cells,
        font_idx,
        glyph_pos: info.codepoint,
-        cluster: info.cluster,
-        x_advance: PixelLength::new(f64::from(pos.x_advance) / 64.0),
-        y_advance: PixelLength::new(f64::from(pos.y_advance) / 64.0),
-        x_offset: PixelLength::new(f64::from(pos.x_offset) / 64.0),
-        y_offset: PixelLength::new(f64::from(pos.y_offset) / 64.0),
+        cluster: info.cluster as u32,
+        x_advance: PixelLength::new(f64::from(info.pos.x_advance) / 64.0),
+        y_advance: PixelLength::new(f64::from(info.pos.y_advance) / 64.0),
+        x_offset: PixelLength::new(f64::from(info.pos.x_offset) / 64.0),
+        y_offset: PixelLength::new(f64::from(info.pos.y_offset) / 64.0),
    }
 }

@ -36,6 +50,8 @@ struct FontPair {
    font: harfbuzz::Font,
    size: f64,
    dpi: u32,
+    cell_width: f64,
+    cell_height: f64,
 }

 pub struct HarfbuzzShaper {
@ -55,7 +71,6 @@ struct NoMoreFallbacksError {
 /// original string.  That isn't perfect, but it should
 /// be good enough to indicate that something isn't right.
 fn make_question_string(s: &str) -> String {
-    use unicode_segmentation::UnicodeSegmentation;
    let len = s.graphemes(true).count();
    let mut result = String::new();
    let c = if !is_question_string(s) {
@ -112,6 +127,8 @@ impl HarfbuzzShaper {
                        font,
                        size: 0.,
                        dpi: 0,
+                        cell_width: 0.,
+                        cell_height: 0.,
                    });
                }

@ -141,16 +158,24 @@ impl HarfbuzzShaper {
        buf.set_direction(harfbuzz::hb_direction_t::HB_DIRECTION_LTR);
        buf.set_language(harfbuzz::language_from_string("en")?);
        buf.add_str(s);
+        buf.set_cluster_level(
+            harfbuzz::hb_buffer_cluster_level_t::HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES,
+        );
+
+        let cell_width;

        {
            match self.load_fallback(font_idx)? {
                #[allow(clippy::float_cmp)]
                Some(mut pair) => {
                    if pair.size != font_size || pair.dpi != dpi {
-                        pair.face.set_font_size(font_size, dpi)?;
+                        let (width, height) = pair.face.set_font_size(font_size, dpi)?;
                        pair.size = font_size;
                        pair.dpi = dpi;
+                        pair.cell_width = width;
+                        pair.cell_height = height;
                    }
+                    cell_width = pair.cell_width;
                    pair.font.shape(&mut buf, Some(features.as_slice()));
                }
                None => {
@ -162,14 +187,11 @@ impl HarfbuzzShaper {
            }
        }

-        let infos = buf.glyph_infos();
+        let hb_infos = buf.glyph_infos();
        let positions = buf.glyph_positions();

        let mut cluster = Vec::new();

-        let mut last_text_pos = None;
-        let mut first_fallback_pos = None;
-
        // Compute the lengths of the text clusters.
        // Ligatures and combining characters mean
        // that a single glyph can take the place of
@ -182,45 +204,61 @@ impl HarfbuzzShaper {
        // the fragments to properly handle fallback,
        // and they're handy to have for debugging
        // purposes too.
-        let mut sizes = Vec::with_capacity(s.len());
-        for (i, info) in infos.iter().enumerate() {
-            let pos = info.cluster as usize;
-            let mut size = 1;
-            if let Some(last_pos) = last_text_pos {
-                let diff = pos - last_pos;
-                if diff > 1 {
-                    sizes[i - 1] = diff;
-                }
-            } else if pos != 0 {
-                size = pos;
-            }
-            last_text_pos = Some(pos);
-            sizes.push(size);
-        }
-        if let Some(last_pos) = last_text_pos {
-            let diff = s.len() - last_pos;
-            if diff > 1 {
-                let last = sizes.len() - 1;
-                sizes[last] = diff;
-            }
-        }
-        //debug!("sizes: {:?}", sizes);
+        let mut info_clusters: Vec<Vec<Info>> = vec![];
+        let mut info_iter = hb_infos.iter().enumerate().peekable();
+        while let Some((i, info)) = info_iter.next() {
+            let next_pos = info_iter
+                .peek()
+                .map(|(_, info)| info.cluster as usize)
+                .unwrap_or(s.len());

-        // Now make a second pass to determine if we need
-        // to perform fallback to a later font.
-        // We can determine this by looking at the codepoint.
-        for (i, info) in infos.iter().enumerate() {
-            let pos = info.cluster as usize;
-            if info.codepoint == 0 {
-                if first_fallback_pos.is_none() {
-                    // Start of a run that needs fallback
-                    first_fallback_pos = Some(pos);
-                }
-            } else if let Some(start_pos) = first_fallback_pos {
-                // End of a fallback run
-                //debug!("range: {:?}-{:?} needs fallback", start, pos);
+            let info = Info {
+                cluster: info.cluster as usize,
+                len: next_pos - info.cluster as usize,
+                codepoint: info.codepoint,
+                pos: &positions[i],
+            };
+
+            if let Some(ref mut cluster) = info_clusters.last_mut() {
+                if cluster.last().unwrap().cluster == info.cluster {
+                    cluster.push(info);
+                    continue;
+                }
+                // Don't fragment runs of unresolve codepoints; they could be a sequence
+                // that shapes together in a fallback font.
+                if info.codepoint == 0 {
+                    let prior = cluster.last_mut().unwrap();
+                    if prior.codepoint == 0 {
+                        prior.len = next_pos - prior.cluster;
+                        continue;
+                    }
+                }
+            }
+            info_clusters.push(vec![info]);
+        }
+        /*
+        if font_idx > 0 {
+            log::error!("do_shape: font_idx={} {:?} {:?}", font_idx, s, info_clusters);
+        }
+        */
+
+        for infos in &info_clusters {
+            let cluster_len: usize = infos.iter().map(|info| info.len).sum();
+            let cluster_start = infos.first().unwrap().cluster;
+            let substr = &s[cluster_start..cluster_start + cluster_len];
+
+            let incomplete = infos.iter().find(|info| info.codepoint == 0).is_some();
+
+            if incomplete {
+                // One or more entries didn't have a corresponding glyph,
+                // so try a fallback
+
+                /*
+                if font_idx == 0 {
+                    log::error!("incomplete cluster for text={:?} {:?}", s, info_clusters);
+                }
+                */

-                let substr = &s[start_pos..pos];
                let mut shape = match self.do_shape(font_idx + 1, substr, font_size, dpi) {
                    Ok(shape) => Ok(shape),
                    Err(e) => {
@ -231,51 +269,49 @@ impl HarfbuzzShaper {

                // Fixup the cluster member to match our current offset
                for mut info in &mut shape {
-                    info.cluster += start_pos as u32;
+                    info.cluster += cluster_start as u32;
                }
                cluster.append(&mut shape);
-
-                first_fallback_pos = None;
+                continue;
            }
-            if info.codepoint != 0 {
-                if s.is_char_boundary(pos) && s.is_char_boundary(pos + sizes[i]) {
-                    let text = &s[pos..pos + sizes[i]];
-                    //debug!("glyph from `{}`", text);
-                    cluster.push(make_glyphinfo(text, font_idx, info, &positions[i]));
+
+            let mut next_idx = 0;
+            for info in infos.iter() {
+                if info.pos.x_advance == 0 {
+                    continue;
+                }
+
+                let nom_width =
+                    ((f64::from(info.pos.x_advance) / 64.0) / cell_width).ceil() as usize;
+
+                let len;
+                if nom_width == 0 || !substr.is_char_boundary(next_idx + nom_width) {
+                    let remainder = &substr[next_idx..];
+                    if let Some(g) = remainder.graphemes(true).next() {
+                        len = g.len();
+                    } else {
+                        len = remainder.len();
+                    }
                } else {
-                    cluster.append(&mut self.do_shape(0, "?", font_size, dpi)?);
+                    len = nom_width;
                }
+
+                let glyph = if len > 0 {
+                    let text = &substr[next_idx..next_idx + len];
+                    make_glyphinfo(text, font_idx, info)
+                } else {
+                    make_glyphinfo("__", font_idx, info)
+                };
+
+                if glyph.x_advance != PixelLength::new(0.0) {
+                    // log::error!("glyph: {:?}, nominal width: {:?}/{:?} = {:?}", glyph, glyph.x_advance, cell_width, nom_width);
+                    cluster.push(glyph);
+                }
+
+                next_idx += len;
            }
        }

-        // Check to see if we started and didn't finish a
-        // fallback run.
-        if let Some(start_pos) = first_fallback_pos {
-            let substr = &s[start_pos..];
-            if false {
-                debug!(
-                    "at end {:?}-{:?} needs fallback {}",
-                    start_pos,
-                    s.len() - 1,
-                    substr,
-                );
-            }
-            let mut shape = match self.do_shape(font_idx + 1, substr, font_size, dpi) {
-                Ok(shape) => Ok(shape),
-                Err(e) => {
-                    error!("{:?} for {:?}", e, substr);
-                    self.do_shape(0, &make_question_string(substr), font_size, dpi)
-                }
-            }?;
-            // Fixup the cluster member to match our current offset
-            for mut info in &mut shape {
-                info.cluster += start_pos as u32;
-            }
-            cluster.append(&mut shape);
-        }
-
-        //debug!("shaped: {:#?}", cluster);
-
        Ok(cluster)
    }
 }
@ -285,6 +321,16 @@ impl FontShaper for HarfbuzzShaper {
        let start = std::time::Instant::now();
        let result = self.do_shape(0, text, size, dpi);
        metrics::value!("shape.harfbuzz", start.elapsed());
+        /*
+        if let Ok(glyphs) = &result {
+            for g in glyphs {
+                if g.font_idx > 0 {
+                    log::error!("{:#?}", result);
+                    break;
+                }
+            }
+        }
+        */
        result
    }

--- a/wezterm-gui/Cargo.toml
+++ b/wezterm-gui/Cargo.toml
@ -57,7 +57,7 @@ textwrap = "0.12"
 thiserror = "1.0"
 umask = { path = "../umask" }
 unicode-normalization = "0.1"
-unicode-segmentation = "1.6"
+unicode-segmentation = "1.7"
 unicode-width = "0.1"
 url = "2"
 walkdir = "2"
--- a/wezterm-gui/src/gui/termwindow.rs
+++ b/wezterm-gui/src/gui/termwindow.rs
@ -2695,10 +2695,31 @@ impl TermWindow {
        let window_is_transparent =
            self.window_background.is_some() || params.config.window_background_opacity != 1.0;

+        let white_space = gl_state.util_sprites.white_space.texture_coords();
+
+        // Pre-set the row with the whitespace glyph.
+        // This is here primarily because clustering/shaping can cause the line updates
+        // to skip setting a quad that is logically obscured by a double-wide glyph.
+        // If eg: scrolling the viewport causes the pair of quads to change from two
+        // individual cells to a single double-wide cell then we might leave the second
+        // one of the pair with the glyph from the prior viewport position.
+        for cell_idx in 0..num_cols {
+            let mut quad =
+                match quads.cell(cell_idx + params.pos.left, params.line_idx + params.pos.top) {
+                    Ok(quad) => quad,
+                    Err(_) => break,
+                };
+
+            quad.set_texture(white_space);
+            quad.set_texture_adjust(0., 0., 0., 0.);
+            quad.set_underline(white_space);
+            quad.set_cursor(white_space);
+        }
+
        // Break the line into clusters of cells with the same attributes
        let cell_clusters = params.line.cluster();
-        let mut last_cell_idx = 0;
-        for cluster in cell_clusters {
+        let mut last_cell_idx = None;
+        for cluster in &cell_clusters {
            let attrs = &cluster.attrs;
            let is_highlited_hyperlink = match (attrs.hyperlink(), &self.current_highlight) {
                (Some(ref this), &Some(ref highlight)) => Arc::ptr_eq(this, highlight),
@ -2788,6 +2809,24 @@ impl TermWindow {

            for info in glyph_info.iter() {
                let cell_idx = cluster.byte_to_cell_idx[info.cluster as usize];
+
+                if last_cell_idx.is_some() && cell_idx <= last_cell_idx.unwrap() {
+                    // This is a tricky case: if we have a cluster such as
+                    // 1F470 1F3FF 200D 2640 (woman with veil: dark skin tone)
+                    // and the font doesn't define a glyph for it, the shaper
+                    // may give us a sequence of three output clusters, each
+                    // comprising: veil, skin tone and female respectively.
+                    // Those all have the same info.cluster which
+                    // means that they all resolve to the same cell_idx.
+                    // In this case, the cluster is logically a single cell,
+                    // and the best presentation is of the veil, so we pick
+                    // that one and ignore the rest of the glyphs that map to
+                    // this same cell.
+                    // Ideally we'd overlay this with a "something is broken"
+                    // glyph in the corner.
+                    continue;
+                }
+
                let glyph = gl_state
                    .glyph_cache
                    .borrow_mut()
@ -2822,7 +2861,8 @@ impl TermWindow {
                        // smaller than the terminal.
                        break;
                    }
-                    last_cell_idx = cell_idx;
+
+                    last_cell_idx.replace(cell_idx);

                    let ComputeCellFgBgResult {
                        fg_color: glyph_color,
@ -2898,7 +2938,7 @@ impl TermWindow {
                        quad.set_bg_color(bg_color);
                        quad.set_texture(texture_rect);
                        quad.set_texture_adjust(0., 0., 0., 0.);
-                        quad.set_underline(gl_state.util_sprites.white_space.texture_coords());
+                        quad.set_underline(white_space);
                        quad.set_has_color(true);
                        quad.set_cursor(
                            gl_state
@ -2965,9 +3005,7 @@ impl TermWindow {
        // the right pane with its prior contents instead of showing the
        // cleared lines from the shell in the main screen.

-        let white_space = gl_state.util_sprites.white_space.texture_coords();
-
-        for cell_idx in last_cell_idx + 1..num_cols {
+        for cell_idx in last_cell_idx.unwrap_or(0) + 1..num_cols {
            // Even though we don't have a cell for these, they still
            // hold the cursor or the selection so we need to compute
            // the colors in the usual way.