improve emoji width calculation

I noticed while scrolling `emoji-test.txt` that some of the combined emoji sequences rendered very poorly. This was due to the unicode width being reported as up to 4 in some cases. Digging into it, I discovered that the unicode width crate uses a standard calculation that doesn't take emoji combination sequences into account (see https://github.com/unicode-rs/unicode-width/issues/4). This commit takes a dep on the xi-unicode crate as a lightweight way to gain access to emoji tables and test whether a given grapheme is part of a combining sequence of emoji.
2024-12-23 21:32:13 +03:00 · 2019-11-05 08:45:08 -08:00 · 2019-11-05 08:45:08 -08:00 · 1ab438c1e2
commit 1ab438c1e2
parent a9bdca6d84
6 changed files with 95 additions and 9 deletions
--- a/src/font/system.rs
+++ b/src/font/system.rs
@ -46,8 +46,8 @@ impl GlyphInfo {
        info: &harfbuzz::hb_glyph_info_t,
        pos: &harfbuzz::hb_glyph_position_t,
    ) -> GlyphInfo {
-        use unicode_width::UnicodeWidthStr;
-        let num_cells = UnicodeWidthStr::width(text) as u8;
+        use termwiz::cell::unicode_column_width;
+        let num_cells = unicode_column_width(text) as u8;
        GlyphInfo {
            #[cfg(debug_assertions)]
            text: text.into(),
--- a/term/src/terminalstate.rs
+++ b/term/src/terminalstate.rs
@ -17,7 +17,6 @@ use termwiz::escape::osc::{ChangeColorPair, ColorOrQuery, ITermFileData, ITermPr
 use termwiz::escape::{Action, ControlCode, Esc, EscCode, OneBased, OperatingSystemCommand, CSI};
 use termwiz::hyperlink::Rule as HyperlinkRule;
 use termwiz::image::{ImageCell, ImageData, TextureCoordinate};
-use unicode_width::UnicodeWidthStr;

 struct TabStop {
    tabs: Vec<bool>,
@ -1988,7 +1987,7 @@ impl<'a> Performer<'a> {
            // they occupy a cell so that we can re-emit them when we output them.
            // If we didn't do this, then we'd effectively filter them out from
            // the model, which seems like a lossy design choice.
-            let print_width = UnicodeWidthStr::width(g).max(1);
+            let print_width = unicode_column_width(g).max(1);

            if !self.insert && x + print_width >= width {
                pen.set_wrapped(true);
--- a/term/src/test/mod.rs
+++ b/term/src/test/mod.rs
@ -318,7 +318,14 @@ fn assert_lines_equal(lines: &[Line], expect_lines: &[Line], compare: Compare) {
        if compare.contains(Compare::TEXT) {
            let line_str = line.as_str();
            let expect_str = expect.as_str();
-            assert_eq!(line_str, expect_str, "line {} text didn't match", idx,);
+            assert_eq!(
+                line_str,
+                expect_str,
+                "line {} text didn't match '{}' vs '{}'",
+                idx,
+                line_str.escape_default(),
+                expect_str.escape_default()
+            );
        }
    }

@ -543,6 +550,26 @@ fn test_scroll_margins() {
    assert_all_contents(&term, &["1", "2", "3", "W", " ", "a"]);
 }

+#[test]
+fn test_emoji_with_modifier() {
+    let waving_hand = "\u{1f44b}";
+    let waving_hand_dark_tone = "\u{1f44b}\u{1f3ff}";
+
+    let mut term = TestTerm::new(3, 5, 0);
+    term.print(waving_hand);
+    term.print("\r\n");
+    term.print(waving_hand_dark_tone);
+
+    assert_all_contents(
+        &term,
+        &[
+            &format!("{}   ", waving_hand),
+            &format!("{}   ", waving_hand_dark_tone),
+            "     ",
+        ],
+    );
+}
+
 #[test]
 fn test_hyperlinks() {
    let mut term = TestTerm::new(3, 5, 0);
--- a/termwiz/Cargo.toml
+++ b/termwiz/Cargo.toml
@ -34,6 +34,7 @@ smallvec = "0.6"
 terminfo = "0.6"
 unicode-segmentation = "1.5"
 unicode-width = "0.1"
+xi-unicode = "0.2"
 vtparse = { version="0.1", path="../vtparse" }

 [dev-dependencies]
--- a/termwiz/src/cell.rs
+++ b/termwiz/src/cell.rs
@ -288,7 +288,7 @@ impl Cell {

    /// Returns the number of cells visually occupied by this grapheme
    pub fn width(&self) -> usize {
-        UnicodeWidthStr::width(self.str())
+        grapheme_column_width(self.str())
    }

    /// Returns the attributes of the cell
@ -301,6 +301,31 @@ impl Cell {
    }
 }

+/// Returns the number of cells visually occupied by a sequence
+/// of graphemes
+pub fn unicode_column_width(s: &str) -> usize {
+    use unicode_segmentation::UnicodeSegmentation;
+    s.graphemes(true).map(grapheme_column_width).sum()
+}
+
+/// Returns the number of cells visually occupied by a grapheme.
+/// The input string must be a single grapheme.
+pub fn grapheme_column_width(s: &str) -> usize {
+    // Due to this issue:
+    // https://github.com/unicode-rs/unicode-width/issues/4
+    // we cannot simply use the unicode-width crate to compute
+    // the desired value.
+    // Let's check for emoji-ness for ourselves first
+    use xi_unicode::EmojiExt;
+    for c in s.chars() {
+        if c.is_emoji_modifier_base() || c.is_emoji_modifier() {
+            // treat modifier sequences as double wide
+            return 2;
+        }
+    }
+    UnicodeWidthStr::width(s)
+}
+
 /// Models a change in the attributes of a cell in a stream of changes.
 /// Each variant specifies one of the possible attributes; the corresponding
 /// value holds the new value to be used for that attribute.
@ -334,4 +359,38 @@ mod test {
            assert_eq!(cell.str(), " ");
        }
    }
+
+    #[test]
+    fn test_width() {
+        let foot = "\u{1f9b6}";
+        eprintln!("foot chars");
+        for c in foot.chars() {
+            eprintln!("char: {:?}", c);
+            use xi_unicode::EmojiExt;
+            eprintln!("xi emoji: {}", c.is_emoji());
+            eprintln!("xi emoji_mod: {}", c.is_emoji_modifier());
+            eprintln!("xi emoji_mod_base: {}", c.is_emoji_modifier_base());
+        }
+        assert_eq!(unicode_column_width(foot), 2, "{} should be 2", foot);
+
+        let women_holding_hands_dark_skin_tone_medium_light_skin_tone =
+            "\u{1F469}\u{1F3FF}\u{200D}\u{1F91D}\u{200D}\u{1F469}\u{1F3FC}";
+
+        // Ensure that we can hold this longer grapheme sequence in the cell
+        // and correctly return its string contents!
+        let cell = Cell::new_grapheme(
+            women_holding_hands_dark_skin_tone_medium_light_skin_tone,
+            CellAttributes::default(),
+        );
+        assert_eq!(
+            cell.str(),
+            women_holding_hands_dark_skin_tone_medium_light_skin_tone
+        );
+        assert_eq!(
+            cell.width(),
+            2,
+            "width of {} should be 2",
+            women_holding_hands_dark_skin_tone_medium_light_skin_tone
+        );
+    }
 }
--- a/termwiz/src/lineedit/mod.rs
+++ b/termwiz/src/lineedit/mod.rs
@ -36,12 +36,12 @@
 //! Alt-b, Alt-Left | Move the cursor backwards one word
 //! Alt-f, Alt-Right | Move the cursor forwards one word
 use crate::caps::{Capabilities, ProbeHintsBuilder};
+use crate::cell::unicode_column_width;
 use crate::input::{InputEvent, KeyCode, KeyEvent, Modifiers};
 use crate::surface::{Change, Position};
 use crate::terminal::{new_terminal, Terminal};
 use failure::{err_msg, Fallible};
 use unicode_segmentation::GraphemeCursor;
-use unicode_width::UnicodeWidthStr;

 mod actions;
 mod history;
@ -157,7 +157,7 @@ impl<T: Terminal> LineEditor<T> {
        let mut prompt_width = 0;
        for ele in host.render_prompt(&self.prompt) {
            if let OutputElement::Text(ref t) = ele {
-                prompt_width += UnicodeWidthStr::width(t.as_str());
+                prompt_width += unicode_column_width(t.as_str());
            }
            changes.push(ele.into());
        }
@ -174,7 +174,7 @@ impl<T: Terminal> LineEditor<T> {
        // It might feel more right to count the number of graphemes in
        // the string, but this doesn't render correctly for glyphs that
        // are double-width.  Nothing about unicode is easy :-/
-        let grapheme_count = UnicodeWidthStr::width(&self.line[0..self.cursor]);
+        let grapheme_count = unicode_column_width(&self.line[0..self.cursor]);
        changes.push(Change::CursorPosition {
            x: Position::Absolute(prompt_width + grapheme_count),
            y: Position::NoChange,