1
1
mirror of https://github.com/wez/wezterm.git synced 2024-12-22 12:51:31 +03:00

Switch to finl_unicode for grapheme clustering

According to its benchmarks, it's almost 2x faster than
unicode_segmentation.  It doesn't appear to make a visible
difference to `time cat bigfile`, but I'll take anything
that gives more headroom for so little switching effort.
This commit is contained in:
Wez Furlong 2022-09-09 16:38:47 -07:00
parent a0c2df2d86
commit 96c4e7e9b9
19 changed files with 53 additions and 48 deletions

16
Cargo.lock generated
View File

@ -1392,6 +1392,12 @@ dependencies = [
"windows-sys",
]
[[package]]
name = "finl_unicode"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69152938fc3cf544c50065ed78d321d0c5bf3433344f227eb2323bcf78370d34"
[[package]]
name = "fixedbitset"
version = "0.4.2"
@ -2628,6 +2634,7 @@ dependencies = [
"crossbeam",
"downcast-rs",
"filedescriptor",
"finl_unicode",
"flume",
"hostname",
"k9",
@ -2654,7 +2661,6 @@ dependencies = [
"termwiz-funcs",
"textwrap 0.15.0",
"thiserror",
"unicode-segmentation",
"url",
"wezterm-dynamic",
"wezterm-ssh",
@ -4650,6 +4656,7 @@ dependencies = [
"criterion",
"env_logger",
"filedescriptor",
"finl_unicode",
"fixedbitset",
"fnv",
"hex",
@ -4691,11 +4698,11 @@ version = "0.1.0"
dependencies = [
"anyhow",
"config",
"finl_unicode",
"lazy_static",
"luahelper",
"terminfo",
"termwiz",
"unicode-segmentation",
"wezterm-dynamic",
]
@ -5488,6 +5495,7 @@ dependencies = [
"enum-display-derive",
"env_logger",
"euclid",
"finl_unicode",
"fontconfig",
"freetype",
"harfbuzz",
@ -5500,7 +5508,6 @@ dependencies = [
"rangeset",
"termwiz",
"thiserror",
"unicode-segmentation",
"walkdir",
"wezterm-bidi",
"wezterm-color-types",
@ -5533,6 +5540,7 @@ dependencies = [
"euclid",
"fastrand",
"filedescriptor",
"finl_unicode",
"fnv",
"frecency",
"futures",
@ -5701,6 +5709,7 @@ dependencies = [
"bitflags",
"csscolorparser",
"env_logger",
"finl_unicode",
"hex",
"image",
"k9",
@ -5714,7 +5723,6 @@ dependencies = [
"terminfo",
"termwiz",
"unicode-normalization",
"unicode-segmentation",
"url",
"wezterm-bidi",
"wezterm-dynamic",

View File

@ -8,9 +8,9 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
config = { path = "../../config" }
finl_unicode = "1.0.2"
terminfo = "0.7"
wezterm-dynamic = { path = "../../wezterm-dynamic" }
luahelper = { path = "../../luahelper" }
termwiz = { path = "../../termwiz", features=["use_serde"] }
unicode-segmentation = "1.8"
lazy_static = "1.4"

View File

@ -1,5 +1,6 @@
use config::lua::get_or_create_module;
use config::lua::mlua::{self, Lua, ToLua};
use finl_unicode::grapheme_clusters::Graphemes;
use luahelper::impl_lua_conversion_dynamic;
use std::str::FromStr;
use termwiz::caps::{Capabilities, ColorLevel, ProbeHints};
@ -8,7 +9,6 @@ use termwiz::color::{AnsiColor, ColorAttribute, ColorSpec, SrgbaTuple};
use termwiz::input::Modifiers;
use termwiz::render::terminfo::TerminfoRenderer;
use termwiz::surface::change::Change;
use unicode_segmentation::UnicodeSegmentation;
use wezterm_dynamic::{FromDynamic, ToDynamic};
pub fn register(lua: &Lua) -> anyhow::Result<()> {
@ -170,7 +170,8 @@ pub fn pad_left(mut result: String, width: usize) -> String {
pub fn truncate_left(s: &str, max_width: usize) -> String {
let mut result = vec![];
let mut len = 0;
for g in s.graphemes(true).rev() {
let graphemes: Vec<_> = Graphemes::new(s).collect();
for &g in graphemes.iter().rev() {
let g_len = grapheme_column_width(g, None);
if g_len + len > max_width {
break;
@ -186,7 +187,7 @@ pub fn truncate_left(s: &str, max_width: usize) -> String {
pub fn truncate_right(s: &str, max_width: usize) -> String {
let mut result = String::new();
let mut len = 0;
for g in s.graphemes(true) {
for g in Graphemes::new(s) {
let g_len = grapheme_column_width(g, None);
if g_len + len > max_width {
break;

View File

@ -17,6 +17,7 @@ config = { path = "../config" }
crossbeam = "0.8"
downcast-rs = "1.0"
filedescriptor = { version="0.8", path = "../filedescriptor" }
finl_unicode = "1.0.2"
hostname = "0.3"
lazy_static = "1.4"
libc = "0.2"
@ -40,7 +41,6 @@ termwiz = { path = "../termwiz" }
termwiz-funcs = { path = "../lua-api-crates/termwiz-funcs" }
textwrap = "0.15"
thiserror = "1.0"
unicode-segmentation = "1.8"
url = "2"
wezterm-ssh = { path = "../wezterm-ssh" }
wezterm-dynamic = { path = "../wezterm-dynamic" }

View File

@ -1,6 +1,7 @@
use crate::termwiztermtab;
use anyhow::{anyhow, bail, Context as _};
use crossbeam::channel::{unbounded, Receiver, Sender};
use finl_unicode::grapheme_clusters::Graphemes;
use promise::spawn::block_on;
use promise::Promise;
use std::sync::Mutex;
@ -9,7 +10,6 @@ use termwiz::cell::{unicode_column_width, CellAttributes};
use termwiz::lineedit::*;
use termwiz::surface::{Change, Position};
use termwiz::terminal::*;
use unicode_segmentation::UnicodeSegmentation;
use wezterm_term::TerminalSize;
#[derive(Default)]
@ -146,7 +146,7 @@ impl ConnectionUIImpl {
let mut reversed_string = String::new();
let mut default_string = String::new();
let mut col = 0;
for grapheme in message.graphemes(true) {
for grapheme in Graphemes::new(&message) {
// Once we've passed the elapsed column, fill up the string
// that we'll render with default attributes instead.
if col > prog_width {

View File

@ -18,6 +18,7 @@ anyhow = "1.0"
bitflags = "1.3"
csscolorparser = "0.6"
miniz_oxide = "0.4"
finl_unicode = "1.0.2"
hex = "0.4"
image = "0.24"
lazy_static = "1.4"
@ -28,7 +29,6 @@ ordered-float = "3.0"
serde = {version="1.0", features = ["rc"]}
terminfo = "0.7"
unicode-normalization = "0.1.21"
unicode-segmentation = "1.8"
url = "2"
wezterm-bidi = { path = "../bidi" }
wezterm-dynamic = { path = "../wezterm-dynamic" }
@ -36,7 +36,6 @@ wezterm-dynamic = { path = "../wezterm-dynamic" }
[dev-dependencies]
env_logger = "0.9"
k9 = "0.11.0"
unicode-normalization = "0.1"
[dependencies.termwiz]
version = "0.17"

View File

@ -3,6 +3,7 @@ use crate::terminalstate::{
default_color_map, CharSet, MouseEncoding, TabStop, UnicodeVersionStackEntry,
};
use crate::{ClipboardSelection, Position, TerminalState, VisibleRowIndex, DCS, ST};
use finl_unicode::grapheme_clusters::Graphemes;
use log::{debug, error};
use num_traits::FromPrimitive;
use ordered_float::NotNan;
@ -126,7 +127,7 @@ impl<'a> Performer<'a> {
p.as_str()
};
for g in unicode_segmentation::UnicodeSegmentation::graphemes(text, true) {
for g in Graphemes::new(text) {
let g = self.remap_grapheme(g);
let print_width = grapheme_column_width(g, Some(self.unicode_version));

View File

@ -1177,8 +1177,8 @@ fn test_1573() {
let recomposed: String = sequence.nfc().collect();
assert_eq!(recomposed, "\u{d55c}");
use unicode_segmentation::UnicodeSegmentation;
let graphemes: Vec<_> = sequence.graphemes(true).collect();
use finl_unicode::grapheme_clusters::Graphemes;
let graphemes: Vec<_> = Graphemes::new(sequence).collect();
assert_eq!(graphemes, vec![sequence]);
}

View File

@ -18,6 +18,7 @@ cassowary = {version="0.3", optional=true}
cfg-if = "1.0"
anyhow = "1.0"
filedescriptor = { version="0.8", path = "../filedescriptor" }
finl_unicode = "1.0.2"
fixedbitset = "0.4"
fnv = {version="1.0", optional=true}
hex = "0.4"

View File

@ -5,6 +5,7 @@ use crate::emoji_variation::WCWIDTH_TABLE;
pub use crate::escape::osc::Hyperlink;
use crate::image::ImageCell;
use crate::widechar_width::WcWidth;
use finl_unicode::grapheme_clusters::Graphemes;
#[cfg(feature = "use_serde")]
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::hash::{Hash, Hasher};
@ -922,8 +923,7 @@ pub const LATEST_UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
/// Calls through to `grapheme_column_width` for each grapheme
/// and sums up the length.
pub fn unicode_column_width(s: &str, version: Option<UnicodeVersion>) -> usize {
use unicode_segmentation::UnicodeSegmentation;
s.graphemes(true)
Graphemes::new(s)
.map(|g| grapheme_column_width(g, version))
.sum()
}
@ -1023,7 +1023,6 @@ pub enum AttributeChange {
#[cfg(test)]
mod test {
use super::*;
use unicode_segmentation::UnicodeSegmentation;
#[test]
fn teeny_string() {
@ -1135,7 +1134,7 @@ mod test {
let x_ideographic_space_x = "x\u{3000}x";
assert_eq!(unicode_column_width(x_ideographic_space_x, None), 4);
assert_eq!(
x_ideographic_space_x.graphemes(true).collect::<Vec<_>>(),
Graphemes::new(x_ideographic_space_x).collect::<Vec<_>>(),
vec!["x".to_string(), "\u{3000}".to_string(), "x".to_string()],
);
@ -1155,21 +1154,17 @@ mod test {
assert_eq!(unicode_column_width(victory_hand, None), 1);
assert_eq!(
victory_hand_text_presentation
.graphemes(true)
.collect::<Vec<_>>(),
Graphemes::new(victory_hand_text_presentation).collect::<Vec<_>>(),
vec![victory_hand_text_presentation.to_string()]
);
assert_eq!(
victory_hand.graphemes(true).collect::<Vec<_>>(),
Graphemes::new(victory_hand).collect::<Vec<_>>(),
vec![victory_hand.to_string()]
);
let copyright_emoji_presentation = "\u{00A9}\u{FE0F}";
assert_eq!(
copyright_emoji_presentation
.graphemes(true)
.collect::<Vec<_>>(),
Graphemes::new(copyright_emoji_presentation).collect::<Vec<_>>(),
vec![copyright_emoji_presentation.to_string()]
);
assert_eq!(unicode_column_width(copyright_emoji_presentation, None), 2);
@ -1180,9 +1175,7 @@ mod test {
let copyright_text_presentation = "\u{00A9}";
assert_eq!(
copyright_text_presentation
.graphemes(true)
.collect::<Vec<_>>(),
Graphemes::new(copyright_text_presentation).collect::<Vec<_>>(),
vec![copyright_text_presentation.to_string()]
);
assert_eq!(unicode_column_width(copyright_text_presentation, None), 1);
@ -1202,11 +1195,11 @@ mod test {
assert_eq!(unicode_column_width(raised_fist_text, None), 2);
assert_eq!(
raised_fist_text.graphemes(true).collect::<Vec<_>>(),
Graphemes::new(raised_fist_text).collect::<Vec<_>>(),
vec![raised_fist_text.to_string()]
);
assert_eq!(
raised_fist.graphemes(true).collect::<Vec<_>>(),
Graphemes::new(raised_fist).collect::<Vec<_>>(),
vec![raised_fist.to_string()]
);
}

View File

@ -2,10 +2,10 @@ use crate::cell::{unicode_column_width, AttributeChange, CellAttributes};
use crate::color::ColorAttribute;
pub use crate::image::{ImageData, TextureCoordinate};
use crate::surface::{CursorShape, CursorVisibility, Position};
use finl_unicode::grapheme_clusters::Graphemes;
#[cfg(feature = "use_serde")]
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use unicode_segmentation::UnicodeSegmentation;
/// `Change` describes an update operation to be applied to a `Surface`.
/// Changes to the active attributes (color, style), moving the cursor
@ -190,7 +190,7 @@ impl ChangeSequence {
| Change::Title(_)
| Change::ClearToEndOfScreen(_) => {}
Change::Text(t) => {
for g in t.as_str().graphemes(true) {
for g in Graphemes::new(t.as_str()) {
if self.cursor_x == self.screen_cols {
self.cursor_y += 1;
self.cursor_x = 0;

View File

@ -1,11 +1,11 @@
use crate::cell::{Cell, CellAttributes};
use crate::surface::line::CellRef;
use finl_unicode::grapheme_clusters::Graphemes;
use fixedbitset::FixedBitSet;
#[cfg(feature = "use_serde")]
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::convert::TryInto;
use std::num::NonZeroU8;
use unicode_segmentation::UnicodeSegmentation;
#[cfg_attr(feature = "use_serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq)]
@ -166,7 +166,7 @@ impl ClusteredLine {
let mut clusters = self.clusters.iter();
let cluster = clusters.next();
ClusterLineCellIter {
graphemes: self.text.graphemes(true),
graphemes: Graphemes::new(&self.text),
clusters,
cluster,
idx: 0,
@ -307,7 +307,7 @@ impl ClusteredLine {
}
pub(crate) struct ClusterLineCellIter<'a> {
graphemes: unicode_segmentation::Graphemes<'a>,
graphemes: Graphemes<'a>,
clusters: std::slice::Iter<'a, Cluster>,
cluster: Option<&'a Cluster>,
idx: usize,

View File

@ -7,6 +7,7 @@ use crate::surface::line::linebits::LineBits;
use crate::surface::line::storage::{CellStorage, VisibleCellIter};
use crate::surface::line::vecstorage::{VecStorage, VecStorageIter};
use crate::surface::{Change, SequenceNo, SEQ_ZERO};
use finl_unicode::grapheme_clusters::Graphemes;
#[cfg(feature = "use_serde")]
use serde::{Deserialize, Serialize};
use siphasher::sip128::{Hasher128, SipHasher};
@ -15,7 +16,6 @@ use std::borrow::Cow;
use std::hash::Hash;
use std::ops::Range;
use std::sync::{Arc, Mutex, Weak};
use unicode_segmentation::UnicodeSegmentation;
use wezterm_bidi::{Direction, ParagraphDirectionHint};
#[cfg_attr(feature = "use_serde", derive(Serialize, Deserialize))]
@ -137,7 +137,7 @@ impl Line {
) -> Line {
let mut cells = Vec::new();
for sub in s.graphemes(true) {
for sub in Graphemes::new(s) {
let cell = Cell::new_grapheme(sub, attrs.clone(), unicode_version);
let width = cell.width();
cells.push(cell);
@ -850,7 +850,7 @@ impl Line {
attr: CellAttributes,
seqno: SequenceNo,
) {
for (i, c) in text.graphemes(true).enumerate() {
for (i, c) in Graphemes::new(text).enumerate() {
let cell = Cell::new_grapheme(c, attr.clone(), None);
let width = cell.width();
self.set_cell(i + start_idx, cell, seqno);

View File

@ -2,12 +2,12 @@ use crate::cell::{AttributeChange, Cell, CellAttributes};
use crate::color::ColorAttribute;
use crate::image::ImageCell;
use crate::surface::line::CellRef;
use finl_unicode::grapheme_clusters::Graphemes;
use ordered_float::NotNan;
#[cfg(feature = "use_serde")]
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::cmp::min;
use unicode_segmentation::UnicodeSegmentation;
use wezterm_dynamic::{FromDynamic, ToDynamic};
pub mod change;
@ -407,7 +407,7 @@ impl Surface {
}
fn print_text(&mut self, text: &str) {
for g in UnicodeSegmentation::graphemes(text, true) {
for g in Graphemes::new(text) {
if g == "\r\n" {
self.xpos = 0;
let new_y = self.ypos + 1;

View File

@ -19,6 +19,7 @@ config = { path = "../config" }
encoding_rs = "0.8"
enum-display-derive = "0.1"
euclid = "0.22"
finl_unicode = "1.0.2"
freetype = { path = "../deps/freetype" }
harfbuzz = { path = "../deps/harfbuzz" }
lazy_static = "1.4"
@ -29,7 +30,6 @@ ordered-float = "3.0"
rangeset = { path = "../rangeset" }
termwiz = { path = "../termwiz" }
thiserror = "1.0"
unicode-segmentation = "1.8"
walkdir = "2"
wezterm-color-types = { path = "../color-types" }
wezterm-input-types = { path = "../wezterm-input-types" }

View File

@ -4,13 +4,13 @@ use crate::units::*;
use crate::{ftwrap, hbwrap as harfbuzz};
use anyhow::{anyhow, Context};
use config::ConfigHandle;
use finl_unicode::grapheme_clusters::Graphemes;
use log::error;
use ordered_float::NotNan;
use std::cell::{RefCell, RefMut};
use std::collections::HashMap;
use std::ops::Range;
use termwiz::cell::{unicode_column_width, Presentation};
use unicode_segmentation::UnicodeSegmentation;
use wezterm_bidi::Direction;
// Changing these will switch to using harfbuzz's opentype functions.
@ -80,7 +80,7 @@ pub struct HarfbuzzShaper {
/// original string. That isn't perfect, but it should
/// be good enough to indicate that something isn't right.
fn make_question_string(s: &str) -> String {
let len = s.graphemes(true).count();
let len = Graphemes::new(s).count();
let mut result = String::new();
let c = if !is_question_string(s) {
std::char::REPLACEMENT_CHARACTER

View File

@ -48,6 +48,7 @@ env-bootstrap = { path = "../env-bootstrap" }
euclid = "0.22"
fastrand = "1.6"
filedescriptor = { version="0.8", path = "../filedescriptor" }
finl_unicode = "1.0.2"
fnv = "1.0"
frecency = { path = "../frecency" }
futures = "0.3"

View File

@ -1,5 +1,6 @@
use crate::termwindow::{PaneInformation, TabInformation, UIItem, UIItemType};
use config::{ConfigHandle, TabBarColors};
use finl_unicode::grapheme_clusters::Graphemes;
use mlua::FromLua;
use termwiz::cell::{unicode_column_width, Cell, CellAttributes};
use termwiz::color::ColorSpec;
@ -401,7 +402,7 @@ pub fn parse_status_text(text: &str, default_cell: CellAttributes) -> Line {
let mut print_buffer = String::new();
fn flush_print(buf: &mut String, cells: &mut Vec<Cell>, pen: &CellAttributes) {
for g in unicode_segmentation::UnicodeSegmentation::graphemes(buf.as_str(), true) {
for g in Graphemes::new(buf.as_str()) {
let cell = Cell::new_grapheme(g, pen.clone(), None);
let width = cell.width();
cells.push(cell);

View File

@ -11,11 +11,11 @@ use crate::utilsprites::RenderMetrics;
use ::window::{RectF, WindowOps};
use anyhow::anyhow;
use config::{Dimension, DimensionContext};
use finl_unicode::grapheme_clusters::Graphemes;
use std::cell::RefCell;
use std::rc::Rc;
use termwiz::cell::{grapheme_column_width, Presentation};
use termwiz::surface::Line;
use unicode_segmentation::UnicodeSegmentation;
use wezterm_font::units::PixelUnit;
use wezterm_font::LoadedFont;
use wezterm_term::color::{ColorAttribute, ColorPalette};
@ -583,7 +583,7 @@ impl super::TermWindow {
for info in infos {
let cell_start = &s[info.cluster as usize..];
let mut iter = cell_start.graphemes(true).peekable();
let mut iter = Graphemes::new(cell_start).peekable();
let grapheme = iter
.next()
.ok_or_else(|| anyhow!("info.cluster didn't map into string"))?;