Add a script to generate wcwidth as well

The generated function is more efficient than the implementation from wcwidth9, and it is easy to regenerate whenever the Unicode standard changes.
2024-08-17 10:30:25 +03:00 · 2017-12-20 16:06:58 +05:30 · 2017-12-20 16:06:58 +05:30 · d1282b9f55
commit d1282b9f55
parent 11ee317884
8 changed files with 2653 additions and 1405 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,4 +1,4 @@
-kitty/wcwidth9.h linguist-generated=true
+kitty/wcwidth-std.h linguist-generated=true
 kitty/emoji.h linguist-generated=true
 kitty/keys.h linguist-generated=true
 kitty/charsets.c linguist-generated=true
--- a/2
+++ b/2
@ -1,2 +1,2 @@
 #!/bin/bash
-cloc --exclude-list-file <(echo -e 'kitty/wcwidth9.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty
+cloc --exclude-list-file <(echo -e 'kitty/wcwidth-std.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty
--- a/gen-emoji.py
+++ b/gen-emoji.py
@ -1,75 +0,0 @@
-#!/usr/bin/env python3
-# vim:fileencoding=utf-8
-# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
-
-import os
-from collections import defaultdict
-from functools import partial
-from itertools import groupby
-from operator import itemgetter
-from urllib.request import urlopen
-
-os.chdir(os.path.dirname(os.path.abspath(__file__)))
-
-raw = urlopen('http://unicode.org/Public/emoji/5.0/emoji-data.txt').read().decode('utf-8')
-seen = set()
-cmap = defaultdict(set)
-for line in raw.splitlines():
-    line = line.strip()
-    if not line or line.startswith('#'):
-        continue
-    spec, rest = line.partition(';')[::2]
-    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
-    if '.' in spec:
-        spec = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
-        spec = set(range(spec[0], spec[1] + 1))
-    else:
-        spec = {int(spec, 16)}
-    cmap[rest] |= spec
-    seen |= spec
-items = list(seen)
-
-
-def get_ranges(items):
-    items.sort()
-    for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
-        group = tuple(map(itemgetter(1), g))
-        a, b = group[0], group[-1]
-        if a == b:
-            yield a
-        else:
-            yield a, b
-
-
-def write_case(spec, p):
-    if isinstance(spec, tuple):
-        p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
-    else:
-        p('\t\tcase 0x{:x}:'.format(spec))
-
-
-with open('kitty/emoji.h', 'w') as f:
-    p = partial(print, file=f)
-    p('#pragma once')
-    p('#include "data-types.h"\n')
-    p('START_ALLOW_CASE_RANGE')
-    p('static inline bool is_emoji(uint32_t code) {')
-    p('\tswitch(code) {')
-    for spec in get_ranges(items):
-        last = spec[1] if isinstance(spec, tuple) else spec
-        if last < 0x231a:
-            continue
-        write_case(spec, p)
-        p('\t\t\treturn true;')
-    p('\t\tdefault: return false;')
-    p('\t}')
-    p('\treturn false; \n}')
-    p('static inline bool is_emoji_modifier(uint32_t code) {')
-    p('\tswitch(code) {')
-    for spec in get_ranges(list(cmap['Emoji_Modifier'])):
-        write_case(spec, p)
-        p('\t\t\treturn true;')
-    p('\t\tdefault: return false;')
-    p('\t}')
-    p('\treturn false; \n}')
-    p('END_ALLOW_CASE_RANGE')
--- a/gen-wcwidth.py
+++ b/gen-wcwidth.py
@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
+
+import os
+import sys
+from contextlib import contextmanager
+from datetime import date
+from functools import partial
+from itertools import groupby
+from operator import itemgetter
+from urllib.request import urlopen
+
+# We ignore the first few emojis as they are widely assumed to be single width
+# in legacy applications
+# parse_emoji() skips every emoji-data entry whose highest codepoint is below
+# this value.
+FIRST_EMOJI = 0x2194
+# Work from the directory containing this script so that the relative output
+# paths used below (e.g. 'kitty/emoji.h') resolve against it.
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
+
+def get_data(fname, folder='UCD'):
+    """Yield the meaningful lines of a data file published on unicode.org.
+
+    The file is fetched from
+    https://www.unicode.org/Public/<folder>/latest/<fname> the first time it
+    is needed and cached in /tmp under its base name; later calls read the
+    cached copy instead of the network.
+
+    Each yielded line is stripped of surrounding whitespace. Blank lines and
+    lines starting with '#' are skipped.
+    """
+    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
+    bn = os.path.basename(url)
+    local = os.path.join('/tmp', bn)
+    if os.path.exists(local):
+        with open(local, 'rb') as f:
+            data = f.read()
+    else:
+        # Close the HTTP response and the cache file deterministically rather
+        # than leaving them to the garbage collector
+        with urlopen(url) as response:
+            data = response.read()
+        with open(local, 'wb') as f:
+            f.write(data)
+    for line in data.decode('utf-8').splitlines():
+        line = line.strip()
+        if line and not line.startswith('#'):
+            yield line
+
+
+# Map of class names to set of codepoints in class
+class_maps = {}
+# Codepoints whose combining class in UnicodeData.txt is non-empty and not '0'
+combining_codepoints = set()
+# Starts as every codepoint; parse_ucd() removes each one it finds in
+# UnicodeData.txt. range() excludes its end, so + 1 is needed to cover
+# sys.maxunicode itself.
+not_assigned = set(range(0, sys.maxunicode + 1))
+
+
+def parse_ucd():
+    """Load UnicodeData.txt into the module-level codepoint tables.
+
+    Every codepoint covered by the file is added to class_maps under its
+    category (field 2), removed from not_assigned, and added to
+    combining_codepoints when its combining class (field 3) is non-empty and
+    not '0'.
+
+    A line whose name (field 1) ends in ', First>' opens a range; the line
+    that follows closes it, and every codepoint in between is recorded using
+    the fields of that closing line.
+    """
+    range_start = None
+    for record in get_data('ucd/UnicodeData.txt'):
+        fields = [field.strip() for field in record.split(';')]
+        value = int(fields[0], 16)
+        members = class_maps.setdefault(fields[2], set())
+        name = fields[1]
+        if range_start is not None:
+            # This line closes the range opened by the previous line
+            covered = range(range_start, value + 1)
+            range_start = None
+        elif name.endswith(', First>'):
+            # Remember the start; the next line supplies the end of the range
+            range_start = value
+            continue
+        else:
+            covered = (value,)
+        combining_class = fields[3]
+        is_combining = bool(combining_class) and combining_class != '0'
+        for cp in covered:
+            members.add(cp)
+            not_assigned.discard(cp)
+            if is_combining:
+                combining_codepoints.add(cp)
+
+
+def split_two(line):
+    """Parse a 'codepoints ; property ...' data line.
+
+    The part before the first ';' is either one hexadecimal codepoint or an
+    inclusive 'first..last' range. The part after it is cut at the first
+    space, so trailing text such as a '# comment' is dropped.
+
+    Returns a (set of codepoints, property string) tuple.
+    """
+    spec, rest = (part.strip() for part in line.split(';', 1))
+    prop = rest.split(' ', 1)[0].strip()
+    if '..' not in spec:
+        return {int(spec, 16)}, prop
+    bounds = [int(piece, 16) for piece in spec.split('.') if piece]
+    return set(range(bounds[0], bounds[1] + 1)), prop
+
+
+# Every codepoint that parse_emoji() accepted from emoji-data.txt
+all_emoji = set()
+# Map of emoji property name to the set of accepted codepoints carrying it
+emoji_categories = {}
+
+
+def parse_emoji():
+    """Fill all_emoji and emoji_categories from emoji-data.txt.
+
+    An entry is skipped entirely when its highest codepoint is below
+    FIRST_EMOJI; otherwise all of its codepoints are recorded.
+    """
+    for entry in get_data('emoji-data.txt', 'emoji'):
+        codepoints, prop = split_two(entry)
+        if max(codepoints) < FIRST_EMOJI:
+            continue
+        emoji_categories.setdefault(prop, set()).update(codepoints)
+        all_emoji.update(codepoints)
+
+
+# Codepoints classified as wide/fullwidth and as ambiguous width by parse_eaw()
+doublewidth, ambiguous = set(), set()
+
+
+def parse_eaw():
+    """Fill doublewidth and ambiguous from EastAsianWidth.txt.
+
+    Codepoints with width class 'A' go to ambiguous; those with 'W' or 'F' go
+    to doublewidth. Afterwards, every codepoint in a fixed list of ranges that
+    was not classified as 'A', 'W' or 'F' is also added to doublewidth.
+    """
+    classified = set()
+    for entry in get_data('ucd/EastAsianWidth.txt'):
+        codepoints, width_class = split_two(entry)
+        if width_class == 'A':
+            target = ambiguous
+        elif width_class in ('W', 'F'):
+            target = doublewidth
+        else:
+            continue
+        target.update(codepoints)
+        classified.update(codepoints)
+    # NOTE(review): presumably the blocks that EastAsianWidth.txt declares
+    # wide by default even when unlisted; verify against the file's header
+    default_wide = (
+        (0x3400, 0x4DBF),
+        (0x4E00, 0x9FFF),
+        (0xF900, 0xFAFF),
+        (0x20000, 0x2FFFD),
+        (0x30000, 0x3FFFD),
+    )
+    for first, last in default_wide:
+        doublewidth.update(set(range(first, last + 1)) - classified)
+
+
+def get_ranges(items):
+    """Yield the runs of consecutive integers found in items.
+
+    items is sorted in place first. A run of a single value is yielded as
+    that value; a longer run is yielded as an inclusive (first, last) tuple.
+    """
+    items.sort()
+    # Within a run of consecutive values, index - value stays constant
+    for _, run in groupby(enumerate(items), lambda pair: pair[0] - pair[1]):
+        values = [value for _, value in run]
+        lo, hi = values[0], values[-1]
+        yield lo if lo == hi else (lo, hi)
+
+
+def write_case(spec, p):
+    """Emit a C case label for spec through the print-like callable p.
+
+    A tuple produces a 'case 0x.. ... 0x..:' range label built from its
+    items; anything else produces a single-value 'case 0x..:' label.
+    """
+    if not isinstance(spec, tuple):
+        p('\t\tcase 0x{:x}:'.format(spec))
+        return
+    p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
+
+
+@contextmanager
+def create_header(path):
+    """Context manager that writes a generated C header to path.
+
+    On entry the file is opened for writing and the common preamble is
+    emitted (provenance comments, '#pragma once', the data-types.h include
+    and START_ALLOW_CASE_RANGE). The value bound by 'with ... as' is a
+    print()-like callable that writes to the file. On normal exit the closing
+    END_ALLOW_CASE_RANGE line is written.
+
+    The file is closed on exit even when the body raises; in that case the
+    closing line is not written.
+    """
+    with open(path, 'w') as f:
+        p = partial(print, file=f)
+        p('// unicode data, built from the unicode standard on:', date.today())
+        p('// see gen-wcwidth.py')
+        p('#pragma once')
+        p('#include "data-types.h"\n')
+        p('START_ALLOW_CASE_RANGE')
+        p()
+        yield p
+        p()
+        p('END_ALLOW_CASE_RANGE')
+
+
+def gen_emoji():
+    """Write kitty/emoji.h with the is_emoji and is_emoji_modifier predicates.
+
+    is_emoji covers every codepoint in all_emoji; is_emoji_modifier covers
+    the 'Emoji_Modifier' entry of emoji_categories. Each predicate is a C
+    switch that returns true for the listed codepoints and false otherwise.
+    """
+    def emit_predicate(p, signature, codepoints):
+        # One switch-based predicate: signature line, cases, default, closer
+        p(signature)
+        p('\tswitch(code) {')
+        for spec in get_ranges(list(codepoints)):
+            write_case(spec, p)
+            p('\t\t\treturn true;')
+        p('\t\tdefault: return false;')
+        p('\t}')
+        p('\treturn false; \n}')
+
+    with create_header('kitty/emoji.h') as p:
+        emit_predicate(p, 'static inline bool\nis_emoji(char_type code) {', all_emoji)
+        emit_predicate(
+            p, 'static inline bool\nis_emoji_modifier(char_type code) {',
+            emoji_categories['Emoji_Modifier'])
+
+
+def gen_wcwidth():
+    """Write kitty/wcwidth-std.h containing the wcwidth_std() C function.
+
+    The function is one switch over the codepoint. Sections are emitted in a
+    fixed order and a codepoint already emitted by an earlier section is left
+    out of later ones, so the first matching section decides the result:
+    -1 for non-printing (categories Cc, Cf, Cs) and combining characters,
+    -3 for private use (Co), -2 for East Asian ambiguous width, 2 for East
+    Asian double width and emoji, -1 for codepoints absent from the parsed
+    character database, and 1 for everything else.
+    """
+    def sections():
+        # Lazily yields (comment, codepoints, return value) in emission order
+        yield ('Non-printing characters',
+               class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs'], -1)
+        yield 'Combining characters', combining_codepoints, -1
+        yield 'Private use', class_maps['Co'], -3
+        yield 'East Asian ambiguous width', ambiguous, -2
+        yield 'East Asian double width', doublewidth, 2
+        yield 'Emoji', all_emoji, 2
+        yield 'Not assigned in the unicode character database', not_assigned, -1
+
+    emitted = set()
+    with create_header('kitty/wcwidth-std.h') as p:
+        p('static int\nwcwidth_std(int32_t code) {')
+        p('\tswitch(code) {')
+        for comment, candidates, ret in sections():
+            # A duplicate case label is a C compile error, so drop anything
+            # an earlier section already claimed
+            fresh = candidates - emitted
+            emitted.update(fresh)
+            p(f'\t\t// {comment} ({len(fresh)} codepoints)' + ' {{{')
+            for spec in get_ranges(list(fresh)):
+                write_case(spec, p)
+                p(f'\t\t\treturn {ret};')
+            p('\t\t// }}}\n')
+        p('\t\tdefault: return 1;')
+        p('\t}')
+        p('\treturn 1; \n}')
+
+
+# Parse every data file first, then write the two generated headers
+for step in (parse_ucd, parse_emoji, parse_eaw, gen_wcwidth, gen_emoji):
+    step()
--- a/kitty/emoji.h
+++ b/kitty/emoji.h
@ -1,9 +1,17 @@
+// unicode data, built from the unicode standard on: 2017-12-20
+// see gen-wcwidth.py
 #pragma once
 #include "data-types.h"

 START_ALLOW_CASE_RANGE
-static inline bool is_emoji(uint32_t code) {
+
+static inline bool
+is_emoji(char_type code) {
 	switch(code) {
+		case 0x2194 ... 0x2199:
+			return true;
+		case 0x21a9 ... 0x21aa:
+			return true;
 		case 0x231a ... 0x231b:
 			return true;
 		case 0x2328:
@ -274,14 +282,16 @@ static inline bool is_emoji(uint32_t code) {
 			return true;
 		default: return false;
 	}
-	return false;
+	return false; 
 }
-static inline bool is_emoji_modifier(uint32_t code) {
+static inline bool
+is_emoji_modifier(char_type code) {
 	switch(code) {
 		case 0x1f3fb ... 0x1f3ff:
 			return true;
 		default: return false;
 	}
-	return false;
+	return false; 
 }
+
 END_ALLOW_CASE_RANGE
--- a/kitty/screen.c
+++ b/kitty/screen.c
@ -18,7 +18,7 @@
 #include <fcntl.h>
 #include "unicode-data.h"
 #include "modes.h"
-#include "wcwidth9.h"
+#include "wcwidth-std.h"
 #include "control-codes.h"

 static const ScreenModes empty_modes = {0, .mDECAWM=true, .mDECTCEM=true, .mDECARM=true};
@ -275,8 +275,8 @@ safe_wcwidth(uint32_t ch) {
 }

 void
-change_wcwidth(bool use9) {
-    wcwidth_impl = (use9) ? wcwidth9 : wcwidth;
+change_wcwidth(bool use_std) {
+    wcwidth_impl = use_std ? wcwidth_std : wcwidth;
 }


--- a/kitty/wcwidth-std.h
+++ b/kitty/wcwidth-std.h
--- a/kitty/wcwidth9.h
+++ b/kitty/wcwidth9.h