Add a script to generate wcwidth as well

The generated function is more efficient than the implementation from
wcwidth9, and generating it with a script makes it easy to update when
the Unicode standard changes.
This commit is contained in:
Kovid Goyal 2017-12-20 16:06:58 +05:30
parent 11ee317884
commit d1282b9f55
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
8 changed files with 2653 additions and 1405 deletions

2
.gitattributes vendored
View File

@ -1,4 +1,4 @@
kitty/wcwidth9.h linguist-generated=true
kitty/wcwidth-std.h linguist-generated=true
kitty/emoji.h linguist-generated=true
kitty/keys.h linguist-generated=true
kitty/charsets.c linguist-generated=true

View File

@ -1,2 +1,2 @@
#!/bin/bash
cloc --exclude-list-file <(echo -e 'kitty/wcwidth9.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty
cloc --exclude-list-file <(echo -e 'kitty/wcwidth-std.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty

View File

@ -1,75 +0,0 @@
#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
import os
from collections import defaultdict
from functools import partial
from itertools import groupby
from operator import itemgetter
from urllib.request import urlopen
# Work relative to the directory containing this script so that the relative
# output path 'kitty/emoji.h' resolves regardless of the caller's cwd.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
raw = urlopen('http://unicode.org/Public/emoji/5.0/emoji-data.txt').read().decode('utf-8')
# seen: every codepoint listed in the data file.
# cmap: property name -> set of codepoints carrying that property.
seen = set()
cmap = defaultdict(set)
for line in raw.splitlines():
    line = line.strip()
    # Only data lines are of interest; blank lines and comments are skipped.
    if line and not line.startswith('#'):
        # Data lines look like "<codepoint or range> ; <property> # comment".
        spec, rest = line.partition(';')[::2]
        spec = spec.strip()
        rest = rest.strip().split(' ', 1)[0].strip()
        if '.' in spec:
            # "AAAA..BBBB" is an inclusive range of codepoints.
            spec = tuple(int(x, 16) for x in spec.split('.') if x)
            spec = set(range(spec[0], spec[1] + 1))
        else:
            spec = {int(spec, 16)}
        cmap[rest] |= spec
        seen |= spec
items = list(seen)
def get_ranges(items):
    # Collapse a collection of integers into runs of consecutive values.
    #
    # Yields, in ascending order, either a bare int (isolated value) or an
    # (first, last) tuple describing an inclusive run of consecutive values.
    #
    # The input may be any iterable of ints. It is copied, de-duplicated and
    # sorted here, so the caller's container is never modified and repeated
    # values cannot split a run in two.
    ordered = sorted(set(items))
    # In a sorted, duplicate-free sequence (index - value) stays constant
    # exactly as long as the values are consecutive.
    for _, run in groupby(enumerate(ordered), lambda pair: pair[0] - pair[1]):
        members = tuple(map(itemgetter(1), run))
        first, last = members[0], members[-1]
        if first == last:
            yield first
        else:
            yield first, last
def write_case(spec, p):
    # Emit one C `case` label through the print-like callable `p`.
    # A tuple spec becomes a case range ("case 0xA ... 0xB:"), anything else
    # is formatted as a single hexadecimal value ("case 0xA:").
    if isinstance(spec, tuple):
        label = '0x{:x} ... 0x{:x}'.format(*spec)
    else:
        label = '0x{:x}'.format(spec)
    p('\t\tcase ' + label + ':')
# Write kitty/emoji.h: two C predicates implemented as switch statements
# over case ranges.
with open('kitty/emoji.h', 'w') as f:
    p = partial(print, file=f)
    for preamble_line in ('#pragma once', '#include "data-types.h"\n', 'START_ALLOW_CASE_RANGE'):
        p(preamble_line)

    # is_emoji(): every listed codepoint, except ranges that end below 0x231a.
    p('static inline bool is_emoji(uint32_t code) {')
    p('\tswitch(code) {')
    for spec in get_ranges(items):
        last = spec[1] if isinstance(spec, tuple) else spec
        if last >= 0x231a:
            write_case(spec, p)
            p('\t\t\treturn true;')
    p('\t\tdefault: return false;')
    p('\t}')
    p('\treturn false; \n}')

    # is_emoji_modifier(): only codepoints with the Emoji_Modifier property.
    p('static inline bool is_emoji_modifier(uint32_t code) {')
    p('\tswitch(code) {')
    for spec in get_ranges(list(cmap['Emoji_Modifier'])):
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t\tdefault: return false;')
    p('\t}')
    p('\treturn false; \n}')
    p('END_ALLOW_CASE_RANGE')

199
gen-wcwidth.py Executable file
View File

@ -0,0 +1,199 @@
#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
import os
import sys
from contextlib import contextmanager
from datetime import date
from functools import partial
from itertools import groupby
from operator import itemgetter
from urllib.request import urlopen
# We ignore the first few emojis as they are widely assumed to be single width
# in legacy applications. parse_emoji() skips every emoji-data entry whose
# highest codepoint is below this value.
FIRST_EMOJI = 0x2194
# Run from the directory containing this script so that the relative output
# paths used below (e.g. 'kitty/emoji.h') resolve regardless of the caller's cwd.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
def get_data(fname, folder='UCD'):
    """Yield the data lines of a Unicode data file.

    The file is fetched from
    ``https://www.unicode.org/Public/<folder>/latest/<fname>`` and cached in
    ``/tmp`` under its base name. If a cached copy already exists it is used
    as-is and nothing is downloaded, so a stale cache must be removed by hand
    to pick up a newer release.

    Each yielded line is stripped of surrounding whitespace; blank lines and
    lines starting with ``#`` are skipped.

    This is a generator, so no file or network access happens until the
    first item is requested.
    """
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    bn = os.path.basename(url)
    local = os.path.join('/tmp', bn)
    if os.path.exists(local):
        # Context managers close the handles promptly instead of leaving
        # that to the garbage collector.
        with open(local, 'rb') as cached:
            data = cached.read()
    else:
        with urlopen(url) as response:
            data = response.read()
        with open(local, 'wb') as cached:
            cached.write(data)
    for line in data.decode('utf-8').splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            yield line
# Map of class names to set of codepoints in class
class_maps = {}
# Codepoints whose combining-class field in UnicodeData.txt is non-zero
combining_codepoints = set()
# Starts as every codepoint from 0 up to and including sys.maxunicode;
# parse_ucd() discards each codepoint it finds in UnicodeData.txt, leaving
# only the unassigned ones. The "+ 1" matters: range() excludes its end, and
# without it the last codepoint would never be reported as unassigned.
not_assigned = set(range(0, sys.maxunicode + 1))
def parse_ucd():
    """Populate the module-level sets from ``ucd/UnicodeData.txt``.

    For every codepoint listed in the file this:

    * adds it to ``class_maps[<general category>]``,
    * removes it from ``not_assigned``,
    * adds it to ``combining_codepoints`` when the combining-class field
      (fourth column) is non-empty and not ``'0'``.

    Large blocks appear in the file as a pair of lines whose names end in
    ``, First>`` and ``, Last>``; such a pair is expanded to the whole
    inclusive range.
    """
    range_start = None
    for line in get_data('ucd/UnicodeData.txt'):
        fields = [field.strip() for field in line.split(';')]
        codepoint = int(fields[0], 16)
        members = class_maps.setdefault(fields[2], set())
        if range_start is None and fields[1].endswith(', First>'):
            # Remember where the range opens; the next line closes it.
            range_start = codepoint
            continue
        if range_start is None:
            span = (codepoint,)
        else:
            span = range(range_start, codepoint + 1)
            range_start = None
        # The combining class is a property of the line, not of the
        # individual codepoint, so evaluate it once per line.
        is_combining = fields[3] not in ('', '0')
        for cp in span:
            members.add(cp)
            not_assigned.discard(cp)
            if is_combining:
                combining_codepoints.add(cp)
def split_two(line):
    """Parse a ``<codepoints> ; <property> ...`` data line.

    Returns ``(chars, prop)`` where ``chars`` is the set of codepoints named
    by the first field (a single hex value or an inclusive ``AAAA..BBBB``
    range) and ``prop`` is the first space-delimited word of the text after
    the first semicolon.
    """
    codepoint_field, property_field = (part.strip() for part in line.split(';', 1))
    prop = property_field.split(' ', 1)[0].strip()
    if '..' in codepoint_field:
        bounds = [int(piece, 16) for piece in codepoint_field.split('.') if piece]
        chars = set(range(bounds[0], bounds[1] + 1))
    else:
        chars = {int(codepoint_field, 16)}
    return chars, prop
# all_emoji: every codepoint accepted by parse_emoji().
# emoji_categories: emoji property name -> set of accepted codepoints.
all_emoji = set()
emoji_categories = {}


def parse_emoji():
    """Fill ``all_emoji`` and ``emoji_categories`` from ``emoji-data.txt``.

    An entry is skipped when its highest codepoint is below ``FIRST_EMOJI``;
    otherwise all of its codepoints are recorded under the entry's property
    name and in the combined set.
    """
    for line in get_data('emoji-data.txt', 'emoji'):
        chars, category = split_two(line)
        if max(chars) < FIRST_EMOJI:
            continue
        emoji_categories.setdefault(category, set()).update(chars)
        all_emoji.update(chars)
# Codepoints classified by East Asian Width: wide/fullwidth and ambiguous.
doublewidth, ambiguous = set(), set()


def parse_eaw():
    """Fill ``doublewidth`` and ``ambiguous`` from ``ucd/EastAsianWidth.txt``.

    Codepoints with width class ``A`` go to ``ambiguous``; classes ``W`` and
    ``F`` go to ``doublewidth``. All other classes are ignored. Afterwards a
    fixed list of blocks is added to ``doublewidth``, minus any codepoint
    already classified above.
    """
    global doublewidth, ambiguous
    classified = set()
    for line in get_data('ucd/EastAsianWidth.txt'):
        chars, eaw = split_two(line)
        if eaw == 'A':
            target = ambiguous
        elif eaw in ('W', 'F'):
            target = doublewidth
        else:
            continue
        target.update(chars)
        classified.update(chars)
    # NOTE(review): presumably these are the blocks that EastAsianWidth.txt
    # documents as defaulting to wide even when unlisted -- confirm against
    # the header of the data file.
    default_wide_blocks = (
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xF900, 0xFAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    )
    for first, last in default_wide_blocks:
        doublewidth |= set(range(first, last + 1)) - classified
def get_ranges(items):
    """Collapse a collection of integers into runs of consecutive values.

    Yields, in ascending order, either a bare ``int`` (an isolated value) or
    a ``(first, last)`` tuple describing an inclusive run of consecutive
    values.

    ``items`` may be any iterable of ints. It is copied, de-duplicated and
    sorted here, so the caller's container is never modified and repeated
    values cannot split a run in two.
    """
    ordered = sorted(set(items))
    # In a sorted, duplicate-free sequence (index - value) stays constant
    # exactly as long as the values are consecutive.
    for _, run in groupby(enumerate(ordered), lambda pair: pair[0] - pair[1]):
        members = tuple(map(itemgetter(1), run))
        first, last = members[0], members[-1]
        if first == last:
            yield first
        else:
            yield first, last
def write_case(spec, p):
    """Emit one C ``case`` label through the print-like callable ``p``.

    A tuple ``spec`` is written as a case range (``case 0xA ... 0xB:``);
    anything else is formatted as a single hexadecimal value (``case 0xA:``).
    """
    if isinstance(spec, tuple):
        label = '0x{:x} ... 0x{:x}'.format(*spec)
    else:
        label = '0x{:x}'.format(spec)
    p('\t\tcase ' + label + ':')
@contextmanager
def create_header(path):
    """Context manager that writes a generated C header to ``path``.

    On entry the file is opened for writing and a fixed preamble is emitted
    (generation-date comment, ``#pragma once``, the ``data-types.h`` include
    and ``START_ALLOW_CASE_RANGE``). The ``with`` target is a ``print``-like
    callable bound to the file; the caller uses it to write the body. On
    normal exit ``END_ALLOW_CASE_RANGE`` is appended.

    The file is closed when the block is left, including when the body
    raises. In that case the closing footer is not written and the exception
    propagates.
    """
    with open(path, 'w') as f:
        p = partial(print, file=f)
        p('// unicode data, built from the unicode standard on:', date.today())
        p('// see gen-wcwidth.py')
        p('#pragma once')
        p('#include "data-types.h"\n')
        p('START_ALLOW_CASE_RANGE')
        p()
        yield p
        p()
        p('END_ALLOW_CASE_RANGE')
def gen_emoji():
    """Write ``kitty/emoji.h`` with the C predicates ``is_emoji`` and
    ``is_emoji_modifier``.

    Each predicate is a ``switch`` over case ranges: ``is_emoji`` covers
    ``all_emoji`` and ``is_emoji_modifier`` covers the ``Emoji_Modifier``
    entry of ``emoji_categories``.
    """
    def open_predicate(p, name):
        # Function signature plus the opening of the switch statement.
        p('static inline bool\n' + name + '(char_type code) {')
        p('\tswitch(code) {')

    def close_predicate(p, codepoints):
        # One "case ...: return true;" per run, then the default arm and
        # the closing braces.
        for spec in get_ranges(list(codepoints)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false; \n}')

    with create_header('kitty/emoji.h') as p:
        open_predicate(p, 'is_emoji')
        close_predicate(p, all_emoji)
        open_predicate(p, 'is_emoji_modifier')
        close_predicate(p, emoji_categories['Emoji_Modifier'])
def gen_wcwidth():
    """Write ``kitty/wcwidth-std.h`` containing the C function ``wcwidth_std``.

    The function is a ``switch`` built from groups of codepoints, each group
    returning a fixed value. Groups are emitted in priority order: a
    codepoint already emitted by an earlier group is left out of later
    ones. Codepoints in no group fall through to ``default`` and return 1.
    """
    emitted = set()

    def add(p, comment, chars_, ret):
        # Drop anything an earlier group already claimed, then remember
        # what this group claims.
        fresh = chars_ - emitted
        emitted.update(fresh)
        heading = f'\t\t// {comment} ({len(fresh)} codepoints)'
        p(heading + ' {{{')
        for spec in get_ranges(list(fresh)):
            write_case(spec, p)
            p(f'\t\t\treturn {ret};')
        p('\t\t// }}}\n')

    with create_header('kitty/wcwidth-std.h') as p:
        p('static int\nwcwidth_std(int32_t code) {')
        p('\tswitch(code) {')

        non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
        add(p, 'Non-printing characters', non_printing, -1)
        add(p, 'Combining characters', combining_codepoints, -1)
        add(p, 'Private use', class_maps['Co'], -3)
        add(p, 'East Asian ambiguous width', ambiguous, -2)
        add(p, 'East Asian double width', doublewidth, 2)
        add(p, 'Emoji', all_emoji, 2)
        add(p, 'Not assigned in the unicode character database', not_assigned, -1)

        p('\t\tdefault: return 1;')
        p('\t}')
        p('\treturn 1; \n}')
# Script entry: the parse_* calls must run first because they populate the
# module-level sets that the gen_* calls read when writing the headers.
parse_ucd()
parse_emoji()
parse_eaw()
# Writes kitty/wcwidth-std.h
gen_wcwidth()
# Writes kitty/emoji.h
gen_emoji()

18
kitty/emoji.h generated
View File

@ -1,9 +1,17 @@
// unicode data, built from the unicode standard on: 2017-12-20
// see gen-wcwidth.py
#pragma once
#include "data-types.h"
START_ALLOW_CASE_RANGE
static inline bool is_emoji(uint32_t code) {
static inline bool
is_emoji(char_type code) {
switch(code) {
case 0x2194 ... 0x2199:
return true;
case 0x21a9 ... 0x21aa:
return true;
case 0x231a ... 0x231b:
return true;
case 0x2328:
@ -274,14 +282,16 @@ static inline bool is_emoji(uint32_t code) {
return true;
default: return false;
}
return false;
return false;
}
static inline bool is_emoji_modifier(uint32_t code) {
static inline bool
is_emoji_modifier(char_type code) {
switch(code) {
case 0x1f3fb ... 0x1f3ff:
return true;
default: return false;
}
return false;
return false;
}
END_ALLOW_CASE_RANGE

View File

@ -18,7 +18,7 @@
#include <fcntl.h>
#include "unicode-data.h"
#include "modes.h"
#include "wcwidth9.h"
#include "wcwidth-std.h"
#include "control-codes.h"
static const ScreenModes empty_modes = {0, .mDECAWM=true, .mDECTCEM=true, .mDECARM=true};
@ -275,8 +275,8 @@ safe_wcwidth(uint32_t ch) {
}
void
change_wcwidth(bool use9) {
wcwidth_impl = (use9) ? wcwidth9 : wcwidth;
change_wcwidth(bool use_std) {
wcwidth_impl = use_std ? wcwidth_std : wcwidth;
}

2435
kitty/wcwidth-std.h generated Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff