kitty/gen-wcwidth.py
2018-02-06 11:23:39 +05:30

262 lines
8.3 KiB
Python
Executable File

#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
import os
import sys
from contextlib import contextmanager
from datetime import date
from functools import partial
from itertools import groupby
from operator import itemgetter
from urllib.request import urlopen
os.chdir(os.path.dirname(os.path.abspath(__file__)))
def get_data(fname, folder='UCD'):
url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
bn = os.path.basename(url)
local = os.path.join('/tmp', bn)
if os.path.exists(local):
data = open(local, 'rb').read()
else:
data = urlopen(url).read()
open(local, 'wb').write(data)
for line in data.decode('utf-8').splitlines():
line = line.strip()
if line and not line.startswith('#'):
yield line
# Map of class names to set of codepoints in class
class_maps = {}
marks = set()
not_assigned = set(range(0, sys.maxunicode))
def parse_ucd():
first = None
for line in get_data('ucd/UnicodeData.txt'):
parts = [x.strip() for x in line.split(';')]
codepoint = int(parts[0], 16)
category = parts[2]
s = class_maps.setdefault(category, set())
desc = parts[1]
codepoints = (codepoint,)
if first is None:
if desc.endswith(', First>'):
first = codepoint
continue
else:
codepoints = range(first, codepoint + 1)
first = None
for codepoint in codepoints:
s.add(codepoint)
not_assigned.discard(codepoint)
if category.startswith('M'):
marks.add(codepoint)
def split_two(line):
spec, rest = line.split(';', 1)
spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
if '..' in spec:
chars = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
chars = set(range(chars[0], chars[1] + 1))
else:
chars = {int(spec, 16)}
return chars, rest
all_emoji = set()
emoji_categories = {}
emoji_presentation_bases = set()
def parse_emoji():
for line in get_data('emoji-data.txt', 'emoji'):
chars, rest = split_two(line)
s = emoji_categories.setdefault(rest, set())
s.update(chars)
all_emoji.update(chars)
for line in get_data('emoji-variation-sequences.txt', 'emoji'):
base, var, *rest = line.split()
if base.startswith('#'):
continue
base = int(base, 16)
if var.upper() == 'FE0F':
emoji_presentation_bases.add(base)
doublewidth, ambiguous = set(), set()
def parse_eaw():
global doublewidth, ambiguous
seen = set()
for line in get_data('ucd/EastAsianWidth.txt'):
chars, eaw = split_two(line)
if eaw == 'A':
ambiguous |= chars
seen |= chars
elif eaw == 'W' or eaw == 'F':
doublewidth |= chars
seen |= chars
doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen
def get_ranges(items):
items.sort()
for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
group = tuple(map(itemgetter(1), g))
a, b = group[0], group[-1]
if a == b:
yield a
else:
yield a, b
def write_case(spec, p):
if isinstance(spec, tuple):
p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
else:
p('\t\tcase 0x{:x}:'.format(spec))
@contextmanager
def create_header(path):
f = open(path, 'w')
p = partial(print, file=f)
p('// unicode data, built from the unicode standard on:', date.today())
p('// see gen-wcwidth.py')
if path.endswith('.h'):
p('#pragma once')
p('#include "data-types.h"\n')
p('START_ALLOW_CASE_RANGE')
p()
yield p
p()
p('END_ALLOW_CASE_RANGE')
f.close()
def gen_emoji():
with create_header('kitty/emoji.h') as p:
p('static inline bool\nis_emoji(char_type code) {')
p('\tswitch(code) {')
for spec in get_ranges(list(all_emoji)):
write_case(spec, p)
p('\t\t\treturn true;')
p('\t\tdefault: return false;')
p('\t}')
p('\treturn false;\n}')
p('static inline bool\nis_emoji_modifier(char_type code) {')
p('\tswitch(code) {')
for spec in get_ranges(list(emoji_categories['Emoji_Modifier'])):
write_case(spec, p)
p('\t\t\treturn true;')
p('\t\tdefault: return false;')
p('\t}')
p('\treturn false;\n}')
def category_test(name, p, classes, comment, static=False):
static = 'static inline ' if static else ''
chars = set()
for c in classes:
chars |= class_maps[c]
p(f'{static}bool\n{name}(char_type code) {{')
p(f'\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
p('\tswitch(code) {')
for spec in get_ranges(list(chars)):
write_case(spec, p)
p(f'\t\t\treturn true;')
p('\t} // }}}\n')
p('\treturn false;\n}\n')
def gen_ucd():
with create_header('kitty/unicode-data.c') as p:
p('#include "unicode-data.h"')
category_test('is_combining_char', p, {c for c in class_maps if c.startswith('M')}, 'M category (marks)')
category_test('is_ignored_char', p, 'Cc Cf Cs'.split(), 'Control characters (Cc Cf Cs)')
category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
category_test('is_CZ_category', p, {c for c in class_maps if c[0] in 'CZ'}, 'C and Z categories')
category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)')
mark_map = [0] + list(sorted(marks))
p('char_type codepoint_for_mark(combining_type m) {')
p(f'\tstatic char_type map[{len(mark_map)}] =', '{', ', '.join(map(str, mark_map)), '}; // {{{ mapping }}}')
p('\tif (m < arraysz(map)) return map[m];')
p('\treturn 0;')
p('}\n')
p('combining_type mark_for_codepoint(char_type c) {')
p('\tswitch(c) { // {{{')
rmap = {c: m for m, c in enumerate(mark_map)}
for spec in get_ranges(mark_map):
if isinstance(spec, tuple):
s = rmap[spec[0]]
p(f'\t\tcase {spec[0]} ... {spec[1]}: return {s} + c - {spec[0]};')
else:
p(f'\t\tcase {spec}: return {rmap[spec]};')
p('default: return 0;')
p('\t} // }}}')
p('}\n')
if rmap[0xfe0e] != 1275:
raise ValueError('The mark for 0xfe0e has changed, you have to update VS15 to {} and VS16 to {} in unicode-data.h'.format(
rmap[0xfe0e], rmap[0xfe0f]
))
def gen_wcwidth():
seen = set()
def add(p, comment, chars_, ret):
chars = chars_ - seen
seen.update(chars)
p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
for spec in get_ranges(list(chars)):
write_case(spec, p)
p(f'\t\t\treturn {ret};')
p('\t\t// }}}\n')
with create_header('kitty/wcwidth-std.h') as p:
p('static int\nwcwidth_std(int32_t code) {')
p('\tswitch(code) {')
non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
add(p, 'Marks', marks | {0}, 0)
add(p, 'Non-printing characters', non_printing, -1)
add(p, 'Private use', class_maps['Co'], -3)
add(p, 'Text Presentation', emoji_categories['Emoji'] - emoji_categories['Emoji_Presentation'], 1)
add(p, 'East Asian ambiguous width', ambiguous, -2)
add(p, 'East Asian double width', doublewidth, 2)
add(p, 'Emoji Presentation', emoji_categories['Emoji_Presentation'], 2)
add(p, 'Not assigned in the unicode character database', not_assigned, -1)
p('\t\tdefault: return 1;')
p('\t}')
p('\treturn 1;\n}')
p('static bool\nis_emoji_presentation_base(uint32_t code) {')
p('\tswitch(code) {')
for spec in get_ranges(list(emoji_presentation_bases)):
write_case(spec, p)
p('\t\t\treturn true;')
p('\t\tdefault: return false;')
p('\t}')
p('\treturn 1;\n}')
parse_ucd()
parse_emoji()
parse_eaw()
gen_ucd()
gen_wcwidth()
gen_emoji()