mirror of
https://github.com/kovidgoyal/kitty.git
synced 2024-09-19 18:47:26 +03:00
200 lines
5.7 KiB
Python
Executable File
200 lines
5.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# vim:fileencoding=utf-8
|
|
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
|
|
|
|
import os
|
|
import sys
|
|
from contextlib import contextmanager
|
|
from datetime import date
|
|
from functools import partial
|
|
from itertools import groupby
|
|
from operator import itemgetter
|
|
from urllib.request import urlopen
|
|
|
|
# Emoji below this codepoint are skipped: legacy applications widely assume
# those early emoji occupy a single cell.
FIRST_EMOJI = 0x2194

# Work from the directory that contains this script so that the relative
# paths used for the generated headers do not depend on the caller's cwd.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
|
|
def get_data(fname, folder='UCD'):
    """Yield the meaningful lines of a Unicode data file.

    The file is fetched from
    ``https://www.unicode.org/Public/{folder}/latest/{fname}`` and cached in
    ``/tmp`` under its base name. When a cached copy already exists it is
    used as-is and the network is not contacted; nothing here ever
    invalidates the cache.

    :param fname: path of the file below ``{folder}/latest/``.
    :param folder: top-level directory below ``Public/``.
    :return: generator of whitespace-stripped lines. Blank lines and lines
        starting with ``#`` are skipped; a trailing ``#`` comment on a data
        line is left in place for the caller to deal with.
    """
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    # NOTE: the cache key is only the base name, so two files with the same
    # name in different folders would share one cache entry.
    local = os.path.join('/tmp', os.path.basename(url))
    if os.path.exists(local):
        with open(local, 'rb') as cached:
            data = cached.read()
    else:
        # Close the HTTP response and the cache file deterministically
        # instead of relying on garbage collection.
        with urlopen(url) as response:
            data = response.read()
        with open(local, 'wb') as cached:
            cached.write(data)
    for line in data.decode('utf-8').splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            yield line
|
|
|
|
|
|
# Map of class names to set of codepoints in class
class_maps = {}
# Codepoints whose UnicodeData.txt entry has a non-zero value in the field
# that parse_ucd() reads as the combining class
combining_codepoints = set()
# Starts out as every codepoint; parse_ucd() removes each one it finds in
# UnicodeData.txt. The upper bound is inclusive: sys.maxunicode itself
# (U+10FFFF) is a codepoint and must be tracked as well.
not_assigned = set(range(0, sys.maxunicode + 1))
|
|
|
|
|
|
def parse_ucd():
    """Fill the module-level tables from ``ucd/UnicodeData.txt``.

    Each record is split on ``;``. The first field is the hexadecimal
    codepoint, the second its description, the third the class used as the
    key into ``class_maps`` and the fourth is read as the combining class.

    A record whose description ends in ``, First>`` opens a range; the next
    record closes it, and every codepoint from the opening one up to and
    including the closing one is recorded.

    For every recorded codepoint: it is added to its class set, removed from
    ``not_assigned``, and added to ``combining_codepoints`` when the fourth
    field is neither empty nor ``'0'``.
    """
    range_start = None
    for record in get_data('ucd/UnicodeData.txt'):
        fields = [field.strip() for field in record.split(';')]
        codepoint = int(fields[0], 16)
        members = class_maps.setdefault(fields[2], set())
        if range_start is not None:
            # This record is the closing half of a First/Last pair.
            span = range(range_start, codepoint + 1)
            range_start = None
        elif fields[1].endswith(', First>'):
            # Remember the start; the following record supplies the end.
            range_start = codepoint
            continue
        else:
            span = (codepoint,)
        for cp in span:
            members.add(cp)
            not_assigned.discard(cp)
            if fields[3] not in ('', '0'):
                combining_codepoints.add(cp)
|
|
|
|
|
|
def split_two(line):
    """Split a ``codepoints ; property`` data line into its two parts.

    The text before the first ``;`` is either a single hexadecimal codepoint
    or an inclusive ``first..last`` range. From the text after it, anything
    following a ``#`` is discarded and the first whitespace-delimited token
    is kept as the property name.

    :param line: one data line, e.g. ``'1F3FB..1F3FF ; Emoji_Modifier # ...'``.
    :return: ``(chars, prop)`` where ``chars`` is the set of codepoints and
        ``prop`` the property name (``''`` when nothing follows the ``;``).
    :raises ValueError: if the line has no ``;`` or the codepoints are not
        valid hexadecimal.
    """
    spec, rest = line.split(';', 1)
    spec = spec.strip()
    # Cut the comment first, then split on any whitespace: a tab separator
    # or a '#' directly after the value must not end up in the name.
    tokens = rest.split('#', 1)[0].split(None, 1)
    prop = tokens[0] if tokens else ''
    if '..' in spec:
        first, last = (int(part, 16) for part in spec.split('..', 1))
        chars = set(range(first, last + 1))
    else:
        chars = {int(spec, 16)}
    return chars, prop
|
|
|
|
|
|
# Every codepoint that parse_emoji() accepts, regardless of its property
all_emoji = set()
# Map of property name (as returned by split_two) to the set of accepted
# codepoints that carry it
emoji_categories = {}
|
|
|
|
|
|
def parse_emoji():
    """Fill ``emoji_categories`` and ``all_emoji`` from ``emoji-data.txt``.

    A record is skipped when its highest codepoint is below ``FIRST_EMOJI``.
    Otherwise all of its codepoints are added to the set for its property
    name and to ``all_emoji``.
    """
    for record in get_data('emoji-data.txt', 'emoji'):
        codepoints, prop = split_two(record)
        if max(codepoints) < FIRST_EMOJI:
            continue
        emoji_categories.setdefault(prop, set()).update(codepoints)
        all_emoji.update(codepoints)
|
|
|
|
|
|
# Codepoints that parse_eaw() classifies as two cells wide
doublewidth = set()
# Codepoints that parse_eaw() classifies as ambiguous width
ambiguous = set()
|
|
|
|
|
|
def parse_eaw():
    """Fill ``ambiguous`` and ``doublewidth`` from ``ucd/EastAsianWidth.txt``.

    Records with width class ``A`` go to ``ambiguous``; records with ``W`` or
    ``F`` go to ``doublewidth``. Records with any other class are ignored.

    Afterwards a fixed list of codepoint ranges is added to ``doublewidth``,
    minus any codepoint already classified as A, W or F by the data file.
    """
    classified = set()
    for record in get_data('ucd/EastAsianWidth.txt'):
        codepoints, width_class = split_two(record)
        if width_class == 'A':
            target = ambiguous
        elif width_class in ('W', 'F'):
            target = doublewidth
        else:
            continue
        target.update(codepoints)
        classified.update(codepoints)

    # Inclusive ranges treated as wide even where the data file is silent.
    # NOTE(review): presumably the blocks that UAX #11 defaults to Wide for
    # unlisted codepoints -- confirm against the EastAsianWidth.txt header.
    wide_by_default = (
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xF900, 0xFAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    )
    for low, high in wide_by_default:
        doublewidth.update(set(range(low, high + 1)) - classified)
|
|
|
|
|
|
def get_ranges(items):
    """Collapse integers into runs of consecutive values.

    :param items: any iterable of integers; it is not modified, and
        duplicate values are counted once.
    :return: generator yielding, in ascending order, either a bare integer
        (a run of length one) or an inclusive ``(first, last)`` tuple.
    """
    # Work on a sorted, de-duplicated copy: the caller's container is left
    # alone, and a repeated value cannot produce two overlapping specs.
    ordered = sorted(set(items))
    # Within a run of consecutive integers, index - value stays constant, so
    # grouping on that difference splits the sequence into its runs.
    for _, run in groupby(enumerate(ordered), lambda pair: pair[0] - pair[1]):
        members = [value for _, value in run]
        first, last = members[0], members[-1]
        if first == last:
            yield first
        else:
            yield first, last
|
|
|
|
|
|
def write_case(spec, p):
    """Emit one C ``case`` label through the print-like callable ``p``.

    :param spec: a single integer, or a tuple whose first two items are the
        inclusive bounds of a ``case low ... high:`` range label.
    :param p: callable that receives the finished line.
    """
    if not isinstance(spec, tuple):
        p(f'\t\tcase 0x{spec:x}:')
        return
    p(f'\t\tcase 0x{spec[0]:x} ... 0x{spec[1]:x}:')
|
|
|
|
|
|
@contextmanager
def create_header(path):
    """Context manager that writes a generated C header to ``path``.

    On entry the file is opened for writing (truncating any previous
    content) and a preamble is emitted: a comment with today's date, a
    pointer to gen-wcwidth.py, ``#pragma once``, the ``data-types.h``
    include and ``START_ALLOW_CASE_RANGE``. The value bound by ``with`` is a
    ``print``-like callable that writes to the file.

    On normal exit ``END_ALLOW_CASE_RANGE`` is appended. If the body raises,
    that footer is not written, but the file is still closed.

    :param path: destination file path.
    """
    # The `with` guarantees the handle is closed even when the caller's
    # block raises and the exception is thrown in at the `yield`.
    with open(path, 'w') as f:
        p = partial(print, file=f)
        p('// unicode data, built from the unicode standard on:', date.today())
        p('// see gen-wcwidth.py')
        p('#pragma once')
        p('#include "data-types.h"\n')
        p('START_ALLOW_CASE_RANGE')
        p()
        yield p
        p()
        p('END_ALLOW_CASE_RANGE')
|
|
|
|
|
|
def gen_emoji():
    """Write ``kitty/emoji.h``.

    The header contains two C predicates built as ``switch`` statements over
    case ranges: ``is_emoji`` covers every codepoint in ``all_emoji`` and
    ``is_emoji_modifier`` covers ``emoji_categories['Emoji_Modifier']``.
    """
    with create_header('kitty/emoji.h') as p:

        def emit_predicate(name, codepoints):
            # Both predicates share the same shape; only the function name
            # and the set of codepoints differ.
            p(f'static inline bool\n{name}(char_type code) {{')
            p('\tswitch(code) {')
            for spec in get_ranges(list(codepoints)):
                write_case(spec, p)
            p('\t\t\treturn true;')
            p('\t\tdefault: return false;')
            p('\t}')
            p('\treturn false;\n}')

        emit_predicate('is_emoji', all_emoji)
        emit_predicate('is_emoji_modifier', emoji_categories['Emoji_Modifier'])
|
|
|
|
|
|
def gen_wcwidth():
    """Write ``kitty/wcwidth-std.h`` containing the C function ``wcwidth_std``.

    The function is one ``switch`` over case ranges, emitted group by group.
    A codepoint is only emitted in the first group that contains it, so the
    order of the groups below decides which return value wins. Anything not
    covered by a group falls through to ``default`` and returns 1.
    """
    claimed = set()

    with create_header('kitty/wcwidth-std.h') as p:

        def emit_group(title, candidates, width):
            # Drop whatever an earlier group has already claimed.
            fresh = candidates - claimed
            claimed.update(fresh)
            p('\t\t// {} ({} codepoints)'.format(title, len(fresh)) + ' {{{')
            for spec in get_ranges(sorted(fresh)):
                write_case(spec, p)
            p('\t\t\treturn {};'.format(width))
            p('\t\t// }}}\n')

        p('static int\nwcwidth_std(int32_t code) {')
        p('\tswitch(code) {')

        emit_group(
            'Non-printing characters',
            class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs'],
            -1,
        )
        emit_group('Combining characters', combining_codepoints, -1)
        emit_group('Private use', class_maps['Co'], -3)
        emit_group('East Asian ambiguous width', ambiguous, -2)
        emit_group('East Asian double width', doublewidth, 2)
        emit_group('Emoji', all_emoji, 2)

        emit_group(
            'Not assigned in the unicode character database', not_assigned, -1
        )

        p('\t\tdefault: return 1;')
        p('\t}')
        p('\treturn 1;\n}')
|
|
|
|
|
|
# Load the three data files into the module-level tables ...
parse_ucd()
parse_emoji()
parse_eaw()

# ... then generate both headers from those tables.
gen_wcwidth()
gen_emoji()
|