#!/usr/bin/env python3
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

import os
import re
import subprocess
import sys
from collections import defaultdict
from contextlib import contextmanager
from functools import lru_cache, partial
from html.entities import html5
from itertools import groupby
from operator import itemgetter
from typing import (
    Callable,
    DefaultDict,
    Dict,
    FrozenSet,
    Generator,
    Iterable,
    List,
    Optional,
    Set,
    Tuple,
    Union,
)
from urllib.request import urlopen
os.chdir(os.path.dirname(os.path.abspath(__file__)))
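
# The 66 Unicode noncharacters: U+FDD0..U+FDEF plus the last two code
# points of each of the 17 planes (U+FFFE/U+FFFF ... U+10FFFE/U+10FFFF).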
non_characters = frozenset(range(0xfffe, 0x10ffff, 0x10000))
non_characters |= frozenset(range(0xffff, 0x10ffff + 1, 0x10000))
non_characters |= frozenset(range(0xfdd0, 0xfdf0))
if len(non_characters) != 66:
    raise SystemExit('non_characters table incorrect')
emoji_skin_tone_modifiers = frozenset(range(0x1f3fb, 0x1F3FF + 1))


def get_data(fname: str, folder: str = 'UCD') -> Iterable[str]:
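    # Fetch a data file from unicode.org, caching the download in /tmp so
    # repeated runs work offline, and yield its non-empty, non-comment lines.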
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    bn = os.path.basename(url)
    local = os.path.join('/tmp', bn)
    if os.path.exists(local):
        with open(local, 'rb') as f:
            data = f.read()
    else:
        data = urlopen(url).read()
        with open(local, 'wb') as f:
            f.write(data)
    for line in data.decode('utf-8').splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            yield line


@lru_cache(maxsize=2)
def unicode_version() -> Tuple[int, int, int]:
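    # Extract the Unicode standard version from the UCD ReadMe.txt.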
    for line in get_data("ReadMe.txt"):
        m = re.search(r'Version\s+(\d+)\.(\d+)\.(\d+)', line)
        if m is not None:
            return int(m.group(1)), int(m.group(2)), int(m.group(3))
    raise ValueError('Could not find Unicode Version')


# Map of class names to set of codepoints in class
class_maps: Dict[str, Set[int]] = {}
all_symbols: Set[int] = set()
name_map: Dict[int, str] = {}
word_search_map: DefaultDict[str, Set[int]] = defaultdict(set)
soft_hyphen = 0xad
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
# See https://github.com/harfbuzz/harfbuzz/issues/169
marks = set(emoji_skin_tone_modifiers) | flag_codepoints
not_assigned = set(range(0, sys.maxunicode))
property_maps: Dict[str, Set[int]] = defaultdict(set)


def parse_prop_list() -> None:
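    # Collect the codepoints for every property in PropList.txt; the
    # default ignorable ones are folded into marks so they are treated
    # as zero width.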
    global marks
    for line in get_data('ucd/PropList.txt'):
        if line.startswith('#'):
            continue
        cp_or_range, rest = line.split(';', 1)
        chars = parse_range_spec(cp_or_range.strip())
        name = rest.strip().split()[0]
        property_maps[name] |= chars
    # see https://www.unicode.org/faq/unsup_char.html#3
    marks |= property_maps['Other_Default_Ignorable_Code_Point']


def parse_ucd() -> None:
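    # Populate the global name, word-search, category, mark and symbol
    # tables from UnicodeData.txt and the nerd-fonts glyph list.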

    def add_word(w: str, c: int) -> None:
        if c <= 32 or c == 127 or 128 <= c <= 159:
            return
        if len(w) > 1:
            word_search_map[w.lower()].add(c)

    first: Optional[int] = None
    for word, c in html5.items():
        if len(c) == 1:
            add_word(word.rstrip(';'), ord(c))
    word_search_map['nnbsp'].add(0x202f)
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        name = parts[1] or parts[10]
        if name == '<control>':
            name = parts[10]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints: Union[Tuple[int, ...], Iterable[int]] = (codepoint,)
        if first is None:
            if desc.endswith(', First>'):
                first = codepoint
                continue
        else:
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            if category.startswith('M'):
                marks.add(codepoint)
            elif category.startswith('S'):
                all_symbols.add(codepoint)
            elif category == 'Cf':
                # we add Cf to marks as it contains things like tags and zero
                # width chars. Not sure if *all* of Cf should be treated as
                # combining chars, might need to add individual exceptions in
                # the future.
                marks.add(codepoint)

    with open('nerd-fonts-glyphs.txt') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            code, category, name = line.split(' ', 2)
            codepoint = int(code, 16)
            if name and codepoint not in name_map:
                name_map[codepoint] = name.upper()
                for word in name.lower().split():
                    add_word(word, codepoint)

    # Some common synonyms
    word_search_map['bee'] |= word_search_map['honeybee']
    word_search_map['lambda'] |= word_search_map['lamda']
    word_search_map['lamda'] |= word_search_map['lambda']
    word_search_map['diamond'] |= word_search_map['gem']


def parse_range_spec(spec: str) -> Set[int]:
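    # Parse a UCD codepoint spec, either a single hex value or an
    # "XXXX..YYYY" inclusive range, into a set of codepoints.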
    spec = spec.strip()
    if '..' in spec:
        chars_ = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
        chars = set(range(chars_[0], chars_[1] + 1))
    else:
        chars = {int(spec, 16)}
    return chars


def split_two(line: str) -> Tuple[Set[int], str]:
    spec, rest = line.split(';', 1)
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    return parse_range_spec(spec), rest


all_emoji: Set[int] = set()
emoji_presentation_bases: Set[int] = set()
narrow_emoji: Set[int] = set()
wide_emoji: Set[int] = set()
flags: Dict[int, List[int]] = {}


def parse_basic_emoji(spec: str) -> None:
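    # An entry with a single codepoint (no trailing FE0F) has emoji
    # presentation by default and is therefore wide, otherwise narrow.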
    parts = list(filter(None, spec.split()))
    has_emoji_presentation = len(parts) < 2
    chars = parse_range_spec(parts[0])
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    (wide_emoji if has_emoji_presentation else narrow_emoji).update(chars)


def parse_keycap_sequence(spec: str) -> None:
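    # Keycap sequences have the form: base FE0F 20E3. Only the base
    # codepoint matters for the width tables.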
    base, fe0f, cc = list(filter(None, spec.split()))
    chars = parse_range_spec(base)
    all_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    narrow_emoji.update(chars)


def parse_flag_emoji_sequence(spec: str) -> None:
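    # RGI flag sequences are pairs of regional indicator letters; the
    # second letter is recorded against the first in the flags map.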
    a, b = list(filter(None, spec.split()))
    left, right = int(a, 16), int(b, 16)
    chars = {left, right}
    all_emoji.update(chars)
    wide_emoji.update(chars)
    emoji_presentation_bases.update(chars)
    flags.setdefault(left, []).append(right)


def parse_emoji_tag_sequence(spec: str) -> None:
    a = int(spec.split()[0], 16)
    all_emoji.add(a)
    wide_emoji.add(a)
    emoji_presentation_bases.add(a)


def parse_emoji_modifier_sequence(spec: str) -> None:
    a, b = list(filter(None, spec.split()))
    char, mod = int(a, 16), int(b, 16)
    del mod  # unused, the skin tone modifiers are already in marks
    all_emoji.add(char)
    wide_emoji.add(char)
    emoji_presentation_bases.add(char)


def parse_emoji() -> None:
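    # Dispatch every entry in emoji-sequences.txt to the parser for its
    # sequence type.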
    for line in get_data('emoji-sequences.txt', 'emoji'):
        parts = [x.strip() for x in line.split(';')]
        if len(parts) < 2:
            continue
        data, etype = parts[:2]
        if etype == 'Basic_Emoji':
            parse_basic_emoji(data)
        elif etype == 'Emoji_Keycap_Sequence':
            parse_keycap_sequence(data)
        elif etype == 'RGI_Emoji_Flag_Sequence':
            parse_flag_emoji_sequence(data)
        elif etype == 'RGI_Emoji_Tag_Sequence':
            parse_emoji_tag_sequence(data)
        elif etype == 'RGI_Emoji_Modifier_Sequence':
            parse_emoji_modifier_sequence(data)


doublewidth: Set[int] = set()
ambiguous: Set[int] = set()


def parse_eaw() -> None:
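    # W (wide) and F (fullwidth) are double width, A is ambiguous. The CJK
    # ranges below default to wide even for codepoints not listed in the
    # data file.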
    global doublewidth, ambiguous
    seen: Set[int] = set()
    for line in get_data('ucd/EastAsianWidth.txt'):
        chars, eaw = split_two(line)
        if eaw == 'A':
            ambiguous |= chars
            seen |= chars
        elif eaw in ('W', 'F'):
            doublewidth |= chars
            seen |= chars
    doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
    doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
    doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
    doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
    doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen


def get_ranges(items: List[int]) -> Generator[Union[int, Tuple[int, int]], None, None]:
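    # Collapse a list of codepoints into single values and contiguous
    # (start, end) runs, e.g. [1, 2, 3, 7] yields (1, 3) then 7.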
    items.sort()
    for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
        group = tuple(map(itemgetter(1), g))
        a, b = group[0], group[-1]
        if a == b:
            yield a
        else:
            yield a, b


def write_case(spec: Union[Tuple[int, ...], int], p: Callable[..., None], for_go: bool = False) -> None:
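    # C output can use the GCC "case a ... b:" range extension; Go has no
    # case ranges, so every value in the range is spelled out.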
    if isinstance(spec, tuple):
        if for_go:
            v = ', '.join(f'0x{x:x}' for x in range(spec[0], spec[1] + 1))
            p(f'\t\tcase {v}:')
        else:
            p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
    else:
        p(f'\t\tcase 0x{spec:x}:')


@contextmanager
def create_header(path: str, include_data_types: bool = True) -> Generator[Callable[..., None], None, None]:
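    # Yield a print function that writes to the generated file, wrapped in
    # the standard generated-code preamble and footer.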
    with open(path, 'w') as f:
        p = partial(print, file=f)
        p('// Unicode data, built from the Unicode Standard', '.'.join(map(str, unicode_version())))
        p(f'// Code generated by {os.path.basename(__file__)}, DO NOT EDIT.', end='\n\n')
        if path.endswith('.h'):
            p('#pragma once')
        if include_data_types:
            p('#include "data-types.h"\n')
            p('START_ALLOW_CASE_RANGE')
        p()
        yield p
        p()
        if include_data_types:
            p('END_ALLOW_CASE_RANGE')


def gen_emoji() -> None:
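    # Generate kitty/emoji.h with the is_emoji() and is_symbol() tests.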
    with create_header('kitty/emoji.h') as p:
        p('static inline bool\nis_emoji(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(all_emoji)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false;\n}')

        p('static inline bool\nis_symbol(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(all_symbols)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false;\n}')


def category_test(
    name: str,
    p: Callable[..., None],
    classes: Iterable[str],
    comment: str,
    use_static: bool = False,
    extra_chars: Union[FrozenSet[int], Set[int]] = frozenset(),
    exclude: Union[Set[int], FrozenSet[int]] = frozenset(),
    least_check_return: Optional[str] = None,
    ascii_range: Optional[str] = None
) -> None:
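    # Write a C predicate returning true for all codepoints in the given
    # general categories, plus extra_chars, minus exclude.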
    static = 'static inline ' if use_static else ''
    chars: Set[int] = set()
    for c in classes:
        chars |= class_maps[c]
    chars |= extra_chars
    chars -= exclude
    p(f'{static}bool\n{name}(char_type code) {{')
    p(f'\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
    if least_check_return is not None:
        least = min(chars)
        p(f'\tif (LIKELY(code < {least})) return {least_check_return};')
    if ascii_range is not None:
        p(f'\tif (LIKELY(0x20 <= code && code <= 0x7e)) return {ascii_range};')
    p('\tswitch(code) {')
    for spec in get_ranges(list(chars)):
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t} // }}}\n')
    p('\treturn false;\n}\n')


def codepoint_to_mark_map(p: Callable[..., None], mark_map: List[int]) -> Dict[int, int]:
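    # Emit the codepoint -> mark-number switch and return the reverse map,
    # used below to look up the VS15/VS16 mark numbers.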
    p('\tswitch(c) { // {{{')
    rmap = {c: m for m, c in enumerate(mark_map)}
    for spec in get_ranges(mark_map):
        if isinstance(spec, tuple):
            s = rmap[spec[0]]
            cases = ' '.join(f'case {i}:' for i in range(spec[0], spec[1]+1))
            p(f'\t\t{cases} return {s} + c - {spec[0]};')
        else:
            p(f'\t\tcase {spec}: return {rmap[spec]};')
    p('default: return 0;')
    p('\t} // }}}')
    return rmap


def classes_to_regex(classes: Iterable[str], exclude: str = '') -> Iterable[str]:
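    # Yield escaped characters and ranges for a regex character class
    # covering the given categories.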
    chars: Set[int] = set()
    for c in classes:
        chars |= class_maps[c]
    for x in map(ord, exclude):
        chars.discard(x)

    def as_string(codepoint: int) -> str:
        if codepoint < 256:
            return fr'\x{codepoint:02x}'
        if codepoint <= 0xffff:
            return fr'\u{codepoint:04x}'
        return fr'\U{codepoint:08x}'

    for spec in get_ranges(list(chars)):
        if isinstance(spec, tuple):
            yield '{}-{}'.format(*map(as_string, (spec[0], spec[1])))
        else:
            yield as_string(spec)


def gen_ucd() -> None:
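    # Generate kitty/unicode-data.c, patch the VS15/VS16 mark numbers into
    # unicode-data.h and refresh the URL delimiter class used by the hints
    # kitten.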
    cz = {c for c in class_maps if c[0] in 'CZ'}
    with create_header('kitty/unicode-data.c') as p:
        p('#include "unicode-data.h"')
        category_test(
            'is_combining_char', p,
            (),
            'Combining and default ignored characters',
            extra_chars=marks,
            least_check_return='false'
        )
        category_test(
            'is_ignored_char', p, 'Cc Cs'.split(),
            'Control characters and non-characters',
            extra_chars=non_characters,
            ascii_range='false'
        )
        category_test(
            'is_non_rendered_char', p, 'Cc Cs Cf'.split(),
            'Other_Default_Ignorable_Code_Point and soft hyphen',
            extra_chars=property_maps['Other_Default_Ignorable_Code_Point'] | set(range(0xfe00, 0xfe0f + 1)),
            ascii_range='false'
        )
        category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
        category_test('is_CZ_category', p, cz, 'C and Z categories')
        category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)')
        mark_map = [0] + list(sorted(marks))
        p('char_type codepoint_for_mark(combining_type m) {')
        p(f'\tstatic char_type map[{len(mark_map)}] =', '{', ', '.join(map(str, mark_map)), '}; // {{{ mapping }}}')
        p('\tif (m < arraysz(map)) return map[m];')
        p('\treturn 0;')
        p('}\n')
        p('combining_type mark_for_codepoint(char_type c) {')
        rmap = codepoint_to_mark_map(p, mark_map)
        p('}\n')
    with open('kitty/unicode-data.h', 'r+') as f:
        raw = f.read()
        f.seek(0)
        raw, num = re.subn(
            r'^// START_KNOWN_MARKS.+?^// END_KNOWN_MARKS',
            '// START_KNOWN_MARKS\nstatic const combining_type '
            f'VS15 = {rmap[0xfe0e]}, VS16 = {rmap[0xfe0f]};'
            '\n// END_KNOWN_MARKS', raw, flags=re.MULTILINE | re.DOTALL)
        if not num:
            raise SystemExit('Failed to patch mark definitions in unicode-data.h')
        f.truncate()
        f.write(raw)

    with open('kittens/hints/url_regex.py', 'w') as f:
        f.write('# generated by gen-wcwidth.py, do not edit\n\n')
        f.write("url_delimiters = '{}' # noqa".format(''.join(classes_to_regex(cz, exclude='\n\r'))))


def gen_names() -> None:
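    # Write the codepoint name and alias table, consumed by the Go code in
    # tools/unicode_names.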
    aliases_map: Dict[int, Set[str]] = {}
    for word, codepoints in word_search_map.items():
        for cp in codepoints:
            aliases_map.setdefault(cp, set()).add(word)
    if len(name_map) > 0xffff:
        raise Exception('Too many named codepoints')
    with open('tools/unicode_names/names.txt', 'w') as f:
        print(len(name_map), len(word_search_map), file=f)
        for cp in sorted(name_map):
            name = name_map[cp]
            words = name.lower().split()
            aliases = aliases_map.get(cp, set()) - set(words)
            end = '\n'
            if aliases:
                end = '\t' + ' '.join(sorted(aliases)) + end
            print(cp, *words, end=end, file=f)


def gen_wcwidth() -> None:
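    # Generate the C wcwidth_std() and Go Runewidth() functions. Negative
    # return values are sentinels, presumably resolved by the caller:
    # -1 non-printing, -2 East Asian ambiguous, -3 private use,
    # -4 unassigned.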
    seen: Set[int] = set()
    non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']

    def add(p: Callable[..., None], comment: str, chars_: Union[Set[int], FrozenSet[int]], ret: int, for_go: bool = False) -> None:
        chars = chars_ - seen
        seen.update(chars)
        p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{' '{')
        for spec in get_ranges(list(chars)):
            write_case(spec, p, for_go)
            p(f'\t\t\treturn {ret};')
        p('\t\t// }}}\n')

    def add_all(p: Callable[..., None], for_go: bool = False) -> None:
        seen.clear()
        add(p, 'Flags', flag_codepoints, 2, for_go)
        add(p, 'Marks', marks | {0}, 0, for_go)
        add(p, 'Non-printing characters', non_printing, -1, for_go)
        add(p, 'Private use', class_maps['Co'], -3, for_go)
        add(p, 'Text Presentation', narrow_emoji, 1, for_go)
        add(p, 'East Asian ambiguous width', ambiguous, -2, for_go)
        add(p, 'East Asian double width', doublewidth, 2, for_go)
        add(p, 'Emoji Presentation', wide_emoji, 2, for_go)
        add(p, 'Not assigned in the unicode character database', not_assigned, -4, for_go)
        p('\t\tdefault:\n\t\t\treturn 1;')
        p('\t}')
        if for_go:
            p('\t}')
        else:
            p('\treturn 1;\n}')

    with create_header('kitty/wcwidth-std.h') as p, open('tools/wcswidth/std.go', 'w') as gof:
        gop = partial(print, file=gof)
        gop('package wcswidth\n\n')
        gop('func Runewidth(code rune) int {')
        p('static inline int\nwcwidth_std(int32_t code) {')
        p('\tif (LIKELY(0x20 <= code && code <= 0x7e)) { return 1; }')
        p('\tswitch(code) {')
        gop('\tswitch(code) {')
        add_all(p)
        add_all(gop, True)

        p('static inline bool\nis_emoji_presentation_base(uint32_t code) {')
        gop('func IsEmojiPresentationBase(code rune) bool {')
        p('\tswitch(code) {')
        gop('\tswitch(code) {')
        for spec in get_ranges(list(emoji_presentation_bases)):
            write_case(spec, p)
            write_case(spec, gop, for_go=True)
            p('\t\t\treturn true;')
            gop('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        gop('\t\tdefault:\n\t\t\treturn false')
        gop('\t}')
        p('\treturn true;\n}')
        gop('\n}')
        uv = unicode_version()
        p(f'#define UNICODE_MAJOR_VERSION {uv[0]}')
        p(f'#define UNICODE_MINOR_VERSION {uv[1]}')
        p(f'#define UNICODE_PATCH_VERSION {uv[2]}')
        gop('var UnicodeDatabaseVersion [3]int = [3]int{' f'{uv[0]}, {uv[1]}, {uv[2]}' + '}')
    # gofmt must run after the Go file has been closed and flushed
    subprocess.check_call(['gofmt', '-w', '-s', gof.name])


parse_ucd()
parse_prop_list()
parse_emoji()
parse_eaw()
gen_ucd()
gen_wcwidth()
gen_emoji()
gen_names()