mirror of
https://github.com/microsoft/pyright.git
synced 2024-10-27 03:12:44 +03:00
206 lines
7.5 KiB
Python
206 lines
7.5 KiB
Python
# generateUnicodeTables.py
|
|
# Copyright (c) Microsoft Corporation.
|
|
# Licensed under the MIT license.
|
|
#
|
|
# Generates the content of unicode.ts based on the official Unicode
|
|
# character database.
|
|
|
|
import sys
|
|
import urllib.request
|
|
from io import TextIOWrapper
|
|
|
|
|
|
class Character:
|
|
def __init__(self, code: int, category: str, *, end: int | None = None):
|
|
self.code = code
|
|
self.category = category
|
|
|
|
self.hasSurrogate = code > 0xFFFF
|
|
if self.hasSurrogate:
|
|
unicodeChar = chr(code)
|
|
utf16 = unicodeChar.encode("utf-16")
|
|
rawHex = utf16.hex()
|
|
hex = rawHex[4:]
|
|
|
|
self.highSurrogate = int(hex[2:4] + hex[0:2], base=16)
|
|
self.lowSurrogate = int(hex[6:8] + hex[4:6], base=16)
|
|
|
|
|
|
class CharacterRange:
|
|
def __init__(self, start: Character, end: Character):
|
|
self.start = start
|
|
self.end = end
|
|
|
|
|
|
def downloadUnicodeData(unicodeVersion: str) -> str:
|
|
url = f"https://www.unicode.org/Public/{unicodeVersion}.0/ucd/UnicodeData.txt"
|
|
(path, _) = urllib.request.urlretrieve(url)
|
|
return path
|
|
|
|
|
|
def parseFile(filePath: str) -> list[Character]:
|
|
with open(filePath, "r") as reader:
|
|
lines = reader.readlines()
|
|
chars: list[Character] = []
|
|
for i in range(len(lines)):
|
|
line = lines[i]
|
|
splitOnSemicolon = line.split(";")
|
|
charCode = int(splitOnSemicolon[0], base=16)
|
|
category = splitOnSemicolon[2]
|
|
|
|
if splitOnSemicolon[1].endswith(", First>"):
|
|
# Legacy range syntax
|
|
# D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
|
|
# DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
|
|
nextLine = lines[i + 1]
|
|
nextSplitOnSemicolon = nextLine.split(";")
|
|
nextCharCode = int(nextSplitOnSemicolon[0], base=16)
|
|
for ord in range(charCode, nextCharCode + 1):
|
|
chars.append(Character(ord, category))
|
|
elif splitOnSemicolon[1].endswith(", Last>"):
|
|
continue
|
|
else:
|
|
chars.append(Character(charCode, category))
|
|
|
|
return chars
|
|
|
|
|
|
# Given a collection of characters, returns a list of ranges of contiguous
|
|
# characters. Contiguous means that the character codes are sequential with
|
|
# no gaps and the characters all have the same category. For character codes
|
|
# greater than 0xFFFF, contiguous means that the high surrogate is the same
|
|
# and the low surrogate values are sequential with no gaps. So, two charcter
|
|
# codes might be sequential numerically but have different high surrogates,
|
|
# and therefore would not be members of the same range.
|
|
def getSurrogateRanges(chars: list[Character]) -> list[CharacterRange]:
|
|
surrogateRanges: list[CharacterRange] = []
|
|
|
|
consecutiveRangeStartChar: Character | None = None
|
|
previousChar: Character | None = None
|
|
for char in chars:
|
|
if not consecutiveRangeStartChar:
|
|
consecutiveRangeStartChar = char
|
|
|
|
if previousChar:
|
|
if not previousChar.hasSurrogate and not char.hasSurrogate:
|
|
if (
|
|
char.code == previousChar.code + 1
|
|
and char.category == previousChar.category
|
|
):
|
|
pass
|
|
elif not previousChar.hasSurrogate and char.hasSurrogate:
|
|
consecutiveRangeStartChar = char
|
|
else:
|
|
if (
|
|
char.highSurrogate == previousChar.highSurrogate
|
|
and char.lowSurrogate == previousChar.lowSurrogate + 1
|
|
and char.category == previousChar.category
|
|
):
|
|
pass
|
|
else:
|
|
surrogateRanges.append(
|
|
CharacterRange(consecutiveRangeStartChar, previousChar)
|
|
)
|
|
consecutiveRangeStartChar = char
|
|
|
|
previousChar = char
|
|
|
|
return surrogateRanges
|
|
|
|
|
|
# Write out a table of all character codes within the specified category. These are
|
|
# the full hex character codes (Unicode code points) not surrogate values. Sequential
|
|
# ranges of character codes are written as arrays of two numbers (start and end) to
|
|
# save space.
|
|
def writeRangeTable(writer: TextIOWrapper, category: str, chars: list[Character]):
|
|
chars = [ch for ch in chars if ch.category == category]
|
|
|
|
writer.write(f"export const unicode{category}: UnicodeRangeTable = [\n")
|
|
|
|
consecutiveRangeStartChar: Character | None = None
|
|
for i in range(len(chars)):
|
|
char = chars[i]
|
|
|
|
if not consecutiveRangeStartChar:
|
|
consecutiveRangeStartChar = char
|
|
|
|
if i + 1 >= len(chars) or chars[i + 1].code != char.code + 1:
|
|
if consecutiveRangeStartChar.code == char.code:
|
|
writer.write(f" 0x{consecutiveRangeStartChar.code:04X},\n")
|
|
else:
|
|
writer.write(f" [0x{consecutiveRangeStartChar.code:04X}, 0x{char.code:04X}],\n")
|
|
|
|
consecutiveRangeStartChar = None
|
|
|
|
writer.write("];\n\n")
|
|
|
|
|
|
# Write out a table of all characters within the specified category using their UTF-16
|
|
# values. Characters are grouped by high surrogate value. Sequential ranges of low
|
|
# surrogate values are written as arrays of two numbers (start and end) to save space.
|
|
def writeSurrogateRangeTable(
|
|
writer: TextIOWrapper, category: str, surrogateRanges: list[CharacterRange]
|
|
):
|
|
surrogateRanges = [r for r in surrogateRanges if r.start.category == category]
|
|
|
|
if len(surrogateRanges) == 0:
|
|
return
|
|
|
|
writer.write(
|
|
f"export const unicode{category}Surrogate: UnicodeSurrogateRangeTable = {{\n"
|
|
)
|
|
|
|
previousCharRange: CharacterRange | None = None
|
|
for charRange in surrogateRanges:
|
|
if (
|
|
previousCharRange
|
|
and charRange.start.highSurrogate != previousCharRange.start.highSurrogate
|
|
):
|
|
writer.write(" ],\n")
|
|
previousCharRange = None
|
|
|
|
if not previousCharRange:
|
|
writer.write(f" 0x{charRange.start.highSurrogate:04X}: [\n")
|
|
previousCharRange = charRange
|
|
|
|
if charRange.start.lowSurrogate == charRange.end.lowSurrogate:
|
|
writer.write(f" 0x{charRange.start.lowSurrogate:04X}, // 0x{charRange.start.code:04X}\n")
|
|
else:
|
|
writer.write(
|
|
f" [0x{charRange.start.lowSurrogate:04X}, 0x{charRange.end.lowSurrogate:04X}], // 0x{charRange.start.code:04X}..0x{charRange.end.code:04X}\n"
|
|
)
|
|
|
|
writer.write(" ],\n")
|
|
writer.write("};\n\n")
|
|
|
|
|
|
unicodeVersion = "15.1" if len(sys.argv) <= 1 else sys.argv[1]
|
|
path = downloadUnicodeData(unicodeVersion)
|
|
chars = parseFile(path)
|
|
surrogateRanges = getSurrogateRanges(chars)
|
|
|
|
with open("packages/pyright-internal/src/parser/unicode.ts", "w") as writer:
|
|
writer.write(
|
|
f"""/*
|
|
* unicode.ts
|
|
* Copyright (c) Microsoft Corporation.
|
|
* Licensed under the MIT license.
|
|
*
|
|
* Tables that encode Unicode character codes for various Unicode-
|
|
* defined categories used in the Python spec.
|
|
*
|
|
* Generated by build/generateUnicodeTables.py from the UnicodeData.txt
|
|
* metadata file for Unicode {unicodeVersion}.
|
|
*/
|
|
|
|
export type UnicodeRange = [number, number] | number;
|
|
export type UnicodeRangeTable = UnicodeRange[];
|
|
export type UnicodeSurrogateRangeTable = {{ [surrogate: number]: UnicodeRange[] }};
|
|
|
|
"""
|
|
)
|
|
|
|
for category in ["Lu", "Ll", "Lt", "Lo", "Lm", "Nl", "Mn", "Mc", "Nd", "Pc"]:
|
|
writeRangeTable(writer, category, chars)
|
|
writeSurrogateRangeTable(writer, category, surrogateRanges)
|