Added support for identifiers containing Unicode characters that are encoded as two UTF-16 code units (a surrogate pair). This allows identifiers to use characters from Unicode blocks such as Egyptian Hieroglyphs, Linear B Ideograms, Cuneiform, and Phoenician.

This commit is contained in:
Eric Traut 2021-05-15 16:30:34 -07:00
parent 1e64db5e62
commit 4bf6de4755
5 changed files with 369 additions and 20 deletions

View File

@ -23,6 +23,10 @@ enum CharCategory {
// Character can appear only within identifier, not at beginning
IdentifierChar = 2,
// Character is a surrogate, meaning that an additional character
// needs to be consulted.
SurrogateChar = 3,
}
// Table of first 256 character codes (the most common cases).
@ -30,14 +34,18 @@ const _identifierCharFastTableSize = 256;
const _identifierCharFastTable: CharCategory[] = new Array(_identifierCharFastTableSize);
// Map of remaining characters that can appear within identifier.
const _identifierCharMap: { [code: number]: CharCategory } = {};
type CharCategoryMap = { [code: number]: CharCategory };
const _identifierCharMap: CharCategoryMap = {};
// Secondary character map based on the primary (surrogate) character.
const _surrogateCharMap: { [code: number]: CharCategoryMap } = {};
// We do lazy initialization of this map because it's rarely used.
let _identifierCharMapInitialized = false;
export function isIdentifierStartChar(ch: number) {
if (ch < _identifierCharFastTableSize) {
return _identifierCharFastTable[ch] === CharCategory.StartIdentifierChar;
export function isIdentifierStartChar(char: number, nextChar?: number) {
if (char < _identifierCharFastTableSize) {
return _identifierCharFastTable[char] === CharCategory.StartIdentifierChar;
}
// Lazy initialize the char map. We'll rarely get here.
@ -46,14 +54,21 @@ export function isIdentifierStartChar(ch: number) {
_identifierCharMapInitialized = true;
}
return _identifierCharMap[ch] === CharCategory.StartIdentifierChar;
let charCategory: CharCategory;
if (nextChar !== undefined) {
charCategory = _lookUpSurrogate(char, nextChar);
} else {
charCategory = _identifierCharMap[char];
}
return charCategory === CharCategory.StartIdentifierChar;
}
export function isIdentifierChar(ch: number) {
if (ch < _identifierCharFastTableSize) {
export function isIdentifierChar(char: number, nextChar?: number) {
if (char < _identifierCharFastTableSize) {
return (
_identifierCharFastTable[ch] === CharCategory.StartIdentifierChar ||
_identifierCharFastTable[ch] === CharCategory.IdentifierChar
_identifierCharFastTable[char] === CharCategory.StartIdentifierChar ||
_identifierCharFastTable[char] === CharCategory.IdentifierChar
);
}
@ -63,9 +78,13 @@ export function isIdentifierChar(ch: number) {
_identifierCharMapInitialized = true;
}
if (nextChar !== undefined) {
return _lookUpSurrogate(char, nextChar);
}
return (
_identifierCharMap[ch] === CharCategory.StartIdentifierChar ||
_identifierCharMap[ch] === CharCategory.IdentifierChar
_identifierCharMap[char] === CharCategory.StartIdentifierChar ||
_identifierCharMap[char] === CharCategory.IdentifierChar
);
}
@ -97,6 +116,19 @@ export function isBinary(ch: number): boolean {
return ch === Char._0 || ch === Char._1 || ch === Char.Underscore;
}
// Classifies a character that is represented by two character codes.
//
// char: the first character code of the pair. It must be recorded in
//     _identifierCharMap as CharCategory.SurrogateChar; otherwise the
//     pair is rejected.
// nextChar: the second character code of the pair, looked up in the
//     secondary table that _surrogateCharMap holds for char.
//
// Returns the category stored for the pair, or
// CharCategory.NotIdentifierChar when either code is not registered.
function _lookUpSurrogate(char: number, nextChar: number): CharCategory {
    if (_identifierCharMap[char] !== CharCategory.SurrogateChar) {
        return CharCategory.NotIdentifierChar;
    }

    const surrogateTable = _surrogateCharMap[char];
    if (!surrogateTable) {
        return CharCategory.NotIdentifierChar;
    }

    // Indexing the table with a code that has no entry yields undefined at
    // runtime even though the index signature is typed as CharCategory.
    // Normalize that case so that callers always receive a real category.
    const category = surrogateTable[nextChar];
    return category === undefined ? CharCategory.NotIdentifierChar : category;
}
// Underscore is explicitly allowed to start an identifier.
// Characters with the Other_ID_Start property.
const _specialStartIdentifierChars: unicode.UnicodeRangeTable = [
@ -135,7 +167,9 @@ const _identifierCharRanges = [
function _buildIdentifierLookupTableFromUnicodeRangeTable(
table: unicode.UnicodeRangeTable,
category: CharCategory,
fastTableOnly: boolean
fastTableOnly: boolean,
fastTable: CharCategoryMap,
fullTable: CharCategoryMap
) {
for (let entryIndex = 0; entryIndex < table.length; entryIndex++) {
const entry = table[entryIndex];
@ -151,9 +185,9 @@ function _buildIdentifierLookupTableFromUnicodeRangeTable(
for (let i = rangeStart; i <= rangeEnd; i++) {
if (i < _identifierCharFastTableSize) {
_identifierCharFastTable[i] = category;
fastTable[i] = category;
} else {
_identifierCharMap[i] = category;
fullTable[i] = category;
}
}
@ -168,12 +202,41 @@ function _buildIdentifierLookupTable(fastTableOnly: boolean) {
_identifierCharFastTable.fill(CharCategory.NotIdentifierChar);
_identifierCharRanges.forEach((table) => {
_buildIdentifierLookupTableFromUnicodeRangeTable(table, CharCategory.IdentifierChar, fastTableOnly);
_buildIdentifierLookupTableFromUnicodeRangeTable(
table,
CharCategory.IdentifierChar,
fastTableOnly,
_identifierCharFastTable,
_identifierCharMap
);
});
_startIdentifierCharRanges.forEach((table) => {
_buildIdentifierLookupTableFromUnicodeRangeTable(table, CharCategory.StartIdentifierChar, fastTableOnly);
_buildIdentifierLookupTableFromUnicodeRangeTable(
table,
CharCategory.StartIdentifierChar,
fastTableOnly,
_identifierCharFastTable,
_identifierCharMap
);
});
// Populate the surrogate tables for characters that require two
// character codes.
if (!fastTableOnly) {
for (const surrogateChar in unicode.unicodeLoSurrogate) {
_surrogateCharMap[surrogateChar] = {};
_identifierCharMap[surrogateChar] = CharCategory.SurrogateChar;
_buildIdentifierLookupTableFromUnicodeRangeTable(
unicode.unicodeLoSurrogate[surrogateChar],
CharCategory.StartIdentifierChar,
fastTableOnly,
_surrogateCharMap[surrogateChar],
_surrogateCharMap[surrogateChar]
);
}
}
}
_buildIdentifierLookupTable(true);

View File

@ -624,13 +624,29 @@ export class Tokenizer {
}
private _tryIdentifier(): boolean {
const swallowRemainingChars = () => {
    // Advance past every remaining identifier character. On each pass the
    // current character is tested on its own first; only when that fails
    // is it tested together with the following character as a
    // two-character sequence.
    for (;;) {
        const isSingleChar = isIdentifierChar(this._cs.currentChar);
        if (!isSingleChar && !isIdentifierChar(this._cs.currentChar, this._cs.nextChar)) {
            return;
        }

        this._cs.moveNext();
        if (!isSingleChar) {
            // A two-character sequence occupies a second position.
            this._cs.moveNext();
        }
    }
};
const start = this._cs.position;
if (isIdentifierStartChar(this._cs.currentChar)) {
this._cs.moveNext();
while (isIdentifierChar(this._cs.currentChar)) {
this._cs.moveNext();
}
swallowRemainingChars();
} else if (isIdentifierStartChar(this._cs.currentChar, this._cs.nextChar)) {
this._cs.moveNext();
this._cs.moveNext();
swallowRemainingChars();
}
if (this._cs.position > start) {
const value = this._cs.getText().substr(start, this._cs.position - start);
if (_keywords[value] !== undefined) {

View File

@ -10,7 +10,9 @@
* in a much more verbose form.
*/
export type UnicodeRangeTable = ([number, number] | number)[];
export type UnicodeRange = [number, number] | number;
export type UnicodeRangeTable = UnicodeRange[];
export type UnicodeSurrogateRangeTable = { [surrogate: number]: UnicodeRange[] };
export const unicodeLu: UnicodeRangeTable = [
[65, 90],
@ -1753,6 +1755,237 @@ export const unicodeLo: UnicodeRangeTable = [
[194560, 195101],
];
// Table of characters that need two UTF-16 code units. Each key is the
// first code unit of a pair (the keys fall in 55296..56319, i.e.
// 0xD800..0xDBFF) and each value lists the second code units (all within
// 56320..57343, i.e. 0xDC00..0xDFFF) that are accepted after that key,
// either as an inclusive [start, end] range or as a single number.
// Presumably these are the supplementary-plane members of the "Lo"
// category, mirroring unicodeLo -- confirm against the table generator.
//
// NOTE(review): a few entries look like only the first and last code
// points of a longer range, with nothing listed in between:
//   - 55324: [56320] and 55329: [57335] decode to U+17000 and U+187F7,
//     and keys 55325..55328 are absent.
//   - 55360: [56320] through 55418: [57312] decode to U+20000, U+2A6DD,
//     U+2A700, U+2B734, U+2B740, U+2B81D, U+2B820, U+2CEA1, U+2CEB0 and
//     U+2EBE0, with no keys between 55360 and 55401 and no ranges
//     connecting the pairs.
//   - 55424: [56320] and 55428: [57162] decode to U+30000 and U+3134A.
//   - 55331 lists [56576, 56576] and [56584, 56584] (U+18D00, U+18D08)
//     rather than one range.
// If these were meant to be ranges, the characters between the end points
// are not recognized. TODO: verify against the Unicode data and the
// script that produced this table.
export const unicodeLoSurrogate: UnicodeSurrogateRangeTable = {
55296: [
[56320, 56331],
[56333, 56358],
[56360, 56378],
[56380, 56381],
[56383, 56397],
[56400, 56413],
[56448, 56570],
[56960, 56988],
[56992, 57040],
[57088, 57119],
[57133, 57152],
[57154, 57161],
[57168, 57205],
[57216, 57245],
[57248, 57283],
[57288, 57295],
],
55297: [
[56400, 56477],
[56576, 56615],
[56624, 56675],
[56832, 57142],
[57152, 57173],
[57184, 57191],
],
55298: [
[56320, 56325],
[56328, 56328],
[56330, 56373],
[56375, 56376],
[56380, 56380],
[56383, 56405],
[56416, 56438],
[56448, 56478],
[56544, 56562],
[56564, 56565],
[56576, 56597],
[56608, 56633],
[56704, 56759],
[56766, 56767],
[56832, 56832],
[56848, 56851],
[56853, 56855],
[56857, 56885],
[56928, 56956],
[56960, 56988],
[57024, 57031],
[57033, 57060],
[57088, 57141],
[57152, 57173],
[57184, 57202],
[57216, 57233],
],
55299: [
[56320, 56392],
[56576, 56611],
[56960, 57001],
[57008, 57009],
[57088, 57116],
57127,
[57136, 57157],
[57264, 57284],
[57312, 57334],
],
55300: [
[56323, 56375],
[56451, 56495],
[56528, 56552],
[56579, 56614],
56644,
56647,
[56656, 56690],
56694,
[56707, 56754],
[56769, 56772],
56794,
56796,
[56832, 56849],
[56851, 56875],
[56960, 56966],
56968,
[56970, 56973],
[56975, 56989],
[56991, 57000],
[57008, 57054],
[57093, 57100],
[57103, 57104],
[57107, 57128],
[57130, 57136],
[57138, 57139],
[57141, 57145],
57149,
57168,
[57181, 57185],
],
55301: [
[56320, 56372],
[56391, 56394],
[56415, 56417],
[56448, 56495],
[56516, 56517],
56519,
[56704, 56750],
[56792, 56795],
[56832, 56879],
56900,
[56960, 57002],
57016,
[57088, 57114],
],
55302: [
[56320, 56363],
[56575, 56582],
56585,
[56588, 56595],
[56597, 56598],
[56600, 56623],
56639,
56641,
[56736, 56743],
[56746, 56784],
56801,
56803,
56832,
[56843, 56882],
56890,
56912,
[56924, 56969],
56989,
[57024, 57080],
],
55303: [
[56320, 56328],
[56330, 56366],
56384,
56434,
[56576, 56582],
[56584, 56585],
[56587, 56624],
56646,
[56672, 56677],
[56679, 56680],
[56682, 56713],
56728,
[57056, 57074],
57264,
],
55304: [[56320, 57241]],
55305: [[56448, 56643]],
55308: [[56320, 57343]],
55309: [[56320, 56366]],
55313: [[56320, 56902]],
55322: [
[56320, 56888],
[56896, 56926],
[57040, 57069],
[57088, 57135],
[57187, 57207],
[57213, 57231],
],
55323: [[57088, 57162], 57168],
55324: [56320],
55329: [57335],
55330: [[56320, 57343]],
55331: [
[56320, 56533],
[56576, 56576],
[56584, 56584],
],
55340: [
[56320, 56606],
[56656, 56658],
[56676, 56679],
[56688, 57083],
],
55343: [
[56320, 56426],
[56432, 56444],
[56448, 56456],
[56464, 56473],
],
55352: [[56576, 56620], 56654, [57024, 57067]],
55354: [[56320, 56516]],
55355: [
[56832, 56835],
[56837, 56863],
[56865, 56866],
56868,
56871,
[56873, 56882],
[56884, 56887],
56889,
56891,
56898,
56903,
56905,
56907,
[56909, 56911],
[56913, 56914],
56916,
56919,
56921,
56923,
56925,
56927,
[56929, 56930],
56932,
[56935, 56938],
[56940, 56946],
[56948, 56951],
[56953, 56956],
56958,
[56960, 56969],
[56971, 56987],
[56993, 56995],
[56997, 57001],
[57003, 57019],
],
55360: [56320],
55401: [57053, 57088],
55405: [57140, 57152],
55406: [56349, 56352],
55411: [56993, 57008],
55418: [57312],
55422: [[56320, 56861]],
55424: [56320],
55428: [57162],
};
export const unicodeLm: UnicodeRangeTable = [
[688, 705],
[710, 721],

View File

@ -22,6 +22,11 @@ test('BadToken1', () => {
TestUtils.validateResults(analysisResults, 1);
});
test('Unicode1', () => {
    // Analyze the Unicode identifier sample and check the result count.
    // NOTE(review): the second argument to validateResults is presumably
    // the expected number of errors -- confirm against TestUtils.
    const sampleFiles = ['unicode1.py'];
    const results = TestUtils.typeAnalyzeSampleFiles(sampleFiles);

    TestUtils.validateResults(results, 1);
});
test('CircularBaseClass', () => {
const analysisResults = TestUtils.typeAnalyzeSampleFiles(['circularBaseClass.py']);

View File

@ -0,0 +1,32 @@
# This sample tests a variety of Unicode characters, including those that
# are encoded as two UTF-16 code units (a surrogate pair).
# Old Italic
𐌎𐌘𐌟𐌁 = 42
# Egyptian hieroglyphs
𓃘𓐭𓇀𓅨𓆙 = 2
# Linear B Ideograms
𐂂𐃪𐃯 = ""
# Cuneiform
𒀟𒀕𒀰𒁜𒂐𒄊 = ""
# Old Persian
𐎠𐏊𐏏 = 3
# Lydian
𐤢𐤷𐤬𐤮 = 4
# Phoenician
𐤔𐤑𐤇 = 4
# Nabataean
𐢖𐢊ﬗ = 0
# This should generate an error because "𐢭" is outside the range of
# characters supported by the Python standard.
𐢭 = 0