Fix word separation (#3667)

* Fix word separation

* Add tests

* Add Latin-1 punctuations in printable char table

* Fix terminology

* Add more tests

* Classify keyword characters as punctuations

* Add more test

* Rename `table` to `symbolTable`
This commit is contained in:
Koki Kato 2019-04-18 01:38:29 +09:00 committed by Jason Poon
parent 6307efbb3e
commit 57cb40a434
2 changed files with 65 additions and 49 deletions

View File

@ -895,45 +895,7 @@ export class Position extends vscode.Position {
return result;
}
/**
 * Builds the global, Unicode-mode regular expression whose alternatives
 * describe the different kinds of "word" runs.
 *
 * @param characterSet Characters forwarded to `makeAsciiWordSegments` as the
 *   ASCII non-word characters.
 * @returns A new `RegExp` created with the `u` and `g` flags.
 */
private static makeUnicodeWordRegex(characterSet: string): RegExp {
  // ASCII word characters (in many cases 0-9A-Za-z_) and non-word characters.
  const asciiAlternatives = Position.makeAsciiWordSegments(characterSet);
  // Unicode characters (punctuations, ideographs, ...).
  const unicodeAlternatives = Position.makeUnicodeWordSegments();

  // Alternatives are tried left to right, so the catch-all for other spelling
  // characters (Greek, ...) and the '$^' alternative must stay at the end.
  const pattern = asciiAlternatives.concat(unicodeAlternatives, '\\S+', '$^').join('|');

  return new RegExp(pattern, 'ug');
}
/**
 * Builds the two regex alternatives that partition printable ASCII
 * (0x21-0x7e) into word characters and non-word characters.
 *
 * A word character is any printable ASCII character that does not occur in
 * `nonWordChars`.
 *
 * @param nonWordChars Characters that must not be treated as part of a word.
 * @returns `[wordSegment, nonWordSegment]`: a capturing pattern matching a run
 *   of word characters, followed by a pattern matching a run of `nonWordChars`.
 */
private static makeAsciiWordSegments(nonWordChars: string): string[] {
  const firstPrintable = 0x21;
  const lastPrintable = 0x7e;

  // Escapes text for use inside a character class. `_.escapeRegExp` leaves '-'
  // untouched, but inside `[...]` it would form a range, so escape it as well.
  const escapeForCharClass = (chars: string): string =>
    _.escapeRegExp(chars).replace(/-/g, '\\-');

  // Take the complement over printable ASCII only. Entries of `nonWordChars`
  // outside that range (whitespace, control or non-ASCII characters) simply
  // never match a candidate, so they cannot drag characters outside the
  // printable range into the word class.
  const nonWordCodes = new Set(nonWordChars.split('').map(c => c.charCodeAt(0)));
  const wordChars: string[] = [];
  for (let code = firstPrintable; code <= lastPrintable; code++) {
    if (!nonWordCodes.has(code)) {
      wordChars.push(String.fromCharCode(code));
    }
  }

  // The word characters may include regex syntax characters ('\', ']', '^',
  // '-') when the caller does not list them as non-word characters, so they
  // need the same escaping as the non-word class.
  const wordSegment = `([${escapeForCharClass(wordChars.join(''))}]+)`;
  const nonWordSegment = `[${escapeForCharClass(nonWordChars)}]+`;
  return [wordSegment, nonWordSegment];
}
private static makeUnicodeWordSegments(): string[] {
private static makeUnicodeWordRegex(keywordChars: string): RegExp {
// Distinct categories of characters
enum CharKind {
Punctuation,
@ -946,10 +908,11 @@ export class Position extends vscode.Position {
Hangul,
}
// List of printable characters (code point intervals) and their character kinds.
// Latin alphabets (e.g., ASCII alphabets and numbers, Latin-1 Supplement, European Latin) are excluded.
// Imported from utf_class_buf in src/mbyte.c of Vim.
// Spelling alphabets are not listed here since they are covered as non-white letters.
// TODO(ajalab): add Emoji
const codePointRanges: [[number, number], CharKind][] = [
const symbolTable: [[number, number], CharKind][] = [
[[0x00a1, 0x00bf], CharKind.Punctuation], // Latin-1 punctuation
[[0x037e, 0x037e], CharKind.Punctuation], // Greek question mark
[[0x0387, 0x0387], CharKind.Punctuation], // Greek ano teleia
[[0x055a, 0x055f], CharKind.Punctuation], // Armenian punctuation
@ -1013,23 +976,42 @@ export class Position extends vscode.Position {
[[0x2f800, 0x2fa1f], CharKind.Ideograph], // CJK Ideographs
];
const fragments: string[][] = [];
const codePointRangePatterns: string[][] = [];
for (let kind in CharKind) {
if (!isNaN(Number(kind))) {
fragments[kind] = [];
codePointRangePatterns[kind] = [];
}
}
for (let [[first, last], kind] of codePointRanges) {
for (let [[first, last], kind] of symbolTable) {
if (first === last) {
// '\u{hhhh}'
fragments[kind].push(`\\u{${first.toString(16)}}`);
codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}`);
} else {
// '\u{hhhh}-\u{hhhh}'
fragments[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
}
}
return fragments.map(patterns => `([${patterns.join('')}]+)`);
// Symbols in vim.iskeyword or editor.wordSeparators
// are treated as CharKind.Punctuation
const escapedKeywordChars = _.escapeRegExp(keywordChars).replace(/-/g, '\\-');
codePointRangePatterns[Number(CharKind.Punctuation)].push(escapedKeywordChars);
const codePointRanges = codePointRangePatterns.map(patterns => patterns.join(''));
const symbolSegments = codePointRanges.map(range => `([${range}]+)`);
// wordSegment matches word characters.
// A word character is a symbol which is neither
// - space
// - a symbol listed in the table
// - a keyword (vim.iskeyword)
const wordSegment = `([^\\s${codePointRanges.join('')}]+)`;
// https://regex101.com/r/X1agK6/2
const segments = symbolSegments.concat(wordSegment, '$^');
const regexp = new RegExp(segments.join('|'), 'ug');
return regexp;
}
private getAllPositions(line: string, regex: RegExp): number[] {

View File

@ -353,7 +353,13 @@ suite('word motion', () => {
});
suite('unicode word motion', () => {
let text: Array<string> = ['漢字ひらがなカタカナalphabets、いろいろな文字。', 'Καλημέρα κόσμε'];
let text: Array<string> = [
'漢字ひらがなカタカナalphabets、いろいろな文字。',
'Καλημέρα κόσμε',
'Die früh sich einst dem trüben Blick gezeigt.',
'Được tiếp đãi ân cần',
'100£and100$and100¥#♯x',
];
suiteSetup(() => {
return setupWorkspace().then(() => {
@ -387,6 +393,28 @@ suite('unicode word motion', () => {
assert.equal(motion.line, 1);
assert.equal(motion.character, 9);
});
test('move cursor word right recognizes a latin string which has diacritics as a single word', () => {
let motion = new Position(2, 4).getWordRight();
assert.equal(motion.line, 2);
assert.equal(motion.character, 9);
});
test('move cursor word right recognizes a latin-1 symbol as punctuation', () => {
let motion = new Position(4, 3).getWordRight();
assert.equal(motion.line, 4);
assert.equal(motion.character, 4);
motion = motion.getWordRight(); // issue #3680
assert.equal(motion.line, 4);
assert.equal(motion.character, 10);
});
test('move cursor word right recognizes a sequence of latin-1 symbols and other symbols as a word', () => {
let motion = new Position(4, 17).getWordRight();
assert.equal(motion.line, 4);
assert.equal(motion.character, 20);
});
});
suite('word left', () => {
@ -413,6 +441,12 @@ suite('unicode word motion', () => {
assert.equal(motion.line, 1);
assert.equal(motion.character, 9);
});
test('move cursor word left recognizes a latin string which has diacritics as a single word', () => {
let motion = new Position(3, 10).getWordLeft();
assert.equal(motion.line, 3);
assert.equal(motion.character, 5);
});
});
});