mirror of
https://github.com/VSCodeVim/Vim.git
synced 2024-11-11 06:39:50 +03:00
Fix word separation (#3667)
* Fix word separation * Add tests * Add Latin-1 punctuations in printable char table * Fix terminology * Add more tests * Classify keyword characters as punctuations * Add more test * Rename `table` to `symbolTable`
This commit is contained in:
parent
6307efbb3e
commit
57cb40a434
@ -895,45 +895,7 @@ export class Position extends vscode.Position {
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Builds the global (`g`), Unicode-mode (`u`) regular expression that is the
 * alternation of every word-segment pattern, joined with `|`.
 *
 * Alternatives appear in this order, and the regex engine tries them left to right:
 *   1. the ASCII word and non-word segments from `makeAsciiWordSegments`
 *      (in many configurations the word characters are 0-9A-Za-z_),
 *   2. the Unicode segments from `makeUnicodeWordSegments`
 *      (punctuation, ideographs, ...),
 *   3. `\S+` for any remaining run of non-whitespace (other spelling
 *      characters such as Greek),
 *   4. `$^`, which can only match an empty string
 *      (presumably so that empty lines still produce a match; confirm with callers).
 *
 * @param characterSet Characters forwarded to `makeAsciiWordSegments` as the
 *   ASCII non-word characters.
 * @returns A new `RegExp` with the `u` and `g` flags set.
 */
private static makeUnicodeWordRegex(characterSet: string): RegExp {
  const asciiSegments = Position.makeAsciiWordSegments(characterSet);
  const unicodeSegments = Position.makeUnicodeWordSegments();

  const pattern = asciiSegments.concat(unicodeSegments, '\\S+', '$^').join('|');
  return new RegExp(pattern, 'ug');
}
|
||||
|
||||
/**
 * Builds the two regex fragments that split printable ASCII into word and
 * non-word runs.
 *
 * A word character is any code unit in the printable range 0x21-0x7e that does
 * not occur in `nonWordChars`. Characters of `nonWordChars` outside that range
 * (whitespace, control characters, non-ASCII) never influence the word class;
 * they only appear in the non-word class.
 *
 * @param nonWordChars Characters to treat as non-word characters.
 * @returns `[wordSegment, nonWordSegment]`: `wordSegment` is a capturing group
 *   matching one or more word characters, `nonWordSegment` matches one or more
 *   of the characters in `nonWordChars`.
 */
private static makeAsciiWordSegments(nonWordChars: string): string[] {
  // Escape regex syntax characters and '-' so the text is safe inside `[...]`.
  // Without this, an unlisted '\', ']', '^' or '-' among the word characters
  // would corrupt the class or silently turn neighbours into a range.
  const escapeForCharClass = (chars: string): string =>
    _.escapeRegExp(chars).replace(/-/g, '\\-');

  const nonWordCodes = new Set<number>();
  for (let i = 0; i < nonWordChars.length; i++) {
    nonWordCodes.add(nonWordChars.charCodeAt(i));
  }

  // Walk the printable ASCII range explicitly, so that non-word characters
  // outside of it cannot widen or rewind the range being collected.
  let wordChars = '';
  for (let code = 0x21; code < 0x7f; code++) {
    if (!nonWordCodes.has(code)) {
      wordChars += String.fromCharCode(code);
    }
  }

  const wordSegment = `([${escapeForCharClass(wordChars)}]+)`;
  const nonWordSegment = `[${escapeForCharClass(nonWordChars)}]+`;
  return [wordSegment, nonWordSegment];
}
|
||||
|
||||
private static makeUnicodeWordSegments(): string[] {
|
||||
private static makeUnicodeWordRegex(keywordChars: string): RegExp {
|
||||
// Distinct categories of characters
|
||||
enum CharKind {
|
||||
Punctuation,
|
||||
@ -946,10 +908,11 @@ export class Position extends vscode.Position {
|
||||
Hangul,
|
||||
}
|
||||
|
||||
// List of printable characters (code point intervals) and their character kinds.
|
||||
// Latin alphabets (e.g., ASCII alphabets and numbers, Latin-1 Supplement, European Latin) are excluded.
|
||||
// Imported from utf_class_buf in src/mbyte.c of Vim.
|
||||
// Spelling alphabets are not listed here since they are covered as non-white letters.
|
||||
// TODO(ajalab): add Emoji
|
||||
const codePointRanges: [[number, number], CharKind][] = [
|
||||
const symbolTable: [[number, number], CharKind][] = [
|
||||
[[0x00a1, 0x00bf], CharKind.Punctuation], // Latin-1 punctuation
|
||||
[[0x037e, 0x037e], CharKind.Punctuation], // Greek question mark
|
||||
[[0x0387, 0x0387], CharKind.Punctuation], // Greek ano teleia
|
||||
[[0x055a, 0x055f], CharKind.Punctuation], // Armenian punctuation
|
||||
@ -1013,23 +976,42 @@ export class Position extends vscode.Position {
|
||||
[[0x2f800, 0x2fa1f], CharKind.Ideograph], // CJK Ideographs
|
||||
];
|
||||
|
||||
const fragments: string[][] = [];
|
||||
const codePointRangePatterns: string[][] = [];
|
||||
for (let kind in CharKind) {
|
||||
if (!isNaN(Number(kind))) {
|
||||
fragments[kind] = [];
|
||||
codePointRangePatterns[kind] = [];
|
||||
}
|
||||
}
|
||||
|
||||
for (let [[first, last], kind] of codePointRanges) {
|
||||
for (let [[first, last], kind] of symbolTable) {
|
||||
if (first === last) {
|
||||
// '\u{hhhh}'
|
||||
fragments[kind].push(`\\u{${first.toString(16)}}`);
|
||||
codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}`);
|
||||
} else {
|
||||
// '\u{hhhh}-\u{hhhh}'
|
||||
fragments[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
|
||||
codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
|
||||
}
|
||||
}
|
||||
return fragments.map(patterns => `([${patterns.join('')}]+)`);
|
||||
|
||||
// Symbols in vim.iskeyword or editor.wordSeparators
|
||||
// are treated as CharKind.Punctuation
|
||||
const escapedKeywordChars = _.escapeRegExp(keywordChars).replace(/-/g, '\\-');
|
||||
codePointRangePatterns[Number(CharKind.Punctuation)].push(escapedKeywordChars);
|
||||
|
||||
const codePointRanges = codePointRangePatterns.map(patterns => patterns.join(''));
|
||||
const symbolSegments = codePointRanges.map(range => `([${range}]+)`);
|
||||
|
||||
// wordSegment matches word characters.
|
||||
// A word character is a symbol which is neither
|
||||
// - space
|
||||
// - a symbol listed in the table
|
||||
// - a keyword (vim.iskeyword)
|
||||
const wordSegment = `([^\\s${codePointRanges.join('')}]+)`;
|
||||
|
||||
// https://regex101.com/r/X1agK6/2
|
||||
const segments = symbolSegments.concat(wordSegment, '$^');
|
||||
const regexp = new RegExp(segments.join('|'), 'ug');
|
||||
return regexp;
|
||||
}
|
||||
|
||||
private getAllPositions(line: string, regex: RegExp): number[] {
|
||||
|
@ -353,7 +353,13 @@ suite('word motion', () => {
|
||||
});
|
||||
|
||||
suite('unicode word motion', () => {
|
||||
let text: Array<string> = ['漢字ひらがなカタカナalphabets、いろいろな文字。', 'Καλημέρα κόσμε'];
|
||||
let text: Array<string> = [
|
||||
'漢字ひらがなカタカナalphabets、いろいろな文字。',
|
||||
'Καλημέρα κόσμε',
|
||||
'Die früh sich einst dem trüben Blick gezeigt.',
|
||||
'Được tiếp đãi ân cần',
|
||||
'100£and100$and100¥#♯x',
|
||||
];
|
||||
|
||||
suiteSetup(() => {
|
||||
return setupWorkspace().then(() => {
|
||||
@ -387,6 +393,28 @@ suite('unicode word motion', () => {
|
||||
assert.equal(motion.line, 1);
|
||||
assert.equal(motion.character, 9);
|
||||
});
|
||||
|
||||
// Word-right from (2, 4) must stay on line 2 and land on column 9.
test('move cursor word right recognizes a latin string which has diacritics as a single word', () => {
  const start = new Position(2, 4);
  const landed = start.getWordRight();

  assert.equal(landed.line, 2);
  assert.equal(landed.character, 9);
});
|
||||
|
||||
// Two consecutive word-right motions starting at (4, 3): the first must land on
// column 4, the second (regression check for issue #3680) on column 10.
test('move cursor word right recognizes a latin-1 symbol as punctuation', () => {
  const firstStop = new Position(4, 3).getWordRight();
  assert.equal(firstStop.line, 4);
  assert.equal(firstStop.character, 4);

  const secondStop = firstStop.getWordRight(); // issue #3680
  assert.equal(secondStop.line, 4);
  assert.equal(secondStop.character, 10);
});
|
||||
|
||||
// Word-right from (4, 17) must stay on line 4 and land on column 20.
test('move cursor word right recognizes a sequence of latin-1 symbols and other symbols as a word', () => {
  const start = new Position(4, 17);
  const landed = start.getWordRight();

  assert.equal(landed.line, 4);
  assert.equal(landed.character, 20);
});
|
||||
});
|
||||
|
||||
suite('word left', () => {
|
||||
@ -413,6 +441,12 @@ suite('unicode word motion', () => {
|
||||
assert.equal(motion.line, 1);
|
||||
assert.equal(motion.character, 9);
|
||||
});
|
||||
|
||||
// Word-left from (3, 10) must stay on line 3 and land on column 5.
test('move cursor word left recognizes a latin string which has diacritics as a single word', () => {
  const start = new Position(3, 10);
  const landed = start.getWordLeft();

  assert.equal(landed.line, 3);
  assert.equal(landed.character, 5);
});
|
||||
});
|
||||
});
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user