Fix word separation (#3667)

* Fix word separation * Add tests * Add Latin-1 punctuation to the printable char table * Fix terminology * Add more tests * Classify keyword characters as punctuation * Add more tests * Rename `table` to `symbolTable`
2024-11-11 06:39:50 +03:00 · 2019-04-18 01:38:29 +09:00 · 2019-04-18 01:38:29 +09:00 · 57cb40a434
commit 57cb40a434
parent 6307efbb3e
2 changed files with 65 additions and 49 deletions
--- a/src/common/motion/position.ts
+++ b/src/common/motion/position.ts
@ -895,45 +895,7 @@ export class Position extends vscode.Position {
    return result;
  }

-  private static makeUnicodeWordRegex(characterSet: string): RegExp {
-    const segments = [
-      // ASCII word characters (in many cases 0-9A-Za-z_)
-      // and non-word characters
-      ...Position.makeAsciiWordSegments(characterSet),
-
-      // Unicode characters (punctuations, ideographs, ...)
-      ...Position.makeUnicodeWordSegments(),
-
-      // Other spelling characters (Greek, ...)
-      '\\S+',
-
-      '$^',
-    ];
-    const result = new RegExp(segments.join('|'), 'ug');
-    return result;
-  }
-
-  private static makeAsciiWordSegments(nonWordChars: string): string[] {
-    const nonWordCodes = nonWordChars
-      .split('')
-      .sort()
-      .map(c => c.codePointAt(0)!);
-    nonWordCodes.push(0x7f); // guard
-    const wordChars: string[] = [];
-    let wordCode = 0x21;
-    for (let nonWordCode of nonWordCodes) {
-      for (; wordCode < nonWordCode; wordCode++) {
-        wordChars.push(String.fromCharCode(wordCode));
-      }
-      wordCode = nonWordCode + 1;
-    }
-
-    const wordSegment = `([${wordChars.join('')}]+)`;
-    const nonWordSegment = `[${_.escapeRegExp(nonWordChars).replace(/-/g, '\\-')}]+`;
-    return [wordSegment, nonWordSegment];
-  }
-
-  private static makeUnicodeWordSegments(): string[] {
+  private static makeUnicodeWordRegex(keywordChars: string): RegExp {
    // Distinct categories of characters
    enum CharKind {
      Punctuation,
@ -946,10 +908,11 @@ export class Position extends vscode.Position {
      Hangul,
    }

+    // List of printable characters (code point intervals) and their character kinds.
+    // Latin letters and digits (e.g., ASCII alphanumerics, Latin-1 Supplement letters, European Latin) are excluded.
    // Imported from utf_class_buf in src/mbyte.c of Vim.
-    // Spelling alphabets are not listed here since they are covered as non-white letters.
-    // TODO(ajalab): add Emoji
-    const codePointRanges: [[number, number], CharKind][] = [
+    const symbolTable: [[number, number], CharKind][] = [
+      [[0x00a1, 0x00bf], CharKind.Punctuation], // Latin-1 punctuation
      [[0x037e, 0x037e], CharKind.Punctuation], // Greek question mark
      [[0x0387, 0x0387], CharKind.Punctuation], // Greek ano teleia
      [[0x055a, 0x055f], CharKind.Punctuation], // Armenian punctuation
@ -1013,23 +976,42 @@ export class Position extends vscode.Position {
      [[0x2f800, 0x2fa1f], CharKind.Ideograph], // CJK Ideographs
    ];

-    const fragments: string[][] = [];
+    const codePointRangePatterns: string[][] = [];
    for (let kind in CharKind) {
      if (!isNaN(Number(kind))) {
-        fragments[kind] = [];
+        codePointRangePatterns[kind] = [];
      }
    }

-    for (let [[first, last], kind] of codePointRanges) {
+    for (let [[first, last], kind] of symbolTable) {
      if (first === last) {
        // '\u{hhhh}'
-        fragments[kind].push(`\\u{${first.toString(16)}}`);
+        codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}`);
      } else {
        // '\u{hhhh}-\u{hhhh}'
-        fragments[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
+        codePointRangePatterns[kind].push(`\\u{${first.toString(16)}}-\\u{${last.toString(16)}}`);
      }
    }
-    return fragments.map(patterns => `([${patterns.join('')}]+)`);
+
+    // Symbols in vim.iskeyword or editor.wordSeparators
+    // are treated as CharKind.Punctuation
+    const escapedKeywordChars = _.escapeRegExp(keywordChars).replace(/-/g, '\\-');
+    codePointRangePatterns[Number(CharKind.Punctuation)].push(escapedKeywordChars);
+
+    const codePointRanges = codePointRangePatterns.map(patterns => patterns.join(''));
+    const symbolSegments = codePointRanges.map(range => `([${range}]+)`);
+
+    // wordSegment matches word characters.
+    // A word character is any character that is none of the following:
+    // - space
+    // - a symbol listed in the table
+    // - a keyword (vim.iskeyword)
+    const wordSegment = `([^\\s${codePointRanges.join('')}]+)`;
+
+    // https://regex101.com/r/X1agK6/2
+    const segments = symbolSegments.concat(wordSegment, '$^');
+    const regexp = new RegExp(segments.join('|'), 'ug');
+    return regexp;
  }

  private getAllPositions(line: string, regex: RegExp): number[] {
--- a/test/motion.test.ts
+++ b/test/motion.test.ts
@ -353,7 +353,13 @@ suite('word motion', () => {
 });

 suite('unicode word motion', () => {
-  let text: Array<string> = ['漢字ひらがなカタカナalphabets、いろいろな文字。', 'Καλημέρα κόσμε'];
+  let text: Array<string> = [
+    '漢字ひらがなカタカナalphabets、いろいろな文字。',
+    'Καλημέρα κόσμε',
+    'Die früh sich einst dem trüben Blick gezeigt.',
+    'Được tiếp đãi ân cần',
+    '100£and100$and100¥#♯x',
+  ];

  suiteSetup(() => {
    return setupWorkspace().then(() => {
@ -387,6 +393,28 @@ suite('unicode word motion', () => {
      assert.equal(motion.line, 1);
      assert.equal(motion.character, 9);
    });
+
+    test('move cursor word right recognizes a latin string which has diacritics as a single word', () => {
+      let motion = new Position(2, 4).getWordRight();
+      assert.equal(motion.line, 2);
+      assert.equal(motion.character, 9);
+    });
+
+    test('move cursor word right recognizes a latin-1 symbol as punctuation', () => {
+      let motion = new Position(4, 3).getWordRight();
+      assert.equal(motion.line, 4);
+      assert.equal(motion.character, 4);
+
+      motion = motion.getWordRight(); // issue #3680
+      assert.equal(motion.line, 4);
+      assert.equal(motion.character, 10);
+    });
+
+    test('move cursor word right recognizes a sequence of latin-1 symbols and other symbols as a word', () => {
+      let motion = new Position(4, 17).getWordRight();
+      assert.equal(motion.line, 4);
+      assert.equal(motion.character, 20);
+    });
  });

  suite('word left', () => {
@ -413,6 +441,12 @@ suite('unicode word motion', () => {
      assert.equal(motion.line, 1);
      assert.equal(motion.character, 9);
    });
+
+    test('move cursor word left recognizes a latin string which has diacritics as a single word', () => {
+      let motion = new Position(3, 10).getWordLeft();
+      assert.equal(motion.line, 3);
+      assert.equal(motion.character, 5);
+    });
  });
 });