Update sub word regex/splitter (#1027)

* Added subWordSplitter function

* Add support for unicode characters in sub words

* Added additional test

* Support language specific identifier regex

* Cleanup

* cleanup

* Use unicode number in camel case regex

Co-authored-by: Pokey Rule <755842+pokey@users.noreply.github.com>
Andreas Arvidsson 2022-10-11 17:24:33 +02:00 committed by GitHub
parent 0e769bee81
commit 429b6b72db
10 changed files with 174 additions and 47 deletions
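
In effect, sub-word splitting is now two-staged and Unicode-aware: an identifier is first split on language-specific word delimiters (`_` by default, plus `-` for languages like SCSS and shell), and the camel-case regex only kicks in when that split yields a single word. A rough sketch of the resulting behavior, using inputs taken from the test fixtures in this diff ("anyLang" falls through to the default matcher, as in the updated test below):

    import { subWordSplitter } from "./processTargets/modifiers/subToken";

    // Delimiter split wins when it yields more than one word:
    subWordSplitter("mock_api_client_123_factory_v1", "anyLang").map(({ text }) => text);
    // => ["mock", "api", "client", "123", "factory", "v1"]

    // Otherwise fall back to Unicode-aware camel-case splitting:
    subWordSplitter("aaBbÄä", "anyLang").map(({ text }) => text);
    // => ["aa", "Bb", "Ää"]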

View File

@@ -6,5 +6,5 @@ import { LanguageTokenizerOverrides } from "./tokenizer.types";
  * Useful for languages like CSS, SCSS, shell, etc.
  */
 export const languageWithDashedIdentifiers: LanguageTokenizerOverrides = {
-  identifiersRegex: "[\\p{L}\\p{M}_\\-0-9]+",
+  identifierWordDelimiters: ["-", "_"],
 };
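
With this override, identifiers like `margin-top` still tokenize as a single identifier in CSS/SCSS/shell, but `-` and `_` now act as sub-word separators rather than being baked into a bespoke identifier regex. A hedged sketch of the net effect (pinned by the recorded scss test further down):

    subWordSplitter("margin-top", "scss").map(({ text }) => text);
    // => ["margin", "top"], so "chuck last word" deletes "top" along with its delimiter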

View File

@@ -1,4 +1,4 @@
-import { mapValues } from "lodash";
+import { escapeRegExp, mapValues } from "lodash";
 import { LanguageId, SupportedLanguageId } from "../languages/constants";
 import { matchAll } from "../util/regex";
@@ -45,27 +45,34 @@ const FIXED_TOKENS = [
   "-->",
 ];
 
-const IDENTIFIERS_REGEX = "[\\p{L}\\p{M}_0-9]+";
+export const IDENTIFIER_WORD_REGEXES = ["\\p{L}", "\\p{M}", "\\p{N}"];
+const IDENTIFIER_WORD_DELIMITERS = ["_"];
 const SINGLE_SYMBOLS_REGEX = "[^\\s\\w]";
 const NUMBERS_REGEX = "(?<=[^.\\d]|^)\\d+\\.\\d+(?=[^.\\d]|$)"; // (not-dot/digit digits dot digits not-dot/digit)
 
 const defaultLanguageTokenizerComponents: LanguageTokenizerComponents = {
   fixedTokens: FIXED_TOKENS,
   repeatableSymbols: REPEATABLE_SYMBOLS,
-  identifiersRegex: IDENTIFIERS_REGEX,
+  identifierWordRegexes: IDENTIFIER_WORD_REGEXES,
+  identifierWordDelimiters: IDENTIFIER_WORD_DELIMITERS,
   numbersRegex: NUMBERS_REGEX,
   singleSymbolsRegex: SINGLE_SYMBOLS_REGEX,
 };
 
+interface Matcher {
+  tokenMatcher: RegExp;
+  identifierMatcher: RegExp;
+  wordMatcher: RegExp;
+}
+
-const defaultTokenMatcher = generateTokenMatcher();
+const defaultMatcher = generateMatcher();
 
-function generateTokenMatcher(
+function generateMatcher(
   languageOverrides: LanguageTokenizerOverrides = {}
-): RegExp {
+): Matcher {
   const {
     fixedTokens,
     repeatableSymbols,
-    identifiersRegex,
+    identifierWordRegexes,
+    identifierWordDelimiters,
     numbersRegex,
     singleSymbolsRegex,
   }: LanguageTokenizerComponents = {
@@ -80,6 +87,12 @@ function generateTokenMatcher(
   const fixedTokensRegex = fixedTokens.map(escapeRegExp).join("|");
+  const identifierComponents = identifierWordRegexes.concat(
+    identifierWordDelimiters.map(escapeRegExp)
+  );
+  const identifiersRegex = `(${identifierComponents.join("|")})+`;
+  const wordRegex = `(${identifierWordRegexes.join("|")})+`;
 
   // Order matters here.
   const regex = [
     fixedTokensRegex,
@@ -89,7 +102,11 @@ function generateTokenMatcher(
     singleSymbolsRegex,
   ].join("|");
 
-  return new RegExp(regex, "gu");
+  return {
+    identifierMatcher: new RegExp(identifiersRegex, "gu"),
+    wordMatcher: new RegExp(wordRegex, "gu"),
+    tokenMatcher: new RegExp(regex, "gu"),
+  };
 }
 
 const languageTokenizerOverrides: Partial<
@@ -100,15 +117,15 @@ const languageTokenizerOverrides: Partial<
   shellscript: languageWithDashedIdentifiers,
 };
 
-const tokenMatchersForLanguage: Partial<Record<LanguageId, RegExp>> = mapValues(
-  languageTokenizerOverrides,
-  (val: LanguageTokenizerComponents) => generateTokenMatcher(val)
-);
+const tokenMatchersForLanguage: Partial<Record<LanguageId, Matcher>> =
+  mapValues(languageTokenizerOverrides, (val: LanguageTokenizerComponents) =>
+    generateMatcher(val)
+  );
 
-export function getTokenMatcher(languageId: string): RegExp {
+export function getMatcher(languageId: string): Matcher {
   return (
     tokenMatchersForLanguage[languageId as SupportedLanguageId] ??
-    defaultTokenMatcher
+    defaultMatcher
   );
 }
@@ -117,10 +134,5 @@ export function tokenize<T>(
   languageId: string,
   mapfn: (v: RegExpMatchArray, k: number) => T
 ) {
-  return matchAll(text, getTokenMatcher(languageId), mapfn);
+  return matchAll(text, getMatcher(languageId).tokenMatcher, mapfn);
 }
-
-//https://stackoverflow.com/a/6969486
-function escapeRegExp(string: string) {
-  return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
-}

View File

@@ -2,7 +2,20 @@
 export interface LanguageTokenizerComponents {
   fixedTokens: string[];
-  identifiersRegex: string;
+  /**
+   * Each element of this list is a regex that can appear inside a token, and
+   * will be considered part of a subword. Note that there is no need to add a
+   * `*` here, as the regex will be allowed to repeat.
+   */
+  identifierWordRegexes: string[];
+  /**
+   * These are allowable inside identifiers, and act to separate words in the
+   * identifier. They are raw strings, and will be regex-escaped.
+   */
+  identifierWordDelimiters: string[];
   numbersRegex: string;
   repeatableSymbols: string[];
   singleSymbolsRegex: string;
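
For reference, `languageWithDashedIdentifiers` earlier in this diff is the in-tree example of the two new fields. A hypothetical override (illustration only, not part of this change) for a language that also treats `$` as a word separator inside identifiers might look like:

    const languageWithDollarIdentifiers: LanguageTokenizerOverrides = {
      identifierWordDelimiters: ["$", "_"],
    };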

View File

@@ -7,17 +7,14 @@ import {
   EveryScopeModifier,
 } from "../../../typings/targetDescriptor.types";
 import { ProcessedTargetsContext } from "../../../typings/Types";
-import { matchAll } from "../../../util/regex";
+import { MatchedText, matchText } from "../../../util/regex";
 import { ModifierStage } from "../../PipelineStages.types";
 import { PlainTarget, SubTokenWordTarget } from "../../targets";
-import { SUBWORD_MATCHER } from "../subToken";
+import { subWordSplitter } from "../subToken";
 import { getTokenRangeForSelection } from "./TokenStage";
 
 abstract class SubTokenStage implements ModifierStage {
-  constructor(
-    private modifier: ContainingScopeModifier | EveryScopeModifier,
-    private regex: RegExp
-  ) {}
+  constructor(private modifier: ContainingScopeModifier | EveryScopeModifier) {}
 
   run(context: ProcessedTargetsContext, target: Target): Target[] {
     const { document } = target.editor;
@@ -27,14 +24,12 @@ abstract class SubTokenStage implements ModifierStage {
     );
 
     const text = document.getText(tokenRange);
     const offset = document.offsetAt(tokenRange.start);
-    const contentRanges = matchAll<Range>(
-      text,
-      this.regex,
+    const matches = this.getMatchedText(text, document.languageId);
+    const contentRanges = matches.map(
       (match) =>
         new Range(
-          document.positionAt(offset + match.index!),
-          document.positionAt(offset + match.index! + match[0].length)
+          document.positionAt(offset + match.index),
+          document.positionAt(offset + match.index + match.text.length)
         )
     );
@@ -107,6 +102,14 @@ abstract class SubTokenStage implements ModifierStage {
     );
   }
 
+  /**
+   * Return matches for {@link text}
+   */
+  protected abstract getMatchedText(
+    text: string,
+    languageId: string
+  ): MatchedText[];
+
   /**
    * Create one target for each element of {@link contentRanges}
    */
@@ -119,7 +122,11 @@
 export class WordStage extends SubTokenStage {
   constructor(modifier: ContainingScopeModifier | EveryScopeModifier) {
-    super(modifier, SUBWORD_MATCHER);
+    super(modifier);
   }
+
+  protected getMatchedText(text: string, languageId: string): MatchedText[] {
+    return subWordSplitter(text, languageId);
+  }
 
   protected createTargetsFromRanges(
@@ -166,7 +173,11 @@ export class WordStage extends SubTokenStage {
 export class CharacterStage extends SubTokenStage {
   constructor(modifier: ContainingScopeModifier | EveryScopeModifier) {
-    super(modifier, GRAPHEME_SPLIT_REGEX);
+    super(modifier);
   }
+
+  protected getMatchedText(text: string): MatchedText[] {
+    return matchText(text, GRAPHEME_SPLIT_REGEX);
+  }
 
   protected createTargetsFromRanges(

View File

@@ -1 +1,30 @@
-export const SUBWORD_MATCHER = /[A-Z]?[a-z]+|[A-Z]+(?![a-z])|[0-9]+/g;
+import { getMatcher } from "../../core/tokenizer";
+import { matchText } from "../../util/regex";
+
+const camelRegex = /\p{Lu}?\p{Ll}+|\p{Lu}+(?!\p{Ll})|\p{N}+/gu;
+
+export function subWordSplitter(text: string, languageId: string) {
+  // First split on identifiers. The input text can contain multiple
+  // tokens/identifiers and these can have different formats.
+  // eg `publicApiV1 public_api_v1`
+  const { identifierMatcher, wordMatcher } = getMatcher(languageId);
+  return matchText(text, identifierMatcher).flatMap((t) =>
+    splitIdentifier(wordMatcher, t.text, t.index)
+  );
+}
+
+function splitIdentifier(wordMatcher: RegExp, text: string, index: number) {
+  // First try to split on non letter characters
+  const wordMatches = matchText(text, wordMatcher);
+
+  const matches =
+    wordMatches.length > 1
+      ? wordMatches
+      : // Secondly try split on camel case
+        matchText(text, camelRegex);
+
+  return matches.map((match) => ({
+    index: index + match.index,
+    text: match.text,
+  }));
+}
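
The two-stage fallback is what keeps `v1` together under snake_case but splits it under camel case; both behaviors are pinned by the fixtures below:

    subWordSplitter("api_v1", "anyLang").map(({ text }) => text);
    // wordMatcher finds two words, so the delimiter split wins: ["api", "v1"]

    subWordSplitter("apiV1", "anyLang").map(({ text }) => text);
    // a single word, so camelRegex applies: ["api", "V", "1"]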

View File

@ -0,0 +1,25 @@
languageId: scss
command:
spokenForm: chuck last word
version: 3
targets:
- type: primitive
modifiers:
- type: ordinalScope
scopeType: {type: word}
start: -1
length: 1
usePrePhraseSnapshot: true
action: {name: remove}
initialState:
documentContents: "margin-top: 0;"
selections:
- anchor: {line: 0, character: 0}
active: {line: 0, character: 0}
marks: {}
finalState:
documentContents: "margin: 0;"
selections:
- anchor: {line: 0, character: 0}
active: {line: 0, character: 0}
fullTargets: [{type: primitive, mark: {type: cursor}, modifiers: [{type: ordinalScope, scopeType: {type: word}, start: -1, length: 1}]}]

View File

@@ -65,7 +65,27 @@ export const subtokenFixture: Fixture[] = [
     expectedOutput: ["mock", "API", "Client", "Factory"],
   },
   {
-    input: "mockAPIClient123Factory",
-    expectedOutput: ["mock", "API", "Client", "123", "Factory"],
+    input: "mockAPIClient123FactoryV1",
+    expectedOutput: ["mock", "API", "Client", "123", "Factory", "V", "1"],
+  },
+  {
+    input: "mock_api_client_123_factory_v1",
+    expectedOutput: ["mock", "api", "client", "123", "factory", "v1"],
+  },
+  {
+    input: "v1",
+    expectedOutput: ["v", "1"],
+  },
+  {
+    input: "aaBbÄä",
+    expectedOutput: ["aa", "Bb", "Ää"],
+  },
+  {
+    input: "apiV1 api_v_1",
+    expectedOutput: ["api", "V", "1", "api", "v", "1"],
+  },
+  {
+    input: "_quickBrownFox_",
+    expectedOutput: ["quick", "Brown", "Fox"],
   },
 ];

View File

@@ -1,12 +1,14 @@
 import * as assert from "assert";
-import { SUBWORD_MATCHER } from "../../processTargets/modifiers/subToken";
+import { subWordSplitter } from "../../processTargets/modifiers/subToken";
 import { subtokenFixture } from "./fixtures/subtoken.fixture";
 
 suite("subtoken regex matcher", () => {
   subtokenFixture.forEach(({ input, expectedOutput }) => {
     test(input, () => {
-      assert.deepStrictEqual(input.match(SUBWORD_MATCHER), expectedOutput);
+      assert.deepStrictEqual(
+        subWordSplitter(input, "anyLang").map(({ text }) => text),
+        expectedOutput
+      );
     });
   });
 });

View File

@@ -1,10 +1,10 @@
 import { concat, flatten, maxBy, min } from "lodash";
 import * as vscode from "vscode";
-import { HatStyleName } from "../core/hatStyles";
-import { getTokenMatcher } from "../core/tokenizer";
 import Decorations from "../core/Decorations";
+import { HatStyleName } from "../core/hatStyles";
 import { IndividualHatMap } from "../core/IndividualHatMap";
 import { TokenGraphemeSplitter } from "../core/TokenGraphemeSplitter";
+import { getMatcher } from "../core/tokenizer";
 import { Token } from "../typings/Types";
 import { getDisplayLineMap } from "./getDisplayLineMap";
 import { getTokenComparator } from "./getTokenComparator";
@@ -44,9 +44,12 @@ export function addDecorationsToEditors(
         expansionBehavior: {
           start: {
             type: "regex",
-            regex: getTokenMatcher(languageId),
+            regex: getMatcher(languageId).tokenMatcher,
           },
-          end: { type: "regex", regex: getTokenMatcher(languageId) },
+          end: {
+            type: "regex",
+            regex: getMatcher(languageId).tokenMatcher,
+          },
         },
       }))
     )

View File

@@ -37,3 +37,15 @@ export function matchAll<T>(
 ) {
   return Array.from(text.matchAll(regex), mapfn);
 }
+
+export interface MatchedText {
+  index: number;
+  text: string;
+}
+
+export function matchText(text: string, regex: RegExp): MatchedText[] {
+  return matchAll(text, regex, (match) => ({
+    index: match.index!,
+    text: match[0],
+  }));
+}
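
`matchText` is a thin wrapper over `matchAll` that pairs each match with its offset, centralizing the non-null assertion on `match.index`. A quick sketch:

    matchText("foo_bar", /[a-z]+/g);
    // => [{ index: 0, text: "foo" }, { index: 4, text: "bar" }]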