mirror of
https://github.com/cursorless-dev/cursorless.git
synced 2024-10-05 05:17:38 +03:00
Update sub word regex/splitter (#1027)
* Added subWordSplitter function
* Add support for unicode characters in sub words
* Added additional test
* Support language specific identifier regex
* Cleanup
* cleanup
* Use unicode number in camel case regex

Co-authored-by: Pokey Rule <755842+pokey@users.noreply.github.com>
This commit is contained in:
parent
0e769bee81
commit
429b6b72db
@ -6,5 +6,5 @@ import { LanguageTokenizerOverrides } from "./tokenizer.types";
|
||||
* Useful for languages like CSS, SCSS, shell, etc.
|
||||
*/
|
||||
export const languageWithDashedIdentifiers: LanguageTokenizerOverrides = {
|
||||
identifiersRegex: "[\\p{L}\\p{M}_\\-0-9]+",
|
||||
identifierWordDelimiters: ["-", "_"],
|
||||
};
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { mapValues } from "lodash";
|
||||
import { escapeRegExp, mapValues } from "lodash";
|
||||
import { LanguageId, SupportedLanguageId } from "../languages/constants";
|
||||
|
||||
import { matchAll } from "../util/regex";
|
||||
@ -45,27 +45,34 @@ const FIXED_TOKENS = [
|
||||
"-->",
|
||||
];
|
||||
|
||||
const IDENTIFIERS_REGEX = "[\\p{L}\\p{M}_0-9]+";
|
||||
export const IDENTIFIER_WORD_REGEXES = ["\\p{L}", "\\p{M}", "\\p{N}"];
|
||||
const IDENTIFIER_WORD_DELIMITERS = ["_"];
|
||||
const SINGLE_SYMBOLS_REGEX = "[^\\s\\w]";
|
||||
const NUMBERS_REGEX = "(?<=[^.\\d]|^)\\d+\\.\\d+(?=[^.\\d]|$)"; // (not-dot/digit digits dot digits not-dot/digit)
|
||||
|
||||
const defaultLanguageTokenizerComponents: LanguageTokenizerComponents = {
|
||||
fixedTokens: FIXED_TOKENS,
|
||||
repeatableSymbols: REPEATABLE_SYMBOLS,
|
||||
identifiersRegex: IDENTIFIERS_REGEX,
|
||||
identifierWordRegexes: IDENTIFIER_WORD_REGEXES,
|
||||
identifierWordDelimiters: IDENTIFIER_WORD_DELIMITERS,
|
||||
numbersRegex: NUMBERS_REGEX,
|
||||
singleSymbolsRegex: SINGLE_SYMBOLS_REGEX,
|
||||
};
|
||||
/**
 * The regular expressions generated for one set of language tokenizer
 * components. Every expression is compiled with the `g` and `u` flags.
 */
interface Matcher {
  /**
   * Matches a run of identifier word characters, ie the characters accepted
   * by the word regexes, without any word delimiters.
   */
  wordMatcher: RegExp;

  /**
   * Matches a whole identifier: a run of word characters and word
   * delimiters.
   */
  identifierMatcher: RegExp;

  /**
   * Matches a single token of any kind.
   */
  tokenMatcher: RegExp;
}
|
||||
const defaultMatcher = generateMatcher();
|
||||
|
||||
const defaultTokenMatcher = generateTokenMatcher();
|
||||
|
||||
function generateTokenMatcher(
|
||||
function generateMatcher(
|
||||
languageOverrides: LanguageTokenizerOverrides = {}
|
||||
): RegExp {
|
||||
): Matcher {
|
||||
const {
|
||||
fixedTokens,
|
||||
repeatableSymbols,
|
||||
identifiersRegex,
|
||||
identifierWordRegexes,
|
||||
identifierWordDelimiters,
|
||||
numbersRegex,
|
||||
singleSymbolsRegex,
|
||||
}: LanguageTokenizerComponents = {
|
||||
@ -80,6 +87,12 @@ function generateTokenMatcher(
|
||||
|
||||
const fixedTokensRegex = fixedTokens.map(escapeRegExp).join("|");
|
||||
|
||||
const identifierComponents = identifierWordRegexes.concat(
|
||||
identifierWordDelimiters.map(escapeRegExp)
|
||||
);
|
||||
const identifiersRegex = `(${identifierComponents.join("|")})+`;
|
||||
const wordRegex = `(${identifierWordRegexes.join("|")})+`;
|
||||
|
||||
// Order matters here.
|
||||
const regex = [
|
||||
fixedTokensRegex,
|
||||
@ -89,7 +102,11 @@ function generateTokenMatcher(
|
||||
singleSymbolsRegex,
|
||||
].join("|");
|
||||
|
||||
return new RegExp(regex, "gu");
|
||||
return {
|
||||
identifierMatcher: new RegExp(identifiersRegex, "gu"),
|
||||
wordMatcher: new RegExp(wordRegex, "gu"),
|
||||
tokenMatcher: new RegExp(regex, "gu"),
|
||||
};
|
||||
}
|
||||
|
||||
const languageTokenizerOverrides: Partial<
|
||||
@ -100,15 +117,15 @@ const languageTokenizerOverrides: Partial<
|
||||
shellscript: languageWithDashedIdentifiers,
|
||||
};
|
||||
|
||||
const tokenMatchersForLanguage: Partial<Record<LanguageId, RegExp>> = mapValues(
|
||||
languageTokenizerOverrides,
|
||||
(val: LanguageTokenizerComponents) => generateTokenMatcher(val)
|
||||
);
|
||||
const tokenMatchersForLanguage: Partial<Record<LanguageId, Matcher>> =
|
||||
mapValues(languageTokenizerOverrides, (val: LanguageTokenizerComponents) =>
|
||||
generateMatcher(val)
|
||||
);
|
||||
|
||||
export function getTokenMatcher(languageId: string): RegExp {
|
||||
export function getMatcher(languageId: string): Matcher {
|
||||
return (
|
||||
tokenMatchersForLanguage[languageId as SupportedLanguageId] ??
|
||||
defaultTokenMatcher
|
||||
defaultMatcher
|
||||
);
|
||||
}
|
||||
|
||||
@ -117,10 +134,5 @@ export function tokenize<T>(
|
||||
languageId: string,
|
||||
mapfn: (v: RegExpMatchArray, k: number) => T
|
||||
) {
|
||||
return matchAll(text, getTokenMatcher(languageId), mapfn);
|
||||
}
|
||||
|
||||
//https://stackoverflow.com/a/6969486
|
||||
function escapeRegExp(string: string) {
|
||||
return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
|
||||
return matchAll(text, getMatcher(languageId).tokenMatcher, mapfn);
|
||||
}
|
||||
|
@ -2,7 +2,20 @@
|
||||
|
||||
export interface LanguageTokenizerComponents {
|
||||
fixedTokens: string[];
|
||||
identifiersRegex: string;
|
||||
|
||||
/**
|
||||
* Each element of this list is a regex that can appear inside a token, and
|
||||
* will be considered part of a subword. Note that there is no need to add a
|
||||
* `*` here, as the regex will be allowed to repeat.
|
||||
*/
|
||||
identifierWordRegexes: string[];
|
||||
|
||||
/**
|
||||
* These are allowable inside identifiers, and act to separate words in the
|
||||
* identifier. They are raw strings, and will be regex-escaped.
|
||||
*/
|
||||
identifierWordDelimiters: string[];
|
||||
|
||||
numbersRegex: string;
|
||||
repeatableSymbols: string[];
|
||||
singleSymbolsRegex: string;
|
||||
|
@ -7,17 +7,14 @@ import {
|
||||
EveryScopeModifier,
|
||||
} from "../../../typings/targetDescriptor.types";
|
||||
import { ProcessedTargetsContext } from "../../../typings/Types";
|
||||
import { matchAll } from "../../../util/regex";
|
||||
import { MatchedText, matchText } from "../../../util/regex";
|
||||
import { ModifierStage } from "../../PipelineStages.types";
|
||||
import { PlainTarget, SubTokenWordTarget } from "../../targets";
|
||||
import { SUBWORD_MATCHER } from "../subToken";
|
||||
import { subWordSplitter } from "../subToken";
|
||||
import { getTokenRangeForSelection } from "./TokenStage";
|
||||
|
||||
abstract class SubTokenStage implements ModifierStage {
|
||||
constructor(
|
||||
private modifier: ContainingScopeModifier | EveryScopeModifier,
|
||||
private regex: RegExp
|
||||
) {}
|
||||
constructor(private modifier: ContainingScopeModifier | EveryScopeModifier) {}
|
||||
|
||||
run(context: ProcessedTargetsContext, target: Target): Target[] {
|
||||
const { document } = target.editor;
|
||||
@ -27,14 +24,12 @@ abstract class SubTokenStage implements ModifierStage {
|
||||
);
|
||||
const text = document.getText(tokenRange);
|
||||
const offset = document.offsetAt(tokenRange.start);
|
||||
|
||||
const contentRanges = matchAll<Range>(
|
||||
text,
|
||||
this.regex,
|
||||
const matches = this.getMatchedText(text, document.languageId);
|
||||
const contentRanges = matches.map(
|
||||
(match) =>
|
||||
new Range(
|
||||
document.positionAt(offset + match.index!),
|
||||
document.positionAt(offset + match.index! + match[0].length)
|
||||
document.positionAt(offset + match.index),
|
||||
document.positionAt(offset + match.index + match.text.length)
|
||||
)
|
||||
);
|
||||
|
||||
@ -107,6 +102,14 @@ abstract class SubTokenStage implements ModifierStage {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Find the sub-token pieces of {@link text}.
 *
 * @param text The text of the token range that is being split
 * @param languageId The language id of the document that contains the text
 * @returns One entry per piece; each `index` is an offset into {@link text}
 */
protected abstract getMatchedText(
  text: string,
  languageId: string
): MatchedText[];
|
||||
|
||||
/**
|
||||
* Create one target for each element of {@link contentRanges}
|
||||
*/
|
||||
@ -119,7 +122,11 @@ abstract class SubTokenStage implements ModifierStage {
|
||||
|
||||
export class WordStage extends SubTokenStage {
|
||||
constructor(modifier: ContainingScopeModifier | EveryScopeModifier) {
|
||||
super(modifier, SUBWORD_MATCHER);
|
||||
super(modifier);
|
||||
}
|
||||
|
||||
/**
 * Split {@link text} into sub-words using the splitter for the given
 * language.
 */
protected getMatchedText(text: string, languageId: string): MatchedText[] {
  const subWords = subWordSplitter(text, languageId);
  return subWords;
}
|
||||
|
||||
protected createTargetsFromRanges(
|
||||
@ -166,7 +173,11 @@ export class WordStage extends SubTokenStage {
|
||||
|
||||
export class CharacterStage extends SubTokenStage {
|
||||
constructor(modifier: ContainingScopeModifier | EveryScopeModifier) {
|
||||
super(modifier, GRAPHEME_SPLIT_REGEX);
|
||||
super(modifier);
|
||||
}
|
||||
|
||||
/**
 * Return every match of `GRAPHEME_SPLIT_REGEX` in {@link text}. The language
 * id is not needed here, so the parameter is omitted.
 */
protected getMatchedText(text: string): MatchedText[] {
  const characters = matchText(text, GRAPHEME_SPLIT_REGEX);
  return characters;
}
|
||||
|
||||
protected createTargetsFromRanges(
|
||||
|
@ -1 +1,30 @@
|
||||
export const SUBWORD_MATCHER = /[A-Z]?[a-z]+|[A-Z]+(?![a-z])|[0-9]+/g;
|
||||
import { getMatcher } from "../../core/tokenizer";
|
||||
import { matchText } from "../../util/regex";
|
||||
|
||||
// Splits a camelCase / PascalCase identifier into its words. Alternatives, in
// order:
//  1. an optional uppercase letter followed by lowercase letters (`mock`, `Client`)
//  2. a run of uppercase letters that is not followed by a lowercase letter
//     (`API` in `mockAPIClient`)
//  3. a run of numbers (`123`)
// NOTE: letters without case (eg `\p{Lo}`) and marks (`\p{M}`) are not matched
// by any alternative.
const camelRegex = /\p{Lu}?\p{Ll}+|\p{Lu}+(?!\p{Ll})|\p{N}+/gu;
|
||||
|
||||
/**
 * Split {@link text} into sub-words using the matchers of the given language.
 *
 * @param text Text to split; may contain more than one identifier
 * @param languageId Language id used to look up the matchers
 * @returns One entry per sub-word; each `index` is an offset into {@link text}
 */
export function subWordSplitter(text: string, languageId: string) {
  const { identifierMatcher, wordMatcher } = getMatcher(languageId);

  // The input text can contain multiple tokens/identifiers and these can have
  // different formats, eg `publicApiV1 public_api_v1`, so isolate each
  // identifier first and split every one of them on its own.
  const identifiers = matchText(text, identifierMatcher);

  return identifiers.flatMap((identifier) =>
    splitIdentifier(wordMatcher, identifier.text, identifier.index)
  );
}
|
||||
|
||||
/**
 * Split a single identifier into its words.
 *
 * @param wordMatcher Regex matching one delimiter-free word of an identifier
 * @param text The text of the identifier
 * @param index Offset of the identifier; added to the index of every match
 * @returns One entry per word, with `index` shifted by {@link index}
 */
function splitIdentifier(wordMatcher: RegExp, text: string, index: number) {
  // First try to split on non letter characters, ie the word delimiters
  const wordMatches = matchText(text, wordMatcher);

  let matches = wordMatches;

  if (wordMatches.length <= 1) {
    // Secondly try to split on camel case
    const camelMatches = matchText(text, camelRegex);

    // The camel case regex only knows about cased letters and numbers. For an
    // identifier that consists solely of caseless letters it finds nothing, so
    // keep the word match instead of dropping the identifier entirely.
    if (camelMatches.length > 0) {
      matches = camelMatches;
    }
  }

  return matches.map((match) => ({
    index: index + match.index,
    text: match.text,
  }));
}
|
||||
|
25
src/test/suite/fixtures/recorded/subtoken/chuckLastWord.yml
Normal file
25
src/test/suite/fixtures/recorded/subtoken/chuckLastWord.yml
Normal file
@ -0,0 +1,25 @@
|
||||
languageId: scss
|
||||
command:
|
||||
spokenForm: chuck last word
|
||||
version: 3
|
||||
targets:
|
||||
- type: primitive
|
||||
modifiers:
|
||||
- type: ordinalScope
|
||||
scopeType: {type: word}
|
||||
start: -1
|
||||
length: 1
|
||||
usePrePhraseSnapshot: true
|
||||
action: {name: remove}
|
||||
initialState:
|
||||
documentContents: "margin-top: 0;"
|
||||
selections:
|
||||
- anchor: {line: 0, character: 0}
|
||||
active: {line: 0, character: 0}
|
||||
marks: {}
|
||||
finalState:
|
||||
documentContents: "margin: 0;"
|
||||
selections:
|
||||
- anchor: {line: 0, character: 0}
|
||||
active: {line: 0, character: 0}
|
||||
fullTargets: [{type: primitive, mark: {type: cursor}, modifiers: [{type: ordinalScope, scopeType: {type: word}, start: -1, length: 1}]}]
|
@ -65,7 +65,27 @@ export const subtokenFixture: Fixture[] = [
|
||||
expectedOutput: ["mock", "API", "Client", "Factory"],
|
||||
},
|
||||
{
|
||||
input: "mockAPIClient123Factory",
|
||||
expectedOutput: ["mock", "API", "Client", "123", "Factory"],
|
||||
input: "mockAPIClient123FactoryV1",
|
||||
expectedOutput: ["mock", "API", "Client", "123", "Factory", "V", "1"],
|
||||
},
|
||||
{
|
||||
input: "mock_api_client_123_factory_v1",
|
||||
expectedOutput: ["mock", "api", "client", "123", "factory", "v1"],
|
||||
},
|
||||
{
|
||||
input: "v1",
|
||||
expectedOutput: ["v", "1"],
|
||||
},
|
||||
{
|
||||
input: "aaBbÄä",
|
||||
expectedOutput: ["aa", "Bb", "Ää"],
|
||||
},
|
||||
{
|
||||
input: "apiV1 api_v_1",
|
||||
expectedOutput: ["api", "V", "1", "api", "v", "1"],
|
||||
},
|
||||
{
|
||||
input: "_quickBrownFox_",
|
||||
expectedOutput: ["quick", "Brown", "Fox"],
|
||||
},
|
||||
];
|
||||
|
@ -1,12 +1,14 @@
|
||||
import * as assert from "assert";
|
||||
import { SUBWORD_MATCHER } from "../../processTargets/modifiers/subToken";
|
||||
|
||||
import { subWordSplitter } from "../../processTargets/modifiers/subToken";
|
||||
import { subtokenFixture } from "./fixtures/subtoken.fixture";
|
||||
|
||||
suite("subtoken regex matcher", () => {
|
||||
subtokenFixture.forEach(({ input, expectedOutput }) => {
|
||||
test(input, () => {
|
||||
assert.deepStrictEqual(input.match(SUBWORD_MATCHER), expectedOutput);
|
||||
assert.deepStrictEqual(
|
||||
subWordSplitter(input, "anyLang").map(({ text }) => text),
|
||||
expectedOutput
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -1,10 +1,10 @@
|
||||
import { concat, flatten, maxBy, min } from "lodash";
|
||||
import * as vscode from "vscode";
|
||||
import { HatStyleName } from "../core/hatStyles";
|
||||
import { getTokenMatcher } from "../core/tokenizer";
|
||||
import Decorations from "../core/Decorations";
|
||||
import { HatStyleName } from "../core/hatStyles";
|
||||
import { IndividualHatMap } from "../core/IndividualHatMap";
|
||||
import { TokenGraphemeSplitter } from "../core/TokenGraphemeSplitter";
|
||||
import { getMatcher } from "../core/tokenizer";
|
||||
import { Token } from "../typings/Types";
|
||||
import { getDisplayLineMap } from "./getDisplayLineMap";
|
||||
import { getTokenComparator } from "./getTokenComparator";
|
||||
@ -44,9 +44,12 @@ export function addDecorationsToEditors(
|
||||
expansionBehavior: {
|
||||
start: {
|
||||
type: "regex",
|
||||
regex: getTokenMatcher(languageId),
|
||||
regex: getMatcher(languageId).tokenMatcher,
|
||||
},
|
||||
end: {
|
||||
type: "regex",
|
||||
regex: getMatcher(languageId).tokenMatcher,
|
||||
},
|
||||
end: { type: "regex", regex: getTokenMatcher(languageId) },
|
||||
},
|
||||
}))
|
||||
)
|
||||
|
@ -37,3 +37,15 @@ export function matchAll<T>(
|
||||
) {
|
||||
return Array.from(text.matchAll(regex), mapfn);
|
||||
}
|
||||
|
||||
/**
 * A regex match reduced to the matched text and where it was found.
 */
export interface MatchedText {
  /** Offset of the match */
  index: number;

  /** The matched text */
  text: string;
}
|
||||
|
||||
/**
 * Find all matches of {@link regex} in {@link text}, keeping only the offset
 * and the full matched text of every match.
 *
 * @param text The text to search
 * @param regex The regex to match with; forwarded to `matchAll`
 * @returns One entry per match, in match order
 */
export function matchText(text: string, regex: RegExp): MatchedText[] {
  const toMatchedText = (match: RegExpMatchArray): MatchedText => {
    return {
      index: match.index!,
      text: match[0],
    };
  };

  return matchAll(text, regex, toMatchedText);
}
|
||||
|
Loading…
Reference in New Issue
Block a user