diff --git a/src/core/languageTokenizers.ts b/src/core/languageTokenizers.ts
index 9afd3fe05..0cf6500df 100644
--- a/src/core/languageTokenizers.ts
+++ b/src/core/languageTokenizers.ts
@@ -6,5 +6,5 @@ import { LanguageTokenizerOverrides } from "./tokenizer.types";
  * Useful for languages like CSS, SCSS, shell, etc.
  */
 export const languageWithDashedIdentifiers: LanguageTokenizerOverrides = {
-  identifiersRegex: "[\\p{L}\\p{M}_\\-0-9]+",
+  identifierWordDelimiters: ["-", "_"],
 };
diff --git a/src/core/tokenizer.ts b/src/core/tokenizer.ts
index 48f02f584..58f271dd2 100644
--- a/src/core/tokenizer.ts
+++ b/src/core/tokenizer.ts
@@ -1,4 +1,4 @@
-import { mapValues } from "lodash";
+import { escapeRegExp, mapValues } from "lodash";
 import { LanguageId, SupportedLanguageId } from "../languages/constants";
 import { matchAll } from "../util/regex";
 
@@ -45,27 +45,34 @@ const FIXED_TOKENS = [
   "-->",
 ];
 
-const IDENTIFIERS_REGEX = "[\\p{L}\\p{M}_0-9]+";
+export const IDENTIFIER_WORD_REGEXES = ["\\p{L}", "\\p{M}", "\\p{N}"];
+const IDENTIFIER_WORD_DELIMITERS = ["_"];
 const SINGLE_SYMBOLS_REGEX = "[^\\s\\w]";
 const NUMBERS_REGEX = "(?<=[^.\\d]|^)\\d+\\.\\d+(?=[^.\\d]|$)"; // (not-dot/digit digits dot digits not-dot/digit)
 
 const defaultLanguageTokenizerComponents: LanguageTokenizerComponents = {
   fixedTokens: FIXED_TOKENS,
   repeatableSymbols: REPEATABLE_SYMBOLS,
-  identifiersRegex: IDENTIFIERS_REGEX,
+  identifierWordRegexes: IDENTIFIER_WORD_REGEXES,
+  identifierWordDelimiters: IDENTIFIER_WORD_DELIMITERS,
   numbersRegex: NUMBERS_REGEX,
   singleSymbolsRegex: SINGLE_SYMBOLS_REGEX,
 };
+interface Matcher {
+  tokenMatcher: RegExp;
+  identifierMatcher: RegExp;
+  wordMatcher: RegExp;
+}
+const defaultMatcher = generateMatcher();
 
-const defaultTokenMatcher = generateTokenMatcher();
-
-function generateTokenMatcher(
+function generateMatcher(
   languageOverrides: LanguageTokenizerOverrides = {}
-): RegExp {
+): Matcher {
   const {
     fixedTokens,
     repeatableSymbols,
-    identifiersRegex,
+    identifierWordRegexes,
+    identifierWordDelimiters,
     numbersRegex,
     singleSymbolsRegex,
   }: LanguageTokenizerComponents = {
@@ -80,6 +87,12 @@ function generateTokenMatcher(
 
   const fixedTokensRegex = fixedTokens.map(escapeRegExp).join("|");
 
+  const identifierComponents = identifierWordRegexes.concat(
+    identifierWordDelimiters.map(escapeRegExp)
+  );
+  const identifiersRegex = `(${identifierComponents.join("|")})+`;
+  const wordRegex = `(${identifierWordRegexes.join("|")})+`;
+
   // Order matters here.
   const regex = [
     fixedTokensRegex,
@@ -89,7 +102,11 @@ function generateTokenMatcher(
     singleSymbolsRegex,
   ].join("|");
 
-  return new RegExp(regex, "gu");
+  return {
+    identifierMatcher: new RegExp(identifiersRegex, "gu"),
+    wordMatcher: new RegExp(wordRegex, "gu"),
+    tokenMatcher: new RegExp(regex, "gu"),
+  };
 }
 
 const languageTokenizerOverrides: Partial<
@@ -100,15 +117,15 @@ const languageTokenizerOverrides: Partial<
   shellscript: languageWithDashedIdentifiers,
 };
 
-const tokenMatchersForLanguage: Partial<Record<SupportedLanguageId, RegExp>> = mapValues(
-  languageTokenizerOverrides,
-  (val: LanguageTokenizerComponents) => generateTokenMatcher(val)
-);
+const tokenMatchersForLanguage: Partial<Record<SupportedLanguageId, Matcher>> =
+  mapValues(languageTokenizerOverrides, (val: LanguageTokenizerComponents) =>
+    generateMatcher(val)
+  );
 
-export function getTokenMatcher(languageId: string): RegExp {
+export function getMatcher(languageId: string): Matcher {
   return (
     tokenMatchersForLanguage[languageId as SupportedLanguageId] ??
-    defaultTokenMatcher
+    defaultMatcher
   );
 }
 
@@ -117,10 +134,5 @@ export function tokenize<T>(
   languageId: string,
   mapfn: (v: RegExpMatchArray, k: number) => T
 ) {
-  return matchAll(text, getTokenMatcher(languageId), mapfn);
-}
-
-//https://stackoverflow.com/a/6969486
-function escapeRegExp(string: string) {
-  return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
+  return matchAll(text, getMatcher(languageId).tokenMatcher, mapfn);
 }
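(Annotation, not part of the diff.) To make the new `Matcher` shape concrete, here is a rough usage sketch. The composed patterns in the comments assume the default components above; the import path is abbreviated, and any language id without overrides falls back to the default matcher:

```ts
import { getMatcher } from "./src/core/tokenizer";

// With the default components, the composed patterns are:
//   identifierMatcher: /(\p{L}|\p{M}|\p{N}|_)+/gu   (word characters plus delimiters)
//   wordMatcher:       /(\p{L}|\p{M}|\p{N})+/gu     (word characters only)
const { identifierMatcher, wordMatcher } = getMatcher("plaintext");

"foo_bar baz".match(identifierMatcher); // => ["foo_bar", "baz"]
"foo_bar baz".match(wordMatcher); // => ["foo", "bar", "baz"]

// scss adds "-" as an identifier word delimiter (see languageTokenizers.ts
// above), so a dashed identifier stays a single identifier:
"margin-top: 0".match(getMatcher("scss").identifierMatcher); // => ["margin-top", "0"]
```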
diff --git a/src/core/tokenizer.types.ts b/src/core/tokenizer.types.ts
index 8bf1bf832..556935712 100644
--- a/src/core/tokenizer.types.ts
+++ b/src/core/tokenizer.types.ts
@@ -2,7 +2,20 @@
 export interface LanguageTokenizerComponents {
   fixedTokens: string[];
-  identifiersRegex: string;
+
+  /**
+   * Each element of this list is a regex matching characters that can appear
+   * inside a token and will be considered part of a word. Note that there is
+   * no need to add a `*` here, as the regex will be allowed to repeat.
+   */
+  identifierWordRegexes: string[];
+
+  /**
+   * These strings are allowed inside identifiers, and act as separators between
+   * the words of the identifier. They are raw strings, and will be regex-escaped.
+   */
+  identifierWordDelimiters: string[];
+
   numbersRegex: string;
   repeatableSymbols: string[];
   singleSymbolsRegex: string;
 }
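(Annotation, not part of the diff.) For illustration, a hypothetical override using both fields might look like the sketch below. This assumes `LanguageTokenizerOverrides` is a partial version of `LanguageTokenizerComponents`; the language itself is invented:

```ts
import { LanguageTokenizerOverrides } from "./tokenizer.types";

// Hypothetical language where "$" is also a word character, and both "-" and
// "_" separate words inside an identifier. Word regexes are regex source
// fragments with no repetition operator; delimiters are raw strings that
// generateMatcher escapes via escapeRegExp.
const hypotheticalOverrides: LanguageTokenizerOverrides = {
  identifierWordRegexes: ["\\p{L}", "\\p{M}", "\\p{N}", "\\$"],
  identifierWordDelimiters: ["-", "_"],
};
```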
diff --git a/src/processTargets/modifiers/scopeTypeStages/SubTokenStages.ts b/src/processTargets/modifiers/scopeTypeStages/SubTokenStages.ts
index 06be51abd..7f54dffa6 100644
--- a/src/processTargets/modifiers/scopeTypeStages/SubTokenStages.ts
+++ b/src/processTargets/modifiers/scopeTypeStages/SubTokenStages.ts
@@ -7,17 +7,14 @@ import {
   EveryScopeModifier,
 } from "../../../typings/targetDescriptor.types";
 import { ProcessedTargetsContext } from "../../../typings/Types";
-import { matchAll } from "../../../util/regex";
+import { MatchedText, matchText } from "../../../util/regex";
 import { ModifierStage } from "../../PipelineStages.types";
 import { PlainTarget, SubTokenWordTarget } from "../../targets";
-import { SUBWORD_MATCHER } from "../subToken";
+import { subWordSplitter } from "../subToken";
 import { getTokenRangeForSelection } from "./TokenStage";
 
 abstract class SubTokenStage implements ModifierStage {
-  constructor(
-    private modifier: ContainingScopeModifier | EveryScopeModifier,
-    private regex: RegExp
-  ) {}
+  constructor(private modifier: ContainingScopeModifier | EveryScopeModifier) {}
 
   run(context: ProcessedTargetsContext, target: Target): Target[] {
     const { document } = target.editor;
@@ -27,14 +24,12 @@ abstract class SubTokenStage implements ModifierStage {
     );
     const text = document.getText(tokenRange);
     const offset = document.offsetAt(tokenRange.start);
-
-    const contentRanges = matchAll(
-      text,
-      this.regex,
+    const matches = this.getMatchedText(text, document.languageId);
+    const contentRanges = matches.map(
       (match) =>
         new Range(
-          document.positionAt(offset + match.index!),
-          document.positionAt(offset + match.index! + match[0].length)
+          document.positionAt(offset + match.index),
+          document.positionAt(offset + match.index + match.text.length)
         )
     );
 
@@ -107,6 +102,14 @@ abstract class SubTokenStage implements ModifierStage {
     );
   }
 
+  /**
+   * Return matches for {@link text}
+   */
+  protected abstract getMatchedText(
+    text: string,
+    languageId: string
+  ): MatchedText[];
+
   /**
    * Create one target for each element of {@link contentRanges}
   */
@@ -119,7 +122,11 @@ abstract class SubTokenStage implements ModifierStage {
 
 export class WordStage extends SubTokenStage {
   constructor(modifier: ContainingScopeModifier | EveryScopeModifier) {
-    super(modifier, SUBWORD_MATCHER);
+    super(modifier);
+  }
+
+  protected getMatchedText(text: string, languageId: string): MatchedText[] {
+    return subWordSplitter(text, languageId);
   }
 
   protected createTargetsFromRanges(
@@ -166,7 +173,11 @@ export class WordStage extends SubTokenStage {
 
 export class CharacterStage extends SubTokenStage {
   constructor(modifier: ContainingScopeModifier | EveryScopeModifier) {
-    super(modifier, GRAPHEME_SPLIT_REGEX);
+    super(modifier);
+  }
+
+  protected getMatchedText(text: string): MatchedText[] {
+    return matchText(text, GRAPHEME_SPLIT_REGEX);
   }
 
   protected createTargetsFromRanges(
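(Annotation, not part of the diff.) The offset arithmetic in `run` is worth spelling out: `MatchedText.index` is relative to the token text, so it is shifted by the token's document offset before becoming a `Range`. A minimal sketch, with plain numbers standing in for vscode positions so it runs anywhere:

```ts
interface MatchedText {
  index: number;
  text: string;
}

// Shift a token-relative match to document offsets, mirroring the two
// document.positionAt calls in SubTokenStage.run above.
function toOffsetRange(tokenOffset: number, match: MatchedText) {
  return {
    start: tokenOffset + match.index,
    end: tokenOffset + match.index + match.text.length,
  };
}

// Sub-token "Bar" of the token "fooBar" starting at document offset 100:
toOffsetRange(100, { index: 3, text: "Bar" }); // => { start: 103, end: 106 }
```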
diff --git a/src/processTargets/modifiers/subToken.ts b/src/processTargets/modifiers/subToken.ts
index ce097f5d6..137ea7d14 100644
--- a/src/processTargets/modifiers/subToken.ts
+++ b/src/processTargets/modifiers/subToken.ts
@@ -1 +1,30 @@
-export const SUBWORD_MATCHER = /[A-Z]?[a-z]+|[A-Z]+(?![a-z])|[0-9]+/g;
+import { getMatcher } from "../../core/tokenizer";
+import { matchText } from "../../util/regex";
+
+const camelRegex = /\p{Lu}?\p{Ll}+|\p{Lu}+(?!\p{Ll})|\p{N}+/gu;
+
+export function subWordSplitter(text: string, languageId: string) {
+  // First split the text into identifiers. The input text can contain
+  // multiple tokens/identifiers, and these can have different formats,
+  // e.g. `publicApiV1 public_api_v1`.
+  const { identifierMatcher, wordMatcher } = getMatcher(languageId);
+  return matchText(text, identifierMatcher).flatMap((t) =>
+    splitIdentifier(wordMatcher, t.text, t.index)
+  );
+}
+
+function splitIdentifier(wordMatcher: RegExp, text: string, index: number) {
+  // First try to split on word delimiters (non-letter characters)
+  const wordMatches = matchText(text, wordMatcher);
+
+  const matches =
+    wordMatches.length > 1
+      ? wordMatches
+      : // Otherwise, fall back to splitting on camel case
+        matchText(text, camelRegex);
+
+  return matches.map((match) => ({
+    index: index + match.index,
+    text: match.text,
+  }));
+}
diff --git a/src/test/suite/fixtures/recorded/subtoken/chuckLastWord.yml b/src/test/suite/fixtures/recorded/subtoken/chuckLastWord.yml
new file mode 100644
index 000000000..25be2450e
--- /dev/null
+++ b/src/test/suite/fixtures/recorded/subtoken/chuckLastWord.yml
@@ -0,0 +1,25 @@
+languageId: scss
+command:
+  spokenForm: chuck last word
+  version: 3
+  targets:
+    - type: primitive
+      modifiers:
+        - type: ordinalScope
+          scopeType: {type: word}
+          start: -1
+          length: 1
+  usePrePhraseSnapshot: true
+  action: {name: remove}
+initialState:
+  documentContents: "margin-top: 0;"
+  selections:
+    - anchor: {line: 0, character: 0}
+      active: {line: 0, character: 0}
+  marks: {}
+finalState:
+  documentContents: "margin: 0;"
+  selections:
+    - anchor: {line: 0, character: 0}
+      active: {line: 0, character: 0}
+fullTargets: [{type: primitive, mark: {type: cursor}, modifiers: [{type: ordinalScope, scopeType: {type: word}, start: -1, length: 1}]}]
diff --git a/src/test/suite/fixtures/subtoken.fixture.ts b/src/test/suite/fixtures/subtoken.fixture.ts
index b9a5d31d9..370ffea28 100644
--- a/src/test/suite/fixtures/subtoken.fixture.ts
+++ b/src/test/suite/fixtures/subtoken.fixture.ts
@@ -65,7 +65,27 @@ export const subtokenFixture: Fixture[] = [
     expectedOutput: ["mock", "API", "Client", "Factory"],
   },
   {
-    input: "mockAPIClient123Factory",
-    expectedOutput: ["mock", "API", "Client", "123", "Factory"],
+    input: "mockAPIClient123FactoryV1",
+    expectedOutput: ["mock", "API", "Client", "123", "Factory", "V", "1"],
+  },
+  {
+    input: "mock_api_client_123_factory_v1",
+    expectedOutput: ["mock", "api", "client", "123", "factory", "v1"],
+  },
+  {
+    input: "v1",
+    expectedOutput: ["v", "1"],
+  },
+  {
+    input: "aaBbÄä",
+    expectedOutput: ["aa", "Bb", "Ää"],
+  },
+  {
+    input: "apiV1 api_v_1",
+    expectedOutput: ["api", "V", "1", "api", "v", "1"],
+  },
+  {
+    input: "_quickBrownFox_",
+    expectedOutput: ["quick", "Brown", "Fox"],
   },
 ];
diff --git a/src/test/suite/subtoken.test.ts b/src/test/suite/subtoken.test.ts
index 272ef3bda..235ad56ec 100644
--- a/src/test/suite/subtoken.test.ts
+++ b/src/test/suite/subtoken.test.ts
@@ -1,12 +1,14 @@
 import * as assert from "assert";
-import { SUBWORD_MATCHER } from "../../processTargets/modifiers/subToken";
-
+import { subWordSplitter } from "../../processTargets/modifiers/subToken";
 import { subtokenFixture } from "./fixtures/subtoken.fixture";
 
 suite("subtoken regex matcher", () => {
   subtokenFixture.forEach(({ input, expectedOutput }) => {
     test(input, () => {
-      assert.deepStrictEqual(input.match(SUBWORD_MATCHER), expectedOutput);
+      assert.deepStrictEqual(
+        subWordSplitter(input, "anyLang").map(({ text }) => text),
+        expectedOutput
+      );
     });
   });
 });
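(Annotation, not part of the diff.) The behaviour of `subWordSplitter` is easiest to see on a few worked examples; the expected outputs below follow by hand from the code above, assuming `typescript` uses the default tokenizer components and the import path is abbreviated:

```ts
import { subWordSplitter } from "./src/processTargets/modifiers/subToken";

// In scss, "-" is part of the identifier, and the delimiter split wins, so
// no camel-case splitting happens inside "fooBar-baz":
subWordSplitter("margin-top", "scss");
// => [{ index: 0, text: "margin" }, { index: 7, text: "top" }]
subWordSplitter("fooBar-baz", "scss");
// => [{ index: 0, text: "fooBar" }, { index: 7, text: "baz" }]

// With the default components, "-" is not part of an identifier, so the same
// text becomes two identifiers, and the first falls back to camel-case splitting:
subWordSplitter("fooBar-baz", "typescript");
// => [{ index: 0, text: "foo" }, { index: 3, text: "Bar" }, { index: 7, text: "baz" }]
```

Note the design choice this encodes: when an identifier contains explicit delimiters, those take precedence, which is why the fixture `mock_api_client_123_factory_v1` keeps `v1` together while bare `v1` is camel-split into `v`, `1`.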
"../core/tokenizer"; import { Token } from "../typings/Types"; import { getDisplayLineMap } from "./getDisplayLineMap"; import { getTokenComparator } from "./getTokenComparator"; @@ -44,9 +44,12 @@ export function addDecorationsToEditors( expansionBehavior: { start: { type: "regex", - regex: getTokenMatcher(languageId), + regex: getMatcher(languageId).tokenMatcher, + }, + end: { + type: "regex", + regex: getMatcher(languageId).tokenMatcher, }, - end: { type: "regex", regex: getTokenMatcher(languageId) }, }, })) ) diff --git a/src/util/regex.ts b/src/util/regex.ts index 7210250f5..31b535b58 100644 --- a/src/util/regex.ts +++ b/src/util/regex.ts @@ -37,3 +37,15 @@ export function matchAll( ) { return Array.from(text.matchAll(regex), mapfn); } + +export interface MatchedText { + index: number; + text: string; +} + +export function matchText(text: string, regex: RegExp): MatchedText[] { + return matchAll(text, regex, (match) => ({ + index: match.index!, + text: match[0], + })); +}