Split string tokenization logic into basic tokenization and unescaping. Even though this is a worse way to do it (less efficient, with more unhandled edge cases), it is apparently how the CPython tokenizer and parser work, so we need to maintain compatibility.

This commit is contained in:
Eric Traut 2019-06-17 09:19:22 -06:00
parent 0f9b59579e
commit 5856f2fff8
7 changed files with 483 additions and 345 deletions

View File

@ -27,6 +27,7 @@ import { AwaitExpressionNode, ClassNode, ErrorExpressionNode,
ModuleNode, NonlocalNode, RaiseNode, StatementListNode, StatementNode,
StringListNode, SuiteNode, TryNode, TypeAnnotationExpressionNode, WhileNode,
YieldExpressionNode, YieldFromExpressionNode } from '../parser/parseNodes';
import { StringTokenUtils } from '../parser/stringTokenUtils';
import { StringToken, StringTokenFlags } from '../parser/tokenizerTypes';
import { ScopeUtils } from '../scopeUtils';
import { AnalyzerFileInfo } from './analyzerFileInfo';
@ -382,38 +383,17 @@ export abstract class SemanticAnalyzer extends ParseTreeWalker {
}
visitStringList(node: StringListNode): boolean {
for (let string of node.strings) {
const stringToken = string.token;
if (stringToken.flags & StringTokenFlags.Unterminated) {
this._addError('String literal is unterminated', stringToken);
}
for (let stringNode of node.strings) {
if (stringNode.hasInvalidEscapeSequence) {
const unescapedResult = StringTokenUtils.getUnescapedString(stringNode.token);
if (stringToken.flags & StringTokenFlags.NonAsciiInBytes) {
this._addError('Non-ASCII character not allowed in bytes string literal', stringToken);
}
if (stringToken.flags & StringTokenFlags.Format) {
if (this._fileInfo.executionEnvironment.pythonVersion < PythonVersion.V36) {
this._addError('Format string literals (f-strings) require Python 3.6 or newer', stringToken);
}
if (stringToken.flags & StringTokenFlags.Bytes) {
this._addError('Format string literals (f-strings) cannot be binary', stringToken);
}
if (stringToken.flags & StringTokenFlags.Unicode) {
this._addError('Format string literals (f-strings) cannot be unicode', stringToken);
}
}
if (stringToken.flags & StringTokenFlags.UnrecognizedEscape) {
if (stringToken.invalidEscapeOffsets) {
stringToken.invalidEscapeOffsets.forEach(offset => {
const textRange = new TextRange(stringToken.start + offset, 1);
this._addDiagnostic(this._fileInfo.diagnosticSettings.reportInvalidStringEscapeSequence,
'Unsupported escape sequence in string literal', textRange);
});
}
unescapedResult.invalidEscapeOffsets.forEach(offset => {
const start = stringNode.token.start + stringNode.token.prefixLength +
stringNode.token.quoteMarkLength + offset;
const textRange = new TextRange(start - 1, 2);
this._addDiagnostic(this._fileInfo.diagnosticSettings.reportInvalidStringEscapeSequence,
'Unsupported escape sequence in string literal', textRange);
});
}
}
@ -519,7 +499,7 @@ export abstract class SemanticAnalyzer extends ParseTreeWalker {
return undefined;
}
return DocStringUtils.decodeDocString(docStringToken.value);
return DocStringUtils.decodeDocString(docStringNode.strings[0].value);
}
private _validateYieldUsage(node: YieldExpressionNode | YieldFromExpressionNode) {

View File

@ -765,10 +765,14 @@ export class NumberNode extends ExpressionNode {
export class StringNode extends ExpressionNode {
readonly nodeType = ParseNodeType.String;
token: StringToken;
value: string;
hasInvalidEscapeSequence: boolean;
constructor(token: StringToken) {
constructor(token: StringToken, unescapedValue: string, hasInvalidEscapeSequence: boolean) {
super(token);
this.token = token;
this.value = unescapedValue;
this.hasInvalidEscapeSequence = hasInvalidEscapeSequence;
}
getChildren(): RecursiveParseNodeArray {
@ -776,17 +780,21 @@ export class StringNode extends ExpressionNode {
}
getValue(): string {
return this.token.value;
return this.value;
}
}
export class FormatStringNode extends ExpressionNode {
readonly nodeType = ParseNodeType.String;
token: StringToken;
value: string;
hasInvalidEscapeSequence: boolean;
constructor(token: StringToken) {
constructor(token: StringToken, unescapedValue: string, hasInvalidEscapeSequence: boolean) {
super(token);
this.token = token;
this.value = unescapedValue;
this.hasInvalidEscapeSequence = hasInvalidEscapeSequence;
}
getChildren(): RecursiveParseNodeArray {
@ -794,7 +802,7 @@ export class FormatStringNode extends ExpressionNode {
}
getValue(): string {
return this.token.value;
return this.value;
}
}

View File

@ -39,6 +39,7 @@ import { ArgumentCategory, ArgumentNode, AssertNode,
TupleExpressionNode, TypeAnnotationExpressionNode, UnaryExpressionNode,
UnpackExpressionNode, WhileNode, WithItemNode, WithNode, YieldExpressionNode,
YieldFromExpressionNode } from './parseNodes';
import { StringTokenUtils, UnescapedString } from './stringTokenUtils';
import { Tokenizer, TokenizerOutput } from './tokenizer';
import { DedentToken, IdentifierToken, KeywordToken, KeywordType,
NumberToken, OperatorToken, OperatorType, StringToken,
@ -58,6 +59,7 @@ export class ParseOptions {
isStubFile: boolean;
pythonVersion: PythonVersion;
reportInvalidStringEscapeSequence: boolean;
}
export interface ParseResults {
@ -2240,6 +2242,36 @@ export class Parser {
this._isParsingTypeAnnotation = wasParsingTypeAnnotation;
}
// Emits parse errors for problems detected on a string token: an
// unterminated literal, non-ASCII characters in a bytes literal, and
// illegal or unsupported uses of the f-string prefix.
private _reportStringTokenErrors(stringToken: StringToken, unescapedResult: UnescapedString) {
    const tokenFlags = stringToken.flags;
    const hasFlag = (flag: StringTokenFlags) => (tokenFlags & flag) !== 0;

    if (hasFlag(StringTokenFlags.Unterminated)) {
        this._addError('String literal is unterminated', stringToken);
    }

    if (unescapedResult.nonAsciiInBytes) {
        this._addError('Non-ASCII character not allowed in bytes string literal', stringToken);
    }

    // The remaining checks apply only to format strings.
    if (!hasFlag(StringTokenFlags.Format)) {
        return;
    }

    if (this._getLanguageVersion() < PythonVersion.V36) {
        this._addError('Format string literals (f-strings) require Python 3.6 or newer', stringToken);
    }

    if (hasFlag(StringTokenFlags.Bytes)) {
        this._addError('Format string literals (f-strings) cannot be binary', stringToken);
    }

    if (hasFlag(StringTokenFlags.Unicode)) {
        this._addError('Format string literals (f-strings) cannot be unicode', stringToken);
    }
}
// Builds a StringNode from a string token: unescapes the token text,
// reports any token-level errors, and records whether the literal
// contained at least one invalid escape sequence.
private _makeStringNode(stringToken: StringToken): StringNode {
    const unescaped = StringTokenUtils.getUnescapedString(stringToken);

    this._reportStringTokenErrors(stringToken, unescaped);

    const hasInvalidEscape = unescaped.invalidEscapeOffsets.length > 0;
    return new StringNode(stringToken, unescaped.value, hasInvalidEscape);
}
private _getTypeAnnotationComment(): ExpressionNode | undefined {
if (this._tokenIndex === 0) {
return undefined;
@ -2263,30 +2295,33 @@ export class Parser {
const typeString = match[2];
const tokenOffset = curToken.end + match[1].length;
const stringToken = new StringToken(tokenOffset,
typeString.length, StringTokenFlags.None, typeString, 0,
undefined, undefined);
const stringNode = new StringListNode([new StringNode(stringToken)]);
typeString.length, StringTokenFlags.None, typeString, 0, undefined);
const stringNode = this._makeStringNode(stringToken);
const stringListNode = new StringListNode([stringNode]);
let parser = new Parser();
let parseResults = parser.parseTextExpression(this._fileContents!,
tokenOffset, typeString.length, this._parseOptions);
parseResults.diagnostics.forEach(diag => {
this._addError(diag.message, stringNode);
this._addError(diag.message, stringListNode);
});
if (!parseResults.parseTree) {
return undefined;
}
stringNode.typeAnnotation = parseResults.parseTree;
stringListNode.typeAnnotation = parseResults.parseTree;
return stringNode;
return stringListNode;
}
private _parseFormatString(token: StringToken): FormatStringNode {
private _parseFormatString(stringToken: StringToken): FormatStringNode {
const unescapedResult = StringTokenUtils.getUnescapedString(stringToken);
this._reportStringTokenErrors(stringToken, unescapedResult);
// TODO - need to implement
return new FormatStringNode(token);
return new FormatStringNode(stringToken, unescapedResult.value, unescapedResult.invalidEscapeOffsets.length > 0);
}
private _parseStringList(): StringListNode {
@ -2297,7 +2332,7 @@ export class Parser {
if (stringToken.flags & StringTokenFlags.Format) {
stringList.push(this._parseFormatString(stringToken));
} else {
stringList.push(new StringNode(stringToken));
stringList.push(this._makeStringNode(stringToken));
}
}
@ -2315,20 +2350,19 @@ export class Parser {
this._addError('Type hints cannot use format string literals (f-strings)', stringNode);
} else {
const stringToken = stringNode.strings[0].token;
const stringValue = stringToken.value;
const stringValue = StringTokenUtils.getUnescapedString(stringNode.strings[0].token);
const unescapedString = stringValue.value;
const tokenOffset = stringToken.start;
// Add one character to the prefix to also include the quote.
const prefixLength = stringToken.prefixLength + 1;
const prefixLength = stringToken.prefixLength + stringToken.quoteMarkLength;
// Don't allow escape characters because we have no way of mapping
// error ranges back to the escaped text.
if (stringToken.value.length !== stringToken.length - prefixLength - 1) {
if (unescapedString.length !== stringToken.length - prefixLength - stringToken.quoteMarkLength) {
this._addError('Type hints cannot contain escape characters', stringNode);
} else {
let parser = new Parser();
let parseResults = parser.parseTextExpression(this._fileContents!,
tokenOffset + prefixLength, stringValue.length, this._parseOptions);
tokenOffset + prefixLength, unescapedString.length, this._parseOptions);
parseResults.diagnostics.forEach(diag => {
this._addError(diag.message, stringNode);

View File

@ -0,0 +1,294 @@
/*
* stringTokenUtils.ts
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT license.
* Author: Eric Traut
*
* Methods that handle unescaping of escaped string token
* literal values.
*/
import Char from 'typescript-char';
import { StringToken, StringTokenFlags } from './tokenizerTypes';
// Result produced by StringTokenUtils.getUnescapedString for a
// single string token.
export interface UnescapedString {
// The string contents after escape sequences have been processed.
value: string;
// Offsets into the token's escapedValue, one per unsupported escape
// sequence. Each offset refers to the character that follows the
// backslash. Nothing is recorded for raw strings.
invalidEscapeOffsets: number[];
// True if the token has the Bytes flag and a character with a code
// of 128 or above appears outside of an escape sequence.
nonAsciiInBytes: boolean;
}
// Static helpers that convert the escaped (as-written) text of a
// string token into its unescaped value.
export class StringTokenUtils {
    // Unescapes the text stored in stringToken.escapedValue.
    //
    // Returns the unescaped value, the offsets (into escapedValue) of the
    // character following the backslash of every unsupported escape
    // sequence, and a flag indicating whether a bytes literal contains a
    // non-ASCII character outside of an escape sequence.
    //
    // For raw strings (Raw flag) escape sequences are copied through
    // verbatim and no invalid escapes are reported.
    static getUnescapedString(stringToken: StringToken): UnescapedString {
        const escapedString = stringToken.escapedValue;
        const isRaw = (stringToken.flags & StringTokenFlags.Raw) !== 0;
        const isBytes = (stringToken.flags & StringTokenFlags.Bytes) !== 0;

        // Index of the character currently being examined.
        let strOffset = 0;
        const output: UnescapedString = {
            value: '',
            invalidEscapeOffsets: [],
            nonAsciiInBytes: false
        };

        const addInvalidEscapeOffset = () => {
            // Invalid escapes are not reported for raw strings.
            if (!isRaw) {
                output.invalidEscapeOffsets.push(strOffset);
            }
        };

        // Returns the character code at strOffset + offset, or
        // Char.EndOfText when that position is past the end of the text.
        // Used for lookahead only; the main loop checks the string length
        // directly so that a literal U+0003 is not mistaken for the end.
        const getEscapedCharacter = (offset = 0) => {
            if (strOffset + offset >= escapedString.length) {
                return Char.EndOfText;
            }

            return escapedString.charCodeAt(strOffset + offset);
        };

        // Handles \x, \u and \U escapes. On entry strOffset refers to the
        // 'x', 'u' or 'U'. If fewer than digitCount hex digits follow, the
        // escape is recorded as invalid and copied through as-is.
        const scanHexEscape = (digitCount: number) => {
            let foundIllegalHexDigit = false;
            let hexValue = 0;
            let localValue = '';

            for (let i = 0; i < digitCount; i++) {
                const charCode = getEscapedCharacter(1 + i);
                if (!this._isHexCharCode(charCode)) {
                    foundIllegalHexDigit = true;
                    break;
                }
                hexValue = 16 * hexValue + this._getHexDigitValue(charCode);
            }

            if (foundIllegalHexDigit) {
                addInvalidEscapeOffset();
                localValue = '\\' + String.fromCharCode(getEscapedCharacter());
                strOffset++;
            } else {
                localValue = this._codePointToString(hexValue);
                strOffset += 1 + digitCount;
            }

            return localValue;
        };

        while (strOffset < escapedString.length) {
            let curChar = getEscapedCharacter();

            if (curChar === Char.Backslash) {
                // Move past the escape (backslash) character.
                strOffset++;

                if (strOffset >= escapedString.length) {
                    // A lone backslash at the very end of the text. Keep the
                    // backslash (and the invalid-escape report for non-raw
                    // strings) but do not append the end-of-text sentinel
                    // to the value.
                    addInvalidEscapeOffset();
                    output.value += '\\';
                    break;
                }

                curChar = getEscapedCharacter();
                let localValue = '';

                if (curChar === Char.CarriageReturn || curChar === Char.LineFeed) {
                    // Backslash followed by a new line (one or two
                    // characters). It produces no output unless the string
                    // is raw, in which case it is copied through verbatim.
                    if (curChar === Char.CarriageReturn && getEscapedCharacter(1) === Char.LineFeed) {
                        if (isRaw) {
                            localValue += String.fromCharCode(curChar);
                        }
                        strOffset++;
                        curChar = getEscapedCharacter();
                    }
                    if (isRaw) {
                        localValue = '\\' + localValue + String.fromCharCode(curChar);
                    }
                    strOffset++;
                } else if (isRaw) {
                    localValue = '\\' + String.fromCharCode(curChar);
                    strOffset++;
                } else {
                    switch (curChar) {
                        case Char.Backslash:
                        case Char.SingleQuote:
                        case Char.DoubleQuote:
                            localValue = String.fromCharCode(curChar);
                            strOffset++;
                            break;

                        case Char.a:
                            localValue = '\u0007';
                            strOffset++;
                            break;

                        case Char.b:
                            localValue = '\b';
                            strOffset++;
                            break;

                        case Char.f:
                            localValue = '\f';
                            strOffset++;
                            break;

                        case Char.n:
                            localValue = '\n';
                            strOffset++;
                            break;

                        case Char.r:
                            localValue = '\r';
                            strOffset++;
                            break;

                        case Char.t:
                            localValue = '\t';
                            strOffset++;
                            break;

                        case Char.v:
                            localValue = '\v';
                            strOffset++;
                            break;

                        case Char.x:
                            localValue = scanHexEscape(2);
                            break;

                        case Char.N: {
                            let foundIllegalChar = false;
                            // Offset, relative to the 'N', of the character
                            // being examined.
                            let charCount = 1;

                            if (getEscapedCharacter(charCount) !== Char.OpenBrace) {
                                foundIllegalChar = true;
                            } else {
                                charCount++;
                                while (true) {
                                    const lookaheadChar = getEscapedCharacter(charCount);
                                    if (lookaheadChar === Char.CloseBrace) {
                                        break;
                                    } else if (!this._isUnicodeNameChar(lookaheadChar)) {
                                        // Also reached when the text ends
                                        // before the closing brace.
                                        foundIllegalChar = true;
                                        break;
                                    } else {
                                        charCount++;
                                    }
                                }
                            }

                            if (foundIllegalChar) {
                                addInvalidEscapeOffset();
                                localValue = '\\' + String.fromCharCode(curChar);
                                strOffset++;
                            } else {
                                // We don't have the Unicode name database handy, so
                                // assume that the name is valid and use a '-' as a
                                // replacement character.
                                localValue = '-';
                                strOffset += 1 + charCount;
                            }
                            break;
                        }

                        case Char.u:
                            localValue = scanHexEscape(4);
                            break;

                        case Char.U:
                            localValue = scanHexEscape(8);
                            break;

                        default:
                            if (this._isOctalCharCode(curChar)) {
                                // One to three octal digits.
                                let octalCode = curChar - Char._0;
                                strOffset++;
                                curChar = getEscapedCharacter();
                                if (this._isOctalCharCode(curChar)) {
                                    octalCode = octalCode * 8 + curChar - Char._0;
                                    strOffset++;
                                    curChar = getEscapedCharacter();

                                    if (this._isOctalCharCode(curChar)) {
                                        octalCode = octalCode * 8 + curChar - Char._0;
                                        strOffset++;
                                    }
                                }

                                localValue = String.fromCharCode(octalCode);
                            } else {
                                // Unsupported escape: keep it as written.
                                localValue = '\\' + String.fromCharCode(curChar);
                                addInvalidEscapeOffset();
                                strOffset++;
                            }
                            break;
                    }
                }

                output.value += localValue;
            } else if (curChar === Char.LineFeed || curChar === Char.CarriageReturn) {
                // Copy an unescaped new line (either one or two characters)
                // to the output unchanged.
                if (curChar === Char.CarriageReturn && getEscapedCharacter(1) === Char.LineFeed) {
                    output.value += String.fromCharCode(curChar);
                    strOffset++;
                    curChar = getEscapedCharacter();
                }

                output.value += String.fromCharCode(curChar);
                strOffset++;
            } else {
                // There's nothing to unescape, so output the escaped character directly.
                if (isBytes && curChar >= 128) {
                    output.nonAsciiInBytes = true;
                }

                output.value += String.fromCharCode(curChar);
                strOffset++;
            }
        }

        return output;
    }

    // Converts a numeric escape value to a string. Values in the range
    // 0x10000-0x10FFFF are encoded as a UTF-16 surrogate pair; all other
    // values are passed to String.fromCharCode unchanged.
    private static _codePointToString(codePoint: number): string {
        if (codePoint > 0xFFFF && codePoint <= 0x10FFFF) {
            const offset = codePoint - 0x10000;
            return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
        }

        return String.fromCharCode(codePoint);
    }

    // Returns true for characters accepted inside the braces of a
    // \N{...} escape: ASCII letters, digits, spaces and hyphens.
    private static _isUnicodeNameChar(charCode: number): boolean {
        return this._isAlphaNumericChar(charCode) ||
            charCode === Char.Space ||
            charCode === Char.Hyphen;
    }

    // Returns true for ASCII digits and ASCII letters of either case.
    private static _isAlphaNumericChar(charCode: number): boolean {
        if (charCode >= Char._0 && charCode <= Char._9) {
            return true;
        }

        if (charCode >= Char.a && charCode <= Char.z) {
            return true;
        }

        if (charCode >= Char.A && charCode <= Char.Z) {
            return true;
        }

        return false;
    }

    // Returns true for the digits 0 through 7.
    private static _isOctalCharCode(charCode: number): boolean {
        return charCode >= Char._0 && charCode <= Char._7;
    }

    // Returns true for 0-9, a-f and A-F.
    private static _isHexCharCode(charCode: number): boolean {
        if (charCode >= Char._0 && charCode <= Char._9) {
            return true;
        }

        if (charCode >= Char.a && charCode <= Char.f) {
            return true;
        }

        if (charCode >= Char.A && charCode <= Char.F) {
            return true;
        }

        return false;
    }

    // Returns the numeric value (0-15) of a hex digit character, or 0
    // if the character is not a hex digit.
    private static _getHexDigitValue(charCode: number): number {
        if (charCode >= Char._0 && charCode <= Char._9) {
            return charCode - Char._0;
        }

        if (charCode >= Char.a && charCode <= Char.f) {
            return charCode - Char.a + 10;
        }

        if (charCode >= Char.A && charCode <= Char.F) {
            return charCode - Char.A + 10;
        }

        return 0;
    }
}

View File

@ -110,9 +110,8 @@ export interface TokenizerOutput {
}
interface StringScannerOutput {
value: string;
escapedValue: string;
flags: StringTokenFlags;
invalidEscapeOffsets?: number[];
}
export class Tokenizer {
@ -805,218 +804,63 @@ export class Tokenizer {
this._cs.moveNext();
}
const stringLiteralInfo = this._skipToEndOfStringLiteral(flags, start);
const stringLiteralInfo = this._skipToEndOfStringLiteral(flags);
let end = this._cs.position;
this._tokens.push(new StringToken(start, end - start, stringLiteralInfo.flags,
stringLiteralInfo.value, stringPrefixLength, stringLiteralInfo.invalidEscapeOffsets,
this._getComments()));
stringLiteralInfo.escapedValue, stringPrefixLength, this._getComments()));
}
private _skipToEndOfStringLiteral(flags: StringTokenFlags, startPosition: number): StringScannerOutput {
private _skipToEndOfStringLiteral(flags: StringTokenFlags): StringScannerOutput {
const quoteChar = (flags & StringTokenFlags.SingleQuote) ? Char.SingleQuote : Char.DoubleQuote;
const isTriplicate = (flags & StringTokenFlags.Triplicate) !== 0;
const isRaw = (flags & StringTokenFlags.Raw) !== 0;
const isBytes = (flags & StringTokenFlags.Bytes) !== 0;
let unescapedValue = '';
let invalidEscapeOffsets: number[] | undefined;
const addInvalidEscapeOffset = () => {
// Invalid escapes are not reported for raw strings.
if ((flags & StringTokenFlags.Raw) === 0) {
flags |= StringTokenFlags.UnrecognizedEscape;
if (!invalidEscapeOffsets) {
invalidEscapeOffsets = [];
}
invalidEscapeOffsets.push(this._cs.position - startPosition);
}
};
const scanHexEscape = (digitCount: number) => {
let foundIllegalHexDigit = false;
let hexValue = 0;
let localValue = '';
for (let i = 0; i < digitCount; i++) {
const charCode = this._cs.lookAhead(1 + i);
if (!this._isHexCharCode(charCode)) {
foundIllegalHexDigit = true;
break;
}
hexValue = 16 * hexValue + this._getHexDigitValue(charCode);
}
if (foundIllegalHexDigit) {
addInvalidEscapeOffset();
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
} else {
localValue = String.fromCharCode(hexValue);
this._cs.advance(1 + digitCount);
}
return localValue;
};
let escapedValue = '';
while (true) {
if (this._cs.isEndOfStream()) {
// Hit the end of file without a termination.
flags |= StringTokenFlags.Unterminated;
return { value: unescapedValue, flags, invalidEscapeOffsets };
return { escapedValue, flags };
}
if (this._cs.currentChar === Char.Backslash) {
escapedValue += String.fromCharCode(this._cs.currentChar);
// Move past the escape (backslash) character.
this._cs.moveNext();
let localValue = '';
if (this._cs.getCurrentChar() === Char.CarriageReturn || this._cs.getCurrentChar() === Char.LineFeed) {
if (this._cs.getCurrentChar() === Char.CarriageReturn && this._cs.nextChar === Char.LineFeed) {
if (isRaw) {
localValue += String.fromCharCode(this._cs.currentChar);
escapedValue += String.fromCharCode(this._cs.getCurrentChar());
}
this._cs.moveNext();
}
if (isRaw) {
localValue = '\\' + localValue + String.fromCharCode(this._cs.currentChar);
escapedValue += String.fromCharCode(this._cs.getCurrentChar());
}
this._cs.moveNext();
this._addLineRange();
} else {
if (isRaw) {
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
} else {
switch (this._cs.getCurrentChar()) {
case Char.Backslash:
case Char.SingleQuote:
case Char.DoubleQuote:
localValue = String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
break;
case Char.a:
localValue = '\u0007';
this._cs.moveNext();
break;
case Char.b:
localValue = '\b';
this._cs.moveNext();
break;
case Char.f:
localValue = '\f';
this._cs.moveNext();
break;
case Char.n:
localValue = '\n';
this._cs.moveNext();
break;
case Char.r:
localValue = '\r';
this._cs.moveNext();
break;
case Char.t:
localValue = '\t';
this._cs.moveNext();
break;
case Char.v:
localValue = '\v';
this._cs.moveNext();
break;
case Char.x:
localValue = scanHexEscape(2);
break;
case Char.N: {
let foundIllegalChar = false;
let charCount = 1;
if (this._cs.lookAhead(charCount) !== Char.OpenBrace) {
foundIllegalChar = true;
} else {
charCount++;
while (true) {
const lookaheadChar = this._cs.lookAhead(charCount);
if (lookaheadChar === Char.CloseBrace) {
break;
} else if (!this._isAlphaNumericChar(lookaheadChar)) {
foundIllegalChar = true;
break;
} else {
charCount++;
}
}
}
if (foundIllegalChar) {
addInvalidEscapeOffset();
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
} else {
// We don't have the Unicode name database handy, so
// assume that the name is valid and use a '-' as a
// replacement character.
localValue = '-';
this._cs.advance(1 + charCount);
}
break;
}
case Char.u:
localValue = scanHexEscape(4);
break;
case Char.U:
localValue = scanHexEscape(8);
break;
default:
if (this._isOctalCharCode(this._cs.currentChar)) {
let octalCode = this._cs.currentChar - Char._0;
this._cs.moveNext();
if (this._isOctalCharCode(this._cs.currentChar)) {
octalCode = octalCode * 8 + this._cs.currentChar - Char._0;
this._cs.moveNext();
if (this._isOctalCharCode(this._cs.currentChar)) {
octalCode = octalCode * 8 + this._cs.currentChar - Char._0;
this._cs.moveNext();
}
}
localValue = String.fromCharCode(octalCode);
} else {
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
addInvalidEscapeOffset();
this._cs.moveNext();
}
break;
}
}
escapedValue += String.fromCharCode(this._cs.getCurrentChar());
this._cs.moveNext();
}
unescapedValue += localValue;
} else if (this._cs.currentChar === Char.LineFeed || this._cs.currentChar === Char.CarriageReturn) {
if (!isTriplicate) {
// Unterminated single-line string
flags |= StringTokenFlags.Unterminated;
return { value: unescapedValue, flags, invalidEscapeOffsets };
return { escapedValue, flags };
}
// Skip over the escaped new line (either one or two characters).
// Skip over the new line (either one or two characters).
if (this._cs.currentChar === Char.CarriageReturn && this._cs.nextChar === Char.LineFeed) {
unescapedValue += String.fromCharCode(this._cs.currentChar);
escapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
escapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
this._addLineRange();
} else if (!isTriplicate && this._cs.currentChar === quoteChar) {
@ -1028,69 +872,12 @@ export class Tokenizer {
this._cs.advance(3);
break;
} else {
if (isBytes && this._cs.currentChar >= 128) {
flags |= StringTokenFlags.NonAsciiInBytes;
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
escapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
}
}
return { value: unescapedValue, flags, invalidEscapeOffsets };
}
private _isAlphaNumericChar(charCode: number): boolean {
if (charCode >= Char._0 && charCode <= Char._9) {
return true;
}
if (charCode >= Char.a && charCode <= Char.z) {
return true;
}
if (charCode >= Char.A && charCode <= Char.A) {
return true;
}
return false;
}
private _isOctalCharCode(charCode: number): boolean {
return charCode >= Char._0 && charCode <= Char._7;
}
private _isHexCharCode(charCode: number): boolean {
if (charCode >= Char._0 && charCode <= Char._9) {
return true;
}
if (charCode >= Char.a && charCode <= Char.f) {
return true;
}
if (charCode >= Char.A && charCode <= Char.F) {
return true;
}
return false;
}
private _getHexDigitValue(charCode: number): number {
if (charCode >= Char._0 && charCode <= Char._9) {
return charCode - Char._0;
}
if (charCode >= Char.a && charCode <= Char.f) {
return charCode - Char.a + 10;
}
if (charCode >= Char.A && charCode <= Char.F) {
return charCode - Char.A + 10;
}
return 0;
return { escapedValue, flags };
}
private _skipFloatingPointCandidate(): boolean {

View File

@ -155,9 +155,7 @@ export enum StringTokenFlags {
Format = 0x40,
// Error conditions
Unterminated = 0x1000,
NonAsciiInBytes = 0x2000,
UnrecognizedEscape = 0x4000
Unterminated = 0x1000
}
export class Comment extends TextRange {
@ -232,22 +230,26 @@ export class KeywordToken extends Token {
export class StringToken extends Token {
readonly flags: StringTokenFlags;
readonly value: string;
readonly invalidEscapeOffsets: number[] | undefined;
// Use StringTokenUtils to convert escaped value to unescaped value.
readonly escapedValue: string;
// Number of characters in token that appear before
// the quote marks (e.g. "r" or "UR").
readonly prefixLength: number;
constructor(start: number, length: number, flags: StringTokenFlags, value: string,
prefixLength: number, invalidEscapeOffsets: number[] | undefined,
comments: Comment[] | undefined) {
// Number of characters in token that make up the quote
// (either 1 or 3).
readonly quoteMarkLength: number;
constructor(start: number, length: number, flags: StringTokenFlags, escapedValue: string,
prefixLength: number, comments: Comment[] | undefined) {
super(TokenType.String, start, length, comments);
this.flags = flags;
this.value = value;
this.escapedValue = escapedValue;
this.prefixLength = prefixLength;
this.invalidEscapeOffsets = invalidEscapeOffsets;
this.quoteMarkLength = (flags & StringTokenFlags.Triplicate) ? 3 : 1;
}
}

View File

@ -14,6 +14,7 @@ import * as assert from 'assert';
import { TestUtils } from './testUtils';
import { StringTokenUtils } from '../parser/stringTokenUtils';
import { Tokenizer } from '../parser/tokenizer';
import { DedentToken, IdentifierToken, IndentToken, NewLineToken, NewLineType,
NumberToken, OperatorToken, OperatorType, StringToken,
@ -224,7 +225,7 @@ test('Strings: simple', () => {
const stringToken = results.tokens.getItemAt(0) as StringToken;
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.length, 3);
assert.equal(stringToken.value, 'a');
assert.equal(stringToken.escapedValue, 'a');
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote);
});
@ -304,7 +305,7 @@ test('Strings: single quote escape', () => {
assert.equal(stringToken.flags, StringTokenFlags.SingleQuote);
assert.equal(stringToken.length, 12);
assert.equal(stringToken.prefixLength, 0);
assert.equal(stringToken.value, '\'quoted\'');
assert.equal(stringToken.escapedValue, '\\\'quoted\\\'');
});
test('Strings: double quote escape', () => {
@ -316,7 +317,7 @@ test('Strings: double quote escape', () => {
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken.length, 12);
assert.equal(stringToken.value, '"quoted"');
assert.equal(stringToken.escapedValue, '\\"quoted\\"');
});
test('Strings: triplicate double quote escape', () => {
@ -329,10 +330,10 @@ test('Strings: triplicate double quote escape', () => {
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Triplicate);
assert.equal(stringToken.length, 16);
assert.equal(stringToken.value, '"quoted"');
assert.equal(stringToken.escapedValue, '\\"quoted\\"');
});
test('Strings: single quoted f-string ', () => {
test('Strings: single quoted f-string', () => {
const t = new Tokenizer();
// tslint:disable-next-line:quotemark
const results = t.tokenize("a+f'quoted'");
@ -344,10 +345,10 @@ test('Strings: single quoted f-string ', () => {
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.flags, StringTokenFlags.SingleQuote | StringTokenFlags.Format);
assert.equal(stringToken.length, 9);
assert.equal(stringToken.value, 'quoted');
assert.equal(stringToken.escapedValue, 'quoted');
});
test('Strings: double quoted f-string ', () => {
test('Strings: double quoted f-string', () => {
const t = new Tokenizer();
const results = t.tokenize('x(1,f"quoted")');
assert.equal(results.tokens.count, 6 + _implicitTokenCount);
@ -361,10 +362,10 @@ test('Strings: double quoted f-string ', () => {
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote | StringTokenFlags.Format);
assert.equal(stringToken.length, 9);
assert.equal(stringToken.value, 'quoted');
assert.equal(stringToken.escapedValue, 'quoted');
});
test('Strings: single quoted multiline f-string ', () => {
test('Strings: single quoted multiline f-string', () => {
const t = new Tokenizer();
// tslint:disable-next-line:quotemark
const results = t.tokenize("f'''quoted'''");
@ -375,10 +376,10 @@ test('Strings: single quoted multiline f-string ', () => {
assert.equal(stringToken.flags,
StringTokenFlags.SingleQuote | StringTokenFlags.Triplicate | StringTokenFlags.Format);
assert.equal(stringToken.length, 13);
assert.equal(stringToken.value, 'quoted');
assert.equal(stringToken.escapedValue, 'quoted');
});
test('Strings: double quoted multiline f-string ', () => {
test('Strings: double quoted multiline f-string', () => {
const t = new Tokenizer();
const results = t.tokenize('f"""quoted """');
assert.equal(results.tokens.count, 1 + _implicitTokenCount);
@ -388,10 +389,10 @@ test('Strings: double quoted multiline f-string ', () => {
assert.equal(stringToken.flags,
StringTokenFlags.DoubleQuote | StringTokenFlags.Triplicate | StringTokenFlags.Format);
assert.equal(stringToken.length, 14);
assert.equal(stringToken.value, 'quoted ');
assert.equal(stringToken.escapedValue, 'quoted ');
});
test('Strings: escape at the end of single quoted string ', () => {
test('Strings: escape at the end of single quoted string', () => {
const t = new Tokenizer();
// tslint:disable-next-line:quotemark
const results = t.tokenize("'quoted\\'\nx");
@ -402,13 +403,13 @@ test('Strings: escape at the end of single quoted string ', () => {
assert.equal(stringToken.flags,
StringTokenFlags.SingleQuote | StringTokenFlags.Unterminated);
assert.equal(stringToken.length, 9);
assert.equal(stringToken.value, 'quoted\'');
assert.equal(stringToken.escapedValue, 'quoted\\\'');
assert.equal(results.tokens.getItemAt(1).type, TokenType.NewLine);
assert.equal(results.tokens.getItemAt(2).type, TokenType.Identifier);
});
test('Strings: escape at the end of double quoted string ', () => {
test('Strings: escape at the end of double quoted string', () => {
const t = new Tokenizer();
const results = t.tokenize('"quoted\\"\nx');
assert.equal(results.tokens.count, 3 + _implicitTokenCount);
@ -418,7 +419,7 @@ test('Strings: escape at the end of double quoted string ', () => {
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Unterminated);
assert.equal(stringToken.length, 9);
assert.equal(stringToken.value, 'quoted"');
assert.equal(stringToken.escapedValue, 'quoted\\"');
assert.equal(results.tokens.getItemAt(1).type, TokenType.NewLine);
assert.equal(results.tokens.getItemAt(2).type, TokenType.Identifier);
@ -434,7 +435,7 @@ test('Strings: b/u/r-string', () => {
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Bytes);
assert.equal(stringToken0.length, 4);
assert.equal(stringToken0.value, 'b');
assert.equal(stringToken0.escapedValue, 'b');
assert.equal(stringToken0.prefixLength, 1);
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
@ -442,7 +443,7 @@ test('Strings: b/u/r-string', () => {
assert.equal(stringToken1.flags,
StringTokenFlags.SingleQuote | StringTokenFlags.Unicode);
assert.equal(stringToken1.length, 4);
assert.equal(stringToken1.value, 'u');
assert.equal(stringToken1.escapedValue, 'u');
assert.equal(stringToken1.prefixLength, 1);
const stringToken2 = results.tokens.getItemAt(2) as StringToken;
@ -450,7 +451,7 @@ test('Strings: b/u/r-string', () => {
assert.equal(stringToken2.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Bytes | StringTokenFlags.Raw);
assert.equal(stringToken2.length, 6);
assert.equal(stringToken2.value, 'br');
assert.equal(stringToken2.escapedValue, 'br');
assert.equal(stringToken2.prefixLength, 2);
const stringToken3 = results.tokens.getItemAt(3) as StringToken;
@ -458,7 +459,7 @@ test('Strings: b/u/r-string', () => {
assert.equal(stringToken3.flags, StringTokenFlags.SingleQuote |
StringTokenFlags.Unicode | StringTokenFlags.Raw);
assert.equal(stringToken3.length, 6);
assert.equal(stringToken3.value, 'ur');
assert.equal(stringToken3.escapedValue, 'ur');
assert.equal(stringToken3.prefixLength, 2);
});
@ -468,16 +469,21 @@ test('Strings: bytes string with non-ASCII', () => {
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
const unescapedValue0 = StringTokenUtils.getUnescapedString(
stringToken0);
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInBytes);
StringTokenFlags.Bytes);
assert.equal(unescapedValue0.nonAsciiInBytes, true);
assert.equal(stringToken0.length, 7);
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
const unescapedValue1 = StringTokenUtils.getUnescapedString(
stringToken1);
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.SingleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInBytes |
StringTokenFlags.Triplicate);
StringTokenFlags.Bytes | StringTokenFlags.Triplicate);
assert.equal(unescapedValue1.nonAsciiInBytes, true);
assert.equal(stringToken1.length, 11);
});
@ -487,18 +493,22 @@ test('Strings: raw strings with escapes', () => {
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
const unescapedValue0 = StringTokenUtils.getUnescapedString(stringToken0);
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Raw);
assert.equal(stringToken0.length, 5);
assert.equal(stringToken0.value, '\\"');
assert.equal(stringToken0.escapedValue, '\\"');
assert.equal(unescapedValue0.value, '\\"');
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
const unescapedValue1 = StringTokenUtils.getUnescapedString(stringToken1);
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Raw);
assert.equal(stringToken1.length, 10);
assert.equal(stringToken1.value, '\\\r\n\\\n\\a');
assert.equal(stringToken1.escapedValue, '\\\r\n\\\n\\a');
assert.equal(unescapedValue1.value, '\\\r\n\\\n\\a');
});
test('Strings: escape at the end of double quoted string', () => {
@ -511,7 +521,7 @@ test('Strings: escape at the end of double quoted string', () => {
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Unterminated);
assert.equal(stringToken.length, 9);
assert.equal(stringToken.value, 'quoted"');
assert.equal(stringToken.escapedValue, 'quoted\\"');
assert.equal(results.tokens.getItemAt(1).type, TokenType.NewLine);
assert.equal(results.tokens.getItemAt(2).type, TokenType.Identifier);
@ -523,10 +533,11 @@ test('Strings: special escape characters', () => {
assert.equal(results.tokens.count, 1 + _implicitTokenCount);
const stringToken = results.tokens.getItemAt(0) as StringToken;
const unescapedValue = StringTokenUtils.getUnescapedString(stringToken);
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken.length, 18);
assert.equal(stringToken.value, '\r\n\u0007\v\t\b\f\\');
assert.equal(unescapedValue.value, '\r\n\u0007\v\t\b\f\\');
});
test('Strings: invalid escape characters', () => {
@ -535,14 +546,14 @@ test('Strings: invalid escape characters', () => {
assert.equal(results.tokens.count, 1 + _implicitTokenCount);
const stringToken = results.tokens.getItemAt(0) as StringToken;
const unescapedValue = StringTokenUtils.getUnescapedString(stringToken);
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.UnrecognizedEscape);
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken.length, 8);
assert.equal(stringToken.value, '\\d \\ ');
assert.equal(stringToken.invalidEscapeOffsets!.length, 2);
assert.equal(stringToken.invalidEscapeOffsets![0], 2);
assert.equal(stringToken.invalidEscapeOffsets![1], 6);
assert.equal(stringToken.escapedValue, '\\d \\ ');
assert.equal(unescapedValue.invalidEscapeOffsets.length, 2);
assert.equal(unescapedValue.invalidEscapeOffsets[0], 1);
assert.equal(unescapedValue.invalidEscapeOffsets[1], 5);
});
test('Strings: good hex escapes', () => {
@ -551,22 +562,28 @@ test('Strings: good hex escapes', () => {
assert.equal(results.tokens.count, 3 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
const unescapedValue0 = StringTokenUtils.getUnescapedString(stringToken0);
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken0.length, 6);
assert.equal(stringToken0.value, 'M');
assert.equal(stringToken0.escapedValue, '\\x4d');
assert.equal(unescapedValue0.value, 'M');
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
const unescapedValue1 = StringTokenUtils.getUnescapedString(stringToken1);
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken1.length, 8);
assert.equal(stringToken1.value, 'k');
assert.equal(stringToken1.escapedValue, '\\u006b');
assert.equal(unescapedValue1.value, 'k');
const stringToken2 = results.tokens.getItemAt(2) as StringToken;
const unescapedValue2 = StringTokenUtils.getUnescapedString(stringToken2);
assert.equal(stringToken2.type, TokenType.String);
assert.equal(stringToken2.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken2.length, 12);
assert.equal(stringToken2.value, 'o');
assert.equal(stringToken2.escapedValue, '\\U0000006F');
assert.equal(unescapedValue2.value, 'o');
});
test('Strings: bad hex escapes', () => {
@ -575,25 +592,31 @@ test('Strings: bad hex escapes', () => {
assert.equal(results.tokens.count, 3 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
const unescapedValue0 = StringTokenUtils.getUnescapedString(
stringToken0);
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.UnrecognizedEscape);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote);
assert.equal(unescapedValue0.invalidEscapeOffsets.length, 1);
assert.equal(stringToken0.length, 6);
assert.equal(stringToken0.value, '\\x4g');
assert.equal(unescapedValue0.value, '\\x4g');
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
const unescapedValue1 = StringTokenUtils.getUnescapedString(
stringToken1);
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.UnrecognizedEscape);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote);
assert.equal(unescapedValue1.invalidEscapeOffsets.length, 1);
assert.equal(stringToken1.length, 7);
assert.equal(stringToken1.value, '\\u006');
assert.equal(unescapedValue1.value, '\\u006');
const stringToken2 = results.tokens.getItemAt(2) as StringToken;
const unescapedValue2 = StringTokenUtils.getUnescapedString(
stringToken2);
assert.equal(stringToken2.type, TokenType.String);
assert.equal(stringToken2.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.UnrecognizedEscape);
assert.equal(stringToken2.flags, StringTokenFlags.DoubleQuote);
assert.equal(unescapedValue2.invalidEscapeOffsets.length, 1);
assert.equal(stringToken2.length, 12);
assert.equal(stringToken2.value, '\\U0000006m');
assert.equal(unescapedValue2.value, '\\U0000006m');
});
test('Strings: good name escapes', () => {
@ -602,16 +625,22 @@ test('Strings: good name escapes', () => {
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
const unescapedValue0 = StringTokenUtils.getUnescapedString(
stringToken0);
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken0.length, 11);
assert.equal(stringToken0.value, '-');
assert.equal(stringToken0.escapedValue, '\\N{caret}');
assert.equal(unescapedValue0.value, '-');
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
const unescapedValue1 = StringTokenUtils.getUnescapedString(
stringToken1);
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote);
assert.equal(stringToken1.length, 10);
assert.equal(stringToken1.value, 'a-a');
assert.equal(stringToken1.escapedValue, 'a\\N{A9}a');
assert.equal(unescapedValue1.value, 'a-a');
});
test('Strings: bad name escapes', () => {
@ -620,18 +649,22 @@ test('Strings: bad name escapes', () => {
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
const unescapedValue0 = StringTokenUtils.getUnescapedString(stringToken0);
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.UnrecognizedEscape);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote);
// Assert the number of invalid escapes; comparing the array itself to 1
// only passes through loose-equality coercion of a one-element array.
assert.equal(unescapedValue0.invalidEscapeOffsets.length, 1);
assert.equal(stringToken0.length, 10);
assert.equal(stringToken0.value, '\\N{caret');
assert.equal(stringToken0.escapedValue, '\\N{caret');
assert.equal(unescapedValue0.value, '\\N{caret');
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
const unescapedValue1 = StringTokenUtils.getUnescapedString(stringToken1);
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.UnrecognizedEscape);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote);
// Assert the number of invalid escapes; comparing the array itself to 1
// only passes through loose-equality coercion of a one-element array.
assert.equal(unescapedValue1.invalidEscapeOffsets.length, 1);
assert.equal(stringToken1.length, 9);
assert.equal(stringToken1.value, '\\N{ A9}');
assert.equal(stringToken1.escapedValue, '\\N{ A9}');
assert.equal(unescapedValue1.value, '\\N{ A9}');
});
test('Comments', () => {