More work on the string tokenizer.

This commit is contained in:
Eric Traut 2019-04-21 10:33:51 -07:00
parent 57243bbdfe
commit 7e58f045f8
5 changed files with 176 additions and 95 deletions

View File

@ -43,6 +43,15 @@ export class CharacterStream {
return this._currentChar;
}
// We also expose a (non-property) method that is
// the equivalent of currentChar above. This allows
// us to work around assumptions in the TypeScript
// compiler that method calls (e.g. moveNext()) don't
// modify properties.
// Returns the same value as the currentChar property.
getCurrentChar(): number {
    return this._currentChar;
}
get nextChar(): number {
return this.position + 1 < this._text.length ? this._text.charCodeAt(this.position + 1) : 0;
}

View File

@ -2235,10 +2235,15 @@ export class Parser {
if (stringToken.flags & StringTokenFlags.Unterminated) {
this._addError('String literal is unterminated', stringToken);
}
if (stringToken.flags & StringTokenFlags.NonAsciiInByte) {
if (stringToken.flags & StringTokenFlags.NonAsciiInBytes) {
this._addError('Non-ASCII character not allowed in bytes string literal', stringToken);
}
if (stringToken.flags & StringTokenFlags.UnrecognizedEscape) {
this._addWarning('Unsupported escape sequence in string literal', stringToken);
}
stringTokenList.push(stringToken);
}
@ -2463,4 +2468,10 @@ export class Parser {
this._diagSink.addError(message,
convertOffsetsToRange(range.start, range.end, this._tokenizerOutput!.lines));
}
// Records a warning diagnostic for the specified text range.
// Mirrors _addError above, but reports at warning severity.
private _addWarning(message: string, range: TextRange) {
    assert(range !== undefined);

    // Convert character offsets into a line/column range before
    // handing the diagnostic to the sink.
    const diagRange = convertOffsetsToRange(range.start, range.end,
        this._tokenizerOutput!.lines);
    this._diagSink.addWarning(message, diagRange);
}
}

View File

@ -757,23 +757,22 @@ export class Tokenizer {
if (flags & StringTokenFlags.Triplicate) {
this._cs.advance(3);
[value, flags] = this._skipToTripleEndQuote(flags);
} else {
this._cs.moveNext();
[value, flags] = this._skipToSingleEndQuote(flags);
}
[value, flags] = this._skipToEndOfStringLiteral(flags);
let end = this._cs.position;
this._tokens.push(new StringToken(start, end - start, flags, value));
}
private _skipToSingleEndQuote(flags: StringTokenFlags): [string, StringTokenFlags] {
const quote = flags & StringTokenFlags.SingleQuote ?
Char.SingleQuote : Char.DoubleQuote;
private _skipToEndOfStringLiteral(flags: StringTokenFlags): [string, StringTokenFlags] {
const quoteChar = (flags & StringTokenFlags.SingleQuote) ? Char.SingleQuote : Char.DoubleQuote;
const isTriplicate = (flags & StringTokenFlags.Triplicate) !== 0;
const isRaw = (flags & StringTokenFlags.Raw) !== 0;
const isBytes = (flags & StringTokenFlags.Bytes) !== 0;
let isEscaped = false;
let unescapedValue = '';
while (true) {
@ -783,8 +782,119 @@ export class Tokenizer {
return [unescapedValue, flags];
}
if (this._cs.currentChar === Char.LineFeed || this._cs.currentChar === Char.CarriageReturn) {
if (!isEscaped) {
if (this._cs.currentChar === Char.Backslash) {
// Move past the escape (backslash) character.
this._cs.moveNext();
let localValue = '';
if (this._cs.getCurrentChar() === Char.CarriageReturn || this._cs.getCurrentChar() === Char.LineFeed) {
if (this._cs.getCurrentChar() === Char.CarriageReturn && this._cs.nextChar === Char.LineFeed) {
if (isRaw) {
localValue += String.fromCharCode(this._cs.currentChar);
}
this._cs.moveNext();
}
if (isRaw) {
localValue = '\\' + localValue + String.fromCharCode(this._cs.currentChar);
}
this._cs.moveNext();
this._addLineRange();
} else {
if (isRaw) {
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
} else {
switch (this._cs.getCurrentChar()) {
case Char.Backslash:
case Char.SingleQuote:
case Char.DoubleQuote:
localValue = String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
break;
case Char.a:
localValue = '\u0007';
this._cs.moveNext();
break;
case Char.b:
localValue = '\b';
this._cs.moveNext();
break;
case Char.f:
localValue = '\f';
this._cs.moveNext();
break;
case Char.n:
localValue = '\n';
this._cs.moveNext();
break;
case Char.r:
localValue = '\r';
this._cs.moveNext();
break;
case Char.t:
localValue = '\t';
this._cs.moveNext();
break;
case Char.v:
localValue = '\v';
this._cs.moveNext();
break;
case Char._0:
case Char._1:
case Char._2:
case Char._3:
case Char._4:
case Char._5:
case Char._6:
case Char._7:
// TODO - need to handle octal
localValue = '0';
this._cs.moveNext();
break;
case Char.x:
// TODO - need to handle hex
localValue = '0';
this._cs.moveNext();
break;
case Char.N:
// TODO - need to handle name
localValue = '0';
this._cs.moveNext();
break;
case Char.u:
// TODO - need to handle unicode
localValue = '0';
break;
case Char.U:
// TODO - need to handle unicode
localValue = '0';
this._cs.moveNext();
break;
default:
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
flags |= StringTokenFlags.UnrecognizedEscape;
this._cs.moveNext();
break;
}
}
}
unescapedValue += localValue;
} else if (this._cs.currentChar === Char.LineFeed || this._cs.currentChar === Char.CarriageReturn) {
if (!isTriplicate) {
// Unterminated single-line string
flags |= StringTokenFlags.Unterminated;
return [unescapedValue, flags];
@ -792,93 +902,32 @@ export class Tokenizer {
// Skip over the escaped new line (either one or two characters).
if (this._cs.currentChar === Char.LineFeed && this._cs.nextChar === Char.CarriageReturn) {
unescapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
isEscaped = false;
this._addLineRange();
} else if (!isTriplicate && this._cs.currentChar === quoteChar) {
this._cs.moveNext();
break;
} else if (isTriplicate && this._cs.currentChar === quoteChar &&
this._cs.nextChar === quoteChar && this._cs.lookAhead(2) === quoteChar) {
this._cs.advance(3);
break;
} else {
if (isEscaped) {
if (isBytes && this._cs.currentChar >= 128) {
flags |= StringTokenFlags.NonAsciiInByte;
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
// TODO - need to properly handle escapes \ooo, \xhh, \N{name}, \uxxxx and \Uxxxxxxxx
isEscaped = false;
} else if (this._cs.currentChar === Char.Backslash) {
if (isRaw) {
unescapedValue += String.fromCharCode(this._cs.currentChar);
}
isEscaped = true;
} else if (this._cs.currentChar === quote) {
break;
} else {
if (isBytes && this._cs.currentChar >= 128) {
flags |= StringTokenFlags.NonAsciiInByte;
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
isEscaped = false;
if (isBytes && this._cs.currentChar >= 128) {
flags |= StringTokenFlags.NonAsciiInBytes;
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
}
}
this._cs.moveNext();
return [unescapedValue, flags];
}
// Scans from just past the opening triple quote to the matching
// closing triple quote, accumulating the string's value and any
// error flags encountered along the way. Returns the accumulated
// value together with the (possibly updated) flags.
private _skipToTripleEndQuote(flags: StringTokenFlags): [string, StringTokenFlags] {
    // Determine which character (single or double quote) terminates
    // this literal.
    const quote = flags & StringTokenFlags.SingleQuote ?
        Char.SingleQuote : Char.DoubleQuote;
    const isBytes = (flags & StringTokenFlags.Bytes) !== 0;
    const isRaw = (flags & StringTokenFlags.Raw) !== 0;
    let unescapedValue = '';

    // Consume characters until we see three consecutive quote
    // characters or run out of input.
    while (!this ._cs.isEndOfStream() && (this._cs.currentChar !== quote ||
            this._cs.nextChar !== quote || this._cs.lookAhead(2) !== quote)) {

        if (this._cs.currentChar === Char.CarriageReturn) {
            // Consume the CR and, for a CRLF pair, the following LF,
            // then record the line break for line-range bookkeeping.
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            if (this._cs.nextChar === Char.LineFeed) {
                this._cs.moveNext();
                unescapedValue += String.fromCharCode(this._cs.currentChar);
            }
            this._cs.moveNext();
            this._addLineRange();
        } else if (this._cs.currentChar === Char.LineFeed) {
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            this._cs.moveNext();
            this._addLineRange();
        } else if (this._cs.currentChar === Char.Backslash) {
            // Raw strings retain the backslash in the value.
            if (isRaw) {
                unescapedValue += String.fromCharCode(this._cs.currentChar);
            }

            // This is an escape. Move past the next character.
            this._cs.moveNext();

            // Flag non-ASCII characters appearing in a bytes literal.
            if (isBytes && this._cs.currentChar >= 128) {
                flags |= StringTokenFlags.NonAsciiInByte;
            }

            // TODO - need to handle special escapes
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            this._cs.moveNext();
        } else {
            // Flag non-ASCII characters appearing in a bytes literal.
            if (isBytes && this._cs.currentChar >= 128) {
                flags |= StringTokenFlags.NonAsciiInByte;
            }
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            this._cs.moveNext();
        }
    }

    // Skip over the closing triple quote. NOTE(review): this advances
    // even when the loop exited due to end of stream — presumably the
    // caller detects the unterminated case elsewhere; verify.
    this._cs.advance(3);
    return [unescapedValue, flags];
}

View File

@ -155,8 +155,8 @@ export enum StringTokenFlags {
// Error conditions
Unterminated = 0x1000,
NonAsciiInByte = 0x1001,
UnrecognizedEscape = 0x1002
NonAsciiInBytes = 0x2000,
UnrecognizedEscape = 0x4000
}
export class Token extends TextRange implements Token {

View File

@ -214,6 +214,18 @@ test('IndentDedentParen', () => {
assert.equal(results.tokens.getItemAt(9).type, TokenType.EndOfStream);
});
test('Strings: simple', () => {
const t = new Tokenizer();
const results = t.tokenize(' "a"');
assert.equal(results.tokens.count, 1 + _implicitTokenCount);
const stringToken = results.tokens.getItemAt(0) as StringToken;
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.length, 3);
assert.equal(stringToken.value, 'a');
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote);
});
test('Strings: unclosed', () => {
const t = new Tokenizer();
const results = t.tokenize(' "string" """line1\n#line2"""\t\'un#closed');
@ -229,7 +241,7 @@ test('Strings: unclosed', () => {
test('Strings: escaped across multiple lines', () => {
const t = new Tokenizer();
const results = t.tokenize(' "a\\\nb" \'c\\\n\rb\'');
const results = t.tokenize(' "a\\\nb" \'c\\\r\nb\'');
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const ranges = [[1, 6], [8, 7]];
@ -451,20 +463,20 @@ test('Strings: bytes string with non-ASCII', () => {
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInByte);
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInBytes);
assert.equal(stringToken0.length, 7);
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.SingleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInByte |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInBytes |
StringTokenFlags.Triplicate);
assert.equal(stringToken1.length, 11);
});
test('Strings: raw strings with escapes', () => {
const t = new Tokenizer();
const results = t.tokenize('R"\\""');
const results = t.tokenize('R"\\"" r"\\\r\n\\\n\\a"');
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
@ -476,10 +488,10 @@ test('Strings: raw strings with escapes', () => {
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.SingleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInByte |
StringTokenFlags.Triplicate);
assert.equal(stringToken1.length, 11);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Raw);
assert.equal(stringToken1.length, 10);
assert.equal(stringToken1.value, '\\\r\n\\\n\\a');
});
test('Strings: escape at the end of double quoted string ', () => {