More work on the string tokenizer.

This commit is contained in:
Eric Traut 2019-04-21 10:33:51 -07:00
parent 57243bbdfe
commit 7e58f045f8
5 changed files with 176 additions and 95 deletions

View File

@ -43,6 +43,15 @@ export class CharacterStream {
return this._currentChar;
}
// We also expose a (non-property) method that is
// the equivalent of currentChar above. This allows
// us to work around assumptions in the TypeScript
// compiler that method calls (e.g. moveNext()) don't
// modify properties.
// Returns the same value as the currentChar property.
getCurrentChar(): number {
    return this._currentChar;
}
get nextChar(): number {
return this.position + 1 < this._text.length ? this._text.charCodeAt(this.position + 1) : 0;
}

View File

@ -2235,10 +2235,15 @@ export class Parser {
if (stringToken.flags & StringTokenFlags.Unterminated) {
this._addError('String literal is unterminated', stringToken);
}
if (stringToken.flags & StringTokenFlags.NonAsciiInByte) {
if (stringToken.flags & StringTokenFlags.NonAsciiInBytes) {
this._addError('Non-ASCII character not allowed in bytes string literal', stringToken);
}
if (stringToken.flags & StringTokenFlags.UnrecognizedEscape) {
this._addWarning('Unsupported escape sequence in string literal', stringToken);
}
stringTokenList.push(stringToken);
}
@ -2463,4 +2468,10 @@ export class Parser {
this._diagSink.addError(message,
convertOffsetsToRange(range.start, range.end, this._tokenizerOutput!.lines));
}
// Records a warning diagnostic for the specified text range.
// Mirrors _addError above, but reports at warning severity.
private _addWarning(message: string, range: TextRange) {
    assert(range !== undefined);

    // Convert character offsets into a line/column range before
    // handing the diagnostic to the sink.
    const diagRange = convertOffsetsToRange(range.start, range.end,
        this._tokenizerOutput!.lines);
    this._diagSink.addWarning(message, diagRange);
}
}

View File

@ -757,23 +757,22 @@ export class Tokenizer {
if (flags & StringTokenFlags.Triplicate) {
this._cs.advance(3);
[value, flags] = this._skipToTripleEndQuote(flags);
} else {
this._cs.moveNext();
[value, flags] = this._skipToSingleEndQuote(flags);
}
[value, flags] = this._skipToEndOfStringLiteral(flags);
let end = this._cs.position;
this._tokens.push(new StringToken(start, end - start, flags, value));
}
private _skipToSingleEndQuote(flags: StringTokenFlags): [string, StringTokenFlags] {
const quote = flags & StringTokenFlags.SingleQuote ?
Char.SingleQuote : Char.DoubleQuote;
private _skipToEndOfStringLiteral(flags: StringTokenFlags): [string, StringTokenFlags] {
const quoteChar = (flags & StringTokenFlags.SingleQuote) ? Char.SingleQuote : Char.DoubleQuote;
const isTriplicate = (flags & StringTokenFlags.Triplicate) !== 0;
const isRaw = (flags & StringTokenFlags.Raw) !== 0;
const isBytes = (flags & StringTokenFlags.Bytes) !== 0;
let isEscaped = false;
let unescapedValue = '';
while (true) {
@ -783,8 +782,119 @@ export class Tokenizer {
return [unescapedValue, flags];
}
if (this._cs.currentChar === Char.LineFeed || this._cs.currentChar === Char.CarriageReturn) {
if (!isEscaped) {
if (this._cs.currentChar === Char.Backslash) {
// Move past the escape (backslash) character.
this._cs.moveNext();
let localValue = '';
if (this._cs.getCurrentChar() === Char.CarriageReturn || this._cs.getCurrentChar() === Char.LineFeed) {
if (this._cs.getCurrentChar() === Char.CarriageReturn && this._cs.nextChar === Char.LineFeed) {
if (isRaw) {
localValue += String.fromCharCode(this._cs.currentChar);
}
this._cs.moveNext();
}
if (isRaw) {
localValue = '\\' + localValue + String.fromCharCode(this._cs.currentChar);
}
this._cs.moveNext();
this._addLineRange();
} else {
if (isRaw) {
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
} else {
switch (this._cs.getCurrentChar()) {
case Char.Backslash:
case Char.SingleQuote:
case Char.DoubleQuote:
localValue = String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
break;
case Char.a:
localValue = '\u0007';
this._cs.moveNext();
break;
case Char.b:
localValue = '\b';
this._cs.moveNext();
break;
case Char.f:
localValue = '\f';
this._cs.moveNext();
break;
case Char.n:
localValue = '\n';
this._cs.moveNext();
break;
case Char.r:
localValue = '\r';
this._cs.moveNext();
break;
case Char.t:
localValue = '\t';
this._cs.moveNext();
break;
case Char.v:
localValue = '\v';
this._cs.moveNext();
break;
case Char._0:
case Char._1:
case Char._2:
case Char._3:
case Char._4:
case Char._5:
case Char._6:
case Char._7:
// TODO - need to handle octal
localValue = '0';
this._cs.moveNext();
break;
case Char.x:
// TODO - need to handle hex
localValue = '0';
this._cs.moveNext();
break;
case Char.N:
// TODO - need to handle name
localValue = '0';
this._cs.moveNext();
break;
case Char.u:
// TODO - need to handle unicode
localValue = '0';
break;
case Char.U:
// TODO - need to handle unicode
localValue = '0';
this._cs.moveNext();
break;
default:
localValue = '\\' + String.fromCharCode(this._cs.currentChar);
flags |= StringTokenFlags.UnrecognizedEscape;
this._cs.moveNext();
break;
}
}
}
unescapedValue += localValue;
} else if (this._cs.currentChar === Char.LineFeed || this._cs.currentChar === Char.CarriageReturn) {
if (!isTriplicate) {
// Unterminated single-line string
flags |= StringTokenFlags.Unterminated;
return [unescapedValue, flags];
@ -792,93 +902,32 @@ export class Tokenizer {
// Skip over the escaped new line (either one or two characters).
if (this._cs.currentChar === Char.LineFeed && this._cs.nextChar === Char.CarriageReturn) {
unescapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
isEscaped = false;
this._addLineRange();
} else if (!isTriplicate && this._cs.currentChar === quoteChar) {
this._cs.moveNext();
break;
} else if (isTriplicate && this._cs.currentChar === quoteChar &&
this._cs.nextChar === quoteChar && this._cs.lookAhead(2) === quoteChar) {
this._cs.advance(3);
break;
} else {
if (isEscaped) {
if (isBytes && this._cs.currentChar >= 128) {
flags |= StringTokenFlags.NonAsciiInByte;
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
// TODO - need to properly handle escapes \ooo, \xhh, \N{name}, \uxxxx and \Uxxxxxxxx
isEscaped = false;
} else if (this._cs.currentChar === Char.Backslash) {
if (isRaw) {
unescapedValue += String.fromCharCode(this._cs.currentChar);
}
isEscaped = true;
} else if (this._cs.currentChar === quote) {
break;
} else {
if (isBytes && this._cs.currentChar >= 128) {
flags |= StringTokenFlags.NonAsciiInByte;
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
isEscaped = false;
if (isBytes && this._cs.currentChar >= 128) {
flags |= StringTokenFlags.NonAsciiInBytes;
}
unescapedValue += String.fromCharCode(this._cs.currentChar);
this._cs.moveNext();
}
}
this._cs.moveNext();
return [unescapedValue, flags];
}
// Scans from just past the opening triple quote to the matching
// closing triple quote, accumulating the string's value and any
// error flags encountered along the way. Returns the accumulated
// value together with the (possibly updated) flags.
private _skipToTripleEndQuote(flags: StringTokenFlags): [string, StringTokenFlags] {
    // Determine which character (single or double quote) terminates
    // this literal.
    const quote = flags & StringTokenFlags.SingleQuote ?
        Char.SingleQuote : Char.DoubleQuote;
    const isBytes = (flags & StringTokenFlags.Bytes) !== 0;
    const isRaw = (flags & StringTokenFlags.Raw) !== 0;
    let unescapedValue = '';

    // Consume characters until we see three consecutive quote
    // characters or run out of input.
    while (!this ._cs.isEndOfStream() && (this._cs.currentChar !== quote ||
            this._cs.nextChar !== quote || this._cs.lookAhead(2) !== quote)) {

        if (this._cs.currentChar === Char.CarriageReturn) {
            // Consume the CR and, for a CRLF pair, the following LF,
            // then record the line break for line-range bookkeeping.
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            if (this._cs.nextChar === Char.LineFeed) {
                this._cs.moveNext();
                unescapedValue += String.fromCharCode(this._cs.currentChar);
            }
            this._cs.moveNext();
            this._addLineRange();
        } else if (this._cs.currentChar === Char.LineFeed) {
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            this._cs.moveNext();
            this._addLineRange();
        } else if (this._cs.currentChar === Char.Backslash) {
            // Raw strings retain the backslash in the value.
            if (isRaw) {
                unescapedValue += String.fromCharCode(this._cs.currentChar);
            }

            // This is an escape. Move past the next character.
            this._cs.moveNext();

            // Flag non-ASCII characters appearing in a bytes literal.
            if (isBytes && this._cs.currentChar >= 128) {
                flags |= StringTokenFlags.NonAsciiInByte;
            }

            // TODO - need to handle special escapes
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            this._cs.moveNext();
        } else {
            // Flag non-ASCII characters appearing in a bytes literal.
            if (isBytes && this._cs.currentChar >= 128) {
                flags |= StringTokenFlags.NonAsciiInByte;
            }
            unescapedValue += String.fromCharCode(this._cs.currentChar);
            this._cs.moveNext();
        }
    }

    // Skip over the closing triple quote. NOTE(review): this advances
    // even when the loop exited due to end of stream — presumably the
    // caller detects the unterminated case elsewhere; verify.
    this._cs.advance(3);
    return [unescapedValue, flags];
}

View File

@ -155,8 +155,8 @@ export enum StringTokenFlags {
// Error conditions
Unterminated = 0x1000,
NonAsciiInByte = 0x1001,
UnrecognizedEscape = 0x1002
NonAsciiInBytes = 0x2000,
UnrecognizedEscape = 0x4000
}
export class Token extends TextRange implements Token {

View File

@ -214,6 +214,18 @@ test('IndentDedentParen', () => {
assert.equal(results.tokens.getItemAt(9).type, TokenType.EndOfStream);
});
test('Strings: simple', () => {
const t = new Tokenizer();
const results = t.tokenize(' "a"');
assert.equal(results.tokens.count, 1 + _implicitTokenCount);
const stringToken = results.tokens.getItemAt(0) as StringToken;
assert.equal(stringToken.type, TokenType.String);
assert.equal(stringToken.length, 3);
assert.equal(stringToken.value, 'a');
assert.equal(stringToken.flags, StringTokenFlags.DoubleQuote);
});
test('Strings: unclosed', () => {
const t = new Tokenizer();
const results = t.tokenize(' "string" """line1\n#line2"""\t\'un#closed');
@ -229,7 +241,7 @@ test('Strings: unclosed', () => {
test('Strings: escaped across multiple lines', () => {
const t = new Tokenizer();
const results = t.tokenize(' "a\\\nb" \'c\\\n\rb\'');
const results = t.tokenize(' "a\\\nb" \'c\\\r\nb\'');
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const ranges = [[1, 6], [8, 7]];
@ -451,20 +463,20 @@ test('Strings: bytes string with non-ASCII', () => {
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
assert.equal(stringToken0.type, TokenType.String);
assert.equal(stringToken0.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInByte);
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInBytes);
assert.equal(stringToken0.length, 7);
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.SingleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInByte |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInBytes |
StringTokenFlags.Triplicate);
assert.equal(stringToken1.length, 11);
});
test('Strings: raw strings with escapes', () => {
const t = new Tokenizer();
const results = t.tokenize('R"\\""');
const results = t.tokenize('R"\\"" r"\\\r\n\\\n\\a"');
assert.equal(results.tokens.count, 2 + _implicitTokenCount);
const stringToken0 = results.tokens.getItemAt(0) as StringToken;
@ -476,10 +488,10 @@ test('Strings: raw strings with escapes', () => {
const stringToken1 = results.tokens.getItemAt(1) as StringToken;
assert.equal(stringToken1.type, TokenType.String);
assert.equal(stringToken1.flags, StringTokenFlags.SingleQuote |
StringTokenFlags.Bytes | StringTokenFlags.NonAsciiInByte |
StringTokenFlags.Triplicate);
assert.equal(stringToken1.length, 11);
assert.equal(stringToken1.flags, StringTokenFlags.DoubleQuote |
StringTokenFlags.Raw);
assert.equal(stringToken1.length, 10);
assert.equal(stringToken1.value, '\\\r\n\\\n\\a');
});
test('Strings: escape at the end of double quoted string ', () => {