Save some memory in token creation (#7434)

* Save some memory in token creation
* Found another 200k
* Fix wrong way to check for undefined
* Back out unnecessary regex change
* Remove 'ExceedsMaxSize'
2024-09-11 16:06:39 +03:00 · 2024-03-11 09:30:48 -07:00 · 2024-03-11 09:30:48 -07:00 · 06bc912e38
commit 06bc912e38
parent 6e73064841
6 changed files with 54 additions and 59 deletions
--- a/packages/pyright-internal/src/analyzer/parseTreeUtils.ts
+++ b/packages/pyright-internal/src/analyzer/parseTreeUtils.ts
@ -277,7 +277,7 @@ export function printExpression(node: ExpressionNode, flags = PrintExpressionFla
            let escapedString = node.token.escapedValue;
            if ((flags & PrintExpressionFlags.DoNotLimitStringLength) === 0) {
                const maxStringLength = 32;
-                escapedString = escapedString.substring(0, maxStringLength);
+                escapedString = escapedString.slice(0, maxStringLength);
            }

            if (node.token.flags & StringTokenFlags.Triplicate) {
--- a/packages/pyright-internal/src/commands/dumpFileDebugInfoCommand.ts
+++ b/packages/pyright-internal/src/commands/dumpFileDebugInfoCommand.ts
@ -1344,7 +1344,6 @@ function getKeywordTypeString(type: KeywordType) {
 const StringTokenFlagsStrings: [StringTokenFlags, string][] = [
    [StringTokenFlags.Bytes, 'Bytes'],
    [StringTokenFlags.DoubleQuote, 'DoubleQuote'],
-    [StringTokenFlags.ExceedsMaxSize, 'ExceedsMaxSize'],
    [StringTokenFlags.Format, 'Format'],
    [StringTokenFlags.Raw, 'Raw'],
    [StringTokenFlags.SingleQuote, 'SingleQuote'],
--- a/packages/pyright-internal/src/parser/parser.ts
+++ b/packages/pyright-internal/src/parser/parser.ts
@ -4552,7 +4552,7 @@ export class Parser {
            return undefined;
        }

-        const interTokenContents = this._fileContents!.substring(curToken.start + curToken.length, nextToken.start);
+        const interTokenContents = this._fileContents!.slice(curToken.start + curToken.length, nextToken.start);
        const commentRegEx = /^(\s*#\s*type:\s*)([^\r\n]*)/;
        const match = interTokenContents.match(commentRegEx);
        if (!match) {
--- a/packages/pyright-internal/src/parser/stringTokenUtils.ts
+++ b/packages/pyright-internal/src/parser/stringTokenUtils.ts
@ -9,7 +9,6 @@
 */

 import { Char } from '../common/charCodes';
-import { maxStringTokenLength } from './tokenizer';
 import { FStringMiddleToken, StringToken, StringTokenFlags } from './tokenizerTypes';

 export const enum UnescapeErrorType {
@ -40,10 +39,14 @@ interface IncompleteUnescapedString {
    nonAsciiInBytes: boolean;
 }

-function completeUnescapedString(incomplete: IncompleteUnescapedString): UnescapedString {
+function completeUnescapedString(incomplete: IncompleteUnescapedString, originalString: string): UnescapedString {
+    const newValue = incomplete.valueParts.join('');
+    // Use the original string if it's identical. This prevents us from allocating memory to hold
+    // a copy (a copy is made because the original string is a 'slice' of another, so it doesn't exist in the cache yet).
+    const value = originalString !== newValue ? newValue : originalString;
    return {
        ...incomplete,
-        value: incomplete.valueParts.join(''),
+        value,
    };
 }

@ -89,14 +92,11 @@ export function getUnescapedString(stringToken: StringToken | FStringMiddleToken
    const addInvalidEscapeOffset = () => {
        // Invalid escapes are not reported for raw strings.
        if (!isRaw) {
-            // If this is the last character of a truncated string, don't report.
-            if ((stringToken.flags & StringTokenFlags.ExceedsMaxSize) === 0 || strOffset < maxStringTokenLength) {
-                output.unescapeErrors.push({
-                    offset: strOffset - 1,
-                    length: 2,
-                    errorType: UnescapeErrorType.InvalidEscapeSequence,
-                });
-            }
+            output.unescapeErrors.push({
+                offset: strOffset - 1,
+                length: 2,
+                errorType: UnescapeErrorType.InvalidEscapeSequence,
+            });
        }
    };

@ -142,7 +142,7 @@ export function getUnescapedString(stringToken: StringToken | FStringMiddleToken
    while (true) {
        let curChar = getEscapedCharacter();
        if (curChar === Char.EndOfText) {
-            return completeUnescapedString(output);
+            return completeUnescapedString(output, escapedString);
        }

        if (curChar === Char.Backslash) {
--- a/packages/pyright-internal/src/parser/tokenizer.ts
+++ b/packages/pyright-internal/src/parser/tokenizer.ts
@ -143,8 +143,6 @@ const _byteOrderMarker = 0xfeff;

 const defaultTabSize = 8;

-export const maxStringTokenLength = 32 * 1024;
-
 export interface TokenizerOutput {
    // List of all tokens.
    tokens: TextRangeCollection<Token>;
@ -266,7 +264,7 @@ export class Tokenizer {
        } else if (length < 0 || start + length > text.length) {
            throw new Error(`Invalid range length (start=${start}, length=${length}, text.length=${text.length})`);
        } else if (start + length < text.length) {
-            text = text.substring(0, start + length);
+            text = text.slice(0, start + length);
        }

        this._cs = new CharacterStream(text);
@ -432,7 +430,7 @@ export class Tokenizer {
        if (stringPrefixLength >= 0) {
            let stringPrefix = '';
            if (stringPrefixLength > 0) {
-                stringPrefix = this._cs.getText().substring(this._cs.position, this._cs.position + stringPrefixLength);
+                stringPrefix = this._cs.getText().slice(this._cs.position, this._cs.position + stringPrefixLength);
                // Indeed a string
                this._cs.advance(stringPrefixLength);
            }
@ -860,7 +858,7 @@ export class Tokenizer {
        }

        if (this._cs.position > start) {
-            const value = this._cs.getText().substring(start, this._cs.position);
+            const value = this._cs.getText().slice(start, this._cs.position);
            if (_keywords.has(value)) {
                this._tokens.push(
                    KeywordToken.create(start, this._cs.position - start, _keywords.get(value)!, this._getComments())
@ -926,9 +924,9 @@ export class Tokenizer {
            }

            if (radix > 0) {
-                const text = this._cs.getText().substring(start, this._cs.position);
+                const text = this._cs.getText().slice(start, this._cs.position);
                const simpleIntText = text.replace(/_/g, '');
-                let intValue: number | bigint = parseInt(simpleIntText.substring(leadingChars), radix);
+                let intValue: number | bigint = parseInt(simpleIntText.slice(leadingChars), radix);

                if (!isNaN(intValue)) {
                    const bigIntValue = BigInt(simpleIntText);
@ -979,7 +977,7 @@ export class Tokenizer {
        }

        if (isDecimalInteger) {
-            let text = this._cs.getText().substring(start, this._cs.position);
+            let text = this._cs.getText().slice(start, this._cs.position);
            const simpleIntText = text.replace(/_/g, '');
            let intValue: number | bigint = parseInt(simpleIntText, 10);

@ -1015,7 +1013,7 @@ export class Tokenizer {
            (this._cs.currentChar === Char.Period && this._cs.nextChar >= Char._0 && this._cs.nextChar <= Char._9)
        ) {
            if (this._skipFloatingPointCandidate()) {
-                let text = this._cs.getText().substring(start, this._cs.position);
+                let text = this._cs.getText().slice(start, this._cs.position);
                const value = parseFloat(text);
                if (!isNaN(value)) {
                    let isImaginary = false;
@ -1244,7 +1242,7 @@ export class Tokenizer {

            if (type === CommentType.IPythonMagic || type === CommentType.IPythonShellEscape) {
                const length = this._cs.position - begin;
-                const value = this._cs.getText().substring(begin, begin + length);
+                const value = this._cs.getText().slice(begin, begin + length);

                // is it multiline magics?
                // %magic command \
@ -1259,9 +1257,7 @@ export class Tokenizer {
        } while (!this._cs.isEndOfStream());

        const length = this._cs.position - start;
-        const value = this._cs.getText().substring(start, start + length);
-
-        const comment = Comment.create(start, length, value, type);
+        const comment = Comment.create(start, length, this._cs.getText().slice(start, start + length), type);
        this._addComments(comment);
    }

@ -1270,10 +1266,9 @@ export class Tokenizer {
        this._cs.skipToEol();

        const length = this._cs.position - start;
-        const value = this._cs.getText().substring(start, start + length);
-        const comment = Comment.create(start, length, value);
+        const comment = Comment.create(start, length, this._cs.getText().slice(start, start + length));

-        const typeIgnoreRegexMatch = value.match(/((^|#)\s*)type:\s*ignore(\s*\[([\s*\w-,]*)\]|\s|$)/);
+        const typeIgnoreRegexMatch = comment.value.match(/((^|#)\s*)type:\s*ignore(\s*\[([\s*\w-,]*)\]|\s|$)/);
        if (typeIgnoreRegexMatch) {
            const commentStart = start + (typeIgnoreRegexMatch.index ?? 0);
            const textRange: TextRange = {
@ -1292,7 +1287,7 @@ export class Tokenizer {
            }
        }

-        const pyrightIgnoreRegexMatch = value.match(/((^|#)\s*)pyright:\s*ignore(\s*\[([\s*\w-,]*)\]|\s|$)/);
+        const pyrightIgnoreRegexMatch = comment.value.match(/((^|#)\s*)pyright:\s*ignore(\s*\[([\s*\w-,]*)\]|\s|$)/);
        if (pyrightIgnoreRegexMatch) {
            const commentStart = start + (pyrightIgnoreRegexMatch.index ?? 0);
            const textRange: TextRange = {
@ -1371,7 +1366,7 @@ export class Tokenizer {
        if (this._cs.lookAhead(2) === Char.SingleQuote || this._cs.lookAhead(2) === Char.DoubleQuote) {
            const prefix = this._cs
                .getText()
-                .substring(this._cs.position, this._cs.position + 2)
+                .slice(this._cs.position, this._cs.position + 2)
                .toLowerCase();
            switch (prefix) {
                case 'rf':
@ -1572,17 +1567,22 @@ export class Tokenizer {
        const isTriplicate = (flags & StringTokenFlags.Triplicate) !== 0;
        const isFString = (flags & StringTokenFlags.Format) !== 0;
        let isInNamedUnicodeEscape = false;
-        let escapedValueParts: number[] = [];
+        const start = this._cs.position;
+        let escapedValueLength = 0;
+        const getEscapedValue = () => this._cs.getText().slice(start, start + escapedValueLength);

        while (true) {
            if (this._cs.isEndOfStream()) {
                // Hit the end of file without a termination.
                flags |= StringTokenFlags.Unterminated;
-                return { escapedValue: String.fromCharCode.apply(undefined, escapedValueParts), flags };
+                return {
+                    escapedValue: getEscapedValue(),
+                    flags,
+                };
            }

            if (this._cs.currentChar === Char.Backslash) {
-                escapedValueParts.push(this._cs.currentChar);
+                escapedValueLength++;

                // Move past the escape (backslash) character.
                this._cs.moveNext();
@ -1611,14 +1611,14 @@ export class Tokenizer {
                                this._cs.getCurrentChar() === Char.CarriageReturn &&
                                this._cs.nextChar === Char.LineFeed
                            ) {
-                                escapedValueParts.push(this._cs.currentChar);
+                                escapedValueLength++;
                                this._cs.moveNext();
                            }
-                            escapedValueParts.push(this._cs.currentChar);
+                            escapedValueLength++;
                            this._cs.moveNext();
                            this._addLineRange();
                        } else {
-                            escapedValueParts.push(this._cs.currentChar);
+                            escapedValueLength++;
                            this._cs.moveNext();
                        }
                    }
@ -1627,16 +1627,19 @@ export class Tokenizer {
                if (!isTriplicate && !isFString) {
                    // Unterminated single-line string
                    flags |= StringTokenFlags.Unterminated;
-                    return { escapedValue: String.fromCharCode.apply(undefined, escapedValueParts), flags };
+                    return {
+                        escapedValue: getEscapedValue(),
+                        flags,
+                    };
                }

                // Skip over the new line (either one or two characters).
                if (this._cs.currentChar === Char.CarriageReturn && this._cs.nextChar === Char.LineFeed) {
-                    escapedValueParts.push(this._cs.currentChar);
+                    escapedValueLength++;
                    this._cs.moveNext();
                }

-                escapedValueParts.push(this._cs.currentChar);
+                escapedValueLength++;
                this._cs.moveNext();
                this._addLineRange();
            } else if (!isTriplicate && this._cs.currentChar === quoteChar) {
@ -1655,41 +1658,35 @@ export class Tokenizer {
                    flags |= StringTokenFlags.ReplacementFieldStart;
                    break;
                } else {
-                    escapedValueParts.push(this._cs.currentChar);
+                    escapedValueLength++;
                    this._cs.moveNext();
-                    escapedValueParts.push(this._cs.currentChar);
+                    escapedValueLength++;
                    this._cs.moveNext();
                }
            } else if (isInNamedUnicodeEscape && this._cs.currentChar === Char.CloseBrace) {
                isInNamedUnicodeEscape = false;
-                escapedValueParts.push(this._cs.currentChar);
+                escapedValueLength++;
                this._cs.moveNext();
            } else if (isFString && this._cs.currentChar === Char.CloseBrace) {
                if (inFormatSpecifier || this._cs.nextChar !== Char.CloseBrace) {
                    flags |= StringTokenFlags.ReplacementFieldEnd;
                    break;
                } else {
-                    escapedValueParts.push(this._cs.currentChar);
+                    escapedValueLength++;
                    this._cs.moveNext();
-                    escapedValueParts.push(this._cs.currentChar);
+                    escapedValueLength++;
                    this._cs.moveNext();
                }
            } else {
-                escapedValueParts.push(this._cs.currentChar);
+                escapedValueLength++;
                this._cs.moveNext();
            }
        }

-        // String.fromCharCode.apply crashes (stack overflow) if passed an array
-        // that is too long. Cut off the extra characters in this case to avoid
-        // the crash. It's unlikely that the full string value will be used as
-        // a string literal or a docstring, so this should be fine.
-        if (escapedValueParts.length > maxStringTokenLength) {
-            escapedValueParts = escapedValueParts.slice(0, maxStringTokenLength);
-            flags |= StringTokenFlags.ExceedsMaxSize;
-        }
-
-        return { escapedValue: String.fromCharCode.apply(undefined, escapedValueParts), flags };
+        return {
+            escapedValue: getEscapedValue(),
+            flags,
+        };
    }

    private _skipFloatingPointCandidate(): boolean {
--- a/packages/pyright-internal/src/parser/tokenizerTypes.ts
+++ b/packages/pyright-internal/src/parser/tokenizerTypes.ts
@ -173,7 +173,6 @@ export const enum StringTokenFlags {

    // Error conditions
    Unterminated = 1 << 16,
-    ExceedsMaxSize = 1 << 17,
 }

 export const enum CommentType {