From d4ae44ac4547ad0964bb4c3bc482c9a23c13feb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Donny/=EA=B0=95=EB=8F=99=EC=9C=A4?=
Date: Mon, 6 Nov 2023 19:45:46 +0900
Subject: [PATCH] perf(es/parser): Use smarter lookup table for lexer (#8226)

**Description:**

`phf` is slower than expected.
---
 crates/swc_ecma_ast/src/ident.rs          |  44 ++--
 crates/swc_ecma_parser/src/lexer/mod.rs   |  97 ++++-----
 crates/swc_ecma_parser/src/lexer/table.rs | 240 +++++++++++++++++++++-
 3 files changed, 305 insertions(+), 76 deletions(-)

diff --git a/crates/swc_ecma_ast/src/ident.rs b/crates/swc_ecma_ast/src/ident.rs
index b684247a27a..7b9b4d58dd6 100644
--- a/crates/swc_ecma_ast/src/ident.rs
+++ b/crates/swc_ecma_ast/src/ident.rs
@@ -151,6 +151,12 @@ impl From<Ident> for Id {
     }
 }
 
+#[repr(C, align(64))]
+struct Align64<T>(pub(crate) T);
+
+const T: bool = true;
+const F: bool = false;
+
 impl Ident {
     /// In `op`, [EqIgnoreSpan] of [Ident] will ignore the syntax context.
     pub fn within_ignored_ctxt<F, Ret>(op: F) -> Ret
@@ -175,26 +181,40 @@ impl Ident {
     /// Returns true if `c` is a valid character for an identifier start.
     #[inline]
     pub fn is_valid_start(c: char) -> bool {
-        c == '$' || c == '_' || c.is_ascii_alphabetic() || {
-            if c.is_ascii() {
-                false
-            } else {
-                UnicodeID::is_id_start(c)
-            }
+        // This contains `$` (36) and `_` (95)
+        const ASCII_START: Align64<[bool; 128]> = Align64([
+            F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+            F, F, F, F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+            F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
+            T, T, T, T, F, F, F, F, T, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
+            T, T, T, T, T, T, T, F, F, F, F, F,
+        ]);
+
+        if c.is_ascii() {
+            return ASCII_START.0[c as usize];
         }
+
+        UnicodeID::is_id_start(c)
     }
 
     /// Returns true if `c` is a valid character for an identifier part after
     /// start.
     #[inline]
     pub fn is_valid_continue(c: char) -> bool {
-        c == '$' || c == '_' || c == '\u{200c}' || c == '\u{200d}' || c.is_ascii_alphanumeric() || {
-            if c.is_ascii() {
-                false
-            } else {
-                UnicodeID::is_id_continue(c)
-            }
+        // This contains `$` (36)
+        const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
+            F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
+            F, F, F, F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T,
+            F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
+            T, T, T, T, F, F, F, F, T, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
+            T, T, T, T, T, T, T, F, F, F, F, F,
+        ]);
+
+        if c.is_ascii() {
+            return ASCII_CONTINUE.0[c as usize];
         }
+
+        UnicodeID::is_id_continue(c) || c == '\u{200c}' || c == '\u{200d}'
     }
 
     /// Alternative for `toIdentifier` of babel.
diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs
index 7f91870a172..075d892558c 100644
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@@ -21,7 +21,7 @@ pub use self::{
 };
 use crate::{
     error::{Error, SyntaxError},
-    token::{BinOpToken, Keyword, Token, Word},
+    token::{BinOpToken, IdentLike, Token, Word},
     Context, Syntax,
 };
 
@@ -759,59 +759,34 @@ impl<'a> Lexer<'a> {
         Ok(Some(token))
     }
 
-    /// See https://tc39.github.io/ecma262/#sec-names-and-keywords
-    fn read_ident_or_keyword(&mut self) -> LexResult<Token> {
-        static KNOWN_WORDS: phf::Map<&str, Word> = phf::phf_map! {
-            "await" => Word::Keyword(Keyword::Await),
-            "break" => Word::Keyword(Keyword::Break),
-            "case" => Word::Keyword(Keyword::Case),
-            "catch" => Word::Keyword(Keyword::Catch),
-            "class" => Word::Keyword(Keyword::Class),
-            "const" => Word::Keyword(Keyword::Const),
-            "continue" => Word::Keyword(Keyword::Continue),
-            "debugger" => Word::Keyword(Keyword::Debugger),
-            "default" => Word::Keyword(Keyword::Default_),
-            "delete" => Word::Keyword(Keyword::Delete),
-            "do" => Word::Keyword(Keyword::Do),
-            "else" => Word::Keyword(Keyword::Else),
-            "export" => Word::Keyword(Keyword::Export),
-            "extends" => Word::Keyword(Keyword::Extends),
-            "false" => Word::False,
-            "finally" => Word::Keyword(Keyword::Finally),
-            "for" => Word::Keyword(Keyword::For),
-            "function" => Word::Keyword(Keyword::Function),
-            "if" => Word::Keyword(Keyword::If),
-            "import" => Word::Keyword(Keyword::Import),
-            "in" => Word::Keyword(Keyword::In),
-            "instanceof" => Word::Keyword(Keyword::InstanceOf),
-            "let" => Word::Keyword(Keyword::Let),
-            "new" => Word::Keyword(Keyword::New),
-            "null" => Word::Null,
-            "return" => Word::Keyword(Keyword::Return),
-            "super" => Word::Keyword(Keyword::Super),
-            "switch" => Word::Keyword(Keyword::Switch),
-            "this" => Word::Keyword(Keyword::This),
-            "throw" => Word::Keyword(Keyword::Throw),
-            "true" => Word::True,
-            "try" => Word::Keyword(Keyword::Try),
-            "typeof" => Word::Keyword(Keyword::TypeOf),
-            "var" => Word::Keyword(Keyword::Var),
-            "void" => Word::Keyword(Keyword::Void),
-            "while" => Word::Keyword(Keyword::While),
-            "with" => Word::Keyword(Keyword::With),
-            "yield" => Word::Keyword(Keyword::Yield),
-
-        };
-
+    /// This can be used if there's no keyword starting with the first
+    /// character.
+    fn read_ident_unknown(&mut self) -> LexResult<Token> {
         debug_assert!(self.cur().is_some());
 
-        let start = self.cur_pos();
-        let (word, has_escape) = self.read_word_as_str_with(|s| {
-            if let Some(word) = KNOWN_WORDS.get(s) {
-                return word.clone();
+        let (word, _) =
+            self.read_word_as_str_with(|s, _, _| Word::Ident(IdentLike::Other(s.into())))?;
+
+        Ok(Word(word))
+    }
+
+    /// This is used when a keyword or known ident can start with the first
+    /// character.
+    fn read_word_with(
+        &mut self,
+        convert: impl FnOnce(&str) -> Option<Word>,
+    ) -> LexResult<Option<Token>> {
+        debug_assert!(self.cur().is_some());
+
+        let start = self.cur_pos();
+        let (word, has_escape) = self.read_word_as_str_with(|s, _, can_be_known| {
+            if can_be_known {
+                if let Some(word) = convert(s) {
+                    return word;
+                }
             }
 
-            Word::Ident(s.into())
+            Word::Ident(IdentLike::Other(s.into()))
         })?;
 
         // Note: ctx is store in lexer because of this error.
@@ -824,17 +799,20 @@ impl<'a> Lexer<'a> {
                 SyntaxError::EscapeInReservedWord { word: word.into() },
             )?
         } else {
-            Ok(Word(word))
+            Ok(Some(Token::Word(word)))
         }
     }
 
     /// This method is optimized for texts without escape sequences.
+    ///
+    /// `convert(text, has_escape, can_be_keyword)`
     fn read_word_as_str_with<F, Ret>(&mut self, convert: F) -> LexResult<(Ret, bool)>
     where
-        F: FnOnce(&str) -> Ret,
+        F: FnOnce(&str, bool, bool) -> Ret,
     {
         debug_assert!(self.cur().is_some());
         let mut first = true;
+        let mut can_be_keyword = true;
 
         self.with_buf(|l, buf| {
             let mut has_escape = false;
 
@@ -842,7 +820,14 @@ impl<'a> Lexer<'a> {
             while let Some(c) = {
                 // Optimization
                 {
-                    let s = l.input.uncons_while(|c| c.is_ident_part());
+                    let s = l.input.uncons_while(|c| {
+                        // Performance optimization
+                        if c.is_ascii_uppercase() || c.is_ascii_digit() || !c.is_ascii() {
+                            can_be_keyword = false;
+                        }
+
+                        c.is_ident_part()
+                    });
                     if !s.is_empty() {
                         first = false;
                     }
@@ -892,7 +877,7 @@ impl<'a> Lexer<'a> {
                 }
                 first = false;
             }
-            let value = convert(buf);
+            let value = convert(buf, has_escape, can_be_keyword);
 
             Ok((value, has_escape))
         })
@@ -1137,7 +1122,9 @@ impl<'a> Lexer<'a> {
         // let flags_start = self.cur_pos();
         let flags = {
             match self.cur() {
-                Some(c) if c.is_ident_start() => self.read_word_as_str_with(|s| s.into()).map(Some),
+                Some(c) if c.is_ident_start() => {
+                    self.read_word_as_str_with(|s, _, _| s.into()).map(Some)
+                }
                 _ => Ok(None),
             }
         }?
diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs
index 78ccaecf431..04e8e67c020 100644
--- a/crates/swc_ecma_parser/src/lexer/table.rs
+++ b/crates/swc_ecma_parser/src/lexer/table.rs
@@ -11,7 +11,7 @@ use swc_ecma_ast::AssignOp;
 use super::{pos_span, util::CharExt, LexResult, Lexer};
 use crate::{
     error::SyntaxError,
-    token::{BinOpToken, Token},
+    token::{BinOpToken, IdentLike, Keyword, KnownIdent, Token, Word},
 };
 
 pub(super) type ByteHandler = Option<for<'aa> fn(&mut Lexer<'aa>) -> LexResult<Option<Token>>>;
@@ -21,12 +21,12 @@ pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [
     //   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F   //
     EOF, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 0
     ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1
-    ___, EXL, QOT, HSH, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
+    ___, EXL, QOT, HSH, IDN, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
     ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, MOR, QST, // 3
-    AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
-    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, IDT, BTC, CRT, IDT, // 5
-    TPL, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 6
-    IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BEO, PIP, BEC, TLD, ERR, // 7
+    AT_, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, // 4
+    IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, BTO, IDN, BTC, CRT, IDN, // 5
+    TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H, L_I, L_J, L_K, L_L, L_M, L_N, L_O, // 6
+    L_P, L_Q, L_R, L_S, L_T, L_U, L_V, L_W, L_X, L_Y, L_Z, BEO, PIP, BEC, TLD, ERR, // 7
     UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
     UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
     UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
@@ -59,8 +59,230 @@ const ERR: ByteHandler = Some(|lexer| {
     lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
 });
 
-/// Identifier
-const IDT: ByteHandler = Some(|lexer| lexer.read_ident_or_keyword().map(Some));
+/// Identifier and we know that this cannot be a keyword or known ident.
+const IDN: ByteHandler = Some(|lexer| lexer.read_ident_unknown().map(Some)); + +const L_A: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "abstract" => Some(Word::Ident(IdentLike::Known(KnownIdent::Abstract))), + "as" => Some(Word::Ident(IdentLike::Known(KnownIdent::As))), + "await" => Some(Word::Keyword(Keyword::Await)), + "async" => Some(Word::Ident(IdentLike::Known(KnownIdent::Async))), + "assert" => Some(Word::Ident(IdentLike::Known(KnownIdent::Assert))), + "asserts" => Some(Word::Ident(IdentLike::Known(KnownIdent::Asserts))), + "any" => Some(Word::Ident(IdentLike::Known(KnownIdent::Any))), + "accessor" => Some(Word::Ident(IdentLike::Known(KnownIdent::Accessor))), + _ => None, + }) +}); + +const L_B: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "break" => Some(Word::Keyword(Keyword::Break)), + "boolean" => Some(Word::Ident(IdentLike::Known(KnownIdent::Boolean))), + "bigint" => Some(Word::Ident(IdentLike::Known(KnownIdent::Bigint))), + _ => None, + }) +}); + +const L_C: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "case" => Some(Word::Keyword(Keyword::Case)), + "catch" => Some(Word::Keyword(Keyword::Catch)), + "class" => Some(Word::Keyword(Keyword::Class)), + "const" => Some(Word::Keyword(Keyword::Const)), + "continue" => Some(Word::Keyword(Keyword::Continue)), + _ => None, + }) +}); + +const L_D: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "debugger" => Some(Word::Keyword(Keyword::Debugger)), + "default" => Some(Word::Keyword(Keyword::Default_)), + "delete" => Some(Word::Keyword(Keyword::Delete)), + "do" => Some(Word::Keyword(Keyword::Do)), + "declare" => Some(Word::Ident(IdentLike::Known(KnownIdent::Declare))), + _ => None, + }) +}); + +const L_E: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "else" => Some(Word::Keyword(Keyword::Else)), + "enum" => Some(Word::Ident(IdentLike::Known(KnownIdent::Enum))), + "export" => Some(Word::Keyword(Keyword::Export)), + "extends" => Some(Word::Keyword(Keyword::Extends)), + _ => None, + }) +}); + +const L_F: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "false" => Some(Word::False), + "finally" => Some(Word::Keyword(Keyword::Finally)), + "for" => Some(Word::Keyword(Keyword::For)), + "function" => Some(Word::Keyword(Keyword::Function)), + "from" => Some(Word::Ident(IdentLike::Known(KnownIdent::From))), + _ => None, + }) +}); + +const L_G: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "global" => Some(Word::Ident(IdentLike::Known(KnownIdent::Global))), + "get" => Some(Word::Ident(IdentLike::Known(KnownIdent::Get))), + _ => None, + }) +}); + +const L_H: ByteHandler = IDN; + +const L_I: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "if" => Some(Word::Keyword(Keyword::If)), + "import" => Some(Word::Keyword(Keyword::Import)), + "in" => Some(Word::Keyword(Keyword::In)), + "instanceof" => Some(Word::Keyword(Keyword::InstanceOf)), + "is" => Some(Word::Ident(IdentLike::Known(KnownIdent::Is))), + "infer" => Some(Word::Ident(IdentLike::Known(KnownIdent::Infer))), + "interface" => Some(Word::Ident(IdentLike::Known(KnownIdent::Interface))), + "implements" => Some(Word::Ident(IdentLike::Known(KnownIdent::Implements))), + "intrinsic" => Some(Word::Ident(IdentLike::Known(KnownIdent::Intrinsic))), + _ => None, + }) +}); + +const L_J: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "let" => Some(Word::Keyword(Keyword::Let)), + _ => None, + }) +}); + +const L_K: 
ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "keyof" => Some(Word::Ident(IdentLike::Known(KnownIdent::Keyof))),
+        _ => None,
+    })
+});
+
+const L_L: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "let" => Some(Word::Keyword(Keyword::Let)),
+        _ => None,
+    })
+});
+
+const L_M: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "meta" => Some(Word::Ident(IdentLike::Known(KnownIdent::Meta))),
+        _ => None,
+    })
+});
+
+const L_N: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "new" => Some(Word::Keyword(Keyword::New)),
+        "null" => Some(Word::Null),
+        "number" => Some(Word::Ident(IdentLike::Known(KnownIdent::Number))),
+        "never" => Some(Word::Ident(IdentLike::Known(KnownIdent::Never))),
+        "namespace" => Some(Word::Ident(IdentLike::Known(KnownIdent::Namespace))),
+        _ => None,
+    })
+});
+
+const L_O: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "of" => Some(Word::Ident(IdentLike::Known(KnownIdent::Of))),
+        "object" => Some(Word::Ident(IdentLike::Known(KnownIdent::Object))),
+        _ => None,
+    })
+});
+
+const L_P: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "public" => Some(Word::Ident(IdentLike::Known(KnownIdent::Public))),
+        "package" => Some(Word::Ident(IdentLike::Known(KnownIdent::Package))),
+        "protected" => Some(Word::Ident(IdentLike::Known(KnownIdent::Protected))),
+        "private" => Some(Word::Ident(IdentLike::Known(KnownIdent::Private))),
+        _ => None,
+    })
+});
+
+const L_Q: ByteHandler = IDN;
+
+const L_R: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "return" => Some(Word::Keyword(Keyword::Return)),
+        "readonly" => Some(Word::Ident(IdentLike::Known(KnownIdent::Readonly))),
+        "require" => Some(Word::Ident(IdentLike::Known(KnownIdent::Require))),
+        _ => None,
+    })
+});
+
+const L_S: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "super" => Some(Word::Keyword(Keyword::Super)),
+        "static" => Some(Word::Ident(IdentLike::Known(KnownIdent::Static))),
+        "switch" => Some(Word::Keyword(Keyword::Switch)),
+        "symbol" => Some(Word::Ident(IdentLike::Known(KnownIdent::Symbol))),
+        "set" => Some(Word::Ident(IdentLike::Known(KnownIdent::Set))),
+        "string" => Some(Word::Ident(IdentLike::Known(KnownIdent::String))),
+        "satisfies" => Some(Word::Ident(IdentLike::Known(KnownIdent::Satisfies))),
+        _ => None,
+    })
+});
+
+const L_T: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "this" => Some(Word::Keyword(Keyword::This)),
+        "throw" => Some(Word::Keyword(Keyword::Throw)),
+        "true" => Some(Word::True),
+        "typeof" => Some(Word::Keyword(Keyword::TypeOf)),
+        "try" => Some(Word::Keyword(Keyword::Try)),
+        "type" => Some(Word::Ident(IdentLike::Known(KnownIdent::Type))),
+        "target" => Some(Word::Ident(IdentLike::Known(KnownIdent::Target))),
+        _ => None,
+    })
+});
+
+const L_U: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "using" => Some(Word::Ident(IdentLike::Known(KnownIdent::Using))),
+        "unique" => Some(Word::Ident(IdentLike::Known(KnownIdent::Unique))),
+        "undefined" => Some(Word::Ident(IdentLike::Known(KnownIdent::Undefined))),
+        "unknown" => Some(Word::Ident(IdentLike::Known(KnownIdent::Unknown))),
+        _ => None,
+    })
+});
+
+const L_V: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "var" => Some(Word::Keyword(Keyword::Var)),
+        "void" => Some(Word::Keyword(Keyword::Void)),
+        _ => None,
+    })
+});
+
+const L_W: ByteHandler = Some(|lexer| {
+    lexer.read_word_with(|s| match s {
+        "while" =>
Some(Word::Keyword(Keyword::While)), + "with" => Some(Word::Keyword(Keyword::With)), + _ => None, + }) +}); + +const L_X: ByteHandler = IDN; + +const L_Y: ByteHandler = Some(|lexer| { + lexer.read_word_with(|s| match s { + "yield" => Some(Word::Keyword(Keyword::Yield)), + _ => None, + }) +}); + +const L_Z: ByteHandler = IDN; /// `0` const ZER: ByteHandler = Some(|lexer| lexer.read_token_zero().map(Some)); @@ -89,7 +311,7 @@ const UNI: ByteHandler = Some(|lexer| { // Identifier or keyword. '\uXXXX' sequences are allowed in // identifiers, so '\' also dispatches to that. if c == '\\' || c.is_ident_start() { - return lexer.read_ident_or_keyword().map(Some); + return lexer.read_ident_unknown().map(Some); } let start = lexer.cur_pos();
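**Sketch — the cache-line-aligned ASCII tables.** In `ident.rs`, the chained comparisons in `Ident::is_valid_start` / `Ident::is_valid_continue` become a single load from a 128-entry `bool` table wrapped in `Align64`, so the whole table sits on one 64-byte cache line. The standalone sketch below shows the same technique under those assumptions; the `const fn` builder and `is_ascii_ident_start` are illustrative stand-ins (the patch writes the tables out literally and falls back to `unicode-id` for non-ASCII):

```rust
// Minimal sketch of the aligned ASCII lookup-table technique from ident.rs.
// Assumption: only the ASCII fast path is shown; the real code consults the
// `unicode-id` crate for non-ASCII characters.

/// Keep the whole table on one 64-byte cache line.
#[repr(C, align(64))]
struct Align64<T>(T);

const fn build_start_table() -> [bool; 128] {
    // The patch spells the 128 booleans out by hand; a `const fn` builder is
    // just a shorter way to produce the same table for this sketch.
    let mut table = [false; 128];
    let mut i = 0u8;
    while i < 128 {
        table[i as usize] = matches!(i, b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z');
        i += 1;
    }
    table
}

static ASCII_START: Align64<[bool; 128]> = Align64(build_start_table());

fn is_ascii_ident_start(c: char) -> bool {
    // One bounds check plus one table load instead of four range comparisons.
    (c as usize) < 128 && ASCII_START.0[c as usize]
}

fn main() {
    assert!(is_ascii_ident_start('$'));
    assert!(is_ascii_ident_start('_'));
    assert!(is_ascii_ident_start('z'));
    assert!(!is_ascii_ident_start('1'));
    assert!(!is_ascii_ident_start('é')); // non-ASCII: the real code asks unicode-id here
}
```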
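**Sketch — dispatching on the first byte instead of hashing.** `BYTE_HANDLERS` in `table.rs` is a 256-entry table indexed by the first byte of the token: bytes that can never start a keyword (`$`, `_`, digits, uppercase letters) jump straight to `read_ident_unknown`, while each lowercase letter gets its own handler that only compares against the few keywords and known idents beginning with that letter, via `read_word_with`. The sketch below mirrors that shape with just two letters filled in; `Handler`, `classify`, and the string labels are assumptions for illustration, not the swc API:

```rust
// Sketch of the per-first-byte keyword dispatch that replaces the `phf` map.
// Only the `c` and `f` handlers are filled in; all names here are illustrative.

type Handler = fn(&str) -> Option<&'static str>;

fn no_keyword(_: &str) -> Option<&'static str> {
    // This first byte can never start a keyword: no string comparisons at all.
    None
}

fn starts_with_c(s: &str) -> Option<&'static str> {
    // Only keywords beginning with `c` are ever compared against.
    match s {
        "case" | "catch" | "class" | "const" | "continue" => Some("keyword"),
        _ => None,
    }
}

fn starts_with_f(s: &str) -> Option<&'static str> {
    match s {
        "false" | "finally" | "for" | "function" => Some("keyword"),
        "from" => Some("known ident"),
        _ => None,
    }
}

/// 256-entry table indexed by the first byte, like `BYTE_HANDLERS` in the patch.
static HANDLERS: [Handler; 256] = {
    let mut table: [Handler; 256] = [no_keyword as Handler; 256];
    table[b'c' as usize] = starts_with_c;
    table[b'f' as usize] = starts_with_f;
    table
};

/// Classifies a scanned word, or returns `None` for a plain identifier.
fn classify(word: &str) -> Option<&'static str> {
    let first = *word.as_bytes().first()?;
    HANDLERS[first as usize](word)
}

fn main() {
    assert_eq!(classify("const"), Some("keyword"));
    assert_eq!(classify("constant"), None); // same first byte, not a keyword
    assert_eq!(classify("from"), Some("known ident"));
    assert_eq!(classify("banana"), None); // `b` handler not filled in for this sketch
}
```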
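**Sketch — the `can_be_keyword` short-circuit.** While `read_word_as_str_with` consumes identifier characters, it also tracks whether the word can still be a keyword: keywords and known idents are all lowercase ASCII, so the first uppercase letter, digit, or non-ASCII character lets `read_word_with` skip the keyword `match` entirely. A simplified, ASCII-only version of that scan (the `scan_word` name is illustrative, not the swc API):

```rust
// Simplified version of the `can_be_keyword` fast path. Assumption: the real
// lexer uses its own Unicode-aware `is_ident_part`; this sketch is ASCII-only.

fn scan_word(src: &str) -> (&str, bool) {
    let mut can_be_keyword = true;
    let mut len = 0;

    for c in src.chars() {
        // Stop at the first character that cannot continue an identifier.
        if !(c.is_ascii_alphanumeric() || c == '$' || c == '_') {
            break;
        }
        // Keywords and known idents are lowercase ASCII letters only, so an
        // uppercase letter or digit means the keyword lookup can be skipped.
        if c.is_ascii_uppercase() || c.is_ascii_digit() {
            can_be_keyword = false;
        }
        len += c.len_utf8();
    }

    (&src[..len], can_be_keyword)
}

fn main() {
    assert_eq!(scan_word("const x = 1"), ("const", true));
    assert_eq!(scan_word("myVar = 1"), ("myVar", false));
    assert_eq!(scan_word("x1 + 2"), ("x1", false));
}
```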