perf(es/parser): Use smarter lookup table for lexer (#8226)

**Description:**

`phf` is slower than expected.
This commit is contained in:
Donny/강동윤 2023-11-06 19:45:46 +09:00 committed by GitHub
parent f00238d543
commit d4ae44ac45
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 305 additions and 76 deletions

View File

@ -151,6 +151,12 @@ impl From<Ident> for Id {
}
}
/// Transparent-by-field wrapper that forces its payload onto a 64-byte
/// alignment boundary. The payload is reachable through `.0`.
#[repr(C)]
#[repr(align(64))]
struct Align64<T>(pub(crate) T);
/// Short alias for `true`, used to keep the lookup tables compact.
const T: bool = true;
/// Short alias for `false`, used to keep the lookup tables compact.
const F: bool = !T;
impl Ident {
/// In `op`, [EqIgnoreSpan] of [Ident] will ignore the syntax context.
pub fn within_ignored_ctxt<F, Ret>(op: F) -> Ret
@ -175,26 +181,40 @@ impl Ident {
/// Returns true if `c` is a valid character for an identifier start.
#[inline]
pub fn is_valid_start(c: char) -> bool {
c == '$' || c == '_' || c.is_ascii_alphabetic() || {
if c.is_ascii() {
false
} else {
UnicodeID::is_id_start(c)
}
// This contains `$` (36) and `_` (95)
const ASCII_START: Align64<[bool; 128]> = Align64([
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
F, F, F, F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
T, T, T, T, F, F, F, F, T, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
T, T, T, T, T, T, T, F, F, F, F, F,
]);
if c.is_ascii() {
return ASCII_START.0[c as usize];
}
UnicodeID::is_id_start(c)
}
/// Returns true if `c` is a valid character for an identifier part after
/// start.
#[inline]
pub fn is_valid_continue(c: char) -> bool {
c == '$' || c == '_' || c == '\u{200c}' || c == '\u{200d}' || c.is_ascii_alphanumeric() || {
if c.is_ascii() {
false
} else {
UnicodeID::is_id_continue(c)
}
// This contains `$` (36)
const ASCII_CONTINUE: Align64<[bool; 128]> = Align64([
F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
F, F, F, F, F, F, F, T, F, F, F, F, F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T,
F, F, F, F, F, F, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
T, T, T, T, F, F, F, F, T, F, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
T, T, T, T, T, T, T, F, F, F, F, F,
]);
if c.is_ascii() {
return ASCII_CONTINUE.0[c as usize];
}
UnicodeID::is_id_continue(c) || c == '\u{200c}' || c == '\u{200d}'
}
/// Alternative for `toIdentifier` of babel.

View File

@ -21,7 +21,7 @@ pub use self::{
};
use crate::{
error::{Error, SyntaxError},
token::{BinOpToken, Keyword, Token, Word},
token::{BinOpToken, IdentLike, Token, Word},
Context, Syntax,
};
@ -759,59 +759,34 @@ impl<'a> Lexer<'a> {
Ok(Some(token))
}
/// See https://tc39.github.io/ecma262/#sec-names-and-keywords
fn read_ident_or_keyword(&mut self) -> LexResult<Token> {
static KNOWN_WORDS: phf::Map<&str, Word> = phf::phf_map! {
"await" => Word::Keyword(Keyword::Await),
"break" => Word::Keyword(Keyword::Break),
"case" => Word::Keyword(Keyword::Case),
"catch" => Word::Keyword(Keyword::Catch),
"class" => Word::Keyword(Keyword::Class),
"const" => Word::Keyword(Keyword::Const),
"continue" => Word::Keyword(Keyword::Continue),
"debugger" => Word::Keyword(Keyword::Debugger),
"default" => Word::Keyword(Keyword::Default_),
"delete" => Word::Keyword(Keyword::Delete),
"do" => Word::Keyword(Keyword::Do),
"else" => Word::Keyword(Keyword::Else),
"export" => Word::Keyword(Keyword::Export),
"extends" => Word::Keyword(Keyword::Extends),
"false" => Word::False,
"finally" => Word::Keyword(Keyword::Finally),
"for" => Word::Keyword(Keyword::For),
"function" => Word::Keyword(Keyword::Function),
"if" => Word::Keyword(Keyword::If),
"import" => Word::Keyword(Keyword::Import),
"in" => Word::Keyword(Keyword::In),
"instanceof" => Word::Keyword(Keyword::InstanceOf),
"let" => Word::Keyword(Keyword::Let),
"new" => Word::Keyword(Keyword::New),
"null" => Word::Null,
"return" => Word::Keyword(Keyword::Return),
"super" => Word::Keyword(Keyword::Super),
"switch" => Word::Keyword(Keyword::Switch),
"this" => Word::Keyword(Keyword::This),
"throw" => Word::Keyword(Keyword::Throw),
"true" => Word::True,
"try" => Word::Keyword(Keyword::Try),
"typeof" => Word::Keyword(Keyword::TypeOf),
"var" => Word::Keyword(Keyword::Var),
"void" => Word::Keyword(Keyword::Void),
"while" => Word::Keyword(Keyword::While),
"with" => Word::Keyword(Keyword::With),
"yield" => Word::Keyword(Keyword::Yield),
};
/// This can be used if there's no keyword starting with the first
/// character.
fn read_ident_unknown(&mut self) -> LexResult<Token> {
debug_assert!(self.cur().is_some());
let start = self.cur_pos();
let (word, has_escape) = self.read_word_as_str_with(|s| {
if let Some(word) = KNOWN_WORDS.get(s) {
return word.clone();
let (word, _) =
self.read_word_as_str_with(|s, _, _| Word::Ident(IdentLike::Other(s.into())))?;
Ok(Word(word))
}
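/// Reads a word and offers it to `convert`, which may map it to a keyword or
/// known identifier. If `convert` is not consulted or returns `None`, the word
/// becomes a plain identifier.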
fn read_word_with(
&mut self,
convert: impl FnOnce(&str) -> Option<Word>,
) -> LexResult<Option<Token>> {
debug_assert!(self.cur().is_some());
let start = self.cur_pos();
let (word, has_escape) = self.read_word_as_str_with(|s, _, can_be_known| {
if can_be_known {
if let Some(word) = convert(s) {
return word;
}
}
Word::Ident(s.into())
Word::Ident(IdentLike::Other(s.into()))
})?;
// Note: ctx is stored in lexer because of this error.
@ -824,17 +799,20 @@ impl<'a> Lexer<'a> {
SyntaxError::EscapeInReservedWord { word: word.into() },
)?
} else {
Ok(Word(word))
Ok(Some(Token::Word(word)))
}
}
/// This method is optimized for texts without escape sequences.
///
/// `convert(text, has_escape, can_be_keyword)`
fn read_word_as_str_with<F, Ret>(&mut self, convert: F) -> LexResult<(Ret, bool)>
where
F: FnOnce(&str) -> Ret,
F: FnOnce(&str, bool, bool) -> Ret,
{
debug_assert!(self.cur().is_some());
let mut first = true;
let mut can_be_keyword = true;
self.with_buf(|l, buf| {
let mut has_escape = false;
@ -842,7 +820,14 @@ impl<'a> Lexer<'a> {
while let Some(c) = {
// Optimization
{
let s = l.input.uncons_while(|c| c.is_ident_part());
let s = l.input.uncons_while(|c| {
// Performance optimization
if c.is_ascii_uppercase() || c.is_ascii_digit() || !c.is_ascii() {
can_be_keyword = false;
}
c.is_ident_part()
});
if !s.is_empty() {
first = false;
}
@ -892,7 +877,7 @@ impl<'a> Lexer<'a> {
}
first = false;
}
let value = convert(buf);
let value = convert(buf, has_escape, can_be_keyword);
Ok((value, has_escape))
})
@ -1137,7 +1122,9 @@ impl<'a> Lexer<'a> {
// let flags_start = self.cur_pos();
let flags = {
match self.cur() {
Some(c) if c.is_ident_start() => self.read_word_as_str_with(|s| s.into()).map(Some),
Some(c) if c.is_ident_start() => {
self.read_word_as_str_with(|s, _, _| s.into()).map(Some)
}
_ => Ok(None),
}
}?

View File

@ -11,7 +11,7 @@ use swc_ecma_ast::AssignOp;
use super::{pos_span, util::CharExt, LexResult, Lexer};
use crate::{
error::SyntaxError,
token::{BinOpToken, Token},
token::{BinOpToken, IdentLike, Keyword, KnownIdent, Token, Word},
};
pub(super) type ByteHandler = Option<for<'aa> fn(&mut Lexer<'aa>) -> LexResult<Option<Token>>>;
@ -21,12 +21,12 @@ pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [
// 0 1 2 3 4 5 6 7 8 9 A B C D E F //
EOF, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 0
___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1
___, EXL, QOT, HSH, IDT, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
___, EXL, QOT, HSH, IDN, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2
ZER, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, COL, SEM, LSS, EQL, MOR, QST, // 3
AT_, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 4
IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BTO, IDT, BTC, CRT, IDT, // 5
TPL, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, // 6
IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, IDT, BEO, PIP, BEC, TLD, ERR, // 7
AT_, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, // 4
IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, IDN, BTO, IDN, BTC, CRT, IDN, // 5
TPL, L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H, L_I, L_J, L_K, L_L, L_M, L_N, L_O, // 6
L_P, L_Q, L_R, L_S, L_T, L_U, L_V, L_W, L_X, L_Y, L_Z, BEO, PIP, BEC, TLD, ERR, // 7
UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
@ -59,8 +59,230 @@ const ERR: ByteHandler = Some(|lexer| {
lexer.error_span(pos_span(start), SyntaxError::UnexpectedChar { c })?
});
/// Identifier
const IDT: ByteHandler = Some(|lexer| lexer.read_ident_or_keyword().map(Some));
/// Plain identifier: used for first bytes that can never begin a keyword or
/// known ident, so the word is read via `read_ident_unknown`.
const IDN: ByteHandler = Some(|lx| {
    let token = lx.read_ident_unknown();
    token.map(Some)
});
/// Handler for words starting with `a`: the keyword `await` plus the known
/// identifiers listed below. Any other word is reported to `read_word_with`
/// as `None`.
const L_A: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "abstract" => Word::Ident(IdentLike::Known(KnownIdent::Abstract)),
            "accessor" => Word::Ident(IdentLike::Known(KnownIdent::Accessor)),
            "any" => Word::Ident(IdentLike::Known(KnownIdent::Any)),
            "as" => Word::Ident(IdentLike::Known(KnownIdent::As)),
            "assert" => Word::Ident(IdentLike::Known(KnownIdent::Assert)),
            "asserts" => Word::Ident(IdentLike::Known(KnownIdent::Asserts)),
            "async" => Word::Ident(IdentLike::Known(KnownIdent::Async)),
            "await" => Word::Keyword(Keyword::Await),
            _ => return None,
        })
    })
});
/// Handler for words starting with `b`: the keyword `break` plus the known
/// identifiers `bigint` and `boolean`.
const L_B: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "bigint" => Word::Ident(IdentLike::Known(KnownIdent::Bigint)),
            "boolean" => Word::Ident(IdentLike::Known(KnownIdent::Boolean)),
            "break" => Word::Keyword(Keyword::Break),
            _ => return None,
        })
    })
});
/// Handler for words starting with `c`; every recognized word is a keyword.
const L_C: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        let keyword = match text {
            "case" => Keyword::Case,
            "catch" => Keyword::Catch,
            "class" => Keyword::Class,
            "const" => Keyword::Const,
            "continue" => Keyword::Continue,
            _ => return None,
        };
        Some(Word::Keyword(keyword))
    })
});
/// Handler for words starting with `d`: four keywords plus the known
/// identifier `declare`.
const L_D: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "debugger" => Word::Keyword(Keyword::Debugger),
            "declare" => Word::Ident(IdentLike::Known(KnownIdent::Declare)),
            "default" => Word::Keyword(Keyword::Default_),
            "delete" => Word::Keyword(Keyword::Delete),
            "do" => Word::Keyword(Keyword::Do),
            _ => return None,
        })
    })
});
/// Handler for words starting with `e`: `else`, `export`, `extends` as
/// keywords and `enum` as a known identifier.
const L_E: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "else" => Word::Keyword(Keyword::Else),
            "enum" => Word::Ident(IdentLike::Known(KnownIdent::Enum)),
            "export" => Word::Keyword(Keyword::Export),
            "extends" => Word::Keyword(Keyword::Extends),
            _ => return None,
        })
    })
});
/// Handler for words starting with `f`: the literal `false`, three keywords
/// and the known identifier `from`.
const L_F: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "false" => Word::False,
            "finally" => Word::Keyword(Keyword::Finally),
            "for" => Word::Keyword(Keyword::For),
            "from" => Word::Ident(IdentLike::Known(KnownIdent::From)),
            "function" => Word::Keyword(Keyword::Function),
            _ => return None,
        })
    })
});
/// Handler for words starting with `g`; both recognized words are known
/// identifiers.
const L_G: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        let known = match text {
            "get" => KnownIdent::Get,
            "global" => KnownIdent::Global,
            _ => return None,
        };
        Some(Word::Ident(IdentLike::Known(known)))
    })
});
/// `h`: aliased to [`IDN`], the handler for identifiers that cannot be a
/// keyword or known ident.
const L_H: ByteHandler = IDN;
/// Handler for words starting with `i`: four keywords plus the known
/// identifiers listed below.
const L_I: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "if" => Word::Keyword(Keyword::If),
            "implements" => Word::Ident(IdentLike::Known(KnownIdent::Implements)),
            "import" => Word::Keyword(Keyword::Import),
            "in" => Word::Keyword(Keyword::In),
            "infer" => Word::Ident(IdentLike::Known(KnownIdent::Infer)),
            "instanceof" => Word::Keyword(Keyword::InstanceOf),
            "interface" => Word::Ident(IdentLike::Known(KnownIdent::Interface)),
            "intrinsic" => Word::Ident(IdentLike::Known(KnownIdent::Intrinsic)),
            "is" => Word::Ident(IdentLike::Known(KnownIdent::Is)),
            _ => return None,
        })
    })
});
const L_J: ByteHandler = Some(|lexer| {
lexer.read_word_with(|s| match s {
"let" => Some(Word::Keyword(Keyword::Let)),
_ => None,
})
});
/// Handler for words starting with `k`; only the known identifier `keyof`
/// is recognized.
const L_K: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        (text == "keyof").then(|| Word::Ident(IdentLike::Known(KnownIdent::Keyof)))
    })
});
/// Handler for words starting with `l`; only the keyword `let` is
/// recognized.
const L_L: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| (text == "let").then(|| Word::Keyword(Keyword::Let)))
});
/// Handler for words starting with `m`; only the known identifier `meta`
/// is recognized.
const L_M: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        (text == "meta").then(|| Word::Ident(IdentLike::Known(KnownIdent::Meta)))
    })
});
/// Handler for words starting with `n`: the keyword `new`, the literal
/// `null` and three known identifiers.
const L_N: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "namespace" => Word::Ident(IdentLike::Known(KnownIdent::Namespace)),
            "never" => Word::Ident(IdentLike::Known(KnownIdent::Never)),
            "new" => Word::Keyword(Keyword::New),
            "null" => Word::Null,
            "number" => Word::Ident(IdentLike::Known(KnownIdent::Number)),
            _ => return None,
        })
    })
});
/// Handler for words starting with `o`; both recognized words are known
/// identifiers.
const L_O: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        let known = match text {
            "object" => KnownIdent::Object,
            "of" => KnownIdent::Of,
            _ => return None,
        };
        Some(Word::Ident(IdentLike::Known(known)))
    })
});
/// Handler for words starting with `p`; every recognized word is a known
/// identifier. Any other word is reported to `read_word_with` as `None`.
const L_P: ByteHandler = Some(|lexer| {
    lexer.read_word_with(|s| match s {
        "public" => Some(Word::Ident(IdentLike::Known(KnownIdent::Public))),
        // Must be spelled exactly `package`; a misspelled literal here means
        // `KnownIdent::Package` is never produced.
        "package" => Some(Word::Ident(IdentLike::Known(KnownIdent::Package))),
        "protected" => Some(Word::Ident(IdentLike::Known(KnownIdent::Protected))),
        "private" => Some(Word::Ident(IdentLike::Known(KnownIdent::Private))),
        _ => None,
    })
});
/// `q`: aliased to [`IDN`], the handler for identifiers that cannot be a
/// keyword or known ident.
const L_Q: ByteHandler = IDN;
/// Handler for words starting with `r`: the keyword `return` plus the known
/// identifiers `readonly` and `require`.
const L_R: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "readonly" => Word::Ident(IdentLike::Known(KnownIdent::Readonly)),
            "require" => Word::Ident(IdentLike::Known(KnownIdent::Require)),
            "return" => Word::Keyword(Keyword::Return),
            _ => return None,
        })
    })
});
/// Handler for words starting with `s`: the keywords `super` and `switch`
/// plus the known identifiers listed below.
const L_S: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "satisfies" => Word::Ident(IdentLike::Known(KnownIdent::Satisfies)),
            "set" => Word::Ident(IdentLike::Known(KnownIdent::Set)),
            "static" => Word::Ident(IdentLike::Known(KnownIdent::Static)),
            "string" => Word::Ident(IdentLike::Known(KnownIdent::String)),
            "super" => Word::Keyword(Keyword::Super),
            "switch" => Word::Keyword(Keyword::Switch),
            "symbol" => Word::Ident(IdentLike::Known(KnownIdent::Symbol)),
            _ => return None,
        })
    })
});
/// Handler for words starting with `t`: four keywords, the literal `true`
/// and the known identifiers `target` and `type`.
const L_T: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        Some(match text {
            "target" => Word::Ident(IdentLike::Known(KnownIdent::Target)),
            "this" => Word::Keyword(Keyword::This),
            "throw" => Word::Keyword(Keyword::Throw),
            "true" => Word::True,
            "try" => Word::Keyword(Keyword::Try),
            "type" => Word::Ident(IdentLike::Known(KnownIdent::Type)),
            "typeof" => Word::Keyword(Keyword::TypeOf),
            _ => return None,
        })
    })
});
const L_U: ByteHandler = Some(|lexer| {
lexer.read_word_with(|s| match s {
"using" => Some(Word::Ident(IdentLike::Known(KnownIdent::Using))),
"unique" => Some(Word::Ident(IdentLike::Known(KnownIdent::Unique))),
"undefined" => Some(Word::Ident(IdentLike::Known(KnownIdent::Undefined))),
"unknown" => Some(Word::Ident(IdentLike::Known(KnownIdent::Unknown))),
_ => None,
})
});
/// Handler for words starting with `v`; both recognized words are keywords.
const L_V: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        let keyword = match text {
            "var" => Keyword::Var,
            "void" => Keyword::Void,
            _ => return None,
        };
        Some(Word::Keyword(keyword))
    })
});
/// Handler for words starting with `w`; both recognized words are keywords.
const L_W: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| {
        let keyword = match text {
            "while" => Keyword::While,
            "with" => Keyword::With,
            _ => return None,
        };
        Some(Word::Keyword(keyword))
    })
});
/// `x`: aliased to [`IDN`], the handler for identifiers that cannot be a
/// keyword or known ident.
const L_X: ByteHandler = IDN;
/// Handler for words starting with `y`; only the keyword `yield` is
/// recognized.
const L_Y: ByteHandler = Some(|lx| {
    lx.read_word_with(|text| (text == "yield").then(|| Word::Keyword(Keyword::Yield)))
});
/// `z`: aliased to [`IDN`], the handler for identifiers that cannot be a
/// keyword or known ident.
const L_Z: ByteHandler = IDN;
/// `0`: delegates to `read_token_zero` and wraps the token in `Some`.
const ZER: ByteHandler = Some(|lx| {
    let token = lx.read_token_zero();
    token.map(Some)
});
@ -89,7 +311,7 @@ const UNI: ByteHandler = Some(|lexer| {
// Identifier or keyword. '\uXXXX' sequences are allowed in
// identifiers, so '\' also dispatches to that.
if c == '\\' || c.is_ident_start() {
return lexer.read_ident_or_keyword().map(Some);
return lexer.read_ident_unknown().map(Some);
}
let start = lexer.cur_pos();