perf(es/lexer): Use jump table for skip_space (#7073)

2024-12-25 22:56:11 +03:00 · 2023-03-13 17:32:49 +09:00 · 2023-03-13 17:32:49 +09:00 · f854d51343
commit f854d51343
parent 9c29666402
4 changed files with 121 additions and 33 deletions
--- a/crates/swc_ecma_parser/scripts/instrument/bench.sh
+++ b/crates/swc_ecma_parser/scripts/instrument/bench.sh
@ -4,4 +4,4 @@ set -eu
 export RUST_LOG=off
 export MIMALLOC_SHOW_STATS=1

-cargo profile instruments --release -t time --features tracing/release_max_level_info --features swc_common/concurrent --features swc_common/parking_lot --bench parser -- --bench --color
+cargo profile instruments --release -t time --features tracing/release_max_level_info --features swc_common/concurrent --features swc_common/parking_lot --bench parser -- --bench --color "$@"
--- a/crates/swc_ecma_parser/src/lexer/mod.rs
+++ b/crates/swc_ecma_parser/src/lexer/mod.rs
@ -34,6 +34,7 @@ mod table;
 #[cfg(test)]
 mod tests;
 pub mod util;
+mod whitespace;

 pub(crate) type LexResult<T> = Result<T, Error>;

--- a/crates/swc_ecma_parser/src/lexer/util.rs
+++ b/crates/swc_ecma_parser/src/lexer/util.rs
@ -12,7 +12,10 @@ use swc_common::{
 use swc_ecma_ast::Ident;
 use tracing::warn;

-use super::{comments_buffer::BufferedComment, input::Input, Char, LexResult, Lexer};
+use super::{
+    comments_buffer::BufferedComment, input::Input, whitespace::SkipWhitespace, Char, LexResult,
+    Lexer,
+};
 use crate::{
    error::{Error, SyntaxError},
    lexer::comments_buffer::BufferedCommentKind,
@ -184,18 +187,20 @@ impl<'a> Lexer<'a> {
    /// See https://tc39.github.io/ecma262/#sec-white-space
    pub(super) fn skip_space<const LEX_COMMENTS: bool>(&mut self) -> LexResult<()> {
        loop {
-            let cur_b = self.input.cur_as_ascii();
+            let (offset, newline) = {
+                let mut skip = SkipWhitespace {
+                    input: self.input.as_str(),
+                    newline: false,
+                    offset: 0,
+                };

-            if matches!(cur_b, Some(b'\n' | b'\r')) {
-                self.input.bump();
-                self.state.had_line_break = true;
-                continue;
-            }
+                skip.scan();

-            if matches!(cur_b, Some(b'\x09' | b'\x0b' | b'\x0c' | b'\x20' | b'\xa0')) {
-                self.input.bump();
-                continue;
-            }
+                (skip.offset, skip.newline)
+            };
+
+            self.input.bump_bytes(offset);
+            self.state.had_line_break |= newline;

            if LEX_COMMENTS && self.input.is_byte(b'/') {
                if self.peek() == Some('/') {
@ -205,34 +210,15 @@ impl<'a> Lexer<'a> {
                    self.skip_block_comment()?;
                    continue;
                }
+            }
+
            break;
        }

-            let c = self.cur();
-            let c = match c {
-                Some(v) => v,
-                None => break,
-            };
-
-            match c {
-                // white spaces
-                '\u{feff}' => {}
-                // line breaks
-                '\u{2028}' | '\u{2029}' => {
-                    self.state.had_line_break = true;
-                }
-
-                _ if c.is_whitespace() => {}
-
-                _ => break,
-            }
-
-            self.bump();
-        }
-
        Ok(())
    }

+    #[inline(never)]
    pub(super) fn skip_line_comment(&mut self, start_skip: usize) {
        let start = self.cur_pos();
        self.input.bump_bytes(start_skip);
@ -282,6 +268,7 @@ impl<'a> Lexer<'a> {
    }

    /// Expects current char to be '/' and next char to be '*'.
+    #[inline(never)]
    pub(super) fn skip_block_comment(&mut self) -> LexResult<()> {
        let start = self.cur_pos();

--- a/crates/swc_ecma_parser/src/lexer/whitespace.rs
+++ b/crates/swc_ecma_parser/src/lexer/whitespace.rs
@ -0,0 +1,100 @@
+/// Per-byte handler used by `SkipWhitespace::scan`.
+///
+/// `None` means the byte is not whitespace, so the scan stops. `Some(f)` calls
+/// `f`, which returns the number of bytes to advance; a return value of `0`
+/// also stops the scan.
+pub(super) type ByteHandler = Option<for<'aa> fn(&mut SkipWhitespace<'aa>) -> usize>;
+
+/// Lookup table for whitespace, indexed by the value of the current byte.
+///
+/// Each row is the high nibble of the byte and each column the low nibble.
+/// `SPC` marks 0x09, 0x0B, 0x0C and 0x20; `NLN` marks 0x0A and 0x0D; bytes
+/// 0x80 and above are routed to `UNI`; every other byte stops the scan (`___`).
+static BYTE_HANDLERS: [ByteHandler; 256] = [
+    //   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F   //
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, SPC, NLN, SPC, SPC, NLN, ___, ___, // 0
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1
+    SPC, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 2
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 3
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 4
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 5
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 6
+    ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 7
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 8
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // 9
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // A
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // B
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // C
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // D
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // E
+    UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, UNI, // F
+];
+
+/// Stop: no handler is registered for this byte, so the scan ends on it.
+const ___: ByteHandler = None;
+
+/// Newline: records that a line break was seen and advances by one byte.
+const NLN: ByteHandler = Some(|skip| {
+    skip.newline = true;
+
+    1
+});
+
+/// Space: advances by one byte without touching any other state.
+const SPC: ByteHandler = Some(|_| 1);
+
+/// Unicode: decodes the character starting at `skip.offset` and decides
+/// whether it is whitespace.
+///
+/// Returns the UTF-8 length of the character when it is U+FEFF, U+2028,
+/// U+2029 or satisfies `char::is_whitespace`, and `0` otherwise. U+2028 and
+/// U+2029 additionally set `skip.newline`.
+const UNI: ByteHandler = Some(|skip| {
+    let s = unsafe {
+        // Safety: `skip.offset` is always valid
+        // NOTE(review): this relies on `offset` starting at a char boundary
+        // (callers are expected to initialise it to 0) and only ever being
+        // advanced by whole characters -- confirm at every construction site.
+        skip.input.get_unchecked(skip.offset..)
+    };
+
+    let c = unsafe {
+        // Safety: Byte handlers are called only when `skip.input` is not empty
+        // More precisely, `scan` invokes a handler only after it has read a
+        // byte at `skip.offset`, so `s` contains at least one character.
+        s.chars().next().unwrap_unchecked()
+    };
+
+    match c {
+        // white spaces
+        '\u{feff}' => {}
+        // line breaks
+        '\u{2028}' | '\u{2029}' => {
+            skip.newline = true;
+        }
+
+        _ if c.is_whitespace() => {}
+
+        // Not whitespace: returning 0 tells `scan` to stop here.
+        _ => return 0,
+    }
+
+    c.len_utf8()
+});
+
+/// Cursor state for skipping leading whitespace in `input`.
+///
+/// API is taken from oxc by Boshen (https://github.com/Boshen/oxc/pull/26)
+pub(super) struct SkipWhitespace<'a> {
+    /// Text being scanned.
+    pub input: &'a str,
+
+    /// Total offset, in bytes, that `scan` has advanced into `input`.
+    pub offset: usize,
+
+    /// Found newline: set to `true` once a line break has been skipped.
+    pub newline: bool,
+}
+
+impl SkipWhitespace<'_> {
+    /// Advances `offset` past every consecutive whitespace character starting
+    /// at the current `offset`, setting `newline` if a line break is skipped.
+    ///
+    /// The scan stops at the end of `input`, at a byte with no registered
+    /// handler, or when a handler returns `0`.
+    #[inline(always)]
+    pub fn scan(&mut self) {
+        while let Some(byte) = self.input.as_bytes().get(self.offset).copied() {
+            // A `u8` can only index 0..=255 and the table has exactly 256
+            // entries, so this is always in bounds and needs no `unsafe`.
+            let delta = match BYTE_HANDLERS[usize::from(byte)] {
+                Some(handler) => handler(self),
+                // Not a whitespace byte.
+                None => return,
+            };
+
+            // A handler returns 0 when the decoded character is not whitespace.
+            if delta == 0 {
+                return;
+            }
+            self.offset += delta;
+        }
+    }
+}