perf(html/parser): Optimize usage of buffers (#6590)

This commit is contained in:
Alexander Akait 2022-12-07 05:18:31 +03:00 committed by GitHub
parent d9c1c3a9bf
commit d6e961368b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 519 additions and 82 deletions

View File

@ -199,7 +199,7 @@ where
doctype.push('>');
write_raw!(self, n.span, &doctype);
write_multiline_raw!(self, n.span, &doctype);
formatting_newline!(self);
}

View File

@ -4,7 +4,7 @@ use criterion::{black_box, criterion_group, criterion_main, Bencher, Criterion};
use swc_common::{input::StringInput, FileName};
use swc_html_parser::lexer::Lexer;
fn bench_lexer(b: &mut Bencher, src: &'static str) {
fn bench_document(b: &mut Bencher, src: &'static str) {
let _ = ::testing::run_test(false, |cm, _| {
let fm = cm.new_source_file(FileName::Anon, src.into());
@ -20,18 +20,30 @@ fn bench_lexer(b: &mut Bencher, src: &'static str) {
});
}
fn run(c: &mut Criterion, id: &str, src: &'static str) {
c.bench_function(&format!("html/lexer/{}", id), |b| {
bench_document(b, src);
});
}
fn bench_files(c: &mut Criterion) {
c.bench_function("html/lexer/css_2021_spec", |b| {
bench_lexer(b, include_str!("./files/css_2021_spec.html"))
});
run(
c,
"css_2021_spec",
include_str!("./files/css_2021_spec.html"),
);
c.bench_function("html/lexer/github_com_17_05_2022", |b| {
bench_lexer(b, include_str!("./files/github_com_17_05_2022.html"))
});
run(
c,
"github_com_17_05_2022",
include_str!("./files/github_com_17_05_2022.html"),
);
c.bench_function("html/lexer/stackoverflow_com_17_05_2022", |b| {
bench_lexer(b, include_str!("./files/stackoverflow_com_17_05_2022.html"))
});
run(
c,
"stackoverflow_com_17_05_2022",
include_str!("./files/stackoverflow_com_17_05_2022.html"),
);
}
criterion_group!(benches, bench_files);

View File

@ -51,30 +51,52 @@ fn bench_document_fragment(b: &mut Bencher, src: &'static str) {
});
}
fn run_document(c: &mut Criterion, id: &str, src: &'static str) {
c.bench_function(&format!("html/parser/{}", id), |b| {
bench_document(b, src);
});
}
fn run_document_fragment(c: &mut Criterion, id: &str, src: &'static str) {
c.bench_function(&format!("html/parser/{}", id), |b| {
bench_document_fragment(b, src);
});
}
fn bench_files(c: &mut Criterion) {
c.bench_function("html/parser_document/css_2021_spec", |b| {
bench_document(b, include_str!("./files/css_2021_spec.html"))
});
run_document(
c,
"parser_document/css_2021_spec",
include_str!("./files/css_2021_spec.html"),
);
c.bench_function("html/parser_document/github_com_17_05_2022", |b| {
bench_document(b, include_str!("./files/github_com_17_05_2022.html"))
});
run_document(
c,
"parser_document/github_com_17_05_2022",
include_str!("./files/github_com_17_05_2022.html"),
);
c.bench_function("html/parser_document/stackoverflow_com_17_05_2022", |b| {
bench_document(b, include_str!("./files/stackoverflow_com_17_05_2022.html"))
});
run_document(
c,
"parser_document/stackoverflow_com_17_05_2022",
include_str!("./files/stackoverflow_com_17_05_2022.html"),
);
c.bench_function("html/parser_document_fragment/css_2021_spec", |b| {
bench_document_fragment(b, include_str!("./files/css_2021_spec.html"))
});
run_document_fragment(
c,
"parser_document_fragment/css_2021_spec",
include_str!("./files/css_2021_spec.html"),
);
c.bench_function("html/parser_document_fragment/github_com_17_05_2022", |b| {
bench_document_fragment(b, include_str!("./files/github_com_17_05_2022.html"))
});
run_document_fragment(
c,
"parser_document_fragment/github_com_17_05_2022",
include_str!("./files/github_com_17_05_2022.html"),
);
c.bench_function(
"html/parser_document_fragment/stackoverflow_com_17_05_2022",
|b| bench_document_fragment(b, include_str!("./files/stackoverflow_com_17_05_2022.html")),
run_document_fragment(
c,
"parser_document_fragment/stackoverflow_com_17_05_2022",
include_str!("./files/stackoverflow_com_17_05_2022.html"),
);
}

View File

@ -225,9 +225,9 @@ where
fn validate_input_stream_character(&mut self, c: char) {
let code = c as u32;
if (0xd800..=0xdfff).contains(&code) {
if is_surrogate(code) {
self.emit_error(ErrorKind::SurrogateInInputStream);
} else if code != 0x00 && is_control(code) {
} else if is_allowed_control_character(code) {
self.emit_error(ErrorKind::ControlCharacterInInputStream);
} else if is_noncharacter(code) {
self.emit_error(ErrorKind::NoncharacterInInputStream);
@ -432,6 +432,86 @@ where
}
}
/// Consumes `c` plus the following run of characters accepted by `f`,
/// appending the ASCII-lowercased form to the main buffer and the raw
/// (original-case) form to the sub buffer.
fn consume_and_append_to_doctype_token_name<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // The DOCTYPE name is stored lowercased; the raw buffer keeps casing.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);

    // Batch-consume the whole matching run instead of one char per state step.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
}
/// Consumes `c` plus the following run of characters accepted by `f` into the
/// DOCTYPE public-id buffers, normalizing newlines: a CR (optionally followed
/// by LF) becomes a single LF in the normalized buffer, while the raw buffer
/// keeps the original bytes.
fn consume_and_append_to_doctype_token_public_id<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    if c == '\r' {
        // Newline normalization: CR and CRLF both collapse to LF.
        buf.push('\n');
        sub_buf.push(c);
        if self.input.cur() == Some('\n') {
            self.input.bump();
            sub_buf.push('\n');
        }
    } else {
        buf.push(c);
        sub_buf.push(c);
    }

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(rest);
    sub_buf.push_str(rest);
}
/// Consumes `c` plus the following run of characters accepted by `f` into the
/// DOCTYPE system-id buffers, with the same CR/CRLF -> LF normalization as the
/// public-id variant.
// NOTE(review): body is identical to
// `consume_and_append_to_doctype_token_public_id`; kept separate to mirror the
// tokenizer states, but a shared helper could remove the duplication.
fn consume_and_append_to_doctype_token_system_id<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    if c == '\r' {
        // Newline normalization: CR and CRLF both collapse to LF.
        buf.push('\n');
        sub_buf.push(c);
        if self.input.cur() == Some('\n') {
            self.input.bump();
            sub_buf.push('\n');
        }
    } else {
        buf.push(c);
        sub_buf.push(c);
    }

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(rest);
    sub_buf.push_str(rest);
}
#[inline(always)]
fn set_doctype_token_force_quirks(&mut self) {
if let Some(Token::Doctype { force_quirks, .. }) = &mut self.current_token {
@ -550,6 +630,24 @@ where
}
}
/// Consumes `c` plus the following run of characters accepted by `f`,
/// appending the ASCII-lowercased form to the tag-name buffer and the raw
/// form to the sub buffer.
fn consume_and_append_to_tag_token_name<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // Tag names are stored lowercased; the raw buffer keeps original casing.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);

    // Batch-consume the whole matching run instead of one char per state step.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
}
fn finish_tag_token_name(&mut self) {
if let Some(
Token::StartTag {
@ -603,6 +701,46 @@ where
sub_buf.push(raw_c);
}
/// Consumes `c` plus the following run of characters accepted by `f`,
/// appending the ASCII-lowercased form to the attribute-name buffer and the
/// raw form to the sub buffer.
fn consume_and_append_to_attribute_token_name<F>(&mut self, c: char, f: F)
where
    F: FnMut(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // Attribute names are stored lowercased; the raw buffer keeps casing.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
}
/// Like `consume_and_append_to_attribute_token_name`, but also mirrors the
/// raw (original-case) characters into the temporary buffer, which the
/// RAWTEXT/RCDATA end-tag states use for the "appropriate end tag" check.
fn consume_and_append_to_attribute_token_name_and_temp_buf<F>(&mut self, c: char, f: F)
where
    F: FnMut(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // Lowercased into the name buffer, raw into sub and temporary buffers.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);
    self.temporary_buffer.push(c);

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
    self.temporary_buffer.push_str(rest);
}
fn finish_attribute_token_name(&mut self) {
if let Some(attribute_start_position) = self.attribute_start_position {
if let Some(
@ -672,6 +810,37 @@ where
}
}
fn consume_and_append_to_attribute_token_value<F>(&mut self, c: char, f: F)
where
F: FnMut(char) -> bool,
{
let b = self.buf.clone();
let mut buf = b.borrow_mut();
let b = self.sub_buf.clone();
let mut sub_buf = b.borrow_mut();
let is_cr = c == '\r';
if is_cr {
buf.push('\n');
sub_buf.push(c);
if self.input.cur() == Some('\n') {
self.input.bump();
sub_buf.push('\n');
}
} else {
buf.push(c);
sub_buf.push(c);
}
let value = self.input.uncons_while(f);
buf.push_str(value);
sub_buf.push_str(value);
}
fn finish_attribute_token_value(&mut self) {
if let Some(attribute_start_position) = self.attribute_start_position {
if let Some(
@ -770,7 +939,10 @@ where
sub_buf.push(raw_c);
}
fn handle_raw_and_append_to_comment_token(&mut self, c: char) {
fn consume_and_append_to_comment_token<F>(&mut self, c: char, f: F)
where
F: Fn(char) -> bool,
{
let b = self.buf.clone();
let mut buf = b.borrow_mut();
let b = self.sub_buf.clone();
@ -791,6 +963,11 @@ where
buf.push(c);
sub_buf.push(c);
}
let value = self.input.uncons_while(f);
buf.push_str(value);
sub_buf.push_str(value);
}
fn emit_comment_token(&mut self, raw_end: Option<&str>) {
@ -1150,7 +1327,7 @@ where
// Switch to the before attribute name state.
Some(c) if is_spacy(c) => {
self.finish_tag_token_name();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.state = State::BeforeAttributeName;
}
// U+002F SOLIDUS (/)
@ -1170,7 +1347,7 @@ where
// Append the lowercase version of the current input character (add 0x0020
// to the character's code point) to the current tag token's tag name.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.consume_and_append_to_tag_token_name(c, is_ascii_upper_alpha);
}
// U+0000 NULL
// This is an unexpected-null-character parse error. Append a U+FFFD
@ -1192,7 +1369,17 @@ where
// Append the current input character to the current tag token's tag name.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_tag_token_name(c, c);
self.consume_and_append_to_tag_token_name(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control
!is_spacy(c)
&& !matches!(c, '/' | '>' | '\x00')
&& !is_ascii_upper_alpha(c)
});
}
}
}
@ -1257,7 +1444,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1296,15 +1483,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -1377,7 +1568,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1416,15 +1607,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -1505,7 +1700,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1544,15 +1739,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -1801,7 +2000,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1840,15 +2039,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -2143,7 +2346,7 @@ where
// U+0020 SPACE
// Ignore the character.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
}
// U+002F SOLIDUS (/)
// U+003E GREATER-THAN SIGN (>)
@ -2192,7 +2395,7 @@ where
// Reconsume in the after attribute name state.
Some(c) if is_spacy(c) => {
self.finish_attribute_token_name();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.reconsume_in_state(State::AfterAttributeName);
}
Some('/' | '>') | None => {
@ -2209,7 +2412,9 @@ where
// Append the lowercase version of the current input character (add 0x0020
// to the character's code point) to the current attribute's name.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_attribute_token_name(c.to_ascii_lowercase(), c);
self.consume_and_append_to_attribute_token_name(c, |c| {
is_ascii_upper_alpha(c)
});
}
// U+0000 NULL
// This is an unexpected-null-character parse error. Append a U+FFFD
@ -2232,8 +2437,17 @@ where
// Append the current input character to the current attribute's name.
Some(c) => {
self.validate_input_stream_character(c);
self.consume_and_append_to_attribute_token_name(c, |c| {
if !is_allowed_character(c) {
return false;
}
anything_else(self, c);
// List of characters from above to stop consumption and a certain
// branch took control
!is_spacy(c)
&& !matches!(c, '/' | '>' | '=' | '\x00' | '"' | '\'' | '<')
&& !is_ascii_upper_alpha(c)
});
}
}
@ -2257,7 +2471,7 @@ where
// U+0020 SPACE
// Ignore the character.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
}
// U+002F SOLIDUS (/)
// Switch to the self-closing start tag state.
@ -2303,7 +2517,7 @@ where
// U+0020 SPACE
// Ignore the character.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
}
// U+0022 QUOTATION MARK (")
// Switch to the attribute value (double-quoted) state.
@ -2369,7 +2583,15 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_attribute_token_value(Some(c), Some(c));
self.consume_and_append_to_attribute_token_value(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '"' | '&' | '\x00' | '\r')
});
}
}
}
@ -2410,7 +2632,15 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_attribute_token_value(Some(c), Some(c));
self.consume_and_append_to_attribute_token_value(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '\'' | '&' | '\x00' | '\r')
});
}
}
}
@ -2429,7 +2659,7 @@ where
// Switch to the before attribute name state.
Some(c) if is_spacy(c) => {
self.finish_attribute_token_value();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.state = State::BeforeAttributeName;
}
// U+0026 AMPERSAND (&)
@ -2479,8 +2709,19 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.consume_and_append_to_attribute_token_value(c, |c| {
if !is_allowed_character(c) {
return false;
}
anything_else(self, c);
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!is_spacy(c)
&& !matches!(
c,
'&' | '>' | '\x00' | '"' | '\'' | '<' | '=' | '`' | '\r'
)
});
}
}
}
@ -2495,7 +2736,7 @@ where
// Switch to the before attribute name state.
Some(c) if is_spacy(c) => {
self.finish_attribute_token_value();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.state = State::BeforeAttributeName;
}
// U+002F SOLIDUS (/)
@ -2599,7 +2840,15 @@ where
// Append the current input character to the comment token's data.
Some(c) => {
self.validate_input_stream_character(c);
self.handle_raw_and_append_to_comment_token(c);
self.consume_and_append_to_comment_token(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '>' | '\x00' | '\r')
});
}
}
}
@ -2832,7 +3081,15 @@ where
// Append the current input character to the comment token's data.
Some(c) => {
self.validate_input_stream_character(c);
self.handle_raw_and_append_to_comment_token(c);
self.consume_and_append_to_comment_token(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '<' | '-' | '\x00' | '\r')
});
}
}
}
@ -3157,8 +3414,7 @@ where
// Append the lowercase version of the current input character (add 0x0020
// to the character's code point) to the current DOCTYPE token's name.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(Some(c.to_ascii_lowercase()), None, None);
self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha);
}
// U+0000 NULL
// This is an unexpected-null-character parse error. Append a U+FFFD
@ -3185,8 +3441,13 @@ where
// Append the current input character to the current DOCTYPE token's name.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(Some(c), None, None);
self.consume_and_append_to_doctype_token_name(c, |c| {
if !is_allowed_character(c) {
return false;
}
!is_spacy(c) && !matches!(c, '>' | '\x00') && !is_ascii_upper_alpha(c)
});
}
}
}
@ -3469,8 +3730,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, Some(c), None);
self.consume_and_append_to_doctype_token_public_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '"' | '\x00' | '>' | '\r')
});
}
}
}
@ -3524,8 +3790,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, Some(c), None);
self.consume_and_append_to_doctype_token_public_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '\'' | '\x00' | '>' | '\r')
});
}
}
}
@ -3841,8 +4112,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, None, Some(c));
self.consume_and_append_to_doctype_token_system_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '"' | '\x00' | '>' | '\r')
});
}
}
}
@ -3896,8 +4172,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, None, Some(c));
self.consume_and_append_to_doctype_token_system_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '\'' | '\x00' | '>' | '\r')
});
}
}
}
@ -4061,7 +4342,6 @@ where
// numeric character reference state.
Some(c @ '#') => {
self.temporary_buffer.push(c);
self.state = State::NumericCharacterReference;
}
// Anything else
@ -4532,7 +4812,7 @@ where
}
#[inline(always)]
fn skip_next_lf(&mut self, c: char) {
fn skip_whitespaces(&mut self, c: char) {
if c == '\r' && self.input.cur() == Some('\n') {
self.input.bump();
}
@ -4634,3 +4914,19 @@ fn is_ascii_lower_alpha(c: char) -> bool {
fn is_ascii_alpha(c: char) -> bool {
is_ascii_upper_alpha(c) || is_ascii_lower_alpha(c)
}
#[inline(always)]
// True for control characters that trigger the
// `ControlCharacterInInputStream` error; NUL is excluded because it is
// reported through its own dedicated error path.
fn is_allowed_control_character(c: u32) -> bool {
    match c {
        0x00 => false,
        _ => is_control(c),
    }
}
#[inline(always)]
// Returns `true` when `c` may appear in the input stream without a parse
// error: not a surrogate, not a flagged control character (NUL is handled by
// a separate error path), and not a noncharacter code point. Used as the
// cheap gate inside the batched `uncons_while` predicates.
fn is_allowed_character(c: char) -> bool {
    let code = c as u32;

    // Expression form instead of the non-idiomatic trailing `return true;`.
    !(is_surrogate(code) || is_allowed_control_character(code) || is_noncharacter(code))
}

View File

@ -0,0 +1,6 @@
| <!DOCTYPE html "-//W3C//DTD HTML 4.01
Transitional//EN" "">
| <html>
| <head>
| <body>

View File

@ -0,0 +1,3 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
Transitional//EN">

View File

@ -0,0 +1,66 @@
{
"type": "Document",
"span": {
"start": 1,
"end": 65,
"ctxt": 0
},
"mode": "no-quirks",
"children": [
{
"type": "DocumentType",
"span": {
"start": 1,
"end": 65,
"ctxt": 0
},
"name": "html",
"publicId": "-//W3C//DTD HTML 4.01\n\nTransitional//EN",
"systemId": null,
"raw": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01\n\nTransitional//EN\">"
},
{
"type": "Element",
"span": {
"start": 0,
"end": 0,
"ctxt": 0
},
"tagName": "html",
"namespace": "http://www.w3.org/1999/xhtml",
"attributes": [],
"children": [
{
"type": "Element",
"span": {
"start": 0,
"end": 0,
"ctxt": 0
},
"tagName": "head",
"namespace": "http://www.w3.org/1999/xhtml",
"attributes": [],
"children": [],
"content": null,
"isSelfClosing": false
},
{
"type": "Element",
"span": {
"start": 0,
"end": 0,
"ctxt": 0
},
"tagName": "body",
"namespace": "http://www.w3.org/1999/xhtml",
"attributes": [],
"children": [],
"content": null,
"isSelfClosing": false
}
],
"content": null,
"isSelfClosing": false
}
]
}

View File

@ -0,0 +1,7 @@
x Non conforming doctype
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----

View File

@ -0,0 +1,25 @@
x Document
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----
x Child
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----
x DocumentType
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----
x Child
x Element