perf(html/parser): Optimize usage of buffers (#6590)

This commit is contained in:
Alexander Akait 2022-12-07 05:18:31 +03:00 committed by GitHub
parent d9c1c3a9bf
commit d6e961368b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 519 additions and 82 deletions

View File

@ -199,7 +199,7 @@ where
doctype.push('>');
write_raw!(self, n.span, &doctype);
write_multiline_raw!(self, n.span, &doctype);
formatting_newline!(self);
}

View File

@ -4,7 +4,7 @@ use criterion::{black_box, criterion_group, criterion_main, Bencher, Criterion};
use swc_common::{input::StringInput, FileName};
use swc_html_parser::lexer::Lexer;
fn bench_lexer(b: &mut Bencher, src: &'static str) {
fn bench_document(b: &mut Bencher, src: &'static str) {
let _ = ::testing::run_test(false, |cm, _| {
let fm = cm.new_source_file(FileName::Anon, src.into());
@ -20,18 +20,30 @@ fn bench_lexer(b: &mut Bencher, src: &'static str) {
});
}
fn run(c: &mut Criterion, id: &str, src: &'static str) {
c.bench_function(&format!("html/lexer/{}", id), |b| {
bench_document(b, src);
});
}
fn bench_files(c: &mut Criterion) {
c.bench_function("html/lexer/css_2021_spec", |b| {
bench_lexer(b, include_str!("./files/css_2021_spec.html"))
});
run(
c,
"css_2021_spec",
include_str!("./files/css_2021_spec.html"),
);
c.bench_function("html/lexer/github_com_17_05_2022", |b| {
bench_lexer(b, include_str!("./files/github_com_17_05_2022.html"))
});
run(
c,
"github_com_17_05_2022",
include_str!("./files/github_com_17_05_2022.html"),
);
c.bench_function("html/lexer/stackoverflow_com_17_05_2022", |b| {
bench_lexer(b, include_str!("./files/stackoverflow_com_17_05_2022.html"))
});
run(
c,
"stackoverflow_com_17_05_2022",
include_str!("./files/stackoverflow_com_17_05_2022.html"),
);
}
criterion_group!(benches, bench_files);

View File

@ -51,30 +51,52 @@ fn bench_document_fragment(b: &mut Bencher, src: &'static str) {
});
}
fn run_document(c: &mut Criterion, id: &str, src: &'static str) {
c.bench_function(&format!("html/parser/{}", id), |b| {
bench_document(b, src);
});
}
fn run_document_fragment(c: &mut Criterion, id: &str, src: &'static str) {
c.bench_function(&format!("html/parser/{}", id), |b| {
bench_document_fragment(b, src);
});
}
fn bench_files(c: &mut Criterion) {
c.bench_function("html/parser_document/css_2021_spec", |b| {
bench_document(b, include_str!("./files/css_2021_spec.html"))
});
run_document(
c,
"parser_document/css_2021_spec",
include_str!("./files/css_2021_spec.html"),
);
c.bench_function("html/parser_document/github_com_17_05_2022", |b| {
bench_document(b, include_str!("./files/github_com_17_05_2022.html"))
});
run_document(
c,
"parser_document/github_com_17_05_2022",
include_str!("./files/github_com_17_05_2022.html"),
);
c.bench_function("html/parser_document/stackoverflow_com_17_05_2022", |b| {
bench_document(b, include_str!("./files/stackoverflow_com_17_05_2022.html"))
});
run_document(
c,
"parser_document/stackoverflow_com_17_05_2022",
include_str!("./files/stackoverflow_com_17_05_2022.html"),
);
c.bench_function("html/parser_document_fragment/css_2021_spec", |b| {
bench_document_fragment(b, include_str!("./files/css_2021_spec.html"))
});
run_document_fragment(
c,
"parser_document_fragment/css_2021_spec",
include_str!("./files/css_2021_spec.html"),
);
c.bench_function("html/parser_document_fragment/github_com_17_05_2022", |b| {
bench_document_fragment(b, include_str!("./files/github_com_17_05_2022.html"))
});
run_document_fragment(
c,
"parser_document_fragment/github_com_17_05_2022",
include_str!("./files/github_com_17_05_2022.html"),
);
c.bench_function(
"html/parser_document_fragment/stackoverflow_com_17_05_2022",
|b| bench_document_fragment(b, include_str!("./files/stackoverflow_com_17_05_2022.html")),
run_document_fragment(
c,
"parser_document_fragment/stackoverflow_com_17_05_2022",
include_str!("./files/stackoverflow_com_17_05_2022.html"),
);
}

View File

@ -225,9 +225,9 @@ where
fn validate_input_stream_character(&mut self, c: char) {
let code = c as u32;
if (0xd800..=0xdfff).contains(&code) {
if is_surrogate(code) {
self.emit_error(ErrorKind::SurrogateInInputStream);
} else if code != 0x00 && is_control(code) {
} else if is_allowed_control_character(code) {
self.emit_error(ErrorKind::ControlCharacterInInputStream);
} else if is_noncharacter(code) {
self.emit_error(ErrorKind::NoncharacterInInputStream);
@ -432,6 +432,86 @@ where
}
}
/// Consumes `c` plus the following run of characters accepted by `f`,
/// appending the ASCII-lowercased form to the main buffer and the raw
/// (original-case) form to the sub buffer.
fn consume_and_append_to_doctype_token_name<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // The DOCTYPE name is stored lowercased; the raw buffer keeps casing.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);

    // Batch-consume the whole matching run instead of one char per state step.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
}
/// Consumes `c` plus the following run of characters accepted by `f` into the
/// DOCTYPE public-id buffers, normalizing newlines: a CR (optionally followed
/// by LF) becomes a single LF in the normalized buffer, while the raw buffer
/// keeps the original bytes.
fn consume_and_append_to_doctype_token_public_id<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    if c == '\r' {
        // Newline normalization: CR and CRLF both collapse to LF.
        buf.push('\n');
        sub_buf.push(c);
        if self.input.cur() == Some('\n') {
            self.input.bump();
            sub_buf.push('\n');
        }
    } else {
        buf.push(c);
        sub_buf.push(c);
    }

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(rest);
    sub_buf.push_str(rest);
}
/// Consumes `c` plus the following run of characters accepted by `f` into the
/// DOCTYPE system-id buffers, with the same CR/CRLF -> LF normalization as the
/// public-id variant.
// NOTE(review): body is identical to
// `consume_and_append_to_doctype_token_public_id`; kept separate to mirror the
// tokenizer states, but a shared helper could remove the duplication.
fn consume_and_append_to_doctype_token_system_id<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    if c == '\r' {
        // Newline normalization: CR and CRLF both collapse to LF.
        buf.push('\n');
        sub_buf.push(c);
        if self.input.cur() == Some('\n') {
            self.input.bump();
            sub_buf.push('\n');
        }
    } else {
        buf.push(c);
        sub_buf.push(c);
    }

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(rest);
    sub_buf.push_str(rest);
}
#[inline(always)]
fn set_doctype_token_force_quirks(&mut self) {
if let Some(Token::Doctype { force_quirks, .. }) = &mut self.current_token {
@ -550,6 +630,24 @@ where
}
}
/// Consumes `c` plus the following run of characters accepted by `f`,
/// appending the ASCII-lowercased form to the tag-name buffer and the raw
/// form to the sub buffer.
fn consume_and_append_to_tag_token_name<F>(&mut self, c: char, f: F)
where
    F: Fn(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // Tag names are stored lowercased; the raw buffer keeps original casing.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);

    // Batch-consume the whole matching run instead of one char per state step.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
}
fn finish_tag_token_name(&mut self) {
if let Some(
Token::StartTag {
@ -603,6 +701,46 @@ where
sub_buf.push(raw_c);
}
/// Consumes `c` plus the following run of characters accepted by `f`,
/// appending the ASCII-lowercased form to the attribute-name buffer and the
/// raw form to the sub buffer.
fn consume_and_append_to_attribute_token_name<F>(&mut self, c: char, f: F)
where
    F: FnMut(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // Attribute names are stored lowercased; the raw buffer keeps casing.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
}
/// Like `consume_and_append_to_attribute_token_name`, but also mirrors the
/// raw (original-case) characters into the temporary buffer, which the
/// RAWTEXT/RCDATA end-tag states use for the "appropriate end tag" check.
fn consume_and_append_to_attribute_token_name_and_temp_buf<F>(&mut self, c: char, f: F)
where
    F: FnMut(char) -> bool,
{
    let buf_handle = self.buf.clone();
    let sub_buf_handle = self.sub_buf.clone();
    let mut buf = buf_handle.borrow_mut();
    let mut sub_buf = sub_buf_handle.borrow_mut();

    // Lowercased into the name buffer, raw into sub and temporary buffers.
    buf.push(c.to_ascii_lowercase());
    sub_buf.push(c);
    self.temporary_buffer.push(c);

    // Batch-consume the whole matching run in one go.
    let rest = self.input.uncons_while(f);
    buf.push_str(&rest.to_ascii_lowercase());
    sub_buf.push_str(rest);
    self.temporary_buffer.push_str(rest);
}
fn finish_attribute_token_name(&mut self) {
if let Some(attribute_start_position) = self.attribute_start_position {
if let Some(
@ -672,6 +810,37 @@ where
}
}
fn consume_and_append_to_attribute_token_value<F>(&mut self, c: char, f: F)
where
F: FnMut(char) -> bool,
{
let b = self.buf.clone();
let mut buf = b.borrow_mut();
let b = self.sub_buf.clone();
let mut sub_buf = b.borrow_mut();
let is_cr = c == '\r';
if is_cr {
buf.push('\n');
sub_buf.push(c);
if self.input.cur() == Some('\n') {
self.input.bump();
sub_buf.push('\n');
}
} else {
buf.push(c);
sub_buf.push(c);
}
let value = self.input.uncons_while(f);
buf.push_str(value);
sub_buf.push_str(value);
}
fn finish_attribute_token_value(&mut self) {
if let Some(attribute_start_position) = self.attribute_start_position {
if let Some(
@ -770,7 +939,10 @@ where
sub_buf.push(raw_c);
}
fn handle_raw_and_append_to_comment_token(&mut self, c: char) {
fn consume_and_append_to_comment_token<F>(&mut self, c: char, f: F)
where
F: Fn(char) -> bool,
{
let b = self.buf.clone();
let mut buf = b.borrow_mut();
let b = self.sub_buf.clone();
@ -791,6 +963,11 @@ where
buf.push(c);
sub_buf.push(c);
}
let value = self.input.uncons_while(f);
buf.push_str(value);
sub_buf.push_str(value);
}
fn emit_comment_token(&mut self, raw_end: Option<&str>) {
@ -1150,7 +1327,7 @@ where
// Switch to the before attribute name state.
Some(c) if is_spacy(c) => {
self.finish_tag_token_name();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.state = State::BeforeAttributeName;
}
// U+002F SOLIDUS (/)
@ -1170,7 +1347,7 @@ where
// Append the lowercase version of the current input character (add 0x0020
// to the character's code point) to the current tag token's tag name.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.consume_and_append_to_tag_token_name(c, is_ascii_upper_alpha);
}
// U+0000 NULL
// This is an unexpected-null-character parse error. Append a U+FFFD
@ -1192,7 +1369,17 @@ where
// Append the current input character to the current tag token's tag name.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_tag_token_name(c, c);
self.consume_and_append_to_tag_token_name(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control
!is_spacy(c)
&& !matches!(c, '/' | '>' | '\x00')
&& !is_ascii_upper_alpha(c)
});
}
}
}
@ -1257,7 +1444,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1296,15 +1483,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -1377,7 +1568,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1416,15 +1607,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -1505,7 +1700,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1544,15 +1739,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -1801,7 +2000,7 @@ where
// to the before attribute name state. Otherwise, treat it as per the
// "anything else" entry below.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
if self.current_end_tag_token_is_an_appropriate_end_tag_token() {
self.finish_tag_token_name();
@ -1840,15 +2039,19 @@ where
// to the character's code point) to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_tag_token_name(c.to_ascii_lowercase(), c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_upper_alpha,
);
}
// ASCII lower alpha
// Append the current input character to the current tag token's tag name.
// Append the current input character to the temporary buffer.
Some(c) if is_ascii_lower_alpha(c) => {
self.append_to_tag_token_name(c, c);
self.temporary_buffer.push(c);
self.consume_and_append_to_attribute_token_name_and_temp_buf(
c,
is_ascii_lower_alpha,
);
}
// Anything else
// Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character
@ -2143,7 +2346,7 @@ where
// U+0020 SPACE
// Ignore the character.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
}
// U+002F SOLIDUS (/)
// U+003E GREATER-THAN SIGN (>)
@ -2192,7 +2395,7 @@ where
// Reconsume in the after attribute name state.
Some(c) if is_spacy(c) => {
self.finish_attribute_token_name();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.reconsume_in_state(State::AfterAttributeName);
}
Some('/' | '>') | None => {
@ -2209,7 +2412,9 @@ where
// Append the lowercase version of the current input character (add 0x0020
// to the character's code point) to the current attribute's name.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_attribute_token_name(c.to_ascii_lowercase(), c);
self.consume_and_append_to_attribute_token_name(c, |c| {
is_ascii_upper_alpha(c)
});
}
// U+0000 NULL
// This is an unexpected-null-character parse error. Append a U+FFFD
@ -2232,8 +2437,17 @@ where
// Append the current input character to the current attribute's name.
Some(c) => {
self.validate_input_stream_character(c);
self.consume_and_append_to_attribute_token_name(c, |c| {
if !is_allowed_character(c) {
return false;
}
anything_else(self, c);
// List of characters from above to stop consumption and a certain
// branch took control
!is_spacy(c)
&& !matches!(c, '/' | '>' | '=' | '\x00' | '"' | '\'' | '<')
&& !is_ascii_upper_alpha(c)
});
}
}
@ -2257,7 +2471,7 @@ where
// U+0020 SPACE
// Ignore the character.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
}
// U+002F SOLIDUS (/)
// Switch to the self-closing start tag state.
@ -2303,7 +2517,7 @@ where
// U+0020 SPACE
// Ignore the character.
Some(c) if is_spacy(c) => {
self.skip_next_lf(c);
self.skip_whitespaces(c);
}
// U+0022 QUOTATION MARK (")
// Switch to the attribute value (double-quoted) state.
@ -2369,7 +2583,15 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_attribute_token_value(Some(c), Some(c));
self.consume_and_append_to_attribute_token_value(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '"' | '&' | '\x00' | '\r')
});
}
}
}
@ -2410,7 +2632,15 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_attribute_token_value(Some(c), Some(c));
self.consume_and_append_to_attribute_token_value(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '\'' | '&' | '\x00' | '\r')
});
}
}
}
@ -2429,7 +2659,7 @@ where
// Switch to the before attribute name state.
Some(c) if is_spacy(c) => {
self.finish_attribute_token_value();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.state = State::BeforeAttributeName;
}
// U+0026 AMPERSAND (&)
@ -2479,8 +2709,19 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.consume_and_append_to_attribute_token_value(c, |c| {
if !is_allowed_character(c) {
return false;
}
anything_else(self, c);
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!is_spacy(c)
&& !matches!(
c,
'&' | '>' | '\x00' | '"' | '\'' | '<' | '=' | '`' | '\r'
)
});
}
}
}
@ -2495,7 +2736,7 @@ where
// Switch to the before attribute name state.
Some(c) if is_spacy(c) => {
self.finish_attribute_token_value();
self.skip_next_lf(c);
self.skip_whitespaces(c);
self.state = State::BeforeAttributeName;
}
// U+002F SOLIDUS (/)
@ -2599,7 +2840,15 @@ where
// Append the current input character to the comment token's data.
Some(c) => {
self.validate_input_stream_character(c);
self.handle_raw_and_append_to_comment_token(c);
self.consume_and_append_to_comment_token(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '>' | '\x00' | '\r')
});
}
}
}
@ -2832,7 +3081,15 @@ where
// Append the current input character to the comment token's data.
Some(c) => {
self.validate_input_stream_character(c);
self.handle_raw_and_append_to_comment_token(c);
self.consume_and_append_to_comment_token(c, |c| {
if !is_allowed_character(c) {
return false;
}
// List of characters from above to stop consumption and a certain
// branch took control, `\r` is in list because of newline normalization
!matches!(c, '<' | '-' | '\x00' | '\r')
});
}
}
}
@ -3157,8 +3414,7 @@ where
// Append the lowercase version of the current input character (add 0x0020
// to the character's code point) to the current DOCTYPE token's name.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(Some(c.to_ascii_lowercase()), None, None);
self.consume_and_append_to_doctype_token_name(c, is_ascii_upper_alpha);
}
// U+0000 NULL
// This is an unexpected-null-character parse error. Append a U+FFFD
@ -3185,8 +3441,13 @@ where
// Append the current input character to the current DOCTYPE token's name.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(Some(c), None, None);
self.consume_and_append_to_doctype_token_name(c, |c| {
if !is_allowed_character(c) {
return false;
}
!is_spacy(c) && !matches!(c, '>' | '\x00') && !is_ascii_upper_alpha(c)
});
}
}
}
@ -3469,8 +3730,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, Some(c), None);
self.consume_and_append_to_doctype_token_public_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '"' | '\x00' | '>' | '\r')
});
}
}
}
@ -3524,8 +3790,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, Some(c), None);
self.consume_and_append_to_doctype_token_public_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '\'' | '\x00' | '>' | '\r')
});
}
}
}
@ -3841,8 +4112,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, None, Some(c));
self.consume_and_append_to_doctype_token_system_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '"' | '\x00' | '>' | '\r')
});
}
}
}
@ -3896,8 +4172,13 @@ where
// identifier.
Some(c) => {
self.validate_input_stream_character(c);
self.append_raw_to_doctype_token(c);
self.append_to_doctype_token(None, None, Some(c));
self.consume_and_append_to_doctype_token_system_id(c, |c| {
if !is_allowed_character(c) {
return false;
}
!matches!(c, '\'' | '\x00' | '>' | '\r')
});
}
}
}
@ -4061,7 +4342,6 @@ where
// numeric character reference state.
Some(c @ '#') => {
self.temporary_buffer.push(c);
self.state = State::NumericCharacterReference;
}
// Anything else
@ -4532,7 +4812,7 @@ where
}
#[inline(always)]
fn skip_next_lf(&mut self, c: char) {
fn skip_whitespaces(&mut self, c: char) {
if c == '\r' && self.input.cur() == Some('\n') {
self.input.bump();
}
@ -4634,3 +4914,19 @@ fn is_ascii_lower_alpha(c: char) -> bool {
fn is_ascii_alpha(c: char) -> bool {
is_ascii_upper_alpha(c) || is_ascii_lower_alpha(c)
}
#[inline(always)]
// True for control characters that trigger the
// `ControlCharacterInInputStream` error; NUL is excluded because it is
// reported through its own dedicated error path.
fn is_allowed_control_character(c: u32) -> bool {
    match c {
        0x00 => false,
        _ => is_control(c),
    }
}
#[inline(always)]
// Returns `true` when `c` may appear in the input stream without a parse
// error: not a surrogate, not a flagged control character (NUL is handled by
// a separate error path), and not a noncharacter code point. Used as the
// cheap gate inside the batched `uncons_while` predicates.
fn is_allowed_character(c: char) -> bool {
    let code = c as u32;

    // Expression form instead of the non-idiomatic trailing `return true;`.
    !(is_surrogate(code) || is_allowed_control_character(code) || is_noncharacter(code))
}

View File

@ -0,0 +1,6 @@
| <!DOCTYPE html "-//W3C//DTD HTML 4.01
Transitional//EN" "">
| <html>
| <head>
| <body>

View File

@ -0,0 +1,3 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
Transitional//EN">

View File

@ -0,0 +1,66 @@
{
"type": "Document",
"span": {
"start": 1,
"end": 65,
"ctxt": 0
},
"mode": "no-quirks",
"children": [
{
"type": "DocumentType",
"span": {
"start": 1,
"end": 65,
"ctxt": 0
},
"name": "html",
"publicId": "-//W3C//DTD HTML 4.01\n\nTransitional//EN",
"systemId": null,
"raw": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01\n\nTransitional//EN\">"
},
{
"type": "Element",
"span": {
"start": 0,
"end": 0,
"ctxt": 0
},
"tagName": "html",
"namespace": "http://www.w3.org/1999/xhtml",
"attributes": [],
"children": [
{
"type": "Element",
"span": {
"start": 0,
"end": 0,
"ctxt": 0
},
"tagName": "head",
"namespace": "http://www.w3.org/1999/xhtml",
"attributes": [],
"children": [],
"content": null,
"isSelfClosing": false
},
{
"type": "Element",
"span": {
"start": 0,
"end": 0,
"ctxt": 0
},
"tagName": "body",
"namespace": "http://www.w3.org/1999/xhtml",
"attributes": [],
"children": [],
"content": null,
"isSelfClosing": false
}
],
"content": null,
"isSelfClosing": false
}
]
}

View File

@ -0,0 +1,7 @@
x Non conforming doctype
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----

View File

@ -0,0 +1,25 @@
x Document
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----
x Child
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----
x DocumentType
,-[$DIR/tests/recovery/doctype/newline/input.html:1:1]
1 | ,-> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
2 | |
3 | `-> Transitional//EN">
`----
x Child
x Element