From 5dc9376a4ba4d449d1c3f2755a6aca6a77919f6f Mon Sep 17 00:00:00 2001 From: Alexander Akait <4567934+alexander-akait@users.noreply.github.com> Date: Thu, 5 May 2022 05:03:27 +0300 Subject: [PATCH] fix(html/parser): Fix parsing of cdata in foreign context (#4531) --- crates/swc_html_parser/src/lexer/mod.rs | 18 +++-- crates/swc_html_parser/src/parser/input.rs | 9 ++- crates/swc_html_parser/src/parser/mod.rs | 21 ++++-- .../tests/recovery/element/svg-1/input.html | 2 + .../tests/recovery/element/svg-1/output.json | 75 +++++++++++++++++++ .../recovery/element/svg-1/output.stderr | 6 ++ .../recovery/element/svg-1/span.rust-debug | 42 +++++++++++ 7 files changed, 159 insertions(+), 14 deletions(-) create mode 100644 crates/swc_html_parser/tests/recovery/element/svg-1/input.html create mode 100644 crates/swc_html_parser/tests/recovery/element/svg-1/output.json create mode 100644 crates/swc_html_parser/tests/recovery/element/svg-1/output.stderr create mode 100644 crates/swc_html_parser/tests/recovery/element/svg-1/span.rust-debug diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index 39ff4c548ad..62a702c8b51 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -27,12 +27,12 @@ where state: State, return_state: State, errors: Vec, - in_foreign_node: bool, pub last_start_tag_token: Option, pending_tokens: Vec, cur_token: Option, character_reference_code: Option>, temporary_buffer: Option, + is_adjusted_current_node_is_element_in_html_namespace: Option, doctype_keyword: Option, last_emitted_error_pos: Option, } @@ -54,12 +54,12 @@ where state: State::Data, return_state: State::Data, errors: vec![], - in_foreign_node: false, last_start_tag_token: None, pending_tokens: vec![], cur_token: None, character_reference_code: None, temporary_buffer: None, + is_adjusted_current_node_is_element_in_html_namespace: None, doctype_keyword: None, last_emitted_error_pos: None, } @@ -205,13 +205,17 
@@ where self.input.reset_to(state.pos); } - fn set_input_state(&mut self, state: State) { - self.state = state; - } - fn take_errors(&mut self) -> Vec { take(&mut self.errors) } + + fn set_adjusted_current_node_to_html_namespace(&mut self, value: bool) { + self.is_adjusted_current_node_is_element_in_html_namespace = Some(value); + } + + fn set_input_state(&mut self, state: State) { + self.state = state; + } } impl Lexer @@ -3108,7 +3112,7 @@ where Some(a2 @ 'a' | a2 @ 'A') => { match self.consume_next_char() { Some('[') => { - if self.in_foreign_node { + if let Some(false) = self.is_adjusted_current_node_is_element_in_html_namespace { self.state = State::CdataSection; } else { self.emit_error( diff --git a/crates/swc_html_parser/src/parser/input.rs b/crates/swc_html_parser/src/parser/input.rs index 2549956ec6f..c507ed1dc84 100644 --- a/crates/swc_html_parser/src/parser/input.rs +++ b/crates/swc_html_parser/src/parser/input.rs @@ -20,9 +20,11 @@ pub trait ParserInput { fn reset(&mut self, state: &Self::State); + fn take_errors(&mut self) -> Vec; + fn set_input_state(&mut self, state: State); - fn take_errors(&mut self) -> Vec; + fn set_adjusted_current_node_to_html_namespace(&mut self, value: bool); } #[derive(Debug)] @@ -120,4 +122,9 @@ where pub(super) fn set_input_state(&mut self, state: State) { self.input.set_input_state(state); } + + pub(super) fn set_adjusted_current_node_to_html_namespace(&mut self, value: bool) { + self.input + .set_adjusted_current_node_to_html_namespace(value); + } } diff --git a/crates/swc_html_parser/src/parser/mod.rs b/crates/swc_html_parser/src/parser/mod.rs index 259f89272fe..a6c0537285d 100644 --- a/crates/swc_html_parser/src/parser/mod.rs +++ b/crates/swc_html_parser/src/parser/mod.rs @@ -278,17 +278,26 @@ where // to the current insertion mode in HTML content. 
let adjusted_current_node = self.get_adjusted_current_node(); + let is_element_in_html_namespace = is_element_in_html_namespace(adjusted_current_node); + let is_mathml_text_integration_point = + is_mathml_text_integration_point(adjusted_current_node); + let is_mathml_annotation_xml = is_mathml_annotation_xml(adjusted_current_node); + let is_html_integration_point = is_html_integration_point(adjusted_current_node); + + self.input + .set_adjusted_current_node_to_html_namespace(is_element_in_html_namespace); + if self.open_elements_stack.items.is_empty() - || is_element_in_html_namespace(adjusted_current_node) - || (is_mathml_text_integration_point(adjusted_current_node) + || is_element_in_html_namespace + || (is_mathml_text_integration_point && matches!(&token_and_info.token, Token::StartTag { tag_name, .. } if &*tag_name != "mglyph" && &*tag_name != "malignmark")) - || (is_mathml_text_integration_point(adjusted_current_node) + || (is_mathml_text_integration_point && matches!(&token_and_info.token, Token::Character { .. })) - || (is_mathml_annotation_xml(adjusted_current_node) + || (is_mathml_annotation_xml && matches!(&token_and_info.token, Token::StartTag { tag_name, .. } if &*tag_name == "svg")) - || (is_html_integration_point(adjusted_current_node) + || (is_html_integration_point && matches!(&token_and_info.token, Token::StartTag { .. })) - || (is_html_integration_point(adjusted_current_node) + || (is_html_integration_point && matches!(&token_and_info.token, Token::Character { .. 
})) || matches!(&token_and_info.token, Token::Eof) { diff --git a/crates/swc_html_parser/tests/recovery/element/svg-1/input.html b/crates/swc_html_parser/tests/recovery/element/svg-1/input.html new file mode 100644 index 00000000000..badf4036a30 --- /dev/null +++ b/crates/swc_html_parser/tests/recovery/element/svg-1/input.html @@ -0,0 +1,2 @@ +<svg><![CDATA[foo +bar \ No newline at end of file diff --git a/crates/swc_html_parser/tests/recovery/element/svg-1/output.json b/crates/swc_html_parser/tests/recovery/element/svg-1/output.json new file mode 100644 index 00000000000..f2240764abc --- /dev/null +++ b/crates/swc_html_parser/tests/recovery/element/svg-1/output.json @@ -0,0 +1,75 @@ +{ + "type": "Document", + "span": { + "start": 0, + "end": 21, + "ctxt": 0 + }, + "mode": "no-quirks", + "children": [ + { + "type": "Element", + "span": { + "start": 0, + "end": 21, + "ctxt": 0 + }, + "tagName": "html", + "namespace": "http://www.w3.org/1999/xhtml", + "attributes": [], + "children": [ + { + "type": "Element", + "span": { + "start": 0, + "end": 5, + "ctxt": 0 + }, + "tagName": "head", + "namespace": "http://www.w3.org/1999/xhtml", + "attributes": [], + "children": [], + "content": null + }, + { + "type": "Element", + "span": { + "start": 0, + "end": 21, + "ctxt": 0 + }, + "tagName": "body", + "namespace": "http://www.w3.org/1999/xhtml", + "attributes": [], + "children": [ + { + "type": "Element", + "span": { + "start": 0, + "end": 21, + "ctxt": 0 + }, + "tagName": "svg", + "namespace": "http://www.w3.org/2000/svg", + "attributes": [], + "children": [ + { + "type": "Text", + "span": { + "start": 5, + "end": 21, + "ctxt": 0 + }, + "value": "foo\nbar" + } + ], + "content": null + } + ], + "content": null + } + ], + "content": null + } + ] +} diff --git a/crates/swc_html_parser/tests/recovery/element/svg-1/output.stderr b/crates/swc_html_parser/tests/recovery/element/svg-1/output.stderr new file mode 100644 index 00000000000..32ac6b201c9 --- /dev/null +++ 
b/crates/swc_html_parser/tests/recovery/element/svg-1/output.stderr @@ -0,0 +1,6 @@ + + x Unexpected token + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | ,-> <svg><![CDATA[foo + 2 | `-> bar + `---- diff --git a/crates/swc_html_parser/tests/recovery/element/svg-1/span.rust-debug b/crates/swc_html_parser/tests/recovery/element/svg-1/span.rust-debug new file mode 100644 index 00000000000..a2c1a5a37cd --- /dev/null +++ b/crates/swc_html_parser/tests/recovery/element/svg-1/span.rust-debug @@ -0,0 +1,42 @@ + + x Document + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | ,-> <svg><![CDATA[foo + 2 | `-> bar + `---- + + x Child + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | ,-> <svg><![CDATA[foo + 2 | `-> bar + `---- + + x Element + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | ,-> <svg><![CDATA[foo + 2 | `-> bar + `---- + + x Child + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | <svg><![CDATA[foo + : ^^^^^ + `---- + + x Element + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | <svg><![CDATA[foo + : ^^^^^ + `---- + + x Child + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | ,-> <svg><![CDATA[foo + 2 | `-> bar + `---- + + x Text + ,-[$DIR/tests/recovery/element/svg-1/input.html:1:1] + 1 | ,-> <svg><![CDATA[foo + 2 | `-> bar + `----