use std::{fs, mem::take, path::PathBuf}; use common::{document_span_visualizer, DomVisualizer}; use serde_json::Value; use swc_atoms::{js_word, JsWord}; use swc_common::{ collections::AHashSet, input::{SourceFileInput, StringInput}, BytePos, }; use swc_html_ast::*; use swc_html_parser::{ error::ErrorKind, lexer::{Lexer, State}, parser::{input::ParserInput, PResult, Parser, ParserConfig}, }; use swc_html_visit::VisitMutWith; use testing::NormalizedOutput; #[path = "common/mod.rs"] mod common; #[testing::fixture("tests/html5lib-tests-fixture/**/*.html")] fn span_visualizer(input: PathBuf) { document_span_visualizer( input, ParserConfig { ..Default::default() }, true, ) } fn unescape(s: &str) -> Option { let mut out = String::with_capacity(s.len()); let mut it = s.chars().peekable(); loop { match it.next() { None => return Some(out), Some('\\') => { if it.peek() != Some(&'u') { panic!("can't understand escape"); } let hex: String = it.by_ref().take(4).collect(); match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) { // TODO fix me surrogate paris // Some of the tests use lone surrogates, but we have no // way to represent them in the UTF-8 input to our parser. // Since these can only come from script, we will catch // them there. None => return None, Some(c) => out.push(c), } } Some(c) => out.push(c), } } } // TODO we need to enable `preserve_order` for serde, but we can't https://github.com/tkaitchuck/aHash/issues/95, so we sort attributes #[testing::fixture("tests/html5lib-tests/tokenizer/**/*.test")] fn html5lib_test_tokenizer(input: PathBuf) { let filename = input.to_str().expect("failed to parse path"); let contents = fs::read_to_string(filename).expect("Something went wrong reading the file"); let obj: Value = serde_json::from_str(&contents).expect("json parse error"); let tests = match obj.get(&"tests".to_string()) { Some(&Value::Array(ref tests)) => tests, _ => return, }; for test in tests.iter() { let description = test .get("description") .expect("failed to get input in test"); let states = if let Some(initial_states) = test.get("initialStates") { let mut states = vec![]; let json_states: Vec = serde_json::from_value(initial_states.clone()) .expect("failed to get input in test"); for json_state in json_states { match &*json_state { "Data state" => { states.push(State::Data); } "PLAINTEXT state" => { states.push(State::PlainText); } "RCDATA state" => { states.push(State::Rcdata); } "RAWTEXT state" => { states.push(State::Rawtext); } "Script data state" => { states.push(State::ScriptData); } "CDATA section state" => { states.push(State::CdataSection); } _ => { unreachable!() } } } states } else { vec![State::Data] }; for state in states.iter() { eprintln!("==== ==== Description ==== ====\n{}\n", description); let json_input = test["input"].clone(); let mut input: String = serde_json::from_value(json_input).expect("failed to get input in test"); let need_double_escaped = test.get("doubleEscaped").is_some(); if need_double_escaped { input = match unescape(&input) { Some(unescaped) => unescaped, _ => { continue; } }; } eprintln!("==== ==== Input ==== ====\n{}\n", input); let json_output = test["output"].clone(); let output = json_output.to_string(); eprintln!("==== ==== Output ==== ====\n{}\n", output); let lexer_str_input = StringInput::new(&input, BytePos(0), BytePos(input.len() as u32)); let mut lexer = Lexer::new(lexer_str_input); lexer.set_input_state(state.clone()); if let Some(last_start_tag) = test.get("lastStartTag") { let last_start_tag: String = serde_json::from_value(last_start_tag.clone()) .expect("failed to get lastStartTag in test"); lexer.set_last_start_tag_name(&last_start_tag); } let mut actual_tokens = vec![]; loop { let token_and_span = lexer.next(); if token_and_span.is_none() { break; } let mut new_token = token_and_span.unwrap().token.clone(); match new_token { Token::Doctype { ref mut raw, .. } => { *raw = None; } Token::StartTag { ref mut raw_tag_name, ref mut attributes, .. } => { *raw_tag_name = None; let mut new_attributes = vec![]; let mut already_seen: AHashSet = Default::default(); for mut attribute in take(attributes) { if already_seen.contains(&attribute.name) { continue; } already_seen.insert(attribute.name.clone()); if attribute.value.is_none() { attribute.value = Some("".into()); } attribute.span = Default::default(); attribute.raw_name = None; attribute.raw_value = None; new_attributes.push(attribute); } new_attributes.sort_by(|a, b| a.name.partial_cmp(&b.name).unwrap()); *attributes = new_attributes; } Token::EndTag { ref mut raw_tag_name, ref mut attributes, ref mut is_self_closing, .. } => { *raw_tag_name = None; *is_self_closing = false; *attributes = vec![]; } Token::Character { ref mut raw, .. } => { *raw = None; } Token::Comment { ref mut raw, .. } => { *raw = js_word!(""); } _ => {} } actual_tokens.push(new_token); } let mut expected_tokens: Vec = vec![]; if let Some(output_tokens) = json_output.as_array() { for output_token in output_tokens { match output_token { Value::Array(token_parts) => { let tokens = match token_parts[0].as_str().expect("failed") { "DOCTYPE" => { let name: Option = serde_json::from_value(token_parts[1].clone()) .expect("failed to deserialize"); let public_id: Option = serde_json::from_value(token_parts[2].clone()) .expect("failed to deserialize"); let system_id: Option = serde_json::from_value(token_parts[3].clone()) .expect("failed to deserialize"); let correctness: bool = serde_json::from_value(token_parts[4].clone()) .expect("failed to deserialize"); vec![Token::Doctype { name: name.map(|v| v.into()), force_quirks: !correctness, public_id: public_id.map(|v| v.into()), system_id: system_id.map(|v| v.into()), raw: None, }] } "StartTag" => { let tag_name: String = serde_json::from_value(token_parts[1].clone()) .expect("failed to deserialize"); let mut attributes = vec![]; if let Some(json_attributes) = token_parts.get(2) { let obj_attributes: Value = serde_json::from_value(json_attributes.clone()) .expect("failed to deserialize"); match obj_attributes { Value::Object(obj) => { for key in obj.keys() { let json_value = obj.get(key).expect( "failed to get value for attribute", ); let value: Option = serde_json::from_value(json_value.clone()) .expect("failed to deserialize"); attributes.push(AttributeToken { span: Default::default(), name: key.clone().into(), raw_name: None, value: value.map(|v| v.into()), raw_value: None, }) } } _ => { unreachable!(); } } } let mut is_self_closing = false; if let Some(json_is_self_closing) = token_parts.get(3) { let value: bool = serde_json::from_value(json_is_self_closing.clone()) .expect("failed to deserialize"); is_self_closing = value; } attributes.sort_by(|a, b| a.name.partial_cmp(&b.name).unwrap()); vec![Token::StartTag { tag_name: tag_name.into(), raw_tag_name: None, is_self_closing, attributes, }] } "EndTag" => { let tag_name: String = serde_json::from_value(token_parts[1].clone()) .expect("failed to deserialize"); vec![Token::EndTag { tag_name: tag_name.into(), raw_tag_name: None, is_self_closing: false, attributes: vec![], }] } "Character" => { let mut data: String = serde_json::from_value(token_parts[1].clone()) .expect("failed to deserialize"); if need_double_escaped { data = match unescape(&data) { Some(v) => v, _ => { continue; } }; } let mut tokens = vec![]; for c in data.chars() { tokens.push(Token::Character { value: c, raw: None, }) } tokens } "Comment" => { let mut data: String = serde_json::from_value(token_parts[1].clone()) .expect("failed to deserialize"); if need_double_escaped { data = match unescape(&data) { Some(v) => v, _ => { continue; } }; } vec![Token::Comment { data: data.into(), raw: js_word!(""), }] } _ => { unreachable!("unknown token {}", token_parts[0]) } }; expected_tokens.extend(tokens); } _ => { unreachable!(); } } } } let actual = serde_json::to_string(&actual_tokens).expect("failed to serialize actual tokens"); let expected = serde_json::to_string(&expected_tokens) .expect("failed to serialize expected tokens"); if let Some(json_errors) = test.get("errors") { let expected_errors = json_errors.as_array().expect("failed to deserialize error"); let actual_errors = lexer.take_errors(); eprintln!("==== ==== Errors ==== ====\n{:?}\n", actual_errors); assert_eq!(actual_errors.len(), expected_errors.len()); for expected_error in expected_errors.iter() { let obj_expected_code = expected_error.as_object().expect("failed to get error"); let expected_code = match obj_expected_code.get("code") { Some(expected_code) => match expected_code.as_str() { Some("eof-in-doctype") => ErrorKind::EofInDoctype, Some("eof-in-comment") => ErrorKind::EofInComment, Some("eof-in-cdata") => ErrorKind::EofInCdata, Some("eof-in-tag") => ErrorKind::EofInTag, Some("eof-before-tag-name") => ErrorKind::EofBeforeTagName, Some("eof-in-script-html-comment-like-text") => { ErrorKind::EofInScriptHtmlCommentLikeText } Some("unknown-named-character-reference") => { ErrorKind::UnknownNamedCharacterReference } Some("incorrectly-opened-comment") => { ErrorKind::IncorrectlyOpenedComment } Some("abrupt-closing-of-empty-comment") => { ErrorKind::AbruptClosingOfEmptyComment } Some("abrupt-doctype-public-identifier") => { ErrorKind::AbruptDoctypePublicIdentifier } Some("abrupt-doctype-system-identifier") => { ErrorKind::AbruptDoctypeSystemIdentifier } Some("absence-of-digits-in-numeric-character-reference") => { ErrorKind::AbsenceOfDigitsInNumericCharacterReference } Some("surrogate-character-reference") => { ErrorKind::SurrogateCharacterReference } Some("nested-comment") => ErrorKind::NestedComment, Some("end-tag-with-trailing-solidus") => { ErrorKind::EndTagWithTrailingSolidus } Some("null-character-reference") => ErrorKind::NullCharacterReference, Some("cdata-in-html-content") => ErrorKind::CdataInHtmlContent, Some("character-reference-outside-unicode-range") => { ErrorKind::CharacterReferenceOutsideUnicodeRange } Some("control-character-in-input-stream") => { ErrorKind::ControlCharacterInInputStream } Some("control-character-reference") => { ErrorKind::ControlCharacterReference } Some("noncharacter-in-input-stream") => { ErrorKind::NoncharacterInInputStream } Some("noncharacter-character-reference") => { ErrorKind::NoncharacterCharacterReference } Some("unexpected-equals-sign-before-attribute-name") => { ErrorKind::UnexpectedEqualsSignBeforeAttributeName } Some("unexpected-question-mark-instead-of-tag-name") => { ErrorKind::UnexpectedQuestionMarkInsteadOfTagName } Some("unexpected-character-after-doctype-system-identifier") => { ErrorKind::UnexpectedCharacterAfterDoctypeSystemIdentifier } Some("unexpected-null-character") => ErrorKind::UnexpectedNullCharacter, Some("unexpected-solidus-in-tag") => ErrorKind::UnexpectedSolidusInTag, Some("unexpected-character-in-attribute-name") => { ErrorKind::UnexpectedCharacterInAttributeName } Some("unexpected-character-in-unquoted-attribute-value") => { ErrorKind::UnexpectedCharacterInUnquotedAttributeValue } Some("duplicate-attribute") => ErrorKind::DuplicateAttribute, Some("end-tag-with-attributes") => ErrorKind::EndTagWithAttributes, Some("missing-whitespace-before-doctype-name") => { ErrorKind::MissingWhitespaceBeforeDoctypeName } Some("missing-attribute-value") => ErrorKind::MissingAttributeValue, Some("missing-doctype-public-identifier") => { ErrorKind::MissingDoctypePublicIdentifier } Some("missing-end-tag-name") => ErrorKind::MissingEndTagName, Some("missing-doctype-name") => ErrorKind::MissingDoctypeName, Some("missing-doctype-system-identifier") => { ErrorKind::MissingDoctypeSystemIdentifier } Some("missing-whitespace-after-doctype-system-keyword") => { ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword } Some("missing-whitespace-after-doctype-public-keyword") => { ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword } Some("missing-quote-before-doctype-public-identifier") => { ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier } Some("missing-quote-before-doctype-system-identifier") => { ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier } Some("incorrectly-closed-comment") => { ErrorKind::IncorrectlyClosedComment } Some("invalid-character-sequence-after-doctype-name") => { ErrorKind::InvalidCharacterSequenceAfterDoctypeName } Some( "missing-whitespace-between-doctype-public-and-system-identifiers", ) => { ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers } Some("missing-whitespace-between-attributes") => { ErrorKind::MissingWhitespaceBetweenAttributes } Some("missing-semicolon-after-character-reference") => { ErrorKind::MissingSemicolonAfterCharacterReference } Some("invalid-first-character-of-tag-name") => { ErrorKind::InvalidFirstCharacterOfTagName } _ => { unreachable!("unknown error {:?}", expected_code); } }, _ => { unreachable!(); } }; assert!(actual_errors .iter() .any(|error| *error.kind() == expected_code)); } } else { let errors = lexer.take_errors(); assert_eq!(errors.len(), 0); } assert_eq!(actual, expected); } } } enum TestState { Data, Document, DocumentFragment, Errors, NewErrors, } enum DocumentOrDocumentFragment { Document(PResult), DocumentFragment(PResult), } #[testing::fixture("tests/html5lib-tests/tree-construction/**/*.dat")] #[testing::fixture("tests/html5lib-tests-fixture/**/*.html")] fn html5lib_test_tree_construction(input: PathBuf) { if input.extension().unwrap() == "dat" { let mut tree_construction_base = None; let mut tests_base = None; let mut path_buf = input.to_path_buf(); while path_buf.pop() { if path_buf.ends_with("tree-construction") { tree_construction_base = Some(path_buf.clone()); } if path_buf.ends_with("tests") { tests_base = Some(path_buf.clone()); break; } } let tree_construction_base = tree_construction_base.unwrap(); let relative_path_to_test = input .strip_prefix(tree_construction_base) .expect("failed to get relative filename") .to_str() .unwrap() .replace(['/', '\\', '.'], "_"); let tests_base = tests_base.unwrap(); let dir = tests_base.join("html5lib-tests-fixture"); fs::create_dir_all(dir.clone()).expect("failed to create directory for fixtures"); let tests_file = fs::read_to_string(input).expect("Something went wrong reading the file"); let mut tests = tests_file.split("#data\n"); tests.next(); let mut counter = 0; while let Some(test) = tests.next() { let mut data: Vec<&str> = vec![]; let mut document: Vec<&str> = vec![]; let mut document_fragment: Vec<&str> = vec![]; let mut errors: Vec<&str> = vec![]; let mut new_errors: Vec<&str> = vec![]; let mut scripting_enabled = false; let mut state = Some(TestState::Data); let lines = test.lines(); for line in lines { match line { "#data" => { state = Some(TestState::Data); continue; } "#errors" => { state = Some(TestState::Errors); continue; } "#new-errors" => { state = Some(TestState::NewErrors); continue; } "#document" => { state = Some(TestState::Document); continue; } "#document-fragment" => { state = Some(TestState::DocumentFragment); continue; } "#script-on" => { scripting_enabled = true; state = None; continue; } "#script-off" => { scripting_enabled = false; state = None; continue; } _ => {} } match &state { Some(TestState::Data) => { data.push(line); } Some(TestState::Document) => { document.push(line); } Some(TestState::DocumentFragment) => { document_fragment.push(line); } Some(TestState::Errors) => { errors.push(line); } Some(TestState::NewErrors) => { new_errors.push(line); } _ => { unreachable!(); } } } let mut file_stem = relative_path_to_test.to_string(); file_stem.push('.'); file_stem.push_str(&counter.to_string()); // TODO workaround, fix - https://github.com/html5lib/html5lib-tests/pull/151 let need_skip_fragment = relative_path_to_test.contains("template_dat") && matches!(counter, 109 | 110 | 111); if !need_skip_fragment { if !document_fragment.is_empty() { file_stem += ".fragment_"; file_stem += &document_fragment.join("").replace(' ', "_"); } } if scripting_enabled { file_stem += ".script_on"; } let mut html_path = dir.clone(); html_path.push(file_stem.clone() + ".html"); fs::write(html_path, data.join("\n")) .expect("Something went wrong when writing to the file {:?}"); let mut dom_snapshot_path = dir.clone(); dom_snapshot_path.push(file_stem.clone() + ".dom.rust-debug"); let mut dom = document.join("\n"); if !dom.ends_with('\n') { dom.push('\n'); } fs::write(dom_snapshot_path, dom) .expect("Something went wrong when writing to the file"); let errors = errors.join("\n"); let mut errors_snapshot_path = dir.clone(); errors_snapshot_path.push(file_stem.clone() + ".output.stderr"); fs::write(errors_snapshot_path, errors) .expect("Something went wrong when writing to the file"); counter += 1; } return; } testing::run_test2(false, |cm, handler| { // Type annotation if false { return Ok(()); } let fm = cm.load_file(&input).unwrap(); let file_name = input.file_name().unwrap().to_string_lossy(); let scripting_enabled = file_name.contains("script_on"); let lexer = Lexer::new(SourceFileInput::from(&*fm)); let config = ParserConfig { scripting_enabled, iframe_srcdoc: false, }; let mut parser = Parser::new(lexer, config); let document_or_document_fragment = if file_name.contains("fragment") { let mut context_element_namespace = Namespace::HTML; let mut context_element_tag_name = "unknown"; let mut splitted = file_name.split('.'); let index = splitted.clone().count() - 2; let context_element = splitted .nth(index) .expect("failed to get context element from filename") .replace("fragment_", ""); if context_element.contains('_') { let mut splited = context_element.split('_'); if let Some(namespace) = splited.next() { context_element_namespace = match namespace { "math" => Namespace::MATHML, "svg" => Namespace::SVG, _ => { unreachable!(); } }; } if let Some(tag_name) = splited.next() { context_element_tag_name = tag_name; } } else { context_element_tag_name = &context_element; } let context_element = Element { span: Default::default(), namespace: context_element_namespace, tag_name: context_element_tag_name.into(), attributes: vec![], is_self_closing: false, children: vec![], content: None, }; DocumentOrDocumentFragment::DocumentFragment(parser.parse_document_fragment( context_element, DocumentMode::NoQuirks, None, )) } else { DocumentOrDocumentFragment::Document(parser.parse_document()) }; // `scripted` for browser tests with JS // `search` proposed, but not merged in spec let need_skip_tests = file_name.contains("scripted") || file_name.contains("search"); if !need_skip_tests { let errors = parser.take_errors(); let errors_path = input.with_extension("output.stderr"); let contents = fs::read_to_string(errors_path).expect("Something went wrong reading the file"); // TODO bug in tests - https://github.com/html5lib/html5lib-tests/issues/138 let actual_number_of_errors = if file_name.contains("tests19_dat.84") { errors.len() + 1 } else if file_name.contains("math_dat.5.fragment_tbody") || file_name.contains("math_dat.6.fragment_tbody") || file_name.contains("math_dat.7.fragment_tbody") || file_name.contains("svg_dat.5.fragment_tbody") || file_name.contains("svg_dat.6.fragment_tbody") || file_name.contains("svg_dat.7.fragment_tbody") { errors.len() - 1 } else if file_name.contains("foreign-fragment_dat.3.fragment_svg_path") { errors.len() - 1 } else { errors.len() }; let expected_number_of_errors = contents.lines().count(); assert_eq!(actual_number_of_errors, expected_number_of_errors); } let json_path = input.with_extension("output.json"); match document_or_document_fragment { DocumentOrDocumentFragment::Document(Ok(mut document)) => { let actual_json = serde_json::to_string_pretty(&document) .map(NormalizedOutput::from) .expect("failed to serialize document"); actual_json.compare_to_file(&json_path).unwrap(); if file_name.contains("scripted") || file_name.contains("search") { return Ok(()); } let mut dom_buf = String::new(); document.visit_mut_with(&mut DomVisualizer { dom_buf: &mut dom_buf, indent: 0, }); NormalizedOutput::from(dom_buf) .compare_to_file(&input.with_extension("dom.rust-debug")) .unwrap(); Ok(()) } DocumentOrDocumentFragment::DocumentFragment(Ok(mut document_fragment)) => { let actual_json = serde_json::to_string_pretty(&document_fragment) .map(NormalizedOutput::from) .expect("failed to serialize document"); actual_json.compare_to_file(&json_path).unwrap(); if need_skip_tests { return Ok(()); } let mut dom_buf = String::new(); document_fragment.visit_mut_with(&mut DomVisualizer { dom_buf: &mut dom_buf, indent: 0, }); NormalizedOutput::from(dom_buf) .compare_to_file(&input.with_extension("dom.rust-debug")) .unwrap(); Ok(()) } DocumentOrDocumentFragment::Document(Err(err)) | DocumentOrDocumentFragment::DocumentFragment(Err(err)) => { let mut d = err.to_diagnostics(&handler); d.note(&format!("current token = {}", parser.dump_cur())); d.emit(); panic!(); } } }) .unwrap(); }