diff --git a/crates/swc_html_parser/tests/fixture.rs b/crates/swc_html_parser/tests/fixture.rs index 71edd6f605d0..28e426379b05 100644 --- a/crates/swc_html_parser/tests/fixture.rs +++ b/crates/swc_html_parser/tests/fixture.rs @@ -43,846 +43,3 @@ fn dom_visualizer(input: PathBuf) { }, ) } - -fn unescape(s: &str) -> Option { - let mut out = String::with_capacity(s.len()); - let mut it = s.chars().peekable(); - - loop { - match it.next() { - None => return Some(out), - Some('\\') => { - if it.peek() != Some(&'u') { - panic!("can't understand escape"); - } - - let hex: String = it.by_ref().take(4).collect(); - - match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) { - // TODO fix me surrogate paris - // Some of the tests use lone surrogates, but we have no - // way to represent them in the UTF-8 input to our parser. - // Since these can only come from script, we will catch - // them there. - None => return None, - Some(c) => out.push(c), - } - } - Some(c) => out.push(c), - } - } -} - -// TODO we need to enable `preserve_order` for serde, but we can't https://github.com/tkaitchuck/aHash/issues/95, so we sort attributes -#[testing::fixture("tests/html5lib-tests/tokenizer/**/*.test")] -fn html5lib_test_tokenizer(input: PathBuf) { - let filename = input.to_str().expect("failed to parse path"); - let contents = fs::read_to_string(filename).expect("Something went wrong reading the file"); - let obj: Value = serde_json::from_str(&contents).expect("json parse error"); - let tests = match obj.get(&"tests".to_string()) { - Some(&Value::Array(ref tests)) => tests, - _ => return, - }; - - for test in tests.iter() { - let description = test - .get("description") - .expect("failed to get input in test"); - - let states = if let Some(initial_states) = test.get("initialStates") { - let mut states = vec![]; - let json_states: Vec = serde_json::from_value(initial_states.clone()) - .expect("failed to get input in test"); - - for json_state in json_states { - match &*json_state { - "Data state" => { - states.push(State::Data); - } - "PLAINTEXT state" => { - states.push(State::PlainText); - } - "RCDATA state" => { - states.push(State::Rcdata); - } - "RAWTEXT state" => { - states.push(State::Rawtext); - } - "Script data state" => { - states.push(State::ScriptData); - } - "CDATA section state" => { - states.push(State::CdataSection); - } - _ => { - unreachable!() - } - } - } - - states - } else { - vec![State::Data] - }; - - for state in states.iter() { - eprintln!("==== ==== Description ==== ====\n{}\n", description); - - let json_input = test["input"].clone(); - let mut input: String = - serde_json::from_value(json_input).expect("failed to get input in test"); - - let need_double_escaped = test.get("doubleEscaped").is_some(); - - if need_double_escaped { - input = match unescape(&input) { - Some(unescaped) => unescaped, - _ => { - continue; - } - }; - } - - eprintln!("==== ==== Input ==== ====\n{}\n", input); - - let json_output = test["output"].clone(); - let output = json_output.to_string(); - - eprintln!("==== ==== Output ==== ====\n{}\n", output); - - let lexer_str_input = StringInput::new(&input, BytePos(0), BytePos(input.len() as u32)); - let mut lexer = Lexer::new(lexer_str_input); - - lexer.set_input_state(state.clone()); - - if let Some(last_start_tag) = test.get("lastStartTag") { - let last_start_tag: String = serde_json::from_value(last_start_tag.clone()) - .expect("failed to get lastStartTag in test"); - - lexer.set_last_start_tag_name(&last_start_tag); - } - - let mut actual_tokens = vec![]; - - loop { - let token_and_span = lexer.next(); - - if token_and_span.is_none() { - break; - } - - let mut new_token = token_and_span.unwrap().token.clone(); - - match new_token { - Token::Doctype { ref mut raw, .. } => { - *raw = None; - } - Token::StartTag { - ref mut raw_tag_name, - ref mut attributes, - .. - } => { - *raw_tag_name = None; - - let mut new_attributes = vec![]; - let mut already_seen: AHashSet = Default::default(); - - for mut attribute in take(attributes) { - if already_seen.contains(&attribute.name) { - continue; - } - - already_seen.insert(attribute.name.clone()); - - if attribute.value.is_none() { - attribute.value = Some("".into()); - } - - attribute.span = Default::default(); - attribute.raw_name = None; - attribute.raw_value = None; - - new_attributes.push(attribute); - } - - new_attributes.sort_by(|a, b| a.name.partial_cmp(&b.name).unwrap()); - - *attributes = new_attributes; - } - Token::EndTag { - ref mut raw_tag_name, - ref mut attributes, - ref mut is_self_closing, - .. - } => { - *raw_tag_name = None; - *is_self_closing = false; - *attributes = vec![]; - } - Token::Character { ref mut raw, .. } => { - *raw = None; - } - Token::Comment { ref mut raw, .. } => { - *raw = js_word!(""); - } - _ => {} - } - - actual_tokens.push(new_token); - } - - let mut expected_tokens: Vec = vec![]; - - if let Some(output_tokens) = json_output.as_array() { - for output_token in output_tokens { - match output_token { - Value::Array(token_parts) => { - let tokens = match token_parts[0].as_str().expect("failed") { - "DOCTYPE" => { - let name: Option = - serde_json::from_value(token_parts[1].clone()) - .expect("failed to deserialize"); - let public_id: Option = - serde_json::from_value(token_parts[2].clone()) - .expect("failed to deserialize"); - let system_id: Option = - serde_json::from_value(token_parts[3].clone()) - .expect("failed to deserialize"); - let correctness: bool = - serde_json::from_value(token_parts[4].clone()) - .expect("failed to deserialize"); - - vec![Token::Doctype { - name: name.map(|v| v.into()), - force_quirks: !correctness, - public_id: public_id.map(|v| v.into()), - system_id: system_id.map(|v| v.into()), - raw: None, - }] - } - "StartTag" => { - let tag_name: String = - serde_json::from_value(token_parts[1].clone()) - .expect("failed to deserialize"); - let mut attributes = vec![]; - - if let Some(json_attributes) = token_parts.get(2) { - let obj_attributes: Value = - serde_json::from_value(json_attributes.clone()) - .expect("failed to deserialize"); - - match obj_attributes { - Value::Object(obj) => { - for key in obj.keys() { - let json_value = obj.get(key).expect( - "failed to get value for attribute", - ); - let value: Option = - serde_json::from_value(json_value.clone()) - .expect("failed to deserialize"); - - attributes.push(AttributeToken { - span: Default::default(), - name: key.clone().into(), - raw_name: None, - value: value.map(|v| v.into()), - raw_value: None, - }) - } - } - _ => { - unreachable!(); - } - } - } - - let mut is_self_closing = false; - - if let Some(json_is_self_closing) = token_parts.get(3) { - let value: bool = - serde_json::from_value(json_is_self_closing.clone()) - .expect("failed to deserialize"); - - is_self_closing = value; - } - - attributes.sort_by(|a, b| a.name.partial_cmp(&b.name).unwrap()); - - vec![Token::StartTag { - tag_name: tag_name.into(), - raw_tag_name: None, - is_self_closing, - attributes, - }] - } - "EndTag" => { - let tag_name: String = - serde_json::from_value(token_parts[1].clone()) - .expect("failed to deserialize"); - - vec![Token::EndTag { - tag_name: tag_name.into(), - raw_tag_name: None, - is_self_closing: false, - attributes: vec![], - }] - } - "Character" => { - let mut data: String = - serde_json::from_value(token_parts[1].clone()) - .expect("failed to deserialize"); - - if need_double_escaped { - data = match unescape(&data) { - Some(v) => v, - _ => { - continue; - } - }; - } - - let mut tokens = vec![]; - - for c in data.chars() { - tokens.push(Token::Character { - value: c, - raw: None, - }) - } - - tokens - } - "Comment" => { - let mut data: String = - serde_json::from_value(token_parts[1].clone()) - .expect("failed to deserialize"); - - if need_double_escaped { - data = match unescape(&data) { - Some(v) => v, - _ => { - continue; - } - }; - } - - vec![Token::Comment { - data: data.into(), - raw: js_word!(""), - }] - } - _ => { - unreachable!("unknown token {}", token_parts[0]) - } - }; - - expected_tokens.extend(tokens); - } - _ => { - unreachable!(); - } - } - } - } - - let actual = - serde_json::to_string(&actual_tokens).expect("failed to serialize actual tokens"); - let expected = serde_json::to_string(&expected_tokens) - .expect("failed to serialize expected tokens"); - - if let Some(json_errors) = test.get("errors") { - let expected_errors = json_errors.as_array().expect("failed to deserialize error"); - let actual_errors = lexer.take_errors(); - - eprintln!("==== ==== Errors ==== ====\n{:?}\n", actual_errors); - - assert_eq!(actual_errors.len(), expected_errors.len()); - - for expected_error in expected_errors.iter() { - let obj_expected_code = - expected_error.as_object().expect("failed to get error"); - let expected_code = match obj_expected_code.get("code") { - Some(expected_code) => match expected_code.as_str() { - Some("eof-in-doctype") => ErrorKind::EofInDoctype, - Some("eof-in-comment") => ErrorKind::EofInComment, - Some("eof-in-cdata") => ErrorKind::EofInCdata, - Some("eof-in-tag") => ErrorKind::EofInTag, - Some("eof-before-tag-name") => ErrorKind::EofBeforeTagName, - Some("eof-in-script-html-comment-like-text") => { - ErrorKind::EofInScriptHtmlCommentLikeText - } - Some("unknown-named-character-reference") => { - ErrorKind::UnknownNamedCharacterReference - } - Some("incorrectly-opened-comment") => { - ErrorKind::IncorrectlyOpenedComment - } - Some("abrupt-closing-of-empty-comment") => { - ErrorKind::AbruptClosingOfEmptyComment - } - Some("abrupt-doctype-public-identifier") => { - ErrorKind::AbruptDoctypePublicIdentifier - } - Some("abrupt-doctype-system-identifier") => { - ErrorKind::AbruptDoctypeSystemIdentifier - } - Some("absence-of-digits-in-numeric-character-reference") => { - ErrorKind::AbsenceOfDigitsInNumericCharacterReference - } - Some("surrogate-character-reference") => { - ErrorKind::SurrogateCharacterReference - } - Some("nested-comment") => ErrorKind::NestedComment, - Some("end-tag-with-trailing-solidus") => { - ErrorKind::EndTagWithTrailingSolidus - } - Some("null-character-reference") => ErrorKind::NullCharacterReference, - Some("cdata-in-html-content") => ErrorKind::CdataInHtmlContent, - Some("character-reference-outside-unicode-range") => { - ErrorKind::CharacterReferenceOutsideUnicodeRange - } - Some("control-character-in-input-stream") => { - ErrorKind::ControlCharacterInInputStream - } - Some("control-character-reference") => { - ErrorKind::ControlCharacterReference - } - Some("noncharacter-in-input-stream") => { - ErrorKind::NoncharacterInInputStream - } - Some("noncharacter-character-reference") => { - ErrorKind::NoncharacterCharacterReference - } - Some("unexpected-equals-sign-before-attribute-name") => { - ErrorKind::UnexpectedEqualsSignBeforeAttributeName - } - Some("unexpected-question-mark-instead-of-tag-name") => { - ErrorKind::UnexpectedQuestionMarkInsteadOfTagName - } - Some("unexpected-character-after-doctype-system-identifier") => { - ErrorKind::UnexpectedCharacterAfterDoctypeSystemIdentifier - } - Some("unexpected-null-character") => ErrorKind::UnexpectedNullCharacter, - Some("unexpected-solidus-in-tag") => ErrorKind::UnexpectedSolidusInTag, - Some("unexpected-character-in-attribute-name") => { - ErrorKind::UnexpectedCharacterInAttributeName - } - Some("unexpected-character-in-unquoted-attribute-value") => { - ErrorKind::UnexpectedCharacterInUnquotedAttributeValue - } - Some("duplicate-attribute") => ErrorKind::DuplicateAttribute, - Some("end-tag-with-attributes") => ErrorKind::EndTagWithAttributes, - Some("missing-whitespace-before-doctype-name") => { - ErrorKind::MissingWhitespaceBeforeDoctypeName - } - Some("missing-attribute-value") => ErrorKind::MissingAttributeValue, - Some("missing-doctype-public-identifier") => { - ErrorKind::MissingDoctypePublicIdentifier - } - Some("missing-end-tag-name") => ErrorKind::MissingEndTagName, - Some("missing-doctype-name") => ErrorKind::MissingDoctypeName, - Some("missing-doctype-system-identifier") => { - ErrorKind::MissingDoctypeSystemIdentifier - } - Some("missing-whitespace-after-doctype-system-keyword") => { - ErrorKind::MissingWhitespaceAfterDoctypeSystemKeyword - } - Some("missing-whitespace-after-doctype-public-keyword") => { - ErrorKind::MissingWhitespaceAfterDoctypePublicKeyword - } - Some("missing-quote-before-doctype-public-identifier") => { - ErrorKind::MissingQuoteBeforeDoctypePublicIdentifier - } - Some("missing-quote-before-doctype-system-identifier") => { - ErrorKind::MissingQuoteBeforeDoctypeSystemIdentifier - } - Some("incorrectly-closed-comment") => { - ErrorKind::IncorrectlyClosedComment - } - Some("invalid-character-sequence-after-doctype-name") => { - ErrorKind::InvalidCharacterSequenceAfterDoctypeName - } - Some( - "missing-whitespace-between-doctype-public-and-system-identifiers", - ) => { - ErrorKind::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers - } - Some("missing-whitespace-between-attributes") => { - ErrorKind::MissingWhitespaceBetweenAttributes - } - Some("missing-semicolon-after-character-reference") => { - ErrorKind::MissingSemicolonAfterCharacterReference - } - Some("invalid-first-character-of-tag-name") => { - ErrorKind::InvalidFirstCharacterOfTagName - } - _ => { - unreachable!("unknown error {:?}", expected_code); - } - }, - _ => { - unreachable!(); - } - }; - - assert!(actual_errors - .iter() - .any(|error| *error.kind() == expected_code)); - } - } else { - let errors = lexer.take_errors(); - - assert_eq!(errors.len(), 0); - } - - assert_eq!(actual, expected); - } - } -} - -enum TestState { - Data, - Document, - DocumentFragment, - Errors, - NewErrors, -} - -enum DocumentOrDocumentFragment { - Document(PResult), - DocumentFragment(PResult), -} - -#[testing::fixture("tests/html5lib-tests/tree-construction/**/*.dat")] -#[testing::fixture("tests/html5lib-tests-fixture/**/*.html")] -fn html5lib_test_tree_construction(input: PathBuf) { - if input.extension().unwrap() == "dat" { - let mut tree_construction_base = None; - let mut tests_base = None; - let mut path_buf = input.to_path_buf(); - - while path_buf.pop() { - if path_buf.ends_with("tree-construction") { - tree_construction_base = Some(path_buf.clone()); - } - - if path_buf.ends_with("tests") { - tests_base = Some(path_buf.clone()); - - break; - } - } - - let tree_construction_base = tree_construction_base.unwrap(); - let relative_path_to_test = input - .strip_prefix(tree_construction_base) - .expect("failed to get relative filename") - .to_str() - .unwrap() - .replace(['/', '.'], "_"); - let tests_base = tests_base.unwrap(); - - let dir = tests_base - .join("html5lib-tests-fixture") - .join(&relative_path_to_test); - - fs::create_dir_all(dir.clone()).expect("failed to create directory for fixtures"); - - let tests_file = fs::read_to_string(input).expect("Something went wrong reading the file"); - let mut tests = tests_file.split("#data\n"); - - tests.next(); - - let mut counter = 0; - - while let Some(test) = tests.next() { - let mut data: Vec<&str> = vec![]; - let mut document: Vec<&str> = vec![]; - let mut document_fragment: Vec<&str> = vec![]; - let mut errors: Vec<&str> = vec![]; - let mut new_errors: Vec<&str> = vec![]; - let mut scripting_enabled = false; - - let mut state = Some(TestState::Data); - let lines = test.lines(); - - for line in lines { - match line { - "#data" => { - state = Some(TestState::Data); - - continue; - } - "#errors" => { - state = Some(TestState::Errors); - - continue; - } - "#new-errors" => { - state = Some(TestState::NewErrors); - - continue; - } - "#document" => { - state = Some(TestState::Document); - - continue; - } - "#document-fragment" => { - state = Some(TestState::DocumentFragment); - - continue; - } - "#script-on" => { - scripting_enabled = true; - - state = None; - - continue; - } - "#script-off" => { - scripting_enabled = false; - - state = None; - - continue; - } - _ => {} - } - - match &state { - Some(TestState::Data) => { - data.push(line); - } - Some(TestState::Document) => { - document.push(line); - } - Some(TestState::DocumentFragment) => { - document_fragment.push(line); - } - Some(TestState::Errors) => { - errors.push(line); - } - Some(TestState::NewErrors) => { - new_errors.push(line); - } - _ => { - unreachable!(); - } - } - } - - let mut file_stem = counter.to_string(); - - // TODO workaround, fix - https://github.com/html5lib/html5lib-tests/pull/151 - let need_skip_fragment = relative_path_to_test.contains("template_dat") - && matches!(counter, 109 | 110 | 111); - - if !need_skip_fragment { - if !document_fragment.is_empty() { - file_stem += ".fragment_"; - file_stem += &document_fragment.join("").replace(' ', "_"); - } - } - - if scripting_enabled { - file_stem += ".script_on"; - } - - let test_case_dir = dir.join(file_stem); - - fs::create_dir_all(test_case_dir.clone()) - .expect("failed to create directory for fixtures"); - - let html_path = test_case_dir.join("input.html"); - - fs::write(html_path, data.join("\n")) - .expect("Something went wrong when writing to the file"); - - let dom_snapshot_path = test_case_dir.join("dom.rust-debug"); - - let mut dom = document.join("\n"); - - if !dom.ends_with('\n') { - dom.push('\n'); - } - - fs::write(dom_snapshot_path, dom) - .expect("Something went wrong when writing to the file"); - - let errors = errors.join("\n"); - let errors_snapshot_path = test_case_dir.join("output.stderr"); - - fs::write(errors_snapshot_path, errors) - .expect("Something went wrong when writing to the file"); - - counter += 1; - } - - return; - } - - testing::run_test2(false, |cm, handler| { - // Type annotation - if false { - return Ok(()); - } - - let parent = input.parent().unwrap(); - let parent_str = parent.to_string_lossy(); - - let scripting_enabled = parent_str.contains("script_on"); - let json_path = parent.join("output.json"); - let fm = cm.load_file(&input).unwrap(); - - let lexer = Lexer::new(SourceFileInput::from(&*fm)); - let config = ParserConfig { - scripting_enabled, - iframe_srcdoc: false, - }; - let mut parser = Parser::new(lexer, config); - let document_or_document_fragment = if parent_str.contains("fragment") { - let mut context_element_namespace = Namespace::HTML; - let mut context_element_tag_name = "unknown"; - - let context_element = parent_str - .split('.') - .last() - .expect("failed to get context element from filename") - .replace("fragment_", ""); - - if context_element.contains('_') { - let mut splited = context_element.split('_'); - - if let Some(namespace) = splited.next() { - context_element_namespace = match namespace { - "math" => Namespace::MATHML, - "svg" => Namespace::SVG, - _ => { - unreachable!(); - } - }; - } - - if let Some(tag_name) = splited.next() { - context_element_tag_name = tag_name; - } - } else { - context_element_tag_name = &context_element; - } - - let context_element = Element { - span: Default::default(), - namespace: context_element_namespace, - tag_name: context_element_tag_name.into(), - attributes: vec![], - is_self_closing: false, - children: vec![], - content: None, - }; - - DocumentOrDocumentFragment::DocumentFragment(parser.parse_document_fragment( - context_element, - DocumentMode::NoQuirks, - None, - )) - } else { - DocumentOrDocumentFragment::Document(parser.parse_document()) - }; - - let parent_parent = parent.parent().unwrap().to_string_lossy(); - // `scripted` for browser tests with JS - // `search` proposed, but not merged in spec - let need_skip_tests = - parent_parent.contains("scripted") || parent_parent.contains("search"); - - if !need_skip_tests { - let errors = parser.take_errors(); - let errors_path = input.parent().unwrap().join("output.stderr"); - let contents = - fs::read_to_string(errors_path).expect("Something went wrong reading the file"); - - // TODO bug in tests - https://github.com/html5lib/html5lib-tests/issues/138 - let actual_number_of_errors = - if parent_parent.contains("tests19_dat") && parent_str.contains("84") { - errors.len() + 1 - } else if (parent_parent.contains("math_dat") || parent_parent.contains("svg_dat")) - && (parent_str.contains("5.fragment_tbody") - || parent_str.contains("6.fragment_tbody") - || parent_str.contains("7.fragment_tbody")) - { - errors.len() - 1 - } else if parent_parent.contains("foreign-fragment_dat") - && parent_str.contains("3.fragment_svg_path") - { - errors.len() - 1 - } else { - errors.len() - }; - let expected_number_of_errors = contents.lines().count(); - - assert_eq!(actual_number_of_errors, expected_number_of_errors); - } - - match document_or_document_fragment { - DocumentOrDocumentFragment::Document(Ok(mut document)) => { - let actual_json = serde_json::to_string_pretty(&document) - .map(NormalizedOutput::from) - .expect("failed to serialize document"); - - actual_json.compare_to_file(&json_path).unwrap(); - - if parent_parent.contains("scripted") || parent_parent.contains("search") { - return Ok(()); - } - - let mut dom_buf = String::new(); - - document.visit_mut_with(&mut DomVisualizer { - dom_buf: &mut dom_buf, - indent: 0, - }); - - NormalizedOutput::from(dom_buf) - .compare_to_file(&parent.join("dom.rust-debug")) - .unwrap(); - - Ok(()) - } - DocumentOrDocumentFragment::DocumentFragment(Ok(mut document_fragment)) => { - let actual_json = serde_json::to_string_pretty(&document_fragment) - .map(NormalizedOutput::from) - .expect("failed to serialize document"); - - actual_json.compare_to_file(&json_path).unwrap(); - - if need_skip_tests { - return Ok(()); - } - - let mut dom_buf = String::new(); - - document_fragment.visit_mut_with(&mut DomVisualizer { - dom_buf: &mut dom_buf, - indent: 0, - }); - - NormalizedOutput::from(dom_buf) - .compare_to_file(&parent.join("dom.rust-debug")) - .unwrap(); - - Ok(()) - } - DocumentOrDocumentFragment::Document(Err(err)) - | DocumentOrDocumentFragment::DocumentFragment(Err(err)) => { - let mut d = err.to_diagnostics(&handler); - - d.note(&format!("current token = {}", parser.dump_cur())); - d.emit(); - - panic!(); - } - } - }) - .unwrap(); -}