From 420ba32e8c3b4f8ea34e3b17c82af31e2934059b Mon Sep 17 00:00:00 2001 From: Alexander Akait <4567934+alexander-akait@users.noreply.github.com> Date: Thu, 20 Oct 2022 02:32:22 +0300 Subject: [PATCH] perf(html/ast): Use `Atom` for `raw` (#6210) --- crates/swc_html_ast/src/base.rs | 12 +- crates/swc_html_ast/src/token.rs | 34 +++- crates/swc_html_parser/src/lexer/mod.rs | 223 +++++++++++----------- crates/swc_html_parser/src/parser/mod.rs | 24 ++- crates/swc_html_parser/src/parser/node.rs | 6 +- crates/swc_html_visit/src/lib.rs | 12 +- 6 files changed, 170 insertions(+), 141 deletions(-) diff --git a/crates/swc_html_ast/src/base.rs b/crates/swc_html_ast/src/base.rs index 1dd4b3e99710..2209109c0649 100644 --- a/crates/swc_html_ast/src/base.rs +++ b/crates/swc_html_ast/src/base.rs @@ -1,6 +1,6 @@ use is_macro::Is; use string_enum::StringEnum; -use swc_atoms::JsWord; +use swc_atoms::{Atom, JsWord}; use swc_common::{ast_node, EqIgnoreSpan, Span}; #[ast_node("Document")] @@ -64,7 +64,7 @@ pub struct DocumentType { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] pub system_id: Option, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - pub raw: Option, + pub raw: Option, } impl EqIgnoreSpan for DocumentType { @@ -127,11 +127,11 @@ pub struct Attribute { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] pub name: JsWord, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - pub raw_name: Option, + pub raw_name: Option, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] pub value: Option, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - pub raw_value: Option, + pub raw_value: Option, } impl EqIgnoreSpan for Attribute { @@ -150,7 +150,7 @@ pub struct Text { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] pub data: JsWord, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - pub raw: Option, + pub raw: Option, } impl EqIgnoreSpan for Text { @@ -166,7 +166,7 @@ pub struct Comment { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] pub data: JsWord, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - pub raw: Option, + pub raw: Option, } impl EqIgnoreSpan for Comment { diff --git a/crates/swc_html_ast/src/token.rs b/crates/swc_html_ast/src/token.rs index 048b48dcddcf..25edb1285c39 100644 --- a/crates/swc_html_ast/src/token.rs +++ b/crates/swc_html_ast/src/token.rs @@ -1,4 +1,4 @@ -use swc_atoms::JsWord; +use swc_atoms::{Atom, JsWord}; use swc_common::{ast_node, EqIgnoreSpan, Span}; #[ast_node("TokenAndSpan")] @@ -15,11 +15,29 @@ pub struct AttributeToken { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] pub name: JsWord, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - pub raw_name: Option, + pub raw_name: Option, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] pub value: Option, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - pub raw_value: Option, + pub raw_value: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, EqIgnoreSpan)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize) +)] +#[cfg_attr( + feature = "rkyv", + archive(bound( + serialize = "__S: rkyv::ser::Serializer + rkyv::ser::ScratchSpace + \ + rkyv::ser::SharedSerializeRegistry", + deserialize = "__D: rkyv::de::SharedDeserializeRegistry" + )) +)] +pub enum Raw { + Same, + Atom(Atom), } #[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, EqIgnoreSpan)] @@ -50,13 +68,13 @@ pub enum Token { system_id: Option, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] // Raw value - raw: Option, + raw: Option, }, StartTag { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] tag_name: JsWord, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - raw_tag_name: Option, + raw_tag_name: Option, is_self_closing: bool, attributes: Vec, }, @@ -64,7 +82,7 @@ pub enum Token { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] tag_name: JsWord, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - raw_tag_name: Option, + raw_tag_name: Option, is_self_closing: bool, attributes: Vec, }, @@ -72,12 +90,12 @@ pub enum Token { #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] data: JsWord, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - raw: Option, + raw: Option, }, Character { value: char, #[cfg_attr(feature = "rkyv", with(swc_atoms::EncodeJsWord))] - raw: Option, + raw: Option, }, Eof, } diff --git a/crates/swc_html_parser/src/lexer/mod.rs b/crates/swc_html_parser/src/lexer/mod.rs index a28dbe6d849e..5792fc6c831f 100644 --- a/crates/swc_html_parser/src/lexer/mod.rs +++ b/crates/swc_html_parser/src/lexer/mod.rs @@ -1,8 +1,8 @@ use std::{cell::RefCell, char::REPLACEMENT_CHARACTER, collections::VecDeque, mem::take, rc::Rc}; -use swc_atoms::JsWord; +use swc_atoms::{Atom, JsWord}; use swc_common::{collections::AHashSet, input::Input, BytePos, Span}; -use swc_html_ast::{AttributeToken, Token, TokenAndSpan}; +use swc_html_ast::{AttributeToken, Raw, Token, TokenAndSpan}; use swc_html_utils::{Entity, HTML_ENTITIES}; use crate::{ @@ -364,7 +364,7 @@ where for c in take(&mut self.temporary_buffer).chars() { self.emit_token(Token::Character { value: c, - raw: Some(String::from(c).into()), + raw: Some(Raw::Same), }); } } @@ -421,23 +421,20 @@ where // When the length of raw is more than the length of temporary buffer we emit a // raw character in the first character token let mut once_raw = raw; - let mut once_emitted = false; + + let is_value_eq_raw = if let Some(raw) = &once_raw { + *raw == self.temporary_buffer + } else { + true + }; for c in take(&mut self.temporary_buffer).chars() { self.emit_token(Token::Character { value: c, - raw: match once_raw { - Some(_) => { - once_emitted = true; - once_raw.take().map(|x| x.into()) - } - _ => { - if once_emitted { - None - } else { - Some(String::from(c).into()) - } - } + raw: if is_value_eq_raw { + Some(Raw::Same) + } else { + once_raw.take().map(|x| Raw::Atom(Atom::new(x))) }, }); } @@ -559,7 +556,7 @@ where force_quirks: current_doctype_token.force_quirks, public_id: current_doctype_token.public_id.map(JsWord::from), system_id: current_doctype_token.system_id.map(JsWord::from), - raw: Some(JsWord::from(raw)), + raw: Some(Atom::new(raw)), }; self.emit_token(token); @@ -695,7 +692,7 @@ where let start_tag_token = Token::StartTag { tag_name: current_tag_token.tag_name.into(), - raw_tag_name: current_tag_token.raw_tag_name.map(JsWord::from), + raw_tag_name: current_tag_token.raw_tag_name.map(Atom::new), is_self_closing: current_tag_token.is_self_closing, attributes: current_tag_token .attributes @@ -715,9 +712,9 @@ where AttributeToken { span: attribute.span, name, - raw_name: attribute.raw_name.map(JsWord::from), + raw_name: attribute.raw_name.map(Atom::new), value: attribute.value.map(JsWord::from), - raw_value: attribute.raw_value.map(JsWord::from), + raw_value: attribute.raw_value.map(Atom::new), } }) .collect(), @@ -738,7 +735,7 @@ where let end_tag_token = Token::EndTag { tag_name: current_tag_token.tag_name.into(), - raw_tag_name: current_tag_token.raw_tag_name.map(JsWord::from), + raw_tag_name: current_tag_token.raw_tag_name.map(Atom::new), is_self_closing: current_tag_token.is_self_closing, attributes: current_tag_token .attributes @@ -758,9 +755,9 @@ where AttributeToken { span: attribute.span, name, - raw_name: attribute.raw_name.map(JsWord::from), + raw_name: attribute.raw_name.map(Atom::new), value: attribute.value.map(JsWord::from), - raw_value: attribute.raw_value.map(JsWord::from), + raw_value: attribute.raw_value.map(Atom::new), } }) .collect(), @@ -826,7 +823,7 @@ where self.emit_token(Token::Comment { data: comment.data.into(), - raw: Some(comment.raw.into()), + raw: Some(Atom::new(comment.raw)), }); } @@ -842,6 +839,30 @@ where op(self, &mut buf) } + #[inline(always)] + fn emit_character_token(&mut self, value: char) -> LexResult<()> { + self.emit_token(Token::Character { + value, + raw: Some(Raw::Same), + }); + + Ok(()) + } + + #[inline(always)] + fn emit_character_token_with_raw(&mut self, value: (char, char)) -> LexResult<()> { + self.with_char_buf(|l, buf| { + buf.push(value.1); + + l.emit_token(Token::Character { + value: value.0, + raw: Some(Raw::Atom(Atom::new(&**buf))), + }); + + Ok(()) + }) + } + fn handle_raw_and_emit_character_token(&mut self, c: char) -> LexResult<()> { let is_cr = c == '\r'; @@ -857,37 +878,19 @@ where l.emit_token(Token::Character { value: '\n', - raw: Some((&**buf).into()), + raw: Some(Raw::Atom(Atom::new(&**buf))), }); Ok(()) }) } else { - self.with_char_buf(|l, buf| { - buf.push(c); - - l.emit_token(Token::Character { - value: c, - raw: Some((&**buf).into()), - }); - - Ok(()) - }) - } - } - - #[inline(always)] - fn emit_character_token(&mut self, value: (char, char)) -> LexResult<()> { - self.with_char_buf(|l, buf| { - buf.push(value.1); - - l.emit_token(Token::Character { - value: value.0, - raw: Some((&**buf).into()), + self.emit_token(Token::Character { + value: c, + raw: Some(Raw::Same), }); Ok(()) - }) + } } fn read_token_and_span(&mut self) -> LexResult { @@ -936,7 +939,7 @@ where // character as a character token. Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // EOF // Emit an end-of-file token. @@ -974,7 +977,7 @@ where // REPLACEMENT CHARACTER character token. Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // Emit an end-of-file token. @@ -1003,7 +1006,7 @@ where // REPLACEMENT CHARACTER character token. Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // Emit an end-of-file token. @@ -1032,7 +1035,7 @@ where // REPLACEMENT CHARACTER character token. Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // Emit an end-of-file token. @@ -1058,7 +1061,7 @@ where // REPLACEMENT CHARACTER character token. Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // Emit an end-of-file token. @@ -1110,7 +1113,7 @@ where // character token and an end-of-file token. None => { self.emit_error(ErrorKind::EofBeforeTagName); - self.emit_character_token(('<', '<'))?; + self.emit_character_token('<')?; self.emit_token(Token::Eof); return Ok(()); @@ -1120,7 +1123,7 @@ where // LESS-THAN SIGN character token. Reconsume in the data state. _ => { self.emit_error(ErrorKind::InvalidFirstCharacterOfTagName); - self.emit_character_token(('<', '<'))?; + self.emit_character_token('<')?; self.reconsume_in_state(State::Data); } } @@ -1148,8 +1151,8 @@ where // token. None => { self.emit_error(ErrorKind::EofBeforeTagName); - self.emit_character_token(('<', '<'))?; - self.emit_character_token(('/', '/'))?; + self.emit_character_token('<')?; + self.emit_character_token('/')?; self.emit_token(Token::Eof); return Ok(()); @@ -1233,7 +1236,7 @@ where // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RCDATA // state. _ => { - self.emit_character_token(('<', '<'))?; + self.emit_character_token('<')?; self.reconsume_in_state(State::Rcdata); } } @@ -1253,8 +1256,8 @@ where // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS // character token. Reconsume in the RCDATA state. _ => { - self.emit_character_token(('<', '<'))?; - self.emit_character_token(('/', '/'))?; + self.emit_character_token('<')?; + self.emit_character_token('/')?; self.reconsume_in_state(State::Rcdata); } } @@ -1262,8 +1265,8 @@ where // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state State::RcdataEndTagName => { let anything_else = |lexer: &mut Lexer| -> LexResult<()> { - lexer.emit_character_token(('<', '<'))?; - lexer.emit_character_token(('/', '/'))?; + lexer.emit_character_token('<')?; + lexer.emit_character_token('/')?; lexer.emit_temporary_buffer_as_character_tokens(); lexer.reconsume_in_state(State::Rcdata); @@ -1351,7 +1354,7 @@ where // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the RAWTEXT // state. _ => { - self.emit_character_token(('<', '<'))?; + self.emit_character_token('<')?; self.reconsume_in_state(State::Rawtext); } } @@ -1371,8 +1374,8 @@ where // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS // character token. Reconsume in the RAWTEXT state. _ => { - self.emit_character_token(('<', '<'))?; - self.emit_character_token(('/', '/'))?; + self.emit_character_token('<')?; + self.emit_character_token('/')?; self.reconsume_in_state(State::Rawtext); } } @@ -1380,8 +1383,8 @@ where // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state State::RawtextEndTagName => { let anything_else = |lexer: &mut Lexer| -> LexResult<()> { - lexer.emit_character_token(('<', '<'))?; - lexer.emit_character_token(('/', '/'))?; + lexer.emit_character_token('<')?; + lexer.emit_character_token('/')?; lexer.emit_temporary_buffer_as_character_tokens(); lexer.reconsume_in_state(State::Rawtext); @@ -1470,14 +1473,14 @@ where // SIGN character token and a U+0021 EXCLAMATION MARK character token. Some('!') => { self.state = State::ScriptDataEscapeStart; - self.emit_character_token(('<', '<'))?; - self.emit_character_token(('!', '!'))?; + self.emit_character_token('<')?; + self.emit_character_token('!')?; } // Anything else // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script // data state. _ => { - self.emit_character_token(('<', '<'))?; + self.emit_character_token('<')?; self.reconsume_in_state(State::ScriptData); } } @@ -1497,8 +1500,8 @@ where // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS // character token. Reconsume in the script data state. _ => { - self.emit_character_token(('<', '<'))?; - self.emit_character_token(('/', '/'))?; + self.emit_character_token('<')?; + self.emit_character_token('/')?; self.reconsume_in_state(State::ScriptData); } } @@ -1506,8 +1509,8 @@ where // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state State::ScriptDataEndTagName => { let anything_else = |lexer: &mut Lexer| -> LexResult<()> { - lexer.emit_character_token(('<', '<'))?; - lexer.emit_character_token(('/', '/'))?; + lexer.emit_character_token('<')?; + lexer.emit_character_token('/')?; lexer.emit_temporary_buffer_as_character_tokens(); lexer.reconsume_in_state(State::ScriptData); @@ -1589,7 +1592,7 @@ where // HYPHEN-MINUS character token. Some(c @ '-') => { self.state = State::ScriptDataEscapeStartDash; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // Anything else // Reconsume in the script data state. @@ -1607,7 +1610,7 @@ where // HYPHEN-MINUS character token. Some(c @ '-') => { self.state = State::ScriptDataEscapedDashDash; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // Anything else // Reconsume in the script data state. @@ -1625,7 +1628,7 @@ where // character token. Some(c @ '-') => { self.state = State::ScriptDataEscapedDash; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+003C LESS-THAN SIGN (<) // Switch to the script data escaped less-than sign state. @@ -1637,7 +1640,7 @@ where // REPLACEMENT CHARACTER character token. Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // This is an eof-in-script-html-comment-like-text parse error. Emit an @@ -1665,7 +1668,7 @@ where // HYPHEN-MINUS character token. Some(c @ '-') => { self.state = State::ScriptDataEscapedDashDash; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+003C LESS-THAN SIGN (<) // Switch to the script data escaped less-than sign state. @@ -1678,7 +1681,7 @@ where Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); self.state = State::ScriptDataEscaped; - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // This is an eof-in-script-html-comment-like-text parse error. Emit an @@ -1706,7 +1709,7 @@ where // U+002D HYPHEN-MINUS (-) // Emit a U+002D HYPHEN-MINUS character token. Some(c @ '-') => { - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+003C LESS-THAN SIGN (<) // Switch to the script data escaped less-than sign state. @@ -1718,7 +1721,7 @@ where // character token. Some(c @ '>') => { self.state = State::ScriptData; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+0000 NULL // This is an unexpected-null-character parse error. Switch to the script @@ -1726,7 +1729,7 @@ where Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); self.state = State::ScriptDataEscaped; - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // This is an eof-in-script-html-comment-like-text parse error. Emit an @@ -1764,14 +1767,14 @@ where // state. Some(c) if is_ascii_alpha(c) => { self.temporary_buffer.clear(); - self.emit_character_token(('<', '<'))?; + self.emit_character_token('<')?; self.reconsume_in_state(State::ScriptDataDoubleEscapeStart); } // Anything else // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the script // data escaped state. _ => { - self.emit_character_token(('<', '<'))?; + self.emit_character_token('<')?; self.reconsume_in_state(State::ScriptDataEscaped); } } @@ -1791,8 +1794,8 @@ where // Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS // character token. Reconsume in the script data escaped state. _ => { - self.emit_character_token(('<', '<'))?; - self.emit_character_token(('/', '/'))?; + self.emit_character_token('<')?; + self.emit_character_token('/')?; self.reconsume_in_state(State::ScriptDataEscaped); } } @@ -1800,8 +1803,8 @@ where // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state State::ScriptDataEscapedEndTagName => { let anything_else = |lexer: &mut Lexer| -> LexResult<()> { - lexer.emit_character_token(('<', '<'))?; - lexer.emit_character_token(('/', '/'))?; + lexer.emit_character_token('<')?; + lexer.emit_character_token('/')?; lexer.emit_temporary_buffer_as_character_tokens(); lexer.reconsume_in_state(State::ScriptDataEscaped); @@ -1908,7 +1911,7 @@ where self.state = State::ScriptDataEscaped; } - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // ASCII upper alpha // Append the lowercase version of the current input character (add 0x0020 @@ -1916,14 +1919,14 @@ where // input character as a character token. Some(c) if is_ascii_upper_alpha(c) => { self.temporary_buffer.push(c.to_ascii_lowercase()); - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // ASCII lower alpha // Append the current input character to the temporary buffer. Emit the // current input character as a character token. Some(c) if is_ascii_lower_alpha(c) => { self.temporary_buffer.push(c); - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // Anything else // Reconsume in the script data escaped state. @@ -1941,21 +1944,21 @@ where // HYPHEN-MINUS character token. Some(c @ '-') => { self.state = State::ScriptDataDoubleEscapedDash; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+003C LESS-THAN SIGN (<) // Switch to the script data double escaped less-than sign state. Emit a // U+003C LESS-THAN SIGN character token. Some(c @ '<') => { self.state = State::ScriptDataDoubleEscapedLessThanSign; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+0000 NULL // This is an unexpected-null-character parse error. Emit a U+FFFD // REPLACEMENT CHARACTER character token. Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // This is an eof-in-script-html-comment-like-text parse error. Emit an @@ -1983,14 +1986,14 @@ where // HYPHEN-MINUS character token. Some(c @ '-') => { self.state = State::ScriptDataDoubleEscapedDashDash; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+003C LESS-THAN SIGN (<) // Switch to the script data double escaped less-than sign state. Emit a // U+003C LESS-THAN SIGN character token. Some(c @ '<') => { self.state = State::ScriptDataDoubleEscapedLessThanSign; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+0000 NULL // This is an unexpected-null-character parse error. Switch to the script @@ -1999,7 +2002,7 @@ where Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); self.state = State::ScriptDataDoubleEscaped; - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // This is an eof-in-script-html-comment-like-text parse error. Emit an @@ -2027,21 +2030,21 @@ where // U+002D HYPHEN-MINUS (-) // Emit a U+002D HYPHEN-MINUS character token. Some(c @ '-') => { - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+003C LESS-THAN SIGN (<) // Switch to the script data double escaped less-than sign state. Emit a // U+003C LESS-THAN SIGN character token. Some(c @ '<') => { self.state = State::ScriptDataDoubleEscapedLessThanSign; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+003E GREATER-THAN SIGN (>) // Switch to the script data state. Emit a U+003E GREATER-THAN SIGN // character token. Some(c @ '>') => { self.state = State::ScriptData; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // U+0000 NULL // This is an unexpected-null-character parse error. Switch to the script @@ -2050,7 +2053,7 @@ where Some(c @ '\x00') => { self.emit_error(ErrorKind::UnexpectedNullCharacter); self.state = State::ScriptDataDoubleEscaped; - self.emit_character_token((REPLACEMENT_CHARACTER, c))?; + self.emit_character_token_with_raw((REPLACEMENT_CHARACTER, c))?; } // EOF // This is an eof-in-script-html-comment-like-text parse error. Emit an @@ -2081,7 +2084,7 @@ where Some(c @ '/') => { self.temporary_buffer.clear(); self.state = State::ScriptDataDoubleEscapeEnd; - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // Anything else // Reconsume in the script data double escaped state. @@ -2123,7 +2126,7 @@ where self.state = State::ScriptDataDoubleEscaped; } - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // ASCII upper alpha // Append the lowercase version of the current input character (add 0x0020 @@ -2131,7 +2134,7 @@ where // input character as a character token. Some(c) if is_ascii_upper_alpha(c) => { self.temporary_buffer.push(c.to_ascii_lowercase()); - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // ASCII lower alpha // Append the current input character to the temporary buffer. Emit the @@ -2139,7 +2142,7 @@ where Some(c) if is_ascii_lower_alpha(c) => { self.temporary_buffer.push(c); - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } // Anything else // Reconsume in the script data double escaped state. @@ -4015,7 +4018,7 @@ where // Emit a U+005D RIGHT SQUARE BRACKET character token. Reconsume in the // CDATA section state. _ => { - self.emit_character_token((']', ']'))?; + self.emit_character_token(']')?; self.reconsume_in_state(State::CdataSection); } } @@ -4027,7 +4030,7 @@ where // U+005D RIGHT SQUARE BRACKET (]) // Emit a U+005D RIGHT SQUARE BRACKET character token. Some(c @ ']') => { - self.emit_character_token((']', c))?; + self.emit_character_token_with_raw((']', c))?; } // U+003E GREATER-THAN SIGN character // Switch to the data state. @@ -4038,8 +4041,8 @@ where // Emit two U+005D RIGHT SQUARE BRACKET character tokens. Reconsume in the // CDATA section state. _ => { - self.emit_character_token((']', ']'))?; - self.emit_character_token((']', ']'))?; + self.emit_character_token(']')?; + self.emit_character_token(']')?; self.reconsume_in_state(State::CdataSection); } } @@ -4199,7 +4202,7 @@ where if self.is_consumed_as_part_of_an_attribute() { self.append_to_attribute(None, Some((false, Some(c), Some(c)))); } else { - self.emit_character_token((c, c))?; + self.emit_character_token(c)?; } } // U+003B SEMICOLON (;) diff --git a/crates/swc_html_parser/src/parser/mod.rs b/crates/swc_html_parser/src/parser/mod.rs index 4293f8fa9607..ffc4ba8851b5 100644 --- a/crates/swc_html_parser/src/parser/mod.rs +++ b/crates/swc_html_parser/src/parser/mod.rs @@ -4,7 +4,7 @@ use active_formatting_element_stack::*; use doctypes::*; use node::*; use open_elements_stack::*; -use swc_atoms::{js_word, JsWord}; +use swc_atoms::{js_word, Atom, JsWord}; use swc_common::{Span, DUMMY_SP}; use swc_html_ast::*; @@ -676,7 +676,7 @@ where // A character token that is U+0000 NULL // // Parse error. Insert a U+FFFD REPLACEMENT CHARACTER character. - Token::Character { value, raw } if *value == '\x00' => { + Token::Character { value, .. } if *value == '\x00' => { self.errors.push(Error::new( token_and_info.span, ErrorKind::UnexpectedNullCharacter, @@ -684,9 +684,11 @@ where token_and_info.token = Token::Character { value: '\u{FFFD}', - raw: raw.clone(), + raw: Some(Raw::Atom(Atom::new(String::from('\x00')))), }; + println!("{:?}", token_and_info.token); + self.insert_character(token_and_info)?; } // A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), @@ -8482,7 +8484,9 @@ where } => { data.borrow_mut().push(*c); - if let Some(raw_c) = raw_c { + if let Some(Raw::Same) = raw_c { + raw_data.borrow_mut().push(*c); + } else if let Some(Raw::Atom(raw_c)) = raw_c { raw_data.borrow_mut().push_str(raw_c); } } @@ -8517,7 +8521,9 @@ where } => { data.borrow_mut().push(*c); - if let Some(raw_c) = raw_c { + if let Some(Raw::Same) = raw_c { + raw_data.borrow_mut().push(*c); + } else if let Some(Raw::Atom(raw_c)) = raw_c { raw_data.borrow_mut().push_str(raw_c); } } @@ -8547,13 +8553,15 @@ where value: c, raw: raw_c, } => { - let mut data = String::with_capacity(255); + let mut data = String::with_capacity(64); data.push(*c); - let mut raw = String::with_capacity(255); + let mut raw = String::with_capacity(64); - if let Some(raw_c) = raw_c { + if let Some(Raw::Same) = raw_c { + raw.push(*c); + } else if let Some(Raw::Atom(raw_c)) = raw_c { raw.push_str(raw_c); } diff --git a/crates/swc_html_parser/src/parser/node.rs b/crates/swc_html_parser/src/parser/node.rs index f8eafb6d905c..b0cab6dec64e 100644 --- a/crates/swc_html_parser/src/parser/node.rs +++ b/crates/swc_html_parser/src/parser/node.rs @@ -4,7 +4,7 @@ use std::{ rc::{Rc, Weak}, }; -use swc_atoms::JsWord; +use swc_atoms::{Atom, JsWord}; use swc_common::Span; use swc_html_ast::*; @@ -24,7 +24,7 @@ pub enum Data { name: Option, public_id: Option, system_id: Option, - raw: Option, + raw: Option, }, Element { namespace: Namespace, @@ -38,7 +38,7 @@ pub enum Data { }, Comment { data: JsWord, - raw: Option, + raw: Option, }, } diff --git a/crates/swc_html_visit/src/lib.rs b/crates/swc_html_visit/src/lib.rs index 4ac810eaa992..7345562e58c2 100644 --- a/crates/swc_html_visit/src/lib.rs +++ b/crates/swc_html_visit/src/lib.rs @@ -2,7 +2,7 @@ #![deny(clippy::all)] #![allow(clippy::ptr_arg)] -use swc_atoms::JsWord; +use swc_atoms::{Atom, JsWord}; use swc_common::Span; use swc_html_ast::*; use swc_visit::define; @@ -40,7 +40,7 @@ define!({ pub name: Option, pub public_id: Option, pub system_id: Option, - pub raw: Option, + pub raw: Option, } pub enum Namespace { @@ -67,21 +67,21 @@ define!({ pub namespace: Option, pub prefix: Option, pub name: JsWord, - pub raw_name: Option, + pub raw_name: Option, pub value: Option, - pub raw_value: Option, + pub raw_value: Option, } pub struct Text { pub span: Span, pub data: JsWord, - pub raw: Option, + pub raw: Option, } pub struct Comment { pub span: Span, pub data: JsWord, - pub raw: Option, + pub raw: Option, } });