Skip to content

Commit

Permalink
refactor(html/parser): Refactor (#6267)
Browse files (browse the repository at this point in the history)
  • Loading branch information
alexander-akait committed Oct 27, 2022
1 parent 09b2961 commit c255cfd
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 66 deletions.
115 changes: 51 additions & 64 deletions crates/swc_html_parser/src/lexer/mod.rs
Expand Up @@ -234,7 +234,7 @@ where
take(&mut self.errors)
}

fn set_last_start_tag_name(&mut self, tag_name: &str) {
fn set_last_start_tag_name(&mut self, tag_name: &JsWord) {
self.last_start_tag_name = Some(tag_name.into());
}

Expand Down Expand Up @@ -596,16 +596,11 @@ where
}
}

fn start_new_attribute(&mut self, c: Option<char>) {
fn start_new_attribute(&mut self) {
if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
// The longest known attribute is "glyph-orientation-horizontal" for SVG tags
let mut name = String::with_capacity(28);
let mut raw_name = String::with_capacity(28);

if let Some(c) = c {
name.push(c);
raw_name.push(c);
};
let name = String::with_capacity(28);
let raw_name = String::with_capacity(28);

attributes.push(Attribute {
span: Default::default(),
Expand All @@ -619,49 +614,49 @@ where
}
}

fn append_to_attribute(
&mut self,
name: Option<(char, char)>,
value: Option<(bool, Option<char>, Option<char>)>,
) {
fn append_name_to_attribute(&mut self, c: char, raw_c: Option<char>) {
if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
if let Some(attribute) = attributes.last_mut() {
if let Some(name) = name {
attribute.name.push(name.0);
attribute.name.push(c);

if let Some(raw_c) = raw_c {
if let Some(raw_name) = &mut attribute.raw_name {
raw_name.push(name.1);
raw_name.push(raw_c);
}
}
}
}
}

if let Some(value) = value {
if let Some(c) = value.1 {
if let Some(old_value) = &mut attribute.value {
old_value.push(c);
} else {
let mut new_value = String::with_capacity(255);
fn append_value_to_attribute(&mut self, quotes: bool, c: Option<char>, raw_c: Option<char>) {
if let Some(Tag { attributes, .. }) = &mut self.current_tag_token {
if let Some(attribute) = attributes.last_mut() {
if let Some(c) = c {
if let Some(old_value) = &mut attribute.value {
old_value.push(c);
} else {
let mut new_value = String::with_capacity(255);

new_value.push(c);
new_value.push(c);

attribute.value = Some(new_value);
}
attribute.value = Some(new_value);
}
}

if let Some(c) = value.2 {
// Quote for attribute was found, so we set empty value by default
if value.0 && attribute.value.is_none() {
attribute.value = Some(String::with_capacity(255));
}
if let Some(raw_c) = raw_c {
// Quote for attribute was found, so we set empty value by default
if quotes && attribute.value.is_none() {
attribute.value = Some(String::with_capacity(255));
}

if let Some(raw_value) = &mut attribute.raw_value {
raw_value.push(c);
} else {
let mut raw_new_value = String::with_capacity(255);
if let Some(raw_value) = &mut attribute.raw_value {
raw_value.push(raw_c);
} else {
let mut raw_new_value = String::with_capacity(255);

raw_new_value.push(c);
raw_new_value.push(raw_c);

attribute.raw_value = Some(raw_new_value);
}
attribute.raw_value = Some(raw_new_value);
}
}
}
Expand Down Expand Up @@ -2178,23 +2173,24 @@ where
// We set `value` to `None` to support boolean attributes in the AST
Some(c @ '=') => {
self.emit_error(ErrorKind::UnexpectedEqualsSignBeforeAttributeName);
self.start_new_attribute(Some(c));
self.start_new_attribute();
self.append_name_to_attribute(c, Some(c));
self.state = State::AttributeName;
}
// Anything else
// Start a new attribute in the current tag token. Set that attribute name
// and value to the empty string. Reconsume in the attribute name state.
// We set `value` to `None` to support boolean attributes in the AST
_ => {
self.start_new_attribute(None);
self.start_new_attribute();
self.reconsume_in_state(State::AttributeName);
}
}
}
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
State::AttributeName => {
let anything_else = |lexer: &mut Lexer<I>, c: char| {
lexer.append_to_attribute(Some((c, c)), None);
lexer.append_name_to_attribute(c, Some(c));
};

// Consume the next input character:
Expand Down Expand Up @@ -2225,14 +2221,14 @@ where
// Append the lowercase version of the current input character (add 0x0020
// to the character's code point) to the current attribute's name.
Some(c) if is_ascii_upper_alpha(c) => {
self.append_to_attribute(Some((c.to_ascii_lowercase(), c)), None);
self.append_name_to_attribute(c.to_ascii_lowercase(), Some(c));
}
// U+0000 NULL
// This is an unexpected-null-character parse error. Append a U+FFFD
// REPLACEMENT CHARACTER character to the current attribute's name.
Some(c @ '\x00') => {
self.emit_error(ErrorKind::UnexpectedNullCharacter);
self.append_to_attribute(Some((REPLACEMENT_CHARACTER, c)), None);
self.append_name_to_attribute(REPLACEMENT_CHARACTER, Some(c));
}
// U+0022 QUOTATION MARK (")
// U+0027 APOSTROPHE (')
Expand Down Expand Up @@ -2304,7 +2300,7 @@ where
// and value to the empty string. Reconsume in the attribute name state.
// We set `value` to `None` to support boolean attributes in the AST
_ => {
self.start_new_attribute(None);
self.start_new_attribute();
self.reconsume_in_state(State::AttributeName);
}
}
Expand All @@ -2324,13 +2320,13 @@ where
// U+0022 QUOTATION MARK (")
// Switch to the attribute value (double-quoted) state.
Some(c @ '"') => {
self.append_to_attribute(None, Some((true, None, Some(c))));
self.append_value_to_attribute(true, None, Some(c));
self.state = State::AttributeValueDoubleQuoted;
}
// U+0027 APOSTROPHE (')
// Switch to the attribute value (single-quoted) state.
Some(c @ '\'') => {
self.append_to_attribute(None, Some((true, None, Some(c))));
self.append_value_to_attribute(true, None, Some(c));
self.state = State::AttributeValueSingleQuoted;
}
// U+003E GREATER-THAN SIGN (>)
Expand All @@ -2356,7 +2352,7 @@ where
// Switch to the after attribute value (quoted) state.
// We set value to support empty attributes (i.e. `attr=""`)
Some(c @ '"') => {
self.append_to_attribute(None, Some((false, None, Some(c))));
self.append_value_to_attribute(false, None, Some(c));
self.state = State::AfterAttributeValueQuoted;
}
// U+0026 AMPERSAND (&)
Expand All @@ -2371,10 +2367,7 @@ where
// REPLACEMENT CHARACTER character to the current attribute's value.
Some(c @ '\x00') => {
self.emit_error(ErrorKind::UnexpectedNullCharacter);
self.append_to_attribute(
None,
Some((false, Some(REPLACEMENT_CHARACTER), Some(c))),
);
self.append_value_to_attribute(false, Some(REPLACEMENT_CHARACTER), Some(c));
}
// EOF
// This is an eof-in-tag parse error. Emit an end-of-file token.
Expand All @@ -2388,7 +2381,7 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_attribute(None, Some((false, Some(c), Some(c))));
self.append_value_to_attribute(false, Some(c), Some(c));
}
}
}
Expand All @@ -2400,7 +2393,7 @@ where
// Switch to the after attribute value (quoted) state.
// We set value to support empty attributes (i.e. `attr=''`)
Some(c @ '\'') => {
self.append_to_attribute(None, Some((false, None, Some(c))));
self.append_value_to_attribute(false, None, Some(c));
self.state = State::AfterAttributeValueQuoted;
}
// U+0026 AMPERSAND (&)
Expand All @@ -2415,10 +2408,7 @@ where
// REPLACEMENT CHARACTER character to the current attribute's value.
Some(c @ '\x00') => {
self.emit_error(ErrorKind::UnexpectedNullCharacter);
self.append_to_attribute(
None,
Some((false, Some(REPLACEMENT_CHARACTER), Some(c))),
);
self.append_value_to_attribute(false, Some(REPLACEMENT_CHARACTER), Some(c));
}
// EOF
// This is an eof-in-tag parse error. Emit an end-of-file token.
Expand All @@ -2432,14 +2422,14 @@ where
// Append the current input character to the current attribute's value.
Some(c) => {
self.validate_input_stream_character(c);
self.append_to_attribute(None, Some((false, Some(c), Some(c))));
self.append_value_to_attribute(false, Some(c), Some(c));
}
}
}
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
State::AttributeValueUnquoted => {
let anything_else = |lexer: &mut Lexer<I>, c: char| {
lexer.append_to_attribute(None, Some((false, Some(c), Some(c))));
lexer.append_value_to_attribute(false, Some(c), Some(c));
};

// Consume the next input character:
Expand Down Expand Up @@ -2473,10 +2463,7 @@ where
// REPLACEMENT CHARACTER character to the current attribute's value.
Some(c @ '\x00') => {
self.emit_error(ErrorKind::UnexpectedNullCharacter);
self.append_to_attribute(
None,
Some((false, Some(REPLACEMENT_CHARACTER), Some(c))),
);
self.append_value_to_attribute(false, Some(REPLACEMENT_CHARACTER), Some(c));
}
// U+0022 QUOTATION MARK (")
// U+0027 APOSTROPHE (')
Expand Down Expand Up @@ -4196,7 +4183,7 @@ where
// Otherwise, emit the current input character as a character token.
Some(c) if c.is_ascii_alphanumeric() => {
if self.is_consumed_as_part_of_an_attribute() {
self.append_to_attribute(None, Some((false, Some(c), Some(c))));
self.append_value_to_attribute(false, Some(c), Some(c));
} else {
self.emit_character_token(c)?;
}
Expand Down
3 changes: 2 additions & 1 deletion crates/swc_html_parser/src/parser/input.rs
@@ -1,5 +1,6 @@
use std::{fmt::Debug, mem::take};

use swc_atoms::JsWord;
use swc_common::{BytePos, Span};
use swc_html_ast::{Token, TokenAndSpan};

Expand All @@ -13,7 +14,7 @@ pub trait ParserInput: Iterator<Item = TokenAndSpan> {

fn take_errors(&mut self) -> Vec<Error>;

fn set_last_start_tag_name(&mut self, tag_name: &str);
fn set_last_start_tag_name(&mut self, tag_name: &JsWord);

fn set_input_state(&mut self, state: State);

Expand Down
2 changes: 1 addition & 1 deletion crates/swc_html_parser/tests/html5lib_tests.rs
Expand Up @@ -146,7 +146,7 @@ fn html5lib_test_tokenizer(input: PathBuf) {
lexer.set_input_state(state.clone());

if let Some(last_start_tag) = test.get("lastStartTag") {
let last_start_tag: String = serde_json::from_value(last_start_tag.clone())
let last_start_tag: JsWord = serde_json::from_value(last_start_tag.clone())
.expect("failed to get lastStartTag in test");

lexer.set_last_start_tag_name(&last_start_tag);
Expand Down

1 comment on commit c255cfd

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmark

Benchmark suite Current: c255cfd Previous: c318446 Ratio
es/full/bugs-1 425656 ns/iter (± 12124) 351098 ns/iter (± 13736) 1.21
es/full/minify/libraries/antd 2112013799 ns/iter (± 23917214) 1902755724 ns/iter (± 30591665) 1.11
es/full/minify/libraries/d3 436986000 ns/iter (± 8385848) 412876150 ns/iter (± 16936844) 1.06
es/full/minify/libraries/echarts 1800964043 ns/iter (± 18143263) 1665869386 ns/iter (± 165102758) 1.08
es/full/minify/libraries/jquery 119725938 ns/iter (± 3836487) 97019617 ns/iter (± 6327132) 1.23
es/full/minify/libraries/lodash 140705740 ns/iter (± 2894877) 142928270 ns/iter (± 28017963) 0.98
es/full/minify/libraries/moment 70106299 ns/iter (± 460609) 61512055 ns/iter (± 2424335) 1.14
es/full/minify/libraries/react 24144169 ns/iter (± 281704) 20458342 ns/iter (± 699566) 1.18
es/full/minify/libraries/terser 330787959 ns/iter (± 13183300) 349326588 ns/iter (± 16923562) 0.95
es/full/minify/libraries/three 604775193 ns/iter (± 11840143) 592443534 ns/iter (± 19889612) 1.02
es/full/minify/libraries/typescript 4009473205 ns/iter (± 30798593) 3640111607 ns/iter (± 775570662) 1.10
es/full/minify/libraries/victory 904122343 ns/iter (± 20983518) 818085181 ns/iter (± 21394040) 1.11
es/full/minify/libraries/vue 178210396 ns/iter (± 4913465) 150694166 ns/iter (± 16114547) 1.18
es/full/codegen/es3 40808 ns/iter (± 486) 32709 ns/iter (± 694) 1.25
es/full/codegen/es5 40599 ns/iter (± 540) 32685 ns/iter (± 551) 1.24
es/full/codegen/es2015 40717 ns/iter (± 990) 32861 ns/iter (± 1124) 1.24
es/full/codegen/es2016 40545 ns/iter (± 712) 32689 ns/iter (± 932) 1.24
es/full/codegen/es2017 40537 ns/iter (± 846) 32759 ns/iter (± 1962) 1.24
es/full/codegen/es2018 40572 ns/iter (± 1096) 32785 ns/iter (± 4431) 1.24
es/full/codegen/es2019 40580 ns/iter (± 1502) 32721 ns/iter (± 595) 1.24
es/full/codegen/es2020 40719 ns/iter (± 864) 33265 ns/iter (± 548) 1.22
es/full/all/es3 232201716 ns/iter (± 4903607) 192896187 ns/iter (± 7174584) 1.20
es/full/all/es5 219331415 ns/iter (± 3412543) 182716767 ns/iter (± 7374127) 1.20
es/full/all/es2015 177321052 ns/iter (± 4010221) 145588591 ns/iter (± 7520312) 1.22
es/full/all/es2016 177739154 ns/iter (± 3677578) 144217344 ns/iter (± 6207969) 1.23
es/full/all/es2017 177918666 ns/iter (± 4961369) 144833095 ns/iter (± 10954996) 1.23
es/full/all/es2018 175457004 ns/iter (± 4340355) 142447115 ns/iter (± 4386895) 1.23
es/full/all/es2019 176206418 ns/iter (± 4462392) 141718989 ns/iter (± 4894981) 1.24
es/full/all/es2020 166514582 ns/iter (± 4176959) 136371626 ns/iter (± 4117309) 1.22
es/full/parser 882187 ns/iter (± 48365) 723572 ns/iter (± 34702) 1.22
es/full/base/fixer 32344 ns/iter (± 3300) 25784 ns/iter (± 398) 1.25
es/full/base/resolver_and_hygiene 114658 ns/iter (± 4395) 91475 ns/iter (± 3030) 1.25
serialization of ast node 259 ns/iter (± 3) 216 ns/iter (± 4) 1.20
serialization of serde 263 ns/iter (± 22) 217 ns/iter (± 5) 1.21

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.