inikulin · fb55 · Apr 3, 2022 · Apr 1, 2022 · Apr 3, 2022
diff --git a/package-lock.json b/package-lock.json
diff --git a/packages/parse5-html-rewriting-stream/lib/index.ts b/packages/parse5-html-rewriting-stream/lib/index.ts
@@ -1,6 +1,7 @@
+import { escapeText, escapeAttribute } from 'entities';
 import type { Location } from 'parse5/dist/common/token.js';
 import { SAXParser, EndTag, StartTag, Doctype, Text, Comment, SaxToken } from 'parse5-sax-parser';
-import { hasUnescapedText, escapeString } from 'parse5/dist/serializer/index.js';
+import { hasUnescapedText } from 'parse5/dist/serializer/index.js';
 
 /**
  * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
@@ -113,7 +114,7 @@ export class RewritingStream extends SAXParser {
         let res = `<${token.tagName}`;
 
         for (const attr of token.attrs) {
-            res += ` ${attr.name}="${escapeString(attr.value, true)}"`;
+            res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
         }
 
         res += token.selfClosing ? '/>' : '>';
@@ -131,7 +132,7 @@ export class RewritingStream extends SAXParser {
         this.push(
             !this.parserFeedbackSimulator.inForeignContent && hasUnescapedText(this.tokenizer.lastStartTagName, true)
                 ? text
-                : escapeString(text, false)
+                : escapeText(text)
         );
     }
 

diff --git a/packages/parse5-html-rewriting-stream/package.json b/packages/parse5-html-rewriting-stream/package.json
@@ -20,6 +20,7 @@
     "main": "dist/index.js",
     "exports": "dist/index.js",
     "dependencies": {
+        "entities": "^4.1.1",
         "parse5": "^6.0.1",
         "parse5-sax-parser": "^6.0.1"
     },

diff --git a/packages/parse5/lib/serializer/index.ts b/packages/parse5/lib/serializer/index.ts
@@ -1,14 +1,8 @@
+import { escapeText, escapeAttribute } from 'entities';
 import { TAG_NAMES as $, NAMESPACES as NS } from '../common/html.js';
 import type { TreeAdapter, TreeAdapterTypeMap } from '../tree-adapters/interface';
 import { defaultTreeAdapter, type DefaultTreeAdapterMap } from '../tree-adapters/default.js';
 
-//Escaping regexes
-const AMP_REGEX = /&/g;
-const NBSP_REGEX = /\u00A0/g;
-const DOUBLE_QUOTE_REGEX = /"/g;
-const LT_REGEX = /</g;
-const GT_REGEX = />/g;
-
 // Sets
 const VOID_ELEMENTS = new Set<string>([
     $.AREA,
@@ -208,7 +202,7 @@ function serializeAttributes<T extends TreeAdapterTypeMap>(
                 }
             }
 
-        html += `="${escapeString(attr.value, true)}"`;
+        html += `="${escapeAttribute(attr.value)}"`;
     }
 
     return html;
@@ -224,7 +218,7 @@ function serializeTextNode<T extends TreeAdapterTypeMap>(node: T['textNode'], op
         treeAdapter.getNamespaceURI(parent) === NS.HTML &&
         hasUnescapedText(parentTn, options.scriptingEnabled)
         ? content
-        : escapeString(content, false);
+        : escapeText(content);
 }
 
 function serializeCommentNode<T extends TreeAdapterTypeMap>(
@@ -240,12 +234,3 @@ function serializeDocumentTypeNode<T extends TreeAdapterTypeMap>(
 ): string {
     return `<!DOCTYPE ${treeAdapter.getDocumentTypeNodeName(node)}>`;
 }
-
-// NOTE: used in tests and by rewriting stream
-export function escapeString(str: string, attrMode = false): string {
-    str = str.replace(AMP_REGEX, '&amp;').replace(NBSP_REGEX, '&nbsp;');
-
-    return attrMode
-        ? str.replace(DOUBLE_QUOTE_REGEX, '&quot;')
-        : str.replace(LT_REGEX, '&lt;').replace(GT_REGEX, '&gt;');
-}
diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts
@@ -616,8 +616,13 @@ export class Tokenizer {
 
             current = htmlDecodeTree[i];
 
+            const masked = current & BinTrieFlags.VALUE_LENGTH;
+
             // If the branch is a value, store it and continue
-            if (current & BinTrieFlags.HAS_VALUE) {
+            if (masked) {
+                // The masked bits, shifted down by 14, count the tree entries the value occupies (including the current one); `valueLength` is how many extra entries follow.
+                const valueLength = (masked >> 14) - 1;
+
                 // Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
                 // See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
                 if (
@@ -629,17 +634,25 @@ export class Tokenizer {
                     //emitting an ampersand. This is fine, as alphanumeric characters won't be parsed differently in attributes.
                     result = [$.AMPERSAND];
 
-                    // Skip over the value. No need to consider multi-byte values, as legacy entities are always a single byte.
-                    i += 1;
+                    // Skip over the value.
+                    i += valueLength;
                 } else {
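-                    // If this is a surrogate pair, consume the next two bytes.
+                    // Read the value: inline in the current entry, from the next entry, or from the next two entries (a surrogate pair).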
                     result =
-                        current & BinTrieFlags.MULTI_BYTE
-                            ? [htmlDecodeTree[++i], htmlDecodeTree[++i]]
-                            : [htmlDecodeTree[++i]];
+                        valueLength === 0
+                            ? [htmlDecodeTree[i] & ~BinTrieFlags.VALUE_LENGTH]
+                            : valueLength === 1
+                            ? [htmlDecodeTree[++i]]
+                            : [htmlDecodeTree[++i], htmlDecodeTree[++i]];
                     excess = 0;
                     withoutSemicolon = cp !== $.SEMICOLON;
                 }
+
+                if (valueLength === 0) {
+                    // The value was stored inline in the current entry (no extra entries follow): consume the code point and leave the loop.
+                    this._consume();
+                    break;
+                }
             }
         }
 

diff --git a/packages/parse5/package.json b/packages/parse5/package.json
@@ -8,7 +8,7 @@
     "homepage": "https://github.com/inikulin/parse5",
     "funding": "https://github.com/inikulin/parse5?sponsor=1",
     "dependencies": {
-        "entities": "^3.0.1"
+        "entities": "^4.1.1"
     },
     "keywords": [
         "html",