From b428b7914483a8cef21a197bb2c9bb2aceb4527a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sun, 3 Apr 2022 14:43:40 +0100 Subject: [PATCH] Bump `entities` to `4.1.1`, update code (#486) --- package-lock.json | 18 +++++++------ .../parse5-html-rewriting-stream/lib/index.ts | 7 +++--- .../parse5-html-rewriting-stream/package.json | 1 + packages/parse5/lib/serializer/index.ts | 21 +++------------- packages/parse5/lib/tokenizer/index.ts | 25 ++++++++++++++----- packages/parse5/package.json | 2 +- 6 files changed, 38 insertions(+), 36 deletions(-) diff --git a/package-lock.json b/package-lock.json index 792582f5d..1dc99d2cd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2322,9 +2322,9 @@ "dev": true }, "node_modules/entities": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/entities/-/entities-3.0.1.tgz", - "integrity": "sha512-WiyBqoomrwMdFG1e0kqvASYfnlb0lp8M5o5Fw2OFq1hNZxxcNk8Ik0Xm7LxzBhuidnZB/UtBqVCgUz3kBOP51Q==", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.1.1.tgz", + "integrity": "sha512-AxszXDqnHj5aVzjBpofDDfXX9zC8gugYwJxEYDdA52d6dqoxPKfNDBFxZyIZrkaqUtNy/ip/knBm6mRJed7p1A==", "engines": { "node": ">=0.12" }, @@ -6279,7 +6279,7 @@ "version": "6.0.1", "license": "MIT", "dependencies": { - "entities": "^3.0.1" + "entities": "^4.1.1" }, "funding": { "url": "https://github.com/inikulin/parse5?sponsor=1" @@ -6289,6 +6289,7 @@ "version": "6.0.1", "license": "MIT", "dependencies": { + "entities": "^4.1.1", "parse5": "^6.0.1", "parse5-sax-parser": "^6.0.1" }, @@ -8084,9 +8085,9 @@ "dev": true }, "entities": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/entities/-/entities-3.0.1.tgz", - "integrity": "sha512-WiyBqoomrwMdFG1e0kqvASYfnlb0lp8M5o5Fw2OFq1hNZxxcNk8Ik0Xm7LxzBhuidnZB/UtBqVCgUz3kBOP51Q==" + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.1.1.tgz", + "integrity": 
"sha512-AxszXDqnHj5aVzjBpofDDfXX9zC8gugYwJxEYDdA52d6dqoxPKfNDBFxZyIZrkaqUtNy/ip/knBm6mRJed7p1A==" }, "error-ex": { "version": "1.3.2", @@ -9946,7 +9947,7 @@ "parse5": { "version": "file:packages/parse5", "requires": { - "entities": "^3.0.1" + "entities": "^4.1.1" } }, "parse5-benchmarks": { @@ -9968,6 +9969,7 @@ "parse5-html-rewriting-stream": { "version": "file:packages/parse5-html-rewriting-stream", "requires": { + "entities": "^4.1.1", "parse5": "^6.0.1", "parse5-sax-parser": "^6.0.1" } diff --git a/packages/parse5-html-rewriting-stream/lib/index.ts b/packages/parse5-html-rewriting-stream/lib/index.ts index 40c71027f..ebbb1660d 100644 --- a/packages/parse5-html-rewriting-stream/lib/index.ts +++ b/packages/parse5-html-rewriting-stream/lib/index.ts @@ -1,6 +1,7 @@ +import { escapeText, escapeAttribute } from 'entities'; import type { Location } from 'parse5/dist/common/token.js'; import { SAXParser, EndTag, StartTag, Doctype, Text, Comment, SaxToken } from 'parse5-sax-parser'; -import { hasUnescapedText, escapeString } from 'parse5/dist/serializer/index.js'; +import { hasUnescapedText } from 'parse5/dist/serializer/index.js'; /** * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter. @@ -113,7 +114,7 @@ export class RewritingStream extends SAXParser { let res = `<${token.tagName}`; for (const attr of token.attrs) { - res += ` ${attr.name}="${escapeString(attr.value, true)}"`; + res += ` ${attr.name}="${escapeAttribute(attr.value)}"`; } res += token.selfClosing ? '/>' : '>'; @@ -131,7 +132,7 @@ export class RewritingStream extends SAXParser { this.push( !this.parserFeedbackSimulator.inForeignContent && hasUnescapedText(this.tokenizer.lastStartTagName, true) ? 
text - : escapeString(text, false) + : escapeText(text) ); } diff --git a/packages/parse5-html-rewriting-stream/package.json b/packages/parse5-html-rewriting-stream/package.json index 407726358..28aef2f83 100644 --- a/packages/parse5-html-rewriting-stream/package.json +++ b/packages/parse5-html-rewriting-stream/package.json @@ -20,6 +20,7 @@ "main": "dist/index.js", "exports": "dist/index.js", "dependencies": { + "entities": "^4.1.1", "parse5": "^6.0.1", "parse5-sax-parser": "^6.0.1" }, diff --git a/packages/parse5/lib/serializer/index.ts b/packages/parse5/lib/serializer/index.ts index 44c293456..31d2f3e81 100644 --- a/packages/parse5/lib/serializer/index.ts +++ b/packages/parse5/lib/serializer/index.ts @@ -1,14 +1,8 @@ +import { escapeText, escapeAttribute } from 'entities'; import { TAG_NAMES as $, NAMESPACES as NS } from '../common/html.js'; import type { TreeAdapter, TreeAdapterTypeMap } from '../tree-adapters/interface'; import { defaultTreeAdapter, type DefaultTreeAdapterMap } from '../tree-adapters/default.js'; -//Escaping regexes -const AMP_REGEX = /&/g; -const NBSP_REGEX = /\u00A0/g; -const DOUBLE_QUOTE_REGEX = /"/g; -const LT_REGEX = /</g; -const GT_REGEX = />/g; - // Sets const VOID_ELEMENTS = new Set([ $.AREA, @@ -208,7 +202,7 @@ function serializeAttributes<T extends TreeAdapterTypeMap>( } } - html += `="${escapeString(attr.value, true)}"`; + html += `="${escapeAttribute(attr.value)}"`; } return html; @@ -224,7 +218,7 @@ function serializeTextNode<T extends TreeAdapterTypeMap>(node: T['textNode'], op treeAdapter.getNamespaceURI(parent) === NS.HTML && hasUnescapedText(parentTn, options.scriptingEnabled) ? content - : escapeString(content, false); + : escapeText(content); } function serializeCommentNode<T extends TreeAdapterTypeMap>( @@ -240,12 +234,3 @@ function serializeDocumentTypeNode<T extends TreeAdapterTypeMap>( ): string { return `<!DOCTYPE ${treeAdapter.getDocumentTypeNodeName(node)}>`; } - -// NOTE: used in tests and by rewriting stream -export function escapeString(str: string, attrMode = false): string { - str = str.replace(AMP_REGEX, '&amp;').replace(NBSP_REGEX, '&nbsp;'); - - return attrMode - ? 
str.replace(DOUBLE_QUOTE_REGEX, '&quot;') - : str.replace(LT_REGEX, '&lt;').replace(GT_REGEX, '&gt;'); -} diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index 51bc0873f..bd39df998 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -616,8 +616,13 @@ export class Tokenizer { current = htmlDecodeTree[i]; + const masked = current & BinTrieFlags.VALUE_LENGTH; + // If the branch is a value, store it and continue - if (current & BinTrieFlags.HAS_VALUE) { + if (masked) { + // The mask is the number of bytes of the value, including the current byte. + const valueLength = (masked >> 14) - 1; + // Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error. // See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state if ( @@ -629,17 +634,25 @@ export class Tokenizer { //emitting an ampersand. This is fine, as alphanumeric characters won't be parsed differently in attributes. result = [$.AMPERSAND]; - // Skip over the value. No need to consider multi-byte values, as legacy entities are always a single byte. - i += 1; + // Skip over the value. + i += valueLength; } else { // If this is a surrogate pair, consume the next two bytes. result = - current & BinTrieFlags.MULTI_BYTE - ? [htmlDecodeTree[++i], htmlDecodeTree[++i]] - : [htmlDecodeTree[++i]]; + valueLength === 0 + ? [htmlDecodeTree[i] & ~BinTrieFlags.VALUE_LENGTH] + : valueLength === 1 + ? [htmlDecodeTree[++i]] + : [htmlDecodeTree[++i], htmlDecodeTree[++i]]; excess = 0; withoutSemicolon = cp !== $.SEMICOLON; } + + if (valueLength === 0) { + // If the value is zero-length, we're done. 
+ this._consume(); + break; + } } } diff --git a/packages/parse5/package.json b/packages/parse5/package.json index 8a63533d1..c70dde086 100644 --- a/packages/parse5/package.json +++ b/packages/parse5/package.json @@ -8,7 +8,7 @@ "homepage": "https://github.com/inikulin/parse5", "funding": "https://github.com/inikulin/parse5?sponsor=1", "dependencies": { - "entities": "^3.0.1" + "entities": "^4.1.1" }, "keywords": [ "html",