Skip to content

Commit

Permalink
Bump entities to 4.1.1, update code (#486)
Browse files Browse the repository at this point in the history
  • Loading branch information
fb55 committed Apr 3, 2022
1 parent c8e4f8e commit b428b79
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 36 deletions.
18 changes: 10 additions & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions packages/parse5-html-rewriting-stream/lib/index.ts
@@ -1,6 +1,7 @@
import { escapeText, escapeAttribute } from 'entities';
import type { Location } from 'parse5/dist/common/token.js';
import { SAXParser, EndTag, StartTag, Doctype, Text, Comment, SaxToken } from 'parse5-sax-parser';
import { hasUnescapedText, escapeString } from 'parse5/dist/serializer/index.js';
import { hasUnescapedText } from 'parse5/dist/serializer/index.js';

/**
* Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
Expand Down Expand Up @@ -113,7 +114,7 @@ export class RewritingStream extends SAXParser {
let res = `<${token.tagName}`;

for (const attr of token.attrs) {
res += ` ${attr.name}="${escapeString(attr.value, true)}"`;
res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
}

res += token.selfClosing ? '/>' : '>';
Expand All @@ -131,7 +132,7 @@ export class RewritingStream extends SAXParser {
this.push(
!this.parserFeedbackSimulator.inForeignContent && hasUnescapedText(this.tokenizer.lastStartTagName, true)
? text
: escapeString(text, false)
: escapeText(text)
);
}

Expand Down
1 change: 1 addition & 0 deletions packages/parse5-html-rewriting-stream/package.json
Expand Up @@ -20,6 +20,7 @@
"main": "dist/index.js",
"exports": "dist/index.js",
"dependencies": {
"entities": "^4.1.1",
"parse5": "^6.0.1",
"parse5-sax-parser": "^6.0.1"
},
Expand Down
21 changes: 3 additions & 18 deletions packages/parse5/lib/serializer/index.ts
@@ -1,14 +1,8 @@
import { escapeText, escapeAttribute } from 'entities';
import { TAG_NAMES as $, NAMESPACES as NS } from '../common/html.js';
import type { TreeAdapter, TreeAdapterTypeMap } from '../tree-adapters/interface';
import { defaultTreeAdapter, type DefaultTreeAdapterMap } from '../tree-adapters/default.js';

// Escaping regexes.
// Every pattern carries the global (`g`) flag so that `String.prototype.replace` substitutes
// all occurrences rather than only the first one.
// `escapeString` applies AMP_REGEX and NBSP_REGEX in both modes, DOUBLE_QUOTE_REGEX only in
// attribute mode, and LT_REGEX / GT_REGEX only in text (non-attribute) mode.
const AMP_REGEX = /&/g; // `&` -> `&amp;`
const NBSP_REGEX = /\u00A0/g; // U+00A0 -> `&nbsp;`
const DOUBLE_QUOTE_REGEX = /"/g; // `"` -> `&quot;`
const LT_REGEX = /</g; // `<` -> `&lt;`
const GT_REGEX = />/g; // `>` -> `&gt;`

// Sets
const VOID_ELEMENTS = new Set<string>([
$.AREA,
Expand Down Expand Up @@ -208,7 +202,7 @@ function serializeAttributes<T extends TreeAdapterTypeMap>(
}
}

html += `="${escapeString(attr.value, true)}"`;
html += `="${escapeAttribute(attr.value)}"`;
}

return html;
Expand All @@ -224,7 +218,7 @@ function serializeTextNode<T extends TreeAdapterTypeMap>(node: T['textNode'], op
treeAdapter.getNamespaceURI(parent) === NS.HTML &&
hasUnescapedText(parentTn, options.scriptingEnabled)
? content
: escapeString(content, false);
: escapeText(content);
}

function serializeCommentNode<T extends TreeAdapterTypeMap>(
Expand All @@ -240,12 +234,3 @@ function serializeDocumentTypeNode<T extends TreeAdapterTypeMap>(
): string {
return `<!DOCTYPE ${treeAdapter.getDocumentTypeNodeName(node)}>`;
}

/**
 * Escapes a string so it can be embedded in serialized HTML.
 *
 * `&` and U+00A0 are always replaced (by `&amp;` and `&nbsp;`). In attribute mode `"` is
 * additionally replaced by `&quot;`; in text mode `<` and `>` are replaced by `&lt;` and `&gt;`.
 *
 * NOTE: used in tests and by rewriting stream
 *
 * @param str Raw string to escape.
 * @param attrMode `true` to escape for a double-quoted attribute value, `false` (default) for text content.
 * @returns The escaped string.
 */
export function escapeString(str: string, attrMode = false): string {
    // Ampersands go first: the later replacements emit `&...;` sequences that must not be re-escaped.
    const base = str.replace(AMP_REGEX, '&amp;').replace(NBSP_REGEX, '&nbsp;');

    if (attrMode) {
        return base.replace(DOUBLE_QUOTE_REGEX, '&quot;');
    }

    return base.replace(LT_REGEX, '&lt;').replace(GT_REGEX, '&gt;');
}
25 changes: 19 additions & 6 deletions packages/parse5/lib/tokenizer/index.ts
Expand Up @@ -616,8 +616,13 @@ export class Tokenizer {

current = htmlDecodeTree[i];

const masked = current & BinTrieFlags.VALUE_LENGTH;

// If the branch is a value, store it and continue
if (current & BinTrieFlags.HAS_VALUE) {
if (masked) {
// The mask is the number of bytes of the value, including the current byte.
const valueLength = (masked >> 14) - 1;

// Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
// See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
if (
Expand All @@ -629,17 +634,25 @@ export class Tokenizer {
//emitting an ampersand. This is fine, as alphanumeric characters won't be parsed differently in attributes.
result = [$.AMPERSAND];

// Skip over the value. No need to consider multi-byte values, as legacy entities are always a single byte.
i += 1;
// Skip over the value.
i += valueLength;
} else {
// If this is a surrogate pair, consume the next two bytes.
result =
current & BinTrieFlags.MULTI_BYTE
? [htmlDecodeTree[++i], htmlDecodeTree[++i]]
: [htmlDecodeTree[++i]];
valueLength === 0
? [htmlDecodeTree[i] & ~BinTrieFlags.VALUE_LENGTH]
: valueLength === 1
? [htmlDecodeTree[++i]]
: [htmlDecodeTree[++i], htmlDecodeTree[++i]];
excess = 0;
withoutSemicolon = cp !== $.SEMICOLON;
}

if (valueLength === 0) {
// If the value is zero-length, we're done.
this._consume();
break;
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion packages/parse5/package.json
Expand Up @@ -8,7 +8,7 @@
"homepage": "https://github.com/inikulin/parse5",
"funding": "https://github.com/inikulin/parse5?sponsor=1",
"dependencies": {
"entities": "^3.0.1"
"entities": "^4.1.1"
},
"keywords": [
"html",
Expand Down

0 comments on commit b428b79

Please sign in to comment.