Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump entities to 4.1.1, update code #486

Merged
merged 2 commits into from Apr 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 10 additions & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions packages/parse5-html-rewriting-stream/lib/index.ts
@@ -1,6 +1,7 @@
import { escapeText, escapeAttribute } from 'entities';
import type { Location } from 'parse5/dist/common/token.js';
import { SAXParser, EndTag, StartTag, Doctype, Text, Comment, SaxToken } from 'parse5-sax-parser';
import { hasUnescapedText, escapeString } from 'parse5/dist/serializer/index.js';
import { hasUnescapedText } from 'parse5/dist/serializer/index.js';

/**
* Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
Expand Down Expand Up @@ -113,7 +114,7 @@ export class RewritingStream extends SAXParser {
let res = `<${token.tagName}`;

for (const attr of token.attrs) {
res += ` ${attr.name}="${escapeString(attr.value, true)}"`;
res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
}

res += token.selfClosing ? '/>' : '>';
Expand All @@ -131,7 +132,7 @@ export class RewritingStream extends SAXParser {
this.push(
!this.parserFeedbackSimulator.inForeignContent && hasUnescapedText(this.tokenizer.lastStartTagName, true)
? text
: escapeString(text, false)
: escapeText(text)
);
}

Expand Down
1 change: 1 addition & 0 deletions packages/parse5-html-rewriting-stream/package.json
Expand Up @@ -20,6 +20,7 @@
"main": "dist/index.js",
"exports": "dist/index.js",
"dependencies": {
"entities": "^4.1.1",
"parse5": "^6.0.1",
"parse5-sax-parser": "^6.0.1"
},
Expand Down
21 changes: 3 additions & 18 deletions packages/parse5/lib/serializer/index.ts
@@ -1,14 +1,8 @@
import { escapeText, escapeAttribute } from 'entities';
import { TAG_NAMES as $, NAMESPACES as NS } from '../common/html.js';
import type { TreeAdapter, TreeAdapterTypeMap } from '../tree-adapters/interface';
import { defaultTreeAdapter, type DefaultTreeAdapterMap } from '../tree-adapters/default.js';

// Escaping regexes: global patterns for the characters that `escapeString` rewrites as entities.
// `&` and U+00A0 (no-break space) are replaced in both modes; `"` only in attribute mode;
// `<` and `>` only in text (non-attribute) mode.
const AMP_REGEX = /&/g;
const NBSP_REGEX = /\u00A0/g;
const DOUBLE_QUOTE_REGEX = /"/g;
const LT_REGEX = /</g;
const GT_REGEX = />/g;

// Sets
const VOID_ELEMENTS = new Set<string>([
$.AREA,
Expand Down Expand Up @@ -208,7 +202,7 @@ function serializeAttributes<T extends TreeAdapterTypeMap>(
}
}

html += `="${escapeString(attr.value, true)}"`;
html += `="${escapeAttribute(attr.value)}"`;
}

return html;
Expand All @@ -224,7 +218,7 @@ function serializeTextNode<T extends TreeAdapterTypeMap>(node: T['textNode'], op
treeAdapter.getNamespaceURI(parent) === NS.HTML &&
hasUnescapedText(parentTn, options.scriptingEnabled)
? content
: escapeString(content, false);
: escapeText(content);
}

function serializeCommentNode<T extends TreeAdapterTypeMap>(
Expand All @@ -240,12 +234,3 @@ function serializeDocumentTypeNode<T extends TreeAdapterTypeMap>(
): string {
return `<!DOCTYPE ${treeAdapter.getDocumentTypeNodeName(node)}>`;
}

/**
 * Escapes a string so it can be embedded in serialized HTML.
 *
 * Ampersands and no-break spaces (U+00A0) are always replaced with `&amp;` and `&nbsp;`.
 * After that, attribute mode additionally replaces double quotes with `&quot;`, while
 * text mode replaces `<` and `>` with `&lt;` and `&gt;`.
 *
 * NOTE: used in tests and by rewriting stream
 *
 * @param str - Raw string to escape.
 * @param attrMode - When `true`, escape for a double-quoted attribute value; otherwise escape as text.
 * @returns The escaped string.
 */
export function escapeString(str: string, attrMode = false): string {
    // `&` must be handled first so the entities produced below are not escaped a second time.
    const withCommonEscapes = str.replace(AMP_REGEX, '&amp;').replace(NBSP_REGEX, '&nbsp;');

    if (attrMode) {
        return withCommonEscapes.replace(DOUBLE_QUOTE_REGEX, '&quot;');
    }

    return withCommonEscapes.replace(LT_REGEX, '&lt;').replace(GT_REGEX, '&gt;');
}
25 changes: 19 additions & 6 deletions packages/parse5/lib/tokenizer/index.ts
Expand Up @@ -616,8 +616,13 @@ export class Tokenizer {

current = htmlDecodeTree[i];

const masked = current & BinTrieFlags.VALUE_LENGTH;

// If the branch is a value, store it and continue
if (current & BinTrieFlags.HAS_VALUE) {
if (masked) {
// The mask is the number of bytes of the value, including the current byte.
const valueLength = (masked >> 14) - 1;

// Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error.
// See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
if (
Expand All @@ -629,17 +634,25 @@ export class Tokenizer {
//emitting an ampersand. This is fine, as alphanumeric characters won't be parsed differently in attributes.
result = [$.AMPERSAND];

// Skip over the value. No need to consider multi-byte values, as legacy entities are always a single byte.
i += 1;
// Skip over the value.
i += valueLength;
} else {
// If this is a surrogate pair, consume the next two bytes.
result =
current & BinTrieFlags.MULTI_BYTE
? [htmlDecodeTree[++i], htmlDecodeTree[++i]]
: [htmlDecodeTree[++i]];
valueLength === 0
? [htmlDecodeTree[i] & ~BinTrieFlags.VALUE_LENGTH]
: valueLength === 1
? [htmlDecodeTree[++i]]
: [htmlDecodeTree[++i], htmlDecodeTree[++i]];
excess = 0;
withoutSemicolon = cp !== $.SEMICOLON;
}

if (valueLength === 0) {
// If the value is zero-length, we're done.
this._consume();
break;
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion packages/parse5/package.json
Expand Up @@ -8,7 +8,7 @@
"homepage": "https://github.com/inikulin/parse5",
"funding": "https://github.com/inikulin/parse5?sponsor=1",
"dependencies": {
"entities": "^3.0.1"
"entities": "^4.1.1"
},
"keywords": [
"html",
Expand Down