Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump entities to 4.1.0 #1146

Merged
merged 1 commit into from Apr 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 7 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Expand Up @@ -51,7 +51,7 @@
"domelementtype": "^2.0.1",
"domhandler": "^4.3.1",
"domutils": "^2.8.0",
"entities": "^3.0.1"
"entities": "^4.1.0"
},
"devDependencies": {
"@types/jest": "^27.4.1",
Expand Down
6 changes: 5 additions & 1 deletion src/Parser.ts
Expand Up @@ -252,7 +252,11 @@ export class Parser implements Callbacks {

/** @internal */
ontextentity(cp: number): void {
const idx = this.tokenizer.getIndex();
/*
* Entities can be emitted on the character, or directly after.
* We use the section start here to get accurate indices.
*/
const idx = this.tokenizer.getSectionStart();
this.endIndex = idx - 1;
this.cbs.ontext?.(decodeCodePoint(cp));
this.startIndex = idx;
Expand Down
57 changes: 43 additions & 14 deletions src/Tokenizer.ts
Expand Up @@ -157,7 +157,7 @@ export default class Tokenizer {
/** The read buffer. */
private buffer = "";
/** The beginning of the section that is currently being read. */
public sectionStart = 0;
private sectionStart = 0;
/** The index within the buffer that we are currently looking at. */
private index = 0;
/** Some behavior, e.g. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
Expand Down Expand Up @@ -224,6 +224,13 @@ export default class Tokenizer {
return this.index;
}

/**
 * Returns the buffer index at which the section currently being read begins.
 *
 * @returns The current value of the tokenizer's section start.
 */
public getSectionStart(): number {
    const { sectionStart } = this;
    return sectionStart;
}

private stateText(c: number): void {
if (
c === CharCodes.Lt ||
Expand Down Expand Up @@ -647,12 +654,16 @@ export default class Tokenizer {

this.trieCurrent = this.entityTrie[this.trieIndex];

const masked = this.trieCurrent & BinTrieFlags.VALUE_LENGTH;

// If the branch is a value, store it and continue
if (this.trieCurrent & BinTrieFlags.HAS_VALUE) {
if (masked) {
// The mask is the number of bytes of the value, including the current byte.
const valueLength = (masked >> 14) - 1;

// If we have a legacy entity while parsing strictly, just skip the number of bytes
if (!this.allowLegacyEntity() && c !== CharCodes.Semi) {
// No need to consider multi-byte values, as the legacy entity is always a single byte
this.trieIndex += 1;
this.trieIndex += valueLength;
} else {
// Add 1 as we have already incremented the excess
const entityStart = this.index - this.entityExcess + 1;
Expand All @@ -663,20 +674,42 @@ export default class Tokenizer {

// If this is a surrogate pair, consume the next two bytes
this.entityResult = this.trieIndex;
this.trieIndex +=
1 +
Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
this.trieIndex += valueLength;
this.entityExcess = 0;
this.sectionStart = this.index + 1;

if (valueLength === 0) {
this.emitNamedEntity();
}
}
}
}

private emitNamedEntity(): void {
if (this.entityResult !== 0) {
if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) {
this.state = this.baseState;

if (this.entityResult === 0) {
return;
}

const valueLength =
(this.entityTrie[this.entityResult] & BinTrieFlags.VALUE_LENGTH) >>
14;

switch (valueLength) {
case 1:
this.emitCodePoint(
this.entityTrie[this.entityResult] &
~BinTrieFlags.VALUE_LENGTH
);
break;
case 2:
this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
break;
case 3: {
const first = this.entityTrie[this.entityResult + 1];
const second = this.entityTrie[this.entityResult + 2];

// If this is a surrogate pair, combine the code points.
if (first >= 0xd8_00 && first <= 0xdf_ff) {
this.emitCodePoint(
Expand All @@ -687,12 +720,8 @@ export default class Tokenizer {
this.emitCodePoint(first);
this.emitCodePoint(second);
}
} else {
this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
}
}

this.state = this.baseState;
}

private stateBeforeNumericEntity(c: number): void {
Expand All @@ -716,8 +745,8 @@ export default class Tokenizer {
this.emitPartial(this.sectionStart, entityStart);
}

this.emitCodePoint(this.entityResult);
this.sectionStart = this.index + Number(strict);
this.emitCodePoint(this.entityResult);
}
this.state = this.baseState;
}
Expand Down