Skip to content

Commit

Permalink
Bump entities to 4.1.0 (#1146)
Browse files Browse the repository at this point in the history
  • Loading branch information
fb55 committed Apr 2, 2022
1 parent 9fcd5e3 commit 1d3c55f
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 23 deletions.
14 changes: 7 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Expand Up @@ -51,7 +51,7 @@
"domelementtype": "^2.0.1",
"domhandler": "^4.3.1",
"domutils": "^2.8.0",
"entities": "^3.0.1"
"entities": "^4.1.0"
},
"devDependencies": {
"@types/jest": "^27.4.1",
Expand Down
6 changes: 5 additions & 1 deletion src/Parser.ts
Expand Up @@ -252,7 +252,11 @@ export class Parser implements Callbacks {

/** @internal */
ontextentity(cp: number): void {
const idx = this.tokenizer.getIndex();
/*
* Entities can be emitted on the character, or directly after.
* We use the section start here to get accurate indices.
*/
const idx = this.tokenizer.getSectionStart();
this.endIndex = idx - 1;
this.cbs.ontext?.(decodeCodePoint(cp));
this.startIndex = idx;
Expand Down
57 changes: 43 additions & 14 deletions src/Tokenizer.ts
Expand Up @@ -157,7 +157,7 @@ export default class Tokenizer {
/** The read buffer. */
private buffer = "";
/** The beginning of the section that is currently being read. */
public sectionStart = 0;
private sectionStart = 0;
/** The index within the buffer that we are currently looking at. */
private index = 0;
/** Some behavior, e.g. decoding entities, happens while we are in another state. This keeps track of that other state's type. */
Expand Down Expand Up @@ -224,6 +224,13 @@ export default class Tokenizer {
return this.index;
}

/**
* The start of the current section.
*
* @returns The current value of `sectionStart`, i.e. the beginning of the
* section that is currently being read.
*/
public getSectionStart(): number {
return this.sectionStart;
}

private stateText(c: number): void {
if (
c === CharCodes.Lt ||
Expand Down Expand Up @@ -647,12 +654,16 @@ export default class Tokenizer {

this.trieCurrent = this.entityTrie[this.trieIndex];

const masked = this.trieCurrent & BinTrieFlags.VALUE_LENGTH;

// If the branch is a value, store it and continue
if (this.trieCurrent & BinTrieFlags.HAS_VALUE) {
if (masked) {
// The masked bits, shifted right by 14, give the number of bytes of the value, including the current byte.
const valueLength = (masked >> 14) - 1;

// If we have a legacy entity while parsing strictly, just skip the number of bytes
if (!this.allowLegacyEntity() && c !== CharCodes.Semi) {
// No need to consider multi-byte values, as the legacy entity is always a single byte
this.trieIndex += 1;
this.trieIndex += valueLength;
} else {
// Add 1 as we have already incremented the excess
const entityStart = this.index - this.entityExcess + 1;
Expand All @@ -663,20 +674,42 @@ export default class Tokenizer {

// If this is a surrogate pair, consume the next two bytes
this.entityResult = this.trieIndex;
this.trieIndex +=
1 +
Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
this.trieIndex += valueLength;
this.entityExcess = 0;
this.sectionStart = this.index + 1;

if (valueLength === 0) {
this.emitNamedEntity();
}
}
}
}

private emitNamedEntity(): void {
if (this.entityResult !== 0) {
if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) {
this.state = this.baseState;

if (this.entityResult === 0) {
return;
}

const valueLength =
(this.entityTrie[this.entityResult] & BinTrieFlags.VALUE_LENGTH) >>
14;

switch (valueLength) {
case 1:
this.emitCodePoint(
this.entityTrie[this.entityResult] &
~BinTrieFlags.VALUE_LENGTH
);
break;
case 2:
this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
break;
case 3: {
const first = this.entityTrie[this.entityResult + 1];
const second = this.entityTrie[this.entityResult + 2];

// If this is a surrogate pair, combine the code points.
if (first >= 0xd8_00 && first <= 0xdf_ff) {
this.emitCodePoint(
Expand All @@ -687,12 +720,8 @@ export default class Tokenizer {
this.emitCodePoint(first);
this.emitCodePoint(second);
}
} else {
this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
}
}

this.state = this.baseState;
}

private stateBeforeNumericEntity(c: number): void {
Expand All @@ -716,8 +745,8 @@ export default class Tokenizer {
this.emitPartial(this.sectionStart, entityStart);
}

this.emitCodePoint(this.entityResult);
this.sectionStart = this.index + Number(strict);
this.emitCodePoint(this.entityResult);
}
this.state = this.baseState;
}
Expand Down

0 comments on commit 1d3c55f

Please sign in to comment.