From a4428c7a345330fe19c778384ba0f3761749dbd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 19:24:52 +0000
Subject: [PATCH 1/7] refactor: Call tokenizer callbacks with indices
---
src/Parser.ts | 97 +++++++---
src/Tokenizer.ts | 230 ++++++++++++++---------
src/__snapshots__/Tokenizer.spec.ts.snap | 70 ++++---
3 files changed, 253 insertions(+), 144 deletions(-)
diff --git a/src/Parser.ts b/src/Parser.ts
index cccd33f9e..429fd517a 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -1,4 +1,5 @@
-import Tokenizer from "./Tokenizer";
+import Tokenizer, { Callbacks, QuoteType } from "./Tokenizer";
+import decodeCodePoint from "entities/lib/decode_codepoint";
const formTags = new Set([
"input",
@@ -195,7 +196,7 @@ export interface Handler {
const reNameEnd = /\s|\//;
-export class Parser {
+export class Parser implements Callbacks {
/** The start index of the last event. */
public startIndex = 0;
/** The end index of the last event. */
@@ -235,20 +236,31 @@ export class Parser {
// Tokenizer event handlers
/** @internal */
- ontext(data: string): void {
- const idx = this.tokenizer.getAbsoluteIndex();
+ ontext(start: number, length: number): void {
+ const data = this.getSubstr(start, length);
+ const idx = start + length;
this.endIndex = idx - 1;
this.cbs.ontext?.(data);
this.startIndex = idx;
}
+ /** @internal */
+ ontextentity(cp: number): void {
+ const idx = this.tokenizer.getIndex();
+ this.endIndex = idx - 1;
+ this.cbs.ontext?.(decodeCodePoint(cp));
+ this.startIndex = idx;
+ }
+
protected isVoidElement(name: string): boolean {
return !this.options.xmlMode && voidElements.has(name);
}
/** @internal */
- onopentagname(name: string): void {
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ onopentagname(start: number, length: number): void {
+ this.endIndex = this.tokenizer.getIndex();
+
+ let name = this.getSubstr(start, length);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
@@ -287,7 +299,7 @@ export class Parser {
private endOpenTag(isImplied: boolean) {
this.startIndex = this.openTagStart;
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ this.endIndex = this.tokenizer.getIndex();
if (this.attribs) {
this.cbs.onopentag?.(this.tagname, this.attribs, isImplied);
@@ -309,8 +321,10 @@ export class Parser {
}
/** @internal */
- onclosetag(name: string): void {
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ onclosetag(start: number, length: number): void {
+ this.endIndex = this.tokenizer.getIndex();
+
+ let name = this.getSubstr(start, length);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
@@ -378,25 +392,39 @@ export class Parser {
}
/** @internal */
- onattribname(name: string): void {
- this.startIndex = this.tokenizer.getAbsoluteSectionStart();
+ onattribname(start: number, length: number): void {
+ this.startIndex = start;
+ const name = this.getSubstr(start, length);
- if (this.lowerCaseAttributeNames) {
- name = name.toLowerCase();
- }
- this.attribname = name;
+ this.attribname = this.lowerCaseAttributeNames
+ ? name.toLowerCase()
+ : name;
}
/** @internal */
- onattribdata(value: string): void {
- this.attribvalue += value;
+ onattribdata(start: number, length: number): void {
+ this.attribvalue += this.getSubstr(start, length);
}
/** @internal */
- onattribend(quote: string | undefined | null): void {
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ onattribentity(cp: number): void {
+ this.attribvalue += decodeCodePoint(cp);
+ }
- this.cbs.onattribute?.(this.attribname, this.attribvalue, quote);
+ /** @internal */
+ onattribend(quote: QuoteType): void {
+ this.endIndex = this.tokenizer.getIndex();
+
+ const quoteVal =
+ quote === QuoteType.Double
+ ? '"'
+ : quote === QuoteType.Single
+ ? "'"
+ : quote === QuoteType.NoValue
+ ? undefined
+ : null;
+
+ this.cbs.onattribute?.(this.attribname, this.attribvalue, quoteVal);
if (
this.attribs &&
!Object.prototype.hasOwnProperty.call(this.attribs, this.attribname)
@@ -419,8 +447,9 @@ export class Parser {
}
/** @internal */
- ondeclaration(value: string): void {
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ ondeclaration(start: number, length: number): void {
+ this.endIndex = this.tokenizer.getIndex();
+ const value = this.getSubstr(start, length);
if (this.cbs.onprocessinginstruction) {
const name = this.getInstructionName(value);
@@ -432,8 +461,9 @@ export class Parser {
}
/** @internal */
- onprocessinginstruction(value: string): void {
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ onprocessinginstruction(start: number, length: number): void {
+ this.endIndex = this.tokenizer.getIndex();
+ const value = this.getSubstr(start, length);
if (this.cbs.onprocessinginstruction) {
const name = this.getInstructionName(value);
@@ -445,8 +475,9 @@ export class Parser {
}
/** @internal */
- oncomment(value: string): void {
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ oncomment(start: number, length: number): void {
+ this.endIndex = this.tokenizer.getIndex();
+ const value = this.getSubstr(start, length);
this.cbs.oncomment?.(value);
this.cbs.oncommentend?.();
@@ -456,8 +487,9 @@ export class Parser {
}
/** @internal */
- oncdata(value: string): void {
- this.endIndex = this.tokenizer.getAbsoluteIndex();
+ oncdata(start: number, length: number): void {
+ this.endIndex = this.tokenizer.getIndex();
+ const value = this.getSubstr(start, length);
if (this.options.xmlMode || this.options.recognizeCDATA) {
this.cbs.oncdatastart?.();
@@ -504,6 +536,7 @@ export class Parser {
this.startIndex = 0;
this.endIndex = 0;
this.cbs.onparserinit?.(this);
+ this.buffer = "";
}
/**
@@ -517,12 +550,19 @@ export class Parser {
this.end(data);
}
+ private buffer = "";
+
+ private getSubstr(start: number, length: number) {
+ return this.buffer.substr(start, length);
+ }
+
/**
* Parses a chunk of data and calls the corresponding callbacks.
*
* @param chunk Chunk to parse.
*/
public write(chunk: string): void {
+ this.buffer += chunk;
this.tokenizer.write(chunk);
}
@@ -532,6 +572,7 @@ export class Parser {
* @param chunk Optional final chunk to parse.
*/
public end(chunk?: string): void {
+ if (chunk) this.buffer += chunk;
this.tokenizer.end(chunk);
}
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 6b5182f9d..a597439df 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -1,4 +1,3 @@
-import decodeCodePoint from "entities/lib/decode_codepoint";
import {
htmlDecodeTree,
xmlDecodeTree,
@@ -105,21 +104,30 @@ function isASCIIAlpha(c: number): boolean {
);
}
+export enum QuoteType {
+ NoValue = 0,
+ Unquoted = 1,
+ Single = 2,
+ Double = 3,
+}
+
export interface Callbacks {
- onattribdata(value: string): void;
- onattribend(quote: string | undefined | null): void;
- onattribname(name: string): void;
- oncdata(data: string): void;
- onclosetag(name: string): void;
- oncomment(data: string): void;
- ondeclaration(content: string): void;
+ onattribdata(start: number, length: number): void;
+ onattribentity(codepoint: number): void;
+ onattribend(quote: QuoteType): void;
+ onattribname(start: number, length: number): void;
+ oncdata(start: number, length: number): void;
+ onclosetag(start: number, length: number): void;
+ oncomment(start: number, length: number): void;
+ ondeclaration(start: number, length: number): void;
onend(): void;
onerror(error: Error, state?: State): void;
onopentagend(): void;
- onopentagname(name: string): void;
- onprocessinginstruction(instruction: string): void;
+ onopentagname(start: number, length: number): void;
+ onprocessinginstruction(start: number, length: number): void;
onselfclosingtag(): void;
- ontext(value: string): void;
+ ontext(start: number, length: number): void;
+ ontextentity(codepoint: number): void;
}
/**
@@ -146,11 +154,6 @@ export default class Tokenizer {
public sectionStart = 0;
/** The index within the buffer that we are currently looking at. */
private _index = 0;
- /**
- * Data that has already been processed will be removed from the buffer occasionally.
- * `_bufferOffset` keeps track of how many characters have been removed, to make sure position information is accurate.
- */
- private bufferOffset = 0;
/** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
private baseState = State.Text;
/** For special parsing behavior inside of script and style tags. */
@@ -181,7 +184,6 @@ export default class Tokenizer {
this.buffer = "";
this.sectionStart = 0;
this._index = 0;
- this.bufferOffset = 0;
this.baseState = State.Text;
this.currentSequence = undefined!;
this.running = true;
@@ -215,18 +217,11 @@ export default class Tokenizer {
}
}
- /**
- * The start of the current section.
- */
- public getAbsoluteSectionStart(): number {
- return this.sectionStart + this.bufferOffset;
- }
-
/**
* The current index within all of the written data.
*/
- public getAbsoluteIndex(): number {
- return this.bufferOffset + this._index;
+ public getIndex(): number {
+ return this._index;
}
private stateText(c: number) {
@@ -235,7 +230,10 @@ export default class Tokenizer {
(!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
) {
if (this._index > this.sectionStart) {
- this.cbs.ontext(this.getSection());
+ this.cbs.ontext(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
}
this._state = State.BeforeTagName;
this.sectionStart = this._index;
@@ -276,7 +274,10 @@ export default class Tokenizer {
// Spoof the index so that reported locations match up.
const actualIndex = this._index;
this._index = endOfText;
- this.cbs.ontext(this.getSection());
+ this.cbs.ontext(
+ this.sectionStart,
+ endOfText - this.sectionStart
+ );
this._index = actualIndex;
}
@@ -358,15 +359,12 @@ export default class Tokenizer {
if (c === this.currentSequence[this.sequenceIndex]) {
if (++this.sequenceIndex === this.currentSequence.length) {
// Remove 2 trailing chars
- const section = this.buffer.slice(
- this.sectionStart,
- this._index - 2
- );
+ const length = this._index - 2 - this.sectionStart;
if (this.currentSequence === Sequences.CdataEnd) {
- this.cbs.oncdata(section);
+ this.cbs.oncdata(this.sectionStart, length);
} else {
- this.cbs.oncomment(section);
+ this.cbs.oncomment(this.sectionStart, length);
}
this.sequenceIndex = 0;
@@ -428,7 +426,10 @@ export default class Tokenizer {
}
private stateInTagName(c: number) {
if (isEndOfTagSection(c)) {
- this.cbs.onopentagname(this.getSection());
+ this.cbs.onopentagname(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this.sectionStart = -1;
this._state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
@@ -448,7 +449,10 @@ export default class Tokenizer {
}
private stateInClosingTagName(c: number) {
if (c === CharCodes.Gt || isWhitespace(c)) {
- this.cbs.onclosetag(this.getSection());
+ this.cbs.onclosetag(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this.sectionStart = -1;
this._state = State.AfterClosingTagName;
this.stateAfterClosingTagName(c);
@@ -493,7 +497,10 @@ export default class Tokenizer {
}
private stateInAttributeName(c: number) {
if (c === CharCodes.Eq || isEndOfTagSection(c)) {
- this.cbs.onattribname(this.getSection());
+ this.cbs.onattribname(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this.sectionStart = -1;
this._state = State.AfterAttributeName;
this.stateAfterAttributeName(c);
@@ -503,11 +510,11 @@ export default class Tokenizer {
if (c === CharCodes.Eq) {
this._state = State.BeforeAttributeValue;
} else if (c === CharCodes.Slash || c === CharCodes.Gt) {
- this.cbs.onattribend(undefined);
+ this.cbs.onattribend(QuoteType.NoValue);
this._state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (!isWhitespace(c)) {
- this.cbs.onattribend(undefined);
+ this.cbs.onattribend(QuoteType.NoValue);
this._state = State.InAttributeName;
this.sectionStart = this._index;
}
@@ -530,9 +537,16 @@ export default class Tokenizer {
c === quote ||
(!this.decodeEntities && this.fastForwardTo(quote))
) {
- this.cbs.onattribdata(this.getSection());
+ this.cbs.onattribdata(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this.sectionStart = -1;
- this.cbs.onattribend(String.fromCharCode(quote));
+ this.cbs.onattribend(
+ quote === CharCodes.DoubleQuote
+ ? QuoteType.Double
+ : QuoteType.Single
+ );
this._state = State.BeforeAttributeName;
} else if (this.decodeEntities && c === CharCodes.Amp) {
this.baseState = this._state;
@@ -547,9 +561,12 @@ export default class Tokenizer {
}
private stateInAttributeValueNoQuotes(c: number) {
if (isWhitespace(c) || c === CharCodes.Gt) {
- this.cbs.onattribdata(this.getSection());
+ this.cbs.onattribdata(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this.sectionStart = -1;
- this.cbs.onattribend(null);
+ this.cbs.onattribend(QuoteType.Unquoted);
this._state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (this.decodeEntities && c === CharCodes.Amp) {
@@ -570,14 +587,20 @@ export default class Tokenizer {
}
private stateInDeclaration(c: number) {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.ondeclaration(this.getSection());
+ this.cbs.ondeclaration(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this._state = State.Text;
this.sectionStart = this._index + 1;
}
}
private stateInProcessingInstruction(c: number) {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.onprocessinginstruction(this.getSection());
+ this.cbs.onprocessinginstruction(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this._state = State.Text;
this.sectionStart = this._index + 1;
}
@@ -595,7 +618,10 @@ export default class Tokenizer {
}
private stateInSpecialComment(c: number) {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.oncomment(this.getSection());
+ this.cbs.oncomment(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
this._state = State.Text;
this.sectionStart = this._index + 1;
}
@@ -614,7 +640,7 @@ export default class Tokenizer {
private trieIndex = 0;
private trieCurrent = 0;
- private trieResult: string | null = null;
+ private trieResult = 0;
private entityExcess = 0;
private stateBeforeEntity(c: number) {
@@ -628,7 +654,7 @@ export default class Tokenizer {
} else {
this.trieIndex = 0;
this.trieCurrent = this.entityTrie[0];
- this.trieResult = null;
+ this.trieResult = 0;
this._state = State.InNamedEntity;
this.stateInNamedEntity(c);
}
@@ -664,20 +690,16 @@ export default class Tokenizer {
if (entityStart > this.sectionStart) {
this.emitPartial(
- this.buffer.substring(this.sectionStart, entityStart)
+ this.sectionStart,
+ entityStart - this.sectionStart
);
}
// If this is a surrogate pair, consume the next two bytes
- this.trieResult =
- this.trieCurrent & BinTrieFlags.MULTI_BYTE
- ? String.fromCharCode(
- this.entityTrie[++this.trieIndex],
- this.entityTrie[++this.trieIndex]
- )
- : String.fromCharCode(
- this.entityTrie[++this.trieIndex]
- );
+ this.trieResult = this.trieIndex;
+ this.trieIndex +=
+ 1 +
+ Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
this.entityExcess = 0;
this.sectionStart = this._index + 1;
}
@@ -685,8 +707,23 @@ export default class Tokenizer {
}
private emitNamedEntity() {
- if (this.trieResult) {
- this.emitPartial(this.trieResult);
+ if (this.trieResult !== 0) {
+ if (this.entityTrie[this.trieResult] & BinTrieFlags.MULTI_BYTE) {
+ const first = this.entityTrie[this.trieResult + 1];
+ const second = this.entityTrie[this.trieResult + 2];
+ // If this is a surrogate pair, combine the code points.
+ if (first >= 0xd8_00 && first <= 0xdf_ff) {
+ this.emitCodePoint(
+ // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
+ (first - 0xd8_00) * 0x4_00 + second + 0x24_00
+ );
+ } else {
+ this.emitCodePoint(first);
+ this.emitCodePoint(second);
+ }
+ } else {
+ this.emitCodePoint(this.entityTrie[this.trieResult + 1]);
+ }
}
this._state = this.baseState;
@@ -710,14 +747,15 @@ export default class Tokenizer {
// Emit leading data if any
if (entityStart > this.sectionStart) {
this.emitPartial(
- this.buffer.substring(this.sectionStart, entityStart)
+ this.sectionStart,
+ entityStart - this.sectionStart
);
}
// Parse entity
const entity = this.buffer.substring(numberStart, this._index);
const parsed = parseInt(entity, base);
- this.emitPartial(decodeCodePoint(parsed));
+ this.emitCodePoint(parsed);
this.sectionStart = this._index + Number(strict);
}
this._state = this.baseState;
@@ -767,27 +805,28 @@ export default class Tokenizer {
* Remove data that has already been consumed from the buffer.
*/
private cleanup() {
- // If we are inside of text, emit what we already have.
- if (
- this.running &&
- this.sectionStart !== this._index &&
- (this._state === State.Text ||
- (this._state === State.InSpecialTag &&
- this.sequenceIndex === 0))
- ) {
- // TODO: We could emit attribute data here as well.
- this.cbs.ontext(this.buffer.substr(this.sectionStart));
- this.sectionStart = this._index;
- }
-
- const start = this.sectionStart < 0 ? this._index : this.sectionStart;
- this.buffer =
- start === this.buffer.length ? "" : this.buffer.substr(start);
- this._index -= start;
- this.bufferOffset += start;
-
- if (this.sectionStart > 0) {
- this.sectionStart = 0;
+ // If we are inside of text or attributes, emit what we already have.
+ if (this.running && this.sectionStart !== this._index) {
+ if (
+ this._state === State.Text ||
+ (this._state === State.InSpecialTag && this.sequenceIndex === 0)
+ ) {
+ this.cbs.ontext(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
+ this.sectionStart = this._index;
+ } else if (
+ this._state === State.InAttributeValueDq ||
+ this._state === State.InAttributeValueSq ||
+ this._state === State.InAttributeValueNq
+ ) {
+ this.cbs.onattribdata(
+ this.sectionStart,
+ this._index - this.sectionStart
+ );
+ this.sectionStart = this._index;
+ }
}
}
@@ -882,12 +921,12 @@ export default class Tokenizer {
/** Handle any trailing data. */
private handleTrailingData() {
- const data = this.buffer.substr(this.sectionStart);
+ const remaining = this.buffer.length - this.sectionStart;
if (this._state === State.InCommentLike) {
if (this.currentSequence === Sequences.CdataEnd) {
- this.cbs.oncdata(data);
+ this.cbs.oncdata(this.sectionStart, remaining);
} else {
- this.cbs.oncomment(data);
+ this.cbs.oncomment(this.sectionStart, remaining);
}
} else if (
this._state === State.InNumericEntity &&
@@ -917,21 +956,28 @@ export default class Tokenizer {
* respective callback signals that the tag should be ignored.
*/
} else {
- this.cbs.ontext(data);
+ this.cbs.ontext(this.sectionStart, remaining);
}
}
- private getSection(): string {
- return this.buffer.substring(this.sectionStart, this._index);
+ private emitPartial(start: number, length: number) {
+ if (
+ this.baseState !== State.Text &&
+ this.baseState !== State.InSpecialTag
+ ) {
+ this.cbs.onattribdata(start, length);
+ } else {
+ this.cbs.ontext(start, length);
+ }
}
- private emitPartial(value: string) {
+ private emitCodePoint(cp: number) {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
) {
- this.cbs.onattribdata(value);
+ this.cbs.onattribentity(cp);
} else {
- this.cbs.ontext(value);
+ this.cbs.ontextentity(cp);
}
}
}
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 38a60dfff..4951ecd2a 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -3,12 +3,13 @@
exports[`Tokenizer should not lose data when pausing 1`] = `
Array [
Array [
- "ontext",
- "&",
+ "ontextentity",
+ 38,
],
Array [
"ontext",
- " it up!",
+ 5,
+ 7,
],
Array [
"onend",
@@ -20,21 +21,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing scr
Array [
Array [
"onopentagname",
- "script",
+ 1,
+ 6,
],
Array [
"onselfclosingtag",
],
Array [
"onopentagname",
- "div",
+ 11,
+ 3,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "div",
+ 17,
+ 3,
],
Array [
"onend",
@@ -46,21 +50,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing sty
Array [
Array [
"onopentagname",
- "style",
+ 1,
+ 5,
],
Array [
"onselfclosingtag",
],
Array [
"onopentagname",
- "div",
+ 10,
+ 3,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "div",
+ 16,
+ 3,
],
Array [
"onend",
@@ -72,21 +79,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing tit
Array [
Array [
"onopentagname",
- "title",
+ 1,
+ 5,
],
Array [
"onselfclosingtag",
],
Array [
"onopentagname",
- "div",
+ 10,
+ 3,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "div",
+ 16,
+ 3,
],
Array [
"onend",
@@ -98,25 +108,29 @@ exports[`Tokenizer should support standard special tags for normal script tag 1`
Array [
Array [
"onopentagname",
- "script",
+ 1,
+ 6,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "script",
+ 10,
+ 6,
],
Array [
"onopentagname",
- "div",
+ 18,
+ 3,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "div",
+ 24,
+ 3,
],
Array [
"onend",
@@ -128,25 +142,29 @@ exports[`Tokenizer should support standard special tags for normal sitle tag 1`]
Array [
Array [
"onopentagname",
- "title",
+ 1,
+ 5,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "title",
+ 9,
+ 5,
],
Array [
"onopentagname",
- "div",
+ 16,
+ 3,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "div",
+ 22,
+ 3,
],
Array [
"onend",
@@ -158,25 +176,29 @@ exports[`Tokenizer should support standard special tags for normal style tag 1`]
Array [
Array [
"onopentagname",
- "style",
+ 1,
+ 5,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "style",
+ 9,
+ 5,
],
Array [
"onopentagname",
- "div",
+ 16,
+ 3,
],
Array [
"onopentagend",
],
Array [
"onclosetag",
- "div",
+ 22,
+ 3,
],
Array [
"onend",
From e2b23ea2d66daa0e7eb8425ad018e3a51e7217ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 19:28:36 +0000
Subject: [PATCH 2/7] Add return types
---
src/Tokenizer.ts | 74 ++++++++++++++++++++++++------------------------
1 file changed, 37 insertions(+), 37 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index a597439df..61aa6c979 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -224,7 +224,7 @@ export default class Tokenizer {
return this._index;
}
- private stateText(c: number) {
+ private stateText(c: number): void {
if (
c === CharCodes.Lt ||
(!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
@@ -244,7 +244,7 @@ export default class Tokenizer {
private currentSequence!: Uint8Array;
private sequenceIndex = 0;
- private stateSpecialStartSequence(c: number) {
+ private stateSpecialStartSequence(c: number): void {
const isEnd = this.sequenceIndex === this.currentSequence.length;
const isMatch = isEnd
? // If we are at the end of the sequence, make sure the tag name has ended
@@ -265,7 +265,7 @@ export default class Tokenizer {
}
/** Look for an end tag. For <title> tags, also decode entities. */
- private stateInSpecialTag(c: number) {
+ private stateInSpecialTag(c: number): void {
if (this.sequenceIndex === this.currentSequence.length) {
if (c === CharCodes.Gt || isWhitespace(c)) {
const endOfText = this._index - this.currentSequence.length;
@@ -308,7 +308,7 @@ export default class Tokenizer {
}
}
- private stateCDATASequence(c: number) {
+ private stateCDATASequence(c: number): void {
if (c === Sequences.Cdata[this.sequenceIndex]) {
if (++this.sequenceIndex === Sequences.Cdata.length) {
this._state = State.InCommentLike;
@@ -355,7 +355,7 @@ export default class Tokenizer {
* - That character is then repeated, so we have to check multiple repeats.
* - All characters but the start character of the sequence can be skipped.
*/
- private stateInCommentLike(c: number) {
+ private stateInCommentLike(c: number): void {
if (c === this.currentSequence[this.sequenceIndex]) {
if (++this.sequenceIndex === this.currentSequence.length) {
// Remove 2 trailing chars
@@ -399,7 +399,7 @@ export default class Tokenizer {
this._state = State.SpecialStartSequence;
}
- private stateBeforeTagName(c: number) {
+ private stateBeforeTagName(c: number): void {
if (c === CharCodes.ExclamationMark) {
this._state = State.BeforeDeclaration;
this.sectionStart = this._index + 1;
@@ -424,7 +424,7 @@ export default class Tokenizer {
this.stateText(c);
}
}
- private stateInTagName(c: number) {
+ private stateInTagName(c: number): void {
if (isEndOfTagSection(c)) {
this.cbs.onopentagname(
this.sectionStart,
@@ -435,7 +435,7 @@ export default class Tokenizer {
this.stateBeforeAttributeName(c);
}
}
- private stateBeforeClosingTagName(c: number) {
+ private stateBeforeClosingTagName(c: number): void {
if (isWhitespace(c)) {
// Ignore
} else if (c === CharCodes.Gt) {
@@ -447,7 +447,7 @@ export default class Tokenizer {
this.sectionStart = this._index;
}
}
- private stateInClosingTagName(c: number) {
+ private stateInClosingTagName(c: number): void {
if (c === CharCodes.Gt || isWhitespace(c)) {
this.cbs.onclosetag(
this.sectionStart,
@@ -458,14 +458,14 @@ export default class Tokenizer {
this.stateAfterClosingTagName(c);
}
}
- private stateAfterClosingTagName(c: number) {
+ private stateAfterClosingTagName(c: number): void {
// Skip everything until ">"
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this._state = State.Text;
this.sectionStart = this._index + 1;
}
}
- private stateBeforeAttributeName(c: number) {
+ private stateBeforeAttributeName(c: number): void {
if (c === CharCodes.Gt) {
this.cbs.onopentagend();
if (this.isSpecial) {
@@ -483,7 +483,7 @@ export default class Tokenizer {
this.sectionStart = this._index;
}
}
- private stateInSelfClosingTag(c: number) {
+ private stateInSelfClosingTag(c: number): void {
if (c === CharCodes.Gt) {
this.cbs.onselfclosingtag();
this._state = State.Text;
@@ -495,7 +495,7 @@ export default class Tokenizer {
this.stateBeforeAttributeName(c);
}
}
- private stateInAttributeName(c: number) {
+ private stateInAttributeName(c: number): void {
if (c === CharCodes.Eq || isEndOfTagSection(c)) {
this.cbs.onattribname(
this.sectionStart,
@@ -506,7 +506,7 @@ export default class Tokenizer {
this.stateAfterAttributeName(c);
}
}
- private stateAfterAttributeName(c: number) {
+ private stateAfterAttributeName(c: number): void {
if (c === CharCodes.Eq) {
this._state = State.BeforeAttributeValue;
} else if (c === CharCodes.Slash || c === CharCodes.Gt) {
@@ -519,7 +519,7 @@ export default class Tokenizer {
this.sectionStart = this._index;
}
}
- private stateBeforeAttributeValue(c: number) {
+ private stateBeforeAttributeValue(c: number): void {
if (c === CharCodes.DoubleQuote) {
this._state = State.InAttributeValueDq;
this.sectionStart = this._index + 1;
@@ -553,13 +553,13 @@ export default class Tokenizer {
this._state = State.BeforeEntity;
}
}
- private stateInAttributeValueDoubleQuotes(c: number) {
+ private stateInAttributeValueDoubleQuotes(c: number): void {
this.handleInAttributeValue(c, CharCodes.DoubleQuote);
}
- private stateInAttributeValueSingleQuotes(c: number) {
+ private stateInAttributeValueSingleQuotes(c: number): void {
this.handleInAttributeValue(c, CharCodes.SingleQuote);
}
- private stateInAttributeValueNoQuotes(c: number) {
+ private stateInAttributeValueNoQuotes(c: number): void {
if (isWhitespace(c) || c === CharCodes.Gt) {
this.cbs.onattribdata(
this.sectionStart,
@@ -574,7 +574,7 @@ export default class Tokenizer {
this._state = State.BeforeEntity;
}
}
- private stateBeforeDeclaration(c: number) {
+ private stateBeforeDeclaration(c: number): void {
if (c === CharCodes.OpeningSquareBracket) {
this._state = State.CDATASequence;
this.sequenceIndex = 0;
@@ -585,7 +585,7 @@ export default class Tokenizer {
: State.InDeclaration;
}
}
- private stateInDeclaration(c: number) {
+ private stateInDeclaration(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.ondeclaration(
this.sectionStart,
@@ -595,7 +595,7 @@ export default class Tokenizer {
this.sectionStart = this._index + 1;
}
}
- private stateInProcessingInstruction(c: number) {
+ private stateInProcessingInstruction(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.onprocessinginstruction(
this.sectionStart,
@@ -605,7 +605,7 @@ export default class Tokenizer {
this.sectionStart = this._index + 1;
}
}
- private stateBeforeComment(c: number) {
+ private stateBeforeComment(c: number): void {
if (c === CharCodes.Dash) {
this._state = State.InCommentLike;
this.currentSequence = Sequences.CommentEnd;
@@ -616,7 +616,7 @@ export default class Tokenizer {
this._state = State.InDeclaration;
}
}
- private stateInSpecialComment(c: number) {
+ private stateInSpecialComment(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.oncomment(
this.sectionStart,
@@ -626,7 +626,7 @@ export default class Tokenizer {
this.sectionStart = this._index + 1;
}
}
- private stateBeforeSpecialS(c: number) {
+ private stateBeforeSpecialS(c: number): void {
const lower = c | 0x20;
if (lower === Sequences.ScriptEnd[3]) {
this.startSpecial(Sequences.ScriptEnd, 4);
@@ -640,12 +640,13 @@ export default class Tokenizer {
private trieIndex = 0;
private trieCurrent = 0;
- private trieResult = 0;
+ private entityResult = 0;
private entityExcess = 0;
- private stateBeforeEntity(c: number) {
+ private stateBeforeEntity(c: number): void {
// Start excess with 1 to include the '&'
this.entityExcess = 1;
+ this.entityResult = 0;
if (c === CharCodes.Num) {
this._state = State.BeforeNumericEntity;
@@ -654,13 +655,12 @@ export default class Tokenizer {
} else {
this.trieIndex = 0;
this.trieCurrent = this.entityTrie[0];
- this.trieResult = 0;
this._state = State.InNamedEntity;
this.stateInNamedEntity(c);
}
}
- private stateInNamedEntity(c: number) {
+ private stateInNamedEntity(c: number): void {
this.entityExcess += 1;
this.trieIndex = determineBranch(
@@ -696,7 +696,7 @@ export default class Tokenizer {
}
// If this is a surrogate pair, consume the next two bytes
- this.trieResult = this.trieIndex;
+ this.entityResult = this.trieIndex;
this.trieIndex +=
1 +
Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
@@ -707,10 +707,10 @@ export default class Tokenizer {
}
private emitNamedEntity() {
- if (this.trieResult !== 0) {
- if (this.entityTrie[this.trieResult] & BinTrieFlags.MULTI_BYTE) {
- const first = this.entityTrie[this.trieResult + 1];
- const second = this.entityTrie[this.trieResult + 2];
+ if (this.entityResult !== 0) {
+ if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) {
+ const first = this.entityTrie[this.entityResult + 1];
+ const second = this.entityTrie[this.entityResult + 2];
// If this is a surrogate pair, combine the code points.
if (first >= 0xd8_00 && first <= 0xdf_ff) {
this.emitCodePoint(
@@ -722,14 +722,14 @@ export default class Tokenizer {
this.emitCodePoint(second);
}
} else {
- this.emitCodePoint(this.entityTrie[this.trieResult + 1]);
+ this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
}
}
this._state = this.baseState;
}
- private stateBeforeNumericEntity(c: number) {
+ private stateBeforeNumericEntity(c: number): void {
if ((c | 0x20) === CharCodes.LowerX) {
this.entityExcess++;
this._state = State.InHexEntity;
@@ -760,7 +760,7 @@ export default class Tokenizer {
}
this._state = this.baseState;
}
- private stateInNumericEntity(c: number) {
+ private stateInNumericEntity(c: number): void {
if (c === CharCodes.Semi) {
this.decodeNumericEntity(10, true);
} else if (!isNumber(c)) {
@@ -774,7 +774,7 @@ export default class Tokenizer {
this.entityExcess++;
}
}
- private stateInHexEntity(c: number) {
+ private stateInHexEntity(c: number): void {
if (c === CharCodes.Semi) {
this.decodeNumericEntity(16, true);
} else if (
From a5175ae4763ff0e8c28d783d1721356df776fae0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 20:53:11 +0000
Subject: [PATCH 3/7] Pass end indices in several callbacks
Avoids `getIndex` calls
---
src/Parser.ts | 28 +++++++--------
src/Tokenizer.ts | 45 +++++++++---------------
src/__snapshots__/Tokenizer.spec.ts.snap | 42 +++++++++++-----------
3 files changed, 52 insertions(+), 63 deletions(-)
diff --git a/src/Parser.ts b/src/Parser.ts
index 429fd517a..2dcffd0a1 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -257,10 +257,10 @@ export class Parser implements Callbacks {
}
/** @internal */
- onopentagname(start: number, length: number): void {
- this.endIndex = this.tokenizer.getIndex();
+ onopentagname(start: number, endIndex: number): void {
+ this.endIndex = endIndex;
- let name = this.getSubstr(start, length);
+ let name = this.getSubstr(start, endIndex - start);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
@@ -321,10 +321,10 @@ export class Parser implements Callbacks {
}
/** @internal */
- onclosetag(start: number, length: number): void {
- this.endIndex = this.tokenizer.getIndex();
+ onclosetag(start: number, endIndex: number): void {
+ this.endIndex = endIndex;
- let name = this.getSubstr(start, length);
+ let name = this.getSubstr(start, endIndex - start);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
@@ -412,8 +412,8 @@ export class Parser implements Callbacks {
}
/** @internal */
- onattribend(quote: QuoteType): void {
- this.endIndex = this.tokenizer.getIndex();
+ onattribend(quote: QuoteType, endIndex: number): void {
+ this.endIndex = endIndex;
const quoteVal =
quote === QuoteType.Double
@@ -447,9 +447,9 @@ export class Parser implements Callbacks {
}
/** @internal */
- ondeclaration(start: number, length: number): void {
- this.endIndex = this.tokenizer.getIndex();
- const value = this.getSubstr(start, length);
+ ondeclaration(start: number, endIndex: number): void {
+ this.endIndex = endIndex;
+ const value = this.getSubstr(start, endIndex - start);
if (this.cbs.onprocessinginstruction) {
const name = this.getInstructionName(value);
@@ -461,9 +461,9 @@ export class Parser implements Callbacks {
}
/** @internal */
- onprocessinginstruction(start: number, length: number): void {
- this.endIndex = this.tokenizer.getIndex();
- const value = this.getSubstr(start, length);
+ onprocessinginstruction(start: number, endIndex: number): void {
+ this.endIndex = endIndex;
+ const value = this.getSubstr(start, endIndex - start);
if (this.cbs.onprocessinginstruction) {
const name = this.getInstructionName(value);
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 61aa6c979..b0b65b914 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -114,17 +114,17 @@ export enum QuoteType {
export interface Callbacks {
onattribdata(start: number, length: number): void;
onattribentity(codepoint: number): void;
- onattribend(quote: QuoteType): void;
+ onattribend(quote: QuoteType, endIndex: number): void;
onattribname(start: number, length: number): void;
oncdata(start: number, length: number): void;
- onclosetag(start: number, length: number): void;
+ onclosetag(start: number, endIndex: number): void;
oncomment(start: number, length: number): void;
- ondeclaration(start: number, length: number): void;
+ ondeclaration(start: number, endIndex: number): void;
onend(): void;
onerror(error: Error, state?: State): void;
onopentagend(): void;
- onopentagname(start: number, length: number): void;
- onprocessinginstruction(start: number, length: number): void;
+ onopentagname(start: number, endIndex: number): void;
+ onprocessinginstruction(start: number, endIndex: number): void;
onselfclosingtag(): void;
ontext(start: number, length: number): void;
ontextentity(codepoint: number): void;
@@ -426,10 +426,7 @@ export default class Tokenizer {
}
private stateInTagName(c: number): void {
if (isEndOfTagSection(c)) {
- this.cbs.onopentagname(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.onopentagname(this.sectionStart, this._index);
this.sectionStart = -1;
this._state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
@@ -449,10 +446,7 @@ export default class Tokenizer {
}
private stateInClosingTagName(c: number): void {
if (c === CharCodes.Gt || isWhitespace(c)) {
- this.cbs.onclosetag(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.onclosetag(this.sectionStart, this._index);
this.sectionStart = -1;
this._state = State.AfterClosingTagName;
this.stateAfterClosingTagName(c);
@@ -510,11 +504,11 @@ export default class Tokenizer {
if (c === CharCodes.Eq) {
this._state = State.BeforeAttributeValue;
} else if (c === CharCodes.Slash || c === CharCodes.Gt) {
- this.cbs.onattribend(QuoteType.NoValue);
+ this.cbs.onattribend(QuoteType.NoValue, this._index);
this._state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (!isWhitespace(c)) {
- this.cbs.onattribend(QuoteType.NoValue);
+ this.cbs.onattribend(QuoteType.NoValue, this._index);
this._state = State.InAttributeName;
this.sectionStart = this._index;
}
@@ -545,7 +539,8 @@ export default class Tokenizer {
this.cbs.onattribend(
quote === CharCodes.DoubleQuote
? QuoteType.Double
- : QuoteType.Single
+ : QuoteType.Single,
+ this._index
);
this._state = State.BeforeAttributeName;
} else if (this.decodeEntities && c === CharCodes.Amp) {
@@ -566,7 +561,7 @@ export default class Tokenizer {
this._index - this.sectionStart
);
this.sectionStart = -1;
- this.cbs.onattribend(QuoteType.Unquoted);
+ this.cbs.onattribend(QuoteType.Unquoted, this._index);
this._state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (this.decodeEntities && c === CharCodes.Amp) {
@@ -587,20 +582,14 @@ export default class Tokenizer {
}
private stateInDeclaration(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.ondeclaration(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.ondeclaration(this.sectionStart, this._index);
this._state = State.Text;
this.sectionStart = this._index + 1;
}
}
private stateInProcessingInstruction(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.onprocessinginstruction(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.onprocessinginstruction(this.sectionStart, this._index);
this._state = State.Text;
this.sectionStart = this._index + 1;
}
@@ -706,7 +695,7 @@ export default class Tokenizer {
}
}
- private emitNamedEntity() {
+ private emitNamedEntity(): void {
if (this.entityResult !== 0) {
if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) {
const first = this.entityTrie[this.entityResult + 1];
@@ -960,7 +949,7 @@ export default class Tokenizer {
}
}
- private emitPartial(start: number, length: number) {
+ private emitPartial(start: number, length: number): void {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
@@ -970,7 +959,7 @@ export default class Tokenizer {
this.cbs.ontext(start, length);
}
}
- private emitCodePoint(cp: number) {
+ private emitCodePoint(cp: number): void {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 4951ecd2a..5f641da6b 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -22,7 +22,7 @@ Array [
Array [
"onopentagname",
1,
- 6,
+ 7,
],
Array [
"onselfclosingtag",
@@ -30,7 +30,7 @@ Array [
Array [
"onopentagname",
11,
- 3,
+ 14,
],
Array [
"onopentagend",
@@ -38,7 +38,7 @@ Array [
Array [
"onclosetag",
17,
- 3,
+ 20,
],
Array [
"onend",
@@ -51,7 +51,7 @@ Array [
Array [
"onopentagname",
1,
- 5,
+ 6,
],
Array [
"onselfclosingtag",
@@ -59,7 +59,7 @@ Array [
Array [
"onopentagname",
10,
- 3,
+ 13,
],
Array [
"onopentagend",
@@ -67,7 +67,7 @@ Array [
Array [
"onclosetag",
16,
- 3,
+ 19,
],
Array [
"onend",
@@ -80,7 +80,7 @@ Array [
Array [
"onopentagname",
1,
- 5,
+ 6,
],
Array [
"onselfclosingtag",
@@ -88,7 +88,7 @@ Array [
Array [
"onopentagname",
10,
- 3,
+ 13,
],
Array [
"onopentagend",
@@ -96,7 +96,7 @@ Array [
Array [
"onclosetag",
16,
- 3,
+ 19,
],
Array [
"onend",
@@ -109,7 +109,7 @@ Array [
Array [
"onopentagname",
1,
- 6,
+ 7,
],
Array [
"onopentagend",
@@ -117,12 +117,12 @@ Array [
Array [
"onclosetag",
10,
- 6,
+ 16,
],
Array [
"onopentagname",
18,
- 3,
+ 21,
],
Array [
"onopentagend",
@@ -130,7 +130,7 @@ Array [
Array [
"onclosetag",
24,
- 3,
+ 27,
],
Array [
"onend",
@@ -143,7 +143,7 @@ Array [
Array [
"onopentagname",
1,
- 5,
+ 6,
],
Array [
"onopentagend",
@@ -151,12 +151,12 @@ Array [
Array [
"onclosetag",
9,
- 5,
+ 14,
],
Array [
"onopentagname",
16,
- 3,
+ 19,
],
Array [
"onopentagend",
@@ -164,7 +164,7 @@ Array [
Array [
"onclosetag",
22,
- 3,
+ 25,
],
Array [
"onend",
@@ -177,7 +177,7 @@ Array [
Array [
"onopentagname",
1,
- 5,
+ 6,
],
Array [
"onopentagend",
@@ -185,12 +185,12 @@ Array [
Array [
"onclosetag",
9,
- 5,
+ 14,
],
Array [
"onopentagname",
16,
- 3,
+ 19,
],
Array [
"onopentagend",
@@ -198,7 +198,7 @@ Array [
Array [
"onclosetag",
22,
- 3,
+ 25,
],
Array [
"onend",
From 397ef7865b9a2406ae093d831ff9c3ba96f05645 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 21:10:15 +0000
Subject: [PATCH 4/7] Decode numeric entities on the go
---
src/Tokenizer.ts | 52 ++++++++++++++++++++++++++++--------------------
1 file changed, 30 insertions(+), 22 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index b0b65b914..52944ee26 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -104,6 +104,13 @@ function isASCIIAlpha(c: number): boolean {
);
}
+function isHexDigit(c: number): boolean {
+ return (
+ (c >= CharCodes.UpperA && c <= CharCodes.UpperF) ||
+ (c >= CharCodes.LowerA && c <= CharCodes.LowerF)
+ );
+}
+
export enum QuoteType {
NoValue = 0,
Unquoted = 1,
@@ -629,6 +636,7 @@ export default class Tokenizer {
private trieIndex = 0;
private trieCurrent = 0;
+ /** For named entities, the index of the value. For numeric entities, the code point. */
private entityResult = 0;
private entityExcess = 0;
@@ -728,9 +736,10 @@ export default class Tokenizer {
}
}
- private decodeNumericEntity(base: 10 | 16, strict: boolean) {
+ private emitNumericEntity(strict: boolean) {
const entityStart = this._index - this.entityExcess - 1;
- const numberStart = entityStart + 2 + (base >> 4);
+ const numberStart =
+ entityStart + 2 + Number(this._state === State.InHexEntity);
if (numberStart !== this._index) {
// Emit leading data if any
@@ -741,44 +750,43 @@ export default class Tokenizer {
);
}
- // Parse entity
- const entity = this.buffer.substring(numberStart, this._index);
- const parsed = parseInt(entity, base);
- this.emitCodePoint(parsed);
+ this.emitCodePoint(this.entityResult);
this.sectionStart = this._index + Number(strict);
}
this._state = this.baseState;
}
private stateInNumericEntity(c: number): void {
if (c === CharCodes.Semi) {
- this.decodeNumericEntity(10, true);
- } else if (!isNumber(c)) {
+ this.emitNumericEntity(true);
+ } else if (isNumber(c)) {
+ this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero);
+ this.entityExcess++;
+ } else {
if (this.allowLegacyEntity()) {
- this.decodeNumericEntity(10, false);
+ this.emitNumericEntity(false);
} else {
this._state = this.baseState;
}
this._index--;
- } else {
- this.entityExcess++;
}
}
private stateInHexEntity(c: number): void {
if (c === CharCodes.Semi) {
- this.decodeNumericEntity(16, true);
- } else if (
- (c < CharCodes.LowerA || c > CharCodes.LowerF) &&
- (c < CharCodes.UpperA || c > CharCodes.UpperF) &&
- !isNumber(c)
- ) {
+ this.emitNumericEntity(true);
+ } else if (isNumber(c)) {
+ this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero);
+ this.entityExcess++;
+ } else if (isHexDigit(c)) {
+ this.entityResult =
+ this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10);
+ this.entityExcess++;
+ } else {
if (this.allowLegacyEntity()) {
- this.decodeNumericEntity(16, false);
+ this.emitNumericEntity(false);
} else {
this._state = this.baseState;
}
this._index--;
- } else {
- this.entityExcess++;
}
}
@@ -921,13 +929,13 @@ export default class Tokenizer {
this._state === State.InNumericEntity &&
this.allowLegacyEntity()
) {
- this.decodeNumericEntity(10, false);
+ this.emitNumericEntity(false);
// All trailing data will have been consumed
} else if (
this._state === State.InHexEntity &&
this.allowLegacyEntity()
) {
- this.decodeNumericEntity(16, false);
+ this.emitNumericEntity(false);
// All trailing data will have been consumed
} else if (
this._state === State.InTagName ||
From f30745478018befe9e83c32f95be1cb8d7a42160 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 21:24:51 +0000
Subject: [PATCH 5/7] Pass `endIndex` for all callbacks
---
src/Parser.ts | 65 +++++++++--------
src/Tokenizer.ts | 88 ++++++++----------------
src/__snapshots__/Tokenizer.spec.ts.snap | 14 +++-
3 files changed, 72 insertions(+), 95 deletions(-)
diff --git a/src/Parser.ts b/src/Parser.ts
index 2dcffd0a1..da4744a2d 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -236,12 +236,11 @@ export class Parser implements Callbacks {
// Tokenizer event handlers
/** @internal */
- ontext(start: number, length: number): void {
- const data = this.getSubstr(start, length);
- const idx = start + length;
- this.endIndex = idx - 1;
+ ontext(start: number, endIndex: number): void {
+ const data = this.getSlice(start, endIndex);
+ this.endIndex = endIndex - 1;
this.cbs.ontext?.(data);
- this.startIndex = idx;
+ this.startIndex = endIndex;
}
/** @internal */
@@ -260,7 +259,7 @@ export class Parser implements Callbacks {
onopentagname(start: number, endIndex: number): void {
this.endIndex = endIndex;
- let name = this.getSubstr(start, endIndex - start);
+ let name = this.getSlice(start, endIndex);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
@@ -299,7 +298,6 @@ export class Parser implements Callbacks {
private endOpenTag(isImplied: boolean) {
this.startIndex = this.openTagStart;
- this.endIndex = this.tokenizer.getIndex();
if (this.attribs) {
this.cbs.onopentag?.(this.tagname, this.attribs, isImplied);
@@ -313,18 +311,19 @@ export class Parser implements Callbacks {
}
/** @internal */
- onopentagend(): void {
+ onopentagend(endIndex: number): void {
+ this.endIndex = endIndex;
this.endOpenTag(false);
// Set `startIndex` for next node
- this.startIndex = this.endIndex + 1;
+ this.startIndex = endIndex + 1;
}
/** @internal */
onclosetag(start: number, endIndex: number): void {
this.endIndex = endIndex;
- let name = this.getSubstr(start, endIndex - start);
+ let name = this.getSlice(start, endIndex);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
@@ -359,11 +358,12 @@ export class Parser implements Callbacks {
}
// Set `startIndex` for next node
- this.startIndex = this.endIndex + 1;
+ this.startIndex = endIndex + 1;
}
/** @internal */
- onselfclosingtag(): void {
+ onselfclosingtag(endIndex: number): void {
+ this.endIndex = endIndex;
if (
this.options.xmlMode ||
this.options.recognizeSelfClosing ||
@@ -372,10 +372,10 @@ export class Parser implements Callbacks {
this.closeCurrentTag(false);
// Set `startIndex` for next node
- this.startIndex = this.endIndex + 1;
+ this.startIndex = endIndex + 1;
} else {
// Ignore the fact that the tag is self-closing.
- this.onopentagend();
+ this.onopentagend(endIndex);
}
}
@@ -392,9 +392,9 @@ export class Parser implements Callbacks {
}
/** @internal */
- onattribname(start: number, length: number): void {
+ onattribname(start: number, endIndex: number): void {
this.startIndex = start;
- const name = this.getSubstr(start, length);
+ const name = this.getSlice(start, endIndex);
this.attribname = this.lowerCaseAttributeNames
? name.toLowerCase()
@@ -402,8 +402,8 @@ export class Parser implements Callbacks {
}
/** @internal */
- onattribdata(start: number, length: number): void {
- this.attribvalue += this.getSubstr(start, length);
+ onattribdata(start: number, endIndex: number): void {
+ this.attribvalue += this.getSlice(start, endIndex);
}
/** @internal */
@@ -449,7 +449,7 @@ export class Parser implements Callbacks {
/** @internal */
ondeclaration(start: number, endIndex: number): void {
this.endIndex = endIndex;
- const value = this.getSubstr(start, endIndex - start);
+ const value = this.getSlice(start, endIndex);
if (this.cbs.onprocessinginstruction) {
const name = this.getInstructionName(value);
@@ -457,13 +457,13 @@ export class Parser implements Callbacks {
}
// Set `startIndex` for next node
- this.startIndex = this.endIndex + 1;
+ this.startIndex = endIndex + 1;
}
/** @internal */
onprocessinginstruction(start: number, endIndex: number): void {
this.endIndex = endIndex;
- const value = this.getSubstr(start, endIndex - start);
+ const value = this.getSlice(start, endIndex);
if (this.cbs.onprocessinginstruction) {
const name = this.getInstructionName(value);
@@ -471,25 +471,24 @@ export class Parser implements Callbacks {
}
// Set `startIndex` for next node
- this.startIndex = this.endIndex + 1;
+ this.startIndex = endIndex + 1;
}
/** @internal */
- oncomment(start: number, length: number): void {
- this.endIndex = this.tokenizer.getIndex();
- const value = this.getSubstr(start, length);
+ oncomment(start: number, endIndex: number, offset: number): void {
+ this.endIndex = endIndex;
- this.cbs.oncomment?.(value);
+ this.cbs.oncomment?.(this.getSlice(start, endIndex - offset));
this.cbs.oncommentend?.();
// Set `startIndex` for next node
- this.startIndex = this.endIndex + 1;
+ this.startIndex = endIndex + 1;
}
/** @internal */
- oncdata(start: number, length: number): void {
- this.endIndex = this.tokenizer.getIndex();
- const value = this.getSubstr(start, length);
+ oncdata(start: number, endIndex: number, offset: number): void {
+ this.endIndex = endIndex;
+ const value = this.getSlice(start, endIndex - offset);
if (this.options.xmlMode || this.options.recognizeCDATA) {
this.cbs.oncdatastart?.();
@@ -501,7 +500,7 @@ export class Parser implements Callbacks {
}
// Set `startIndex` for next node
- this.startIndex = this.endIndex + 1;
+ this.startIndex = endIndex + 1;
}
/** @internal */
@@ -552,8 +551,8 @@ export class Parser implements Callbacks {
private buffer = "";
- private getSubstr(start: number, length: number) {
- return this.buffer.substr(start, length);
+ private getSlice(start: number, end: number) {
+ return this.buffer.slice(start, end);
}
/**
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 52944ee26..901b20dd6 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -119,21 +119,21 @@ export enum QuoteType {
}
export interface Callbacks {
- onattribdata(start: number, length: number): void;
+ onattribdata(start: number, endIndex: number): void;
onattribentity(codepoint: number): void;
onattribend(quote: QuoteType, endIndex: number): void;
- onattribname(start: number, length: number): void;
- oncdata(start: number, length: number): void;
+ onattribname(start: number, endIndex: number): void;
+ oncdata(start: number, endIndex: number, endOffset: number): void;
onclosetag(start: number, endIndex: number): void;
- oncomment(start: number, length: number): void;
+ oncomment(start: number, endIndex: number, endOffset: number): void;
ondeclaration(start: number, endIndex: number): void;
onend(): void;
onerror(error: Error, state?: State): void;
- onopentagend(): void;
+ onopentagend(endIndex: number): void;
onopentagname(start: number, endIndex: number): void;
onprocessinginstruction(start: number, endIndex: number): void;
- onselfclosingtag(): void;
- ontext(start: number, length: number): void;
+ onselfclosingtag(endIndex: number): void;
+ ontext(start: number, endIndex: number): void;
ontextentity(codepoint: number): void;
}
@@ -237,10 +237,7 @@ export default class Tokenizer {
(!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
) {
if (this._index > this.sectionStart) {
- this.cbs.ontext(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.ontext(this.sectionStart, this._index);
}
this._state = State.BeforeTagName;
this.sectionStart = this._index;
@@ -281,10 +278,7 @@ export default class Tokenizer {
// Spoof the index so that reported locations match up.
const actualIndex = this._index;
this._index = endOfText;
- this.cbs.ontext(
- this.sectionStart,
- endOfText - this.sectionStart
- );
+ this.cbs.ontext(this.sectionStart, endOfText);
this._index = actualIndex;
}
@@ -365,13 +359,10 @@ export default class Tokenizer {
private stateInCommentLike(c: number): void {
if (c === this.currentSequence[this.sequenceIndex]) {
if (++this.sequenceIndex === this.currentSequence.length) {
- // Remove 2 trailing chars
- const length = this._index - 2 - this.sectionStart;
-
if (this.currentSequence === Sequences.CdataEnd) {
- this.cbs.oncdata(this.sectionStart, length);
+ this.cbs.oncdata(this.sectionStart, this._index, 2);
} else {
- this.cbs.oncomment(this.sectionStart, length);
+ this.cbs.oncomment(this.sectionStart, this._index, 2);
}
this.sequenceIndex = 0;
@@ -468,7 +459,7 @@ export default class Tokenizer {
}
private stateBeforeAttributeName(c: number): void {
if (c === CharCodes.Gt) {
- this.cbs.onopentagend();
+ this.cbs.onopentagend(this._index);
if (this.isSpecial) {
this._state = State.InSpecialTag;
this.sequenceIndex = 0;
@@ -486,7 +477,7 @@ export default class Tokenizer {
}
private stateInSelfClosingTag(c: number): void {
if (c === CharCodes.Gt) {
- this.cbs.onselfclosingtag();
+ this.cbs.onselfclosingtag(this._index);
this._state = State.Text;
this.baseState = State.Text;
this.sectionStart = this._index + 1;
@@ -498,10 +489,7 @@ export default class Tokenizer {
}
private stateInAttributeName(c: number): void {
if (c === CharCodes.Eq || isEndOfTagSection(c)) {
- this.cbs.onattribname(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.onattribname(this.sectionStart, this._index);
this.sectionStart = -1;
this._state = State.AfterAttributeName;
this.stateAfterAttributeName(c);
@@ -538,10 +526,7 @@ export default class Tokenizer {
c === quote ||
(!this.decodeEntities && this.fastForwardTo(quote))
) {
- this.cbs.onattribdata(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.onattribdata(this.sectionStart, this._index);
this.sectionStart = -1;
this.cbs.onattribend(
quote === CharCodes.DoubleQuote
@@ -563,10 +548,7 @@ export default class Tokenizer {
}
private stateInAttributeValueNoQuotes(c: number): void {
if (isWhitespace(c) || c === CharCodes.Gt) {
- this.cbs.onattribdata(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.onattribdata(this.sectionStart, this._index);
this.sectionStart = -1;
this.cbs.onattribend(QuoteType.Unquoted, this._index);
this._state = State.BeforeAttributeName;
@@ -614,10 +596,7 @@ export default class Tokenizer {
}
private stateInSpecialComment(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.oncomment(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.oncomment(this.sectionStart, this._index, 0);
this._state = State.Text;
this.sectionStart = this._index + 1;
}
@@ -686,10 +665,7 @@ export default class Tokenizer {
const entityStart = this._index - this.entityExcess + 1;
if (entityStart > this.sectionStart) {
- this.emitPartial(
- this.sectionStart,
- entityStart - this.sectionStart
- );
+ this.emitPartial(this.sectionStart, entityStart);
}
// If this is a surrogate pair, consume the next two bytes
@@ -744,10 +720,7 @@ export default class Tokenizer {
if (numberStart !== this._index) {
// Emit leading data if any
if (entityStart > this.sectionStart) {
- this.emitPartial(
- this.sectionStart,
- entityStart - this.sectionStart
- );
+ this.emitPartial(this.sectionStart, entityStart);
}
this.emitCodePoint(this.entityResult);
@@ -808,20 +781,14 @@ export default class Tokenizer {
this._state === State.Text ||
(this._state === State.InSpecialTag && this.sequenceIndex === 0)
) {
- this.cbs.ontext(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.ontext(this.sectionStart, this._index);
this.sectionStart = this._index;
} else if (
this._state === State.InAttributeValueDq ||
this._state === State.InAttributeValueSq ||
this._state === State.InAttributeValueNq
) {
- this.cbs.onattribdata(
- this.sectionStart,
- this._index - this.sectionStart
- );
+ this.cbs.onattribdata(this.sectionStart, this._index);
this.sectionStart = this._index;
}
}
@@ -918,12 +885,11 @@ export default class Tokenizer {
/** Handle any trailing data. */
private handleTrailingData() {
- const remaining = this.buffer.length - this.sectionStart;
if (this._state === State.InCommentLike) {
if (this.currentSequence === Sequences.CdataEnd) {
- this.cbs.oncdata(this.sectionStart, remaining);
+ this.cbs.oncdata(this.sectionStart, this.buffer.length, 0);
} else {
- this.cbs.oncomment(this.sectionStart, remaining);
+ this.cbs.oncomment(this.sectionStart, this.buffer.length, 0);
}
} else if (
this._state === State.InNumericEntity &&
@@ -953,18 +919,18 @@ export default class Tokenizer {
* respective callback signals that the tag should be ignored.
*/
} else {
- this.cbs.ontext(this.sectionStart, remaining);
+ this.cbs.ontext(this.sectionStart, this.buffer.length);
}
}
- private emitPartial(start: number, length: number): void {
+ private emitPartial(start: number, endIndex: number): void {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
) {
- this.cbs.onattribdata(start, length);
+ this.cbs.onattribdata(start, endIndex);
} else {
- this.cbs.ontext(start, length);
+ this.cbs.ontext(start, endIndex);
}
}
private emitCodePoint(cp: number): void {
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 5f641da6b..91d8ecf27 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -9,7 +9,7 @@ Array [
Array [
"ontext",
5,
- 7,
+ 12,
],
Array [
"onend",
@@ -26,6 +26,7 @@ Array [
],
Array [
"onselfclosingtag",
+ 9,
],
Array [
"onopentagname",
@@ -34,6 +35,7 @@ Array [
],
Array [
"onopentagend",
+ 14,
],
Array [
"onclosetag",
@@ -55,6 +57,7 @@ Array [
],
Array [
"onselfclosingtag",
+ 8,
],
Array [
"onopentagname",
@@ -63,6 +66,7 @@ Array [
],
Array [
"onopentagend",
+ 13,
],
Array [
"onclosetag",
@@ -84,6 +88,7 @@ Array [
],
Array [
"onselfclosingtag",
+ 8,
],
Array [
"onopentagname",
@@ -92,6 +97,7 @@ Array [
],
Array [
"onopentagend",
+ 13,
],
Array [
"onclosetag",
@@ -113,6 +119,7 @@ Array [
],
Array [
"onopentagend",
+ 7,
],
Array [
"onclosetag",
@@ -126,6 +133,7 @@ Array [
],
Array [
"onopentagend",
+ 21,
],
Array [
"onclosetag",
@@ -147,6 +155,7 @@ Array [
],
Array [
"onopentagend",
+ 6,
],
Array [
"onclosetag",
@@ -160,6 +169,7 @@ Array [
],
Array [
"onopentagend",
+ 19,
],
Array [
"onclosetag",
@@ -181,6 +191,7 @@ Array [
],
Array [
"onopentagend",
+ 6,
],
Array [
"onclosetag",
@@ -194,6 +205,7 @@ Array [
],
Array [
"onopentagend",
+ 19,
],
Array [
"onclosetag",
From ca09b29d5db8a09cb26a5850f1fde1208414cbd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 22:14:19 +0000
Subject: [PATCH 6/7] Don't concat strings
and only pass a single string at a time to the tokenizer
---
src/Parser.spec.ts | 4 +--
src/Parser.ts | 74 +++++++++++++++++++++++++++++++++++++---------
src/Tokenizer.ts | 40 +++++++++++--------------
3 files changed, 79 insertions(+), 39 deletions(-)
diff --git a/src/Parser.spec.ts b/src/Parser.spec.ts
index e546c5da7..88fb2bde9 100644
--- a/src/Parser.spec.ts
+++ b/src/Parser.spec.ts
@@ -48,11 +48,11 @@ describe("API", () => {
p.resume();
expect(onText).toHaveBeenCalledTimes(1);
p.pause();
- p.end("foo");
+ p.end("bar");
expect(onText).toHaveBeenCalledTimes(1);
p.resume();
expect(onText).toHaveBeenCalledTimes(2);
- expect(onText).toHaveBeenLastCalledWith("foo");
+ expect(onText).toHaveBeenLastCalledWith("bar");
});
test("should back out of numeric entities (#125)", () => {
diff --git a/src/Parser.ts b/src/Parser.ts
index da4744a2d..2339f9e83 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -218,6 +218,13 @@ export class Parser implements Callbacks {
private readonly lowerCaseAttributeNames: boolean;
private readonly tokenizer: Tokenizer;
+ private readonly buffers: string[] = [];
+ private bufferOffset = 0;
+ /** The index of the last written buffer. Used when resuming after a `pause()`. */
+ private writeIndex = 0;
+ /** Indicates whether the parser has finished running / `.end` has been called. */
+ private ended = false;
+
constructor(
cbs?: Partial<Handler> | null,
private readonly options: ParserOptions = {}
@@ -503,11 +510,6 @@ export class Parser implements Callbacks {
this.startIndex = endIndex + 1;
}
- /** @internal */
- onerror(err: Error): void {
- this.cbs.onerror?.(err);
- }
-
/** @internal */
onend(): void {
if (this.cbs.onclosetag) {
@@ -531,11 +533,14 @@ export class Parser implements Callbacks {
this.tagname = "";
this.attribname = "";
this.attribs = null;
- this.stack = [];
+ this.stack.length = 0;
this.startIndex = 0;
this.endIndex = 0;
this.cbs.onparserinit?.(this);
- this.buffer = "";
+ this.buffers.length = 0;
+ this.bufferOffset = 0;
+ this.writeIndex = 0;
+ this.ended = false;
}
/**
@@ -549,10 +554,28 @@ export class Parser implements Callbacks {
this.end(data);
}
- private buffer = "";
-
private getSlice(start: number, end: number) {
- return this.buffer.slice(start, end);
+ while (start - this.bufferOffset >= this.buffers[0].length) {
+ this.shiftBuffer();
+ }
+
+ let str = this.buffers[0].slice(
+ start - this.bufferOffset,
+ end - this.bufferOffset
+ );
+
+ while (end - this.bufferOffset > this.buffers[0].length) {
+ this.shiftBuffer();
+ str += this.buffers[0].slice(0, end - this.bufferOffset);
+ }
+
+ return str;
+ }
+
+ private shiftBuffer(): void {
+ this.bufferOffset += this.buffers[0].length;
+ this.writeIndex--;
+ this.buffers.shift();
}
/**
@@ -561,8 +584,16 @@ export class Parser implements Callbacks {
* @param chunk Chunk to parse.
*/
public write(chunk: string): void {
- this.buffer += chunk;
- this.tokenizer.write(chunk);
+ if (this.ended) {
+ this.cbs.onerror?.(new Error(".write() after done!"));
+ return;
+ }
+
+ this.buffers.push(chunk);
+ if (this.tokenizer.running) {
+ this.tokenizer.write(chunk);
+ this.writeIndex++;
+ }
}
/**
@@ -571,8 +602,14 @@ export class Parser implements Callbacks {
* @param chunk Optional final chunk to parse.
*/
public end(chunk?: string): void {
- if (chunk) this.buffer += chunk;
- this.tokenizer.end(chunk);
+ if (this.ended) {
+ this.cbs.onerror?.(Error(".end() after done!"));
+ return;
+ }
+
+ if (chunk) this.write(chunk);
+ this.ended = true;
+ this.tokenizer.end();
}
/**
@@ -587,6 +624,15 @@ export class Parser implements Callbacks {
*/
public resume(): void {
this.tokenizer.resume();
+
+ while (
+ this.tokenizer.running &&
+ this.writeIndex < this.buffers.length
+ ) {
+ this.tokenizer.write(this.buffers[this.writeIndex++]);
+ }
+
+ if (this.ended) this.tokenizer.end();
}
/**
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 901b20dd6..543b93514 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -128,7 +128,6 @@ export interface Callbacks {
oncomment(start: number, endIndex: number, endOffset: number): void;
ondeclaration(start: number, endIndex: number): void;
onend(): void;
- onerror(error: Error, state?: State): void;
onopentagend(endIndex: number): void;
onopentagname(start: number, endIndex: number): void;
onprocessinginstruction(start: number, endIndex: number): void;
@@ -166,9 +165,9 @@ export default class Tokenizer {
/** For special parsing behavior inside of script and style tags. */
private isSpecial = false;
/** Indicates whether the tokenizer has been paused. */
- private running = true;
- /** Indicates whether the tokenizer has finished running / `.end` has been called. */
- private ended = false;
+ public running = true;
+ /** The offset of the current buffer. */
+ private offset = 0;
private readonly xmlMode: boolean;
private readonly decodeEntities: boolean;
@@ -194,19 +193,16 @@ export default class Tokenizer {
this.baseState = State.Text;
this.currentSequence = undefined!;
this.running = true;
- this.ended = false;
+ this.offset = 0;
}
public write(chunk: string): void {
- if (this.ended) return this.cbs.onerror(Error(".write() after done!"));
- this.buffer += chunk;
+ this.offset += this.buffer.length;
+ this.buffer = chunk;
this.parse();
}
- public end(chunk?: string): void {
- if (this.ended) return this.cbs.onerror(Error(".end() after done!"));
- if (chunk) this.write(chunk);
- this.ended = true;
+ public end(): void {
if (this.running) this.finish();
}
@@ -216,12 +212,9 @@ export default class Tokenizer {
public resume(): void {
this.running = true;
- if (this._index < this.buffer.length) {
+ if (this._index < this.buffer.length + this.offset) {
this.parse();
}
- if (this.ended) {
- this.finish();
- }
}
/**
@@ -331,8 +324,8 @@ export default class Tokenizer {
* @returns Whether the character was found.
*/
private fastForwardTo(c: number): boolean {
- while (++this._index < this.buffer.length) {
- if (this.buffer.charCodeAt(this._index) === c) {
+ while (++this._index < this.buffer.length + this.offset) {
+ if (this.buffer.charCodeAt(this._index - this.offset) === c) {
return true;
}
}
@@ -343,7 +336,7 @@ export default class Tokenizer {
*
* TODO: Refactor `parse` to increment index before calling states.
*/
- this._index = this.buffer.length - 1;
+ this._index = this.buffer.length + this.offset - 1;
return false;
}
@@ -795,7 +788,7 @@ export default class Tokenizer {
}
private shouldContinue() {
- return this._index < this.buffer.length && this.running;
+ return this._index < this.buffer.length + this.offset && this.running;
}
/**
@@ -805,7 +798,7 @@ export default class Tokenizer {
*/
private parse() {
while (this.shouldContinue()) {
- const c = this.buffer.charCodeAt(this._index);
+ const c = this.buffer.charCodeAt(this._index - this.offset);
if (this._state === State.Text) {
this.stateText(c);
} else if (this._state === State.SpecialStartSequence) {
@@ -885,11 +878,12 @@ export default class Tokenizer {
/** Handle any trailing data. */
private handleTrailingData() {
+ const endIndex = this.buffer.length + this.offset;
if (this._state === State.InCommentLike) {
if (this.currentSequence === Sequences.CdataEnd) {
- this.cbs.oncdata(this.sectionStart, this.buffer.length, 0);
+ this.cbs.oncdata(this.sectionStart, endIndex, 0);
} else {
- this.cbs.oncomment(this.sectionStart, this.buffer.length, 0);
+ this.cbs.oncomment(this.sectionStart, endIndex, 0);
}
} else if (
this._state === State.InNumericEntity &&
@@ -919,7 +913,7 @@ export default class Tokenizer {
* respective callback signals that the tag should be ignored.
*/
} else {
- this.cbs.ontext(this.sectionStart, this.buffer.length);
+ this.cbs.ontext(this.sectionStart, endIndex);
}
}
From 09ac7e188db9722d83b6564b518456351b5b54e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 22:16:50 +0000
Subject: [PATCH 7/7] Remove `_` prefix from tokenizer private props
---
src/Tokenizer.ts | 336 +++++++++++++++++++++++------------------------
1 file changed, 168 insertions(+), 168 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 543b93514..93643215c 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -153,13 +153,13 @@ const Sequences = {
export default class Tokenizer {
/** The current state the tokenizer is in. */
- private _state = State.Text;
+ private state = State.Text;
/** The read buffer. */
private buffer = "";
/** The beginning of the section that is currently being read. */
public sectionStart = 0;
/** The index within the buffer that we are currently looking at. */
- private _index = 0;
+ private index = 0;
/** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
private baseState = State.Text;
/** For special parsing behavior inside of script and style tags. */
@@ -186,10 +186,10 @@ export default class Tokenizer {
}
public reset(): void {
- this._state = State.Text;
+ this.state = State.Text;
this.buffer = "";
this.sectionStart = 0;
- this._index = 0;
+ this.index = 0;
this.baseState = State.Text;
this.currentSequence = undefined!;
this.running = true;
@@ -212,7 +212,7 @@ export default class Tokenizer {
public resume(): void {
this.running = true;
- if (this._index < this.buffer.length + this.offset) {
+ if (this.index < this.buffer.length + this.offset) {
this.parse();
}
}
@@ -221,7 +221,7 @@ export default class Tokenizer {
* The current index within all of the written data.
*/
public getIndex(): number {
- return this._index;
+ return this.index;
}
private stateText(c: number): void {
@@ -229,13 +229,13 @@ export default class Tokenizer {
c === CharCodes.Lt ||
(!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
) {
- if (this._index > this.sectionStart) {
- this.cbs.ontext(this.sectionStart, this._index);
+ if (this.index > this.sectionStart) {
+ this.cbs.ontext(this.sectionStart, this.index);
}
- this._state = State.BeforeTagName;
- this.sectionStart = this._index;
+ this.state = State.BeforeTagName;
+ this.sectionStart = this.index;
} else if (this.decodeEntities && c === CharCodes.Amp) {
- this._state = State.BeforeEntity;
+ this.state = State.BeforeEntity;
}
}
@@ -257,7 +257,7 @@ export default class Tokenizer {
}
this.sequenceIndex = 0;
- this._state = State.InTagName;
+ this.state = State.InTagName;
this.stateInTagName(c);
}
@@ -265,14 +265,14 @@ export default class Tokenizer {
private stateInSpecialTag(c: number): void {
if (this.sequenceIndex === this.currentSequence.length) {
if (c === CharCodes.Gt || isWhitespace(c)) {
- const endOfText = this._index - this.currentSequence.length;
+ const endOfText = this.index - this.currentSequence.length;
if (this.sectionStart < endOfText) {
// Spoof the index so that reported locations match up.
- const actualIndex = this._index;
- this._index = endOfText;
+ const actualIndex = this.index;
+ this.index = endOfText;
this.cbs.ontext(this.sectionStart, endOfText);
- this._index = actualIndex;
+ this.index = actualIndex;
}
this.isSpecial = false;
@@ -290,7 +290,7 @@ export default class Tokenizer {
if (this.currentSequence === Sequences.TitleEnd) {
// We have to parse entities in tags.
if (this.decodeEntities && c === CharCodes.Amp) {
- this._state = State.BeforeEntity;
+ this.state = State.BeforeEntity;
}
} else if (this.fastForwardTo(CharCodes.Lt)) {
// Outside of tags, we can fast-forward.
@@ -305,14 +305,14 @@ export default class Tokenizer {
private stateCDATASequence(c: number): void {
if (c === Sequences.Cdata[this.sequenceIndex]) {
if (++this.sequenceIndex === Sequences.Cdata.length) {
- this._state = State.InCommentLike;
+ this.state = State.InCommentLike;
this.currentSequence = Sequences.CdataEnd;
this.sequenceIndex = 0;
- this.sectionStart = this._index + 1;
+ this.sectionStart = this.index + 1;
}
} else {
this.sequenceIndex = 0;
- this._state = State.InDeclaration;
+ this.state = State.InDeclaration;
this.stateInDeclaration(c); // Reconsume the character
}
}
@@ -324,8 +324,8 @@ export default class Tokenizer {
* @returns Whether the character was found.
*/
private fastForwardTo(c: number): boolean {
- while (++this._index < this.buffer.length + this.offset) {
- if (this.buffer.charCodeAt(this._index - this.offset) === c) {
+ while (++this.index < this.buffer.length + this.offset) {
+ if (this.buffer.charCodeAt(this.index - this.offset) === c) {
return true;
}
}
@@ -336,7 +336,7 @@ export default class Tokenizer {
*
* TODO: Refactor `parse` to increment index before calling states.
*/
- this._index = this.buffer.length + this.offset - 1;
+ this.index = this.buffer.length + this.offset - 1;
return false;
}
@@ -353,14 +353,14 @@ export default class Tokenizer {
if (c === this.currentSequence[this.sequenceIndex]) {
if (++this.sequenceIndex === this.currentSequence.length) {
if (this.currentSequence === Sequences.CdataEnd) {
- this.cbs.oncdata(this.sectionStart, this._index, 2);
+ this.cbs.oncdata(this.sectionStart, this.index, 2);
} else {
- this.cbs.oncomment(this.sectionStart, this._index, 2);
+ this.cbs.oncomment(this.sectionStart, this.index, 2);
}
this.sequenceIndex = 0;
- this.sectionStart = this._index + 1;
- this._state = State.Text;
+ this.sectionStart = this.index + 1;
+ this.state = State.Text;
}
} else if (this.sequenceIndex === 0) {
// Fast-forward to the first character of the sequence
@@ -387,39 +387,39 @@ export default class Tokenizer {
this.isSpecial = true;
this.currentSequence = sequence;
this.sequenceIndex = offset;
- this._state = State.SpecialStartSequence;
+ this.state = State.SpecialStartSequence;
}
private stateBeforeTagName(c: number): void {
if (c === CharCodes.ExclamationMark) {
- this._state = State.BeforeDeclaration;
- this.sectionStart = this._index + 1;
+ this.state = State.BeforeDeclaration;
+ this.sectionStart = this.index + 1;
} else if (c === CharCodes.Questionmark) {
- this._state = State.InProcessingInstruction;
- this.sectionStart = this._index + 1;
+ this.state = State.InProcessingInstruction;
+ this.sectionStart = this.index + 1;
} else if (this.isTagStartChar(c)) {
const lower = c | 0x20;
- this.sectionStart = this._index;
+ this.sectionStart = this.index;
if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
this.startSpecial(Sequences.TitleEnd, 3);
} else {
- this._state =
+ this.state =
!this.xmlMode && lower === Sequences.ScriptEnd[2]
? State.BeforeSpecialS
: State.InTagName;
}
} else if (c === CharCodes.Slash) {
- this._state = State.BeforeClosingTagName;
+ this.state = State.BeforeClosingTagName;
} else {
- this._state = State.Text;
+ this.state = State.Text;
this.stateText(c);
}
}
private stateInTagName(c: number): void {
if (isEndOfTagSection(c)) {
- this.cbs.onopentagname(this.sectionStart, this._index);
+ this.cbs.onopentagname(this.sectionStart, this.index);
this.sectionStart = -1;
- this._state = State.BeforeAttributeName;
+ this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
}
}
@@ -427,90 +427,90 @@ export default class Tokenizer {
if (isWhitespace(c)) {
// Ignore
} else if (c === CharCodes.Gt) {
- this._state = State.Text;
+ this.state = State.Text;
} else {
- this._state = this.isTagStartChar(c)
+ this.state = this.isTagStartChar(c)
? State.InClosingTagName
: State.InSpecialComment;
- this.sectionStart = this._index;
+ this.sectionStart = this.index;
}
}
private stateInClosingTagName(c: number): void {
if (c === CharCodes.Gt || isWhitespace(c)) {
- this.cbs.onclosetag(this.sectionStart, this._index);
+ this.cbs.onclosetag(this.sectionStart, this.index);
this.sectionStart = -1;
- this._state = State.AfterClosingTagName;
+ this.state = State.AfterClosingTagName;
this.stateAfterClosingTagName(c);
}
}
private stateAfterClosingTagName(c: number): void {
// Skip everything until ">"
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this._state = State.Text;
- this.sectionStart = this._index + 1;
+ this.state = State.Text;
+ this.sectionStart = this.index + 1;
}
}
private stateBeforeAttributeName(c: number): void {
if (c === CharCodes.Gt) {
- this.cbs.onopentagend(this._index);
+ this.cbs.onopentagend(this.index);
if (this.isSpecial) {
- this._state = State.InSpecialTag;
+ this.state = State.InSpecialTag;
this.sequenceIndex = 0;
} else {
- this._state = State.Text;
+ this.state = State.Text;
}
- this.baseState = this._state;
- this.sectionStart = this._index + 1;
+ this.baseState = this.state;
+ this.sectionStart = this.index + 1;
} else if (c === CharCodes.Slash) {
- this._state = State.InSelfClosingTag;
+ this.state = State.InSelfClosingTag;
} else if (!isWhitespace(c)) {
- this._state = State.InAttributeName;
- this.sectionStart = this._index;
+ this.state = State.InAttributeName;
+ this.sectionStart = this.index;
}
}
private stateInSelfClosingTag(c: number): void {
if (c === CharCodes.Gt) {
- this.cbs.onselfclosingtag(this._index);
- this._state = State.Text;
+ this.cbs.onselfclosingtag(this.index);
+ this.state = State.Text;
this.baseState = State.Text;
- this.sectionStart = this._index + 1;
+ this.sectionStart = this.index + 1;
this.isSpecial = false; // Reset special state, in case of self-closing special tags
} else if (!isWhitespace(c)) {
- this._state = State.BeforeAttributeName;
+ this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
}
}
private stateInAttributeName(c: number): void {
if (c === CharCodes.Eq || isEndOfTagSection(c)) {
- this.cbs.onattribname(this.sectionStart, this._index);
+ this.cbs.onattribname(this.sectionStart, this.index);
this.sectionStart = -1;
- this._state = State.AfterAttributeName;
+ this.state = State.AfterAttributeName;
this.stateAfterAttributeName(c);
}
}
private stateAfterAttributeName(c: number): void {
if (c === CharCodes.Eq) {
- this._state = State.BeforeAttributeValue;
+ this.state = State.BeforeAttributeValue;
} else if (c === CharCodes.Slash || c === CharCodes.Gt) {
- this.cbs.onattribend(QuoteType.NoValue, this._index);
- this._state = State.BeforeAttributeName;
+ this.cbs.onattribend(QuoteType.NoValue, this.index);
+ this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (!isWhitespace(c)) {
- this.cbs.onattribend(QuoteType.NoValue, this._index);
- this._state = State.InAttributeName;
- this.sectionStart = this._index;
+ this.cbs.onattribend(QuoteType.NoValue, this.index);
+ this.state = State.InAttributeName;
+ this.sectionStart = this.index;
}
}
private stateBeforeAttributeValue(c: number): void {
if (c === CharCodes.DoubleQuote) {
- this._state = State.InAttributeValueDq;
- this.sectionStart = this._index + 1;
+ this.state = State.InAttributeValueDq;
+ this.sectionStart = this.index + 1;
} else if (c === CharCodes.SingleQuote) {
- this._state = State.InAttributeValueSq;
- this.sectionStart = this._index + 1;
+ this.state = State.InAttributeValueSq;
+ this.sectionStart = this.index + 1;
} else if (!isWhitespace(c)) {
- this.sectionStart = this._index;
- this._state = State.InAttributeValueNq;
+ this.sectionStart = this.index;
+ this.state = State.InAttributeValueNq;
this.stateInAttributeValueNoQuotes(c); // Reconsume token
}
}
@@ -519,18 +519,18 @@ export default class Tokenizer {
c === quote ||
(!this.decodeEntities && this.fastForwardTo(quote))
) {
- this.cbs.onattribdata(this.sectionStart, this._index);
+ this.cbs.onattribdata(this.sectionStart, this.index);
this.sectionStart = -1;
this.cbs.onattribend(
quote === CharCodes.DoubleQuote
? QuoteType.Double
: QuoteType.Single,
- this._index
+ this.index
);
- this._state = State.BeforeAttributeName;
+ this.state = State.BeforeAttributeName;
} else if (this.decodeEntities && c === CharCodes.Amp) {
- this.baseState = this._state;
- this._state = State.BeforeEntity;
+ this.baseState = this.state;
+ this.state = State.BeforeEntity;
}
}
private stateInAttributeValueDoubleQuotes(c: number): void {
@@ -541,22 +541,22 @@ export default class Tokenizer {
}
private stateInAttributeValueNoQuotes(c: number): void {
if (isWhitespace(c) || c === CharCodes.Gt) {
- this.cbs.onattribdata(this.sectionStart, this._index);
+ this.cbs.onattribdata(this.sectionStart, this.index);
this.sectionStart = -1;
- this.cbs.onattribend(QuoteType.Unquoted, this._index);
- this._state = State.BeforeAttributeName;
+ this.cbs.onattribend(QuoteType.Unquoted, this.index);
+ this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (this.decodeEntities && c === CharCodes.Amp) {
- this.baseState = this._state;
- this._state = State.BeforeEntity;
+ this.baseState = this.state;
+ this.state = State.BeforeEntity;
}
}
private stateBeforeDeclaration(c: number): void {
if (c === CharCodes.OpeningSquareBracket) {
- this._state = State.CDATASequence;
+ this.state = State.CDATASequence;
this.sequenceIndex = 0;
} else {
- this._state =
+ this.state =
c === CharCodes.Dash
? State.BeforeComment
: State.InDeclaration;
@@ -564,34 +564,34 @@ export default class Tokenizer {
}
private stateInDeclaration(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.ondeclaration(this.sectionStart, this._index);
- this._state = State.Text;
- this.sectionStart = this._index + 1;
+ this.cbs.ondeclaration(this.sectionStart, this.index);
+ this.state = State.Text;
+ this.sectionStart = this.index + 1;
}
}
private stateInProcessingInstruction(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.onprocessinginstruction(this.sectionStart, this._index);
- this._state = State.Text;
- this.sectionStart = this._index + 1;
+ this.cbs.onprocessinginstruction(this.sectionStart, this.index);
+ this.state = State.Text;
+ this.sectionStart = this.index + 1;
}
}
private stateBeforeComment(c: number): void {
if (c === CharCodes.Dash) {
- this._state = State.InCommentLike;
+ this.state = State.InCommentLike;
this.currentSequence = Sequences.CommentEnd;
// Allow short comments (eg. <!-->)
this.sequenceIndex = 2;
- this.sectionStart = this._index + 1;
+ this.sectionStart = this.index + 1;
} else {
- this._state = State.InDeclaration;
+ this.state = State.InDeclaration;
}
}
private stateInSpecialComment(c: number): void {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
- this.cbs.oncomment(this.sectionStart, this._index, 0);
- this._state = State.Text;
- this.sectionStart = this._index + 1;
+ this.cbs.oncomment(this.sectionStart, this.index, 0);
+ this.state = State.Text;
+ this.sectionStart = this.index + 1;
}
}
private stateBeforeSpecialS(c: number): void {
@@ -601,7 +601,7 @@ export default class Tokenizer {
} else if (lower === Sequences.StyleEnd[3]) {
this.startSpecial(Sequences.StyleEnd, 4);
} else {
- this._state = State.InTagName;
+ this.state = State.InTagName;
this.stateInTagName(c); // Consume the token again
}
}
@@ -618,13 +618,13 @@ export default class Tokenizer {
this.entityResult = 0;
if (c === CharCodes.Num) {
- this._state = State.BeforeNumericEntity;
+ this.state = State.BeforeNumericEntity;
} else if (c === CharCodes.Amp) {
// We have two `&` characters in a row. Stay in the current state.
} else {
this.trieIndex = 0;
this.trieCurrent = this.entityTrie[0];
- this._state = State.InNamedEntity;
+ this.state = State.InNamedEntity;
this.stateInNamedEntity(c);
}
}
@@ -641,7 +641,7 @@ export default class Tokenizer {
if (this.trieIndex < 0) {
this.emitNamedEntity();
- this._index--;
+ this.index--;
return;
}
@@ -655,7 +655,7 @@ export default class Tokenizer {
this.trieIndex += 1;
} else {
// Add 1 as we have already incremented the excess
- const entityStart = this._index - this.entityExcess + 1;
+ const entityStart = this.index - this.entityExcess + 1;
if (entityStart > this.sectionStart) {
this.emitPartial(this.sectionStart, entityStart);
@@ -667,7 +667,7 @@ export default class Tokenizer {
1 +
Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
this.entityExcess = 0;
- this.sectionStart = this._index + 1;
+ this.sectionStart = this.index + 1;
}
}
}
@@ -692,34 +692,34 @@ export default class Tokenizer {
}
}
- this._state = this.baseState;
+ this.state = this.baseState;
}
private stateBeforeNumericEntity(c: number): void {
if ((c | 0x20) === CharCodes.LowerX) {
this.entityExcess++;
- this._state = State.InHexEntity;
+ this.state = State.InHexEntity;
} else {
- this._state = State.InNumericEntity;
+ this.state = State.InNumericEntity;
this.stateInNumericEntity(c);
}
}
private emitNumericEntity(strict: boolean) {
- const entityStart = this._index - this.entityExcess - 1;
+ const entityStart = this.index - this.entityExcess - 1;
const numberStart =
- entityStart + 2 + Number(this._state === State.InHexEntity);
+ entityStart + 2 + Number(this.state === State.InHexEntity);
- if (numberStart !== this._index) {
+ if (numberStart !== this.index) {
// Emit leading data if any
if (entityStart > this.sectionStart) {
this.emitPartial(this.sectionStart, entityStart);
}
this.emitCodePoint(this.entityResult);
- this.sectionStart = this._index + Number(strict);
+ this.sectionStart = this.index + Number(strict);
}
- this._state = this.baseState;
+ this.state = this.baseState;
}
private stateInNumericEntity(c: number): void {
if (c === CharCodes.Semi) {
@@ -731,9 +731,9 @@ export default class Tokenizer {
if (this.allowLegacyEntity()) {
this.emitNumericEntity(false);
} else {
- this._state = this.baseState;
+ this.state = this.baseState;
}
- this._index--;
+ this.index--;
}
}
private stateInHexEntity(c: number): void {
@@ -750,9 +750,9 @@ export default class Tokenizer {
if (this.allowLegacyEntity()) {
this.emitNumericEntity(false);
} else {
- this._state = this.baseState;
+ this.state = this.baseState;
}
- this._index--;
+ this.index--;
}
}
@@ -769,26 +769,26 @@ export default class Tokenizer {
*/
private cleanup() {
// If we are inside of text or attributes, emit what we already have.
- if (this.running && this.sectionStart !== this._index) {
+ if (this.running && this.sectionStart !== this.index) {
if (
- this._state === State.Text ||
- (this._state === State.InSpecialTag && this.sequenceIndex === 0)
+ this.state === State.Text ||
+ (this.state === State.InSpecialTag && this.sequenceIndex === 0)
) {
- this.cbs.ontext(this.sectionStart, this._index);
- this.sectionStart = this._index;
+ this.cbs.ontext(this.sectionStart, this.index);
+ this.sectionStart = this.index;
} else if (
- this._state === State.InAttributeValueDq ||
- this._state === State.InAttributeValueSq ||
- this._state === State.InAttributeValueNq
+ this.state === State.InAttributeValueDq ||
+ this.state === State.InAttributeValueSq ||
+ this.state === State.InAttributeValueNq
) {
- this.cbs.onattribdata(this.sectionStart, this._index);
- this.sectionStart = this._index;
+ this.cbs.onattribdata(this.sectionStart, this.index);
+ this.sectionStart = this.index;
}
}
}
private shouldContinue() {
- return this._index < this.buffer.length + this.offset && this.running;
+ return this.index < this.buffer.length + this.offset && this.running;
}
/**
@@ -798,79 +798,79 @@ export default class Tokenizer {
*/
private parse() {
while (this.shouldContinue()) {
- const c = this.buffer.charCodeAt(this._index - this.offset);
- if (this._state === State.Text) {
+ const c = this.buffer.charCodeAt(this.index - this.offset);
+ if (this.state === State.Text) {
this.stateText(c);
- } else if (this._state === State.SpecialStartSequence) {
+ } else if (this.state === State.SpecialStartSequence) {
this.stateSpecialStartSequence(c);
- } else if (this._state === State.InSpecialTag) {
+ } else if (this.state === State.InSpecialTag) {
this.stateInSpecialTag(c);
- } else if (this._state === State.CDATASequence) {
+ } else if (this.state === State.CDATASequence) {
this.stateCDATASequence(c);
- } else if (this._state === State.InAttributeValueDq) {
+ } else if (this.state === State.InAttributeValueDq) {
this.stateInAttributeValueDoubleQuotes(c);
- } else if (this._state === State.InAttributeName) {
+ } else if (this.state === State.InAttributeName) {
this.stateInAttributeName(c);
- } else if (this._state === State.InCommentLike) {
+ } else if (this.state === State.InCommentLike) {
this.stateInCommentLike(c);
- } else if (this._state === State.InSpecialComment) {
+ } else if (this.state === State.InSpecialComment) {
this.stateInSpecialComment(c);
- } else if (this._state === State.BeforeAttributeName) {
+ } else if (this.state === State.BeforeAttributeName) {
this.stateBeforeAttributeName(c);
- } else if (this._state === State.InTagName) {
+ } else if (this.state === State.InTagName) {
this.stateInTagName(c);
- } else if (this._state === State.InClosingTagName) {
+ } else if (this.state === State.InClosingTagName) {
this.stateInClosingTagName(c);
- } else if (this._state === State.BeforeTagName) {
+ } else if (this.state === State.BeforeTagName) {
this.stateBeforeTagName(c);
- } else if (this._state === State.AfterAttributeName) {
+ } else if (this.state === State.AfterAttributeName) {
this.stateAfterAttributeName(c);
- } else if (this._state === State.InAttributeValueSq) {
+ } else if (this.state === State.InAttributeValueSq) {
this.stateInAttributeValueSingleQuotes(c);
- } else if (this._state === State.BeforeAttributeValue) {
+ } else if (this.state === State.BeforeAttributeValue) {
this.stateBeforeAttributeValue(c);
- } else if (this._state === State.BeforeClosingTagName) {
+ } else if (this.state === State.BeforeClosingTagName) {
this.stateBeforeClosingTagName(c);
- } else if (this._state === State.AfterClosingTagName) {
+ } else if (this.state === State.AfterClosingTagName) {
this.stateAfterClosingTagName(c);
- } else if (this._state === State.BeforeSpecialS) {
+ } else if (this.state === State.BeforeSpecialS) {
this.stateBeforeSpecialS(c);
- } else if (this._state === State.InAttributeValueNq) {
+ } else if (this.state === State.InAttributeValueNq) {
this.stateInAttributeValueNoQuotes(c);
- } else if (this._state === State.InSelfClosingTag) {
+ } else if (this.state === State.InSelfClosingTag) {
this.stateInSelfClosingTag(c);
- } else if (this._state === State.InDeclaration) {
+ } else if (this.state === State.InDeclaration) {
this.stateInDeclaration(c);
- } else if (this._state === State.BeforeDeclaration) {
+ } else if (this.state === State.BeforeDeclaration) {
this.stateBeforeDeclaration(c);
- } else if (this._state === State.BeforeComment) {
+ } else if (this.state === State.BeforeComment) {
this.stateBeforeComment(c);
- } else if (this._state === State.InProcessingInstruction) {
+ } else if (this.state === State.InProcessingInstruction) {
this.stateInProcessingInstruction(c);
- } else if (this._state === State.InNamedEntity) {
+ } else if (this.state === State.InNamedEntity) {
this.stateInNamedEntity(c);
- } else if (this._state === State.BeforeEntity) {
+ } else if (this.state === State.BeforeEntity) {
this.stateBeforeEntity(c);
- } else if (this._state === State.InHexEntity) {
+ } else if (this.state === State.InHexEntity) {
this.stateInHexEntity(c);
- } else if (this._state === State.InNumericEntity) {
+ } else if (this.state === State.InNumericEntity) {
this.stateInNumericEntity(c);
} else {
- // `this._state === State.BeforeNumericEntity`
+ // `this.state === State.BeforeNumericEntity`
this.stateBeforeNumericEntity(c);
}
- this._index++;
+ this.index++;
}
this.cleanup();
}
private finish() {
- if (this._state === State.InNamedEntity) {
+ if (this.state === State.InNamedEntity) {
this.emitNamedEntity();
}
// If there is remaining data, emit it in a reasonable way
- if (this.sectionStart < this._index) {
+ if (this.sectionStart < this.index) {
this.handleTrailingData();
}
this.cbs.onend();
@@ -879,34 +879,34 @@ export default class Tokenizer {
/** Handle any trailing data. */
private handleTrailingData() {
const endIndex = this.buffer.length + this.offset;
- if (this._state === State.InCommentLike) {
+ if (this.state === State.InCommentLike) {
if (this.currentSequence === Sequences.CdataEnd) {
this.cbs.oncdata(this.sectionStart, endIndex, 0);
} else {
this.cbs.oncomment(this.sectionStart, endIndex, 0);
}
} else if (
- this._state === State.InNumericEntity &&
+ this.state === State.InNumericEntity &&
this.allowLegacyEntity()
) {
this.emitNumericEntity(false);
// All trailing data will have been consumed
} else if (
- this._state === State.InHexEntity &&
+ this.state === State.InHexEntity &&
this.allowLegacyEntity()
) {
this.emitNumericEntity(false);
// All trailing data will have been consumed
} else if (
- this._state === State.InTagName ||
- this._state === State.BeforeAttributeName ||
- this._state === State.BeforeAttributeValue ||
- this._state === State.AfterAttributeName ||
- this._state === State.InAttributeName ||
- this._state === State.InAttributeValueSq ||
- this._state === State.InAttributeValueDq ||
- this._state === State.InAttributeValueNq ||
- this._state === State.InClosingTagName
+ this.state === State.InTagName ||
+ this.state === State.BeforeAttributeName ||
+ this.state === State.BeforeAttributeValue ||
+ this.state === State.AfterAttributeName ||
+ this.state === State.InAttributeName ||
+ this.state === State.InAttributeValueSq ||
+ this.state === State.InAttributeValueDq ||
+ this.state === State.InAttributeValueNq ||
+ this.state === State.InClosingTagName
) {
/*
* If we are currently in an opening or closing tag, us not calling the