From a4428c7a345330fe19c778384ba0f3761749dbd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 19:24:52 +0000
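Subject: [PATCH 1/7] refactor: Call tokenizer callbacks with indices

Tokenizer callbacks that used to receive a substring (`ontext`,
`onopentagname`, `onclosetag`, `onattribname`, `onattribdata`,
`oncomment`, `oncdata`, `ondeclaration`, `onprocessinginstruction`) are
now called with `(start, length)` offsets into the written data. The
Parser keeps its own copy of that data in `buffer` and resolves the
offsets with `getSubstr()`.

Related changes:

* Decoded entities are reported as code points through the new
  `ontextentity` and `onattribentity` callbacks; `decodeCodePoint` is
  now called by the Parser instead of the Tokenizer.
* `onattribend` receives a `QuoteType` enum value; the Parser maps it
  back to `"`, `'`, `null` (unquoted) or `undefined` (no value) before
  calling `onattribute`.
* `Tokenizer#cleanup` no longer drops consumed data from the buffer, so
  `bufferOffset` and `getAbsoluteSectionStart()` are removed and
  `getAbsoluteIndex()` is renamed to `getIndex()`.
* `Tokenizer#cleanup` now also emits pending attribute data, not only
  pending text.

This changes the exported `Callbacks` interface and the public
Tokenizer methods listed above.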
---
 src/Parser.ts                            |  97 +++++++---
 src/Tokenizer.ts                         | 230 ++++++++++++++---------
 src/__snapshots__/Tokenizer.spec.ts.snap |  70 ++++---
 3 files changed, 253 insertions(+), 144 deletions(-)

diff --git a/src/Parser.ts b/src/Parser.ts
index cccd33f9e..429fd517a 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -1,4 +1,5 @@
-import Tokenizer from "./Tokenizer";
+import Tokenizer, { Callbacks, QuoteType } from "./Tokenizer";
+import decodeCodePoint from "entities/lib/decode_codepoint";
 
 const formTags = new Set([
     "input",
@@ -195,7 +196,7 @@ export interface Handler {
 
 const reNameEnd = /\s|\//;
 
-export class Parser {
+export class Parser implements Callbacks {
     /** The start index of the last event. */
     public startIndex = 0;
     /** The end index of the last event. */
@@ -235,20 +236,31 @@ export class Parser {
     // Tokenizer event handlers
 
     /** @internal */
-    ontext(data: string): void {
-        const idx = this.tokenizer.getAbsoluteIndex();
+    ontext(start: number, length: number): void {
+        const data = this.getSubstr(start, length);
+        const idx = start + length;
         this.endIndex = idx - 1;
         this.cbs.ontext?.(data);
         this.startIndex = idx;
     }
 
+    /** @internal */
+    ontextentity(cp: number): void {
+        const idx = this.tokenizer.getIndex();
+        this.endIndex = idx - 1;
+        this.cbs.ontext?.(decodeCodePoint(cp));
+        this.startIndex = idx;
+    }
+
     protected isVoidElement(name: string): boolean {
         return !this.options.xmlMode && voidElements.has(name);
     }
 
     /** @internal */
-    onopentagname(name: string): void {
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+    onopentagname(start: number, length: number): void {
+        this.endIndex = this.tokenizer.getIndex();
+
+        let name = this.getSubstr(start, length);
 
         if (this.lowerCaseTagNames) {
             name = name.toLowerCase();
@@ -287,7 +299,7 @@ export class Parser {
 
     private endOpenTag(isImplied: boolean) {
         this.startIndex = this.openTagStart;
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+        this.endIndex = this.tokenizer.getIndex();
 
         if (this.attribs) {
             this.cbs.onopentag?.(this.tagname, this.attribs, isImplied);
@@ -309,8 +321,10 @@ export class Parser {
     }
 
     /** @internal */
-    onclosetag(name: string): void {
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+    onclosetag(start: number, length: number): void {
+        this.endIndex = this.tokenizer.getIndex();
+
+        let name = this.getSubstr(start, length);
 
         if (this.lowerCaseTagNames) {
             name = name.toLowerCase();
@@ -378,25 +392,39 @@ export class Parser {
     }
 
     /** @internal */
-    onattribname(name: string): void {
-        this.startIndex = this.tokenizer.getAbsoluteSectionStart();
+    onattribname(start: number, length: number): void {
+        this.startIndex = start;
+        const name = this.getSubstr(start, length);
 
-        if (this.lowerCaseAttributeNames) {
-            name = name.toLowerCase();
-        }
-        this.attribname = name;
+        this.attribname = this.lowerCaseAttributeNames
+            ? name.toLowerCase()
+            : name;
     }
 
     /** @internal */
-    onattribdata(value: string): void {
-        this.attribvalue += value;
+    onattribdata(start: number, length: number): void {
+        this.attribvalue += this.getSubstr(start, length);
     }
 
     /** @internal */
-    onattribend(quote: string | undefined | null): void {
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+    onattribentity(cp: number): void {
+        this.attribvalue += decodeCodePoint(cp);
+    }
 
-        this.cbs.onattribute?.(this.attribname, this.attribvalue, quote);
+    /** @internal */
+    onattribend(quote: QuoteType): void {
+        this.endIndex = this.tokenizer.getIndex();
+
+        const quoteVal =
+            quote === QuoteType.Double
+                ? '"'
+                : quote === QuoteType.Single
+                ? "'"
+                : quote === QuoteType.NoValue
+                ? undefined
+                : null;
+
+        this.cbs.onattribute?.(this.attribname, this.attribvalue, quoteVal);
         if (
             this.attribs &&
             !Object.prototype.hasOwnProperty.call(this.attribs, this.attribname)
@@ -419,8 +447,9 @@ export class Parser {
     }
 
     /** @internal */
-    ondeclaration(value: string): void {
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+    ondeclaration(start: number, length: number): void {
+        this.endIndex = this.tokenizer.getIndex();
+        const value = this.getSubstr(start, length);
 
         if (this.cbs.onprocessinginstruction) {
             const name = this.getInstructionName(value);
@@ -432,8 +461,9 @@ export class Parser {
     }
 
     /** @internal */
-    onprocessinginstruction(value: string): void {
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+    onprocessinginstruction(start: number, length: number): void {
+        this.endIndex = this.tokenizer.getIndex();
+        const value = this.getSubstr(start, length);
 
         if (this.cbs.onprocessinginstruction) {
             const name = this.getInstructionName(value);
@@ -445,8 +475,9 @@ export class Parser {
     }
 
     /** @internal */
-    oncomment(value: string): void {
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+    oncomment(start: number, length: number): void {
+        this.endIndex = this.tokenizer.getIndex();
+        const value = this.getSubstr(start, length);
 
         this.cbs.oncomment?.(value);
         this.cbs.oncommentend?.();
@@ -456,8 +487,9 @@ export class Parser {
     }
 
     /** @internal */
-    oncdata(value: string): void {
-        this.endIndex = this.tokenizer.getAbsoluteIndex();
+    oncdata(start: number, length: number): void {
+        this.endIndex = this.tokenizer.getIndex();
+        const value = this.getSubstr(start, length);
 
         if (this.options.xmlMode || this.options.recognizeCDATA) {
             this.cbs.oncdatastart?.();
@@ -504,6 +536,7 @@ export class Parser {
         this.startIndex = 0;
         this.endIndex = 0;
         this.cbs.onparserinit?.(this);
+        this.buffer = "";
     }
 
     /**
@@ -517,12 +550,19 @@ export class Parser {
         this.end(data);
     }
 
+    private buffer = "";
+
+    private getSubstr(start: number, length: number) {
+        return this.buffer.substr(start, length);
+    }
+
     /**
      * Parses a chunk of data and calls the corresponding callbacks.
      *
      * @param chunk Chunk to parse.
      */
     public write(chunk: string): void {
+        this.buffer += chunk;
         this.tokenizer.write(chunk);
     }
 
@@ -532,6 +572,7 @@ export class Parser {
      * @param chunk Optional final chunk to parse.
      */
     public end(chunk?: string): void {
+        if (chunk) this.buffer += chunk;
         this.tokenizer.end(chunk);
     }
 
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 6b5182f9d..a597439df 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -1,4 +1,3 @@
-import decodeCodePoint from "entities/lib/decode_codepoint";
 import {
     htmlDecodeTree,
     xmlDecodeTree,
@@ -105,21 +104,30 @@ function isASCIIAlpha(c: number): boolean {
     );
 }
 
+export enum QuoteType {
+    NoValue = 0,
+    Unquoted = 1,
+    Single = 2,
+    Double = 3,
+}
+
 export interface Callbacks {
-    onattribdata(value: string): void;
-    onattribend(quote: string | undefined | null): void;
-    onattribname(name: string): void;
-    oncdata(data: string): void;
-    onclosetag(name: string): void;
-    oncomment(data: string): void;
-    ondeclaration(content: string): void;
+    onattribdata(start: number, length: number): void;
+    onattribentity(codepoint: number): void;
+    onattribend(quote: QuoteType): void;
+    onattribname(start: number, length: number): void;
+    oncdata(start: number, length: number): void;
+    onclosetag(start: number, length: number): void;
+    oncomment(start: number, length: number): void;
+    ondeclaration(start: number, length: number): void;
     onend(): void;
     onerror(error: Error, state?: State): void;
     onopentagend(): void;
-    onopentagname(name: string): void;
-    onprocessinginstruction(instruction: string): void;
+    onopentagname(start: number, length: number): void;
+    onprocessinginstruction(start: number, length: number): void;
     onselfclosingtag(): void;
-    ontext(value: string): void;
+    ontext(start: number, length: number): void;
+    ontextentity(codepoint: number): void;
 }
 
 /**
@@ -146,11 +154,6 @@ export default class Tokenizer {
     public sectionStart = 0;
     /** The index within the buffer that we are currently looking at. */
     private _index = 0;
-    /**
-     * Data that has already been processed will be removed from the buffer occasionally.
-     * `_bufferOffset` keeps track of how many characters have been removed, to make sure position information is accurate.
-     */
-    private bufferOffset = 0;
     /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
     private baseState = State.Text;
     /** For special parsing behavior inside of script and style tags. */
@@ -181,7 +184,6 @@ export default class Tokenizer {
         this.buffer = "";
         this.sectionStart = 0;
         this._index = 0;
-        this.bufferOffset = 0;
         this.baseState = State.Text;
         this.currentSequence = undefined!;
         this.running = true;
@@ -215,18 +217,11 @@ export default class Tokenizer {
         }
     }
 
-    /**
-     * The start of the current section.
-     */
-    public getAbsoluteSectionStart(): number {
-        return this.sectionStart + this.bufferOffset;
-    }
-
     /**
      * The current index within all of the written data.
      */
-    public getAbsoluteIndex(): number {
-        return this.bufferOffset + this._index;
+    public getIndex(): number {
+        return this._index;
     }
 
     private stateText(c: number) {
@@ -235,7 +230,10 @@ export default class Tokenizer {
             (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
         ) {
             if (this._index > this.sectionStart) {
-                this.cbs.ontext(this.getSection());
+                this.cbs.ontext(
+                    this.sectionStart,
+                    this._index - this.sectionStart
+                );
             }
             this._state = State.BeforeTagName;
             this.sectionStart = this._index;
@@ -276,7 +274,10 @@ export default class Tokenizer {
                     // Spoof the index so that reported locations match up.
                     const actualIndex = this._index;
                     this._index = endOfText;
-                    this.cbs.ontext(this.getSection());
+                    this.cbs.ontext(
+                        this.sectionStart,
+                        endOfText - this.sectionStart
+                    );
                     this._index = actualIndex;
                 }
 
@@ -358,15 +359,12 @@ export default class Tokenizer {
         if (c === this.currentSequence[this.sequenceIndex]) {
             if (++this.sequenceIndex === this.currentSequence.length) {
                 // Remove 2 trailing chars
-                const section = this.buffer.slice(
-                    this.sectionStart,
-                    this._index - 2
-                );
+                const length = this._index - 2 - this.sectionStart;
 
                 if (this.currentSequence === Sequences.CdataEnd) {
-                    this.cbs.oncdata(section);
+                    this.cbs.oncdata(this.sectionStart, length);
                 } else {
-                    this.cbs.oncomment(section);
+                    this.cbs.oncomment(this.sectionStart, length);
                 }
 
                 this.sequenceIndex = 0;
@@ -428,7 +426,10 @@ export default class Tokenizer {
     }
     private stateInTagName(c: number) {
         if (isEndOfTagSection(c)) {
-            this.cbs.onopentagname(this.getSection());
+            this.cbs.onopentagname(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this.sectionStart = -1;
             this._state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
@@ -448,7 +449,10 @@ export default class Tokenizer {
     }
     private stateInClosingTagName(c: number) {
         if (c === CharCodes.Gt || isWhitespace(c)) {
-            this.cbs.onclosetag(this.getSection());
+            this.cbs.onclosetag(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this.sectionStart = -1;
             this._state = State.AfterClosingTagName;
             this.stateAfterClosingTagName(c);
@@ -493,7 +497,10 @@ export default class Tokenizer {
     }
     private stateInAttributeName(c: number) {
         if (c === CharCodes.Eq || isEndOfTagSection(c)) {
-            this.cbs.onattribname(this.getSection());
+            this.cbs.onattribname(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this.sectionStart = -1;
             this._state = State.AfterAttributeName;
             this.stateAfterAttributeName(c);
@@ -503,11 +510,11 @@ export default class Tokenizer {
         if (c === CharCodes.Eq) {
             this._state = State.BeforeAttributeValue;
         } else if (c === CharCodes.Slash || c === CharCodes.Gt) {
-            this.cbs.onattribend(undefined);
+            this.cbs.onattribend(QuoteType.NoValue);
             this._state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (!isWhitespace(c)) {
-            this.cbs.onattribend(undefined);
+            this.cbs.onattribend(QuoteType.NoValue);
             this._state = State.InAttributeName;
             this.sectionStart = this._index;
         }
@@ -530,9 +537,16 @@ export default class Tokenizer {
             c === quote ||
             (!this.decodeEntities && this.fastForwardTo(quote))
         ) {
-            this.cbs.onattribdata(this.getSection());
+            this.cbs.onattribdata(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this.sectionStart = -1;
-            this.cbs.onattribend(String.fromCharCode(quote));
+            this.cbs.onattribend(
+                quote === CharCodes.DoubleQuote
+                    ? QuoteType.Double
+                    : QuoteType.Single
+            );
             this._state = State.BeforeAttributeName;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
             this.baseState = this._state;
@@ -547,9 +561,12 @@ export default class Tokenizer {
     }
     private stateInAttributeValueNoQuotes(c: number) {
         if (isWhitespace(c) || c === CharCodes.Gt) {
-            this.cbs.onattribdata(this.getSection());
+            this.cbs.onattribdata(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this.sectionStart = -1;
-            this.cbs.onattribend(null);
+            this.cbs.onattribend(QuoteType.Unquoted);
             this._state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (this.decodeEntities && c === CharCodes.Amp) {
@@ -570,14 +587,20 @@ export default class Tokenizer {
     }
     private stateInDeclaration(c: number) {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.ondeclaration(this.getSection());
+            this.cbs.ondeclaration(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this._state = State.Text;
             this.sectionStart = this._index + 1;
         }
     }
     private stateInProcessingInstruction(c: number) {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.onprocessinginstruction(this.getSection());
+            this.cbs.onprocessinginstruction(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this._state = State.Text;
             this.sectionStart = this._index + 1;
         }
@@ -595,7 +618,10 @@ export default class Tokenizer {
     }
     private stateInSpecialComment(c: number) {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.oncomment(this.getSection());
+            this.cbs.oncomment(
+                this.sectionStart,
+                this._index - this.sectionStart
+            );
             this._state = State.Text;
             this.sectionStart = this._index + 1;
         }
@@ -614,7 +640,7 @@ export default class Tokenizer {
 
     private trieIndex = 0;
     private trieCurrent = 0;
-    private trieResult: string | null = null;
+    private trieResult = 0;
     private entityExcess = 0;
 
     private stateBeforeEntity(c: number) {
@@ -628,7 +654,7 @@ export default class Tokenizer {
         } else {
             this.trieIndex = 0;
             this.trieCurrent = this.entityTrie[0];
-            this.trieResult = null;
+            this.trieResult = 0;
             this._state = State.InNamedEntity;
             this.stateInNamedEntity(c);
         }
@@ -664,20 +690,16 @@ export default class Tokenizer {
 
                 if (entityStart > this.sectionStart) {
                     this.emitPartial(
-                        this.buffer.substring(this.sectionStart, entityStart)
+                        this.sectionStart,
+                        entityStart - this.sectionStart
                     );
                 }
 
                 // If this is a surrogate pair, consume the next two bytes
-                this.trieResult =
-                    this.trieCurrent & BinTrieFlags.MULTI_BYTE
-                        ? String.fromCharCode(
-                              this.entityTrie[++this.trieIndex],
-                              this.entityTrie[++this.trieIndex]
-                          )
-                        : String.fromCharCode(
-                              this.entityTrie[++this.trieIndex]
-                          );
+                this.trieResult = this.trieIndex;
+                this.trieIndex +=
+                    1 +
+                    Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
                 this.entityExcess = 0;
                 this.sectionStart = this._index + 1;
             }
@@ -685,8 +707,23 @@ export default class Tokenizer {
     }
 
     private emitNamedEntity() {
-        if (this.trieResult) {
-            this.emitPartial(this.trieResult);
+        if (this.trieResult !== 0) {
+            if (this.entityTrie[this.trieResult] & BinTrieFlags.MULTI_BYTE) {
+                const first = this.entityTrie[this.trieResult + 1];
+                const second = this.entityTrie[this.trieResult + 2];
+                // If this is a surrogate pair, combine the code points.
+                if (first >= 0xd8_00 && first <= 0xdf_ff) {
+                    this.emitCodePoint(
+                        // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
+                        (first - 0xd8_00) * 0x4_00 + second + 0x24_00
+                    );
+                } else {
+                    this.emitCodePoint(first);
+                    this.emitCodePoint(second);
+                }
+            } else {
+                this.emitCodePoint(this.entityTrie[this.trieResult + 1]);
+            }
         }
 
         this._state = this.baseState;
@@ -710,14 +747,15 @@ export default class Tokenizer {
             // Emit leading data if any
             if (entityStart > this.sectionStart) {
                 this.emitPartial(
-                    this.buffer.substring(this.sectionStart, entityStart)
+                    this.sectionStart,
+                    entityStart - this.sectionStart
                 );
             }
 
             // Parse entity
             const entity = this.buffer.substring(numberStart, this._index);
             const parsed = parseInt(entity, base);
-            this.emitPartial(decodeCodePoint(parsed));
+            this.emitCodePoint(parsed);
             this.sectionStart = this._index + Number(strict);
         }
         this._state = this.baseState;
@@ -767,27 +805,28 @@ export default class Tokenizer {
      * Remove data that has already been consumed from the buffer.
      */
     private cleanup() {
-        // If we are inside of text, emit what we already have.
-        if (
-            this.running &&
-            this.sectionStart !== this._index &&
-            (this._state === State.Text ||
-                (this._state === State.InSpecialTag &&
-                    this.sequenceIndex === 0))
-        ) {
-            // TODO: We could emit attribute data here as well.
-            this.cbs.ontext(this.buffer.substr(this.sectionStart));
-            this.sectionStart = this._index;
-        }
-
-        const start = this.sectionStart < 0 ? this._index : this.sectionStart;
-        this.buffer =
-            start === this.buffer.length ? "" : this.buffer.substr(start);
-        this._index -= start;
-        this.bufferOffset += start;
-
-        if (this.sectionStart > 0) {
-            this.sectionStart = 0;
+        // If we are inside of text or attributes, emit what we already have.
+        if (this.running && this.sectionStart !== this._index) {
+            if (
+                this._state === State.Text ||
+                (this._state === State.InSpecialTag && this.sequenceIndex === 0)
+            ) {
+                this.cbs.ontext(
+                    this.sectionStart,
+                    this._index - this.sectionStart
+                );
+                this.sectionStart = this._index;
+            } else if (
+                this._state === State.InAttributeValueDq ||
+                this._state === State.InAttributeValueSq ||
+                this._state === State.InAttributeValueNq
+            ) {
+                this.cbs.onattribdata(
+                    this.sectionStart,
+                    this._index - this.sectionStart
+                );
+                this.sectionStart = this._index;
+            }
         }
     }
 
@@ -882,12 +921,12 @@ export default class Tokenizer {
 
     /** Handle any trailing data. */
     private handleTrailingData() {
-        const data = this.buffer.substr(this.sectionStart);
+        const remaining = this.buffer.length - this.sectionStart;
         if (this._state === State.InCommentLike) {
             if (this.currentSequence === Sequences.CdataEnd) {
-                this.cbs.oncdata(data);
+                this.cbs.oncdata(this.sectionStart, remaining);
             } else {
-                this.cbs.oncomment(data);
+                this.cbs.oncomment(this.sectionStart, remaining);
             }
         } else if (
             this._state === State.InNumericEntity &&
@@ -917,21 +956,28 @@ export default class Tokenizer {
              * respective callback signals that the tag should be ignored.
              */
         } else {
-            this.cbs.ontext(data);
+            this.cbs.ontext(this.sectionStart, remaining);
         }
     }
 
-    private getSection(): string {
-        return this.buffer.substring(this.sectionStart, this._index);
+    private emitPartial(start: number, length: number) {
+        if (
+            this.baseState !== State.Text &&
+            this.baseState !== State.InSpecialTag
+        ) {
+            this.cbs.onattribdata(start, length);
+        } else {
+            this.cbs.ontext(start, length);
+        }
     }
-    private emitPartial(value: string) {
+    private emitCodePoint(cp: number) {
         if (
             this.baseState !== State.Text &&
             this.baseState !== State.InSpecialTag
         ) {
-            this.cbs.onattribdata(value);
+            this.cbs.onattribentity(cp);
         } else {
-            this.cbs.ontext(value);
+            this.cbs.ontextentity(cp);
         }
     }
 }
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 38a60dfff..4951ecd2a 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -3,12 +3,13 @@
 exports[`Tokenizer should not lose data when pausing 1`] = `
 Array [
   Array [
-    "ontext",
-    "&",
+    "ontextentity",
+    38,
   ],
   Array [
     "ontext",
-    " it up!",
+    5,
+    7,
   ],
   Array [
     "onend",
@@ -20,21 +21,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing scr
 Array [
   Array [
     "onopentagname",
-    "script",
+    1,
+    6,
   ],
   Array [
     "onselfclosingtag",
   ],
   Array [
     "onopentagname",
-    "div",
+    11,
+    3,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "div",
+    17,
+    3,
   ],
   Array [
     "onend",
@@ -46,21 +50,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing sty
 Array [
   Array [
     "onopentagname",
-    "style",
+    1,
+    5,
   ],
   Array [
     "onselfclosingtag",
   ],
   Array [
     "onopentagname",
-    "div",
+    10,
+    3,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "div",
+    16,
+    3,
   ],
   Array [
     "onend",
@@ -72,21 +79,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing tit
 Array [
   Array [
     "onopentagname",
-    "title",
+    1,
+    5,
   ],
   Array [
     "onselfclosingtag",
   ],
   Array [
     "onopentagname",
-    "div",
+    10,
+    3,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "div",
+    16,
+    3,
   ],
   Array [
     "onend",
@@ -98,25 +108,29 @@ exports[`Tokenizer should support standard special tags for normal script tag 1`
 Array [
   Array [
     "onopentagname",
-    "script",
+    1,
+    6,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "script",
+    10,
+    6,
   ],
   Array [
     "onopentagname",
-    "div",
+    18,
+    3,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "div",
+    24,
+    3,
   ],
   Array [
     "onend",
@@ -128,25 +142,29 @@ exports[`Tokenizer should support standard special tags for normal sitle tag 1`]
 Array [
   Array [
     "onopentagname",
-    "title",
+    1,
+    5,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "title",
+    9,
+    5,
   ],
   Array [
     "onopentagname",
-    "div",
+    16,
+    3,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "div",
+    22,
+    3,
   ],
   Array [
     "onend",
@@ -158,25 +176,29 @@ exports[`Tokenizer should support standard special tags for normal style tag 1`]
 Array [
   Array [
     "onopentagname",
-    "style",
+    1,
+    5,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "style",
+    9,
+    5,
   ],
   Array [
     "onopentagname",
-    "div",
+    16,
+    3,
   ],
   Array [
     "onopentagend",
   ],
   Array [
     "onclosetag",
-    "div",
+    22,
+    3,
   ],
   Array [
     "onend",

From e2b23ea2d66daa0e7eb8425ad018e3a51e7217ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 19:28:36 +0000
Subject: [PATCH 2/7] Add return types

---
 src/Tokenizer.ts | 74 ++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index a597439df..61aa6c979 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -224,7 +224,7 @@ export default class Tokenizer {
         return this._index;
     }
 
-    private stateText(c: number) {
+    private stateText(c: number): void {
         if (
             c === CharCodes.Lt ||
             (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
@@ -244,7 +244,7 @@ export default class Tokenizer {
 
     private currentSequence!: Uint8Array;
     private sequenceIndex = 0;
-    private stateSpecialStartSequence(c: number) {
+    private stateSpecialStartSequence(c: number): void {
         const isEnd = this.sequenceIndex === this.currentSequence.length;
         const isMatch = isEnd
             ? // If we are at the end of the sequence, make sure the tag name has ended
@@ -265,7 +265,7 @@ export default class Tokenizer {
     }
 
     /** Look for an end tag. For <title> tags, also decode entities. */
-    private stateInSpecialTag(c: number) {
+    private stateInSpecialTag(c: number): void {
         if (this.sequenceIndex === this.currentSequence.length) {
             if (c === CharCodes.Gt || isWhitespace(c)) {
                 const endOfText = this._index - this.currentSequence.length;
@@ -308,7 +308,7 @@ export default class Tokenizer {
         }
     }
 
-    private stateCDATASequence(c: number) {
+    private stateCDATASequence(c: number): void {
         if (c === Sequences.Cdata[this.sequenceIndex]) {
             if (++this.sequenceIndex === Sequences.Cdata.length) {
                 this._state = State.InCommentLike;
@@ -355,7 +355,7 @@ export default class Tokenizer {
      * - That character is then repeated, so we have to check multiple repeats.
      * - All characters but the start character of the sequence can be skipped.
      */
-    private stateInCommentLike(c: number) {
+    private stateInCommentLike(c: number): void {
         if (c === this.currentSequence[this.sequenceIndex]) {
             if (++this.sequenceIndex === this.currentSequence.length) {
                 // Remove 2 trailing chars
@@ -399,7 +399,7 @@ export default class Tokenizer {
         this._state = State.SpecialStartSequence;
     }
 
-    private stateBeforeTagName(c: number) {
+    private stateBeforeTagName(c: number): void {
         if (c === CharCodes.ExclamationMark) {
             this._state = State.BeforeDeclaration;
             this.sectionStart = this._index + 1;
@@ -424,7 +424,7 @@ export default class Tokenizer {
             this.stateText(c);
         }
     }
-    private stateInTagName(c: number) {
+    private stateInTagName(c: number): void {
         if (isEndOfTagSection(c)) {
             this.cbs.onopentagname(
                 this.sectionStart,
@@ -435,7 +435,7 @@ export default class Tokenizer {
             this.stateBeforeAttributeName(c);
         }
     }
-    private stateBeforeClosingTagName(c: number) {
+    private stateBeforeClosingTagName(c: number): void {
         if (isWhitespace(c)) {
             // Ignore
         } else if (c === CharCodes.Gt) {
@@ -447,7 +447,7 @@ export default class Tokenizer {
             this.sectionStart = this._index;
         }
     }
-    private stateInClosingTagName(c: number) {
+    private stateInClosingTagName(c: number): void {
         if (c === CharCodes.Gt || isWhitespace(c)) {
             this.cbs.onclosetag(
                 this.sectionStart,
@@ -458,14 +458,14 @@ export default class Tokenizer {
             this.stateAfterClosingTagName(c);
         }
     }
-    private stateAfterClosingTagName(c: number) {
+    private stateAfterClosingTagName(c: number): void {
         // Skip everything until ">"
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
             this._state = State.Text;
             this.sectionStart = this._index + 1;
         }
     }
-    private stateBeforeAttributeName(c: number) {
+    private stateBeforeAttributeName(c: number): void {
         if (c === CharCodes.Gt) {
             this.cbs.onopentagend();
             if (this.isSpecial) {
@@ -483,7 +483,7 @@ export default class Tokenizer {
             this.sectionStart = this._index;
         }
     }
-    private stateInSelfClosingTag(c: number) {
+    private stateInSelfClosingTag(c: number): void {
         if (c === CharCodes.Gt) {
             this.cbs.onselfclosingtag();
             this._state = State.Text;
@@ -495,7 +495,7 @@ export default class Tokenizer {
             this.stateBeforeAttributeName(c);
         }
     }
-    private stateInAttributeName(c: number) {
+    private stateInAttributeName(c: number): void {
         if (c === CharCodes.Eq || isEndOfTagSection(c)) {
             this.cbs.onattribname(
                 this.sectionStart,
@@ -506,7 +506,7 @@ export default class Tokenizer {
             this.stateAfterAttributeName(c);
         }
     }
-    private stateAfterAttributeName(c: number) {
+    private stateAfterAttributeName(c: number): void {
         if (c === CharCodes.Eq) {
             this._state = State.BeforeAttributeValue;
         } else if (c === CharCodes.Slash || c === CharCodes.Gt) {
@@ -519,7 +519,7 @@ export default class Tokenizer {
             this.sectionStart = this._index;
         }
     }
-    private stateBeforeAttributeValue(c: number) {
+    private stateBeforeAttributeValue(c: number): void {
         if (c === CharCodes.DoubleQuote) {
             this._state = State.InAttributeValueDq;
             this.sectionStart = this._index + 1;
@@ -553,13 +553,13 @@ export default class Tokenizer {
             this._state = State.BeforeEntity;
         }
     }
-    private stateInAttributeValueDoubleQuotes(c: number) {
+    private stateInAttributeValueDoubleQuotes(c: number): void {
         this.handleInAttributeValue(c, CharCodes.DoubleQuote);
     }
-    private stateInAttributeValueSingleQuotes(c: number) {
+    private stateInAttributeValueSingleQuotes(c: number): void {
         this.handleInAttributeValue(c, CharCodes.SingleQuote);
     }
-    private stateInAttributeValueNoQuotes(c: number) {
+    private stateInAttributeValueNoQuotes(c: number): void {
         if (isWhitespace(c) || c === CharCodes.Gt) {
             this.cbs.onattribdata(
                 this.sectionStart,
@@ -574,7 +574,7 @@ export default class Tokenizer {
             this._state = State.BeforeEntity;
         }
     }
-    private stateBeforeDeclaration(c: number) {
+    private stateBeforeDeclaration(c: number): void {
         if (c === CharCodes.OpeningSquareBracket) {
             this._state = State.CDATASequence;
             this.sequenceIndex = 0;
@@ -585,7 +585,7 @@ export default class Tokenizer {
                     : State.InDeclaration;
         }
     }
-    private stateInDeclaration(c: number) {
+    private stateInDeclaration(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
             this.cbs.ondeclaration(
                 this.sectionStart,
@@ -595,7 +595,7 @@ export default class Tokenizer {
             this.sectionStart = this._index + 1;
         }
     }
-    private stateInProcessingInstruction(c: number) {
+    private stateInProcessingInstruction(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
             this.cbs.onprocessinginstruction(
                 this.sectionStart,
@@ -605,7 +605,7 @@ export default class Tokenizer {
             this.sectionStart = this._index + 1;
         }
     }
-    private stateBeforeComment(c: number) {
+    private stateBeforeComment(c: number): void {
         if (c === CharCodes.Dash) {
             this._state = State.InCommentLike;
             this.currentSequence = Sequences.CommentEnd;
@@ -616,7 +616,7 @@ export default class Tokenizer {
             this._state = State.InDeclaration;
         }
     }
-    private stateInSpecialComment(c: number) {
+    private stateInSpecialComment(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
             this.cbs.oncomment(
                 this.sectionStart,
@@ -626,7 +626,7 @@ export default class Tokenizer {
             this.sectionStart = this._index + 1;
         }
     }
-    private stateBeforeSpecialS(c: number) {
+    private stateBeforeSpecialS(c: number): void {
         const lower = c | 0x20;
         if (lower === Sequences.ScriptEnd[3]) {
             this.startSpecial(Sequences.ScriptEnd, 4);
@@ -640,12 +640,13 @@ export default class Tokenizer {
 
     private trieIndex = 0;
     private trieCurrent = 0;
-    private trieResult = 0;
+    private entityResult = 0;
     private entityExcess = 0;
 
-    private stateBeforeEntity(c: number) {
+    private stateBeforeEntity(c: number): void {
         // Start excess with 1 to include the '&'
         this.entityExcess = 1;
+        this.entityResult = 0;
 
         if (c === CharCodes.Num) {
             this._state = State.BeforeNumericEntity;
@@ -654,13 +655,12 @@ export default class Tokenizer {
         } else {
             this.trieIndex = 0;
             this.trieCurrent = this.entityTrie[0];
-            this.trieResult = 0;
             this._state = State.InNamedEntity;
             this.stateInNamedEntity(c);
         }
     }
 
-    private stateInNamedEntity(c: number) {
+    private stateInNamedEntity(c: number): void {
         this.entityExcess += 1;
 
         this.trieIndex = determineBranch(
@@ -696,7 +696,7 @@ export default class Tokenizer {
                 }
 
                 // If this is a surrogate pair, consume the next two bytes
-                this.trieResult = this.trieIndex;
+                this.entityResult = this.trieIndex;
                 this.trieIndex +=
                     1 +
                     Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
@@ -707,10 +707,10 @@ export default class Tokenizer {
     }
 
     private emitNamedEntity() {
-        if (this.trieResult !== 0) {
-            if (this.entityTrie[this.trieResult] & BinTrieFlags.MULTI_BYTE) {
-                const first = this.entityTrie[this.trieResult + 1];
-                const second = this.entityTrie[this.trieResult + 2];
+        if (this.entityResult !== 0) {
+            if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) {
+                const first = this.entityTrie[this.entityResult + 1];
+                const second = this.entityTrie[this.entityResult + 2];
                 // If this is a surrogate pair, combine the code points.
                 if (first >= 0xd8_00 && first <= 0xdf_ff) {
                     this.emitCodePoint(
@@ -722,14 +722,14 @@ export default class Tokenizer {
                     this.emitCodePoint(second);
                 }
             } else {
-                this.emitCodePoint(this.entityTrie[this.trieResult + 1]);
+                this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
             }
         }
 
         this._state = this.baseState;
     }
 
-    private stateBeforeNumericEntity(c: number) {
+    private stateBeforeNumericEntity(c: number): void {
         if ((c | 0x20) === CharCodes.LowerX) {
             this.entityExcess++;
             this._state = State.InHexEntity;
@@ -760,7 +760,7 @@ export default class Tokenizer {
         }
         this._state = this.baseState;
     }
-    private stateInNumericEntity(c: number) {
+    private stateInNumericEntity(c: number): void {
         if (c === CharCodes.Semi) {
             this.decodeNumericEntity(10, true);
         } else if (!isNumber(c)) {
@@ -774,7 +774,7 @@ export default class Tokenizer {
             this.entityExcess++;
         }
     }
-    private stateInHexEntity(c: number) {
+    private stateInHexEntity(c: number): void {
         if (c === CharCodes.Semi) {
             this.decodeNumericEntity(16, true);
         } else if (

From a5175ae4763ff0e8c28d783d1721356df776fae0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 20:53:11 +0000
Subject: [PATCH 3/7] Pass end indices in several callbacks

Avoids `getIndex` calls
---
 src/Parser.ts                            | 28 +++++++--------
 src/Tokenizer.ts                         | 45 +++++++++---------------
 src/__snapshots__/Tokenizer.spec.ts.snap | 42 +++++++++++-----------
 3 files changed, 52 insertions(+), 63 deletions(-)

diff --git a/src/Parser.ts b/src/Parser.ts
index 429fd517a..2dcffd0a1 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -257,10 +257,10 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    onopentagname(start: number, length: number): void {
-        this.endIndex = this.tokenizer.getIndex();
+    onopentagname(start: number, endIndex: number): void {
+        this.endIndex = endIndex;
 
-        let name = this.getSubstr(start, length);
+        let name = this.getSubstr(start, endIndex - start);
 
         if (this.lowerCaseTagNames) {
             name = name.toLowerCase();
@@ -321,10 +321,10 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    onclosetag(start: number, length: number): void {
-        this.endIndex = this.tokenizer.getIndex();
+    onclosetag(start: number, endIndex: number): void {
+        this.endIndex = endIndex;
 
-        let name = this.getSubstr(start, length);
+        let name = this.getSubstr(start, endIndex - start);
 
         if (this.lowerCaseTagNames) {
             name = name.toLowerCase();
@@ -412,8 +412,8 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    onattribend(quote: QuoteType): void {
-        this.endIndex = this.tokenizer.getIndex();
+    onattribend(quote: QuoteType, endIndex: number): void {
+        this.endIndex = endIndex;
 
         const quoteVal =
             quote === QuoteType.Double
@@ -447,9 +447,9 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    ondeclaration(start: number, length: number): void {
-        this.endIndex = this.tokenizer.getIndex();
-        const value = this.getSubstr(start, length);
+    ondeclaration(start: number, endIndex: number): void {
+        this.endIndex = endIndex;
+        const value = this.getSubstr(start, endIndex - start);
 
         if (this.cbs.onprocessinginstruction) {
             const name = this.getInstructionName(value);
@@ -461,9 +461,9 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    onprocessinginstruction(start: number, length: number): void {
-        this.endIndex = this.tokenizer.getIndex();
-        const value = this.getSubstr(start, length);
+    onprocessinginstruction(start: number, endIndex: number): void {
+        this.endIndex = endIndex;
+        const value = this.getSubstr(start, endIndex - start);
 
         if (this.cbs.onprocessinginstruction) {
             const name = this.getInstructionName(value);
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 61aa6c979..b0b65b914 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -114,17 +114,17 @@ export enum QuoteType {
 export interface Callbacks {
     onattribdata(start: number, length: number): void;
     onattribentity(codepoint: number): void;
-    onattribend(quote: QuoteType): void;
+    onattribend(quote: QuoteType, endIndex: number): void;
     onattribname(start: number, length: number): void;
     oncdata(start: number, length: number): void;
-    onclosetag(start: number, length: number): void;
+    onclosetag(start: number, endIndex: number): void;
     oncomment(start: number, length: number): void;
-    ondeclaration(start: number, length: number): void;
+    ondeclaration(start: number, endIndex: number): void;
     onend(): void;
     onerror(error: Error, state?: State): void;
     onopentagend(): void;
-    onopentagname(start: number, length: number): void;
-    onprocessinginstruction(start: number, length: number): void;
+    onopentagname(start: number, endIndex: number): void;
+    onprocessinginstruction(start: number, endIndex: number): void;
     onselfclosingtag(): void;
     ontext(start: number, length: number): void;
     ontextentity(codepoint: number): void;
@@ -426,10 +426,7 @@ export default class Tokenizer {
     }
     private stateInTagName(c: number): void {
         if (isEndOfTagSection(c)) {
-            this.cbs.onopentagname(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.onopentagname(this.sectionStart, this._index);
             this.sectionStart = -1;
             this._state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
@@ -449,10 +446,7 @@ export default class Tokenizer {
     }
     private stateInClosingTagName(c: number): void {
         if (c === CharCodes.Gt || isWhitespace(c)) {
-            this.cbs.onclosetag(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.onclosetag(this.sectionStart, this._index);
             this.sectionStart = -1;
             this._state = State.AfterClosingTagName;
             this.stateAfterClosingTagName(c);
@@ -510,11 +504,11 @@ export default class Tokenizer {
         if (c === CharCodes.Eq) {
             this._state = State.BeforeAttributeValue;
         } else if (c === CharCodes.Slash || c === CharCodes.Gt) {
-            this.cbs.onattribend(QuoteType.NoValue);
+            this.cbs.onattribend(QuoteType.NoValue, this._index);
             this._state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (!isWhitespace(c)) {
-            this.cbs.onattribend(QuoteType.NoValue);
+            this.cbs.onattribend(QuoteType.NoValue, this._index);
             this._state = State.InAttributeName;
             this.sectionStart = this._index;
         }
@@ -545,7 +539,8 @@ export default class Tokenizer {
             this.cbs.onattribend(
                 quote === CharCodes.DoubleQuote
                     ? QuoteType.Double
-                    : QuoteType.Single
+                    : QuoteType.Single,
+                this._index
             );
             this._state = State.BeforeAttributeName;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
@@ -566,7 +561,7 @@ export default class Tokenizer {
                 this._index - this.sectionStart
             );
             this.sectionStart = -1;
-            this.cbs.onattribend(QuoteType.Unquoted);
+            this.cbs.onattribend(QuoteType.Unquoted, this._index);
             this._state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (this.decodeEntities && c === CharCodes.Amp) {
@@ -587,20 +582,14 @@ export default class Tokenizer {
     }
     private stateInDeclaration(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.ondeclaration(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.ondeclaration(this.sectionStart, this._index);
             this._state = State.Text;
             this.sectionStart = this._index + 1;
         }
     }
     private stateInProcessingInstruction(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.onprocessinginstruction(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.onprocessinginstruction(this.sectionStart, this._index);
             this._state = State.Text;
             this.sectionStart = this._index + 1;
         }
@@ -706,7 +695,7 @@ export default class Tokenizer {
         }
     }
 
-    private emitNamedEntity() {
+    private emitNamedEntity(): void {
         if (this.entityResult !== 0) {
             if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) {
                 const first = this.entityTrie[this.entityResult + 1];
@@ -960,7 +949,7 @@ export default class Tokenizer {
         }
     }
 
-    private emitPartial(start: number, length: number) {
+    private emitPartial(start: number, length: number): void {
         if (
             this.baseState !== State.Text &&
             this.baseState !== State.InSpecialTag
@@ -970,7 +959,7 @@ export default class Tokenizer {
             this.cbs.ontext(start, length);
         }
     }
-    private emitCodePoint(cp: number) {
+    private emitCodePoint(cp: number): void {
         if (
             this.baseState !== State.Text &&
             this.baseState !== State.InSpecialTag
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 4951ecd2a..5f641da6b 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -22,7 +22,7 @@ Array [
   Array [
     "onopentagname",
     1,
-    6,
+    7,
   ],
   Array [
     "onselfclosingtag",
@@ -30,7 +30,7 @@ Array [
   Array [
     "onopentagname",
     11,
-    3,
+    14,
   ],
   Array [
     "onopentagend",
@@ -38,7 +38,7 @@ Array [
   Array [
     "onclosetag",
     17,
-    3,
+    20,
   ],
   Array [
     "onend",
@@ -51,7 +51,7 @@ Array [
   Array [
     "onopentagname",
     1,
-    5,
+    6,
   ],
   Array [
     "onselfclosingtag",
@@ -59,7 +59,7 @@ Array [
   Array [
     "onopentagname",
     10,
-    3,
+    13,
   ],
   Array [
     "onopentagend",
@@ -67,7 +67,7 @@ Array [
   Array [
     "onclosetag",
     16,
-    3,
+    19,
   ],
   Array [
     "onend",
@@ -80,7 +80,7 @@ Array [
   Array [
     "onopentagname",
     1,
-    5,
+    6,
   ],
   Array [
     "onselfclosingtag",
@@ -88,7 +88,7 @@ Array [
   Array [
     "onopentagname",
     10,
-    3,
+    13,
   ],
   Array [
     "onopentagend",
@@ -96,7 +96,7 @@ Array [
   Array [
     "onclosetag",
     16,
-    3,
+    19,
   ],
   Array [
     "onend",
@@ -109,7 +109,7 @@ Array [
   Array [
     "onopentagname",
     1,
-    6,
+    7,
   ],
   Array [
     "onopentagend",
@@ -117,12 +117,12 @@ Array [
   Array [
     "onclosetag",
     10,
-    6,
+    16,
   ],
   Array [
     "onopentagname",
     18,
-    3,
+    21,
   ],
   Array [
     "onopentagend",
@@ -130,7 +130,7 @@ Array [
   Array [
     "onclosetag",
     24,
-    3,
+    27,
   ],
   Array [
     "onend",
@@ -143,7 +143,7 @@ Array [
   Array [
     "onopentagname",
     1,
-    5,
+    6,
   ],
   Array [
     "onopentagend",
@@ -151,12 +151,12 @@ Array [
   Array [
     "onclosetag",
     9,
-    5,
+    14,
   ],
   Array [
     "onopentagname",
     16,
-    3,
+    19,
   ],
   Array [
     "onopentagend",
@@ -164,7 +164,7 @@ Array [
   Array [
     "onclosetag",
     22,
-    3,
+    25,
   ],
   Array [
     "onend",
@@ -177,7 +177,7 @@ Array [
   Array [
     "onopentagname",
     1,
-    5,
+    6,
   ],
   Array [
     "onopentagend",
@@ -185,12 +185,12 @@ Array [
   Array [
     "onclosetag",
     9,
-    5,
+    14,
   ],
   Array [
     "onopentagname",
     16,
-    3,
+    19,
   ],
   Array [
     "onopentagend",
@@ -198,7 +198,7 @@ Array [
   Array [
     "onclosetag",
     22,
-    3,
+    25,
   ],
   Array [
     "onend",

From 397ef7865b9a2406ae093d831ff9c3ba96f05645 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 21:10:15 +0000
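Subject: [PATCH 4/7] Decode numeric entities on the fly

Accumulate the code point digit by digit in `entityResult` while
tokenizing, instead of slicing the buffer and calling `parseInt` once
the entity ends.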
---
 src/Tokenizer.ts | 52 ++++++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index b0b65b914..52944ee26 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -104,6 +104,13 @@ function isASCIIAlpha(c: number): boolean {
     );
 }
 
+function isHexDigit(c: number): boolean {
+    return (
+        (c >= CharCodes.UpperA && c <= CharCodes.UpperF) ||
+        (c >= CharCodes.LowerA && c <= CharCodes.LowerF)
+    );
+}
+
 export enum QuoteType {
     NoValue = 0,
     Unquoted = 1,
@@ -629,6 +636,7 @@ export default class Tokenizer {
 
     private trieIndex = 0;
     private trieCurrent = 0;
+    /** For named entities, the index of the value. For numeric entities, the code point. */
     private entityResult = 0;
     private entityExcess = 0;
 
@@ -728,9 +736,10 @@ export default class Tokenizer {
         }
     }
 
-    private decodeNumericEntity(base: 10 | 16, strict: boolean) {
+    private emitNumericEntity(strict: boolean) {
         const entityStart = this._index - this.entityExcess - 1;
-        const numberStart = entityStart + 2 + (base >> 4);
+        const numberStart =
+            entityStart + 2 + Number(this._state === State.InHexEntity);
 
         if (numberStart !== this._index) {
             // Emit leading data if any
@@ -741,44 +750,43 @@ export default class Tokenizer {
                 );
             }
 
-            // Parse entity
-            const entity = this.buffer.substring(numberStart, this._index);
-            const parsed = parseInt(entity, base);
-            this.emitCodePoint(parsed);
+            this.emitCodePoint(this.entityResult);
             this.sectionStart = this._index + Number(strict);
         }
         this._state = this.baseState;
     }
     private stateInNumericEntity(c: number): void {
         if (c === CharCodes.Semi) {
-            this.decodeNumericEntity(10, true);
-        } else if (!isNumber(c)) {
+            this.emitNumericEntity(true);
+        } else if (isNumber(c)) {
+            this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero);
+            this.entityExcess++;
+        } else {
             if (this.allowLegacyEntity()) {
-                this.decodeNumericEntity(10, false);
+                this.emitNumericEntity(false);
             } else {
                 this._state = this.baseState;
             }
             this._index--;
-        } else {
-            this.entityExcess++;
         }
     }
     private stateInHexEntity(c: number): void {
         if (c === CharCodes.Semi) {
-            this.decodeNumericEntity(16, true);
-        } else if (
-            (c < CharCodes.LowerA || c > CharCodes.LowerF) &&
-            (c < CharCodes.UpperA || c > CharCodes.UpperF) &&
-            !isNumber(c)
-        ) {
+            this.emitNumericEntity(true);
+        } else if (isNumber(c)) {
+            this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero);
+            this.entityExcess++;
+        } else if (isHexDigit(c)) {
+            this.entityResult =
+                this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10);
+            this.entityExcess++;
+        } else {
             if (this.allowLegacyEntity()) {
-                this.decodeNumericEntity(16, false);
+                this.emitNumericEntity(false);
             } else {
                 this._state = this.baseState;
             }
             this._index--;
-        } else {
-            this.entityExcess++;
         }
     }
 
@@ -921,13 +929,13 @@ export default class Tokenizer {
             this._state === State.InNumericEntity &&
             this.allowLegacyEntity()
         ) {
-            this.decodeNumericEntity(10, false);
+            this.emitNumericEntity(false);
             // All trailing data will have been consumed
         } else if (
             this._state === State.InHexEntity &&
             this.allowLegacyEntity()
         ) {
-            this.decodeNumericEntity(16, false);
+            this.emitNumericEntity(false);
             // All trailing data will have been consumed
         } else if (
             this._state === State.InTagName ||

From f30745478018befe9e83c32f95be1cb8d7a42160 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 21:24:51 +0000
Subject: [PATCH 5/7] Pass `endIndex` for all callbacks

---
 src/Parser.ts                            | 65 +++++++++--------
 src/Tokenizer.ts                         | 88 ++++++++----------------
 src/__snapshots__/Tokenizer.spec.ts.snap | 14 +++-
 3 files changed, 72 insertions(+), 95 deletions(-)

diff --git a/src/Parser.ts b/src/Parser.ts
index 2dcffd0a1..da4744a2d 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -236,12 +236,11 @@ export class Parser implements Callbacks {
     // Tokenizer event handlers
 
     /** @internal */
-    ontext(start: number, length: number): void {
-        const data = this.getSubstr(start, length);
-        const idx = start + length;
-        this.endIndex = idx - 1;
+    ontext(start: number, endIndex: number): void {
+        const data = this.getSlice(start, endIndex);
+        this.endIndex = endIndex - 1;
         this.cbs.ontext?.(data);
-        this.startIndex = idx;
+        this.startIndex = endIndex;
     }
 
     /** @internal */
@@ -260,7 +259,7 @@ export class Parser implements Callbacks {
     onopentagname(start: number, endIndex: number): void {
         this.endIndex = endIndex;
 
-        let name = this.getSubstr(start, endIndex - start);
+        let name = this.getSlice(start, endIndex);
 
         if (this.lowerCaseTagNames) {
             name = name.toLowerCase();
@@ -299,7 +298,6 @@ export class Parser implements Callbacks {
 
     private endOpenTag(isImplied: boolean) {
         this.startIndex = this.openTagStart;
-        this.endIndex = this.tokenizer.getIndex();
 
         if (this.attribs) {
             this.cbs.onopentag?.(this.tagname, this.attribs, isImplied);
@@ -313,18 +311,19 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    onopentagend(): void {
+    onopentagend(endIndex: number): void {
+        this.endIndex = endIndex;
         this.endOpenTag(false);
 
         // Set `startIndex` for next node
-        this.startIndex = this.endIndex + 1;
+        this.startIndex = endIndex + 1;
     }
 
     /** @internal */
     onclosetag(start: number, endIndex: number): void {
         this.endIndex = endIndex;
 
-        let name = this.getSubstr(start, endIndex - start);
+        let name = this.getSlice(start, endIndex);
 
         if (this.lowerCaseTagNames) {
             name = name.toLowerCase();
@@ -359,11 +358,12 @@ export class Parser implements Callbacks {
         }
 
         // Set `startIndex` for next node
-        this.startIndex = this.endIndex + 1;
+        this.startIndex = endIndex + 1;
     }
 
     /** @internal */
-    onselfclosingtag(): void {
+    onselfclosingtag(endIndex: number): void {
+        this.endIndex = endIndex;
         if (
             this.options.xmlMode ||
             this.options.recognizeSelfClosing ||
@@ -372,10 +372,10 @@ export class Parser implements Callbacks {
             this.closeCurrentTag(false);
 
             // Set `startIndex` for next node
-            this.startIndex = this.endIndex + 1;
+            this.startIndex = endIndex + 1;
         } else {
             // Ignore the fact that the tag is self-closing.
-            this.onopentagend();
+            this.onopentagend(endIndex);
         }
     }
 
@@ -392,9 +392,9 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    onattribname(start: number, length: number): void {
+    onattribname(start: number, endIndex: number): void {
         this.startIndex = start;
-        const name = this.getSubstr(start, length);
+        const name = this.getSlice(start, endIndex);
 
         this.attribname = this.lowerCaseAttributeNames
             ? name.toLowerCase()
@@ -402,8 +402,8 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    onattribdata(start: number, length: number): void {
-        this.attribvalue += this.getSubstr(start, length);
+    onattribdata(start: number, endIndex: number): void {
+        this.attribvalue += this.getSlice(start, endIndex);
     }
 
     /** @internal */
@@ -449,7 +449,7 @@ export class Parser implements Callbacks {
     /** @internal */
     ondeclaration(start: number, endIndex: number): void {
         this.endIndex = endIndex;
-        const value = this.getSubstr(start, endIndex - start);
+        const value = this.getSlice(start, endIndex);
 
         if (this.cbs.onprocessinginstruction) {
             const name = this.getInstructionName(value);
@@ -457,13 +457,13 @@ export class Parser implements Callbacks {
         }
 
         // Set `startIndex` for next node
-        this.startIndex = this.endIndex + 1;
+        this.startIndex = endIndex + 1;
     }
 
     /** @internal */
     onprocessinginstruction(start: number, endIndex: number): void {
         this.endIndex = endIndex;
-        const value = this.getSubstr(start, endIndex - start);
+        const value = this.getSlice(start, endIndex);
 
         if (this.cbs.onprocessinginstruction) {
             const name = this.getInstructionName(value);
@@ -471,25 +471,24 @@ export class Parser implements Callbacks {
         }
 
         // Set `startIndex` for next node
-        this.startIndex = this.endIndex + 1;
+        this.startIndex = endIndex + 1;
     }
 
     /** @internal */
-    oncomment(start: number, length: number): void {
-        this.endIndex = this.tokenizer.getIndex();
-        const value = this.getSubstr(start, length);
+    oncomment(start: number, endIndex: number, offset: number): void {
+        this.endIndex = endIndex;
 
-        this.cbs.oncomment?.(value);
+        this.cbs.oncomment?.(this.getSlice(start, endIndex - offset));
         this.cbs.oncommentend?.();
 
         // Set `startIndex` for next node
-        this.startIndex = this.endIndex + 1;
+        this.startIndex = endIndex + 1;
     }
 
     /** @internal */
-    oncdata(start: number, length: number): void {
-        this.endIndex = this.tokenizer.getIndex();
-        const value = this.getSubstr(start, length);
+    oncdata(start: number, endIndex: number, offset: number): void {
+        this.endIndex = endIndex;
+        const value = this.getSlice(start, endIndex - offset);
 
         if (this.options.xmlMode || this.options.recognizeCDATA) {
             this.cbs.oncdatastart?.();
@@ -501,7 +500,7 @@ export class Parser implements Callbacks {
         }
 
         // Set `startIndex` for next node
-        this.startIndex = this.endIndex + 1;
+        this.startIndex = endIndex + 1;
     }
 
     /** @internal */
@@ -552,8 +551,8 @@ export class Parser implements Callbacks {
 
     private buffer = "";
 
-    private getSubstr(start: number, length: number) {
-        return this.buffer.substr(start, length);
+    private getSlice(start: number, end: number) {
+        return this.buffer.slice(start, end);
     }
 
     /**
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 52944ee26..901b20dd6 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -119,21 +119,21 @@ export enum QuoteType {
 }
 
 export interface Callbacks {
-    onattribdata(start: number, length: number): void;
+    onattribdata(start: number, endIndex: number): void;
     onattribentity(codepoint: number): void;
     onattribend(quote: QuoteType, endIndex: number): void;
-    onattribname(start: number, length: number): void;
-    oncdata(start: number, length: number): void;
+    onattribname(start: number, endIndex: number): void;
+    oncdata(start: number, endIndex: number, endOffset: number): void;
     onclosetag(start: number, endIndex: number): void;
-    oncomment(start: number, length: number): void;
+    oncomment(start: number, endIndex: number, endOffset: number): void;
     ondeclaration(start: number, endIndex: number): void;
     onend(): void;
     onerror(error: Error, state?: State): void;
-    onopentagend(): void;
+    onopentagend(endIndex: number): void;
     onopentagname(start: number, endIndex: number): void;
     onprocessinginstruction(start: number, endIndex: number): void;
-    onselfclosingtag(): void;
-    ontext(start: number, length: number): void;
+    onselfclosingtag(endIndex: number): void;
+    ontext(start: number, endIndex: number): void;
     ontextentity(codepoint: number): void;
 }
 
@@ -237,10 +237,7 @@ export default class Tokenizer {
             (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
         ) {
             if (this._index > this.sectionStart) {
-                this.cbs.ontext(
-                    this.sectionStart,
-                    this._index - this.sectionStart
-                );
+                this.cbs.ontext(this.sectionStart, this._index);
             }
             this._state = State.BeforeTagName;
             this.sectionStart = this._index;
@@ -281,10 +278,7 @@ export default class Tokenizer {
                     // Spoof the index so that reported locations match up.
                     const actualIndex = this._index;
                     this._index = endOfText;
-                    this.cbs.ontext(
-                        this.sectionStart,
-                        endOfText - this.sectionStart
-                    );
+                    this.cbs.ontext(this.sectionStart, endOfText);
                     this._index = actualIndex;
                 }
 
@@ -365,13 +359,10 @@ export default class Tokenizer {
     private stateInCommentLike(c: number): void {
         if (c === this.currentSequence[this.sequenceIndex]) {
             if (++this.sequenceIndex === this.currentSequence.length) {
-                // Remove 2 trailing chars
-                const length = this._index - 2 - this.sectionStart;
-
                 if (this.currentSequence === Sequences.CdataEnd) {
-                    this.cbs.oncdata(this.sectionStart, length);
+                    this.cbs.oncdata(this.sectionStart, this._index, 2);
                 } else {
-                    this.cbs.oncomment(this.sectionStart, length);
+                    this.cbs.oncomment(this.sectionStart, this._index, 2);
                 }
 
                 this.sequenceIndex = 0;
@@ -468,7 +459,7 @@ export default class Tokenizer {
     }
     private stateBeforeAttributeName(c: number): void {
         if (c === CharCodes.Gt) {
-            this.cbs.onopentagend();
+            this.cbs.onopentagend(this._index);
             if (this.isSpecial) {
                 this._state = State.InSpecialTag;
                 this.sequenceIndex = 0;
@@ -486,7 +477,7 @@ export default class Tokenizer {
     }
     private stateInSelfClosingTag(c: number): void {
         if (c === CharCodes.Gt) {
-            this.cbs.onselfclosingtag();
+            this.cbs.onselfclosingtag(this._index);
             this._state = State.Text;
             this.baseState = State.Text;
             this.sectionStart = this._index + 1;
@@ -498,10 +489,7 @@ export default class Tokenizer {
     }
     private stateInAttributeName(c: number): void {
         if (c === CharCodes.Eq || isEndOfTagSection(c)) {
-            this.cbs.onattribname(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.onattribname(this.sectionStart, this._index);
             this.sectionStart = -1;
             this._state = State.AfterAttributeName;
             this.stateAfterAttributeName(c);
@@ -538,10 +526,7 @@ export default class Tokenizer {
             c === quote ||
             (!this.decodeEntities && this.fastForwardTo(quote))
         ) {
-            this.cbs.onattribdata(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.onattribdata(this.sectionStart, this._index);
             this.sectionStart = -1;
             this.cbs.onattribend(
                 quote === CharCodes.DoubleQuote
@@ -563,10 +548,7 @@ export default class Tokenizer {
     }
     private stateInAttributeValueNoQuotes(c: number): void {
         if (isWhitespace(c) || c === CharCodes.Gt) {
-            this.cbs.onattribdata(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.onattribdata(this.sectionStart, this._index);
             this.sectionStart = -1;
             this.cbs.onattribend(QuoteType.Unquoted, this._index);
             this._state = State.BeforeAttributeName;
@@ -614,10 +596,7 @@ export default class Tokenizer {
     }
     private stateInSpecialComment(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.oncomment(
-                this.sectionStart,
-                this._index - this.sectionStart
-            );
+            this.cbs.oncomment(this.sectionStart, this._index, 0);
             this._state = State.Text;
             this.sectionStart = this._index + 1;
         }
@@ -686,10 +665,7 @@ export default class Tokenizer {
                 const entityStart = this._index - this.entityExcess + 1;
 
                 if (entityStart > this.sectionStart) {
-                    this.emitPartial(
-                        this.sectionStart,
-                        entityStart - this.sectionStart
-                    );
+                    this.emitPartial(this.sectionStart, entityStart);
                 }
 
                 // If this is a surrogate pair, consume the next two bytes
@@ -744,10 +720,7 @@ export default class Tokenizer {
         if (numberStart !== this._index) {
             // Emit leading data if any
             if (entityStart > this.sectionStart) {
-                this.emitPartial(
-                    this.sectionStart,
-                    entityStart - this.sectionStart
-                );
+                this.emitPartial(this.sectionStart, entityStart);
             }
 
             this.emitCodePoint(this.entityResult);
@@ -808,20 +781,14 @@ export default class Tokenizer {
                 this._state === State.Text ||
                 (this._state === State.InSpecialTag && this.sequenceIndex === 0)
             ) {
-                this.cbs.ontext(
-                    this.sectionStart,
-                    this._index - this.sectionStart
-                );
+                this.cbs.ontext(this.sectionStart, this._index);
                 this.sectionStart = this._index;
             } else if (
                 this._state === State.InAttributeValueDq ||
                 this._state === State.InAttributeValueSq ||
                 this._state === State.InAttributeValueNq
             ) {
-                this.cbs.onattribdata(
-                    this.sectionStart,
-                    this._index - this.sectionStart
-                );
+                this.cbs.onattribdata(this.sectionStart, this._index);
                 this.sectionStart = this._index;
             }
         }
@@ -918,12 +885,11 @@ export default class Tokenizer {
 
     /** Handle any trailing data. */
     private handleTrailingData() {
-        const remaining = this.buffer.length - this.sectionStart;
         if (this._state === State.InCommentLike) {
             if (this.currentSequence === Sequences.CdataEnd) {
-                this.cbs.oncdata(this.sectionStart, remaining);
+                this.cbs.oncdata(this.sectionStart, this.buffer.length, 0);
             } else {
-                this.cbs.oncomment(this.sectionStart, remaining);
+                this.cbs.oncomment(this.sectionStart, this.buffer.length, 0);
             }
         } else if (
             this._state === State.InNumericEntity &&
@@ -953,18 +919,18 @@ export default class Tokenizer {
              * respective callback signals that the tag should be ignored.
              */
         } else {
-            this.cbs.ontext(this.sectionStart, remaining);
+            this.cbs.ontext(this.sectionStart, this.buffer.length);
         }
     }
 
-    private emitPartial(start: number, length: number): void {
+    private emitPartial(start: number, endIndex: number): void {
         if (
             this.baseState !== State.Text &&
             this.baseState !== State.InSpecialTag
         ) {
-            this.cbs.onattribdata(start, length);
+            this.cbs.onattribdata(start, endIndex);
         } else {
-            this.cbs.ontext(start, length);
+            this.cbs.ontext(start, endIndex);
         }
     }
     private emitCodePoint(cp: number): void {
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 5f641da6b..91d8ecf27 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -9,7 +9,7 @@ Array [
   Array [
     "ontext",
     5,
-    7,
+    12,
   ],
   Array [
     "onend",
@@ -26,6 +26,7 @@ Array [
   ],
   Array [
     "onselfclosingtag",
+    9,
   ],
   Array [
     "onopentagname",
@@ -34,6 +35,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    14,
   ],
   Array [
     "onclosetag",
@@ -55,6 +57,7 @@ Array [
   ],
   Array [
     "onselfclosingtag",
+    8,
   ],
   Array [
     "onopentagname",
@@ -63,6 +66,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    13,
   ],
   Array [
     "onclosetag",
@@ -84,6 +88,7 @@ Array [
   ],
   Array [
     "onselfclosingtag",
+    8,
   ],
   Array [
     "onopentagname",
@@ -92,6 +97,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    13,
   ],
   Array [
     "onclosetag",
@@ -113,6 +119,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    7,
   ],
   Array [
     "onclosetag",
@@ -126,6 +133,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    21,
   ],
   Array [
     "onclosetag",
@@ -147,6 +155,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    6,
   ],
   Array [
     "onclosetag",
@@ -160,6 +169,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    19,
   ],
   Array [
     "onclosetag",
@@ -181,6 +191,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    6,
   ],
   Array [
     "onclosetag",
@@ -194,6 +205,7 @@ Array [
   ],
   Array [
     "onopentagend",
+    19,
   ],
   Array [
     "onclosetag",

From ca09b29d5db8a09cb26a5850f1fde1208414cbd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 22:14:19 +0000
Subject: [PATCH 6/7] Don't concat strings

Also, only pass a single string at a time to the tokenizer.
---
 src/Parser.spec.ts |  4 +--
 src/Parser.ts      | 74 +++++++++++++++++++++++++++++++++++++---------
 src/Tokenizer.ts   | 40 +++++++++++--------------
 3 files changed, 79 insertions(+), 39 deletions(-)

diff --git a/src/Parser.spec.ts b/src/Parser.spec.ts
index e546c5da7..88fb2bde9 100644
--- a/src/Parser.spec.ts
+++ b/src/Parser.spec.ts
@@ -48,11 +48,11 @@ describe("API", () => {
         p.resume();
         expect(onText).toHaveBeenCalledTimes(1);
         p.pause();
-        p.end("foo");
+        p.end("bar");
         expect(onText).toHaveBeenCalledTimes(1);
         p.resume();
         expect(onText).toHaveBeenCalledTimes(2);
-        expect(onText).toHaveBeenLastCalledWith("foo");
+        expect(onText).toHaveBeenLastCalledWith("bar");
     });
 
     test("should back out of numeric entities (#125)", () => {
diff --git a/src/Parser.ts b/src/Parser.ts
index da4744a2d..2339f9e83 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -218,6 +218,13 @@ export class Parser implements Callbacks {
     private readonly lowerCaseAttributeNames: boolean;
     private readonly tokenizer: Tokenizer;
 
+    private readonly buffers: string[] = [];
+    private bufferOffset = 0;
+    /** The index of the next buffer to pass to the tokenizer. Used when resuming after a `pause()`. */
+    private writeIndex = 0;
+    /** Indicates whether the parser has finished running / `.end` has been called. */
+    private ended = false;
+
     constructor(
         cbs?: Partial<Handler> | null,
         private readonly options: ParserOptions = {}
@@ -503,11 +510,6 @@ export class Parser implements Callbacks {
         this.startIndex = endIndex + 1;
     }
 
-    /** @internal */
-    onerror(err: Error): void {
-        this.cbs.onerror?.(err);
-    }
-
     /** @internal */
     onend(): void {
         if (this.cbs.onclosetag) {
@@ -531,11 +533,14 @@ export class Parser implements Callbacks {
         this.tagname = "";
         this.attribname = "";
         this.attribs = null;
-        this.stack = [];
+        this.stack.length = 0;
         this.startIndex = 0;
         this.endIndex = 0;
         this.cbs.onparserinit?.(this);
-        this.buffer = "";
+        this.buffers.length = 0;
+        this.bufferOffset = 0;
+        this.writeIndex = 0;
+        this.ended = false;
     }
 
     /**
@@ -549,10 +554,28 @@ export class Parser implements Callbacks {
         this.end(data);
     }
 
-    private buffer = "";
-
     private getSlice(start: number, end: number) {
-        return this.buffer.slice(start, end);
+        while (start - this.bufferOffset >= this.buffers[0].length) {
+            this.shiftBuffer();
+        }
+
+        let str = this.buffers[0].slice(
+            start - this.bufferOffset,
+            end - this.bufferOffset
+        );
+
+        while (end - this.bufferOffset > this.buffers[0].length) {
+            this.shiftBuffer();
+            str += this.buffers[0].slice(0, end - this.bufferOffset);
+        }
+
+        return str;
+    }
+
+    private shiftBuffer(): void {
+        this.bufferOffset += this.buffers[0].length;
+        this.writeIndex--;
+        this.buffers.shift();
     }
 
     /**
@@ -561,8 +584,16 @@ export class Parser implements Callbacks {
      * @param chunk Chunk to parse.
      */
     public write(chunk: string): void {
-        this.buffer += chunk;
-        this.tokenizer.write(chunk);
+        if (this.ended) {
+            this.cbs.onerror?.(new Error(".write() after done!"));
+            return;
+        }
+
+        this.buffers.push(chunk);
+        if (this.tokenizer.running) {
+            this.tokenizer.write(chunk);
+            this.writeIndex++;
+        }
     }
 
     /**
@@ -571,8 +602,14 @@ export class Parser implements Callbacks {
      * @param chunk Optional final chunk to parse.
      */
     public end(chunk?: string): void {
-        if (chunk) this.buffer += chunk;
-        this.tokenizer.end(chunk);
+        if (this.ended) {
+            this.cbs.onerror?.(Error(".end() after done!"));
+            return;
+        }
+
+        if (chunk) this.write(chunk);
+        this.ended = true;
+        this.tokenizer.end();
     }
 
     /**
@@ -587,6 +624,15 @@ export class Parser implements Callbacks {
      */
     public resume(): void {
         this.tokenizer.resume();
+
+        while (
+            this.tokenizer.running &&
+            this.writeIndex < this.buffers.length
+        ) {
+            this.tokenizer.write(this.buffers[this.writeIndex++]);
+        }
+
+        if (this.ended) this.tokenizer.end();
     }
 
     /**
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 901b20dd6..543b93514 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -128,7 +128,6 @@ export interface Callbacks {
     oncomment(start: number, endIndex: number, endOffset: number): void;
     ondeclaration(start: number, endIndex: number): void;
     onend(): void;
-    onerror(error: Error, state?: State): void;
     onopentagend(endIndex: number): void;
     onopentagname(start: number, endIndex: number): void;
     onprocessinginstruction(start: number, endIndex: number): void;
@@ -166,9 +165,9 @@ export default class Tokenizer {
     /** For special parsing behavior inside of script and style tags. */
     private isSpecial = false;
     /** Indicates whether the tokenizer has been paused. */
-    private running = true;
-    /** Indicates whether the tokenizer has finished running / `.end` has been called. */
-    private ended = false;
+    public running = true;
+    /** The offset of the current buffer within all of the written data. */
+    private offset = 0;
 
     private readonly xmlMode: boolean;
     private readonly decodeEntities: boolean;
@@ -194,19 +193,16 @@ export default class Tokenizer {
         this.baseState = State.Text;
         this.currentSequence = undefined!;
         this.running = true;
-        this.ended = false;
+        this.offset = 0;
     }
 
     public write(chunk: string): void {
-        if (this.ended) return this.cbs.onerror(Error(".write() after done!"));
-        this.buffer += chunk;
+        this.offset += this.buffer.length;
+        this.buffer = chunk;
         this.parse();
     }
 
-    public end(chunk?: string): void {
-        if (this.ended) return this.cbs.onerror(Error(".end() after done!"));
-        if (chunk) this.write(chunk);
-        this.ended = true;
+    public end(): void {
         if (this.running) this.finish();
     }
 
@@ -216,12 +212,9 @@ export default class Tokenizer {
 
     public resume(): void {
         this.running = true;
-        if (this._index < this.buffer.length) {
+        if (this._index < this.buffer.length + this.offset) {
             this.parse();
         }
-        if (this.ended) {
-            this.finish();
-        }
     }
 
     /**
@@ -331,8 +324,8 @@ export default class Tokenizer {
      * @returns Whether the character was found.
      */
     private fastForwardTo(c: number): boolean {
-        while (++this._index < this.buffer.length) {
-            if (this.buffer.charCodeAt(this._index) === c) {
+        while (++this._index < this.buffer.length + this.offset) {
+            if (this.buffer.charCodeAt(this._index - this.offset) === c) {
                 return true;
             }
         }
@@ -343,7 +336,7 @@ export default class Tokenizer {
          *
          * TODO: Refactor `parse` to increment index before calling states.
          */
-        this._index = this.buffer.length - 1;
+        this._index = this.buffer.length + this.offset - 1;
 
         return false;
     }
@@ -795,7 +788,7 @@ export default class Tokenizer {
     }
 
     private shouldContinue() {
-        return this._index < this.buffer.length && this.running;
+        return this._index < this.buffer.length + this.offset && this.running;
     }
 
     /**
@@ -805,7 +798,7 @@ export default class Tokenizer {
      */
     private parse() {
         while (this.shouldContinue()) {
-            const c = this.buffer.charCodeAt(this._index);
+            const c = this.buffer.charCodeAt(this._index - this.offset);
             if (this._state === State.Text) {
                 this.stateText(c);
             } else if (this._state === State.SpecialStartSequence) {
@@ -885,11 +878,12 @@ export default class Tokenizer {
 
     /** Handle any trailing data. */
     private handleTrailingData() {
+        const endIndex = this.buffer.length + this.offset;
         if (this._state === State.InCommentLike) {
             if (this.currentSequence === Sequences.CdataEnd) {
-                this.cbs.oncdata(this.sectionStart, this.buffer.length, 0);
+                this.cbs.oncdata(this.sectionStart, endIndex, 0);
             } else {
-                this.cbs.oncomment(this.sectionStart, this.buffer.length, 0);
+                this.cbs.oncomment(this.sectionStart, endIndex, 0);
             }
         } else if (
             this._state === State.InNumericEntity &&
@@ -919,7 +913,7 @@ export default class Tokenizer {
              * respective callback signals that the tag should be ignored.
              */
         } else {
-            this.cbs.ontext(this.sectionStart, this.buffer.length);
+            this.cbs.ontext(this.sectionStart, endIndex);
         }
     }
 

From 09ac7e188db9722d83b6564b518456351b5b54e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com>
Date: Tue, 14 Dec 2021 22:16:50 +0000
Subject: [PATCH 7/7] Remove `_` prefix from tokenizer private props

---
 src/Tokenizer.ts | 336 +++++++++++++++++++++++------------------------
 1 file changed, 168 insertions(+), 168 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 543b93514..93643215c 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -153,13 +153,13 @@ const Sequences = {
 
 export default class Tokenizer {
     /** The current state the tokenizer is in. */
-    private _state = State.Text;
+    private state = State.Text;
     /** The read buffer. */
     private buffer = "";
     /** The beginning of the section that is currently being read. */
     public sectionStart = 0;
     /** The index within the buffer that we are currently looking at. */
-    private _index = 0;
+    private index = 0;
     /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
     private baseState = State.Text;
     /** For special parsing behavior inside of script and style tags. */
@@ -186,10 +186,10 @@ export default class Tokenizer {
     }
 
     public reset(): void {
-        this._state = State.Text;
+        this.state = State.Text;
         this.buffer = "";
         this.sectionStart = 0;
-        this._index = 0;
+        this.index = 0;
         this.baseState = State.Text;
         this.currentSequence = undefined!;
         this.running = true;
@@ -212,7 +212,7 @@ export default class Tokenizer {
 
     public resume(): void {
         this.running = true;
-        if (this._index < this.buffer.length + this.offset) {
+        if (this.index < this.buffer.length + this.offset) {
             this.parse();
         }
     }
@@ -221,7 +221,7 @@ export default class Tokenizer {
      * The current index within all of the written data.
      */
     public getIndex(): number {
-        return this._index;
+        return this.index;
     }
 
     private stateText(c: number): void {
@@ -229,13 +229,13 @@ export default class Tokenizer {
             c === CharCodes.Lt ||
             (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))
         ) {
-            if (this._index > this.sectionStart) {
-                this.cbs.ontext(this.sectionStart, this._index);
+            if (this.index > this.sectionStart) {
+                this.cbs.ontext(this.sectionStart, this.index);
             }
-            this._state = State.BeforeTagName;
-            this.sectionStart = this._index;
+            this.state = State.BeforeTagName;
+            this.sectionStart = this.index;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this._state = State.BeforeEntity;
+            this.state = State.BeforeEntity;
         }
     }
 
@@ -257,7 +257,7 @@ export default class Tokenizer {
         }
 
         this.sequenceIndex = 0;
-        this._state = State.InTagName;
+        this.state = State.InTagName;
         this.stateInTagName(c);
     }
 
@@ -265,14 +265,14 @@ export default class Tokenizer {
     private stateInSpecialTag(c: number): void {
         if (this.sequenceIndex === this.currentSequence.length) {
             if (c === CharCodes.Gt || isWhitespace(c)) {
-                const endOfText = this._index - this.currentSequence.length;
+                const endOfText = this.index - this.currentSequence.length;
 
                 if (this.sectionStart < endOfText) {
                     // Spoof the index so that reported locations match up.
-                    const actualIndex = this._index;
-                    this._index = endOfText;
+                    const actualIndex = this.index;
+                    this.index = endOfText;
                     this.cbs.ontext(this.sectionStart, endOfText);
-                    this._index = actualIndex;
+                    this.index = actualIndex;
                 }
 
                 this.isSpecial = false;
@@ -290,7 +290,7 @@ export default class Tokenizer {
             if (this.currentSequence === Sequences.TitleEnd) {
                 // We have to parse entities in <title> tags.
                 if (this.decodeEntities && c === CharCodes.Amp) {
-                    this._state = State.BeforeEntity;
+                    this.state = State.BeforeEntity;
                 }
             } else if (this.fastForwardTo(CharCodes.Lt)) {
                 // Outside of <title> tags, we can fast-forward.
@@ -305,14 +305,14 @@ export default class Tokenizer {
     private stateCDATASequence(c: number): void {
         if (c === Sequences.Cdata[this.sequenceIndex]) {
             if (++this.sequenceIndex === Sequences.Cdata.length) {
-                this._state = State.InCommentLike;
+                this.state = State.InCommentLike;
                 this.currentSequence = Sequences.CdataEnd;
                 this.sequenceIndex = 0;
-                this.sectionStart = this._index + 1;
+                this.sectionStart = this.index + 1;
             }
         } else {
             this.sequenceIndex = 0;
-            this._state = State.InDeclaration;
+            this.state = State.InDeclaration;
             this.stateInDeclaration(c); // Reconsume the character
         }
     }
@@ -324,8 +324,8 @@ export default class Tokenizer {
      * @returns Whether the character was found.
      */
     private fastForwardTo(c: number): boolean {
-        while (++this._index < this.buffer.length + this.offset) {
-            if (this.buffer.charCodeAt(this._index - this.offset) === c) {
+        while (++this.index < this.buffer.length + this.offset) {
+            if (this.buffer.charCodeAt(this.index - this.offset) === c) {
                 return true;
             }
         }
@@ -336,7 +336,7 @@ export default class Tokenizer {
          *
          * TODO: Refactor `parse` to increment index before calling states.
          */
-        this._index = this.buffer.length + this.offset - 1;
+        this.index = this.buffer.length + this.offset - 1;
 
         return false;
     }
@@ -353,14 +353,14 @@ export default class Tokenizer {
         if (c === this.currentSequence[this.sequenceIndex]) {
             if (++this.sequenceIndex === this.currentSequence.length) {
                 if (this.currentSequence === Sequences.CdataEnd) {
-                    this.cbs.oncdata(this.sectionStart, this._index, 2);
+                    this.cbs.oncdata(this.sectionStart, this.index, 2);
                 } else {
-                    this.cbs.oncomment(this.sectionStart, this._index, 2);
+                    this.cbs.oncomment(this.sectionStart, this.index, 2);
                 }
 
                 this.sequenceIndex = 0;
-                this.sectionStart = this._index + 1;
-                this._state = State.Text;
+                this.sectionStart = this.index + 1;
+                this.state = State.Text;
             }
         } else if (this.sequenceIndex === 0) {
             // Fast-forward to the first character of the sequence
@@ -387,39 +387,39 @@ export default class Tokenizer {
         this.isSpecial = true;
         this.currentSequence = sequence;
         this.sequenceIndex = offset;
-        this._state = State.SpecialStartSequence;
+        this.state = State.SpecialStartSequence;
     }
 
     private stateBeforeTagName(c: number): void {
         if (c === CharCodes.ExclamationMark) {
-            this._state = State.BeforeDeclaration;
-            this.sectionStart = this._index + 1;
+            this.state = State.BeforeDeclaration;
+            this.sectionStart = this.index + 1;
         } else if (c === CharCodes.Questionmark) {
-            this._state = State.InProcessingInstruction;
-            this.sectionStart = this._index + 1;
+            this.state = State.InProcessingInstruction;
+            this.sectionStart = this.index + 1;
         } else if (this.isTagStartChar(c)) {
             const lower = c | 0x20;
-            this.sectionStart = this._index;
+            this.sectionStart = this.index;
             if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
                 this.startSpecial(Sequences.TitleEnd, 3);
             } else {
-                this._state =
+                this.state =
                     !this.xmlMode && lower === Sequences.ScriptEnd[2]
                         ? State.BeforeSpecialS
                         : State.InTagName;
             }
         } else if (c === CharCodes.Slash) {
-            this._state = State.BeforeClosingTagName;
+            this.state = State.BeforeClosingTagName;
         } else {
-            this._state = State.Text;
+            this.state = State.Text;
             this.stateText(c);
         }
     }
     private stateInTagName(c: number): void {
         if (isEndOfTagSection(c)) {
-            this.cbs.onopentagname(this.sectionStart, this._index);
+            this.cbs.onopentagname(this.sectionStart, this.index);
             this.sectionStart = -1;
-            this._state = State.BeforeAttributeName;
+            this.state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         }
     }
@@ -427,90 +427,90 @@ export default class Tokenizer {
         if (isWhitespace(c)) {
             // Ignore
         } else if (c === CharCodes.Gt) {
-            this._state = State.Text;
+            this.state = State.Text;
         } else {
-            this._state = this.isTagStartChar(c)
+            this.state = this.isTagStartChar(c)
                 ? State.InClosingTagName
                 : State.InSpecialComment;
-            this.sectionStart = this._index;
+            this.sectionStart = this.index;
         }
     }
     private stateInClosingTagName(c: number): void {
         if (c === CharCodes.Gt || isWhitespace(c)) {
-            this.cbs.onclosetag(this.sectionStart, this._index);
+            this.cbs.onclosetag(this.sectionStart, this.index);
             this.sectionStart = -1;
-            this._state = State.AfterClosingTagName;
+            this.state = State.AfterClosingTagName;
             this.stateAfterClosingTagName(c);
         }
     }
     private stateAfterClosingTagName(c: number): void {
         // Skip everything until ">"
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this._state = State.Text;
-            this.sectionStart = this._index + 1;
+            this.state = State.Text;
+            this.sectionStart = this.index + 1;
         }
     }
     private stateBeforeAttributeName(c: number): void {
         if (c === CharCodes.Gt) {
-            this.cbs.onopentagend(this._index);
+            this.cbs.onopentagend(this.index);
             if (this.isSpecial) {
-                this._state = State.InSpecialTag;
+                this.state = State.InSpecialTag;
                 this.sequenceIndex = 0;
             } else {
-                this._state = State.Text;
+                this.state = State.Text;
             }
-            this.baseState = this._state;
-            this.sectionStart = this._index + 1;
+            this.baseState = this.state;
+            this.sectionStart = this.index + 1;
         } else if (c === CharCodes.Slash) {
-            this._state = State.InSelfClosingTag;
+            this.state = State.InSelfClosingTag;
         } else if (!isWhitespace(c)) {
-            this._state = State.InAttributeName;
-            this.sectionStart = this._index;
+            this.state = State.InAttributeName;
+            this.sectionStart = this.index;
         }
     }
     private stateInSelfClosingTag(c: number): void {
         if (c === CharCodes.Gt) {
-            this.cbs.onselfclosingtag(this._index);
-            this._state = State.Text;
+            this.cbs.onselfclosingtag(this.index);
+            this.state = State.Text;
             this.baseState = State.Text;
-            this.sectionStart = this._index + 1;
+            this.sectionStart = this.index + 1;
             this.isSpecial = false; // Reset special state, in case of self-closing special tags
         } else if (!isWhitespace(c)) {
-            this._state = State.BeforeAttributeName;
+            this.state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         }
     }
     private stateInAttributeName(c: number): void {
         if (c === CharCodes.Eq || isEndOfTagSection(c)) {
-            this.cbs.onattribname(this.sectionStart, this._index);
+            this.cbs.onattribname(this.sectionStart, this.index);
             this.sectionStart = -1;
-            this._state = State.AfterAttributeName;
+            this.state = State.AfterAttributeName;
             this.stateAfterAttributeName(c);
         }
     }
     private stateAfterAttributeName(c: number): void {
         if (c === CharCodes.Eq) {
-            this._state = State.BeforeAttributeValue;
+            this.state = State.BeforeAttributeValue;
         } else if (c === CharCodes.Slash || c === CharCodes.Gt) {
-            this.cbs.onattribend(QuoteType.NoValue, this._index);
-            this._state = State.BeforeAttributeName;
+            this.cbs.onattribend(QuoteType.NoValue, this.index);
+            this.state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (!isWhitespace(c)) {
-            this.cbs.onattribend(QuoteType.NoValue, this._index);
-            this._state = State.InAttributeName;
-            this.sectionStart = this._index;
+            this.cbs.onattribend(QuoteType.NoValue, this.index);
+            this.state = State.InAttributeName;
+            this.sectionStart = this.index;
         }
     }
     private stateBeforeAttributeValue(c: number): void {
         if (c === CharCodes.DoubleQuote) {
-            this._state = State.InAttributeValueDq;
-            this.sectionStart = this._index + 1;
+            this.state = State.InAttributeValueDq;
+            this.sectionStart = this.index + 1;
         } else if (c === CharCodes.SingleQuote) {
-            this._state = State.InAttributeValueSq;
-            this.sectionStart = this._index + 1;
+            this.state = State.InAttributeValueSq;
+            this.sectionStart = this.index + 1;
         } else if (!isWhitespace(c)) {
-            this.sectionStart = this._index;
-            this._state = State.InAttributeValueNq;
+            this.sectionStart = this.index;
+            this.state = State.InAttributeValueNq;
             this.stateInAttributeValueNoQuotes(c); // Reconsume token
         }
     }
@@ -519,18 +519,18 @@ export default class Tokenizer {
             c === quote ||
             (!this.decodeEntities && this.fastForwardTo(quote))
         ) {
-            this.cbs.onattribdata(this.sectionStart, this._index);
+            this.cbs.onattribdata(this.sectionStart, this.index);
             this.sectionStart = -1;
             this.cbs.onattribend(
                 quote === CharCodes.DoubleQuote
                     ? QuoteType.Double
                     : QuoteType.Single,
-                this._index
+                this.index
             );
-            this._state = State.BeforeAttributeName;
+            this.state = State.BeforeAttributeName;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this.baseState = this._state;
-            this._state = State.BeforeEntity;
+            this.baseState = this.state;
+            this.state = State.BeforeEntity;
         }
     }
     private stateInAttributeValueDoubleQuotes(c: number): void {
@@ -541,22 +541,22 @@ export default class Tokenizer {
     }
     private stateInAttributeValueNoQuotes(c: number): void {
         if (isWhitespace(c) || c === CharCodes.Gt) {
-            this.cbs.onattribdata(this.sectionStart, this._index);
+            this.cbs.onattribdata(this.sectionStart, this.index);
             this.sectionStart = -1;
-            this.cbs.onattribend(QuoteType.Unquoted, this._index);
-            this._state = State.BeforeAttributeName;
+            this.cbs.onattribend(QuoteType.Unquoted, this.index);
+            this.state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this.baseState = this._state;
-            this._state = State.BeforeEntity;
+            this.baseState = this.state;
+            this.state = State.BeforeEntity;
         }
     }
     private stateBeforeDeclaration(c: number): void {
         if (c === CharCodes.OpeningSquareBracket) {
-            this._state = State.CDATASequence;
+            this.state = State.CDATASequence;
             this.sequenceIndex = 0;
         } else {
-            this._state =
+            this.state =
                 c === CharCodes.Dash
                     ? State.BeforeComment
                     : State.InDeclaration;
@@ -564,34 +564,34 @@ export default class Tokenizer {
     }
     private stateInDeclaration(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.ondeclaration(this.sectionStart, this._index);
-            this._state = State.Text;
-            this.sectionStart = this._index + 1;
+            this.cbs.ondeclaration(this.sectionStart, this.index);
+            this.state = State.Text;
+            this.sectionStart = this.index + 1;
         }
     }
     private stateInProcessingInstruction(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.onprocessinginstruction(this.sectionStart, this._index);
-            this._state = State.Text;
-            this.sectionStart = this._index + 1;
+            this.cbs.onprocessinginstruction(this.sectionStart, this.index);
+            this.state = State.Text;
+            this.sectionStart = this.index + 1;
         }
     }
     private stateBeforeComment(c: number): void {
         if (c === CharCodes.Dash) {
-            this._state = State.InCommentLike;
+            this.state = State.InCommentLike;
             this.currentSequence = Sequences.CommentEnd;
             // Allow short comments (eg. <!-->)
             this.sequenceIndex = 2;
-            this.sectionStart = this._index + 1;
+            this.sectionStart = this.index + 1;
         } else {
-            this._state = State.InDeclaration;
+            this.state = State.InDeclaration;
         }
     }
     private stateInSpecialComment(c: number): void {
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
-            this.cbs.oncomment(this.sectionStart, this._index, 0);
-            this._state = State.Text;
-            this.sectionStart = this._index + 1;
+            this.cbs.oncomment(this.sectionStart, this.index, 0);
+            this.state = State.Text;
+            this.sectionStart = this.index + 1;
         }
     }
     private stateBeforeSpecialS(c: number): void {
@@ -601,7 +601,7 @@ export default class Tokenizer {
         } else if (lower === Sequences.StyleEnd[3]) {
             this.startSpecial(Sequences.StyleEnd, 4);
         } else {
-            this._state = State.InTagName;
+            this.state = State.InTagName;
             this.stateInTagName(c); // Consume the token again
         }
     }
@@ -618,13 +618,13 @@ export default class Tokenizer {
         this.entityResult = 0;
 
         if (c === CharCodes.Num) {
-            this._state = State.BeforeNumericEntity;
+            this.state = State.BeforeNumericEntity;
         } else if (c === CharCodes.Amp) {
             // We have two `&` characters in a row. Stay in the current state.
         } else {
             this.trieIndex = 0;
             this.trieCurrent = this.entityTrie[0];
-            this._state = State.InNamedEntity;
+            this.state = State.InNamedEntity;
             this.stateInNamedEntity(c);
         }
     }
@@ -641,7 +641,7 @@ export default class Tokenizer {
 
         if (this.trieIndex < 0) {
             this.emitNamedEntity();
-            this._index--;
+            this.index--;
             return;
         }
 
@@ -655,7 +655,7 @@ export default class Tokenizer {
                 this.trieIndex += 1;
             } else {
                 // Add 1 as we have already incremented the excess
-                const entityStart = this._index - this.entityExcess + 1;
+                const entityStart = this.index - this.entityExcess + 1;
 
                 if (entityStart > this.sectionStart) {
                     this.emitPartial(this.sectionStart, entityStart);
@@ -667,7 +667,7 @@ export default class Tokenizer {
                     1 +
                     Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0);
                 this.entityExcess = 0;
-                this.sectionStart = this._index + 1;
+                this.sectionStart = this.index + 1;
             }
         }
     }
@@ -692,34 +692,34 @@ export default class Tokenizer {
             }
         }
 
-        this._state = this.baseState;
+        this.state = this.baseState;
     }
 
     private stateBeforeNumericEntity(c: number): void {
         if ((c | 0x20) === CharCodes.LowerX) {
             this.entityExcess++;
-            this._state = State.InHexEntity;
+            this.state = State.InHexEntity;
         } else {
-            this._state = State.InNumericEntity;
+            this.state = State.InNumericEntity;
             this.stateInNumericEntity(c);
         }
     }
 
     private emitNumericEntity(strict: boolean) {
-        const entityStart = this._index - this.entityExcess - 1;
+        const entityStart = this.index - this.entityExcess - 1;
         const numberStart =
-            entityStart + 2 + Number(this._state === State.InHexEntity);
+            entityStart + 2 + Number(this.state === State.InHexEntity);
 
-        if (numberStart !== this._index) {
+        if (numberStart !== this.index) {
             // Emit leading data if any
             if (entityStart > this.sectionStart) {
                 this.emitPartial(this.sectionStart, entityStart);
             }
 
             this.emitCodePoint(this.entityResult);
-            this.sectionStart = this._index + Number(strict);
+            this.sectionStart = this.index + Number(strict);
         }
-        this._state = this.baseState;
+        this.state = this.baseState;
     }
     private stateInNumericEntity(c: number): void {
         if (c === CharCodes.Semi) {
@@ -731,9 +731,9 @@ export default class Tokenizer {
             if (this.allowLegacyEntity()) {
                 this.emitNumericEntity(false);
             } else {
-                this._state = this.baseState;
+                this.state = this.baseState;
             }
-            this._index--;
+            this.index--;
         }
     }
     private stateInHexEntity(c: number): void {
@@ -750,9 +750,9 @@ export default class Tokenizer {
             if (this.allowLegacyEntity()) {
                 this.emitNumericEntity(false);
             } else {
-                this._state = this.baseState;
+                this.state = this.baseState;
             }
-            this._index--;
+            this.index--;
         }
     }
 
@@ -769,26 +769,26 @@ export default class Tokenizer {
      */
     private cleanup() {
         // If we are inside of text or attributes, emit what we already have.
-        if (this.running && this.sectionStart !== this._index) {
+        if (this.running && this.sectionStart !== this.index) {
             if (
-                this._state === State.Text ||
-                (this._state === State.InSpecialTag && this.sequenceIndex === 0)
+                this.state === State.Text ||
+                (this.state === State.InSpecialTag && this.sequenceIndex === 0)
             ) {
-                this.cbs.ontext(this.sectionStart, this._index);
-                this.sectionStart = this._index;
+                this.cbs.ontext(this.sectionStart, this.index);
+                this.sectionStart = this.index;
             } else if (
-                this._state === State.InAttributeValueDq ||
-                this._state === State.InAttributeValueSq ||
-                this._state === State.InAttributeValueNq
+                this.state === State.InAttributeValueDq ||
+                this.state === State.InAttributeValueSq ||
+                this.state === State.InAttributeValueNq
             ) {
-                this.cbs.onattribdata(this.sectionStart, this._index);
-                this.sectionStart = this._index;
+                this.cbs.onattribdata(this.sectionStart, this.index);
+                this.sectionStart = this.index;
             }
         }
     }
 
     private shouldContinue() {
-        return this._index < this.buffer.length + this.offset && this.running;
+        return this.index < this.buffer.length + this.offset && this.running;
     }
 
     /**
@@ -798,79 +798,79 @@ export default class Tokenizer {
      */
     private parse() {
         while (this.shouldContinue()) {
-            const c = this.buffer.charCodeAt(this._index - this.offset);
-            if (this._state === State.Text) {
+            const c = this.buffer.charCodeAt(this.index - this.offset);
+            if (this.state === State.Text) {
                 this.stateText(c);
-            } else if (this._state === State.SpecialStartSequence) {
+            } else if (this.state === State.SpecialStartSequence) {
                 this.stateSpecialStartSequence(c);
-            } else if (this._state === State.InSpecialTag) {
+            } else if (this.state === State.InSpecialTag) {
                 this.stateInSpecialTag(c);
-            } else if (this._state === State.CDATASequence) {
+            } else if (this.state === State.CDATASequence) {
                 this.stateCDATASequence(c);
-            } else if (this._state === State.InAttributeValueDq) {
+            } else if (this.state === State.InAttributeValueDq) {
                 this.stateInAttributeValueDoubleQuotes(c);
-            } else if (this._state === State.InAttributeName) {
+            } else if (this.state === State.InAttributeName) {
                 this.stateInAttributeName(c);
-            } else if (this._state === State.InCommentLike) {
+            } else if (this.state === State.InCommentLike) {
                 this.stateInCommentLike(c);
-            } else if (this._state === State.InSpecialComment) {
+            } else if (this.state === State.InSpecialComment) {
                 this.stateInSpecialComment(c);
-            } else if (this._state === State.BeforeAttributeName) {
+            } else if (this.state === State.BeforeAttributeName) {
                 this.stateBeforeAttributeName(c);
-            } else if (this._state === State.InTagName) {
+            } else if (this.state === State.InTagName) {
                 this.stateInTagName(c);
-            } else if (this._state === State.InClosingTagName) {
+            } else if (this.state === State.InClosingTagName) {
                 this.stateInClosingTagName(c);
-            } else if (this._state === State.BeforeTagName) {
+            } else if (this.state === State.BeforeTagName) {
                 this.stateBeforeTagName(c);
-            } else if (this._state === State.AfterAttributeName) {
+            } else if (this.state === State.AfterAttributeName) {
                 this.stateAfterAttributeName(c);
-            } else if (this._state === State.InAttributeValueSq) {
+            } else if (this.state === State.InAttributeValueSq) {
                 this.stateInAttributeValueSingleQuotes(c);
-            } else if (this._state === State.BeforeAttributeValue) {
+            } else if (this.state === State.BeforeAttributeValue) {
                 this.stateBeforeAttributeValue(c);
-            } else if (this._state === State.BeforeClosingTagName) {
+            } else if (this.state === State.BeforeClosingTagName) {
                 this.stateBeforeClosingTagName(c);
-            } else if (this._state === State.AfterClosingTagName) {
+            } else if (this.state === State.AfterClosingTagName) {
                 this.stateAfterClosingTagName(c);
-            } else if (this._state === State.BeforeSpecialS) {
+            } else if (this.state === State.BeforeSpecialS) {
                 this.stateBeforeSpecialS(c);
-            } else if (this._state === State.InAttributeValueNq) {
+            } else if (this.state === State.InAttributeValueNq) {
                 this.stateInAttributeValueNoQuotes(c);
-            } else if (this._state === State.InSelfClosingTag) {
+            } else if (this.state === State.InSelfClosingTag) {
                 this.stateInSelfClosingTag(c);
-            } else if (this._state === State.InDeclaration) {
+            } else if (this.state === State.InDeclaration) {
                 this.stateInDeclaration(c);
-            } else if (this._state === State.BeforeDeclaration) {
+            } else if (this.state === State.BeforeDeclaration) {
                 this.stateBeforeDeclaration(c);
-            } else if (this._state === State.BeforeComment) {
+            } else if (this.state === State.BeforeComment) {
                 this.stateBeforeComment(c);
-            } else if (this._state === State.InProcessingInstruction) {
+            } else if (this.state === State.InProcessingInstruction) {
                 this.stateInProcessingInstruction(c);
-            } else if (this._state === State.InNamedEntity) {
+            } else if (this.state === State.InNamedEntity) {
                 this.stateInNamedEntity(c);
-            } else if (this._state === State.BeforeEntity) {
+            } else if (this.state === State.BeforeEntity) {
                 this.stateBeforeEntity(c);
-            } else if (this._state === State.InHexEntity) {
+            } else if (this.state === State.InHexEntity) {
                 this.stateInHexEntity(c);
-            } else if (this._state === State.InNumericEntity) {
+            } else if (this.state === State.InNumericEntity) {
                 this.stateInNumericEntity(c);
             } else {
                 // `this._state === State.BeforeNumericEntity`
                 this.stateBeforeNumericEntity(c);
             }
-            this._index++;
+            this.index++;
         }
         this.cleanup();
     }
 
     private finish() {
-        if (this._state === State.InNamedEntity) {
+        if (this.state === State.InNamedEntity) {
             this.emitNamedEntity();
         }
 
         // If there is remaining data, emit it in a reasonable way
-        if (this.sectionStart < this._index) {
+        if (this.sectionStart < this.index) {
             this.handleTrailingData();
         }
         this.cbs.onend();
@@ -879,34 +879,34 @@ export default class Tokenizer {
     /** Handle any trailing data. */
     private handleTrailingData() {
         const endIndex = this.buffer.length + this.offset;
-        if (this._state === State.InCommentLike) {
+        if (this.state === State.InCommentLike) {
             if (this.currentSequence === Sequences.CdataEnd) {
                 this.cbs.oncdata(this.sectionStart, endIndex, 0);
             } else {
                 this.cbs.oncomment(this.sectionStart, endIndex, 0);
             }
         } else if (
-            this._state === State.InNumericEntity &&
+            this.state === State.InNumericEntity &&
             this.allowLegacyEntity()
         ) {
             this.emitNumericEntity(false);
             // All trailing data will have been consumed
         } else if (
-            this._state === State.InHexEntity &&
+            this.state === State.InHexEntity &&
             this.allowLegacyEntity()
         ) {
             this.emitNumericEntity(false);
             // All trailing data will have been consumed
         } else if (
-            this._state === State.InTagName ||
-            this._state === State.BeforeAttributeName ||
-            this._state === State.BeforeAttributeValue ||
-            this._state === State.AfterAttributeName ||
-            this._state === State.InAttributeName ||
-            this._state === State.InAttributeValueSq ||
-            this._state === State.InAttributeValueDq ||
-            this._state === State.InAttributeValueNq ||
-            this._state === State.InClosingTagName
+            this.state === State.InTagName ||
+            this.state === State.BeforeAttributeName ||
+            this.state === State.BeforeAttributeValue ||
+            this.state === State.AfterAttributeName ||
+            this.state === State.InAttributeName ||
+            this.state === State.InAttributeValueSq ||
+            this.state === State.InAttributeValueDq ||
+            this.state === State.InAttributeValueNq ||
+            this.state === State.InClosingTagName
         ) {
             /*
              * If we are currently in an opening or closing tag, us not calling the