From a4428c7a345330fe19c778384ba0f3761749dbd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Tue, 14 Dec 2021 19:24:52 +0000 Subject: [PATCH 1/7] refactor: Call tokenizer callbacks with indices --- src/Parser.ts | 97 +++++++--- src/Tokenizer.ts | 230 ++++++++++++++--------- src/__snapshots__/Tokenizer.spec.ts.snap | 70 ++++--- 3 files changed, 253 insertions(+), 144 deletions(-) diff --git a/src/Parser.ts b/src/Parser.ts index cccd33f9e..429fd517a 100644 --- a/src/Parser.ts +++ b/src/Parser.ts @@ -1,4 +1,5 @@ -import Tokenizer from "./Tokenizer"; +import Tokenizer, { Callbacks, QuoteType } from "./Tokenizer"; +import decodeCodePoint from "entities/lib/decode_codepoint"; const formTags = new Set([ "input", @@ -195,7 +196,7 @@ export interface Handler { const reNameEnd = /\s|\//; -export class Parser { +export class Parser implements Callbacks { /** The start index of the last event. */ public startIndex = 0; /** The end index of the last event. */ @@ -235,20 +236,31 @@ export class Parser { // Tokenizer event handlers /** @internal */ - ontext(data: string): void { - const idx = this.tokenizer.getAbsoluteIndex(); + ontext(start: number, length: number): void { + const data = this.getSubstr(start, length); + const idx = start + length; this.endIndex = idx - 1; this.cbs.ontext?.(data); this.startIndex = idx; } + /** @internal */ + ontextentity(cp: number): void { + const idx = this.tokenizer.getIndex(); + this.endIndex = idx - 1; + this.cbs.ontext?.(decodeCodePoint(cp)); + this.startIndex = idx; + } + protected isVoidElement(name: string): boolean { return !this.options.xmlMode && voidElements.has(name); } /** @internal */ - onopentagname(name: string): void { - this.endIndex = this.tokenizer.getAbsoluteIndex(); + onopentagname(start: number, length: number): void { + this.endIndex = this.tokenizer.getIndex(); + + let name = this.getSubstr(start, length); if (this.lowerCaseTagNames) { name = name.toLowerCase(); @@ -287,7 +299,7 @@ export class Parser { private endOpenTag(isImplied: boolean) { this.startIndex = this.openTagStart; - this.endIndex = this.tokenizer.getAbsoluteIndex(); + this.endIndex = this.tokenizer.getIndex(); if (this.attribs) { this.cbs.onopentag?.(this.tagname, this.attribs, isImplied); @@ -309,8 +321,10 @@ export class Parser { } /** @internal */ - onclosetag(name: string): void { - this.endIndex = this.tokenizer.getAbsoluteIndex(); + onclosetag(start: number, length: number): void { + this.endIndex = this.tokenizer.getIndex(); + + let name = this.getSubstr(start, length); if (this.lowerCaseTagNames) { name = name.toLowerCase(); @@ -378,25 +392,39 @@ export class Parser { } /** @internal */ - onattribname(name: string): void { - this.startIndex = this.tokenizer.getAbsoluteSectionStart(); + onattribname(start: number, length: number): void { + this.startIndex = start; + const name = this.getSubstr(start, length); - if (this.lowerCaseAttributeNames) { - name = name.toLowerCase(); - } - this.attribname = name; + this.attribname = this.lowerCaseAttributeNames + ? name.toLowerCase() + : name; } /** @internal */ - onattribdata(value: string): void { - this.attribvalue += value; + onattribdata(start: number, length: number): void { + this.attribvalue += this.getSubstr(start, length); } /** @internal */ - onattribend(quote: string | undefined | null): void { - this.endIndex = this.tokenizer.getAbsoluteIndex(); + onattribentity(cp: number): void { + this.attribvalue += decodeCodePoint(cp); + } - this.cbs.onattribute?.(this.attribname, this.attribvalue, quote); + /** @internal */ + onattribend(quote: QuoteType): void { + this.endIndex = this.tokenizer.getIndex(); + + const quoteVal = + quote === QuoteType.Double + ? '"' + : quote === QuoteType.Single + ? "'" + : quote === QuoteType.NoValue + ? undefined + : null; + + this.cbs.onattribute?.(this.attribname, this.attribvalue, quoteVal); if ( this.attribs && !Object.prototype.hasOwnProperty.call(this.attribs, this.attribname) @@ -419,8 +447,9 @@ export class Parser { } /** @internal */ - ondeclaration(value: string): void { - this.endIndex = this.tokenizer.getAbsoluteIndex(); + ondeclaration(start: number, length: number): void { + this.endIndex = this.tokenizer.getIndex(); + const value = this.getSubstr(start, length); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); @@ -432,8 +461,9 @@ export class Parser { } /** @internal */ - onprocessinginstruction(value: string): void { - this.endIndex = this.tokenizer.getAbsoluteIndex(); + onprocessinginstruction(start: number, length: number): void { + this.endIndex = this.tokenizer.getIndex(); + const value = this.getSubstr(start, length); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); @@ -445,8 +475,9 @@ export class Parser { } /** @internal */ - oncomment(value: string): void { - this.endIndex = this.tokenizer.getAbsoluteIndex(); + oncomment(start: number, length: number): void { + this.endIndex = this.tokenizer.getIndex(); + const value = this.getSubstr(start, length); this.cbs.oncomment?.(value); this.cbs.oncommentend?.(); @@ -456,8 +487,9 @@ export class Parser { } /** @internal */ - oncdata(value: string): void { - this.endIndex = this.tokenizer.getAbsoluteIndex(); + oncdata(start: number, length: number): void { + this.endIndex = this.tokenizer.getIndex(); + const value = this.getSubstr(start, length); if (this.options.xmlMode || this.options.recognizeCDATA) { this.cbs.oncdatastart?.(); @@ -504,6 +536,7 @@ export class Parser { this.startIndex = 0; this.endIndex = 0; this.cbs.onparserinit?.(this); + this.buffer = ""; } /** @@ -517,12 +550,19 @@ export class Parser { this.end(data); } + private buffer = ""; + + private getSubstr(start: number, length: number) { + return this.buffer.substr(start, length); + } + /** * Parses a chunk of data and calls the corresponding callbacks. * * @param chunk Chunk to parse. */ public write(chunk: string): void { + this.buffer += chunk; this.tokenizer.write(chunk); } @@ -532,6 +572,7 @@ export class Parser { * @param chunk Optional final chunk to parse. */ public end(chunk?: string): void { + if (chunk) this.buffer += chunk; this.tokenizer.end(chunk); } diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 6b5182f9d..a597439df 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -1,4 +1,3 @@ -import decodeCodePoint from "entities/lib/decode_codepoint"; import { htmlDecodeTree, xmlDecodeTree, @@ -105,21 +104,30 @@ function isASCIIAlpha(c: number): boolean { ); } +export enum QuoteType { + NoValue = 0, + Unquoted = 1, + Single = 2, + Double = 3, +} + export interface Callbacks { - onattribdata(value: string): void; - onattribend(quote: string | undefined | null): void; - onattribname(name: string): void; - oncdata(data: string): void; - onclosetag(name: string): void; - oncomment(data: string): void; - ondeclaration(content: string): void; + onattribdata(start: number, length: number): void; + onattribentity(codepoint: number): void; + onattribend(quote: QuoteType): void; + onattribname(start: number, length: number): void; + oncdata(start: number, length: number): void; + onclosetag(start: number, length: number): void; + oncomment(start: number, length: number): void; + ondeclaration(start: number, length: number): void; onend(): void; onerror(error: Error, state?: State): void; onopentagend(): void; - onopentagname(name: string): void; - onprocessinginstruction(instruction: string): void; + onopentagname(start: number, length: number): void; + onprocessinginstruction(start: number, length: number): void; onselfclosingtag(): void; - ontext(value: string): void; + ontext(start: number, length: number): void; + ontextentity(codepoint: number): void; } /** @@ -146,11 +154,6 @@ export default class Tokenizer { public sectionStart = 0; /** The index within the buffer that we are currently looking at. */ private _index = 0; - /** - * Data that has already been processed will be removed from the buffer occasionally. - * `_bufferOffset` keeps track of how many characters have been removed, to make sure position information is accurate. - */ - private bufferOffset = 0; /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */ private baseState = State.Text; /** For special parsing behavior inside of script and style tags. */ @@ -181,7 +184,6 @@ export default class Tokenizer { this.buffer = ""; this.sectionStart = 0; this._index = 0; - this.bufferOffset = 0; this.baseState = State.Text; this.currentSequence = undefined!; this.running = true; @@ -215,18 +217,11 @@ export default class Tokenizer { } } - /** - * The start of the current section. - */ - public getAbsoluteSectionStart(): number { - return this.sectionStart + this.bufferOffset; - } - /** * The current index within all of the written data. */ - public getAbsoluteIndex(): number { - return this.bufferOffset + this._index; + public getIndex(): number { + return this._index; } private stateText(c: number) { @@ -235,7 +230,10 @@ export default class Tokenizer { (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt)) ) { if (this._index > this.sectionStart) { - this.cbs.ontext(this.getSection()); + this.cbs.ontext( + this.sectionStart, + this._index - this.sectionStart + ); } this._state = State.BeforeTagName; this.sectionStart = this._index; @@ -276,7 +274,10 @@ export default class Tokenizer { // Spoof the index so that reported locations match up. const actualIndex = this._index; this._index = endOfText; - this.cbs.ontext(this.getSection()); + this.cbs.ontext( + this.sectionStart, + endOfText - this.sectionStart + ); this._index = actualIndex; } @@ -358,15 +359,12 @@ export default class Tokenizer { if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { // Remove 2 trailing chars - const section = this.buffer.slice( - this.sectionStart, - this._index - 2 - ); + const length = this._index - 2 - this.sectionStart; if (this.currentSequence === Sequences.CdataEnd) { - this.cbs.oncdata(section); + this.cbs.oncdata(this.sectionStart, length); } else { - this.cbs.oncomment(section); + this.cbs.oncomment(this.sectionStart, length); } this.sequenceIndex = 0; @@ -428,7 +426,10 @@ export default class Tokenizer { } private stateInTagName(c: number) { if (isEndOfTagSection(c)) { - this.cbs.onopentagname(this.getSection()); + this.cbs.onopentagname( + this.sectionStart, + this._index - this.sectionStart + ); this.sectionStart = -1; this._state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); @@ -448,7 +449,10 @@ export default class Tokenizer { } private stateInClosingTagName(c: number) { if (c === CharCodes.Gt || isWhitespace(c)) { - this.cbs.onclosetag(this.getSection()); + this.cbs.onclosetag( + this.sectionStart, + this._index - this.sectionStart + ); this.sectionStart = -1; this._state = State.AfterClosingTagName; this.stateAfterClosingTagName(c); @@ -493,7 +497,10 @@ export default class Tokenizer { } private stateInAttributeName(c: number) { if (c === CharCodes.Eq || isEndOfTagSection(c)) { - this.cbs.onattribname(this.getSection()); + this.cbs.onattribname( + this.sectionStart, + this._index - this.sectionStart + ); this.sectionStart = -1; this._state = State.AfterAttributeName; this.stateAfterAttributeName(c); @@ -503,11 +510,11 @@ export default class Tokenizer { if (c === CharCodes.Eq) { this._state = State.BeforeAttributeValue; } else if (c === CharCodes.Slash || c === CharCodes.Gt) { - this.cbs.onattribend(undefined); + this.cbs.onattribend(QuoteType.NoValue); this._state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (!isWhitespace(c)) { - this.cbs.onattribend(undefined); + this.cbs.onattribend(QuoteType.NoValue); this._state = State.InAttributeName; this.sectionStart = this._index; } @@ -530,9 +537,16 @@ export default class Tokenizer { c === quote || (!this.decodeEntities && this.fastForwardTo(quote)) ) { - this.cbs.onattribdata(this.getSection()); + this.cbs.onattribdata( + this.sectionStart, + this._index - this.sectionStart + ); this.sectionStart = -1; - this.cbs.onattribend(String.fromCharCode(quote)); + this.cbs.onattribend( + quote === CharCodes.DoubleQuote + ? QuoteType.Double + : QuoteType.Single + ); this._state = State.BeforeAttributeName; } else if (this.decodeEntities && c === CharCodes.Amp) { this.baseState = this._state; @@ -547,9 +561,12 @@ export default class Tokenizer { } private stateInAttributeValueNoQuotes(c: number) { if (isWhitespace(c) || c === CharCodes.Gt) { - this.cbs.onattribdata(this.getSection()); + this.cbs.onattribdata( + this.sectionStart, + this._index - this.sectionStart + ); this.sectionStart = -1; - this.cbs.onattribend(null); + this.cbs.onattribend(QuoteType.Unquoted); this._state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (this.decodeEntities && c === CharCodes.Amp) { @@ -570,14 +587,20 @@ export default class Tokenizer { } private stateInDeclaration(c: number) { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.ondeclaration(this.getSection()); + this.cbs.ondeclaration( + this.sectionStart, + this._index - this.sectionStart + ); this._state = State.Text; this.sectionStart = this._index + 1; } } private stateInProcessingInstruction(c: number) { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.onprocessinginstruction(this.getSection()); + this.cbs.onprocessinginstruction( + this.sectionStart, + this._index - this.sectionStart + ); this._state = State.Text; this.sectionStart = this._index + 1; } @@ -595,7 +618,10 @@ export default class Tokenizer { } private stateInSpecialComment(c: number) { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.oncomment(this.getSection()); + this.cbs.oncomment( + this.sectionStart, + this._index - this.sectionStart + ); this._state = State.Text; this.sectionStart = this._index + 1; } @@ -614,7 +640,7 @@ export default class Tokenizer { private trieIndex = 0; private trieCurrent = 0; - private trieResult: string | null = null; + private trieResult = 0; private entityExcess = 0; private stateBeforeEntity(c: number) { @@ -628,7 +654,7 @@ export default class Tokenizer { } else { this.trieIndex = 0; this.trieCurrent = this.entityTrie[0]; - this.trieResult = null; + this.trieResult = 0; this._state = State.InNamedEntity; this.stateInNamedEntity(c); } @@ -664,20 +690,16 @@ export default class Tokenizer { if (entityStart > this.sectionStart) { this.emitPartial( - this.buffer.substring(this.sectionStart, entityStart) + this.sectionStart, + entityStart - this.sectionStart ); } // If this is a surrogate pair, consume the next two bytes - this.trieResult = - this.trieCurrent & BinTrieFlags.MULTI_BYTE - ? String.fromCharCode( - this.entityTrie[++this.trieIndex], - this.entityTrie[++this.trieIndex] - ) - : String.fromCharCode( - this.entityTrie[++this.trieIndex] - ); + this.trieResult = this.trieIndex; + this.trieIndex += + 1 + + Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0); this.entityExcess = 0; this.sectionStart = this._index + 1; } @@ -685,8 +707,23 @@ export default class Tokenizer { } private emitNamedEntity() { - if (this.trieResult) { - this.emitPartial(this.trieResult); + if (this.trieResult !== 0) { + if (this.entityTrie[this.trieResult] & BinTrieFlags.MULTI_BYTE) { + const first = this.entityTrie[this.trieResult + 1]; + const second = this.entityTrie[this.trieResult + 2]; + // If this is a surrogate pair, combine the code points. + if (first >= 0xd8_00 && first <= 0xdf_ff) { + this.emitCodePoint( + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + (first - 0xd8_00) * 0x4_00 + second + 0x24_00 + ); + } else { + this.emitCodePoint(first); + this.emitCodePoint(second); + } + } else { + this.emitCodePoint(this.entityTrie[this.trieResult + 1]); + } } this._state = this.baseState; @@ -710,14 +747,15 @@ export default class Tokenizer { // Emit leading data if any if (entityStart > this.sectionStart) { this.emitPartial( - this.buffer.substring(this.sectionStart, entityStart) + this.sectionStart, + entityStart - this.sectionStart ); } // Parse entity const entity = this.buffer.substring(numberStart, this._index); const parsed = parseInt(entity, base); - this.emitPartial(decodeCodePoint(parsed)); + this.emitCodePoint(parsed); this.sectionStart = this._index + Number(strict); } this._state = this.baseState; @@ -767,27 +805,28 @@ export default class Tokenizer { * Remove data that has already been consumed from the buffer. */ private cleanup() { - // If we are inside of text, emit what we already have. - if ( - this.running && - this.sectionStart !== this._index && - (this._state === State.Text || - (this._state === State.InSpecialTag && - this.sequenceIndex === 0)) - ) { - // TODO: We could emit attribute data here as well. - this.cbs.ontext(this.buffer.substr(this.sectionStart)); - this.sectionStart = this._index; - } - - const start = this.sectionStart < 0 ? this._index : this.sectionStart; - this.buffer = - start === this.buffer.length ? "" : this.buffer.substr(start); - this._index -= start; - this.bufferOffset += start; - - if (this.sectionStart > 0) { - this.sectionStart = 0; + // If we are inside of text or attributes, emit what we already have. + if (this.running && this.sectionStart !== this._index) { + if ( + this._state === State.Text || + (this._state === State.InSpecialTag && this.sequenceIndex === 0) + ) { + this.cbs.ontext( + this.sectionStart, + this._index - this.sectionStart + ); + this.sectionStart = this._index; + } else if ( + this._state === State.InAttributeValueDq || + this._state === State.InAttributeValueSq || + this._state === State.InAttributeValueNq + ) { + this.cbs.onattribdata( + this.sectionStart, + this._index - this.sectionStart + ); + this.sectionStart = this._index; + } } } @@ -882,12 +921,12 @@ export default class Tokenizer { /** Handle any trailing data. */ private handleTrailingData() { - const data = this.buffer.substr(this.sectionStart); + const remaining = this.buffer.length - this.sectionStart; if (this._state === State.InCommentLike) { if (this.currentSequence === Sequences.CdataEnd) { - this.cbs.oncdata(data); + this.cbs.oncdata(this.sectionStart, remaining); } else { - this.cbs.oncomment(data); + this.cbs.oncomment(this.sectionStart, remaining); } } else if ( this._state === State.InNumericEntity && @@ -917,21 +956,28 @@ export default class Tokenizer { * respective callback signals that the tag should be ignored. */ } else { - this.cbs.ontext(data); + this.cbs.ontext(this.sectionStart, remaining); } } - private getSection(): string { - return this.buffer.substring(this.sectionStart, this._index); + private emitPartial(start: number, length: number) { + if ( + this.baseState !== State.Text && + this.baseState !== State.InSpecialTag + ) { + this.cbs.onattribdata(start, length); + } else { + this.cbs.ontext(start, length); + } } - private emitPartial(value: string) { + private emitCodePoint(cp: number) { if ( this.baseState !== State.Text && this.baseState !== State.InSpecialTag ) { - this.cbs.onattribdata(value); + this.cbs.onattribentity(cp); } else { - this.cbs.ontext(value); + this.cbs.ontextentity(cp); } } } diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index 38a60dfff..4951ecd2a 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -3,12 +3,13 @@ exports[`Tokenizer should not lose data when pausing 1`] = ` Array [ Array [ - "ontext", - "&", + "ontextentity", + 38, ], Array [ "ontext", - " it up!", + 5, + 7, ], Array [ "onend", @@ -20,21 +21,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing scr Array [ Array [ "onopentagname", - "script", + 1, + 6, ], Array [ "onselfclosingtag", ], Array [ "onopentagname", - "div", + 11, + 3, ], Array [ "onopentagend", ], Array [ "onclosetag", - "div", + 17, + 3, ], Array [ "onend", @@ -46,21 +50,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing sty Array [ Array [ "onopentagname", - "style", + 1, + 5, ], Array [ "onselfclosingtag", ], Array [ "onopentagname", - "div", + 10, + 3, ], Array [ "onopentagend", ], Array [ "onclosetag", - "div", + 16, + 3, ], Array [ "onend", @@ -72,21 +79,24 @@ exports[`Tokenizer should support self-closing special tags for self-closing tit Array [ Array [ "onopentagname", - "title", + 1, + 5, ], Array [ "onselfclosingtag", ], Array [ "onopentagname", - "div", + 10, + 3, ], Array [ "onopentagend", ], Array [ "onclosetag", - "div", + 16, + 3, ], Array [ "onend", @@ -98,25 +108,29 @@ exports[`Tokenizer should support standard special tags for normal script tag 1` Array [ Array [ "onopentagname", - "script", + 1, + 6, ], Array [ "onopentagend", ], Array [ "onclosetag", - "script", + 10, + 6, ], Array [ "onopentagname", - "div", + 18, + 3, ], Array [ "onopentagend", ], Array [ "onclosetag", - "div", + 24, + 3, ], Array [ "onend", @@ -128,25 +142,29 @@ exports[`Tokenizer should support standard special tags for normal sitle tag 1`] Array [ Array [ "onopentagname", - "title", + 1, + 5, ], Array [ "onopentagend", ], Array [ "onclosetag", - "title", + 9, + 5, ], Array [ "onopentagname", - "div", + 16, + 3, ], Array [ "onopentagend", ], Array [ "onclosetag", - "div", + 22, + 3, ], Array [ "onend", @@ -158,25 +176,29 @@ exports[`Tokenizer should support standard special tags for normal style tag 1`] Array [ Array [ "onopentagname", - "style", + 1, + 5, ], Array [ "onopentagend", ], Array [ "onclosetag", - "style", + 9, + 5, ], Array [ "onopentagname", - "div", + 16, + 3, ], Array [ "onopentagend", ], Array [ "onclosetag", - "div", + 22, + 3, ], Array [ "onend", From e2b23ea2d66daa0e7eb8425ad018e3a51e7217ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Tue, 14 Dec 2021 19:28:36 +0000 Subject: [PATCH 2/7] Add return types --- src/Tokenizer.ts | 74 ++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index a597439df..61aa6c979 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -224,7 +224,7 @@ export default class Tokenizer { return this._index; } - private stateText(c: number) { + private stateText(c: number): void { if ( c === CharCodes.Lt || (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt)) @@ -244,7 +244,7 @@ export default class Tokenizer { private currentSequence!: Uint8Array; private sequenceIndex = 0; - private stateSpecialStartSequence(c: number) { + private stateSpecialStartSequence(c: number): void { const isEnd = this.sequenceIndex === this.currentSequence.length; const isMatch = isEnd ? // If we are at the end of the sequence, make sure the tag name has ended @@ -265,7 +265,7 @@ export default class Tokenizer { } /** Look for an end tag. For tags, also decode entities. */ - private stateInSpecialTag(c: number) { + private stateInSpecialTag(c: number): void { if (this.sequenceIndex === this.currentSequence.length) { if (c === CharCodes.Gt || isWhitespace(c)) { const endOfText = this._index - this.currentSequence.length; @@ -308,7 +308,7 @@ export default class Tokenizer { } } - private stateCDATASequence(c: number) { + private stateCDATASequence(c: number): void { if (c === Sequences.Cdata[this.sequenceIndex]) { if (++this.sequenceIndex === Sequences.Cdata.length) { this._state = State.InCommentLike; @@ -355,7 +355,7 @@ export default class Tokenizer { * - That character is then repeated, so we have to check multiple repeats. * - All characters but the start character of the sequence can be skipped. */ - private stateInCommentLike(c: number) { + private stateInCommentLike(c: number): void { if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { // Remove 2 trailing chars @@ -399,7 +399,7 @@ export default class Tokenizer { this._state = State.SpecialStartSequence; } - private stateBeforeTagName(c: number) { + private stateBeforeTagName(c: number): void { if (c === CharCodes.ExclamationMark) { this._state = State.BeforeDeclaration; this.sectionStart = this._index + 1; @@ -424,7 +424,7 @@ export default class Tokenizer { this.stateText(c); } } - private stateInTagName(c: number) { + private stateInTagName(c: number): void { if (isEndOfTagSection(c)) { this.cbs.onopentagname( this.sectionStart, @@ -435,7 +435,7 @@ export default class Tokenizer { this.stateBeforeAttributeName(c); } } - private stateBeforeClosingTagName(c: number) { + private stateBeforeClosingTagName(c: number): void { if (isWhitespace(c)) { // Ignore } else if (c === CharCodes.Gt) { @@ -447,7 +447,7 @@ export default class Tokenizer { this.sectionStart = this._index; } } - private stateInClosingTagName(c: number) { + private stateInClosingTagName(c: number): void { if (c === CharCodes.Gt || isWhitespace(c)) { this.cbs.onclosetag( this.sectionStart, @@ -458,14 +458,14 @@ export default class Tokenizer { this.stateAfterClosingTagName(c); } } - private stateAfterClosingTagName(c: number) { + private stateAfterClosingTagName(c: number): void { // Skip everything until ">" if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this._state = State.Text; this.sectionStart = this._index + 1; } } - private stateBeforeAttributeName(c: number) { + private stateBeforeAttributeName(c: number): void { if (c === CharCodes.Gt) { this.cbs.onopentagend(); if (this.isSpecial) { @@ -483,7 +483,7 @@ export default class Tokenizer { this.sectionStart = this._index; } } - private stateInSelfClosingTag(c: number) { + private stateInSelfClosingTag(c: number): void { if (c === CharCodes.Gt) { this.cbs.onselfclosingtag(); this._state = State.Text; @@ -495,7 +495,7 @@ export default class Tokenizer { this.stateBeforeAttributeName(c); } } - private stateInAttributeName(c: number) { + private stateInAttributeName(c: number): void { if (c === CharCodes.Eq || isEndOfTagSection(c)) { this.cbs.onattribname( this.sectionStart, @@ -506,7 +506,7 @@ export default class Tokenizer { this.stateAfterAttributeName(c); } } - private stateAfterAttributeName(c: number) { + private stateAfterAttributeName(c: number): void { if (c === CharCodes.Eq) { this._state = State.BeforeAttributeValue; } else if (c === CharCodes.Slash || c === CharCodes.Gt) { @@ -519,7 +519,7 @@ export default class Tokenizer { this.sectionStart = this._index; } } - private stateBeforeAttributeValue(c: number) { + private stateBeforeAttributeValue(c: number): void { if (c === CharCodes.DoubleQuote) { this._state = State.InAttributeValueDq; this.sectionStart = this._index + 1; @@ -553,13 +553,13 @@ export default class Tokenizer { this._state = State.BeforeEntity; } } - private stateInAttributeValueDoubleQuotes(c: number) { + private stateInAttributeValueDoubleQuotes(c: number): void { this.handleInAttributeValue(c, CharCodes.DoubleQuote); } - private stateInAttributeValueSingleQuotes(c: number) { + private stateInAttributeValueSingleQuotes(c: number): void { this.handleInAttributeValue(c, CharCodes.SingleQuote); } - private stateInAttributeValueNoQuotes(c: number) { + private stateInAttributeValueNoQuotes(c: number): void { if (isWhitespace(c) || c === CharCodes.Gt) { this.cbs.onattribdata( this.sectionStart, @@ -574,7 +574,7 @@ export default class Tokenizer { this._state = State.BeforeEntity; } } - private stateBeforeDeclaration(c: number) { + private stateBeforeDeclaration(c: number): void { if (c === CharCodes.OpeningSquareBracket) { this._state = State.CDATASequence; this.sequenceIndex = 0; @@ -585,7 +585,7 @@ export default class Tokenizer { : State.InDeclaration; } } - private stateInDeclaration(c: number) { + private stateInDeclaration(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.ondeclaration( this.sectionStart, @@ -595,7 +595,7 @@ export default class Tokenizer { this.sectionStart = this._index + 1; } } - private stateInProcessingInstruction(c: number) { + private stateInProcessingInstruction(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.onprocessinginstruction( this.sectionStart, @@ -605,7 +605,7 @@ export default class Tokenizer { this.sectionStart = this._index + 1; } } - private stateBeforeComment(c: number) { + private stateBeforeComment(c: number): void { if (c === CharCodes.Dash) { this._state = State.InCommentLike; this.currentSequence = Sequences.CommentEnd; @@ -616,7 +616,7 @@ export default class Tokenizer { this._state = State.InDeclaration; } } - private stateInSpecialComment(c: number) { + private stateInSpecialComment(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.oncomment( this.sectionStart, @@ -626,7 +626,7 @@ export default class Tokenizer { this.sectionStart = this._index + 1; } } - private stateBeforeSpecialS(c: number) { + private stateBeforeSpecialS(c: number): void { const lower = c | 0x20; if (lower === Sequences.ScriptEnd[3]) { this.startSpecial(Sequences.ScriptEnd, 4); @@ -640,12 +640,13 @@ export default class Tokenizer { private trieIndex = 0; private trieCurrent = 0; - private trieResult = 0; + private entityResult = 0; private entityExcess = 0; - private stateBeforeEntity(c: number) { + private stateBeforeEntity(c: number): void { // Start excess with 1 to include the '&' this.entityExcess = 1; + this.entityResult = 0; if (c === CharCodes.Num) { this._state = State.BeforeNumericEntity; @@ -654,13 +655,12 @@ export default class Tokenizer { } else { this.trieIndex = 0; this.trieCurrent = this.entityTrie[0]; - this.trieResult = 0; this._state = State.InNamedEntity; this.stateInNamedEntity(c); } } - private stateInNamedEntity(c: number) { + private stateInNamedEntity(c: number): void { this.entityExcess += 1; this.trieIndex = determineBranch( @@ -696,7 +696,7 @@ export default class Tokenizer { } // If this is a surrogate pair, consume the next two bytes - this.trieResult = this.trieIndex; + this.entityResult = this.trieIndex; this.trieIndex += 1 + Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0); @@ -707,10 +707,10 @@ export default class Tokenizer { } private emitNamedEntity() { - if (this.trieResult !== 0) { - if (this.entityTrie[this.trieResult] & BinTrieFlags.MULTI_BYTE) { - const first = this.entityTrie[this.trieResult + 1]; - const second = this.entityTrie[this.trieResult + 2]; + if (this.entityResult !== 0) { + if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) { + const first = this.entityTrie[this.entityResult + 1]; + const second = this.entityTrie[this.entityResult + 2]; // If this is a surrogate pair, combine the code points. if (first >= 0xd8_00 && first <= 0xdf_ff) { this.emitCodePoint( @@ -722,14 +722,14 @@ export default class Tokenizer { this.emitCodePoint(second); } } else { - this.emitCodePoint(this.entityTrie[this.trieResult + 1]); + this.emitCodePoint(this.entityTrie[this.entityResult + 1]); } } this._state = this.baseState; } - private stateBeforeNumericEntity(c: number) { + private stateBeforeNumericEntity(c: number): void { if ((c | 0x20) === CharCodes.LowerX) { this.entityExcess++; this._state = State.InHexEntity; @@ -760,7 +760,7 @@ export default class Tokenizer { } this._state = this.baseState; } - private stateInNumericEntity(c: number) { + private stateInNumericEntity(c: number): void { if (c === CharCodes.Semi) { this.decodeNumericEntity(10, true); } else if (!isNumber(c)) { @@ -774,7 +774,7 @@ export default class Tokenizer { this.entityExcess++; } } - private stateInHexEntity(c: number) { + private stateInHexEntity(c: number): void { if (c === CharCodes.Semi) { this.decodeNumericEntity(16, true); } else if ( From a5175ae4763ff0e8c28d783d1721356df776fae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Tue, 14 Dec 2021 20:53:11 +0000 Subject: [PATCH 3/7] Pass end indices in several callbacks Avoids `getIndex` calls --- src/Parser.ts | 28 +++++++-------- src/Tokenizer.ts | 45 +++++++++--------------- src/__snapshots__/Tokenizer.spec.ts.snap | 42 +++++++++++----------- 3 files changed, 52 insertions(+), 63 deletions(-) diff --git a/src/Parser.ts b/src/Parser.ts index 429fd517a..2dcffd0a1 100644 --- a/src/Parser.ts +++ b/src/Parser.ts @@ -257,10 +257,10 @@ export class Parser implements Callbacks { } /** @internal */ - onopentagname(start: number, length: number): void { - this.endIndex = this.tokenizer.getIndex(); + onopentagname(start: number, endIndex: number): void { + this.endIndex = endIndex; - let name = this.getSubstr(start, length); + let name = this.getSubstr(start, endIndex - start); if (this.lowerCaseTagNames) { name = name.toLowerCase(); @@ -321,10 +321,10 @@ export class Parser implements Callbacks { } /** @internal */ - onclosetag(start: number, length: number): void { - this.endIndex = this.tokenizer.getIndex(); + onclosetag(start: number, endIndex: number): void { + this.endIndex = endIndex; - let name = this.getSubstr(start, length); + let name = this.getSubstr(start, endIndex - start); if (this.lowerCaseTagNames) { name = name.toLowerCase(); @@ -412,8 +412,8 @@ export class Parser implements Callbacks { } /** @internal */ - onattribend(quote: QuoteType): void { - this.endIndex = this.tokenizer.getIndex(); + onattribend(quote: QuoteType, endIndex: number): void { + this.endIndex = endIndex; const quoteVal = quote === QuoteType.Double @@ -447,9 +447,9 @@ export class Parser implements Callbacks { } /** @internal */ - ondeclaration(start: number, length: number): void { - this.endIndex = this.tokenizer.getIndex(); - const value = this.getSubstr(start, length); + ondeclaration(start: number, endIndex: number): void { + this.endIndex = endIndex; + const value = this.getSubstr(start, endIndex - start); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); @@ -461,9 +461,9 @@ export class Parser implements Callbacks { } /** @internal */ - onprocessinginstruction(start: number, length: number): void { - this.endIndex = this.tokenizer.getIndex(); - const value = this.getSubstr(start, length); + onprocessinginstruction(start: number, endIndex: number): void { + this.endIndex = endIndex; + const value = this.getSubstr(start, endIndex - start); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 61aa6c979..b0b65b914 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -114,17 +114,17 @@ export enum QuoteType { export interface Callbacks { onattribdata(start: number, length: number): void; onattribentity(codepoint: number): void; - onattribend(quote: QuoteType): void; + onattribend(quote: QuoteType, endIndex: number): void; onattribname(start: number, length: number): void; oncdata(start: number, length: number): void; - onclosetag(start: number, length: number): void; + onclosetag(start: number, endIndex: number): void; oncomment(start: number, length: number): void; - ondeclaration(start: number, length: number): void; + ondeclaration(start: number, endIndex: number): void; onend(): void; onerror(error: Error, state?: State): void; onopentagend(): void; - onopentagname(start: number, length: number): void; - onprocessinginstruction(start: number, length: number): void; + onopentagname(start: number, endIndex: number): void; + onprocessinginstruction(start: number, endIndex: number): void; onselfclosingtag(): void; ontext(start: number, length: number): void; ontextentity(codepoint: number): void; @@ -426,10 +426,7 @@ export default class Tokenizer { } private stateInTagName(c: number): void { if (isEndOfTagSection(c)) { - this.cbs.onopentagname( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.onopentagname(this.sectionStart, this._index); this.sectionStart = -1; this._state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); @@ -449,10 +446,7 @@ export default class Tokenizer { } private stateInClosingTagName(c: number): void { if (c === CharCodes.Gt || isWhitespace(c)) { - this.cbs.onclosetag( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.onclosetag(this.sectionStart, this._index); this.sectionStart = -1; this._state = State.AfterClosingTagName; this.stateAfterClosingTagName(c); @@ -510,11 +504,11 @@ export default class Tokenizer { if (c === CharCodes.Eq) { this._state = State.BeforeAttributeValue; } else if (c === CharCodes.Slash || c === CharCodes.Gt) { - this.cbs.onattribend(QuoteType.NoValue); + this.cbs.onattribend(QuoteType.NoValue, this._index); this._state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (!isWhitespace(c)) { - this.cbs.onattribend(QuoteType.NoValue); + this.cbs.onattribend(QuoteType.NoValue, this._index); this._state = State.InAttributeName; this.sectionStart = this._index; } @@ -545,7 +539,8 @@ export default class Tokenizer { this.cbs.onattribend( quote === CharCodes.DoubleQuote ? QuoteType.Double - : QuoteType.Single + : QuoteType.Single, + this._index ); this._state = State.BeforeAttributeName; } else if (this.decodeEntities && c === CharCodes.Amp) { @@ -566,7 +561,7 @@ export default class Tokenizer { this._index - this.sectionStart ); this.sectionStart = -1; - this.cbs.onattribend(QuoteType.Unquoted); + this.cbs.onattribend(QuoteType.Unquoted, this._index); this._state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (this.decodeEntities && c === CharCodes.Amp) { @@ -587,20 +582,14 @@ export default class Tokenizer { } private stateInDeclaration(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.ondeclaration( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.ondeclaration(this.sectionStart, this._index); this._state = State.Text; this.sectionStart = this._index + 1; } } private stateInProcessingInstruction(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.onprocessinginstruction( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.onprocessinginstruction(this.sectionStart, this._index); this._state = State.Text; this.sectionStart = this._index + 1; } @@ -706,7 +695,7 @@ export default class Tokenizer { } } - private emitNamedEntity() { + private emitNamedEntity(): void { if (this.entityResult !== 0) { if (this.entityTrie[this.entityResult] & BinTrieFlags.MULTI_BYTE) { const first = this.entityTrie[this.entityResult + 1]; @@ -960,7 +949,7 @@ export default class Tokenizer { } } - private emitPartial(start: number, length: number) { + private emitPartial(start: number, length: number): void { if ( this.baseState !== State.Text && this.baseState !== State.InSpecialTag @@ -970,7 +959,7 @@ export default class Tokenizer { this.cbs.ontext(start, length); } } - private emitCodePoint(cp: number) { + private emitCodePoint(cp: number): void { if ( this.baseState !== State.Text && this.baseState !== State.InSpecialTag diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index 4951ecd2a..5f641da6b 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -22,7 +22,7 @@ Array [ Array [ "onopentagname", 1, - 6, + 7, ], Array [ "onselfclosingtag", @@ -30,7 +30,7 @@ Array [ Array [ "onopentagname", 11, - 3, + 14, ], Array [ "onopentagend", @@ -38,7 +38,7 @@ Array [ Array [ "onclosetag", 17, - 3, + 20, ], Array [ "onend", @@ -51,7 +51,7 @@ Array [ Array [ "onopentagname", 1, - 5, + 6, ], Array [ "onselfclosingtag", @@ -59,7 +59,7 @@ Array [ Array [ "onopentagname", 10, - 3, + 13, ], Array [ "onopentagend", @@ -67,7 +67,7 @@ Array [ Array [ "onclosetag", 16, - 3, + 19, ], Array [ "onend", @@ -80,7 +80,7 @@ Array [ Array [ "onopentagname", 1, - 5, + 6, ], Array [ "onselfclosingtag", @@ -88,7 +88,7 @@ Array [ Array [ "onopentagname", 10, - 3, + 13, ], Array [ "onopentagend", @@ -96,7 +96,7 @@ Array [ Array [ "onclosetag", 16, - 3, + 19, ], Array [ "onend", @@ -109,7 +109,7 @@ Array [ Array [ "onopentagname", 1, - 6, + 7, ], Array [ "onopentagend", @@ -117,12 +117,12 @@ Array [ Array [ "onclosetag", 10, - 6, + 16, ], Array [ "onopentagname", 18, - 3, + 21, ], Array [ "onopentagend", @@ -130,7 +130,7 @@ Array [ Array [ "onclosetag", 24, - 3, + 27, ], Array [ "onend", @@ -143,7 +143,7 @@ Array [ Array [ "onopentagname", 1, - 5, + 6, ], Array [ "onopentagend", @@ -151,12 +151,12 @@ Array [ Array [ "onclosetag", 9, - 5, + 14, ], Array [ "onopentagname", 16, - 3, + 19, ], Array [ "onopentagend", @@ -164,7 +164,7 @@ Array [ Array [ "onclosetag", 22, - 3, + 25, ], Array [ "onend", @@ -177,7 +177,7 @@ Array [ Array [ "onopentagname", 1, - 5, + 6, ], Array [ "onopentagend", @@ -185,12 +185,12 @@ Array [ Array [ "onclosetag", 9, - 5, + 14, ], Array [ "onopentagname", 16, - 3, + 19, ], Array [ "onopentagend", @@ -198,7 +198,7 @@ Array [ Array [ "onclosetag", 22, - 3, + 25, ], Array [ "onend", From 397ef7865b9a2406ae093d831ff9c3ba96f05645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Tue, 14 Dec 2021 21:10:15 +0000 Subject: [PATCH 4/7] Decode numeric entities on the go --- src/Tokenizer.ts | 52 ++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index b0b65b914..52944ee26 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -104,6 +104,13 @@ function isASCIIAlpha(c: number): boolean { ); } +function isHexDigit(c: number): boolean { + return ( + (c >= CharCodes.UpperA && c <= CharCodes.UpperF) || + (c >= CharCodes.LowerA && c <= CharCodes.LowerF) + ); +} + export enum QuoteType { NoValue = 0, Unquoted = 1, @@ -629,6 +636,7 @@ export default class Tokenizer { private trieIndex = 0; private trieCurrent = 0; + /** For named entities, the index of the value. For numeric entities, the code point. */ private entityResult = 0; private entityExcess = 0; @@ -728,9 +736,10 @@ export default class Tokenizer { } } - private decodeNumericEntity(base: 10 | 16, strict: boolean) { + private emitNumericEntity(strict: boolean) { const entityStart = this._index - this.entityExcess - 1; - const numberStart = entityStart + 2 + (base >> 4); + const numberStart = + entityStart + 2 + Number(this._state === State.InHexEntity); if (numberStart !== this._index) { // Emit leading data if any @@ -741,44 +750,43 @@ export default class Tokenizer { ); } - // Parse entity - const entity = this.buffer.substring(numberStart, this._index); - const parsed = parseInt(entity, base); - this.emitCodePoint(parsed); + this.emitCodePoint(this.entityResult); this.sectionStart = this._index + Number(strict); } this._state = this.baseState; } private stateInNumericEntity(c: number): void { if (c === CharCodes.Semi) { - this.decodeNumericEntity(10, true); - } else if (!isNumber(c)) { + this.emitNumericEntity(true); + } else if (isNumber(c)) { + this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero); + this.entityExcess++; + } else { if (this.allowLegacyEntity()) { - this.decodeNumericEntity(10, false); + this.emitNumericEntity(false); } else { this._state = this.baseState; } this._index--; - } else { - this.entityExcess++; } } private stateInHexEntity(c: number): void { if (c === CharCodes.Semi) { - this.decodeNumericEntity(16, true); - } else if ( - (c < CharCodes.LowerA || c > CharCodes.LowerF) && - (c < CharCodes.UpperA || c > CharCodes.UpperF) && - !isNumber(c) - ) { + this.emitNumericEntity(true); + } else if (isNumber(c)) { + this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero); + this.entityExcess++; + } else if (isHexDigit(c)) { + this.entityResult = + this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10); + this.entityExcess++; + } else { if (this.allowLegacyEntity()) { - this.decodeNumericEntity(16, false); + this.emitNumericEntity(false); } else { this._state = this.baseState; } this._index--; - } else { - this.entityExcess++; } } @@ -921,13 +929,13 @@ export default class Tokenizer { this._state === State.InNumericEntity && this.allowLegacyEntity() ) { - this.decodeNumericEntity(10, false); + this.emitNumericEntity(false); // All trailing data will have been consumed } else if ( this._state === State.InHexEntity && this.allowLegacyEntity() ) { - this.decodeNumericEntity(16, false); + this.emitNumericEntity(false); // All trailing data will have been consumed } else if ( this._state === State.InTagName || From f30745478018befe9e83c32f95be1cb8d7a42160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Tue, 14 Dec 2021 21:24:51 +0000 Subject: [PATCH 5/7] Pass `endIndex` for all callbacks --- src/Parser.ts | 65 +++++++++-------- src/Tokenizer.ts | 88 ++++++++---------------- src/__snapshots__/Tokenizer.spec.ts.snap | 14 +++- 3 files changed, 72 insertions(+), 95 deletions(-) diff --git a/src/Parser.ts b/src/Parser.ts index 2dcffd0a1..da4744a2d 100644 --- a/src/Parser.ts +++ b/src/Parser.ts @@ -236,12 +236,11 @@ export class Parser implements Callbacks { // Tokenizer event handlers /** @internal */ - ontext(start: number, length: number): void { - const data = this.getSubstr(start, length); - const idx = start + length; - this.endIndex = idx - 1; + ontext(start: number, endIndex: number): void { + const data = this.getSlice(start, endIndex); + this.endIndex = endIndex - 1; this.cbs.ontext?.(data); - this.startIndex = idx; + this.startIndex = endIndex; } /** @internal */ @@ -260,7 +259,7 @@ export class Parser implements Callbacks { onopentagname(start: number, endIndex: number): void { this.endIndex = endIndex; - let name = this.getSubstr(start, endIndex - start); + let name = this.getSlice(start, endIndex); if (this.lowerCaseTagNames) { name = name.toLowerCase(); @@ -299,7 +298,6 @@ export class Parser implements Callbacks { private endOpenTag(isImplied: boolean) { this.startIndex = this.openTagStart; - this.endIndex = this.tokenizer.getIndex(); if (this.attribs) { this.cbs.onopentag?.(this.tagname, this.attribs, isImplied); @@ -313,18 +311,19 @@ export class Parser implements Callbacks { } /** @internal */ - onopentagend(): void { + onopentagend(endIndex: number): void { + this.endIndex = endIndex; this.endOpenTag(false); // Set `startIndex` for next node - this.startIndex = this.endIndex + 1; + this.startIndex = endIndex + 1; } /** @internal */ onclosetag(start: number, endIndex: number): void { this.endIndex = endIndex; - let name = this.getSubstr(start, endIndex - start); + let name = this.getSlice(start, endIndex); if (this.lowerCaseTagNames) { name = name.toLowerCase(); @@ -359,11 +358,12 @@ export class Parser implements Callbacks { } // Set `startIndex` for next node - this.startIndex = this.endIndex + 1; + this.startIndex = endIndex + 1; } /** @internal */ - onselfclosingtag(): void { + onselfclosingtag(endIndex: number): void { + this.endIndex = endIndex; if ( this.options.xmlMode || this.options.recognizeSelfClosing || @@ -372,10 +372,10 @@ export class Parser implements Callbacks { this.closeCurrentTag(false); // Set `startIndex` for next node - this.startIndex = this.endIndex + 1; + this.startIndex = endIndex + 1; } else { // Ignore the fact that the tag is self-closing. - this.onopentagend(); + this.onopentagend(endIndex); } } @@ -392,9 +392,9 @@ export class Parser implements Callbacks { } /** @internal */ - onattribname(start: number, length: number): void { + onattribname(start: number, endIndex: number): void { this.startIndex = start; - const name = this.getSubstr(start, length); + const name = this.getSlice(start, endIndex); this.attribname = this.lowerCaseAttributeNames ? name.toLowerCase() @@ -402,8 +402,8 @@ export class Parser implements Callbacks { } /** @internal */ - onattribdata(start: number, length: number): void { - this.attribvalue += this.getSubstr(start, length); + onattribdata(start: number, endIndex: number): void { + this.attribvalue += this.getSlice(start, endIndex); } /** @internal */ @@ -449,7 +449,7 @@ export class Parser implements Callbacks { /** @internal */ ondeclaration(start: number, endIndex: number): void { this.endIndex = endIndex; - const value = this.getSubstr(start, endIndex - start); + const value = this.getSlice(start, endIndex); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); @@ -457,13 +457,13 @@ export class Parser implements Callbacks { } // Set `startIndex` for next node - this.startIndex = this.endIndex + 1; + this.startIndex = endIndex + 1; } /** @internal */ onprocessinginstruction(start: number, endIndex: number): void { this.endIndex = endIndex; - const value = this.getSubstr(start, endIndex - start); + const value = this.getSlice(start, endIndex); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); @@ -471,25 +471,24 @@ export class Parser implements Callbacks { } // Set `startIndex` for next node - this.startIndex = this.endIndex + 1; + this.startIndex = endIndex + 1; } /** @internal */ - oncomment(start: number, length: number): void { - this.endIndex = this.tokenizer.getIndex(); - const value = this.getSubstr(start, length); + oncomment(start: number, endIndex: number, offset: number): void { + this.endIndex = endIndex; - this.cbs.oncomment?.(value); + this.cbs.oncomment?.(this.getSlice(start, endIndex - offset)); this.cbs.oncommentend?.(); // Set `startIndex` for next node - this.startIndex = this.endIndex + 1; + this.startIndex = endIndex + 1; } /** @internal */ - oncdata(start: number, length: number): void { - this.endIndex = this.tokenizer.getIndex(); - const value = this.getSubstr(start, length); + oncdata(start: number, endIndex: number, offset: number): void { + this.endIndex = endIndex; + const value = this.getSlice(start, endIndex - offset); if (this.options.xmlMode || this.options.recognizeCDATA) { this.cbs.oncdatastart?.(); @@ -501,7 +500,7 @@ export class Parser implements Callbacks { } // Set `startIndex` for next node - this.startIndex = this.endIndex + 1; + this.startIndex = endIndex + 1; } /** @internal */ @@ -552,8 +551,8 @@ export class Parser implements Callbacks { private buffer = ""; - private getSubstr(start: number, length: number) { - return this.buffer.substr(start, length); + private getSlice(start: number, end: number) { + return this.buffer.slice(start, end); } /** diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 52944ee26..901b20dd6 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -119,21 +119,21 @@ export enum QuoteType { } export interface Callbacks { - onattribdata(start: number, length: number): void; + onattribdata(start: number, endIndex: number): void; onattribentity(codepoint: number): void; onattribend(quote: QuoteType, endIndex: number): void; - onattribname(start: number, length: number): void; - oncdata(start: number, length: number): void; + onattribname(start: number, endIndex: number): void; + oncdata(start: number, endIndex: number, endOffset: number): void; onclosetag(start: number, endIndex: number): void; - oncomment(start: number, length: number): void; + oncomment(start: number, endIndex: number, endOffset: number): void; ondeclaration(start: number, endIndex: number): void; onend(): void; onerror(error: Error, state?: State): void; - onopentagend(): void; + onopentagend(endIndex: number): void; onopentagname(start: number, endIndex: number): void; onprocessinginstruction(start: number, endIndex: number): void; - onselfclosingtag(): void; - ontext(start: number, length: number): void; + onselfclosingtag(endIndex: number): void; + ontext(start: number, endIndex: number): void; ontextentity(codepoint: number): void; } @@ -237,10 +237,7 @@ export default class Tokenizer { (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt)) ) { if (this._index > this.sectionStart) { - this.cbs.ontext( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.ontext(this.sectionStart, this._index); } this._state = State.BeforeTagName; this.sectionStart = this._index; @@ -281,10 +278,7 @@ export default class Tokenizer { // Spoof the index so that reported locations match up. const actualIndex = this._index; this._index = endOfText; - this.cbs.ontext( - this.sectionStart, - endOfText - this.sectionStart - ); + this.cbs.ontext(this.sectionStart, endOfText); this._index = actualIndex; } @@ -365,13 +359,10 @@ export default class Tokenizer { private stateInCommentLike(c: number): void { if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { - // Remove 2 trailing chars - const length = this._index - 2 - this.sectionStart; - if (this.currentSequence === Sequences.CdataEnd) { - this.cbs.oncdata(this.sectionStart, length); + this.cbs.oncdata(this.sectionStart, this._index, 2); } else { - this.cbs.oncomment(this.sectionStart, length); + this.cbs.oncomment(this.sectionStart, this._index, 2); } this.sequenceIndex = 0; @@ -468,7 +459,7 @@ export default class Tokenizer { } private stateBeforeAttributeName(c: number): void { if (c === CharCodes.Gt) { - this.cbs.onopentagend(); + this.cbs.onopentagend(this._index); if (this.isSpecial) { this._state = State.InSpecialTag; this.sequenceIndex = 0; @@ -486,7 +477,7 @@ export default class Tokenizer { } private stateInSelfClosingTag(c: number): void { if (c === CharCodes.Gt) { - this.cbs.onselfclosingtag(); + this.cbs.onselfclosingtag(this._index); this._state = State.Text; this.baseState = State.Text; this.sectionStart = this._index + 1; @@ -498,10 +489,7 @@ export default class Tokenizer { } private stateInAttributeName(c: number): void { if (c === CharCodes.Eq || isEndOfTagSection(c)) { - this.cbs.onattribname( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.onattribname(this.sectionStart, this._index); this.sectionStart = -1; this._state = State.AfterAttributeName; this.stateAfterAttributeName(c); @@ -538,10 +526,7 @@ export default class Tokenizer { c === quote || (!this.decodeEntities && this.fastForwardTo(quote)) ) { - this.cbs.onattribdata( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.onattribdata(this.sectionStart, this._index); this.sectionStart = -1; this.cbs.onattribend( quote === CharCodes.DoubleQuote @@ -563,10 +548,7 @@ export default class Tokenizer { } private stateInAttributeValueNoQuotes(c: number): void { if (isWhitespace(c) || c === CharCodes.Gt) { - this.cbs.onattribdata( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.onattribdata(this.sectionStart, this._index); this.sectionStart = -1; this.cbs.onattribend(QuoteType.Unquoted, this._index); this._state = State.BeforeAttributeName; @@ -614,10 +596,7 @@ export default class Tokenizer { } private stateInSpecialComment(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.oncomment( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.oncomment(this.sectionStart, this._index, 0); this._state = State.Text; this.sectionStart = this._index + 1; } @@ -686,10 +665,7 @@ export default class Tokenizer { const entityStart = this._index - this.entityExcess + 1; if (entityStart > this.sectionStart) { - this.emitPartial( - this.sectionStart, - entityStart - this.sectionStart - ); + this.emitPartial(this.sectionStart, entityStart); } // If this is a surrogate pair, consume the next two bytes @@ -744,10 +720,7 @@ export default class Tokenizer { if (numberStart !== this._index) { // Emit leading data if any if (entityStart > this.sectionStart) { - this.emitPartial( - this.sectionStart, - entityStart - this.sectionStart - ); + this.emitPartial(this.sectionStart, entityStart); } this.emitCodePoint(this.entityResult); @@ -808,20 +781,14 @@ export default class Tokenizer { this._state === State.Text || (this._state === State.InSpecialTag && this.sequenceIndex === 0) ) { - this.cbs.ontext( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.ontext(this.sectionStart, this._index); this.sectionStart = this._index; } else if ( this._state === State.InAttributeValueDq || this._state === State.InAttributeValueSq || this._state === State.InAttributeValueNq ) { - this.cbs.onattribdata( - this.sectionStart, - this._index - this.sectionStart - ); + this.cbs.onattribdata(this.sectionStart, this._index); this.sectionStart = this._index; } } @@ -918,12 +885,11 @@ export default class Tokenizer { /** Handle any trailing data. */ private handleTrailingData() { - const remaining = this.buffer.length - this.sectionStart; if (this._state === State.InCommentLike) { if (this.currentSequence === Sequences.CdataEnd) { - this.cbs.oncdata(this.sectionStart, remaining); + this.cbs.oncdata(this.sectionStart, this.buffer.length, 0); } else { - this.cbs.oncomment(this.sectionStart, remaining); + this.cbs.oncomment(this.sectionStart, this.buffer.length, 0); } } else if ( this._state === State.InNumericEntity && @@ -953,18 +919,18 @@ export default class Tokenizer { * respective callback signals that the tag should be ignored. */ } else { - this.cbs.ontext(this.sectionStart, remaining); + this.cbs.ontext(this.sectionStart, this.buffer.length); } } - private emitPartial(start: number, length: number): void { + private emitPartial(start: number, endIndex: number): void { if ( this.baseState !== State.Text && this.baseState !== State.InSpecialTag ) { - this.cbs.onattribdata(start, length); + this.cbs.onattribdata(start, endIndex); } else { - this.cbs.ontext(start, length); + this.cbs.ontext(start, endIndex); } } private emitCodePoint(cp: number): void { diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index 5f641da6b..91d8ecf27 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -9,7 +9,7 @@ Array [ Array [ "ontext", 5, - 7, + 12, ], Array [ "onend", @@ -26,6 +26,7 @@ Array [ ], Array [ "onselfclosingtag", + 9, ], Array [ "onopentagname", @@ -34,6 +35,7 @@ Array [ ], Array [ "onopentagend", + 14, ], Array [ "onclosetag", @@ -55,6 +57,7 @@ Array [ ], Array [ "onselfclosingtag", + 8, ], Array [ "onopentagname", @@ -63,6 +66,7 @@ Array [ ], Array [ "onopentagend", + 13, ], Array [ "onclosetag", @@ -84,6 +88,7 @@ Array [ ], Array [ "onselfclosingtag", + 8, ], Array [ "onopentagname", @@ -92,6 +97,7 @@ Array [ ], Array [ "onopentagend", + 13, ], Array [ "onclosetag", @@ -113,6 +119,7 @@ Array [ ], Array [ "onopentagend", + 7, ], Array [ "onclosetag", @@ -126,6 +133,7 @@ Array [ ], Array [ "onopentagend", + 21, ], Array [ "onclosetag", @@ -147,6 +155,7 @@ Array [ ], Array [ "onopentagend", + 6, ], Array [ "onclosetag", @@ -160,6 +169,7 @@ Array [ ], Array [ "onopentagend", + 19, ], Array [ "onclosetag", @@ -181,6 +191,7 @@ Array [ ], Array [ "onopentagend", + 6, ], Array [ "onclosetag", @@ -194,6 +205,7 @@ Array [ ], Array [ "onopentagend", + 19, ], Array [ "onclosetag", From ca09b29d5db8a09cb26a5850f1fde1208414cbd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Tue, 14 Dec 2021 22:14:19 +0000 Subject: [PATCH 6/7] Don't concat strings and only pass a single string at a time to the tokenizer --- src/Parser.spec.ts | 4 +-- src/Parser.ts | 74 +++++++++++++++++++++++++++++++++++++--------- src/Tokenizer.ts | 40 +++++++++++-------------- 3 files changed, 79 insertions(+), 39 deletions(-) diff --git a/src/Parser.spec.ts b/src/Parser.spec.ts index e546c5da7..88fb2bde9 100644 --- a/src/Parser.spec.ts +++ b/src/Parser.spec.ts @@ -48,11 +48,11 @@ describe("API", () => { p.resume(); expect(onText).toHaveBeenCalledTimes(1); p.pause(); - p.end("foo"); + p.end("bar"); expect(onText).toHaveBeenCalledTimes(1); p.resume(); expect(onText).toHaveBeenCalledTimes(2); - expect(onText).toHaveBeenLastCalledWith("foo"); + expect(onText).toHaveBeenLastCalledWith("bar"); }); test("should back out of numeric entities (#125)", () => { diff --git a/src/Parser.ts b/src/Parser.ts index da4744a2d..2339f9e83 100644 --- a/src/Parser.ts +++ b/src/Parser.ts @@ -218,6 +218,13 @@ export class Parser implements Callbacks { private readonly lowerCaseAttributeNames: boolean; private readonly tokenizer: Tokenizer; + private readonly buffers: string[] = []; + private bufferOffset = 0; + /** The index of the last written buffer. Used when resuming after a `pause()`. */ + private writeIndex = 0; + /** Indicates whether the parser has finished running / `.end` has been called. */ + private ended = false; + constructor( cbs?: Partial<Handler> | null, private readonly options: ParserOptions = {} @@ -503,11 +510,6 @@ export class Parser implements Callbacks { this.startIndex = endIndex + 1; } - /** @internal */ - onerror(err: Error): void { - this.cbs.onerror?.(err); - } - /** @internal */ onend(): void { if (this.cbs.onclosetag) { @@ -531,11 +533,14 @@ export class Parser implements Callbacks { this.tagname = ""; this.attribname = ""; this.attribs = null; - this.stack = []; + this.stack.length = 0; this.startIndex = 0; this.endIndex = 0; this.cbs.onparserinit?.(this); - this.buffer = ""; + this.buffers.length = 0; + this.bufferOffset = 0; + this.writeIndex = 0; + this.ended = false; } /** @@ -549,10 +554,28 @@ export class Parser implements Callbacks { this.end(data); } - private buffer = ""; - private getSlice(start: number, end: number) { - return this.buffer.slice(start, end); + while (start - this.bufferOffset >= this.buffers[0].length) { + this.shiftBuffer(); + } + + let str = this.buffers[0].slice( + start - this.bufferOffset, + end - this.bufferOffset + ); + + while (end - this.bufferOffset > this.buffers[0].length) { + this.shiftBuffer(); + str += this.buffers[0].slice(0, end - this.bufferOffset); + } + + return str; + } + + private shiftBuffer(): void { + this.bufferOffset += this.buffers[0].length; + this.writeIndex--; + this.buffers.shift(); } /** @@ -561,8 +584,16 @@ export class Parser implements Callbacks { * @param chunk Chunk to parse. */ public write(chunk: string): void { - this.buffer += chunk; - this.tokenizer.write(chunk); + if (this.ended) { + this.cbs.onerror?.(new Error(".write() after done!")); + return; + } + + this.buffers.push(chunk); + if (this.tokenizer.running) { + this.tokenizer.write(chunk); + this.writeIndex++; + } } /** @@ -571,8 +602,14 @@ export class Parser implements Callbacks { * @param chunk Optional final chunk to parse. */ public end(chunk?: string): void { - if (chunk) this.buffer += chunk; - this.tokenizer.end(chunk); + if (this.ended) { + this.cbs.onerror?.(Error(".end() after done!")); + return; + } + + if (chunk) this.write(chunk); + this.ended = true; + this.tokenizer.end(); } /** @@ -587,6 +624,15 @@ export class Parser implements Callbacks { */ public resume(): void { this.tokenizer.resume(); + + while ( + this.tokenizer.running && + this.writeIndex < this.buffers.length + ) { + this.tokenizer.write(this.buffers[this.writeIndex++]); + } + + if (this.ended) this.tokenizer.end(); } /** diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 901b20dd6..543b93514 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -128,7 +128,6 @@ export interface Callbacks { oncomment(start: number, endIndex: number, endOffset: number): void; ondeclaration(start: number, endIndex: number): void; onend(): void; - onerror(error: Error, state?: State): void; onopentagend(endIndex: number): void; onopentagname(start: number, endIndex: number): void; onprocessinginstruction(start: number, endIndex: number): void; @@ -166,9 +165,9 @@ export default class Tokenizer { /** For special parsing behavior inside of script and style tags. */ private isSpecial = false; /** Indicates whether the tokenizer has been paused. */ - private running = true; - /** Indicates whether the tokenizer has finished running / `.end` has been called. */ - private ended = false; + public running = true; + /** The offset of the current buffer. */ + private offset = 0; private readonly xmlMode: boolean; private readonly decodeEntities: boolean; @@ -194,19 +193,16 @@ export default class Tokenizer { this.baseState = State.Text; this.currentSequence = undefined!; this.running = true; - this.ended = false; + this.offset = 0; } public write(chunk: string): void { - if (this.ended) return this.cbs.onerror(Error(".write() after done!")); - this.buffer += chunk; + this.offset += this.buffer.length; + this.buffer = chunk; this.parse(); } - public end(chunk?: string): void { - if (this.ended) return this.cbs.onerror(Error(".end() after done!")); - if (chunk) this.write(chunk); - this.ended = true; + public end(): void { if (this.running) this.finish(); } @@ -216,12 +212,9 @@ export default class Tokenizer { public resume(): void { this.running = true; - if (this._index < this.buffer.length) { + if (this._index < this.buffer.length + this.offset) { this.parse(); } - if (this.ended) { - this.finish(); - } } /** @@ -331,8 +324,8 @@ export default class Tokenizer { * @returns Whether the character was found. */ private fastForwardTo(c: number): boolean { - while (++this._index < this.buffer.length) { - if (this.buffer.charCodeAt(this._index) === c) { + while (++this._index < this.buffer.length + this.offset) { + if (this.buffer.charCodeAt(this._index - this.offset) === c) { return true; } } @@ -343,7 +336,7 @@ export default class Tokenizer { * * TODO: Refactor `parse` to increment index before calling states. */ - this._index = this.buffer.length - 1; + this._index = this.buffer.length + this.offset - 1; return false; } @@ -795,7 +788,7 @@ export default class Tokenizer { } private shouldContinue() { - return this._index < this.buffer.length && this.running; + return this._index < this.buffer.length + this.offset && this.running; } /** @@ -805,7 +798,7 @@ export default class Tokenizer { */ private parse() { while (this.shouldContinue()) { - const c = this.buffer.charCodeAt(this._index); + const c = this.buffer.charCodeAt(this._index - this.offset); if (this._state === State.Text) { this.stateText(c); } else if (this._state === State.SpecialStartSequence) { @@ -885,11 +878,12 @@ export default class Tokenizer { /** Handle any trailing data. */ private handleTrailingData() { + const endIndex = this.buffer.length + this.offset; if (this._state === State.InCommentLike) { if (this.currentSequence === Sequences.CdataEnd) { - this.cbs.oncdata(this.sectionStart, this.buffer.length, 0); + this.cbs.oncdata(this.sectionStart, endIndex, 0); } else { - this.cbs.oncomment(this.sectionStart, this.buffer.length, 0); + this.cbs.oncomment(this.sectionStart, endIndex, 0); } } else if ( this._state === State.InNumericEntity && @@ -919,7 +913,7 @@ export default class Tokenizer { * respective callback signals that the tag should be ignored. */ } else { - this.cbs.ontext(this.sectionStart, this.buffer.length); + this.cbs.ontext(this.sectionStart, endIndex); } } From 09ac7e188db9722d83b6564b518456351b5b54e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Tue, 14 Dec 2021 22:16:50 +0000 Subject: [PATCH 7/7] Remove `_` prefix from tokenizer private props --- src/Tokenizer.ts | 336 +++++++++++++++++++++++------------------------ 1 file changed, 168 insertions(+), 168 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 543b93514..93643215c 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -153,13 +153,13 @@ const Sequences = { export default class Tokenizer { /** The current state the tokenizer is in. */ - private _state = State.Text; + private state = State.Text; /** The read buffer. */ private buffer = ""; /** The beginning of the section that is currently being read. */ public sectionStart = 0; /** The index within the buffer that we are currently looking at. */ - private _index = 0; + private index = 0; /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */ private baseState = State.Text; /** For special parsing behavior inside of script and style tags. */ @@ -186,10 +186,10 @@ export default class Tokenizer { } public reset(): void { - this._state = State.Text; + this.state = State.Text; this.buffer = ""; this.sectionStart = 0; - this._index = 0; + this.index = 0; this.baseState = State.Text; this.currentSequence = undefined!; this.running = true; @@ -212,7 +212,7 @@ export default class Tokenizer { public resume(): void { this.running = true; - if (this._index < this.buffer.length + this.offset) { + if (this.index < this.buffer.length + this.offset) { this.parse(); } } @@ -221,7 +221,7 @@ export default class Tokenizer { * The current index within all of the written data. */ public getIndex(): number { - return this._index; + return this.index; } private stateText(c: number): void { @@ -229,13 +229,13 @@ export default class Tokenizer { c === CharCodes.Lt || (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt)) ) { - if (this._index > this.sectionStart) { - this.cbs.ontext(this.sectionStart, this._index); + if (this.index > this.sectionStart) { + this.cbs.ontext(this.sectionStart, this.index); } - this._state = State.BeforeTagName; - this.sectionStart = this._index; + this.state = State.BeforeTagName; + this.sectionStart = this.index; } else if (this.decodeEntities && c === CharCodes.Amp) { - this._state = State.BeforeEntity; + this.state = State.BeforeEntity; } } @@ -257,7 +257,7 @@ export default class Tokenizer { } this.sequenceIndex = 0; - this._state = State.InTagName; + this.state = State.InTagName; this.stateInTagName(c); } @@ -265,14 +265,14 @@ export default class Tokenizer { private stateInSpecialTag(c: number): void { if (this.sequenceIndex === this.currentSequence.length) { if (c === CharCodes.Gt || isWhitespace(c)) { - const endOfText = this._index - this.currentSequence.length; + const endOfText = this.index - this.currentSequence.length; if (this.sectionStart < endOfText) { // Spoof the index so that reported locations match up. - const actualIndex = this._index; - this._index = endOfText; + const actualIndex = this.index; + this.index = endOfText; this.cbs.ontext(this.sectionStart, endOfText); - this._index = actualIndex; + this.index = actualIndex; } this.isSpecial = false; @@ -290,7 +290,7 @@ export default class Tokenizer { if (this.currentSequence === Sequences.TitleEnd) { // We have to parse entities in <title> tags. if (this.decodeEntities && c === CharCodes.Amp) { - this._state = State.BeforeEntity; + this.state = State.BeforeEntity; } } else if (this.fastForwardTo(CharCodes.Lt)) { // Outside of <title> tags, we can fast-forward. @@ -305,14 +305,14 @@ export default class Tokenizer { private stateCDATASequence(c: number): void { if (c === Sequences.Cdata[this.sequenceIndex]) { if (++this.sequenceIndex === Sequences.Cdata.length) { - this._state = State.InCommentLike; + this.state = State.InCommentLike; this.currentSequence = Sequences.CdataEnd; this.sequenceIndex = 0; - this.sectionStart = this._index + 1; + this.sectionStart = this.index + 1; } } else { this.sequenceIndex = 0; - this._state = State.InDeclaration; + this.state = State.InDeclaration; this.stateInDeclaration(c); // Reconsume the character } } @@ -324,8 +324,8 @@ export default class Tokenizer { * @returns Whether the character was found. */ private fastForwardTo(c: number): boolean { - while (++this._index < this.buffer.length + this.offset) { - if (this.buffer.charCodeAt(this._index - this.offset) === c) { + while (++this.index < this.buffer.length + this.offset) { + if (this.buffer.charCodeAt(this.index - this.offset) === c) { return true; } } @@ -336,7 +336,7 @@ export default class Tokenizer { * * TODO: Refactor `parse` to increment index before calling states. */ - this._index = this.buffer.length + this.offset - 1; + this.index = this.buffer.length + this.offset - 1; return false; } @@ -353,14 +353,14 @@ export default class Tokenizer { if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { if (this.currentSequence === Sequences.CdataEnd) { - this.cbs.oncdata(this.sectionStart, this._index, 2); + this.cbs.oncdata(this.sectionStart, this.index, 2); } else { - this.cbs.oncomment(this.sectionStart, this._index, 2); + this.cbs.oncomment(this.sectionStart, this.index, 2); } this.sequenceIndex = 0; - this.sectionStart = this._index + 1; - this._state = State.Text; + this.sectionStart = this.index + 1; + this.state = State.Text; } } else if (this.sequenceIndex === 0) { // Fast-forward to the first character of the sequence @@ -387,39 +387,39 @@ export default class Tokenizer { this.isSpecial = true; this.currentSequence = sequence; this.sequenceIndex = offset; - this._state = State.SpecialStartSequence; + this.state = State.SpecialStartSequence; } private stateBeforeTagName(c: number): void { if (c === CharCodes.ExclamationMark) { - this._state = State.BeforeDeclaration; - this.sectionStart = this._index + 1; + this.state = State.BeforeDeclaration; + this.sectionStart = this.index + 1; } else if (c === CharCodes.Questionmark) { - this._state = State.InProcessingInstruction; - this.sectionStart = this._index + 1; + this.state = State.InProcessingInstruction; + this.sectionStart = this.index + 1; } else if (this.isTagStartChar(c)) { const lower = c | 0x20; - this.sectionStart = this._index; + this.sectionStart = this.index; if (!this.xmlMode && lower === Sequences.TitleEnd[2]) { this.startSpecial(Sequences.TitleEnd, 3); } else { - this._state = + this.state = !this.xmlMode && lower === Sequences.ScriptEnd[2] ? State.BeforeSpecialS : State.InTagName; } } else if (c === CharCodes.Slash) { - this._state = State.BeforeClosingTagName; + this.state = State.BeforeClosingTagName; } else { - this._state = State.Text; + this.state = State.Text; this.stateText(c); } } private stateInTagName(c: number): void { if (isEndOfTagSection(c)) { - this.cbs.onopentagname(this.sectionStart, this._index); + this.cbs.onopentagname(this.sectionStart, this.index); this.sectionStart = -1; - this._state = State.BeforeAttributeName; + this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } } @@ -427,90 +427,90 @@ export default class Tokenizer { if (isWhitespace(c)) { // Ignore } else if (c === CharCodes.Gt) { - this._state = State.Text; + this.state = State.Text; } else { - this._state = this.isTagStartChar(c) + this.state = this.isTagStartChar(c) ? State.InClosingTagName : State.InSpecialComment; - this.sectionStart = this._index; + this.sectionStart = this.index; } } private stateInClosingTagName(c: number): void { if (c === CharCodes.Gt || isWhitespace(c)) { - this.cbs.onclosetag(this.sectionStart, this._index); + this.cbs.onclosetag(this.sectionStart, this.index); this.sectionStart = -1; - this._state = State.AfterClosingTagName; + this.state = State.AfterClosingTagName; this.stateAfterClosingTagName(c); } } private stateAfterClosingTagName(c: number): void { // Skip everything until ">" if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this._state = State.Text; - this.sectionStart = this._index + 1; + this.state = State.Text; + this.sectionStart = this.index + 1; } } private stateBeforeAttributeName(c: number): void { if (c === CharCodes.Gt) { - this.cbs.onopentagend(this._index); + this.cbs.onopentagend(this.index); if (this.isSpecial) { - this._state = State.InSpecialTag; + this.state = State.InSpecialTag; this.sequenceIndex = 0; } else { - this._state = State.Text; + this.state = State.Text; } - this.baseState = this._state; - this.sectionStart = this._index + 1; + this.baseState = this.state; + this.sectionStart = this.index + 1; } else if (c === CharCodes.Slash) { - this._state = State.InSelfClosingTag; + this.state = State.InSelfClosingTag; } else if (!isWhitespace(c)) { - this._state = State.InAttributeName; - this.sectionStart = this._index; + this.state = State.InAttributeName; + this.sectionStart = this.index; } } private stateInSelfClosingTag(c: number): void { if (c === CharCodes.Gt) { - this.cbs.onselfclosingtag(this._index); - this._state = State.Text; + this.cbs.onselfclosingtag(this.index); + this.state = State.Text; this.baseState = State.Text; - this.sectionStart = this._index + 1; + this.sectionStart = this.index + 1; this.isSpecial = false; // Reset special state, in case of self-closing special tags } else if (!isWhitespace(c)) { - this._state = State.BeforeAttributeName; + this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } } private stateInAttributeName(c: number): void { if (c === CharCodes.Eq || isEndOfTagSection(c)) { - this.cbs.onattribname(this.sectionStart, this._index); + this.cbs.onattribname(this.sectionStart, this.index); this.sectionStart = -1; - this._state = State.AfterAttributeName; + this.state = State.AfterAttributeName; this.stateAfterAttributeName(c); } } private stateAfterAttributeName(c: number): void { if (c === CharCodes.Eq) { - this._state = State.BeforeAttributeValue; + this.state = State.BeforeAttributeValue; } else if (c === CharCodes.Slash || c === CharCodes.Gt) { - this.cbs.onattribend(QuoteType.NoValue, this._index); - this._state = State.BeforeAttributeName; + this.cbs.onattribend(QuoteType.NoValue, this.index); + this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (!isWhitespace(c)) { - this.cbs.onattribend(QuoteType.NoValue, this._index); - this._state = State.InAttributeName; - this.sectionStart = this._index; + this.cbs.onattribend(QuoteType.NoValue, this.index); + this.state = State.InAttributeName; + this.sectionStart = this.index; } } private stateBeforeAttributeValue(c: number): void { if (c === CharCodes.DoubleQuote) { - this._state = State.InAttributeValueDq; - this.sectionStart = this._index + 1; + this.state = State.InAttributeValueDq; + this.sectionStart = this.index + 1; } else if (c === CharCodes.SingleQuote) { - this._state = State.InAttributeValueSq; - this.sectionStart = this._index + 1; + this.state = State.InAttributeValueSq; + this.sectionStart = this.index + 1; } else if (!isWhitespace(c)) { - this.sectionStart = this._index; - this._state = State.InAttributeValueNq; + this.sectionStart = this.index; + this.state = State.InAttributeValueNq; this.stateInAttributeValueNoQuotes(c); // Reconsume token } } @@ -519,18 +519,18 @@ export default class Tokenizer { c === quote || (!this.decodeEntities && this.fastForwardTo(quote)) ) { - this.cbs.onattribdata(this.sectionStart, this._index); + this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = -1; this.cbs.onattribend( quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single, - this._index + this.index ); - this._state = State.BeforeAttributeName; + this.state = State.BeforeAttributeName; } else if (this.decodeEntities && c === CharCodes.Amp) { - this.baseState = this._state; - this._state = State.BeforeEntity; + this.baseState = this.state; + this.state = State.BeforeEntity; } } private stateInAttributeValueDoubleQuotes(c: number): void { @@ -541,22 +541,22 @@ export default class Tokenizer { } private stateInAttributeValueNoQuotes(c: number): void { if (isWhitespace(c) || c === CharCodes.Gt) { - this.cbs.onattribdata(this.sectionStart, this._index); + this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = -1; - this.cbs.onattribend(QuoteType.Unquoted, this._index); - this._state = State.BeforeAttributeName; + this.cbs.onattribend(QuoteType.Unquoted, this.index); + this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (this.decodeEntities && c === CharCodes.Amp) { - this.baseState = this._state; - this._state = State.BeforeEntity; + this.baseState = this.state; + this.state = State.BeforeEntity; } } private stateBeforeDeclaration(c: number): void { if (c === CharCodes.OpeningSquareBracket) { - this._state = State.CDATASequence; + this.state = State.CDATASequence; this.sequenceIndex = 0; } else { - this._state = + this.state = c === CharCodes.Dash ? State.BeforeComment : State.InDeclaration; @@ -564,34 +564,34 @@ export default class Tokenizer { } private stateInDeclaration(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.ondeclaration(this.sectionStart, this._index); - this._state = State.Text; - this.sectionStart = this._index + 1; + this.cbs.ondeclaration(this.sectionStart, this.index); + this.state = State.Text; + this.sectionStart = this.index + 1; } } private stateInProcessingInstruction(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.onprocessinginstruction(this.sectionStart, this._index); - this._state = State.Text; - this.sectionStart = this._index + 1; + this.cbs.onprocessinginstruction(this.sectionStart, this.index); + this.state = State.Text; + this.sectionStart = this.index + 1; } } private stateBeforeComment(c: number): void { if (c === CharCodes.Dash) { - this._state = State.InCommentLike; + this.state = State.InCommentLike; this.currentSequence = Sequences.CommentEnd; // Allow short comments (eg. <!-->) this.sequenceIndex = 2; - this.sectionStart = this._index + 1; + this.sectionStart = this.index + 1; } else { - this._state = State.InDeclaration; + this.state = State.InDeclaration; } } private stateInSpecialComment(c: number): void { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { - this.cbs.oncomment(this.sectionStart, this._index, 0); - this._state = State.Text; - this.sectionStart = this._index + 1; + this.cbs.oncomment(this.sectionStart, this.index, 0); + this.state = State.Text; + this.sectionStart = this.index + 1; } } private stateBeforeSpecialS(c: number): void { @@ -601,7 +601,7 @@ export default class Tokenizer { } else if (lower === Sequences.StyleEnd[3]) { this.startSpecial(Sequences.StyleEnd, 4); } else { - this._state = State.InTagName; + this.state = State.InTagName; this.stateInTagName(c); // Consume the token again } } @@ -618,13 +618,13 @@ export default class Tokenizer { this.entityResult = 0; if (c === CharCodes.Num) { - this._state = State.BeforeNumericEntity; + this.state = State.BeforeNumericEntity; } else if (c === CharCodes.Amp) { // We have two `&` characters in a row. Stay in the current state. } else { this.trieIndex = 0; this.trieCurrent = this.entityTrie[0]; - this._state = State.InNamedEntity; + this.state = State.InNamedEntity; this.stateInNamedEntity(c); } } @@ -641,7 +641,7 @@ export default class Tokenizer { if (this.trieIndex < 0) { this.emitNamedEntity(); - this._index--; + this.index--; return; } @@ -655,7 +655,7 @@ export default class Tokenizer { this.trieIndex += 1; } else { // Add 1 as we have already incremented the excess - const entityStart = this._index - this.entityExcess + 1; + const entityStart = this.index - this.entityExcess + 1; if (entityStart > this.sectionStart) { this.emitPartial(this.sectionStart, entityStart); @@ -667,7 +667,7 @@ export default class Tokenizer { 1 + Number((this.trieCurrent & BinTrieFlags.MULTI_BYTE) !== 0); this.entityExcess = 0; - this.sectionStart = this._index + 1; + this.sectionStart = this.index + 1; } } } @@ -692,34 +692,34 @@ export default class Tokenizer { } } - this._state = this.baseState; + this.state = this.baseState; } private stateBeforeNumericEntity(c: number): void { if ((c | 0x20) === CharCodes.LowerX) { this.entityExcess++; - this._state = State.InHexEntity; + this.state = State.InHexEntity; } else { - this._state = State.InNumericEntity; + this.state = State.InNumericEntity; this.stateInNumericEntity(c); } } private emitNumericEntity(strict: boolean) { - const entityStart = this._index - this.entityExcess - 1; + const entityStart = this.index - this.entityExcess - 1; const numberStart = - entityStart + 2 + Number(this._state === State.InHexEntity); + entityStart + 2 + Number(this.state === State.InHexEntity); - if (numberStart !== this._index) { + if (numberStart !== this.index) { // Emit leading data if any if (entityStart > this.sectionStart) { this.emitPartial(this.sectionStart, entityStart); } this.emitCodePoint(this.entityResult); - this.sectionStart = this._index + Number(strict); + this.sectionStart = this.index + Number(strict); } - this._state = this.baseState; + this.state = this.baseState; } private stateInNumericEntity(c: number): void { if (c === CharCodes.Semi) { @@ -731,9 +731,9 @@ export default class Tokenizer { if (this.allowLegacyEntity()) { this.emitNumericEntity(false); } else { - this._state = this.baseState; + this.state = this.baseState; } - this._index--; + this.index--; } } private stateInHexEntity(c: number): void { @@ -750,9 +750,9 @@ export default class Tokenizer { if (this.allowLegacyEntity()) { this.emitNumericEntity(false); } else { - this._state = this.baseState; + this.state = this.baseState; } - this._index--; + this.index--; } } @@ -769,26 +769,26 @@ export default class Tokenizer { */ private cleanup() { // If we are inside of text or attributes, emit what we already have. - if (this.running && this.sectionStart !== this._index) { + if (this.running && this.sectionStart !== this.index) { if ( - this._state === State.Text || - (this._state === State.InSpecialTag && this.sequenceIndex === 0) + this.state === State.Text || + (this.state === State.InSpecialTag && this.sequenceIndex === 0) ) { - this.cbs.ontext(this.sectionStart, this._index); - this.sectionStart = this._index; + this.cbs.ontext(this.sectionStart, this.index); + this.sectionStart = this.index; } else if ( - this._state === State.InAttributeValueDq || - this._state === State.InAttributeValueSq || - this._state === State.InAttributeValueNq + this.state === State.InAttributeValueDq || + this.state === State.InAttributeValueSq || + this.state === State.InAttributeValueNq ) { - this.cbs.onattribdata(this.sectionStart, this._index); - this.sectionStart = this._index; + this.cbs.onattribdata(this.sectionStart, this.index); + this.sectionStart = this.index; } } } private shouldContinue() { - return this._index < this.buffer.length + this.offset && this.running; + return this.index < this.buffer.length + this.offset && this.running; } /** @@ -798,79 +798,79 @@ export default class Tokenizer { */ private parse() { while (this.shouldContinue()) { - const c = this.buffer.charCodeAt(this._index - this.offset); - if (this._state === State.Text) { + const c = this.buffer.charCodeAt(this.index - this.offset); + if (this.state === State.Text) { this.stateText(c); - } else if (this._state === State.SpecialStartSequence) { + } else if (this.state === State.SpecialStartSequence) { this.stateSpecialStartSequence(c); - } else if (this._state === State.InSpecialTag) { + } else if (this.state === State.InSpecialTag) { this.stateInSpecialTag(c); - } else if (this._state === State.CDATASequence) { + } else if (this.state === State.CDATASequence) { this.stateCDATASequence(c); - } else if (this._state === State.InAttributeValueDq) { + } else if (this.state === State.InAttributeValueDq) { this.stateInAttributeValueDoubleQuotes(c); - } else if (this._state === State.InAttributeName) { + } else if (this.state === State.InAttributeName) { this.stateInAttributeName(c); - } else if (this._state === State.InCommentLike) { + } else if (this.state === State.InCommentLike) { this.stateInCommentLike(c); - } else if (this._state === State.InSpecialComment) { + } else if (this.state === State.InSpecialComment) { this.stateInSpecialComment(c); - } else if (this._state === State.BeforeAttributeName) { + } else if (this.state === State.BeforeAttributeName) { this.stateBeforeAttributeName(c); - } else if (this._state === State.InTagName) { + } else if (this.state === State.InTagName) { this.stateInTagName(c); - } else if (this._state === State.InClosingTagName) { + } else if (this.state === State.InClosingTagName) { this.stateInClosingTagName(c); - } else if (this._state === State.BeforeTagName) { + } else if (this.state === State.BeforeTagName) { this.stateBeforeTagName(c); - } else if (this._state === State.AfterAttributeName) { + } else if (this.state === State.AfterAttributeName) { this.stateAfterAttributeName(c); - } else if (this._state === State.InAttributeValueSq) { + } else if (this.state === State.InAttributeValueSq) { this.stateInAttributeValueSingleQuotes(c); - } else if (this._state === State.BeforeAttributeValue) { + } else if (this.state === State.BeforeAttributeValue) { this.stateBeforeAttributeValue(c); - } else if (this._state === State.BeforeClosingTagName) { + } else if (this.state === State.BeforeClosingTagName) { this.stateBeforeClosingTagName(c); - } else if (this._state === State.AfterClosingTagName) { + } else if (this.state === State.AfterClosingTagName) { this.stateAfterClosingTagName(c); - } else if (this._state === State.BeforeSpecialS) { + } else if (this.state === State.BeforeSpecialS) { this.stateBeforeSpecialS(c); - } else if (this._state === State.InAttributeValueNq) { + } else if (this.state === State.InAttributeValueNq) { this.stateInAttributeValueNoQuotes(c); - } else if (this._state === State.InSelfClosingTag) { + } else if (this.state === State.InSelfClosingTag) { this.stateInSelfClosingTag(c); - } else if (this._state === State.InDeclaration) { + } else if (this.state === State.InDeclaration) { this.stateInDeclaration(c); - } else if (this._state === State.BeforeDeclaration) { + } else if (this.state === State.BeforeDeclaration) { this.stateBeforeDeclaration(c); - } else if (this._state === State.BeforeComment) { + } else if (this.state === State.BeforeComment) { this.stateBeforeComment(c); - } else if (this._state === State.InProcessingInstruction) { + } else if (this.state === State.InProcessingInstruction) { this.stateInProcessingInstruction(c); - } else if (this._state === State.InNamedEntity) { + } else if (this.state === State.InNamedEntity) { this.stateInNamedEntity(c); - } else if (this._state === State.BeforeEntity) { + } else if (this.state === State.BeforeEntity) { this.stateBeforeEntity(c); - } else if (this._state === State.InHexEntity) { + } else if (this.state === State.InHexEntity) { this.stateInHexEntity(c); - } else if (this._state === State.InNumericEntity) { + } else if (this.state === State.InNumericEntity) { this.stateInNumericEntity(c); } else { // `this._state === State.BeforeNumericEntity` this.stateBeforeNumericEntity(c); } - this._index++; + this.index++; } this.cleanup(); } private finish() { - if (this._state === State.InNamedEntity) { + if (this.state === State.InNamedEntity) { this.emitNamedEntity(); } // If there is remaining data, emit it in a reasonable way - if (this.sectionStart < this._index) { + if (this.sectionStart < this.index) { this.handleTrailingData(); } this.cbs.onend(); @@ -879,34 +879,34 @@ export default class Tokenizer { /** Handle any trailing data. */ private handleTrailingData() { const endIndex = this.buffer.length + this.offset; - if (this._state === State.InCommentLike) { + if (this.state === State.InCommentLike) { if (this.currentSequence === Sequences.CdataEnd) { this.cbs.oncdata(this.sectionStart, endIndex, 0); } else { this.cbs.oncomment(this.sectionStart, endIndex, 0); } } else if ( - this._state === State.InNumericEntity && + this.state === State.InNumericEntity && this.allowLegacyEntity() ) { this.emitNumericEntity(false); // All trailing data will have been consumed } else if ( - this._state === State.InHexEntity && + this.state === State.InHexEntity && this.allowLegacyEntity() ) { this.emitNumericEntity(false); // All trailing data will have been consumed } else if ( - this._state === State.InTagName || - this._state === State.BeforeAttributeName || - this._state === State.BeforeAttributeValue || - this._state === State.AfterAttributeName || - this._state === State.InAttributeName || - this._state === State.InAttributeValueSq || - this._state === State.InAttributeValueDq || - this._state === State.InAttributeValueNq || - this._state === State.InClosingTagName + this.state === State.InTagName || + this.state === State.BeforeAttributeName || + this.state === State.BeforeAttributeValue || + this.state === State.AfterAttributeName || + this.state === State.InAttributeName || + this.state === State.InAttributeValueSq || + this.state === State.InAttributeValueDq || + this.state === State.InAttributeValueNq || + this.state === State.InClosingTagName ) { /* * If we are currently in an opening or closing tag, us not calling the