From 7e30cdae50213aaef8e1535ad28702b301c505ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 17 Mar 2022 11:01:44 +0000 Subject: [PATCH] refactor: Remove `getNextToken` method (#461) --- packages/parse5-parser-stream/lib/index.ts | 73 ++++++++----------- packages/parse5-sax-parser/lib/index.ts | 8 +- packages/parse5/lib/parser/index.ts | 38 +--------- packages/parse5/lib/tokenizer/index.ts | 51 ++++++++++--- .../tokenizer/tokenizer-location-info.test.ts | 33 ++++----- test/utils/generate-tokenization-tests.ts | 19 ++--- 6 files changed, 95 insertions(+), 127 deletions(-) diff --git a/packages/parse5-parser-stream/lib/index.ts b/packages/parse5-parser-stream/lib/index.ts index d01f46b59..7e81095cc 100644 --- a/packages/parse5-parser-stream/lib/index.ts +++ b/packages/parse5-parser-stream/lib/index.ts @@ -3,6 +3,8 @@ import { Parser, ParserOptions } from 'parse5/dist/parser/index.js'; import type { TreeAdapterTypeMap } from 'parse5/dist/tree-adapters/interface.js'; import type { DefaultTreeAdapterMap } from 'parse5/dist/tree-adapters/default.js'; +/* eslint-disable unicorn/consistent-function-scoping -- The rule seems to be broken here. */ + /** * Streaming HTML parser with scripting support. * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable). @@ -28,8 +30,7 @@ import type { DefaultTreeAdapterMap } from 'parse5/dist/tree-adapters/default.js */ export class ParserStream extends Writable { private lastChunkWritten = false; - private writeCallback: null | (() => void) = null; - private pausedByScript = false; + private writeCallback: undefined | (() => void) = undefined; public parser: Parser; private pendingHtmlInsertions: string[] = []; @@ -42,7 +43,31 @@ export class ParserStream constructor(options?: ParserOptions) { super({ decodeStrings: false }); - this.parser = new Parser(options); + const resume = (): void => { + for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) { + this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]); + } + + this.pendingHtmlInsertions.length = 0; + + //NOTE: keep parsing if we don't wait for the next input chunk + this.parser.tokenizer.resume(this.writeCallback); + }; + + const documentWrite = (html: string): void => { + if (!this.parser.stopped) { + this.pendingHtmlInsertions.push(html); + } + }; + + const scriptHandler = (scriptElement: T['element']): void => { + if (this.listenerCount('script') > 0) { + this.parser.tokenizer.pause(); + this.emit('script', scriptElement, documentWrite, resume); + } + }; + + this.parser = new Parser(options, undefined, undefined, scriptHandler); this.document = this.parser.document; } @@ -53,8 +78,7 @@ export class ParserStream } this.writeCallback = callback; - this.parser.tokenizer.write(chunk, this.lastChunkWritten); - this._runParsingLoop(); + this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback); } // TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`. @@ -64,45 +88,6 @@ export class ParserStream this.lastChunkWritten = true; super.end(chunk || '', encoding, callback); } - - //Scriptable parser implementation - private _runParsingLoop(): void { - this.parser.runParsingLoopForCurrentChunk(this.writeCallback, this._scriptHandler); - } - - private _resume = (): void => { - if (!this.pausedByScript) { - throw new Error('Parser was already resumed'); - } - - while (this.pendingHtmlInsertions.length > 0) { - const html = this.pendingHtmlInsertions.pop()!; - - this.parser.tokenizer.insertHtmlAtCurrentPos(html); - } - - this.pausedByScript = false; - - //NOTE: keep parsing if we don't wait for the next input chunk - if (this.parser.tokenizer.active) { - this._runParsingLoop(); - } - }; - - private _documentWrite = (html: string): void => { - if (!this.parser.stopped) { - this.pendingHtmlInsertions.push(html); - } - }; - - private _scriptHandler = (scriptElement: T['element']): void => { - if (this.listenerCount('script') > 0) { - this.pausedByScript = true; - this.emit('script', scriptElement, this._documentWrite, this._resume); - } else { - this._runParsingLoop(); - } - }; } export interface ParserStream { diff --git a/packages/parse5-sax-parser/lib/index.ts b/packages/parse5-sax-parser/lib/index.ts index 741934087..e16c5a72b 100644 --- a/packages/parse5-sax-parser/lib/index.ts +++ b/packages/parse5-sax-parser/lib/index.ts @@ -123,23 +123,17 @@ export class SAXParser extends Transform implements TokenHandler { */ public stop(): void { this.stopped = true; + this.tokenizer.pause(); } //Internals protected _transformChunk(chunk: string): string { if (!this.stopped) { this.tokenizer.write(chunk, this.lastChunkWritten); - this._runParsingLoop(); } return chunk; } - private _runParsingLoop(): void { - while (!this.stopped && this.tokenizer.active) { - this.tokenizer.getNextToken(); - } - } - /** @internal */ onCharacter({ chars, location }: CharacterToken): void { if (this.pendingText === null) { diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 2a8bbad96..f18f424f2 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -129,7 +129,8 @@ export class Parser implements TokenHandler, Stack public constructor( options?: ParserOptions, document?: T['document'], - public fragmentContext: T['element'] | null = null + public fragmentContext: T['element'] | null = null, + public scriptHandler: null | ((pendingScript: T['element']) => void) = null ) { this.options = { ...defaultParserOptions, @@ -160,7 +161,6 @@ export class Parser implements TokenHandler, Stack const parser = new this(options); parser.tokenizer.write(html, true); - parser._runParsingLoop(null); return parser.document; } @@ -195,7 +195,6 @@ export class Parser implements TokenHandler, Stack parser._resetInsertionMode(); parser._findFormInFragmentContext(); parser.tokenizer.write(html, true); - parser._runParsingLoop(null); const rootElement = opts.treeAdapter.getFirstChild(documentMock) as T['parentNode']; const fragment = opts.treeAdapter.createDocumentFragment(); @@ -215,7 +214,6 @@ export class Parser implements TokenHandler, Stack headElement: null | T['element'] = null; formElement: null | T['element'] = null; - pendingScript: null | T['element'] = null; openElements: OpenElementStack; activeFormattingElements: FormattingElementList; @@ -253,36 +251,6 @@ export class Parser implements TokenHandler, Stack this.onParseError(err); } - //Parsing loop - private _runParsingLoop(scriptHandler: null | ((scriptElement: T['element']) => void)): void { - while (!this.stopped) { - this.tokenizer.getNextToken(); - - if (!this.tokenizer.active || (scriptHandler !== null && this.pendingScript)) { - break; - } - } - } - - public runParsingLoopForCurrentChunk( - writeCallback: null | (() => void), - scriptHandler: (scriptElement: T['element']) => void - ): void { - this._runParsingLoop(scriptHandler); - - if (scriptHandler && this.pendingScript) { - const script = this.pendingScript; - - this.pendingScript = null; - - scriptHandler(script); - - return; - } - - writeCallback?.(); - } - //Stack events onItemPush(node: T['parentNode'], tid: number, isTop: boolean): void { this.treeAdapter.onItemPush?.(node); @@ -2576,7 +2544,7 @@ function eofInBody(p: Parser, token: EOFToken): //------------------------------------------------------------------ function endTagInText(p: Parser, token: TagToken): void { if (token.tagID === $.SCRIPT) { - p.pendingScript = p.openElements.current; + p.scriptHandler?.(p.openElements.current); } p.openElements.pop(); diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index 48397a61f..51bc0873f 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -223,8 +223,9 @@ export interface TokenHandler { export class Tokenizer { public preprocessor: Preprocessor; - /** Indicates that the next token has been emitted, and `getNextToken` should return. */ - private hasEmitted = false; + private paused = false; + /** Ensures that the parsing loop isn't run multiple times at once. */ + private inLoop = false; /** * Indicates that the current adjusted node exists, is not an element in the HTML namespace, @@ -274,10 +275,12 @@ export class Tokenizer { }; } - //API - public getNextToken(): void { - this.hasEmitted = false; - while (!this.hasEmitted && this.active) { + private _runParsingLoop(): void { + if (this.inLoop) return; + + this.inLoop = true; + + while (this.active && !this.paused) { this.consumedAfterSnapshot = 0; const cp = this._consume(); @@ -286,16 +289,46 @@ export class Tokenizer { this._callState(cp); } } + + this.inLoop = false; } - public write(chunk: string, isLastChunk: boolean): void { + //API + public pause(): void { + this.paused = true; + } + + public resume(writeCallback?: () => void): void { + if (!this.paused) { + throw new Error('Parser was already resumed'); + } + + this.paused = false; + + // Necessary for synchronous resume. + if (this.inLoop) return; + + this._runParsingLoop(); + + if (!this.paused) { + writeCallback?.(); + } + } + + public write(chunk: string, isLastChunk: boolean, writeCallback?: () => void): void { this.active = true; this.preprocessor.write(chunk, isLastChunk); + this._runParsingLoop(); + + if (!this.paused) { + writeCallback?.(); + } } public insertHtmlAtCurrentPos(chunk: string): void { this.active = true; this.preprocessor.insertHtmlAtCurrentPos(chunk); + this._runParsingLoop(); } //Hibernation @@ -440,7 +473,6 @@ export class Tokenizer { ct.location.endOffset = this.preprocessor.offset + 1; } - this.hasEmitted = true; this.currentLocation = this.getCurrentLocation(-1); } @@ -508,7 +540,6 @@ export class Tokenizer { } } - this.hasEmitted = true; this.currentCharacterToken = null; } } @@ -524,7 +555,7 @@ export class Tokenizer { this._emitCurrentCharacterToken(location); this.handler.onEof({ type: TokenType.EOF, location }); - this.hasEmitted = true; + this.active = false; } //Characters emission diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts index eb9ff1856..6d99fe9f9 100644 --- a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts +++ b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts @@ -3,22 +3,19 @@ import { Tokenizer, TokenizerMode, TokenHandler } from './index.js'; import { Location, EOFToken, CharacterToken, DoctypeToken, TagToken, CommentToken } from '../common/token.js'; import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js'; -interface LocationInfoTestCase { - initialMode: typeof TokenizerMode[keyof typeof TokenizerMode]; - lastStartTagName: string; - htmlChunks: string[]; -} - /** Receives events and immediately compares them against the expected values. */ class LocationInfoHandler implements TokenHandler { public sawEof = false; + /** All HTML chunks concatenated. */ + private html: string; /** The index of the last html chunk. */ private idx = 0; /** All of the lines in the input. */ private lines: string[]; - constructor(private testCase: LocationInfoTestCase, private html: string) { - this.lines = html.split(/\r?\n/g); + constructor(private htmlChunks: string[]) { + this.html = htmlChunks.join(''); + this.lines = this.html.split(/\r?\n/g); } private validateLocation(location: Location | null): void { @@ -26,7 +23,7 @@ class LocationInfoHandler implements TokenHandler { //Offsets const actual = this.html.substring(location.startOffset, location.endOffset); - const chunk = this.testCase.htmlChunks[this.idx]; + const chunk = this.htmlChunks[this.idx]; assert.strictEqual(actual, chunk); @@ -65,7 +62,7 @@ class LocationInfoHandler implements TokenHandler { assert.strictEqual(location.endOffset, location.startOffset); assert.strictEqual(location.endOffset, this.html.length); - assert.strictEqual(this.idx, this.testCase.htmlChunks.length); + assert.strictEqual(this.idx, this.htmlChunks.length); this.sawEof = true; } @@ -166,14 +163,9 @@ it('Location Info (Tokenizer)', () => { ]; for (const testCase of testCases) { - const html = testCase.htmlChunks.join(''); - const handler = new LocationInfoHandler(testCase, html); + const { htmlChunks } = testCase; + const handler = new LocationInfoHandler(htmlChunks); const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler); - const lastChunkIdx = testCase.htmlChunks.length - 1; - - for (let i = 0; i < testCase.htmlChunks.length; i++) { - tokenizer.write(testCase.htmlChunks[i], i === lastChunkIdx); - } // NOTE: set small waterline for testing purposes tokenizer.preprocessor.bufferWaterline = 8; @@ -181,8 +173,11 @@ it('Location Info (Tokenizer)', () => { tokenizer.lastStartTagName = testCase.lastStartTagName; tokenizer.inForeignNode = !!testCase.inForeignNode; - while (!handler.sawEof) { - tokenizer.getNextToken(); + for (let i = 0; i < htmlChunks.length; i++) { + tokenizer.write(htmlChunks[i], i === htmlChunks.length - 1); } + + assert.ok(handler.sawEof); + assert.ok(!tokenizer.active); } }); diff --git a/test/utils/generate-tokenization-tests.ts b/test/utils/generate-tokenization-tests.ts index c7bb05526..0c3976328 100644 --- a/test/utils/generate-tokenization-tests.ts +++ b/test/utils/generate-tokenization-tests.ts @@ -95,14 +95,9 @@ class TokenizeHandler implements TokenSourceData, TokenHandler { public errors: TokenError[] = []; } -function tokenize( - createTokenSource: TokenSourceCreator, - chunks: string | string[], - testData: LoadedTest -): TokenSourceData { +function tokenize(createTokenSource: TokenSourceCreator, chunks: string[], testData: LoadedTest): TokenSourceData { const result = new TokenizeHandler(testData); const tokenizer = createTokenSource(result); - let chunkIdx = 0; // NOTE: set small waterline for testing purposes tokenizer.preprocessor.bufferWaterline = 8; @@ -112,14 +107,14 @@ function tokenize( tokenizer.lastStartTagName = testData.lastStartTag; } - while (!result.sawEof) { - if (tokenizer.active) { - tokenizer.getNextToken(); - } else { - tokenizer.write(chunks[chunkIdx], ++chunkIdx === chunks.length); - } + for (let i = 0; i < chunks.length; i++) { + assert.ok(!result.sawEof); + tokenizer.write(chunks[i], i === chunks.length - 1); } + assert.ok(result.sawEof); + assert.ok(!tokenizer.active); + // Sort errors by line and column result.errors.sort((err1, err2) => err1.line - err2.line || err1.col - err2.col);