From f09686e2c0edc0b16204d38b3b4698f43d321161 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sun, 27 Feb 2022 09:50:02 +0000 Subject: [PATCH 1/6] refactor(parser): Consume tokenizer events --- packages/parse5/lib/parser/index.ts | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index de93f2a91..638f26dce 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -1,5 +1,4 @@ -import { TokenizerMode } from '../tokenizer/index.js'; -import { QueuedTokenizer } from '../tokenizer/queued.js'; +import { TokenHandler, Tokenizer, TokenizerMode } from '../tokenizer/index.js'; import { OpenElementStack } from './open-element-stack.js'; import { FormattingElementList, ElementEntry, EntryType } from './formatting-element-list.js'; import * as defaultTreeAdapter from '../tree-adapters/default.js'; @@ -120,9 +119,9 @@ const defaultParserOptions = { }; //Parser -export class Parser { +export class Parser implements TokenHandler { treeAdapter: TreeAdapter; - private onParseError: ParserErrorHandler | null; + onParseError: ParserErrorHandler | null; private currentToken: Token | null = null; public options: Required>; public document: T['document']; @@ -147,7 +146,7 @@ export class Parser { this.document = document ?? this.treeAdapter.createDocument(); - this.tokenizer = new QueuedTokenizer(this.options); + this.tokenizer = new Tokenizer(this.options, this); this.activeFormattingElements = new FormattingElementList(this.treeAdapter); this.fragmentContextID = fragmentContext ? getTagID(this.treeAdapter.getTagName(fragmentContext)) : $.UNKNOWN; @@ -211,7 +210,8 @@ export class Parser { return fragment; } - tokenizer: QueuedTokenizer; + tokenizer: Tokenizer; + stopped = false; insertionMode = InsertionMode.INITIAL; originalInsertionMode = InsertionMode.INITIAL; @@ -261,13 +261,7 @@ export class Parser { //Parsing loop private _runParsingLoop(scriptHandler: null | ((scriptElement: T['element']) => void)): void { while (!this.stopped) { - const token = this.tokenizer.getNextToken(); - - this._processToken(token); - - if (token.type === TokenType.START_TAG && token.selfClosing && !token.ackSelfClosing) { - this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); - } + this.tokenizer.getNextToken(); if (!this.tokenizer.active || (scriptHandler !== null && this.pendingScript)) { break; @@ -963,6 +957,13 @@ export class Parser { } else { this._startTagOutsideForeignContent(token); } + + // FIXME: This can currently fire unexpectedly. + if (token.selfClosing && !token.ackSelfClosing) { + this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); + // Prevent this error from being shown again. + token.ackSelfClosing = true; + } } _startTagOutsideForeignContent(token: TagToken): void { switch (this.insertionMode) { From ecb414a4be664a77d1d917db88512b9941ed0722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sun, 27 Feb 2022 09:50:24 +0000 Subject: [PATCH 2/6] Remove now unused classes & interfaces --- packages/parse5/lib/common/token.ts | 6 +- packages/parse5/lib/tokenizer/queued.ts | 77 +------------------------ 2 files changed, 3 insertions(+), 80 deletions(-) diff --git a/packages/parse5/lib/common/token.ts b/packages/parse5/lib/common/token.ts index ee4bd14ba..f9028d2c8 100644 --- a/packages/parse5/lib/common/token.ts +++ b/packages/parse5/lib/common/token.ts @@ -96,13 +96,9 @@ export interface EOFToken extends TokenBase { readonly type: TokenType.EOF; } -interface HibernationToken extends TokenBase { - readonly type: TokenType.HIBERNATION; -} - export interface CharacterToken extends TokenBase { type: TokenType.CHARACTER | TokenType.NULL_CHARACTER | TokenType.WHITESPACE_CHARACTER; chars: string; } -export type Token = DoctypeToken | TagToken | CommentToken | EOFToken | HibernationToken | CharacterToken; +export type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken; diff --git a/packages/parse5/lib/tokenizer/queued.ts b/packages/parse5/lib/tokenizer/queued.ts index 6ac8178a2..8935518aa 100644 --- a/packages/parse5/lib/tokenizer/queued.ts +++ b/packages/parse5/lib/tokenizer/queued.ts @@ -1,9 +1,5 @@ -import { TokenType, Token, CharacterToken, DoctypeToken, TagToken, EOFToken, CommentToken } from '../common/token.js'; -import { TokenHandler, Tokenizer, TokenizerOptions, TokenizerMode } from './index.js'; -import type { ParserErrorHandler } from '../common/error-codes.js'; -import type { Preprocessor } from './preprocessor.js'; - -const HIBERNATION_TOKEN: Token = { type: TokenType.HIBERNATION, location: null }; +import { Token, CharacterToken, DoctypeToken, TagToken, EOFToken, CommentToken } from '../common/token.js'; +import { TokenHandler } from './index.js'; /** A token handler implemnetation that calls the same function for all tokens. */ export abstract class SinglePathHandler implements TokenHandler { @@ -34,72 +30,3 @@ export abstract class SinglePathHandler implements TokenHandler { this.handleToken(token); } } - -class QueuedHandler extends SinglePathHandler { - private tokenQueue: Token[] = []; - - protected handleToken(token: Token): void { - this.tokenQueue.push(token); - } - - constructor(public onParseError: ParserErrorHandler | null) { - super(); - } - - public getNextToken(tokenizer: Tokenizer): Token { - while (this.tokenQueue.length === 0 && tokenizer.active) { - tokenizer.getNextToken(); - } - - if (this.tokenQueue.length === 0 && !tokenizer.active) { - this.tokenQueue.push(HIBERNATION_TOKEN); - } - - return this.tokenQueue.shift()!; - } -} - -export interface QueuedTokenizerOptions extends TokenizerOptions { - onParseError?: ParserErrorHandler | null; -} - -/** - * Provides the same interface as the old tokenizer, while allowing users to - * read data one token at a time. - */ -export class QueuedTokenizer { - private tokenizer: Tokenizer; - private handler: QueuedHandler; - - constructor(options: QueuedTokenizerOptions) { - this.handler = new QueuedHandler(options.onParseError ?? null); - this.tokenizer = new Tokenizer(options, this.handler); - } - - set allowCDATA(val: boolean) { - this.tokenizer.allowCDATA = val; - } - - get preprocessor(): Preprocessor { - return this.tokenizer.preprocessor; - } - get active(): boolean { - return this.tokenizer.active; - } - - set state(val: typeof TokenizerMode[keyof typeof TokenizerMode]) { - this.tokenizer.state = val; - } - - public write(chunk: string, isLastChunk: boolean): void { - this.tokenizer.write(chunk, isLastChunk); - } - - public insertHtmlAtCurrentPos(str: string): void { - this.tokenizer.insertHtmlAtCurrentPos(str); - } - - public getNextToken(): Token { - return this.handler.getNextToken(this.tokenizer); - } -} From 05adea5a8bbd6f063732e20181f8d37f213805e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sun, 27 Feb 2022 09:52:16 +0000 Subject: [PATCH 3/6] Merge `SinglePathHandler` into `LocationInfoHandler` --- packages/parse5/lib/tokenizer/queued.ts | 32 ------------------- .../tokenizer/tokenizer-location-info.test.ts | 31 ++++++++++++++---- 2 files changed, 25 insertions(+), 38 deletions(-) delete mode 100644 packages/parse5/lib/tokenizer/queued.ts diff --git a/packages/parse5/lib/tokenizer/queued.ts b/packages/parse5/lib/tokenizer/queued.ts deleted file mode 100644 index 8935518aa..000000000 --- a/packages/parse5/lib/tokenizer/queued.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { Token, CharacterToken, DoctypeToken, TagToken, EOFToken, CommentToken } from '../common/token.js'; -import { TokenHandler } from './index.js'; - -/** A token handler implemnetation that calls the same function for all tokens. */ -export abstract class SinglePathHandler implements TokenHandler { - protected abstract handleToken(token: Token): void; - - onComment(token: CommentToken): void { - this.handleToken(token); - } - onDoctype(token: DoctypeToken): void { - this.handleToken(token); - } - onStartTag(token: TagToken): void { - this.handleToken(token); - } - onEndTag(token: TagToken): void { - this.handleToken(token); - } - onEof(token: EOFToken): void { - this.handleToken(token); - } - onCharacter(token: CharacterToken): void { - this.handleToken(token); - } - onNullCharacter(token: CharacterToken): void { - this.handleToken(token); - } - onWhitespaceCharacter(token: CharacterToken): void { - this.handleToken(token); - } -} diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts index 6e918d548..044e7e1f4 100644 --- a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts +++ b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts @@ -1,7 +1,6 @@ import * as assert from 'node:assert'; -import { Tokenizer, TokenizerMode } from './index.js'; -import { SinglePathHandler } from './queued.js'; -import { Location, EOFToken, Token } from '../common/token.js'; +import { Tokenizer, TokenizerMode, TokenHandler } from './index.js'; +import { Location, EOFToken, Token, CharacterToken, DoctypeToken, TagToken, CommentToken } from '../common/token.js'; import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js'; interface LocationInfoTestCase { @@ -11,7 +10,7 @@ interface LocationInfoTestCase { } /** Receives events and immediately compares them against the expected values. */ -class LocationInfoHandler extends SinglePathHandler { +class LocationInfoHandler implements TokenHandler { public sawEof = false; /** The index of the last html chunk. */ private idx = 0; @@ -19,7 +18,6 @@ class LocationInfoHandler extends SinglePathHandler { private lines: string[]; constructor(private testCase: LocationInfoTestCase, private html: string) { - super(); this.lines = html.split(/\r?\n/g); } @@ -45,7 +43,28 @@ class LocationInfoHandler extends SinglePathHandler { this.idx += 1; } - override onEof({ location }: EOFToken): void { + onComment(token: CommentToken): void { + this.handleToken(token); + } + onDoctype(token: DoctypeToken): void { + this.handleToken(token); + } + onStartTag(token: TagToken): void { + this.handleToken(token); + } + onEndTag(token: TagToken): void { + this.handleToken(token); + } + onCharacter(token: CharacterToken): void { + this.handleToken(token); + } + onNullCharacter(token: CharacterToken): void { + this.handleToken(token); + } + onWhitespaceCharacter(token: CharacterToken): void { + this.handleToken(token); + } + onEof({ location }: EOFToken): void { assert.ok(location); assert.strictEqual(location.endOffset, location.startOffset); assert.strictEqual(location.endOffset, this.html.length); From 91dea8361ad6f6a70989658eec9749bfa332072c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sun, 27 Feb 2022 16:26:55 +0000 Subject: [PATCH 4/6] Fix false positives for nested start tag calls --- packages/parse5/lib/parser/index.ts | 32 +++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 638f26dce..ab989bfdd 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -595,7 +595,7 @@ export class Parser implements TokenHandler { break; } case TokenType.START_TAG: { - this.onStartTag(token); + this._processStartTag(token); break; } case TokenType.END_TAG: { @@ -952,18 +952,28 @@ export class Parser implements TokenHandler { this.skipNextNewLine = false; this.currentToken = token; + this._processStartTag(token); + + if (token.selfClosing && !token.ackSelfClosing) { + this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); + } + } + /** + * Processes a given start tag. + * + * `onStartTag` checks if a self-closing tag was recognized. When a token + * is moved inbetween multiple insertion modes, this check for self-closing + * could lead to false positives. To avoid this, `_processStartTag` is used + * for nested calls. + * + * @param token The token to process. + */ + _processStartTag(token: TagToken): void { if (this.shouldProcessStartTagTokenInForeignContent(token)) { startTagInForeignContent(this, token); } else { this._startTagOutsideForeignContent(token); } - - // FIXME: This can currently fire unexpectedly. - if (token.selfClosing && !token.ackSelfClosing) { - this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); - // Prevent this error from being shown again. - token.ackSelfClosing = true; - } } _startTagOutsideForeignContent(token: TagToken): void { switch (this.insertionMode) { @@ -2628,7 +2638,7 @@ function tableStartTagInTable(p: Parser, token: if (p.openElements.hasInTableScope($.TABLE)) { p.openElements.popUntilTagNamePopped($.TABLE); p._resetInsertionMode(); - p.onStartTag(token); + p._processStartTag(token); } } @@ -3128,7 +3138,7 @@ function startTagInSelect(p: Parser, token: Tag p._resetInsertionMode(); if (token.tagID !== $.SELECT) { - p.onStartTag(token); + p._processStartTag(token); } } break; @@ -3198,7 +3208,7 @@ function startTagInSelectInTable(p: Parser, tok ) { p.openElements.popUntilTagNamePopped($.SELECT); p._resetInsertionMode(); - p.onStartTag(token); + p._processStartTag(token); } else { startTagInSelect(p, token); } From 3dac52baf2cc1afd1c86d29af2b43ff167f03d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sun, 27 Feb 2022 16:50:35 +0000 Subject: [PATCH 5/6] Update generate parser feedback test script --- .../generate-parser-feedback-test/index.ts | 141 ++++++++++-------- 1 file changed, 82 insertions(+), 59 deletions(-) diff --git a/scripts/generate-parser-feedback-test/index.ts b/scripts/generate-parser-feedback-test/index.ts index c0e3ca815..27cde98e9 100644 --- a/scripts/generate-parser-feedback-test/index.ts +++ b/scripts/generate-parser-feedback-test/index.ts @@ -5,8 +5,8 @@ import * as defaultTreeAdapter from 'parse5/dist/tree-adapters/default.js'; import { HtmlLibToken } from 'parse5-test-utils/utils/generate-tokenization-tests.js'; import { parseDatFile } from 'parse5-test-utils/utils/parse-dat-file.js'; import { addSlashes } from 'parse5-test-utils/utils/common.js'; -import { TokenType, Token } from 'parse5/dist/common/token.js'; -import type { TreeAdapterTypeMap } from 'parse5/dist/tree-adapters/interface.js'; +import { CharacterToken, CommentToken, DoctypeToken, TagToken } from '../../packages/parse5/dist/common/token.js'; +import type { TreeAdapterTypeMap } from '../../packages/parse5/dist/tree-adapters/interface.js'; // eslint-disable-next-line no-console main().catch(console.error); @@ -23,78 +23,101 @@ function main(): Promise { return Promise.all(convertPromises); } -function appendToken(dest: Token[], token: Token): void { - if (token.type === TokenType.EOF) return; - - if (token.type === TokenType.NULL_CHARACTER || token.type === TokenType.WHITESPACE_CHARACTER) { - token.type = TokenType.CHARACTER; - } - - if (token.type === TokenType.CHARACTER) { - const lastToken = dest[dest.length - 1]; - if (lastToken?.type === TokenType.CHARACTER) { - lastToken.chars += token.chars; - return; - } - } - - dest.push(token); -} - -function convertTokenToHtml5Lib(token: Token): HtmlLibToken { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: - return ['Character', token.chars]; - - case TokenType.START_TAG: { - const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value])); - const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs]; - - if (token.selfClosing) { - startTagEntry.push(true); - } +function collectParserTokens(html: string): HtmlLibToken[] { + const tokens: HtmlLibToken[] = []; - return startTagEntry; - } + class ExtendedParser extends Parser { + private isTopLevel = true; + /** + * We only want to add tokens once. We guard against recursive calls + * using the `isTopLevel` flag. + */ + private guardTopLevel(fn: () => void, getToken: () => HtmlLibToken): void { + const { isTopLevel } = this; + this.isTopLevel = false; - case TokenType.END_TAG: - // NOTE: parser feedback simulator can produce adjusted SVG - // tag names for end tag tokens so we need to lower case it - return ['EndTag', token.tagName.toLowerCase()]; + fn(); - case TokenType.COMMENT: - return ['Comment', token.data]; + if (isTopLevel) { + this.isTopLevel = true; - case TokenType.DOCTYPE: - return ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]; + const token = getToken(); - default: - throw new TypeError(`Unrecognized token type: ${token.type}`); - } -} + if (token[0] === 'Character') { + if (token[1] == null || token[1].length === 0) { + return; + } -function collectParserTokens(html: string): HtmlLibToken[] { - const tokens: Token[] = []; + const lastToken = tokens[tokens.length - 1]; - class ExtendedParser extends Parser { - override _processInputToken(token: Token): void { - super._processInputToken(token); + if (lastToken?.[0] === 'Character') { + lastToken[1] += token[1]; + return; + } + } - // NOTE: Needed to split attributes of duplicate and - // which are otherwise merged as per tree constructor spec - if (token.type === TokenType.START_TAG) { - token.attrs = [...token.attrs]; + tokens.push(token); } + } - appendToken(tokens, token); + override onComment(token: CommentToken): void { + this.guardTopLevel( + () => super.onComment(token), + () => ['Comment', token.data] + ); + } + override onDoctype(token: DoctypeToken): void { + this.guardTopLevel( + () => super.onDoctype(token), + () => ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks] + ); + } + override onStartTag(token: TagToken): void { + this.guardTopLevel( + () => super.onStartTag(token), + () => { + const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value])); + const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs]; + + if (token.selfClosing) { + startTagEntry.push(true); + } + + return startTagEntry; + } + ); + } + override onEndTag(token: TagToken): void { + this.guardTopLevel( + () => super.onEndTag(token), + // NOTE: parser feedback simulator can produce adjusted SVG + // tag names for end tag tokens so we need to lower case it + () => ['EndTag', token.tagName.toLowerCase()] + ); + } + override onCharacter(token: CharacterToken): void { + this.guardTopLevel( + () => super.onCharacter(token), + () => ['Character', token.chars] + ); + } + override onNullCharacter(token: CharacterToken): void { + this.guardTopLevel( + () => super.onNullCharacter(token), + () => ['Character', token.chars] + ); + } + override onWhitespaceCharacter(token: CharacterToken): void { + this.guardTopLevel( + () => super.onWhitespaceCharacter(token), + () => ['Character', token.chars] + ); } } ExtendedParser.parse(html); - return tokens.map((token) => convertTokenToHtml5Lib(token)); + return tokens; } function generateParserFeedbackTest(parserTestFile: string): string { From 4c19be2efa0cbc7b2ce8169fb77b3a835e907b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Sun, 27 Feb 2022 17:23:52 +0000 Subject: [PATCH 6/6] Inline small method --- .../tokenizer/tokenizer-location-info.test.ts | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts index 044e7e1f4..eb9ff1856 100644 --- a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts +++ b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts @@ -1,6 +1,6 @@ import * as assert from 'node:assert'; import { Tokenizer, TokenizerMode, TokenHandler } from './index.js'; -import { Location, EOFToken, Token, CharacterToken, DoctypeToken, TagToken, CommentToken } from '../common/token.js'; +import { Location, EOFToken, CharacterToken, DoctypeToken, TagToken, CommentToken } from '../common/token.js'; import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js'; interface LocationInfoTestCase { @@ -21,10 +21,6 @@ class LocationInfoHandler implements TokenHandler { this.lines = html.split(/\r?\n/g); } - protected handleToken(token: Token): void { - this.validateLocation(token.location); - } - private validateLocation(location: Location | null): void { assert.ok(location); @@ -43,26 +39,26 @@ class LocationInfoHandler implements TokenHandler { this.idx += 1; } - onComment(token: CommentToken): void { - this.handleToken(token); + onComment({ location }: CommentToken): void { + this.validateLocation(location); } - onDoctype(token: DoctypeToken): void { - this.handleToken(token); + onDoctype({ location }: DoctypeToken): void { + this.validateLocation(location); } - onStartTag(token: TagToken): void { - this.handleToken(token); + onStartTag({ location }: TagToken): void { + this.validateLocation(location); } - onEndTag(token: TagToken): void { - this.handleToken(token); + onEndTag({ location }: TagToken): void { + this.validateLocation(location); } - onCharacter(token: CharacterToken): void { - this.handleToken(token); + onCharacter({ location }: CharacterToken): void { + this.validateLocation(location); } - onNullCharacter(token: CharacterToken): void { - this.handleToken(token); + onNullCharacter({ location }: CharacterToken): void { + this.validateLocation(location); } - onWhitespaceCharacter(token: CharacterToken): void { - this.handleToken(token); + onWhitespaceCharacter({ location }: CharacterToken): void { + this.validateLocation(location); } onEof({ location }: EOFToken): void { assert.ok(location);