diff --git a/packages/parse5/lib/common/token.ts b/packages/parse5/lib/common/token.ts index ee4bd14ba..f9028d2c8 100644 --- a/packages/parse5/lib/common/token.ts +++ b/packages/parse5/lib/common/token.ts @@ -96,13 +96,9 @@ export interface EOFToken extends TokenBase { readonly type: TokenType.EOF; } -interface HibernationToken extends TokenBase { - readonly type: TokenType.HIBERNATION; -} - export interface CharacterToken extends TokenBase { type: TokenType.CHARACTER | TokenType.NULL_CHARACTER | TokenType.WHITESPACE_CHARACTER; chars: string; } -export type Token = DoctypeToken | TagToken | CommentToken | EOFToken | HibernationToken | CharacterToken; +export type Token = DoctypeToken | TagToken | CommentToken | EOFToken | CharacterToken; diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index de93f2a91..ab989bfdd 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -1,5 +1,4 @@ -import { TokenizerMode } from '../tokenizer/index.js'; -import { QueuedTokenizer } from '../tokenizer/queued.js'; +import { TokenHandler, Tokenizer, TokenizerMode } from '../tokenizer/index.js'; import { OpenElementStack } from './open-element-stack.js'; import { FormattingElementList, ElementEntry, EntryType } from './formatting-element-list.js'; import * as defaultTreeAdapter from '../tree-adapters/default.js'; @@ -120,9 +119,9 @@ const defaultParserOptions = { }; //Parser -export class Parser { +export class Parser implements TokenHandler { treeAdapter: TreeAdapter; - private onParseError: ParserErrorHandler | null; + onParseError: ParserErrorHandler | null; private currentToken: Token | null = null; public options: Required>; public document: T['document']; @@ -147,7 +146,7 @@ export class Parser { this.document = document ?? this.treeAdapter.createDocument(); - this.tokenizer = new QueuedTokenizer(this.options); + this.tokenizer = new Tokenizer(this.options, this); this.activeFormattingElements = new FormattingElementList(this.treeAdapter); this.fragmentContextID = fragmentContext ? getTagID(this.treeAdapter.getTagName(fragmentContext)) : $.UNKNOWN; @@ -211,7 +210,8 @@ export class Parser { return fragment; } - tokenizer: QueuedTokenizer; + tokenizer: Tokenizer; + stopped = false; insertionMode = InsertionMode.INITIAL; originalInsertionMode = InsertionMode.INITIAL; @@ -261,13 +261,7 @@ export class Parser { //Parsing loop private _runParsingLoop(scriptHandler: null | ((scriptElement: T['element']) => void)): void { while (!this.stopped) { - const token = this.tokenizer.getNextToken(); - - this._processToken(token); - - if (token.type === TokenType.START_TAG && token.selfClosing && !token.ackSelfClosing) { - this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); - } + this.tokenizer.getNextToken(); if (!this.tokenizer.active || (scriptHandler !== null && this.pendingScript)) { break; @@ -601,7 +595,7 @@ export class Parser { break; } case TokenType.START_TAG: { - this.onStartTag(token); + this._processStartTag(token); break; } case TokenType.END_TAG: { @@ -958,6 +952,23 @@ export class Parser { this.skipNextNewLine = false; this.currentToken = token; + this._processStartTag(token); + + if (token.selfClosing && !token.ackSelfClosing) { + this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); + } + } + /** + * Processes a given start tag. + * + * `onStartTag` checks if a self-closing tag was recognized. When a token + * is moved inbetween multiple insertion modes, this check for self-closing + * could lead to false positives. To avoid this, `_processStartTag` is used + * for nested calls. + * + * @param token The token to process. + */ + _processStartTag(token: TagToken): void { if (this.shouldProcessStartTagTokenInForeignContent(token)) { startTagInForeignContent(this, token); } else { @@ -2627,7 +2638,7 @@ function tableStartTagInTable(p: Parser, token: if (p.openElements.hasInTableScope($.TABLE)) { p.openElements.popUntilTagNamePopped($.TABLE); p._resetInsertionMode(); - p.onStartTag(token); + p._processStartTag(token); } } @@ -3127,7 +3138,7 @@ function startTagInSelect(p: Parser, token: Tag p._resetInsertionMode(); if (token.tagID !== $.SELECT) { - p.onStartTag(token); + p._processStartTag(token); } } break; @@ -3197,7 +3208,7 @@ function startTagInSelectInTable(p: Parser, tok ) { p.openElements.popUntilTagNamePopped($.SELECT); p._resetInsertionMode(); - p.onStartTag(token); + p._processStartTag(token); } else { startTagInSelect(p, token); } diff --git a/packages/parse5/lib/tokenizer/queued.ts b/packages/parse5/lib/tokenizer/queued.ts deleted file mode 100644 index 6ac8178a2..000000000 --- a/packages/parse5/lib/tokenizer/queued.ts +++ /dev/null @@ -1,105 +0,0 @@ -import { TokenType, Token, CharacterToken, DoctypeToken, TagToken, EOFToken, CommentToken } from '../common/token.js'; -import { TokenHandler, Tokenizer, TokenizerOptions, TokenizerMode } from './index.js'; -import type { ParserErrorHandler } from '../common/error-codes.js'; -import type { Preprocessor } from './preprocessor.js'; - -const HIBERNATION_TOKEN: Token = { type: TokenType.HIBERNATION, location: null }; - -/** A token handler implemnetation that calls the same function for all tokens. */ -export abstract class SinglePathHandler implements TokenHandler { - protected abstract handleToken(token: Token): void; - - onComment(token: CommentToken): void { - this.handleToken(token); - } - onDoctype(token: DoctypeToken): void { - this.handleToken(token); - } - onStartTag(token: TagToken): void { - this.handleToken(token); - } - onEndTag(token: TagToken): void { - this.handleToken(token); - } - onEof(token: EOFToken): void { - this.handleToken(token); - } - onCharacter(token: CharacterToken): void { - this.handleToken(token); - } - onNullCharacter(token: CharacterToken): void { - this.handleToken(token); - } - onWhitespaceCharacter(token: CharacterToken): void { - this.handleToken(token); - } -} - -class QueuedHandler extends SinglePathHandler { - private tokenQueue: Token[] = []; - - protected handleToken(token: Token): void { - this.tokenQueue.push(token); - } - - constructor(public onParseError: ParserErrorHandler | null) { - super(); - } - - public getNextToken(tokenizer: Tokenizer): Token { - while (this.tokenQueue.length === 0 && tokenizer.active) { - tokenizer.getNextToken(); - } - - if (this.tokenQueue.length === 0 && !tokenizer.active) { - this.tokenQueue.push(HIBERNATION_TOKEN); - } - - return this.tokenQueue.shift()!; - } -} - -export interface QueuedTokenizerOptions extends TokenizerOptions { - onParseError?: ParserErrorHandler | null; -} - -/** - * Provides the same interface as the old tokenizer, while allowing users to - * read data one token at a time. - */ -export class QueuedTokenizer { - private tokenizer: Tokenizer; - private handler: QueuedHandler; - - constructor(options: QueuedTokenizerOptions) { - this.handler = new QueuedHandler(options.onParseError ?? null); - this.tokenizer = new Tokenizer(options, this.handler); - } - - set allowCDATA(val: boolean) { - this.tokenizer.allowCDATA = val; - } - - get preprocessor(): Preprocessor { - return this.tokenizer.preprocessor; - } - get active(): boolean { - return this.tokenizer.active; - } - - set state(val: typeof TokenizerMode[keyof typeof TokenizerMode]) { - this.tokenizer.state = val; - } - - public write(chunk: string, isLastChunk: boolean): void { - this.tokenizer.write(chunk, isLastChunk); - } - - public insertHtmlAtCurrentPos(str: string): void { - this.tokenizer.insertHtmlAtCurrentPos(str); - } - - public getNextToken(): Token { - return this.handler.getNextToken(this.tokenizer); - } -} diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts index 6e918d548..eb9ff1856 100644 --- a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts +++ b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts @@ -1,7 +1,6 @@ import * as assert from 'node:assert'; -import { Tokenizer, TokenizerMode } from './index.js'; -import { SinglePathHandler } from './queued.js'; -import { Location, EOFToken, Token } from '../common/token.js'; +import { Tokenizer, TokenizerMode, TokenHandler } from './index.js'; +import { Location, EOFToken, CharacterToken, DoctypeToken, TagToken, CommentToken } from '../common/token.js'; import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js'; interface LocationInfoTestCase { @@ -11,7 +10,7 @@ interface LocationInfoTestCase { } /** Receives events and immediately compares them against the expected values. */ -class LocationInfoHandler extends SinglePathHandler { +class LocationInfoHandler implements TokenHandler { public sawEof = false; /** The index of the last html chunk. */ private idx = 0; @@ -19,14 +18,9 @@ class LocationInfoHandler extends SinglePathHandler { private lines: string[]; constructor(private testCase: LocationInfoTestCase, private html: string) { - super(); this.lines = html.split(/\r?\n/g); } - protected handleToken(token: Token): void { - this.validateLocation(token.location); - } - private validateLocation(location: Location | null): void { assert.ok(location); @@ -45,7 +39,28 @@ class LocationInfoHandler extends SinglePathHandler { this.idx += 1; } - override onEof({ location }: EOFToken): void { + onComment({ location }: CommentToken): void { + this.validateLocation(location); + } + onDoctype({ location }: DoctypeToken): void { + this.validateLocation(location); + } + onStartTag({ location }: TagToken): void { + this.validateLocation(location); + } + onEndTag({ location }: TagToken): void { + this.validateLocation(location); + } + onCharacter({ location }: CharacterToken): void { + this.validateLocation(location); + } + onNullCharacter({ location }: CharacterToken): void { + this.validateLocation(location); + } + onWhitespaceCharacter({ location }: CharacterToken): void { + this.validateLocation(location); + } + onEof({ location }: EOFToken): void { assert.ok(location); assert.strictEqual(location.endOffset, location.startOffset); assert.strictEqual(location.endOffset, this.html.length); diff --git a/scripts/generate-parser-feedback-test/index.ts b/scripts/generate-parser-feedback-test/index.ts index c0e3ca815..27cde98e9 100644 --- a/scripts/generate-parser-feedback-test/index.ts +++ b/scripts/generate-parser-feedback-test/index.ts @@ -5,8 +5,8 @@ import * as defaultTreeAdapter from 'parse5/dist/tree-adapters/default.js'; import { HtmlLibToken } from 'parse5-test-utils/utils/generate-tokenization-tests.js'; import { parseDatFile } from 'parse5-test-utils/utils/parse-dat-file.js'; import { addSlashes } from 'parse5-test-utils/utils/common.js'; -import { TokenType, Token } from 'parse5/dist/common/token.js'; -import type { TreeAdapterTypeMap } from 'parse5/dist/tree-adapters/interface.js'; +import { CharacterToken, CommentToken, DoctypeToken, TagToken } from '../../packages/parse5/dist/common/token.js'; +import type { TreeAdapterTypeMap } from '../../packages/parse5/dist/tree-adapters/interface.js'; // eslint-disable-next-line no-console main().catch(console.error); @@ -23,78 +23,101 @@ function main(): Promise { return Promise.all(convertPromises); } -function appendToken(dest: Token[], token: Token): void { - if (token.type === TokenType.EOF) return; - - if (token.type === TokenType.NULL_CHARACTER || token.type === TokenType.WHITESPACE_CHARACTER) { - token.type = TokenType.CHARACTER; - } - - if (token.type === TokenType.CHARACTER) { - const lastToken = dest[dest.length - 1]; - if (lastToken?.type === TokenType.CHARACTER) { - lastToken.chars += token.chars; - return; - } - } - - dest.push(token); -} - -function convertTokenToHtml5Lib(token: Token): HtmlLibToken { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: - return ['Character', token.chars]; - - case TokenType.START_TAG: { - const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value])); - const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs]; - - if (token.selfClosing) { - startTagEntry.push(true); - } +function collectParserTokens(html: string): HtmlLibToken[] { + const tokens: HtmlLibToken[] = []; - return startTagEntry; - } + class ExtendedParser extends Parser { + private isTopLevel = true; + /** + * We only want to add tokens once. We guard against recursive calls + * using the `isTopLevel` flag. + */ + private guardTopLevel(fn: () => void, getToken: () => HtmlLibToken): void { + const { isTopLevel } = this; + this.isTopLevel = false; - case TokenType.END_TAG: - // NOTE: parser feedback simulator can produce adjusted SVG - // tag names for end tag tokens so we need to lower case it - return ['EndTag', token.tagName.toLowerCase()]; + fn(); - case TokenType.COMMENT: - return ['Comment', token.data]; + if (isTopLevel) { + this.isTopLevel = true; - case TokenType.DOCTYPE: - return ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]; + const token = getToken(); - default: - throw new TypeError(`Unrecognized token type: ${token.type}`); - } -} + if (token[0] === 'Character') { + if (token[1] == null || token[1].length === 0) { + return; + } -function collectParserTokens(html: string): HtmlLibToken[] { - const tokens: Token[] = []; + const lastToken = tokens[tokens.length - 1]; - class ExtendedParser extends Parser { - override _processInputToken(token: Token): void { - super._processInputToken(token); + if (lastToken?.[0] === 'Character') { + lastToken[1] += token[1]; + return; + } + } - // NOTE: Needed to split attributes of duplicate and - // which are otherwise merged as per tree constructor spec - if (token.type === TokenType.START_TAG) { - token.attrs = [...token.attrs]; + tokens.push(token); } + } - appendToken(tokens, token); + override onComment(token: CommentToken): void { + this.guardTopLevel( + () => super.onComment(token), + () => ['Comment', token.data] + ); + } + override onDoctype(token: DoctypeToken): void { + this.guardTopLevel( + () => super.onDoctype(token), + () => ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks] + ); + } + override onStartTag(token: TagToken): void { + this.guardTopLevel( + () => super.onStartTag(token), + () => { + const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value])); + const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs]; + + if (token.selfClosing) { + startTagEntry.push(true); + } + + return startTagEntry; + } + ); + } + override onEndTag(token: TagToken): void { + this.guardTopLevel( + () => super.onEndTag(token), + // NOTE: parser feedback simulator can produce adjusted SVG + // tag names for end tag tokens so we need to lower case it + () => ['EndTag', token.tagName.toLowerCase()] + ); + } + override onCharacter(token: CharacterToken): void { + this.guardTopLevel( + () => super.onCharacter(token), + () => ['Character', token.chars] + ); + } + override onNullCharacter(token: CharacterToken): void { + this.guardTopLevel( + () => super.onNullCharacter(token), + () => ['Character', token.chars] + ); + } + override onWhitespaceCharacter(token: CharacterToken): void { + this.guardTopLevel( + () => super.onWhitespaceCharacter(token), + () => ['Character', token.chars] + ); } } ExtendedParser.parse(html); - return tokens.map((token) => convertTokenToHtml5Lib(token)); + return tokens; } function generateParserFeedbackTest(parserTestFile: string): string {