From 8daef5ad40cd6832a72be8b31516cb2ceb6b01ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 26 Jan 2022 17:14:40 -0800 Subject: [PATCH 01/12] refactor(parser): Introduce token callbacks --- packages/parse5/lib/parser/index.ts | 1396 +++++++++++---------------- 1 file changed, 546 insertions(+), 850 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 3ddb4905b..44101606e 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -267,30 +267,9 @@ export class Parser { while (!this.stopped) { const token = this.tokenizer.getNextToken(); - if (token.type === TokenType.HIBERNATION) { - break; - } - - if (this.skipNextNewLine) { - this.skipNextNewLine = false; - - if ( - token.type === TokenType.WHITESPACE_CHARACTER && - token.chars.charCodeAt(0) === unicode.CODE_POINTS.LINE_FEED - ) { - if (token.chars.length === 1) { - continue; - } - - token.chars = token.chars.substr(1); - } - } - - this.currentToken = token; - - this._processInputToken(token); + this._processToken(token); - if (scriptHandler !== null && this.pendingScript) { + if (token.type === TokenType.HIBERNATION || (scriptHandler !== null && this.pendingScript)) { break; } } @@ -567,7 +546,7 @@ export class Parser { } //Token processing - private _shouldProcessTokenInForeignContent(token: Token): boolean { + private shouldProcessStartTagTokenInForeignContent(token: TagToken): boolean { let current: T['parentNode']; let currentTagId: number; @@ -578,180 +557,69 @@ export class Parser { ({ current, currentTagId } = this.openElements); } - const ns = this.treeAdapter.getNamespaceURI(current); - //NOTE: We won't get here with current === document, or ns === NS.HTML if ( - token.type === TokenType.START_TAG && token.tagID === $.SVG && this.treeAdapter.getTagName(current) === TN.ANNOTATION_XML && - ns === NS.MATHML + this.treeAdapter.getNamespaceURI(current) === NS.MATHML ) { return false; } - const isCharacterToken = - token.type === TokenType.CHARACTER || - token.type === TokenType.NULL_CHARACTER || - token.type === TokenType.WHITESPACE_CHARACTER; - - const isMathMLTextStartTag = - token.type === TokenType.START_TAG && token.tagID !== $.MGLYPH && token.tagID !== $.MALIGNMARK; + return token.tagID === $.MGLYPH || token.tagID === $.MALIGNMARK + ? !this._isIntegrationPoint(currentTagId, current, NS.HTML) + : !this._isIntegrationPoint(currentTagId, current); + } - if ((isMathMLTextStartTag || isCharacterToken) && this._isIntegrationPoint(currentTagId, current, NS.MATHML)) { - return false; - } + private shouldProcessTextInForeignContent(): boolean { + let current: T['parentNode']; + let currentTagId: number; - if ( - (token.type === TokenType.START_TAG || isCharacterToken) && - this._isIntegrationPoint(currentTagId, current, NS.HTML) - ) { - return false; + if (this.openElements.stackTop === 0 && this.fragmentContext) { + current = this.fragmentContext; + currentTagId = this.fragmentContextID; + } else { + ({ current, currentTagId } = this.openElements); } - return token.type !== TokenType.EOF; + return !this._isIntegrationPoint(currentTagId, current); } _processToken(token: Token): void { - switch (this.insertionMode) { - case InsertionMode.INITIAL: { - modeInitial(this, token); - break; - } - case InsertionMode.BEFORE_HTML: { - modeBeforeHtml(this, token); - break; - } - case InsertionMode.BEFORE_HEAD: { - modeBeforeHead(this, token); - break; - } - case InsertionMode.IN_HEAD: { - modeInHead(this, token); - break; - } - case InsertionMode.IN_HEAD_NO_SCRIPT: { - modeInHeadNoScript(this, token); - break; - } - case InsertionMode.AFTER_HEAD: { - modeAfterHead(this, token); - break; - } - case InsertionMode.IN_BODY: { - modeInBody(this, token); - break; - } - case InsertionMode.TEXT: { - modeText(this, token); - break; - } - case InsertionMode.IN_TABLE: { - modeInTable(this, token); - break; - } - case InsertionMode.IN_TABLE_TEXT: { - modeInTableText(this, token); - break; - } - case InsertionMode.IN_CAPTION: { - modeInCaption(this, token); - break; - } - case InsertionMode.IN_COLUMN_GROUP: { - modeInColumnGroup(this, token); - break; - } - case InsertionMode.IN_TABLE_BODY: { - modeInTableBody(this, token); - break; - } - case InsertionMode.IN_ROW: { - modeInRow(this, token); - break; - } - case InsertionMode.IN_CELL: { - modeInCell(this, token); - break; - } - case InsertionMode.IN_SELECT: { - modeInSelect(this, token); - break; - } - case InsertionMode.IN_SELECT_IN_TABLE: { - modeInSelectInTable(this, token); - break; - } - case InsertionMode.IN_TEMPLATE: { - modeInTemplate(this, token); - break; - } - case InsertionMode.AFTER_BODY: { - modeAfterBody(this, token); - break; - } - case InsertionMode.IN_FRAMESET: { - modeInFrameset(this, token); - break; - } - case InsertionMode.AFTER_FRAMESET: { - modeAfterFrameset(this, token); - break; - } - case InsertionMode.AFTER_AFTER_BODY: { - modeAfterAfterBody(this, token); - break; - } - case InsertionMode.AFTER_AFTER_FRAMESET: { - modeAfterAfterFrameset(this, token); - break; - } - default: - // Do nothing - } - } - - _processTokenInForeignContent(token: Token): void { switch (token.type) { case TokenType.CHARACTER: { - characterInForeignContent(this, token); + this.onCharacterToken(token); break; } case TokenType.NULL_CHARACTER: { - nullCharacterInForeignContent(this, token); + this.onNullCharacterToken(token); break; } - case TokenType.WHITESPACE_CHARACTER: { - this._insertCharacters(token); + case TokenType.COMMENT: { + this.onCommentToken(token); break; } - case TokenType.COMMENT: { - appendComment(this, token); + case TokenType.DOCTYPE: { + this.onDoctypeToken(token); break; } case TokenType.START_TAG: { - startTagInForeignContent(this, token); + this.onStartTagToken(token); break; } case TokenType.END_TAG: { - endTagInForeignContent(this, token); + this.onEndTagToken(token); + break; + } + case TokenType.EOF: { + this.onEofToken(token); + break; + } + case TokenType.WHITESPACE_CHARACTER: { + this.onWhitespaceCharacterToken(token); break; } - default: - // Do nothing - } - } - - _processInputToken(token: Token): void { - if (this._considerForeignContent && this._shouldProcessTokenInForeignContent(token)) { - this._processTokenInForeignContent(token); - } else { - this._processToken(token); - } - - if (token.type === TokenType.START_TAG && token.selfClosing && !token.ackSelfClosing) { - this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); } } @@ -920,12 +788,449 @@ export class Parser { return SPECIAL_ELEMENTS[ns].has(id); } + + onCharacterToken(token: CharacterToken): void { + this.skipNextNewLine = false; + if (this._considerForeignContent && this.shouldProcessTextInForeignContent()) { + characterInForeignContent(this, token); + return; + } + + switch (this.insertionMode) { + case InsertionMode.INITIAL: + tokenInInitialMode(this, token); + break; + case InsertionMode.BEFORE_HTML: + tokenBeforeHtml(this, token); + break; + case InsertionMode.BEFORE_HEAD: + tokenBeforeHead(this, token); + break; + case InsertionMode.IN_HEAD: + tokenInHead(this, token); + break; + case InsertionMode.IN_HEAD_NO_SCRIPT: + tokenInHeadNoScript(this, token); + break; + case InsertionMode.AFTER_HEAD: + tokenAfterHead(this, token); + break; + case InsertionMode.IN_BODY: + case InsertionMode.IN_CAPTION: + case InsertionMode.IN_CELL: + case InsertionMode.IN_TEMPLATE: + characterInBody(this, token); + break; + case InsertionMode.TEXT: + case InsertionMode.IN_SELECT: + case InsertionMode.IN_SELECT_IN_TABLE: + this._insertCharacters(token); + break; + case InsertionMode.IN_TABLE: + case InsertionMode.IN_TABLE_BODY: + case InsertionMode.IN_ROW: + characterInTable(this, token); + break; + case InsertionMode.IN_TABLE_TEXT: + characterInTableText(this, token); + break; + case InsertionMode.IN_COLUMN_GROUP: + tokenInColumnGroup(this, token); + break; + case InsertionMode.AFTER_BODY: + tokenAfterBody(this, token); + break; + case InsertionMode.AFTER_AFTER_BODY: + tokenAfterAfterBody(this, token); + break; + default: + // Do nothing + } + } + onNullCharacterToken(token: CharacterToken): void { + this.skipNextNewLine = false; + if (this._considerForeignContent && this.shouldProcessTextInForeignContent()) { + nullCharacterInForeignContent(this, token); + return; + } + + switch (this.insertionMode) { + case InsertionMode.INITIAL: + tokenInInitialMode(this, token); + break; + case InsertionMode.BEFORE_HTML: + tokenBeforeHtml(this, token); + break; + case InsertionMode.BEFORE_HEAD: + tokenBeforeHead(this, token); + break; + case InsertionMode.IN_HEAD: + tokenInHead(this, token); + break; + case InsertionMode.IN_HEAD_NO_SCRIPT: + tokenInHeadNoScript(this, token); + break; + case InsertionMode.AFTER_HEAD: + tokenAfterHead(this, token); + break; + case InsertionMode.TEXT: + this._insertCharacters(token); + break; + case InsertionMode.IN_TABLE: + case InsertionMode.IN_TABLE_BODY: + case InsertionMode.IN_ROW: + characterInTable(this, token); + break; + case InsertionMode.IN_COLUMN_GROUP: + tokenInColumnGroup(this, token); + break; + case InsertionMode.AFTER_BODY: + tokenAfterBody(this, token); + break; + case InsertionMode.AFTER_AFTER_BODY: + tokenAfterAfterBody(this, token); + break; + default: + // Do nothing + } + } + onCommentToken(token: CommentToken): void { + this.skipNextNewLine = false; + if (this._considerForeignContent) { + appendComment(this, token); + return; + } + + switch (this.insertionMode) { + case InsertionMode.INITIAL: + case InsertionMode.BEFORE_HTML: + case InsertionMode.BEFORE_HEAD: + case InsertionMode.IN_HEAD: + case InsertionMode.IN_HEAD_NO_SCRIPT: + case InsertionMode.AFTER_HEAD: + case InsertionMode.IN_BODY: + case InsertionMode.IN_TABLE: + case InsertionMode.IN_CAPTION: + case InsertionMode.IN_COLUMN_GROUP: + case InsertionMode.IN_TABLE_BODY: + case InsertionMode.IN_ROW: + case InsertionMode.IN_CELL: + case InsertionMode.IN_SELECT: + case InsertionMode.IN_SELECT_IN_TABLE: + case InsertionMode.IN_TEMPLATE: + case InsertionMode.IN_FRAMESET: + case InsertionMode.AFTER_FRAMESET: + appendComment(this, token); + break; + case InsertionMode.IN_TABLE_TEXT: + tokenInTableText(this, token); + break; + case InsertionMode.AFTER_BODY: + appendCommentToRootHtmlElement(this, token); + break; + case InsertionMode.AFTER_AFTER_BODY: + case InsertionMode.AFTER_AFTER_FRAMESET: + appendCommentToDocument(this, token); + break; + default: + // Do nothing + } + } + onDoctypeToken(token: DoctypeToken): void { + this.skipNextNewLine = false; + switch (this.insertionMode) { + case InsertionMode.INITIAL: + doctypeInInitialMode(this, token); + break; + case InsertionMode.BEFORE_HEAD: + case InsertionMode.IN_HEAD: + case InsertionMode.IN_HEAD_NO_SCRIPT: + case InsertionMode.AFTER_HEAD: + this._err(token, ERR.misplacedDoctype); + break; + case InsertionMode.IN_TABLE_TEXT: + tokenInTableText(this, token); + break; + default: + // Do nothing + } + } + onStartTagToken(token: TagToken): void { + this.skipNextNewLine = false; + this.currentToken = token; + + if (this._considerForeignContent && this.shouldProcessStartTagTokenInForeignContent(token)) { + startTagInForeignContent(this, token); + } else { + this._startTagOutsideForeignContent(token); + } + + if (token.type === TokenType.START_TAG && token.selfClosing && !token.ackSelfClosing) { + this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); + } + } + _startTagOutsideForeignContent(token: TagToken): void { + switch (this.insertionMode) { + case InsertionMode.INITIAL: + tokenInInitialMode(this, token); + break; + case InsertionMode.BEFORE_HTML: + startTagBeforeHtml(this, token); + break; + case InsertionMode.BEFORE_HEAD: + startTagBeforeHead(this, token); + break; + case InsertionMode.IN_HEAD: + startTagInHead(this, token); + break; + case InsertionMode.IN_HEAD_NO_SCRIPT: + startTagInHeadNoScript(this, token); + break; + case InsertionMode.AFTER_HEAD: + startTagAfterHead(this, token); + break; + case InsertionMode.IN_BODY: + startTagInBody(this, token); + break; + case InsertionMode.IN_TABLE: + startTagInTable(this, token); + break; + case InsertionMode.IN_TABLE_TEXT: + tokenInTableText(this, token); + break; + case InsertionMode.IN_CAPTION: + startTagInCaption(this, token); + break; + case InsertionMode.IN_COLUMN_GROUP: + startTagInColumnGroup(this, token); + break; + case InsertionMode.IN_TABLE_BODY: + startTagInTableBody(this, token); + break; + case InsertionMode.IN_ROW: + startTagInRow(this, token); + break; + case InsertionMode.IN_CELL: + startTagInCell(this, token); + break; + case InsertionMode.IN_SELECT: + startTagInSelect(this, token); + break; + case InsertionMode.IN_SELECT_IN_TABLE: + startTagInSelectInTable(this, token); + break; + case InsertionMode.IN_TEMPLATE: + startTagInTemplate(this, token); + break; + case InsertionMode.AFTER_BODY: + startTagAfterBody(this, token); + break; + case InsertionMode.IN_FRAMESET: + startTagInFrameset(this, token); + break; + case InsertionMode.AFTER_FRAMESET: + startTagAfterFrameset(this, token); + break; + case InsertionMode.AFTER_AFTER_BODY: + startTagAfterAfterBody(this, token); + break; + case InsertionMode.AFTER_AFTER_FRAMESET: + startTagAfterAfterFrameset(this, token); + break; + default: + // Do nothing + } + } + onEndTagToken(token: TagToken): void { + this.skipNextNewLine = false; + this.currentToken = token; + + if (this._considerForeignContent) { + endTagInForeignContent(this, token); + } else { + this._endTagOutsideForeignContent(token); + } + } + _endTagOutsideForeignContent(token: TagToken): void { + switch (this.insertionMode) { + case InsertionMode.INITIAL: + tokenInInitialMode(this, token); + break; + case InsertionMode.BEFORE_HTML: + endTagBeforeHtml(this, token); + break; + case InsertionMode.BEFORE_HEAD: + endTagBeforeHead(this, token); + break; + case InsertionMode.IN_HEAD: + endTagInHead(this, token); + break; + case InsertionMode.IN_HEAD_NO_SCRIPT: + endTagInHeadNoScript(this, token); + break; + case InsertionMode.AFTER_HEAD: + endTagAfterHead(this, token); + break; + case InsertionMode.IN_BODY: + endTagInBody(this, token); + break; + case InsertionMode.TEXT: + endTagInText(this, token); + break; + case InsertionMode.IN_TABLE: + endTagInTable(this, token); + break; + case InsertionMode.IN_TABLE_TEXT: + tokenInTableText(this, token); + break; + case InsertionMode.IN_CAPTION: + endTagInCaption(this, token); + break; + case InsertionMode.IN_COLUMN_GROUP: + endTagInColumnGroup(this, token); + break; + case InsertionMode.IN_TABLE_BODY: + endTagInTableBody(this, token); + break; + case InsertionMode.IN_ROW: + endTagInRow(this, token); + break; + case InsertionMode.IN_CELL: + endTagInCell(this, token); + break; + case InsertionMode.IN_SELECT: + endTagInSelect(this, token); + break; + case InsertionMode.IN_SELECT_IN_TABLE: + endTagInSelectInTable(this, token); + break; + case InsertionMode.IN_TEMPLATE: + endTagInTemplate(this, token); + break; + case InsertionMode.AFTER_BODY: + endTagAfterBody(this, token); + break; + case InsertionMode.IN_FRAMESET: + endTagInFrameset(this, token); + break; + case InsertionMode.AFTER_FRAMESET: + endTagAfterFrameset(this, token); + break; + case InsertionMode.AFTER_AFTER_BODY: + tokenAfterAfterBody(this, token); + break; + default: + // Do nothing + } + } + onEofToken(token: EOFToken): void { + this.skipNextNewLine = false; + switch (this.insertionMode) { + case InsertionMode.INITIAL: + tokenInInitialMode(this, token); + break; + case InsertionMode.BEFORE_HTML: + tokenBeforeHtml(this, token); + break; + case InsertionMode.BEFORE_HEAD: + tokenBeforeHead(this, token); + break; + case InsertionMode.IN_HEAD: + tokenInHead(this, token); + break; + case InsertionMode.IN_HEAD_NO_SCRIPT: + tokenInHeadNoScript(this, token); + break; + case InsertionMode.AFTER_HEAD: + tokenAfterHead(this, token); + break; + case InsertionMode.IN_BODY: + case InsertionMode.IN_TABLE: + case InsertionMode.IN_CAPTION: + case InsertionMode.IN_COLUMN_GROUP: + case InsertionMode.IN_TABLE_BODY: + case InsertionMode.IN_ROW: + case InsertionMode.IN_CELL: + case InsertionMode.IN_SELECT: + case InsertionMode.IN_SELECT_IN_TABLE: + eofInBody(this, token); + break; + case InsertionMode.TEXT: + eofInText(this, token); + break; + case InsertionMode.IN_TABLE_TEXT: + tokenInTableText(this, token); + break; + case InsertionMode.IN_TEMPLATE: + eofInTemplate(this, token); + break; + case InsertionMode.AFTER_BODY: + case InsertionMode.IN_FRAMESET: + case InsertionMode.AFTER_FRAMESET: + case InsertionMode.AFTER_AFTER_BODY: + case InsertionMode.AFTER_AFTER_FRAMESET: + stopParsing(this, token); + break; + default: + // Do nothing + } + } + onWhitespaceCharacterToken(token: CharacterToken): void { + if (this.skipNextNewLine) { + this.skipNextNewLine = false; + + if (token.chars.charCodeAt(0) === unicode.CODE_POINTS.LINE_FEED) { + if (token.chars.length === 1) { + return; + } + + token.chars = token.chars.substr(1); + } + } + + if (this._considerForeignContent && this.shouldProcessTextInForeignContent()) { + this._insertCharacters(token); + return; + } + + switch (this.insertionMode) { + case InsertionMode.IN_HEAD: + case InsertionMode.IN_HEAD_NO_SCRIPT: + case InsertionMode.AFTER_HEAD: + case InsertionMode.TEXT: + case InsertionMode.IN_COLUMN_GROUP: + case InsertionMode.IN_SELECT: + case InsertionMode.IN_SELECT_IN_TABLE: + case InsertionMode.IN_FRAMESET: + case InsertionMode.AFTER_FRAMESET: + this._insertCharacters(token); + break; + case InsertionMode.IN_BODY: + case InsertionMode.IN_CAPTION: + case InsertionMode.IN_CELL: + case InsertionMode.IN_TEMPLATE: + case InsertionMode.AFTER_BODY: + case InsertionMode.AFTER_AFTER_BODY: + case InsertionMode.AFTER_AFTER_FRAMESET: + whitespaceCharacterInBody(this, token); + break; + case InsertionMode.IN_TABLE: + case InsertionMode.IN_TABLE_BODY: + case InsertionMode.IN_ROW: + characterInTable(this, token); + break; + case InsertionMode.IN_TABLE_TEXT: + whitespaceCharacterInTableText(this, token); + break; + default: + // Do nothing + } + } } //Adoption agency algorithm //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adoptionAgency) //------------------------------------------------------------------ - //Steps 5-8 of the algorithm function aaObtainFormattingElementEntry( p: Parser, @@ -1124,26 +1429,6 @@ function stopParsing(p: Parser, token: EOFToken // The "initial" insertion mode //------------------------------------------------------------------ -function modeInitial(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.DOCTYPE: { - doctypeInInitialMode(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - // Ignore token - break; - } - default: { - tokenInInitialMode(p, token); - } - } -} - function doctypeInInitialMode(p: Parser, token: DoctypeToken): void { p._setDocumentType(token); @@ -1162,90 +1447,36 @@ function tokenInInitialMode(p: Parser, token: T p._err(token, ERR.missingDoctype, true); p.treeAdapter.setDocumentMode(p.document, DOCUMENT_MODE.QUIRKS); p.insertionMode = InsertionMode.BEFORE_HTML; - modeBeforeHtml(p, token); + p._processToken(token); } // The "before html" insertion mode //------------------------------------------------------------------ -function modeBeforeHtml(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.EOF: { - tokenBeforeHtml(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagBeforeHtml(p, token); - break; - } - case TokenType.END_TAG: { - endTagBeforeHtml(p, token); - break; - } - default: - // Do nothing - } -} - function startTagBeforeHtml(p: Parser, token: TagToken): void { - if (token.tagID === $.HTML) { - p._insertElement(token, NS.HTML); - p.insertionMode = InsertionMode.BEFORE_HEAD; - } else { - tokenBeforeHtml(p, token); - } -} - -function endTagBeforeHtml(p: Parser, token: TagToken): void { - const tn = token.tagID; - - if (tn === $.HTML || tn === $.HEAD || tn === $.BODY || tn === $.BR) { - tokenBeforeHtml(p, token); - } -} - -function tokenBeforeHtml(p: Parser, token: Token): void { - p._insertFakeRootElement(); - p.insertionMode = InsertionMode.BEFORE_HEAD; - modeBeforeHead(p, token); -} - -// The "before head" insertion mode -//------------------------------------------------------------------ -function modeBeforeHead(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.EOF: { - tokenBeforeHead(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.DOCTYPE: { - p._err(token, ERR.misplacedDoctype); - break; - } - case TokenType.START_TAG: { - startTagBeforeHead(p, token); - break; - } - case TokenType.END_TAG: { - endTagBeforeHead(p, token); - break; - } - default: - // Do nothing + if (token.tagID === $.HTML) { + p._insertElement(token, NS.HTML); + p.insertionMode = InsertionMode.BEFORE_HEAD; + } else { + tokenBeforeHtml(p, token); + } +} + +function endTagBeforeHtml(p: Parser, token: TagToken): void { + const tn = token.tagID; + + if (tn === $.HTML || tn === $.HEAD || tn === $.BODY || tn === $.BR) { + tokenBeforeHtml(p, token); } } +function tokenBeforeHtml(p: Parser, token: Token): void { + p._insertFakeRootElement(); + p.insertionMode = InsertionMode.BEFORE_HEAD; + p._processToken(token); +} + +// The "before head" insertion mode +//------------------------------------------------------------------ function startTagBeforeHead(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -1278,44 +1509,11 @@ function tokenBeforeHead(p: Parser, token: Toke p._insertFakeElement(TN.HEAD, $.HEAD); p.headElement = p.openElements.current; p.insertionMode = InsertionMode.IN_HEAD; - modeInHead(p, token); + p._processToken(token); } // The "in head" insertion mode //------------------------------------------------------------------ -function modeInHead(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.EOF: { - tokenInHead(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.DOCTYPE: { - p._err(token, ERR.misplacedDoctype); - break; - } - case TokenType.START_TAG: { - startTagInHead(p, token); - break; - } - case TokenType.END_TAG: { - endTagInHead(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInHead(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -1410,44 +1608,11 @@ function endTagInHead(p: Parser, token: TagToke function tokenInHead(p: Parser, token: Token): void { p.openElements.pop(); p.insertionMode = InsertionMode.AFTER_HEAD; - modeAfterHead(p, token); + tokenAfterHead(p, token); } // The "in head no script" insertion mode //------------------------------------------------------------------ -function modeInHeadNoScript(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.EOF: { - tokenInHeadNoScript(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.DOCTYPE: { - p._err(token, ERR.misplacedDoctype); - break; - } - case TokenType.START_TAG: { - startTagInHeadNoScript(p, token); - break; - } - case TokenType.END_TAG: { - endTagInHeadNoScript(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInHeadNoScript(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -1497,44 +1662,11 @@ function tokenInHeadNoScript(p: Parser, token: p._err(token, errCode); p.openElements.pop(); p.insertionMode = InsertionMode.IN_HEAD; - modeInHead(p, token); + p._processToken(token); } // The "after head" insertion mode //------------------------------------------------------------------ -function modeAfterHead(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.EOF: { - tokenAfterHead(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.DOCTYPE: { - p._err(token, ERR.misplacedDoctype); - break; - } - case TokenType.START_TAG: { - startTagAfterHead(p, token); - break; - } - case TokenType.END_TAG: { - endTagAfterHead(p, token); - break; - } - default: - // Do nothing - } -} - function startTagAfterHead(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -1599,7 +1731,7 @@ function endTagAfterHead(p: Parser, token: TagT function tokenAfterHead(p: Parser, token: Token): void { p._insertFakeElement(TN.BODY, $.BODY); p.insertionMode = InsertionMode.IN_BODY; - modeInBody(p, token); + p._processToken(token); } // The "in body" insertion mode @@ -2210,7 +2342,7 @@ function bodyEndTagInBody(p: Parser, token: Tag function htmlEndTagInBody(p: Parser, token: TagToken): void { if (p.openElements.hasInScope($.BODY)) { p.insertionMode = InsertionMode.AFTER_BODY; - modeAfterBody(p, token); + endTagAfterBody(p, token); } } @@ -2422,27 +2554,6 @@ function eofInBody(p: Parser, token: EOFToken): // The "text" insertion mode //------------------------------------------------------------------ -function modeText(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.END_TAG: { - endTagInText(p, token); - break; - } - case TokenType.EOF: { - eofInText(p, token); - break; - } - default: - // Do nothing - } -} - function endTagInText(p: Parser, token: TagToken): void { if (token.tagID === $.SCRIPT) { p.pendingScript = p.openElements.current; @@ -2456,47 +2567,29 @@ function eofInText(p: Parser, token: EOFToken): p._err(token, ERR.eofInElementThatCanContainOnlyText); p.openElements.pop(); p.insertionMode = p.originalInsertionMode; - p._processToken(token); + p.onEofToken(token); } // The "in table" insertion mode //------------------------------------------------------------------ -function modeInTable(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: { - characterInTable(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInTable(p, token); - break; - } - case TokenType.END_TAG: { - endTagInTable(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; - } - default: - // Do nothing - } -} - function characterInTable(p: Parser, token: CharacterToken): void { if (TABLE_STRUCTURE_TAGS.has(p.openElements.currentTagId)) { p.pendingCharacterTokens = []; p.hasNonWhitespacePendingCharacterToken = false; p.originalInsertionMode = p.insertionMode; p.insertionMode = InsertionMode.IN_TABLE_TEXT; - modeInTableText(p, token); + + switch (token.type) { + case TokenType.CHARACTER: { + characterInTableText(p, token); + break; + } + case TokenType.WHITESPACE_CHARACTER: { + whitespaceCharacterInTableText(p, token); + break; + } + // Ignore null + } } else { tokenInTable(p, token); } @@ -2519,7 +2612,7 @@ function colStartTagInTable(p: Parser, token: T p.openElements.clearBackToTableContext(); p._insertFakeElement(TN.COLGROUP, $.COLGROUP); p.insertionMode = InsertionMode.IN_COLUMN_GROUP; - modeInColumnGroup(p, token); + startTagInColumnGroup(p, token); } function tbodyStartTagInTable(p: Parser, token: TagToken): void { @@ -2532,14 +2625,14 @@ function tdStartTagInTable(p: Parser, token: Ta p.openElements.clearBackToTableContext(); p._insertFakeElement(TN.TBODY, $.TBODY); p.insertionMode = InsertionMode.IN_TABLE_BODY; - modeInTableBody(p, token); + startTagInTableBody(p, token); } function tableStartTagInTable(p: Parser, token: TagToken): void { if (p.openElements.hasInTableScope($.TABLE)) { p.openElements.popUntilTagNamePopped($.TABLE); p._resetInsertionMode(); - p._processToken(token); + p.onStartTagToken(token); } } @@ -2655,26 +2748,6 @@ function tokenInTable(p: Parser, token: Token): // The "in table text" insertion mode //------------------------------------------------------------------ -function modeInTableText(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: { - characterInTableText(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInTableText(p, token); - break; - } - case TokenType.NULL_CHARACTER: { - // Ignore token - break; - } - default: { - tokenInTableText(p, token); - } - } -} - function whitespaceCharacterInTableText(p: Parser, token: CharacterToken): void { p.pendingCharacterTokens.push(token); } @@ -2703,37 +2776,6 @@ function tokenInTableText(p: Parser, token: Tok // The "in caption" insertion mode //------------------------------------------------------------------ -function modeInCaption(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: { - characterInBody(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInCaption(p, token); - break; - } - case TokenType.END_TAG: { - endTagInCaption(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; - } - default: - // Do nothing - } -} - const TABLE_VOID_ELEMENTS = new Set([$.CAPTION, $.COL, $.COLGROUP, $.TBODY, $.TD, $.TFOOT, $.TH, $.THEAD, $.TR]); function startTagInCaption(p: Parser, token: TagToken): void { @@ -2745,7 +2787,7 @@ function startTagInCaption(p: Parser, token: Ta p.openElements.popUntilTagNamePopped($.CAPTION); p.activeFormattingElements.clearToLastMarker(); p.insertionMode = InsertionMode.IN_TABLE; - modeInTable(p, token); + startTagInTable(p, token); } } else { startTagInBody(p, token); @@ -2765,7 +2807,7 @@ function endTagInCaption(p: Parser, token: TagT p.insertionMode = InsertionMode.IN_TABLE; if (tn === $.TABLE) { - modeInTable(p, token); + endTagInTable(p, token); } } break; @@ -2791,38 +2833,6 @@ function endTagInCaption(p: Parser, token: TagT // The "in column group" insertion mode //------------------------------------------------------------------ -function modeInColumnGroup(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: { - tokenInColumnGroup(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInColumnGroup(p, token); - break; - } - case TokenType.END_TAG: { - endTagInColumnGroup(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInColumnGroup(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -2862,50 +2872,21 @@ function endTagInColumnGroup(p: Parser, token: break; } default: { - tokenInColumnGroup(p, token); - } - } -} - -function tokenInColumnGroup(p: Parser, token: Token): void { - if (p.openElements.currentTagId === $.COLGROUP) { - p.openElements.pop(); - p.insertionMode = InsertionMode.IN_TABLE; - modeInTable(p, token); - } -} - -// The "in table body" insertion mode -//------------------------------------------------------------------ -function modeInTableBody(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: { - characterInTable(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInTableBody(p, token); - break; - } - case TokenType.END_TAG: { - endTagInTableBody(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; + tokenInColumnGroup(p, token); } - default: - // Do nothing } } +function tokenInColumnGroup(p: Parser, token: Token): void { + if (p.openElements.currentTagId === $.COLGROUP) { + p.openElements.pop(); + p.insertionMode = InsertionMode.IN_TABLE; + p._processToken(token); + } +} + +// The "in table body" insertion mode +//------------------------------------------------------------------ function startTagInTableBody(p: Parser, token: TagToken): void { switch (token.tagID) { case $.TR: { @@ -2919,7 +2900,7 @@ function startTagInTableBody(p: Parser, token: p.openElements.clearBackToTableBodyContext(); p._insertFakeElement(TN.TR, $.TR); p.insertionMode = InsertionMode.IN_ROW; - modeInRow(p, token); + startTagInRow(p, token); break; } case $.CAPTION: @@ -2932,7 +2913,7 @@ function startTagInTableBody(p: Parser, token: p.openElements.clearBackToTableBodyContext(); p.openElements.pop(); p.insertionMode = InsertionMode.IN_TABLE; - modeInTable(p, token); + startTagInTable(p, token); } break; } @@ -2961,7 +2942,7 @@ function endTagInTableBody(p: Parser, token: Ta p.openElements.clearBackToTableBodyContext(); p.openElements.pop(); p.insertionMode = InsertionMode.IN_TABLE; - modeInTable(p, token); + endTagInTable(p, token); } break; } @@ -2984,35 +2965,6 @@ function endTagInTableBody(p: Parser, token: Ta // The "in row" insertion mode //------------------------------------------------------------------ -function modeInRow(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: { - characterInTable(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInRow(p, token); - break; - } - case TokenType.END_TAG: { - endTagInRow(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInRow(p: Parser, token: TagToken): void { switch (token.tagID) { case $.TH: @@ -3034,7 +2986,7 @@ function startTagInRow(p: Parser, token: TagTok p.openElements.clearBackToTableRowContext(); p.openElements.pop(); p.insertionMode = InsertionMode.IN_TABLE_BODY; - modeInTableBody(p, token); + startTagInTableBody(p, token); } break; } @@ -3059,7 +3011,7 @@ function endTagInRow(p: Parser, token: TagToken p.openElements.clearBackToTableRowContext(); p.openElements.pop(); p.insertionMode = InsertionMode.IN_TABLE_BODY; - modeInTableBody(p, token); + endTagInTableBody(p, token); } break; } @@ -3070,7 +3022,7 @@ function endTagInRow(p: Parser, token: TagToken p.openElements.clearBackToTableRowContext(); p.openElements.pop(); p.insertionMode = InsertionMode.IN_TABLE_BODY; - modeInTableBody(p, token); + endTagInTableBody(p, token); } break; } @@ -3091,44 +3043,13 @@ function endTagInRow(p: Parser, token: TagToken // The "in cell" insertion mode //------------------------------------------------------------------ -function modeInCell(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: { - characterInBody(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInCell(p, token); - break; - } - case TokenType.END_TAG: { - endTagInCell(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInCell(p: Parser, token: TagToken): void { const tn = token.tagID; if (TABLE_VOID_ELEMENTS.has(tn)) { if (p.openElements.hasInTableScope($.TD) || p.openElements.hasInTableScope($.TH)) { p._closeTableCell(); - p._processToken(token); + p.onStartTagToken(token); } } else { startTagInBody(p, token); @@ -3156,7 +3077,7 @@ function endTagInCell(p: Parser, token: TagToke case $.TR: { if (p.openElements.hasInTableScope(tn)) { p._closeTableCell(); - p._processToken(token); + p.onEndTagToken(token); } break; } @@ -3176,34 +3097,6 @@ function endTagInCell(p: Parser, token: TagToke // The "in select" insertion mode //------------------------------------------------------------------ -function modeInSelect(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInSelect(p, token); - break; - } - case TokenType.END_TAG: { - endTagInSelect(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInSelect(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -3239,7 +3132,7 @@ function startTagInSelect(p: Parser, token: Tag p._resetInsertionMode(); if (token.tagID !== $.SELECT) { - p._processToken(token); + p.onStartTagToken(token); } } break; @@ -3294,34 +3187,6 @@ function endTagInSelect(p: Parser, token: TagTo // The "in select in table" insertion mode //------------------------------------------------------------------ -function modeInSelectInTable(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInSelectInTable(p, token); - break; - } - case TokenType.END_TAG: { - endTagInSelectInTable(p, token); - break; - } - case TokenType.EOF: { - eofInBody(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInSelectInTable(p: Parser, token: TagToken): void { const tn = token.tagID; @@ -3337,7 +3202,7 @@ function startTagInSelectInTable(p: Parser, tok ) { p.openElements.popUntilTagNamePopped($.SELECT); p._resetInsertionMode(); - p._processToken(token); + p.onStartTagToken(token); } else { startTagInSelect(p, token); } @@ -3359,7 +3224,7 @@ function endTagInSelectInTable(p: Parser, token if (p.openElements.hasInTableScope(tn)) { p.openElements.popUntilTagNamePopped($.SELECT); p._resetInsertionMode(); - p._processToken(token); + p.onEndTagToken(token); } } else { endTagInSelect(p, token); @@ -3368,37 +3233,6 @@ function endTagInSelectInTable(p: Parser, token // The "in template" insertion mode //------------------------------------------------------------------ -function modeInTemplate(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: { - characterInBody(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInTemplate(p, token); - break; - } - case TokenType.END_TAG: { - endTagInTemplate(p, token); - break; - } - case TokenType.EOF: { - eofInTemplate(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInTemplate(p: Parser, token: TagToken): void { switch (token.tagID) { // First, handle tags that can start without a mode change @@ -3423,28 +3257,28 @@ function startTagInTemplate(p: Parser, token: T case $.THEAD: p.tmplInsertionModeStack[0] = InsertionMode.IN_TABLE; p.insertionMode = InsertionMode.IN_TABLE; - modeInTable(p, token); + startTagInTable(p, token); break; case $.COL: p.tmplInsertionModeStack[0] = InsertionMode.IN_COLUMN_GROUP; p.insertionMode = InsertionMode.IN_COLUMN_GROUP; - modeInColumnGroup(p, token); + startTagInColumnGroup(p, token); break; case $.TR: p.tmplInsertionModeStack[0] = InsertionMode.IN_TABLE_BODY; p.insertionMode = InsertionMode.IN_TABLE_BODY; - modeInTableBody(p, token); + startTagInTableBody(p, token); break; case $.TD: case $.TH: p.tmplInsertionModeStack[0] = InsertionMode.IN_ROW; p.insertionMode = InsertionMode.IN_ROW; - modeInRow(p, token); + startTagInRow(p, token); break; default: p.tmplInsertionModeStack[0] = InsertionMode.IN_BODY; p.insertionMode = InsertionMode.IN_BODY; - modeInBody(p, token); + startTagInBody(p, token); } } @@ -3460,7 +3294,7 @@ function eofInTemplate(p: Parser, token: EOFTok p.activeFormattingElements.clearToLastMarker(); p.tmplInsertionModeStack.shift(); p._resetInsertionMode(); - p._processToken(token); + p.onEofToken(token); } else { stopParsing(p, token); } @@ -3468,38 +3302,6 @@ function eofInTemplate(p: Parser, token: EOFTok // The "after body" insertion mode //------------------------------------------------------------------ -function modeAfterBody(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: { - tokenAfterBody(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token); - break; - } - case TokenType.COMMENT: { - appendCommentToRootHtmlElement(p, token); - break; - } - case TokenType.START_TAG: { - startTagAfterBody(p, token); - break; - } - case TokenType.END_TAG: { - endTagAfterBody(p, token); - break; - } - case TokenType.EOF: { - stopParsing(p, token); - break; - } - default: - // Do nothing - } -} - function startTagAfterBody(p: Parser, token: TagToken): void { if (token.tagID === $.HTML) { startTagInBody(p, token); @@ -3531,33 +3333,6 @@ function tokenAfterBody(p: Parser, token: Token // The "in frameset" insertion mode //------------------------------------------------------------------ -function modeInFrameset(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagInFrameset(p, token); - break; - } - case TokenType.END_TAG: { - endTagInFrameset(p, token); - break; - } - case TokenType.EOF: { - stopParsing(p, token); - break; - } - default: - // Do nothing - } -} - function startTagInFrameset(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -3594,33 +3369,6 @@ function endTagInFrameset(p: Parser, token: Tag // The "after frameset" insertion mode //------------------------------------------------------------------ -function modeAfterFrameset(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.WHITESPACE_CHARACTER: { - p._insertCharacters(token); - break; - } - case TokenType.COMMENT: { - appendComment(p, token); - break; - } - case TokenType.START_TAG: { - startTagAfterFrameset(p, token); - break; - } - case TokenType.END_TAG: { - endTagAfterFrameset(p, token); - break; - } - case TokenType.EOF: { - stopParsing(p, token); - break; - } - default: - // Do nothing - } -} - function startTagAfterFrameset(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -3644,35 +3392,6 @@ function endTagAfterFrameset(p: Parser, token: // The "after after body" insertion mode //------------------------------------------------------------------ -function modeAfterAfterBody(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.END_TAG: { - tokenAfterAfterBody(p, token); - break; - } - case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token); - break; - } - case TokenType.COMMENT: { - appendCommentToDocument(p, token); - break; - } - case TokenType.START_TAG: { - startTagAfterAfterBody(p, token); - break; - } - case TokenType.EOF: { - stopParsing(p, token); - break; - } - default: - // Do nothing - } -} - function startTagAfterAfterBody(p: Parser, token: TagToken): void { if (token.tagID === $.HTML) { startTagInBody(p, token); @@ -3688,29 +3407,6 @@ function tokenAfterAfterBody(p: Parser, token: // The "after after frameset" insertion mode //------------------------------------------------------------------ -function modeAfterAfterFrameset(p: Parser, token: Token): void { - switch (token.type) { - case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token); - break; - } - case TokenType.COMMENT: { - appendCommentToDocument(p, token); - break; - } - case TokenType.START_TAG: { - startTagAfterAfterFrameset(p, token); - break; - } - case TokenType.EOF: { - stopParsing(p, token); - break; - } - default: - // Do nothing - } -} - function startTagAfterAfterFrameset(p: Parser, token: TagToken): void { switch (token.tagID) { case $.HTML: { @@ -3747,7 +3443,7 @@ function startTagInForeignContent(p: Parser, to p.openElements.pop(); } - p._processToken(token); + p._startTagOutsideForeignContent(token); } else { const current = p._getAdjustedCurrentElement(); const currentNs = p.treeAdapter.getNamespaceURI(current); @@ -3776,7 +3472,7 @@ function endTagInForeignContent(p: Parser, toke const element = p.openElements.items[i]; if (p.treeAdapter.getNamespaceURI(element) === NS.HTML) { - p._processToken(token); + p._endTagOutsideForeignContent(token); break; } From 3887ac4fcb323e563ebcfe888e24c02753f1feb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 26 Jan 2022 17:17:00 -0800 Subject: [PATCH 02/12] Use tokenizer `allowCDATA` to determine foreign content of text --- packages/parse5/lib/parser/index.ts | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 44101606e..5c951f40a 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -572,20 +572,6 @@ export class Parser { : !this._isIntegrationPoint(currentTagId, current); } - private shouldProcessTextInForeignContent(): boolean { - let current: T['parentNode']; - let currentTagId: number; - - if (this.openElements.stackTop === 0 && this.fragmentContext) { - current = this.fragmentContext; - currentTagId = this.fragmentContextID; - } else { - ({ current, currentTagId } = this.openElements); - } - - return !this._isIntegrationPoint(currentTagId, current); - } - _processToken(token: Token): void { switch (token.type) { case TokenType.CHARACTER: { @@ -791,7 +777,8 @@ export class Parser { onCharacterToken(token: CharacterToken): void { this.skipNextNewLine = false; - if (this._considerForeignContent && this.shouldProcessTextInForeignContent()) { + + if (this.tokenizer.allowCDATA) { characterInForeignContent(this, token); return; } @@ -849,7 +836,8 @@ export class Parser { } onNullCharacterToken(token: CharacterToken): void { this.skipNextNewLine = false; - if (this._considerForeignContent && this.shouldProcessTextInForeignContent()) { + + if (this.tokenizer.allowCDATA) { nullCharacterInForeignContent(this, token); return; } @@ -896,6 +884,7 @@ export class Parser { } onCommentToken(token: CommentToken): void { this.skipNextNewLine = false; + if (this._considerForeignContent) { appendComment(this, token); return; @@ -1188,7 +1177,7 @@ export class Parser { } } - if (this._considerForeignContent && this.shouldProcessTextInForeignContent()) { + if (this.tokenizer.allowCDATA) { this._insertCharacters(token); return; } From b0e60bd4622a3931e6077ce117b02c2a661fce50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 26 Jan 2022 20:56:11 -0800 Subject: [PATCH 03/12] Fix `tokenInHead`, revert some changes --- packages/parse5/lib/parser/index.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 5c951f40a..52b38fff9 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -1220,6 +1220,7 @@ export class Parser { //Adoption agency algorithm //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adoptionAgency) //------------------------------------------------------------------ + //Steps 5-8 of the algorithm function aaObtainFormattingElementEntry( p: Parser, @@ -1597,7 +1598,7 @@ function endTagInHead(p: Parser, token: TagToke function tokenInHead(p: Parser, token: Token): void { p.openElements.pop(); p.insertionMode = InsertionMode.AFTER_HEAD; - tokenAfterHead(p, token); + p._processToken(token); } // The "in head no script" insertion mode @@ -1720,7 +1721,7 @@ function endTagAfterHead(p: Parser, token: TagT function tokenAfterHead(p: Parser, token: Token): void { p._insertFakeElement(TN.BODY, $.BODY); p.insertionMode = InsertionMode.IN_BODY; - p._processToken(token); + modeInBody(p, token); } // The "in body" insertion mode @@ -3038,7 +3039,7 @@ function startTagInCell(p: Parser, token: TagTo if (TABLE_VOID_ELEMENTS.has(tn)) { if (p.openElements.hasInTableScope($.TD) || p.openElements.hasInTableScope($.TH)) { p._closeTableCell(); - p.onStartTagToken(token); + startTagInRow(p, token); } } else { startTagInBody(p, token); @@ -3066,7 +3067,7 @@ function endTagInCell(p: Parser, token: TagToke case $.TR: { if (p.openElements.hasInTableScope(tn)) { p._closeTableCell(); - p.onEndTagToken(token); + endTagInRow(p, token); } break; } From b0a1d794a4e51f1d3962d9f7263e76653290ea56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Wed, 26 Jan 2022 21:09:06 -0800 Subject: [PATCH 04/12] Fix self closing being reported multiple times for some start tags --- packages/parse5/lib/parser/index.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 52b38fff9..9c1beaad7 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -269,6 +269,10 @@ export class Parser { this._processToken(token); + if (token.type === TokenType.START_TAG && token.selfClosing && !token.ackSelfClosing) { + this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); + } + if (token.type === TokenType.HIBERNATION || (scriptHandler !== null && this.pendingScript)) { break; } @@ -953,10 +957,6 @@ export class Parser { } else { this._startTagOutsideForeignContent(token); } - - if (token.type === TokenType.START_TAG && token.selfClosing && !token.ackSelfClosing) { - this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); - } } _startTagOutsideForeignContent(token: TagToken): void { switch (this.insertionMode) { From b0f08eddcdf5d01ea1775b8f26881119b9ab57c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 27 Jan 2022 16:10:15 -0500 Subject: [PATCH 05/12] Shorten callback names --- packages/parse5/lib/parser/index.ts | 44 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 9c1beaad7..f719f8cf2 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -579,35 +579,35 @@ export class Parser { _processToken(token: Token): void { switch (token.type) { case TokenType.CHARACTER: { - this.onCharacterToken(token); + this.onCharacter(token); break; } case TokenType.NULL_CHARACTER: { - this.onNullCharacterToken(token); + this.onNullCharacter(token); break; } case TokenType.COMMENT: { - this.onCommentToken(token); + this.onComment(token); break; } case TokenType.DOCTYPE: { - this.onDoctypeToken(token); + this.onDoctype(token); break; } case TokenType.START_TAG: { - this.onStartTagToken(token); + this.onStartTag(token); break; } case TokenType.END_TAG: { - this.onEndTagToken(token); + this.onEndTag(token); break; } case TokenType.EOF: { - this.onEofToken(token); + this.onEof(token); break; } case TokenType.WHITESPACE_CHARACTER: { - this.onWhitespaceCharacterToken(token); + this.onWhitespaceCharacter(token); break; } } @@ -779,7 +779,7 @@ export class Parser { return SPECIAL_ELEMENTS[ns].has(id); } - onCharacterToken(token: CharacterToken): void { + onCharacter(token: CharacterToken): void { this.skipNextNewLine = false; if (this.tokenizer.allowCDATA) { @@ -838,7 +838,7 @@ export class Parser { // Do nothing } } - onNullCharacterToken(token: CharacterToken): void { + onNullCharacter(token: CharacterToken): void { this.skipNextNewLine = false; if (this.tokenizer.allowCDATA) { @@ -886,7 +886,7 @@ export class Parser { // Do nothing } } - onCommentToken(token: CommentToken): void { + onComment(token: CommentToken): void { this.skipNextNewLine = false; if (this._considerForeignContent) { @@ -929,7 +929,7 @@ export class Parser { // Do nothing } } - onDoctypeToken(token: DoctypeToken): void { + onDoctype(token: DoctypeToken): void { this.skipNextNewLine = false; switch (this.insertionMode) { case InsertionMode.INITIAL: @@ -948,7 +948,7 @@ export class Parser { // Do nothing } } - onStartTagToken(token: TagToken): void { + onStartTag(token: TagToken): void { this.skipNextNewLine = false; this.currentToken = token; @@ -1030,7 +1030,7 @@ export class Parser { // Do nothing } } - onEndTagToken(token: TagToken): void { + onEndTag(token: TagToken): void { this.skipNextNewLine = false; this.currentToken = token; @@ -1112,7 +1112,7 @@ export class Parser { // Do nothing } } - onEofToken(token: EOFToken): void { + onEof(token: EOFToken): void { this.skipNextNewLine = false; switch (this.insertionMode) { case InsertionMode.INITIAL: @@ -1164,7 +1164,7 @@ export class Parser { // Do nothing } } - onWhitespaceCharacterToken(token: CharacterToken): void { + onWhitespaceCharacter(token: CharacterToken): void { if (this.skipNextNewLine) { this.skipNextNewLine = false; @@ -2557,7 +2557,7 @@ function eofInText(p: Parser, token: EOFToken): p._err(token, ERR.eofInElementThatCanContainOnlyText); p.openElements.pop(); p.insertionMode = p.originalInsertionMode; - p.onEofToken(token); + p.onEof(token); } // The "in table" insertion mode @@ -2622,7 +2622,7 @@ function tableStartTagInTable(p: Parser, token: if (p.openElements.hasInTableScope($.TABLE)) { p.openElements.popUntilTagNamePopped($.TABLE); p._resetInsertionMode(); - p.onStartTagToken(token); + p.onStartTag(token); } } @@ -3122,7 +3122,7 @@ function startTagInSelect(p: Parser, token: Tag p._resetInsertionMode(); if (token.tagID !== $.SELECT) { - p.onStartTagToken(token); + p.onStartTag(token); } } break; @@ -3192,7 +3192,7 @@ function startTagInSelectInTable(p: Parser, tok ) { p.openElements.popUntilTagNamePopped($.SELECT); p._resetInsertionMode(); - p.onStartTagToken(token); + p.onStartTag(token); } else { startTagInSelect(p, token); } @@ -3214,7 +3214,7 @@ function endTagInSelectInTable(p: Parser, token if (p.openElements.hasInTableScope(tn)) { p.openElements.popUntilTagNamePopped($.SELECT); p._resetInsertionMode(); - p.onEndTagToken(token); + p.onEndTag(token); } } else { endTagInSelect(p, token); @@ -3284,7 +3284,7 @@ function eofInTemplate(p: Parser, token: EOFTok p.activeFormattingElements.clearToLastMarker(); p.tmplInsertionModeStack.shift(); p._resetInsertionMode(); - p.onEofToken(token); + p.onEof(token); } else { stopParsing(p, token); } From 6d6a9b68ace01270a4d0d208b4bbf33c4794e967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 27 Jan 2022 17:32:14 -0500 Subject: [PATCH 06/12] Simplify `shouldProcessStartTagTokenInForeignContent` --- packages/parse5/lib/parser/index.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index f719f8cf2..606ee307a 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -551,6 +551,9 @@ export class Parser { //Token processing private shouldProcessStartTagTokenInForeignContent(token: TagToken): boolean { + // Check that neither current === document, or ns === NS.HTML + if (!this._considerForeignContent) return false; + let current: T['parentNode']; let currentTagId: number; @@ -561,8 +564,6 @@ export class Parser { ({ current, currentTagId } = this.openElements); } - //NOTE: We won't get here with current === document, or ns === NS.HTML - if ( token.tagID === $.SVG && this.treeAdapter.getTagName(current) === TN.ANNOTATION_XML && @@ -571,9 +572,14 @@ export class Parser { return false; } - return token.tagID === $.MGLYPH || token.tagID === $.MALIGNMARK - ? !this._isIntegrationPoint(currentTagId, current, NS.HTML) - : !this._isIntegrationPoint(currentTagId, current); + return ( + // Check that `current` is not an integration point for HTML or MathML elements. + this.tokenizer.allowCDATA || + // If it _is_ an integration point, then we might have to check that it is not an HTML + // integration point. + ((token.tagID === $.MGLYPH || token.tagID === $.MALIGNMARK) && + !this._isIntegrationPoint(currentTagId, current, NS.HTML)) + ); } _processToken(token: Token): void { @@ -952,7 +958,7 @@ export class Parser { this.skipNextNewLine = false; this.currentToken = token; - if (this._considerForeignContent && this.shouldProcessStartTagTokenInForeignContent(token)) { + if (this.shouldProcessStartTagTokenInForeignContent(token)) { startTagInForeignContent(this, token); } else { this._startTagOutsideForeignContent(token); From 95b9fe471b5ea449947894e595c45777300a07d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 28 Jan 2022 21:08:06 -0500 Subject: [PATCH 07/12] Pass primitive values to some fns --- packages/parse5/lib/parser/index.ts | 96 ++++++++++++++++------------- 1 file changed, 53 insertions(+), 43 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 606ee307a..c7fb7953a 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -28,6 +28,7 @@ import { EOFToken, LocationWithAttributes, ElementLocation, + Location, } from '../common/token.js'; //Misc constants @@ -273,7 +274,7 @@ export class Parser { this._err(token, ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); } - if (token.type === TokenType.HIBERNATION || (scriptHandler !== null && this.pendingScript)) { + if (!this.tokenizer.active || (scriptHandler !== null && this.pendingScript)) { break; } } @@ -483,7 +484,7 @@ export class Parser { } } - _insertCharacters(token: CharacterToken): void { + _insertCharacters(chars: string, location: Location | null): void { let parent; let beforeElement; @@ -491,17 +492,17 @@ export class Parser { ({ parent, beforeElement } = this._findFosterParentingLocation()); if (beforeElement) { - this.treeAdapter.insertTextBefore(parent, token.chars, beforeElement); + this.treeAdapter.insertTextBefore(parent, chars, beforeElement); } else { - this.treeAdapter.insertText(parent, token.chars); + this.treeAdapter.insertText(parent, chars); } } else { parent = this.openElements.currentTmplContentOrNode; - this.treeAdapter.insertText(parent, token.chars); + this.treeAdapter.insertText(parent, chars); } - if (!token.location) return; + if (!location) return; const siblings = this.treeAdapter.getChildNodes(parent); const textNodeIdx = beforeElement ? siblings.lastIndexOf(beforeElement) : siblings.length; @@ -511,10 +512,10 @@ export class Parser { const tnLoc = this.treeAdapter.getNodeSourceCodeLocation(textNode); if (tnLoc) { - const { endLine, endCol, endOffset } = token.location; + const { endLine, endCol, endOffset } = location; this.treeAdapter.updateNodeSourceCodeLocation(textNode, { endLine, endCol, endOffset }); } else if (this.options.sourceCodeLocationInfo) { - this.treeAdapter.setNodeSourceCodeLocation(textNode, token.location); + this.treeAdapter.setNodeSourceCodeLocation(textNode, location); } } @@ -530,20 +531,21 @@ export class Parser { const ctLoc = closingToken.location; const tn = this.treeAdapter.getTagName(element); - // NOTE: For cases like

- First 'p' closes without a closing - // tag and for cases like

- 'p' closes without a closing tag. - const isClosingEndTag = closingToken.type === TokenType.END_TAG && tn === closingToken.tagName; - const endLoc: Partial = {}; - if (isClosingEndTag) { - endLoc.endTag = { ...ctLoc }; - endLoc.endLine = ctLoc.endLine; - endLoc.endCol = ctLoc.endCol; - endLoc.endOffset = ctLoc.endOffset; - } else { - endLoc.endLine = ctLoc.startLine; - endLoc.endCol = ctLoc.startCol; - endLoc.endOffset = ctLoc.startOffset; - } + const endLoc: Partial = + // NOTE: For cases like

- First 'p' closes without a closing + // tag and for cases like

- 'p' closes without a closing tag. + closingToken.type === TokenType.END_TAG && tn === closingToken.tagName + ? { + endTag: { ...ctLoc }, + endLine: ctLoc.endLine, + endCol: ctLoc.endCol, + endOffset: ctLoc.endOffset, + } + : { + endLine: ctLoc.startLine, + endCol: ctLoc.startCol, + endOffset: ctLoc.startOffset, + }; this.treeAdapter.updateNodeSourceCodeLocation(element, endLoc); } @@ -789,7 +791,7 @@ export class Parser { this.skipNextNewLine = false; if (this.tokenizer.allowCDATA) { - characterInForeignContent(this, token); + characterInForeignContent(this, token.chars, token.location); return; } @@ -816,12 +818,12 @@ export class Parser { case InsertionMode.IN_CAPTION: case InsertionMode.IN_CELL: case InsertionMode.IN_TEMPLATE: - characterInBody(this, token); + characterInBody(this, token.chars, token.location); break; case InsertionMode.TEXT: case InsertionMode.IN_SELECT: case InsertionMode.IN_SELECT_IN_TABLE: - this._insertCharacters(token); + this._insertCharacters(token.chars, token.location); break; case InsertionMode.IN_TABLE: case InsertionMode.IN_TABLE_BODY: @@ -848,7 +850,7 @@ export class Parser { this.skipNextNewLine = false; if (this.tokenizer.allowCDATA) { - nullCharacterInForeignContent(this, token); + nullCharacterInForeignContent(this, token.location); return; } @@ -872,7 +874,7 @@ export class Parser { tokenAfterHead(this, token); break; case InsertionMode.TEXT: - this._insertCharacters(token); + this._insertCharacters(token.chars, token.location); break; case InsertionMode.IN_TABLE: case InsertionMode.IN_TABLE_BODY: @@ -1184,7 +1186,7 @@ export class Parser { } if (this.tokenizer.allowCDATA) { - this._insertCharacters(token); + this._insertCharacters(token.chars, token.location); return; } @@ -1198,7 +1200,7 @@ export class Parser { case InsertionMode.IN_SELECT_IN_TABLE: case InsertionMode.IN_FRAMESET: case InsertionMode.AFTER_FRAMESET: - this._insertCharacters(token); + this._insertCharacters(token.chars, token.location); break; case InsertionMode.IN_BODY: case InsertionMode.IN_CAPTION: @@ -1207,7 +1209,7 @@ export class Parser { case InsertionMode.AFTER_BODY: case InsertionMode.AFTER_AFTER_BODY: case InsertionMode.AFTER_AFTER_FRAMESET: - whitespaceCharacterInBody(this, token); + whitespaceCharacterInBody(this, token.chars, token.location); break; case InsertionMode.IN_TABLE: case InsertionMode.IN_TABLE_BODY: @@ -1735,11 +1737,11 @@ function tokenAfterHead(p: Parser, token: Token function modeInBody(p: Parser, token: Token): void { switch (token.type) { case TokenType.CHARACTER: { - characterInBody(p, token); + characterInBody(p, token.chars, token.location); break; } case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token); + whitespaceCharacterInBody(p, token.chars, token.location); break; } case TokenType.COMMENT: { @@ -1763,14 +1765,18 @@ function modeInBody(p: Parser, token: Token): v } } -function whitespaceCharacterInBody(p: Parser, token: CharacterToken): void { +function whitespaceCharacterInBody( + p: Parser, + chars: string, + location: Location | null +): void { p._reconstructActiveFormattingElements(); - p._insertCharacters(token); + p._insertCharacters(chars, location); } -function characterInBody(p: Parser, token: CharacterToken): void { +function characterInBody(p: Parser, chars: string, location: Location | null): void { p._reconstructActiveFormattingElements(); - p._insertCharacters(token); + p._insertCharacters(chars, location); p.framesetOk = false; } @@ -2570,7 +2576,7 @@ function eofInText(p: Parser, token: EOFToken): //------------------------------------------------------------------ function characterInTable(p: Parser, token: CharacterToken): void { if (TABLE_STRUCTURE_TAGS.has(p.openElements.currentTagId)) { - p.pendingCharacterTokens = []; + p.pendingCharacterTokens.length = 0; p.hasNonWhitespacePendingCharacterToken = false; p.originalInsertionMode = p.insertionMode; p.insertionMode = InsertionMode.IN_TABLE_TEXT; @@ -2762,7 +2768,8 @@ function tokenInTableText(p: Parser, token: Tok } } else { for (; i < p.pendingCharacterTokens.length; i++) { - p._insertCharacters(p.pendingCharacterTokens[i]); + const { chars, location } = p.pendingCharacterTokens[i]; + p._insertCharacters(chars, location); } } @@ -3420,13 +3427,16 @@ function startTagAfterAfterFrameset(p: Parser, // The rules for parsing tokens in foreign content //------------------------------------------------------------------ -function nullCharacterInForeignContent(p: Parser, token: CharacterToken): void { - token.chars = unicode.REPLACEMENT_CHARACTER; - p._insertCharacters(token); +function nullCharacterInForeignContent(p: Parser, location: Location | null): void { + p._insertCharacters(unicode.REPLACEMENT_CHARACTER, location); } -function characterInForeignContent(p: Parser, token: CharacterToken): void { - p._insertCharacters(token); +function characterInForeignContent( + p: Parser, + chars: string, + location: Location | null +): void { + p._insertCharacters(chars, location); p.framesetOk = false; } From cc634101839cc8c1cb1a168e609d755e6de26df4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 11 Feb 2022 13:29:01 +0000 Subject: [PATCH 08/12] Revert "Pass primitive values to some fns" This reverts commit 95b9fe471b5ea449947894e595c45777300a07d9. --- packages/parse5/lib/parser/index.ts | 63 +++++++++++++---------------- 1 file changed, 27 insertions(+), 36 deletions(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index c7fb7953a..e56b21c63 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -28,7 +28,6 @@ import { EOFToken, LocationWithAttributes, ElementLocation, - Location, } from '../common/token.js'; //Misc constants @@ -484,7 +483,7 @@ export class Parser { } } - _insertCharacters(chars: string, location: Location | null): void { + _insertCharacters(token: CharacterToken): void { let parent; let beforeElement; @@ -492,17 +491,17 @@ export class Parser { ({ parent, beforeElement } = this._findFosterParentingLocation()); if (beforeElement) { - this.treeAdapter.insertTextBefore(parent, chars, beforeElement); + this.treeAdapter.insertTextBefore(parent, token.chars, beforeElement); } else { - this.treeAdapter.insertText(parent, chars); + this.treeAdapter.insertText(parent, token.chars); } } else { parent = this.openElements.currentTmplContentOrNode; - this.treeAdapter.insertText(parent, chars); + this.treeAdapter.insertText(parent, token.chars); } - if (!location) return; + if (!token.location) return; const siblings = this.treeAdapter.getChildNodes(parent); const textNodeIdx = beforeElement ? siblings.lastIndexOf(beforeElement) : siblings.length; @@ -512,10 +511,10 @@ export class Parser { const tnLoc = this.treeAdapter.getNodeSourceCodeLocation(textNode); if (tnLoc) { - const { endLine, endCol, endOffset } = location; + const { endLine, endCol, endOffset } = token.location; this.treeAdapter.updateNodeSourceCodeLocation(textNode, { endLine, endCol, endOffset }); } else if (this.options.sourceCodeLocationInfo) { - this.treeAdapter.setNodeSourceCodeLocation(textNode, location); + this.treeAdapter.setNodeSourceCodeLocation(textNode, token.location); } } @@ -791,7 +790,7 @@ export class Parser { this.skipNextNewLine = false; if (this.tokenizer.allowCDATA) { - characterInForeignContent(this, token.chars, token.location); + characterInForeignContent(this, token); return; } @@ -818,12 +817,12 @@ export class Parser { case InsertionMode.IN_CAPTION: case InsertionMode.IN_CELL: case InsertionMode.IN_TEMPLATE: - characterInBody(this, token.chars, token.location); + characterInBody(this, token); break; case InsertionMode.TEXT: case InsertionMode.IN_SELECT: case InsertionMode.IN_SELECT_IN_TABLE: - this._insertCharacters(token.chars, token.location); + this._insertCharacters(token); break; case InsertionMode.IN_TABLE: case InsertionMode.IN_TABLE_BODY: @@ -850,7 +849,7 @@ export class Parser { this.skipNextNewLine = false; if (this.tokenizer.allowCDATA) { - nullCharacterInForeignContent(this, token.location); + nullCharacterInForeignContent(this, token); return; } @@ -874,7 +873,7 @@ export class Parser { tokenAfterHead(this, token); break; case InsertionMode.TEXT: - this._insertCharacters(token.chars, token.location); + this._insertCharacters(token); break; case InsertionMode.IN_TABLE: case InsertionMode.IN_TABLE_BODY: @@ -1186,7 +1185,7 @@ export class Parser { } if (this.tokenizer.allowCDATA) { - this._insertCharacters(token.chars, token.location); + this._insertCharacters(token); return; } @@ -1200,7 +1199,7 @@ export class Parser { case InsertionMode.IN_SELECT_IN_TABLE: case InsertionMode.IN_FRAMESET: case InsertionMode.AFTER_FRAMESET: - this._insertCharacters(token.chars, token.location); + this._insertCharacters(token); break; case InsertionMode.IN_BODY: case InsertionMode.IN_CAPTION: @@ -1209,7 +1208,7 @@ export class Parser { case InsertionMode.AFTER_BODY: case InsertionMode.AFTER_AFTER_BODY: case InsertionMode.AFTER_AFTER_FRAMESET: - whitespaceCharacterInBody(this, token.chars, token.location); + whitespaceCharacterInBody(this, token); break; case InsertionMode.IN_TABLE: case InsertionMode.IN_TABLE_BODY: @@ -1737,11 +1736,11 @@ function tokenAfterHead(p: Parser, token: Token function modeInBody(p: Parser, token: Token): void { switch (token.type) { case TokenType.CHARACTER: { - characterInBody(p, token.chars, token.location); + characterInBody(p, token); break; } case TokenType.WHITESPACE_CHARACTER: { - whitespaceCharacterInBody(p, token.chars, token.location); + whitespaceCharacterInBody(p, token); break; } case TokenType.COMMENT: { @@ -1765,18 +1764,14 @@ function modeInBody(p: Parser, token: Token): v } } -function whitespaceCharacterInBody( - p: Parser, - chars: string, - location: Location | null -): void { +function whitespaceCharacterInBody(p: Parser, token: CharacterToken): void { p._reconstructActiveFormattingElements(); - p._insertCharacters(chars, location); + p._insertCharacters(token); } -function characterInBody(p: Parser, chars: string, location: Location | null): void { +function characterInBody(p: Parser, token: CharacterToken): void { p._reconstructActiveFormattingElements(); - p._insertCharacters(chars, location); + p._insertCharacters(token); p.framesetOk = false; } @@ -2768,8 +2763,7 @@ function tokenInTableText(p: Parser, token: Tok } } else { for (; i < p.pendingCharacterTokens.length; i++) { - const { chars, location } = p.pendingCharacterTokens[i]; - p._insertCharacters(chars, location); + p._insertCharacters(p.pendingCharacterTokens[i]); } } @@ -3427,16 +3421,13 @@ function startTagAfterAfterFrameset(p: Parser, // The rules for parsing tokens in foreign content //------------------------------------------------------------------ -function nullCharacterInForeignContent(p: Parser, location: Location | null): void { - p._insertCharacters(unicode.REPLACEMENT_CHARACTER, location); +function nullCharacterInForeignContent(p: Parser, token: CharacterToken): void { + token.chars = unicode.REPLACEMENT_CHARACTER; + p._insertCharacters(token); } -function characterInForeignContent( - p: Parser, - chars: string, - location: Location | null -): void { - p._insertCharacters(chars, location); +function characterInForeignContent(p: Parser, token: CharacterToken): void { + p._insertCharacters(token); p.framesetOk = false; } From be7394dc40831ede28a2205754cba8c2db6f69f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 11 Feb 2022 14:05:58 +0000 Subject: [PATCH 09/12] Fix `generate-parser-feedback-test` --- scripts/generate-parser-feedback-test/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generate-parser-feedback-test/index.ts b/scripts/generate-parser-feedback-test/index.ts index 5f24b6ec1..dacd8216f 100644 --- a/scripts/generate-parser-feedback-test/index.ts +++ b/scripts/generate-parser-feedback-test/index.ts @@ -44,8 +44,8 @@ function collectParserTokens(html: string): ReturnType and // which are otherwise merged as per tree constructor spec From 7099e8d5772e2b21b3ac56eb90e747a96a6e2fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 11 Feb 2022 14:12:46 +0000 Subject: [PATCH 10/12] Don't set `skipNextNewLine = false` on EOF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will stop parsing — no need to do this anymore --- packages/parse5/lib/parser/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index e56b21c63..61525262e 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -1120,7 +1120,6 @@ export class Parser { } } onEof(token: EOFToken): void { - this.skipNextNewLine = false; switch (this.insertionMode) { case InsertionMode.INITIAL: tokenInInitialMode(this, token); From 4bd7d8815b2c31a01219901e0abf6e7caef28f6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 11 Feb 2022 14:33:06 +0000 Subject: [PATCH 11/12] Rename `_considerForeignContent` and `allowCDATA` I'm not super happy with the new names, but they are an improvement. --- .../lib/parser-feedback-simulator.ts | 4 ++-- packages/parse5/lib/parser/index.ts | 21 ++++++++++--------- packages/parse5/lib/tokenizer/index.ts | 10 +++++++-- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts b/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts index 73b4221e0..e7019d879 100644 --- a/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts +++ b/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts @@ -64,13 +64,13 @@ export class ParserFeedbackSimulator { private _enterNamespace(namespace: NS): void { this.namespaceStack.unshift(namespace); this.inForeignContent = namespace !== NS.HTML; - this.tokenizer.allowCDATA = this.inForeignContent; + this.tokenizer.inForeignNode = this.inForeignContent; } private _leaveCurrentNamespace(): void { this.namespaceStack.shift(); this.inForeignContent = this.namespaceStack[0] !== NS.HTML; - this.tokenizer.allowCDATA = this.inForeignContent; + this.tokenizer.inForeignNode = this.inForeignContent; } //Token handlers diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 61525262e..8ff4fa2ff 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -191,7 +191,8 @@ export class Parser { openElements!: OpenElementStack; activeFormattingElements!: FormattingElementList; - private _considerForeignContent = false; + /** Indicates that the current node is not an element in the HTML namespace */ + private currentNotInHTML = false; /** * The template insertion mode stack is maintained from the left. @@ -326,8 +327,8 @@ export class Parser { private _setContextModes(current: T['parentNode'], tid: number): void { const isHTML = current === this.document || this.treeAdapter.getNamespaceURI(current) === NS.HTML; - this._considerForeignContent = !isHTML; - this.tokenizer.allowCDATA = !isHTML && !this._isIntegrationPoint(tid, current); + this.currentNotInHTML = !isHTML; + this.tokenizer.inForeignNode = !isHTML && !this._isIntegrationPoint(tid, current); } _switchToTextParsing( @@ -553,7 +554,7 @@ export class Parser { //Token processing private shouldProcessStartTagTokenInForeignContent(token: TagToken): boolean { // Check that neither current === document, or ns === NS.HTML - if (!this._considerForeignContent) return false; + if (!this.currentNotInHTML) return false; let current: T['parentNode']; let currentTagId: number; @@ -575,7 +576,7 @@ export class Parser { return ( // Check that `current` is not an integration point for HTML or MathML elements. - this.tokenizer.allowCDATA || + this.tokenizer.inForeignNode || // If it _is_ an integration point, then we might have to check that it is not an HTML // integration point. ((token.tagID === $.MGLYPH || token.tagID === $.MALIGNMARK) && @@ -789,7 +790,7 @@ export class Parser { onCharacter(token: CharacterToken): void { this.skipNextNewLine = false; - if (this.tokenizer.allowCDATA) { + if (this.tokenizer.inForeignNode) { characterInForeignContent(this, token); return; } @@ -848,7 +849,7 @@ export class Parser { onNullCharacter(token: CharacterToken): void { this.skipNextNewLine = false; - if (this.tokenizer.allowCDATA) { + if (this.tokenizer.inForeignNode) { nullCharacterInForeignContent(this, token); return; } @@ -896,7 +897,7 @@ export class Parser { onComment(token: CommentToken): void { this.skipNextNewLine = false; - if (this._considerForeignContent) { + if (this.currentNotInHTML) { appendComment(this, token); return; } @@ -1041,7 +1042,7 @@ export class Parser { this.skipNextNewLine = false; this.currentToken = token; - if (this._considerForeignContent) { + if (this.currentNotInHTML) { endTagInForeignContent(this, token); } else { this._endTagOutsideForeignContent(token); @@ -1183,7 +1184,7 @@ export class Parser { } } - if (this.tokenizer.allowCDATA) { + if (this.tokenizer.inForeignNode) { this._insertCharacters(token); return; } diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index 0068ea477..66db2ae9d 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -209,7 +209,13 @@ export class Tokenizer { private tokenQueue: Token[] = []; - public allowCDATA = false; + /** + * Indicates that the current adjusted node exists, is not an element in the HTML namespace, + * and that it is not an integration point for either MathML or HTML. + * + * @see {@link https://html.spec.whatwg.org/#tree-construction} + */ + public inForeignNode = false; public lastStartTagName = ''; public active = false; @@ -1961,7 +1967,7 @@ export class Tokenizer { } else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) { this.state = State.DOCTYPE; } else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) { - if (this.allowCDATA) { + if (this.inForeignNode) { this.state = State.CDATA_SECTION; } else { this._err(ERR.cdataInHtmlContent); From 81561551508332187d666f5dccb87e748850256d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 11 Feb 2022 15:33:39 +0000 Subject: [PATCH 12/12] Add link to multi-page HTML spec --- packages/parse5/lib/tokenizer/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index 66db2ae9d..f91073a6b 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -213,7 +213,7 @@ export class Tokenizer { * Indicates that the current adjusted node exists, is not an element in the HTML namespace, * and that it is not an integration point for either MathML or HTML. * - * @see {@link https://html.spec.whatwg.org/#tree-construction} + * @see {@link https://html.spec.whatwg.org/multipage/parsing.html#tree-construction} */ public inForeignNode = false; public lastStartTagName = '';