From 173171db08011178218044fa458a981d63fe4ac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 4 Mar 2022 08:33:56 +0000 Subject: [PATCH] fix(tokenizer): Drop chunks after emitting tokens (#432) --- .../test/rewriting-stream.test.ts | 32 ++++++++++++++++--- packages/parse5-sax-parser/lib/index.ts | 4 +++ packages/parse5/lib/tokenizer/index.ts | 17 ++++------ packages/parse5/lib/tokenizer/preprocessor.ts | 6 +++- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index e1437555d..c38264fc5 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -3,6 +3,7 @@ import { outdent } from 'outdent'; import { RewritingStream } from '../lib/index.js'; import { loadSAXParserTestData } from 'parse5-test-utils/utils/load-sax-parser-test-data.js'; import { getStringDiffMsg, writeChunkedToStream, WritableStreamStub } from 'parse5-test-utils/utils/common.js'; +import { finished } from 'node:stream'; const srcHtml = outdent` @@ -17,6 +18,9 @@ const srcHtml = outdent` `; +const LONG_TEXT = 'a'.repeat((1 << 16) + 1); +const LONG_TEXT_WITH_COMMENT = `${'a'.repeat((1 << 16) - 5)}`; + function createRewriterTest({ src, expected, @@ -28,13 +32,17 @@ function createRewriterTest({ expected: string; assignTokenHandlers?: (rewriter: RewritingStream) => void; }) { - return (done: () => void): void => { + return (done: (err?: unknown) => void): void => { const rewriter = new RewritingStream(); const writable = new WritableStreamStub(); - writable.once('finish', () => { - assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected)); - done(); + finished(writable, () => { + try { + assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected)); + done(); + } catch (error) { + done(error); + } }); rewriter.pipe(writable); @@ -305,4 +313,20 @@ describe('RewritingStream', () => { assert.throws(() => stream.write(buf), TypeError); }); + + it( + 'Should pass long text correctly (GH-292)', + createRewriterTest({ + src: LONG_TEXT, + expected: LONG_TEXT, + }) + ); + + it( + 'Should emit comment after text correctly', + createRewriterTest({ + src: LONG_TEXT_WITH_COMMENT, + expected: LONG_TEXT_WITH_COMMENT, + }) + ); }); diff --git a/packages/parse5-sax-parser/lib/index.ts b/packages/parse5-sax-parser/lib/index.ts index 53e492599..636c900b6 100644 --- a/packages/parse5-sax-parser/lib/index.ts +++ b/packages/parse5-sax-parser/lib/index.ts @@ -156,6 +156,10 @@ export class SAXParser extends Transform implements TokenHandler { }; } } + + if (this.tokenizer.preprocessor.willDropParsedChunk()) { + this._emitPendingText(); + } } /** @internal */ diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index b25296e00..e2d83abb1 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -465,16 +465,22 @@ export class Tokenizer { this.handler.onEndTag(ct); } + + this.preprocessor.dropParsedChunk(); } private emitCurrentComment(ct: CommentToken): void { this.prepareToken(ct); this.handler.onComment(ct); + + this.preprocessor.dropParsedChunk(); } private emitCurrentDoctype(ct: DoctypeToken): void { this.prepareToken(ct); this.handler.onDoctype(ct); + + this.preprocessor.dropParsedChunk(); } private _emitCurrentCharacterToken(nextLocation: Location | null): void { @@ -536,6 +542,7 @@ export class Tokenizer { if (this.currentCharacterToken.type !== type) { this.currentLocation = this.getCurrentLocation(0); this._emitCurrentCharacterToken(this.currentLocation); + this.preprocessor.dropParsedChunk(); } else { this.currentCharacterToken.chars += ch; return; @@ -969,8 +976,6 @@ export class Tokenizer { // Data state //------------------------------------------------------------------ private _stateData(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.LESS_THAN_SIGN: { this.state = State.TAG_OPEN; @@ -999,8 +1004,6 @@ export class Tokenizer { // RCDATA state //------------------------------------------------------------------ private _stateRcdata(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.AMPERSAND: { this.returnState = State.RCDATA; @@ -1029,8 +1032,6 @@ export class Tokenizer { // RAWTEXT state //------------------------------------------------------------------ private _stateRawtext(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.LESS_THAN_SIGN: { this.state = State.RAWTEXT_LESS_THAN_SIGN; @@ -1054,8 +1055,6 @@ export class Tokenizer { // Script data state //------------------------------------------------------------------ private _stateScriptData(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.LESS_THAN_SIGN: { this.state = State.SCRIPT_DATA_LESS_THAN_SIGN; @@ -1079,8 +1078,6 @@ export class Tokenizer { // PLAINTEXT state //------------------------------------------------------------------ private _statePlaintext(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.NULL: { this._err(ERR.unexpectedNullCharacter); diff --git a/packages/parse5/lib/tokenizer/preprocessor.ts b/packages/parse5/lib/tokenizer/preprocessor.ts index 7dbc82193..4b986d95b 100644 --- a/packages/parse5/lib/tokenizer/preprocessor.ts +++ b/packages/parse5/lib/tokenizer/preprocessor.ts @@ -97,8 +97,12 @@ export class Preprocessor { return cp; } + public willDropParsedChunk(): boolean { + return this.pos > this.bufferWaterline; + } + public dropParsedChunk(): void { - if (this.pos > this.bufferWaterline) { + if (this.willDropParsedChunk()) { this.html = this.html.substring(this.pos); this.lineStartPos -= this.pos; this.droppedBufferSize += this.pos;