From f9462d3f337a3630314c3d620be49f017d3e1a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 11:55:07 +0000 Subject: [PATCH 1/8] fix(tokenizer): Drop chunks after emitting tokens Fixes #292 --- packages/parse5/lib/tokenizer/index.ts | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index b25296e00..3916e1bcf 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -465,16 +465,22 @@ export class Tokenizer { this.handler.onEndTag(ct); } + + this.preprocessor.dropParsedChunk(); } private emitCurrentComment(ct: CommentToken): void { this.prepareToken(ct); this.handler.onComment(ct); + + this.preprocessor.dropParsedChunk(); } private emitCurrentDoctype(ct: DoctypeToken): void { this.prepareToken(ct); this.handler.onDoctype(ct); + + this.preprocessor.dropParsedChunk(); } private _emitCurrentCharacterToken(nextLocation: Location | null): void { @@ -969,8 +975,6 @@ export class Tokenizer { // Data state //------------------------------------------------------------------ private _stateData(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.LESS_THAN_SIGN: { this.state = State.TAG_OPEN; @@ -999,8 +1003,6 @@ export class Tokenizer { // RCDATA state //------------------------------------------------------------------ private _stateRcdata(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.AMPERSAND: { this.returnState = State.RCDATA; @@ -1029,8 +1031,6 @@ export class Tokenizer { // RAWTEXT state //------------------------------------------------------------------ private _stateRawtext(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.LESS_THAN_SIGN: { this.state = State.RAWTEXT_LESS_THAN_SIGN; @@ -1054,8 +1054,6 @@ export class Tokenizer { // Script data state //------------------------------------------------------------------ private _stateScriptData(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.LESS_THAN_SIGN: { this.state = State.SCRIPT_DATA_LESS_THAN_SIGN; @@ -1079,8 +1077,6 @@ export class Tokenizer { // PLAINTEXT state //------------------------------------------------------------------ private _statePlaintext(cp: number): void { - this.preprocessor.dropParsedChunk(); - switch (cp) { case $.NULL: { this._err(ERR.unexpectedNullCharacter); From a041b6105c16a44a26012f8a3170623fda79e65d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 11:55:46 +0000 Subject: [PATCH 2/8] Add test case Fixes #357 Co-Authored-By: Masaki Hara <41755+qnighy@users.noreply.github.com> --- .../test/rewriting-stream.test.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index e1437555d..ca913682b 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -305,4 +305,22 @@ describe('RewritingStream', () => { assert.throws(() => stream.write(buf), TypeError); }); + + it('Regression - RewritingStream - should pass long text correctly (GH-292)', (done) => { + const source = 'a'.repeat(65_540); + const parser = new RewritingStream(); + let output = ''; + + parser.on('data', (data) => { + output += data.toString(); + }); + + parser.once('finish', () => { + assert.strictEqual(output.length, source.length); + done(); + }); + + parser.write(source); + parser.end(); + }); }); From d462b535d7793f9a872ef0b3d027d54092ed4a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:09:39 +0000 Subject: [PATCH 3/8] Use `close` instead of `finish` event Otherwise we might miss some data --- .../test/rewriting-stream.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index ca913682b..24a673e56 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -32,7 +32,7 @@ function createRewriterTest({ const rewriter = new RewritingStream(); const writable = new WritableStreamStub(); - writable.once('finish', () => { + writable.once('close', () => { assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected)); done(); }); @@ -290,7 +290,7 @@ describe('RewritingStream', () => { assert.strictEqual(text, 'text'); }); - parser.once('finish', () => { + parser.once('close', () => { assert.ok(foundText); done(); }); From 7770297c5ae31c23394b0b053b5bf1898db9dc8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:10:32 +0000 Subject: [PATCH 4/8] Add `willDropParsedChunk` preprocessor method And use it to emit text blocks from sax parser as soon as we might drop the buffer. --- .../test/rewriting-stream.test.ts | 36 ++++++++++++++++--- packages/parse5-sax-parser/lib/index.ts | 4 +++ packages/parse5/lib/tokenizer/index.ts | 1 + packages/parse5/lib/tokenizer/preprocessor.ts | 6 +++- 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index 24a673e56..548e39b4b 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -306,8 +306,8 @@ describe('RewritingStream', () => { assert.throws(() => stream.write(buf), TypeError); }); - it('Regression - RewritingStream - should pass long text correctly (GH-292)', (done) => { - const source = 'a'.repeat(65_540); + it('Should pass long text correctly (GH-292)', (done) => { + const source = 'a'.repeat((1 << 16) + 1); const parser = new RewritingStream(); let output = ''; @@ -315,9 +315,35 @@ describe('RewritingStream', () => { output += data.toString(); }); - parser.once('finish', () => { - assert.strictEqual(output.length, source.length); - done(); + parser.once('close', () => { + try { + assert.strictEqual(output.length, source.length); + done(); + } catch (error) { + done(error); + } + }); + + parser.write(source); + parser.end(); + }); + + it.only('Should emit comment after text correctly', (done) => { + const source = `${'a'.repeat((1 << 16) - 3)}`; + const parser = new RewritingStream(); + let output = ''; + + parser.on('data', (data) => { + output += data.toString(); + }); + + parser.once('close', () => { + try { + assert.strictEqual(output.length, source.length); + done(); + } catch (error) { + done(error); + } }); parser.write(source); diff --git a/packages/parse5-sax-parser/lib/index.ts b/packages/parse5-sax-parser/lib/index.ts index 53e492599..636c900b6 100644 --- a/packages/parse5-sax-parser/lib/index.ts +++ b/packages/parse5-sax-parser/lib/index.ts @@ -156,6 +156,10 @@ export class SAXParser extends Transform implements TokenHandler { }; } } + + if (this.tokenizer.preprocessor.willDropParsedChunk()) { + this._emitPendingText(); + } } /** @internal */ diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index 3916e1bcf..e2d83abb1 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -542,6 +542,7 @@ export class Tokenizer { if (this.currentCharacterToken.type !== type) { this.currentLocation = this.getCurrentLocation(0); this._emitCurrentCharacterToken(this.currentLocation); + this.preprocessor.dropParsedChunk(); } else { this.currentCharacterToken.chars += ch; return; diff --git a/packages/parse5/lib/tokenizer/preprocessor.ts b/packages/parse5/lib/tokenizer/preprocessor.ts index 7dbc82193..4b986d95b 100644 --- a/packages/parse5/lib/tokenizer/preprocessor.ts +++ b/packages/parse5/lib/tokenizer/preprocessor.ts @@ -97,8 +97,12 @@ export class Preprocessor { return cp; } + public willDropParsedChunk(): boolean { + return this.pos > this.bufferWaterline; + } + public dropParsedChunk(): void { - if (this.pos > this.bufferWaterline) { + if (this.willDropParsedChunk()) { this.html = this.html.substring(this.pos); this.lineStartPos -= this.pos; this.droppedBufferSize += this.pos; From 23e10735233358e1c79e397a9be453ed84c4f050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:21:23 +0000 Subject: [PATCH 5/8] Rm `it.only` --- .../parse5-html-rewriting-stream/test/rewriting-stream.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index 548e39b4b..73dba8563 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -328,7 +328,7 @@ describe('RewritingStream', () => { parser.end(); }); - it.only('Should emit comment after text correctly', (done) => { + it('Should emit comment after text correctly', (done) => { const source = `${'a'.repeat((1 << 16) - 3)}`; const parser = new RewritingStream(); let output = ''; From 0f56f385b2727cc58fa21f5288cbee1aface3df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:35:18 +0000 Subject: [PATCH 6/8] Use `createRewriterTest` for added tests --- .../test/rewriting-stream.test.ts | 69 +++++++------------ 1 file changed, 24 insertions(+), 45 deletions(-) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index 73dba8563..7c61cd0f8 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -17,6 +17,9 @@ const srcHtml = outdent` `; +const LONG_TEXT = 'a'.repeat((1 << 16) + 1); +const LONG_TEXT_WITH_COMMENT = `${'a'.repeat((1 << 16) - 5)}`; + function createRewriterTest({ src, expected, @@ -28,13 +31,17 @@ function createRewriterTest({ expected: string; assignTokenHandlers?: (rewriter: RewritingStream) => void; }) { - return (done: () => void): void => { + return (done: (err?: unknown) => void): void => { const rewriter = new RewritingStream(); const writable = new WritableStreamStub(); writable.once('close', () => { - assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected)); - done(); + try { + assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected)); + done(); + } catch (error) { + done(error); + } }); rewriter.pipe(writable); @@ -306,47 +313,19 @@ describe('RewritingStream', () => { assert.throws(() => stream.write(buf), TypeError); }); - it('Should pass long text correctly (GH-292)', (done) => { - const source = 'a'.repeat((1 << 16) + 1); - const parser = new RewritingStream(); - let output = ''; - - parser.on('data', (data) => { - output += data.toString(); - }); - - parser.once('close', () => { - try { - assert.strictEqual(output.length, source.length); - done(); - } catch (error) { - done(error); - } - }); - - parser.write(source); - parser.end(); - }); - - it('Should emit comment after text correctly', (done) => { - const source = `${'a'.repeat((1 << 16) - 3)}`; - const parser = new RewritingStream(); - let output = ''; - - parser.on('data', (data) => { - output += data.toString(); - }); - - parser.once('close', () => { - try { - assert.strictEqual(output.length, source.length); - done(); - } catch (error) { - done(error); - } - }); + it( + 'Should pass long text correctly (GH-292)', + createRewriterTest({ + src: LONG_TEXT, + expected: LONG_TEXT, + }) + ); - parser.write(source); - parser.end(); - }); + it( + 'Should emit comment after text correctly', + createRewriterTest({ + src: LONG_TEXT_WITH_COMMENT, + expected: LONG_TEXT_WITH_COMMENT, + }) + ); }); From 5b617676a12bbf9eb5ebab223250b669bad1648e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:46:40 +0000 Subject: [PATCH 7/8] Use `stream.finished` in test The `close` doesn't seem to work in Node 12. --- .../test/rewriting-stream.test.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index 7c61cd0f8..2139cb2b7 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -3,6 +3,7 @@ import { outdent } from 'outdent'; import { RewritingStream } from '../lib/index.js'; import { loadSAXParserTestData } from 'parse5-test-utils/utils/load-sax-parser-test-data.js'; import { getStringDiffMsg, writeChunkedToStream, WritableStreamStub } from 'parse5-test-utils/utils/common.js'; +import { finished } from 'node:stream'; const srcHtml = outdent` @@ -35,7 +36,7 @@ function createRewriterTest({ const rewriter = new RewritingStream(); const writable = new WritableStreamStub(); - writable.once('close', () => { + finished(writable, () => { try { assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected)); done(); @@ -297,7 +298,7 @@ describe('RewritingStream', () => { assert.strictEqual(text, 'text'); }); - parser.once('close', () => { + parser.once('finished', () => { assert.ok(foundText); done(); }); From 88f06b94b99e442f5365e4926515e45b4fe822d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:47:50 +0000 Subject: [PATCH 8/8] Update rewriting-stream.test.ts --- .../parse5-html-rewriting-stream/test/rewriting-stream.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts index 2139cb2b7..c38264fc5 100644 --- a/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts +++ b/packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts @@ -298,7 +298,7 @@ describe('RewritingStream', () => { assert.strictEqual(text, 'text'); }); - parser.once('finished', () => { + parser.once('finish', () => { assert.ok(foundText); done(); });