fix(tokenizer): Drop chunks after emitting tokens (#432)
fb55 committed Mar 4, 2022
1 parent 6e7b230 commit 790c756
Showing 4 changed files with 44 additions and 15 deletions.
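In short: the tokenizer's preprocessor frees the already-consumed part of its input buffer once the read position passes a waterline. Previously that drop ran on entry to the Data, RCDATA, RAWTEXT, Script data and PLAINTEXT states, which could discard input that a still-open character token (or the SAX parser's pending text) still referred to, so long text runs could come out wrong (GH-292). This commit moves the drop to the points where tokens are emitted and adds Preprocessor#willDropParsedChunk() so the SAX parser can flush pending text just before the buffer is trimmed. A minimal repro sketch for the original symptom (the import path and the 1 << 16 waterline size are assumptions inferred from the monorepo layout and the new test constants):

import { RewritingStream } from 'parse5-html-rewriting-stream';

// Repro sketch for GH-292: a single text run one character past the assumed waterline.
const input = 'a'.repeat((1 << 16) + 1);
const rewriter = new RewritingStream();

let output = '';
rewriter.on('data', (chunk: string | Buffer) => {
    output += chunk.toString();
});
rewriter.on('end', () => {
    // The rewriter is transparent for untouched tokens, so with this fix
    // the output matches the input exactly.
    console.log(output.length === input.length);
});
rewriter.end(input);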
@@ -3,6 +3,7 @@ import { outdent } from 'outdent';
import { RewritingStream } from '../lib/index.js';
import { loadSAXParserTestData } from 'parse5-test-utils/utils/load-sax-parser-test-data.js';
import { getStringDiffMsg, writeChunkedToStream, WritableStreamStub } from 'parse5-test-utils/utils/common.js';
+import { finished } from 'node:stream';

const srcHtml = outdent`
<!DOCTYPE html "">
@@ -17,6 +18,9 @@ const srcHtml = outdent`
</html>
`;

+const LONG_TEXT = 'a'.repeat((1 << 16) + 1);
+const LONG_TEXT_WITH_COMMENT = `${'a'.repeat((1 << 16) - 5)}<!-- comment -->`;

function createRewriterTest({
src,
expected,
@@ -28,13 +32,17 @@
expected: string;
assignTokenHandlers?: (rewriter: RewritingStream) => void;
}) {
-return (done: () => void): void => {
+return (done: (err?: unknown) => void): void => {
const rewriter = new RewritingStream();
const writable = new WritableStreamStub();

-writable.once('finish', () => {
-assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected));
-done();
+finished(writable, () => {
+try {
+assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected));
+done();
+} catch (error) {
+done(error);
+}
});

rewriter.pipe(writable);
@@ -305,4 +313,20 @@ describe('RewritingStream', () => {

assert.throws(() => stream.write(buf), TypeError);
});

+it(
+'Should pass long text correctly (GH-292)',
+createRewriterTest({
+src: LONG_TEXT,
+expected: LONG_TEXT,
+})
+);

+it(
+'Should emit comment after text correctly',
+createRewriterTest({
+src: LONG_TEXT_WITH_COMMENT,
+expected: LONG_TEXT_WITH_COMMENT,
+})
+);
});
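The new cases above drive the rewriter through createRewriterTest; in application code RewritingStream is used as an ordinary Transform in a pipeline. A short usage sketch (file paths are placeholders, import path as assumed above); with this commit, text runs larger than the waterline pass through such a pipeline unmodified:

import { createReadStream, createWriteStream } from 'node:fs';
import { RewritingStream } from 'parse5-html-rewriting-stream';

const rewriter = new RewritingStream();

// Token types with a listener must be re-emitted explicitly; all other tokens,
// including arbitrarily long text, are passed through verbatim.
rewriter.on('startTag', (startTag) => {
    if (startTag.tagName === 'a') {
        startTag.attrs.push({ name: 'rel', value: 'noopener' });
    }
    rewriter.emitStartTag(startTag);
});

createReadStream('./in.html', { encoding: 'utf8' })
    .pipe(rewriter)
    .pipe(createWriteStream('./out.html'));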
4 changes: 4 additions & 0 deletions packages/parse5-sax-parser/lib/index.ts
@@ -156,6 +156,10 @@ export class SAXParser extends Transform implements TokenHandler {
};
}
}

+if (this.tokenizer.preprocessor.willDropParsedChunk()) {
+this._emitPendingText();
+}
}

/** @internal */
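The hunk above is the consumer side of the new hook: before the preprocessor trims its buffer, the SAX parser finalizes the text it has been accumulating. A minimal sketch of that ordering with stand-in names (the interface and function below are illustrative, not the parse5 internals):

// Stand-ins for Preprocessor and the SAX parser's pending-text flush.
interface ParsedChunkBuffer {
    willDropParsedChunk(): boolean;
    dropParsedChunk(): void;
}

function emitThenDrop(buffer: ParsedChunkBuffer, flushPendingText: () => void): void {
    // First, anything that still points into the buffered input (pending text,
    // source locations) is emitted while the data is still there...
    if (buffer.willDropParsedChunk()) {
        flushPendingText();
    }
    // ...and only then is the consumed prefix of the buffer discarded. In the
    // real code the drop is issued by the tokenizer's emit* methods.
    buffer.dropParsedChunk();
}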
17 changes: 7 additions & 10 deletions packages/parse5/lib/tokenizer/index.ts
@@ -465,16 +465,22 @@ export class Tokenizer {

this.handler.onEndTag(ct);
}

+this.preprocessor.dropParsedChunk();
}

private emitCurrentComment(ct: CommentToken): void {
this.prepareToken(ct);
this.handler.onComment(ct);

+this.preprocessor.dropParsedChunk();
}

private emitCurrentDoctype(ct: DoctypeToken): void {
this.prepareToken(ct);
this.handler.onDoctype(ct);

+this.preprocessor.dropParsedChunk();
}

private _emitCurrentCharacterToken(nextLocation: Location | null): void {
@@ -536,6 +542,7 @@
if (this.currentCharacterToken.type !== type) {
this.currentLocation = this.getCurrentLocation(0);
this._emitCurrentCharacterToken(this.currentLocation);
+this.preprocessor.dropParsedChunk();
} else {
this.currentCharacterToken.chars += ch;
return;
@@ -969,8 +976,6 @@
// Data state
//------------------------------------------------------------------
private _stateData(cp: number): void {
-this.preprocessor.dropParsedChunk();

switch (cp) {
case $.LESS_THAN_SIGN: {
this.state = State.TAG_OPEN;
@@ -999,8 +1004,6 @@
// RCDATA state
//------------------------------------------------------------------
private _stateRcdata(cp: number): void {
-this.preprocessor.dropParsedChunk();

switch (cp) {
case $.AMPERSAND: {
this.returnState = State.RCDATA;
@@ -1029,8 +1032,6 @@
// RAWTEXT state
//------------------------------------------------------------------
private _stateRawtext(cp: number): void {
-this.preprocessor.dropParsedChunk();

switch (cp) {
case $.LESS_THAN_SIGN: {
this.state = State.RAWTEXT_LESS_THAN_SIGN;
@@ -1054,8 +1055,6 @@
// Script data state
//------------------------------------------------------------------
private _stateScriptData(cp: number): void {
-this.preprocessor.dropParsedChunk();

switch (cp) {
case $.LESS_THAN_SIGN: {
this.state = State.SCRIPT_DATA_LESS_THAN_SIGN;
@@ -1079,8 +1078,6 @@
// PLAINTEXT state
//------------------------------------------------------------------
private _statePlaintext(cp: number): void {
-this.preprocessor.dropParsedChunk();

switch (cp) {
case $.NULL: {
this._err(ERR.unexpectedNullCharacter);
6 changes: 5 additions & 1 deletion packages/parse5/lib/tokenizer/preprocessor.ts
@@ -97,8 +97,12 @@ export class Preprocessor {
return cp;
}

+public willDropParsedChunk(): boolean {
+return this.pos > this.bufferWaterline;
+}

public dropParsedChunk(): void {
-if (this.pos > this.bufferWaterline) {
+if (this.willDropParsedChunk()) {
this.html = this.html.substring(this.pos);
this.lineStartPos -= this.pos;
this.droppedBufferSize += this.pos;
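Finally, a usage sketch of the affected SAX parser on a large document (the input path is a placeholder). SAXParser is a pass-through Transform, so the readable side needs to be drained for the pipeline to keep flowing; with this commit, pending text is also flushed whenever a parsed chunk is about to be dropped, so no text is lost on inputs larger than the waterline:

import { createReadStream } from 'node:fs';
import { SAXParser } from 'parse5-sax-parser';

const parser = new SAXParser();

let textLength = 0;
parser.on('text', ({ text }) => {
    textLength += text.length;
});
parser.on('finish', () => {
    console.log(`text characters seen: ${textLength}`);
});

createReadStream('./large.html', { encoding: 'utf8' }).pipe(parser);
// Drain the pass-through output so backpressure does not stall the read stream.
parser.resume();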
