Skip to content

Commit

Permalink
refactor: Remove getNextToken method (#461)
Browse files Browse the repository at this point in the history
  • Loading branch information
fb55 committed Mar 17, 2022
1 parent 848851a commit dc5a0a6
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 127 deletions.
73 changes: 29 additions & 44 deletions packages/parse5-parser-stream/lib/index.ts
Expand Up @@ -3,6 +3,8 @@ import { Parser, ParserOptions } from 'parse5/dist/parser/index.js';
import type { TreeAdapterTypeMap } from 'parse5/dist/tree-adapters/interface.js';
import type { DefaultTreeAdapterMap } from 'parse5/dist/tree-adapters/default.js';

/* eslint-disable unicorn/consistent-function-scoping -- The rule seems to be broken here. */

/**
* Streaming HTML parser with scripting support.
* A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
Expand All @@ -28,8 +30,7 @@ import type { DefaultTreeAdapterMap } from 'parse5/dist/tree-adapters/default.js
*/
export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> extends Writable {
private lastChunkWritten = false;
private writeCallback: null | (() => void) = null;
private pausedByScript = false;
private writeCallback: undefined | (() => void) = undefined;

public parser: Parser<T>;
private pendingHtmlInsertions: string[] = [];
Expand All @@ -42,7 +43,31 @@ export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>
constructor(options?: ParserOptions<T>) {
super({ decodeStrings: false });

this.parser = new Parser(options);
// Handed to `script` listeners: flushes any HTML queued through `documentWrite`
// into the tokenizer and then resumes tokenization.
const resume = (): void => {
    const { tokenizer } = this.parser;
    const queued = this.pendingHtmlInsertions;

    // Insert from the most recently queued entry back to the first one.
    // NOTE(review): presumably each insertion lands in front of the previous one,
    // so the earliest write ends up first in the input — confirm against the preprocessor.
    let idx = queued.length;

    while (idx-- > 0) {
        tokenizer.insertHtmlAtCurrentPos(queued[idx]);
    }

    // Empty the queue in place so the same array instance keeps being used.
    queued.length = 0;

    //NOTE: keep parsing if we don't wait for the next input chunk
    tokenizer.resume(this.writeCallback);
};

// Handed to `script` listeners: queues HTML to be inserted when `resume` is called.
// Writes are dropped once the parser has been stopped.
const documentWrite = (html: string): void => {
    if (this.parser.stopped) {
        return;
    }

    this.pendingHtmlInsertions.push(html);
};

// Passed to the parser as its script handler. When at least one `script`
// listener is registered, tokenization is paused and the listener receives
// the script element together with the `documentWrite` and `resume` callbacks.
const scriptHandler = (scriptElement: T['element']): void => {
    const hasScriptListeners = this.listenerCount('script') > 0;

    // Nobody is listening: leave the tokenizer running.
    if (!hasScriptListeners) {
        return;
    }

    // Pause before emitting so a listener may call `resume` synchronously.
    this.parser.tokenizer.pause();
    this.emit('script', scriptElement, documentWrite, resume);
};

this.parser = new Parser(options, undefined, undefined, scriptHandler);
this.document = this.parser.document;
}

Expand All @@ -53,8 +78,7 @@ export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>
}

this.writeCallback = callback;
this.parser.tokenizer.write(chunk, this.lastChunkWritten);
this._runParsingLoop();
this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
}

// TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.
Expand All @@ -64,45 +88,6 @@ export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>
this.lastChunkWritten = true;
super.end(chunk || '', encoding, callback);
}

//Scriptable parser implementation
private _runParsingLoop(): void {
this.parser.runParsingLoopForCurrentChunk(this.writeCallback, this._scriptHandler);
}

private _resume = (): void => {
if (!this.pausedByScript) {
throw new Error('Parser was already resumed');
}

while (this.pendingHtmlInsertions.length > 0) {
const html = this.pendingHtmlInsertions.pop()!;

this.parser.tokenizer.insertHtmlAtCurrentPos(html);
}

this.pausedByScript = false;

//NOTE: keep parsing if we don't wait for the next input chunk
if (this.parser.tokenizer.active) {
this._runParsingLoop();
}
};

private _documentWrite = (html: string): void => {
if (!this.parser.stopped) {
this.pendingHtmlInsertions.push(html);
}
};

private _scriptHandler = (scriptElement: T['element']): void => {
if (this.listenerCount('script') > 0) {
this.pausedByScript = true;
this.emit('script', scriptElement, this._documentWrite, this._resume);
} else {
this._runParsingLoop();
}
};
}

export interface ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
Expand Down
8 changes: 1 addition & 7 deletions packages/parse5-sax-parser/lib/index.ts
Expand Up @@ -123,23 +123,17 @@ export class SAXParser extends Transform implements TokenHandler {
*/
public stop(): void {
// Mark the parser as stopped; `_transformChunk` checks this flag before writing further chunks to the tokenizer.
this.stopped = true;
// Also pause the tokenizer itself.
// NOTE(review): presumably this halts tokenization of the chunk currently being processed — confirm against Tokenizer.pause().
this.tokenizer.pause();
}

//Internals
protected _transformChunk(chunk: string): string {
if (!this.stopped) {
this.tokenizer.write(chunk, this.lastChunkWritten);
this._runParsingLoop();
}
return chunk;
}

private _runParsingLoop(): void {
while (!this.stopped && this.tokenizer.active) {
this.tokenizer.getNextToken();
}
}

/** @internal */
onCharacter({ chars, location }: CharacterToken): void {
if (this.pendingText === null) {
Expand Down
38 changes: 3 additions & 35 deletions packages/parse5/lib/parser/index.ts
Expand Up @@ -129,7 +129,8 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
public constructor(
options?: ParserOptions<T>,
document?: T['document'],
public fragmentContext: T['element'] | null = null
public fragmentContext: T['element'] | null = null,
public scriptHandler: null | ((pendingScript: T['element']) => void) = null
) {
this.options = {
...defaultParserOptions,
Expand Down Expand Up @@ -160,7 +161,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
const parser = new this(options);

parser.tokenizer.write(html, true);
parser._runParsingLoop(null);

return parser.document;
}
Expand Down Expand Up @@ -195,7 +195,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
parser._resetInsertionMode();
parser._findFormInFragmentContext();
parser.tokenizer.write(html, true);
parser._runParsingLoop(null);

const rootElement = opts.treeAdapter.getFirstChild(documentMock) as T['parentNode'];
const fragment = opts.treeAdapter.createDocumentFragment();
Expand All @@ -215,7 +214,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack

headElement: null | T['element'] = null;
formElement: null | T['element'] = null;
pendingScript: null | T['element'] = null;

openElements: OpenElementStack<T>;
activeFormattingElements: FormattingElementList<T>;
Expand Down Expand Up @@ -253,36 +251,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
this.onParseError(err);
}

//Parsing loop
private _runParsingLoop(scriptHandler: null | ((scriptElement: T['element']) => void)): void {
while (!this.stopped) {
this.tokenizer.getNextToken();

if (!this.tokenizer.active || (scriptHandler !== null && this.pendingScript)) {
break;
}
}
}

public runParsingLoopForCurrentChunk(
writeCallback: null | (() => void),
scriptHandler: (scriptElement: T['element']) => void
): void {
this._runParsingLoop(scriptHandler);

if (scriptHandler && this.pendingScript) {
const script = this.pendingScript;

this.pendingScript = null;

scriptHandler(script);

return;
}

writeCallback?.();
}

//Stack events
onItemPush(node: T['parentNode'], tid: number, isTop: boolean): void {
this.treeAdapter.onItemPush?.(node);
Expand Down Expand Up @@ -2576,7 +2544,7 @@ function eofInBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: EOFToken):
//------------------------------------------------------------------
function endTagInText<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagToken): void {
if (token.tagID === $.SCRIPT) {
p.pendingScript = p.openElements.current;
p.scriptHandler?.(p.openElements.current);
}

p.openElements.pop();
Expand Down
51 changes: 41 additions & 10 deletions packages/parse5/lib/tokenizer/index.ts
Expand Up @@ -223,8 +223,9 @@ export interface TokenHandler {
export class Tokenizer {
public preprocessor: Preprocessor;

/** Indicates that the next token has been emitted, and `getNextToken` should return. */
private hasEmitted = false;
private paused = false;
/** Ensures that the parsing loop isn't run multiple times at once. */
private inLoop = false;

/**
* Indicates that the current adjusted node exists, is not an element in the HTML namespace,
Expand Down Expand Up @@ -274,10 +275,12 @@ export class Tokenizer {
};
}

//API
public getNextToken(): void {
this.hasEmitted = false;
while (!this.hasEmitted && this.active) {
private _runParsingLoop(): void {
if (this.inLoop) return;

this.inLoop = true;

while (this.active && !this.paused) {
this.consumedAfterSnapshot = 0;

const cp = this._consume();
Expand All @@ -286,16 +289,46 @@ export class Tokenizer {
this._callState(cp);
}
}

this.inLoop = false;
}

public write(chunk: string, isLastChunk: boolean): void {
//API
/**
 * Requests that tokenization be suspended by setting the `paused` flag.
 * The parsing loop checks this flag before every iteration; `resume()` clears it again.
 */
public pause(): void {
this.paused = true;
}

/**
 * Clears the `paused` flag and, unless a parsing loop is already running,
 * runs the parsing loop.
 *
 * @param writeCallback Invoked after the loop returns, provided the tokenizer
 *   was not paused again while it ran. Not invoked when `resume` is called
 *   from inside a running loop.
 * @throws {Error} If the tokenizer is not currently paused.
 */
public resume(writeCallback?: () => void): void {
    if (!this.paused) {
        throw new Error('Parser was already resumed');
    }

    this.paused = false;

    // Necessary for synchronous resume: when called from inside the running
    // loop, clearing the flag is enough and that loop simply carries on.
    if (!this.inLoop) {
        this._runParsingLoop();

        // Paused again while the loop ran: leave the callback uncalled.
        if (this.paused) {
            return;
        }

        writeCallback?.();
    }
}

/**
 * Marks the tokenizer active, passes `chunk` to the preprocessor and runs the
 * parsing loop.
 *
 * @param chunk Input text to hand to the preprocessor.
 * @param isLastChunk Forwarded to the preprocessor together with `chunk`.
 * @param writeCallback Invoked once the loop has returned, unless the
 *   tokenizer is paused at that point.
 */
public write(chunk: string, isLastChunk: boolean, writeCallback?: () => void): void {
    this.active = true;
    this.preprocessor.write(chunk, isLastChunk);
    this._runParsingLoop();

    // Still paused after the loop returned: do not signal completion here.
    if (this.paused) {
        return;
    }

    writeCallback?.();
}

/**
 * Marks the tokenizer active, hands `chunk` to the preprocessor's
 * `insertHtmlAtCurrentPos`, and then attempts to run the parsing loop.
 */
public insertHtmlAtCurrentPos(chunk: string): void {
this.active = true;
this.preprocessor.insertHtmlAtCurrentPos(chunk);
// Does no tokenizing when a loop is already running or the tokenizer is paused (see the checks in `_runParsingLoop`).
this._runParsingLoop();
}

//Hibernation
Expand Down Expand Up @@ -440,7 +473,6 @@ export class Tokenizer {
ct.location.endOffset = this.preprocessor.offset + 1;
}

this.hasEmitted = true;
this.currentLocation = this.getCurrentLocation(-1);
}

Expand Down Expand Up @@ -508,7 +540,6 @@ export class Tokenizer {
}
}

this.hasEmitted = true;
this.currentCharacterToken = null;
}
}
Expand All @@ -524,7 +555,7 @@ export class Tokenizer {

this._emitCurrentCharacterToken(location);
this.handler.onEof({ type: TokenType.EOF, location });
this.hasEmitted = true;
this.active = false;
}

//Characters emission
Expand Down
33 changes: 14 additions & 19 deletions packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts
Expand Up @@ -3,30 +3,27 @@ import { Tokenizer, TokenizerMode, TokenHandler } from './index.js';
import { Location, EOFToken, CharacterToken, DoctypeToken, TagToken, CommentToken } from '../common/token.js';
import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js';

interface LocationInfoTestCase {
initialMode: typeof TokenizerMode[keyof typeof TokenizerMode];
lastStartTagName: string;
htmlChunks: string[];
}

/** Receives events and immediately compares them against the expected values. */
class LocationInfoHandler implements TokenHandler {
public sawEof = false;
/** All HTML chunks concatenated. */
private html: string;
/** The index of the last html chunk. */
private idx = 0;
/** All of the lines in the input. */
private lines: string[];

constructor(private testCase: LocationInfoTestCase, private html: string) {
this.lines = html.split(/\r?\n/g);
constructor(private htmlChunks: string[]) {
this.html = htmlChunks.join('');
this.lines = this.html.split(/\r?\n/g);
}

private validateLocation(location: Location | null): void {
assert.ok(location);

//Offsets
const actual = this.html.substring(location.startOffset, location.endOffset);
const chunk = this.testCase.htmlChunks[this.idx];
const chunk = this.htmlChunks[this.idx];

assert.strictEqual(actual, chunk);

Expand Down Expand Up @@ -65,7 +62,7 @@ class LocationInfoHandler implements TokenHandler {
assert.strictEqual(location.endOffset, location.startOffset);
assert.strictEqual(location.endOffset, this.html.length);

assert.strictEqual(this.idx, this.testCase.htmlChunks.length);
assert.strictEqual(this.idx, this.htmlChunks.length);

this.sawEof = true;
}
Expand Down Expand Up @@ -166,23 +163,21 @@ it('Location Info (Tokenizer)', () => {
];

for (const testCase of testCases) {
const html = testCase.htmlChunks.join('');
const handler = new LocationInfoHandler(testCase, html);
const { htmlChunks } = testCase;
const handler = new LocationInfoHandler(htmlChunks);
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler);
const lastChunkIdx = testCase.htmlChunks.length - 1;

for (let i = 0; i < testCase.htmlChunks.length; i++) {
tokenizer.write(testCase.htmlChunks[i], i === lastChunkIdx);
}

// NOTE: set small waterline for testing purposes
tokenizer.preprocessor.bufferWaterline = 8;
tokenizer.state = testCase.initialMode;
tokenizer.lastStartTagName = testCase.lastStartTagName;
tokenizer.inForeignNode = !!testCase.inForeignNode;

while (!handler.sawEof) {
tokenizer.getNextToken();
for (let i = 0; i < htmlChunks.length; i++) {
tokenizer.write(htmlChunks[i], i === htmlChunks.length - 1);
}

assert.ok(handler.sawEof);
assert.ok(!tokenizer.active);
}
});

0 comments on commit dc5a0a6

Please sign in to comment.