refactor: Remove getNextToken method #461

Merged (4 commits, Mar 17, 2022)
73 changes: 29 additions & 44 deletions packages/parse5-parser-stream/lib/index.ts
@@ -3,6 +3,8 @@ import { Parser, ParserOptions } from 'parse5/dist/parser/index.js';
import type { TreeAdapterTypeMap } from 'parse5/dist/tree-adapters/interface.js';
import type { DefaultTreeAdapterMap } from 'parse5/dist/tree-adapters/default.js';

/* eslint-disable unicorn/consistent-function-scoping -- The rule seems to be broken here. */

/**
* Streaming HTML parser with scripting support.
* A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
@@ -28,8 +30,7 @@ import type { DefaultTreeAdapterMap } from 'parse5/dist/tree-adapters/default.js
*/
export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> extends Writable {
private lastChunkWritten = false;
private writeCallback: null | (() => void) = null;
private pausedByScript = false;
private writeCallback: undefined | (() => void) = undefined;

public parser: Parser<T>;
private pendingHtmlInsertions: string[] = [];
@@ -42,7 +43,31 @@ export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>
constructor(options?: ParserOptions<T>) {
super({ decodeStrings: false });

this.parser = new Parser(options);
const resume = (): void => {
for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) {
this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]);
}

this.pendingHtmlInsertions.length = 0;

//NOTE: keep parsing if we don't wait for the next input chunk
this.parser.tokenizer.resume(this.writeCallback);
};

const documentWrite = (html: string): void => {
if (!this.parser.stopped) {
this.pendingHtmlInsertions.push(html);
}
};

const scriptHandler = (scriptElement: T['element']): void => {
if (this.listenerCount('script') > 0) {
this.parser.tokenizer.pause();
this.emit('script', scriptElement, documentWrite, resume);
}
};

this.parser = new Parser(options, undefined, undefined, scriptHandler);
this.document = this.parser.document;
}

@@ -53,8 +78,7 @@ export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>
}

this.writeCallback = callback;
this.parser.tokenizer.write(chunk, this.lastChunkWritten);
this._runParsingLoop();
this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
}

// TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.
@@ -64,45 +88,6 @@ export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>
this.lastChunkWritten = true;
super.end(chunk || '', encoding, callback);
}

//Scriptable parser implementation
private _runParsingLoop(): void {
this.parser.runParsingLoopForCurrentChunk(this.writeCallback, this._scriptHandler);
}

private _resume = (): void => {
if (!this.pausedByScript) {
throw new Error('Parser was already resumed');
}

while (this.pendingHtmlInsertions.length > 0) {
const html = this.pendingHtmlInsertions.pop()!;

this.parser.tokenizer.insertHtmlAtCurrentPos(html);
}

this.pausedByScript = false;

//NOTE: keep parsing if we don't wait for the next input chunk
if (this.parser.tokenizer.active) {
this._runParsingLoop();
}
};

private _documentWrite = (html: string): void => {
if (!this.parser.stopped) {
this.pendingHtmlInsertions.push(html);
}
};

private _scriptHandler = (scriptElement: T['element']): void => {
if (this.listenerCount('script') > 0) {
this.pausedByScript = true;
this.emit('script', scriptElement, this._documentWrite, this._resume);
} else {
this._runParsingLoop();
}
};
}

export interface ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
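For orientation, the stream's public contract is unchanged by this refactor: consumers still subscribe to the 'script' event and receive the element together with the `documentWrite` and `resume` callbacks that are now created in the constructor. A minimal usage sketch (the injected markup is purely illustrative):

```ts
import { ParserStream } from 'parse5-parser-stream';

const parser = new ParserStream();

// `documentWrite` queues HTML that a script would have produced via document.write();
// `resume` unpauses the tokenizer once the host is done with the script element.
parser.on('script', (_scriptElement, documentWrite, resume) => {
    documentWrite('<p>injected while the parser is paused</p>');
    resume();
});

// ParserStream is a Writable; the per-chunk callback is deferred while the
// parser is paused on a script and fires once the chunk has been tokenized.
parser.write('<!doctype html><body><script></script><p>after</p></body>', () => {
    console.log('chunk consumed');
});
parser.end();
```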
8 changes: 1 addition & 7 deletions packages/parse5-sax-parser/lib/index.ts
@@ -123,23 +123,17 @@ export class SAXParser extends Transform implements TokenHandler {
*/
public stop(): void {
this.stopped = true;
this.tokenizer.pause();
}

//Internals
protected _transformChunk(chunk: string): string {
if (!this.stopped) {
this.tokenizer.write(chunk, this.lastChunkWritten);
this._runParsingLoop();
}
return chunk;
}

private _runParsingLoop(): void {
while (!this.stopped && this.tokenizer.active) {
this.tokenizer.getNextToken();
}
}

/** @internal */
onCharacter({ chars, location }: CharacterToken): void {
if (this.pendingText === null) {
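On the SAX side, `stop()` now pauses the tokenizer directly instead of relying on the removed per-token loop, and `_transformChunk` simply forwards the chunk to `tokenizer.write`. A rough sketch of the observable behaviour (the markup and the stop condition are illustrative):

```ts
import { SAXParser } from 'parse5-sax-parser';

const sax = new SAXParser();

sax.on('startTag', ({ tagName }) => {
    // stop() pauses the tokenizer, so no further tokens are emitted for this
    // or later chunks; the raw text still passes through the Transform stream.
    if (tagName === 'body') sax.stop();
});

sax.write('<!doctype html><title>t</title><body><p>not tokenized</p></body>');
sax.end();
```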
38 changes: 3 additions & 35 deletions packages/parse5/lib/parser/index.ts
@@ -129,7 +129,8 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
public constructor(
options?: ParserOptions<T>,
document?: T['document'],
public fragmentContext: T['element'] | null = null
public fragmentContext: T['element'] | null = null,
public scriptHandler: null | ((pendingScript: T['element']) => void) = null
) {
this.options = {
...defaultParserOptions,
@@ -160,7 +161,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
const parser = new this(options);

parser.tokenizer.write(html, true);
parser._runParsingLoop(null);

return parser.document;
}
@@ -195,7 +195,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
parser._resetInsertionMode();
parser._findFormInFragmentContext();
parser.tokenizer.write(html, true);
parser._runParsingLoop(null);

const rootElement = opts.treeAdapter.getFirstChild(documentMock) as T['parentNode'];
const fragment = opts.treeAdapter.createDocumentFragment();
@@ -215,7 +214,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack

headElement: null | T['element'] = null;
formElement: null | T['element'] = null;
pendingScript: null | T['element'] = null;

openElements: OpenElementStack<T>;
activeFormattingElements: FormattingElementList<T>;
@@ -253,36 +251,6 @@ export class Parser<T extends TreeAdapterTypeMap> implements TokenHandler, Stack
this.onParseError(err);
}

//Parsing loop
private _runParsingLoop(scriptHandler: null | ((scriptElement: T['element']) => void)): void {
while (!this.stopped) {
this.tokenizer.getNextToken();

if (!this.tokenizer.active || (scriptHandler !== null && this.pendingScript)) {
break;
}
}
}

public runParsingLoopForCurrentChunk(
writeCallback: null | (() => void),
scriptHandler: (scriptElement: T['element']) => void
): void {
this._runParsingLoop(scriptHandler);

if (scriptHandler && this.pendingScript) {
const script = this.pendingScript;

this.pendingScript = null;

scriptHandler(script);

return;
}

writeCallback?.();
}

//Stack events
onItemPush(node: T['parentNode'], tid: number, isTop: boolean): void {
this.treeAdapter.onItemPush?.(node);
@@ -2576,7 +2544,7 @@ function eofInBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: EOFToken):
//------------------------------------------------------------------
function endTagInText<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagToken): void {
if (token.tagID === $.SCRIPT) {
p.pendingScript = p.openElements.current;
p.scriptHandler?.(p.openElements.current);
}

p.openElements.pop();
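With the parsing-loop methods gone, an embedder that previously used `runParsingLoopForCurrentChunk` now passes a script handler to the constructor and drives pausing and resuming itself. A minimal sketch of that wiring, mirroring what ParserStream does above (the microtask body stands in for real script execution and is not part of this diff):

```ts
import { Parser } from 'parse5/dist/parser/index.js';
import type { DefaultTreeAdapterMap } from 'parse5/dist/tree-adapters/default.js';

const parser = new Parser<DefaultTreeAdapterMap>(
    undefined, // options: defaults are merged in by the constructor
    undefined, // document: created internally, as in ParserStream
    null,      // fragmentContext
    (scriptElement) => {
        // Invoked from endTagInText when a </script> end tag is popped.
        parser.tokenizer.pause();
        queueMicrotask(() => {
            // ... execute the script in the host environment ...
            parser.tokenizer.resume();
        });
    }
);

parser.tokenizer.write('<!doctype html><script>doWork()</script>', true);
```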
51 changes: 41 additions & 10 deletions packages/parse5/lib/tokenizer/index.ts
@@ -223,8 +223,9 @@ export interface TokenHandler {
export class Tokenizer {
public preprocessor: Preprocessor;

/** Indicates that the next token has been emitted, and `getNextToken` should return. */
private hasEmitted = false;
private paused = false;
/** Ensures that the parsing loop isn't run multiple times at once. */
private inLoop = false;

/**
* Indicates that the current adjusted node exists, is not an element in the HTML namespace,
@@ -274,10 +275,12 @@ export class Tokenizer {
};
}

//API
public getNextToken(): void {
this.hasEmitted = false;
while (!this.hasEmitted && this.active) {
private _runParsingLoop(): void {
if (this.inLoop) return;

this.inLoop = true;

while (this.active && !this.paused) {
this.consumedAfterSnapshot = 0;

const cp = this._consume();
@@ -286,16 +289,46 @@
this._callState(cp);
}
}

this.inLoop = false;
}

public write(chunk: string, isLastChunk: boolean): void {
//API
public pause(): void {
this.paused = true;
}

public resume(writeCallback?: () => void): void {
if (!this.paused) {
throw new Error('Parser was already resumed');
}

this.paused = false;

// Necessary for synchronous resume.
if (this.inLoop) return;

this._runParsingLoop();

if (!this.paused) {
writeCallback?.();
}
}

public write(chunk: string, isLastChunk: boolean, writeCallback?: () => void): void {
this.active = true;
this.preprocessor.write(chunk, isLastChunk);
this._runParsingLoop();

if (!this.paused) {
writeCallback?.();
}
}

public insertHtmlAtCurrentPos(chunk: string): void {
this.active = true;
this.preprocessor.insertHtmlAtCurrentPos(chunk);
this._runParsingLoop();
}

//Hibernation
@@ -440,7 +473,6 @@
ct.location.endOffset = this.preprocessor.offset + 1;
}

this.hasEmitted = true;
this.currentLocation = this.getCurrentLocation(-1);
}

@@ -508,7 +540,6 @@
}
}

this.hasEmitted = true;
this.currentCharacterToken = null;
}
}
@@ -524,7 +555,7 @@

this._emitCurrentCharacterToken(location);
this.handler.onEof({ type: TokenType.EOF, location });
this.hasEmitted = true;
this.active = false;
}

//Characters emission
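The tokenizer now owns its loop: `write` and `insertHtmlAtCurrentPos` run it internally, `pause` stops it before the next iteration, and `resume` restarts it and reports completion through the callback handed back in. A rough sketch of the calling convention (the handler behaviour and markup are assumptions, not part of this diff):

```ts
import type { Tokenizer } from 'parse5/dist/tokenizer/index.js';

// Assume a Tokenizer whose TokenHandler calls tokenizer.pause() when it sees
// a </script> end tag, as the ParserStream script handler above does.
declare const tokenizer: Tokenizer;

const onChunkDone = (): void => console.log('chunk fully tokenized');

// If the handler pauses mid-chunk, write() returns without invoking onChunkDone.
tokenizer.write('<script>x()</script><p>rest</p>', true, onChunkDone);

// ... the host executes the script ...

// The caller passes the callback back in: resume() drains the remainder of the
// chunk and only then invokes it (unless the handler pauses again).
tokenizer.resume(onChunkDone);
```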
33 changes: 14 additions & 19 deletions packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts
@@ -3,30 +3,27 @@ import { Tokenizer, TokenizerMode, TokenHandler } from './index.js';
import { Location, EOFToken, CharacterToken, DoctypeToken, TagToken, CommentToken } from '../common/token.js';
import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js';

interface LocationInfoTestCase {
initialMode: typeof TokenizerMode[keyof typeof TokenizerMode];
lastStartTagName: string;
htmlChunks: string[];
}

/** Receives events and immediately compares them against the expected values. */
class LocationInfoHandler implements TokenHandler {
public sawEof = false;
/** All HTML chunks concatenated. */
private html: string;
/** The index of the last html chunk. */
private idx = 0;
/** All of the lines in the input. */
private lines: string[];

constructor(private testCase: LocationInfoTestCase, private html: string) {
this.lines = html.split(/\r?\n/g);
constructor(private htmlChunks: string[]) {
this.html = htmlChunks.join('');
this.lines = this.html.split(/\r?\n/g);
}

private validateLocation(location: Location | null): void {
assert.ok(location);

//Offsets
const actual = this.html.substring(location.startOffset, location.endOffset);
const chunk = this.testCase.htmlChunks[this.idx];
const chunk = this.htmlChunks[this.idx];

assert.strictEqual(actual, chunk);

@@ -65,7 +62,7 @@
assert.strictEqual(location.endOffset, location.startOffset);
assert.strictEqual(location.endOffset, this.html.length);

assert.strictEqual(this.idx, this.testCase.htmlChunks.length);
assert.strictEqual(this.idx, this.htmlChunks.length);

this.sawEof = true;
}
@@ -166,23 +163,21 @@ it('Location Info (Tokenizer)', () => {
];

for (const testCase of testCases) {
const html = testCase.htmlChunks.join('');
const handler = new LocationInfoHandler(testCase, html);
const { htmlChunks } = testCase;
const handler = new LocationInfoHandler(htmlChunks);
const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler);
const lastChunkIdx = testCase.htmlChunks.length - 1;

for (let i = 0; i < testCase.htmlChunks.length; i++) {
tokenizer.write(testCase.htmlChunks[i], i === lastChunkIdx);
}

// NOTE: set small waterline for testing purposes
tokenizer.preprocessor.bufferWaterline = 8;
tokenizer.state = testCase.initialMode;
tokenizer.lastStartTagName = testCase.lastStartTagName;
tokenizer.inForeignNode = !!testCase.inForeignNode;

while (!handler.sawEof) {
tokenizer.getNextToken();
for (let i = 0; i < htmlChunks.length; i++) {
tokenizer.write(htmlChunks[i], i === htmlChunks.length - 1);
}

assert.ok(handler.sawEof);
assert.ok(!tokenizer.active);
}
});