Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(tokenizer): Use explicit offsets for locations #402

Merged
merged 4 commits into from Feb 24, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
68 changes: 35 additions & 33 deletions packages/parse5/lib/tokenizer/index.ts
Expand Up @@ -220,6 +220,7 @@ export class Tokenizer {

private consumedAfterSnapshot = -1;

private ctLoc: Location | null;
private currentCharacterToken: CharacterToken | null = null;
private currentToken: Token | null = null;
private currentAttr: Attribute = { name: '', value: '' };
Expand All @@ -231,6 +232,7 @@ export class Tokenizer {
this.addLocationInfo = !!options.sourceCodeLocationInfo;
this.onParseError = options.onParseError ?? null;
this.preprocessor = new Preprocessor(options);
this.ctLoc = this.getCurrentLocation(-1);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not 0?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doubly negated offset makes things a bit difficult here. An offset of +1 means "start one character ago". -1 means "start one character after our last seen one".

The preprocessor is initiated at position -1. We access the location here before we read anything, so have to give a negative offset of -1 to get the position of the first character in the input. The same is done again in _emitCurrentToken.

I have played with flipping the sign of getCurrentLocation's offset, but then other places are hard to read.

}

//Errors
Expand All @@ -239,16 +241,17 @@ export class Tokenizer {
}

private currentAttrLocation: Location | null = null;
private ctLoc: Location | null = null;
private _getCurrentLocation(): Location | null {

// NOTE: `offset` may never run across line boundaries.
private getCurrentLocation(offset: number): Location | null {
wooorm marked this conversation as resolved.
Show resolved Hide resolved
if (!this.addLocationInfo) {
return null;
}

return {
startLine: this.preprocessor.line,
startCol: this.preprocessor.col,
startOffset: this.preprocessor.offset,
startCol: this.preprocessor.col - offset,
startOffset: this.preprocessor.offset - offset,
endLine: -1,
endCol: -1,
endOffset: -1,
Expand Down Expand Up @@ -334,7 +337,7 @@ export class Tokenizer {
selfClosing: false,
ackSelfClosing: false,
attrs: [],
location: this.ctLoc,
location: this.getCurrentLocation(1),
};
}

Expand All @@ -346,15 +349,15 @@ export class Tokenizer {
selfClosing: false,
ackSelfClosing: false,
attrs: [],
location: this.ctLoc,
location: this.getCurrentLocation(2),
};
}

private _createCommentToken(): void {
private _createCommentToken(offset: number): void {
this.currentToken = {
type: TokenType.COMMENT,
data: '',
location: this.ctLoc,
location: this.getCurrentLocation(offset),
};
}

Expand All @@ -378,15 +381,15 @@ export class Tokenizer {
}

private _createEOFToken(): void {
const ctLoc = this._getCurrentLocation();
const location = this.getCurrentLocation(0);

if (ctLoc) {
ctLoc.endLine = ctLoc.startLine;
ctLoc.endCol = ctLoc.startCol;
ctLoc.endOffset = ctLoc.startOffset;
if (location) {
location.endLine = location.startLine;
location.endCol = location.startCol;
location.endOffset = location.startOffset;
}

this.currentToken = { type: TokenType.EOF, location: ctLoc };
this.currentToken = { type: TokenType.EOF, location };
}

//Tag attributes
Expand All @@ -395,7 +398,7 @@ export class Tokenizer {
name: attrNameFirstCh,
value: '',
};
this.currentAttrLocation = this._getCurrentLocation();
this.currentAttrLocation = this.getCurrentLocation(0);
}

private _leaveAttrName(): void {
Expand Down Expand Up @@ -426,10 +429,10 @@ export class Tokenizer {

//Token emission
private _emitCurrentToken(): void {
this._emitCurrentCharacterToken();

const ct = this.currentToken!;

this._emitCurrentCharacterToken(ct.location);

this.currentToken = null;

//NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate.
Expand Down Expand Up @@ -462,16 +465,17 @@ export class Tokenizer {
}

this.tokenQueue.push(ct);
this.ctLoc = this.getCurrentLocation(-1);
}

private _emitCurrentCharacterToken(): void {
private _emitCurrentCharacterToken(nextLocation: Location | null): void {
if (this.currentCharacterToken) {
//NOTE: if we have pending character token make it's end location equal to the
//current token's start location.
if (this.ctLoc && this.currentCharacterToken.location) {
this.currentCharacterToken.location.endLine = this.ctLoc.startLine;
this.currentCharacterToken.location.endCol = this.ctLoc.startCol;
this.currentCharacterToken.location.endOffset = this.ctLoc.startOffset;
if (nextLocation && this.currentCharacterToken.location) {
this.currentCharacterToken.location.endLine = nextLocation.startLine;
this.currentCharacterToken.location.endCol = nextLocation.startCol;
this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
}

this.tokenQueue.push(this.currentCharacterToken);
Expand All @@ -496,7 +500,8 @@ export class Tokenizer {
//3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void {
if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
this._emitCurrentCharacterToken();
this.ctLoc = this.getCurrentLocation(0);
this._emitCurrentCharacterToken(this.ctLoc);
}

if (this.currentCharacterToken) {
Expand Down Expand Up @@ -925,7 +930,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateData(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand Down Expand Up @@ -956,7 +960,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateRcdata(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.AMPERSAND: {
Expand Down Expand Up @@ -987,7 +990,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateRawtext(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand All @@ -1013,7 +1015,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateScriptData(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand All @@ -1039,7 +1040,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _statePlaintext(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.NULL: {
Expand Down Expand Up @@ -1076,7 +1076,7 @@ export class Tokenizer {
}
case $.QUESTION_MARK: {
this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
this._createCommentToken();
this._createCommentToken(1);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
break;
Expand Down Expand Up @@ -1118,7 +1118,7 @@ export class Tokenizer {
}
default: {
this._err(ERR.invalidFirstCharacterOfTagName);
this._createCommentToken();
this._createCommentToken(2);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
}
Expand Down Expand Up @@ -1956,16 +1956,18 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateMarkupDeclarationOpen(cp: number): void {
if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) {
this._createCommentToken();
this._createCommentToken($$.DASH_DASH.length + 1);
this.state = State.COMMENT_START;
} else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) {
// NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here.
this.ctLoc = this.getCurrentLocation($$.DOCTYPE.length + 1);
this.state = State.DOCTYPE;
} else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) {
if (this.allowCDATA) {
this.state = State.CDATA_SECTION;
} else {
this._err(ERR.cdataInHtmlContent);
this._createCommentToken();
this._createCommentToken($$.CDATA_START.length + 1);
(this.currentToken as CommentToken).data = '[CDATA[';
this.state = State.BOGUS_COMMENT;
}
Expand All @@ -1975,7 +1977,7 @@ export class Tokenizer {
//results are no longer valid and we will need to start over.
else if (!this._ensureHibernation()) {
this._err(ERR.incorrectlyOpenedComment);
this._createCommentToken();
this._createCommentToken(2);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
}
Expand Down
5 changes: 3 additions & 2 deletions packages/parse5/lib/tokenizer/preprocessor.ts
Expand Up @@ -17,7 +17,8 @@ const DEFAULT_BUFFER_WATERLINE = 1 << 16;
export class Preprocessor {
public html = '';
private pos = -1;
private lastGapPos = -1;
// NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
private lastGapPos = -2;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-2 seems weird, undefined or so perhaps? Or would that make the types too complex?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would complicate things quite a bit. The issue here is the check Number(this.lastGapPos !== this.pos) below. -2 is weird, but does make things work as expected.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Infinity (or -Infinity) perhaps?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-Infinity seems even weirder to me tbh 😄

-2 still has a resemblance to the not found return value of indexOf, with -Infinity I am taken aback by the sudden floating point number for something that is supposed to be an index.

private gapStack: number[] = [];
private skipNextNewLine = false;
private lastChunkWritten = false;
Expand Down Expand Up @@ -106,7 +107,7 @@ export class Preprocessor {
this.lineStartPos -= this.pos;
this.droppedBufferSize += this.pos;
this.pos = 0;
this.lastGapPos = -1;
this.lastGapPos = -2;
this.gapStack.length = 0;
}
}
Expand Down
19 changes: 19 additions & 0 deletions packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts
Expand Up @@ -77,6 +77,24 @@ it('Location Info (Tokenizer)', () => {
lastStartTagName: 'plaintext',
htmlChunks: ['Text', ' \n', 'Test</plaintext><div>'],
},
{
initialMode: TokenizerMode.DATA,
lastStartTagName: '',
htmlChunks: [
'\n',
'<!-- regular comment -->',
'<! bogus comment >',
'<? another bogus comment >',
'</!yet another bogus comment>',
'<![CDATA[ cdata as a bogus comment >',
],
},
{
initialMode: TokenizerMode.DATA,
lastStartTagName: '',
allowCDATA: true,
htmlChunks: ['<a>', '<![CDATA[ ', 'CDATA', ' ]]>', '<test>', ' <![CDATA[ ]]>\n'],
},
];

for (const testCase of testCases) {
Expand All @@ -93,6 +111,7 @@ it('Location Info (Tokenizer)', () => {
tokenizer.preprocessor.bufferWaterline = 8;
tokenizer.state = testCase.initialMode;
tokenizer.lastStartTagName = testCase.lastStartTagName;
tokenizer.allowCDATA = !!testCase.allowCDATA;

for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) {
if (token.type === TokenType.HIBERNATION) {
Expand Down