Skip to content

Commit

Permalink
refactor(tokenizer): Use explicit offsets for locations (inikulin#402)
Browse files Browse the repository at this point in the history
  • Loading branch information
fb55 authored and jmbpwtw committed Feb 16, 2023
1 parent 5b10ce8 commit 68f90b5
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 44 deletions.
84 changes: 42 additions & 42 deletions packages/parse5/lib/tokenizer/index.ts
Expand Up @@ -220,6 +220,7 @@ export class Tokenizer {

private consumedAfterSnapshot = -1;

private currentLocation: Location | null;
private currentCharacterToken: CharacterToken | null = null;
private currentToken: Token | null = null;
private currentAttr: Attribute = { name: '', value: '' };
Expand All @@ -231,24 +232,24 @@ export class Tokenizer {
this.addLocationInfo = !!options.sourceCodeLocationInfo;
this.onParseError = options.onParseError ?? null;
this.preprocessor = new Preprocessor(options);
this.currentLocation = this.getCurrentLocation(-1);
}

//Errors
private _err(code: ERR): void {
this.onParseError?.(this.preprocessor.getError(code));
}

private currentAttrLocation: Location | null = null;
private ctLoc: Location | null = null;
private _getCurrentLocation(): Location | null {
// NOTE: `offset` may never run across line boundaries.
private getCurrentLocation(offset: number): Location | null {
if (!this.addLocationInfo) {
return null;
}

return {
startLine: this.preprocessor.line,
startCol: this.preprocessor.col,
startOffset: this.preprocessor.offset,
startCol: this.preprocessor.col - offset,
startOffset: this.preprocessor.offset - offset,
endLine: -1,
endCol: -1,
endOffset: -1,
Expand Down Expand Up @@ -334,7 +335,7 @@ export class Tokenizer {
selfClosing: false,
ackSelfClosing: false,
attrs: [],
location: this.ctLoc,
location: this.getCurrentLocation(1),
};
}

Expand All @@ -346,15 +347,15 @@ export class Tokenizer {
selfClosing: false,
ackSelfClosing: false,
attrs: [],
location: this.ctLoc,
location: this.getCurrentLocation(2),
};
}

private _createCommentToken(): void {
private _createCommentToken(offset: number): void {
this.currentToken = {
type: TokenType.COMMENT,
data: '',
location: this.ctLoc,
location: this.getCurrentLocation(offset),
};
}

Expand All @@ -365,28 +366,28 @@ export class Tokenizer {
forceQuirks: false,
publicId: null,
systemId: null,
location: this.ctLoc,
location: this.currentLocation,
};
}

private _createCharacterToken(type: CharacterToken['type'], chars: string): void {
this.currentCharacterToken = {
type,
chars,
location: this.ctLoc,
location: this.currentLocation,
};
}

private _createEOFToken(): void {
const ctLoc = this._getCurrentLocation();
const location = this.getCurrentLocation(0);

if (ctLoc) {
ctLoc.endLine = ctLoc.startLine;
ctLoc.endCol = ctLoc.startCol;
ctLoc.endOffset = ctLoc.startOffset;
if (location) {
location.endLine = location.startLine;
location.endCol = location.startCol;
location.endOffset = location.startOffset;
}

this.currentToken = { type: TokenType.EOF, location: ctLoc };
this.currentToken = { type: TokenType.EOF, location };
}

//Tag attributes
Expand All @@ -395,7 +396,7 @@ export class Tokenizer {
name: attrNameFirstCh,
value: '',
};
this.currentAttrLocation = this._getCurrentLocation();
this.currentLocation = this.getCurrentLocation(0);
}

private _leaveAttrName(): void {
Expand All @@ -404,9 +405,9 @@ export class Tokenizer {
if (getTokenAttr(token, this.currentAttr.name) === null) {
token.attrs.push(this.currentAttr);

if (token.location) {
if (token.location && this.currentLocation) {
const attrLocations = (token.location.attrs ??= Object.create(null));
attrLocations[this.currentAttr.name] = this.currentAttrLocation!;
attrLocations[this.currentAttr.name] = this.currentLocation;

// Set end location
this._leaveAttrValue();
Expand All @@ -417,19 +418,19 @@ export class Tokenizer {
}

private _leaveAttrValue(): void {
if (this.currentAttrLocation) {
this.currentAttrLocation.endLine = this.preprocessor.line;
this.currentAttrLocation.endCol = this.preprocessor.col;
this.currentAttrLocation.endOffset = this.preprocessor.offset;
if (this.currentLocation) {
this.currentLocation.endLine = this.preprocessor.line;
this.currentLocation.endCol = this.preprocessor.col;
this.currentLocation.endOffset = this.preprocessor.offset;
}
}

//Token emission
private _emitCurrentToken(): void {
this._emitCurrentCharacterToken();

const ct = this.currentToken!;

this._emitCurrentCharacterToken(ct.location);

this.currentToken = null;

//NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate.
Expand Down Expand Up @@ -462,16 +463,17 @@ export class Tokenizer {
}

this.tokenQueue.push(ct);
this.currentLocation = this.getCurrentLocation(-1);
}

private _emitCurrentCharacterToken(): void {
private _emitCurrentCharacterToken(nextLocation: Location | null): void {
if (this.currentCharacterToken) {
//NOTE: if we have pending character token make it's end location equal to the
//current token's start location.
if (this.ctLoc && this.currentCharacterToken.location) {
this.currentCharacterToken.location.endLine = this.ctLoc.startLine;
this.currentCharacterToken.location.endCol = this.ctLoc.startCol;
this.currentCharacterToken.location.endOffset = this.ctLoc.startOffset;
if (nextLocation && this.currentCharacterToken.location) {
this.currentCharacterToken.location.endLine = nextLocation.startLine;
this.currentCharacterToken.location.endCol = nextLocation.startCol;
this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
}

this.tokenQueue.push(this.currentCharacterToken);
Expand All @@ -496,7 +498,8 @@ export class Tokenizer {
//3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void {
if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
this._emitCurrentCharacterToken();
this.currentLocation = this.getCurrentLocation(0);
this._emitCurrentCharacterToken(this.currentLocation);
}

if (this.currentCharacterToken) {
Expand Down Expand Up @@ -925,7 +928,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateData(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand Down Expand Up @@ -956,7 +958,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateRcdata(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.AMPERSAND: {
Expand Down Expand Up @@ -987,7 +988,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateRawtext(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand All @@ -1013,7 +1013,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateScriptData(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand All @@ -1039,7 +1038,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _statePlaintext(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.NULL: {
Expand Down Expand Up @@ -1076,7 +1074,7 @@ export class Tokenizer {
}
case $.QUESTION_MARK: {
this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
this._createCommentToken();
this._createCommentToken(1);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
break;
Expand Down Expand Up @@ -1118,7 +1116,7 @@ export class Tokenizer {
}
default: {
this._err(ERR.invalidFirstCharacterOfTagName);
this._createCommentToken();
this._createCommentToken(2);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
}
Expand Down Expand Up @@ -1956,16 +1954,18 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateMarkupDeclarationOpen(cp: number): void {
if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) {
this._createCommentToken();
this._createCommentToken($$.DASH_DASH.length + 1);
this.state = State.COMMENT_START;
} else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) {
// NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here.
this.currentLocation = this.getCurrentLocation($$.DOCTYPE.length + 1);
this.state = State.DOCTYPE;
} else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) {
if (this.allowCDATA) {
this.state = State.CDATA_SECTION;
} else {
this._err(ERR.cdataInHtmlContent);
this._createCommentToken();
this._createCommentToken($$.CDATA_START.length + 1);
(this.currentToken as CommentToken).data = '[CDATA[';
this.state = State.BOGUS_COMMENT;
}
Expand All @@ -1975,7 +1975,7 @@ export class Tokenizer {
//results are no longer valid and we will need to start over.
else if (!this._ensureHibernation()) {
this._err(ERR.incorrectlyOpenedComment);
this._createCommentToken();
this._createCommentToken(2);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
}
Expand Down
5 changes: 3 additions & 2 deletions packages/parse5/lib/tokenizer/preprocessor.ts
Expand Up @@ -17,7 +17,8 @@ const DEFAULT_BUFFER_WATERLINE = 1 << 16;
export class Preprocessor {
public html = '';
private pos = -1;
private lastGapPos = -1;
// NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
private lastGapPos = -2;
private gapStack: number[] = [];
private skipNextNewLine = false;
private lastChunkWritten = false;
Expand Down Expand Up @@ -106,7 +107,7 @@ export class Preprocessor {
this.lineStartPos -= this.pos;
this.droppedBufferSize += this.pos;
this.pos = 0;
this.lastGapPos = -1;
this.lastGapPos = -2;
this.gapStack.length = 0;
}
}
Expand Down
19 changes: 19 additions & 0 deletions packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts
Expand Up @@ -77,6 +77,24 @@ it('Location Info (Tokenizer)', () => {
lastStartTagName: 'plaintext',
htmlChunks: ['Text', ' \n', 'Test</plaintext><div>'],
},
{
initialMode: TokenizerMode.DATA,
lastStartTagName: '',
htmlChunks: [
'\n',
'<!-- regular comment -->',
'<! bogus comment >',
'<? another bogus comment >',
'</!yet another bogus comment>',
'<![CDATA[ cdata as a bogus comment >',
],
},
{
initialMode: TokenizerMode.DATA,
lastStartTagName: '',
allowCDATA: true,
htmlChunks: ['<a>', '<![CDATA[ ', 'CDATA', ' ]]>', '<test>', ' <![CDATA[ ]]>\n'],
},
];

for (const testCase of testCases) {
Expand All @@ -93,6 +111,7 @@ it('Location Info (Tokenizer)', () => {
tokenizer.preprocessor.bufferWaterline = 8;
tokenizer.state = testCase.initialMode;
tokenizer.lastStartTagName = testCase.lastStartTagName;
tokenizer.allowCDATA = !!testCase.allowCDATA;

for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) {
if (token.type === TokenType.HIBERNATION) {
Expand Down

0 comments on commit 68f90b5

Please sign in to comment.