refactor(tokenizer): Use explicit offsets for locations (inikulin#402)

jmbpwtw · Feb 16, 2023 · 68f90b5 · 68f90b5
1 parent 5b10ce8
commit 68f90b5
Show file tree

Hide file tree

Showing 3 changed files with 64 additions and 44 deletions.
diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts
@@ -220,6 +220,7 @@ export class Tokenizer {
 
     private consumedAfterSnapshot = -1;
 
+    private currentLocation: Location | null;
     private currentCharacterToken: CharacterToken | null = null;
     private currentToken: Token | null = null;
     private currentAttr: Attribute = { name: '', value: '' };
@@ -231,24 +232,24 @@ export class Tokenizer {
         this.addLocationInfo = !!options.sourceCodeLocationInfo;
         this.onParseError = options.onParseError ?? null;
         this.preprocessor = new Preprocessor(options);
+        this.currentLocation = this.getCurrentLocation(-1);
     }
 
     //Errors
     private _err(code: ERR): void {
         this.onParseError?.(this.preprocessor.getError(code));
     }
 
-    private currentAttrLocation: Location | null = null;
-    private ctLoc: Location | null = null;
-    private _getCurrentLocation(): Location | null {
+    // NOTE: `offset` may never run across line boundaries.
+    private getCurrentLocation(offset: number): Location | null {
         if (!this.addLocationInfo) {
             return null;
         }
 
         return {
             startLine: this.preprocessor.line,
-            startCol: this.preprocessor.col,
-            startOffset: this.preprocessor.offset,
+            startCol: this.preprocessor.col - offset,
+            startOffset: this.preprocessor.offset - offset,
             endLine: -1,
             endCol: -1,
             endOffset: -1,
@@ -334,7 +335,7 @@ export class Tokenizer {
             selfClosing: false,
             ackSelfClosing: false,
             attrs: [],
-            location: this.ctLoc,
+            location: this.getCurrentLocation(1),
         };
     }
 
@@ -346,15 +347,15 @@ export class Tokenizer {
             selfClosing: false,
             ackSelfClosing: false,
             attrs: [],
-            location: this.ctLoc,
+            location: this.getCurrentLocation(2),
         };
     }
 
-    private _createCommentToken(): void {
+    private _createCommentToken(offset: number): void {
         this.currentToken = {
             type: TokenType.COMMENT,
             data: '',
-            location: this.ctLoc,
+            location: this.getCurrentLocation(offset),
         };
     }
 
@@ -365,28 +366,28 @@ export class Tokenizer {
             forceQuirks: false,
             publicId: null,
             systemId: null,
-            location: this.ctLoc,
+            location: this.currentLocation,
         };
     }
 
     private _createCharacterToken(type: CharacterToken['type'], chars: string): void {
         this.currentCharacterToken = {
             type,
             chars,
-            location: this.ctLoc,
+            location: this.currentLocation,
         };
     }
 
     private _createEOFToken(): void {
-        const ctLoc = this._getCurrentLocation();
+        const location = this.getCurrentLocation(0);
 
-        if (ctLoc) {
-            ctLoc.endLine = ctLoc.startLine;
-            ctLoc.endCol = ctLoc.startCol;
-            ctLoc.endOffset = ctLoc.startOffset;
+        if (location) {
+            location.endLine = location.startLine;
+            location.endCol = location.startCol;
+            location.endOffset = location.startOffset;
         }
 
-        this.currentToken = { type: TokenType.EOF, location: ctLoc };
+        this.currentToken = { type: TokenType.EOF, location };
     }
 
     //Tag attributes
@@ -395,7 +396,7 @@ export class Tokenizer {
             name: attrNameFirstCh,
             value: '',
         };
-        this.currentAttrLocation = this._getCurrentLocation();
+        this.currentLocation = this.getCurrentLocation(0);
     }
 
     private _leaveAttrName(): void {
@@ -404,9 +405,9 @@ export class Tokenizer {
         if (getTokenAttr(token, this.currentAttr.name) === null) {
             token.attrs.push(this.currentAttr);
 
-            if (token.location) {
+            if (token.location && this.currentLocation) {
                 const attrLocations = (token.location.attrs ??= Object.create(null));
-                attrLocations[this.currentAttr.name] = this.currentAttrLocation!;
+                attrLocations[this.currentAttr.name] = this.currentLocation;
 
                 // Set end location
                 this._leaveAttrValue();
@@ -417,19 +418,19 @@ export class Tokenizer {
     }
 
     private _leaveAttrValue(): void {
-        if (this.currentAttrLocation) {
-            this.currentAttrLocation.endLine = this.preprocessor.line;
-            this.currentAttrLocation.endCol = this.preprocessor.col;
-            this.currentAttrLocation.endOffset = this.preprocessor.offset;
+        if (this.currentLocation) {
+            this.currentLocation.endLine = this.preprocessor.line;
+            this.currentLocation.endCol = this.preprocessor.col;
+            this.currentLocation.endOffset = this.preprocessor.offset;
         }
     }
 
     //Token emission
     private _emitCurrentToken(): void {
-        this._emitCurrentCharacterToken();
-
         const ct = this.currentToken!;
 
+        this._emitCurrentCharacterToken(ct.location);
+
         this.currentToken = null;
 
         //NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate.
@@ -462,16 +463,17 @@ export class Tokenizer {
         }
 
         this.tokenQueue.push(ct);
+        this.currentLocation = this.getCurrentLocation(-1);
     }
 
-    private _emitCurrentCharacterToken(): void {
+    private _emitCurrentCharacterToken(nextLocation: Location | null): void {
         if (this.currentCharacterToken) {
             //NOTE: if we have pending character token make it's end location equal to the
             //current token's start location.
-            if (this.ctLoc && this.currentCharacterToken.location) {
-                this.currentCharacterToken.location.endLine = this.ctLoc.startLine;
-                this.currentCharacterToken.location.endCol = this.ctLoc.startCol;
-                this.currentCharacterToken.location.endOffset = this.ctLoc.startOffset;
+            if (nextLocation && this.currentCharacterToken.location) {
+                this.currentCharacterToken.location.endLine = nextLocation.startLine;
+                this.currentCharacterToken.location.endCol = nextLocation.startCol;
+                this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
             }
 
             this.tokenQueue.push(this.currentCharacterToken);
@@ -496,7 +498,8 @@ export class Tokenizer {
     //3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
     private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void {
         if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
-            this._emitCurrentCharacterToken();
+            this.currentLocation = this.getCurrentLocation(0);
+            this._emitCurrentCharacterToken(this.currentLocation);
         }
 
         if (this.currentCharacterToken) {
@@ -925,7 +928,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateData(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.LESS_THAN_SIGN: {
@@ -956,7 +958,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateRcdata(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.AMPERSAND: {
@@ -987,7 +988,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateRawtext(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.LESS_THAN_SIGN: {
@@ -1013,7 +1013,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateScriptData(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.LESS_THAN_SIGN: {
@@ -1039,7 +1038,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _statePlaintext(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.NULL: {
@@ -1076,7 +1074,7 @@ export class Tokenizer {
                 }
                 case $.QUESTION_MARK: {
                     this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
-                    this._createCommentToken();
+                    this._createCommentToken(1);
                     this.state = State.BOGUS_COMMENT;
                     this._stateBogusComment(cp);
                     break;
@@ -1118,7 +1116,7 @@ export class Tokenizer {
                 }
                 default: {
                     this._err(ERR.invalidFirstCharacterOfTagName);
-                    this._createCommentToken();
+                    this._createCommentToken(2);
                     this.state = State.BOGUS_COMMENT;
                     this._stateBogusComment(cp);
                 }
@@ -1956,16 +1954,18 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateMarkupDeclarationOpen(cp: number): void {
         if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) {
-            this._createCommentToken();
+            this._createCommentToken($$.DASH_DASH.length + 1);
             this.state = State.COMMENT_START;
         } else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) {
+            // NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here.
+            this.currentLocation = this.getCurrentLocation($$.DOCTYPE.length + 1);
             this.state = State.DOCTYPE;
         } else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) {
             if (this.allowCDATA) {
                 this.state = State.CDATA_SECTION;
             } else {
                 this._err(ERR.cdataInHtmlContent);
-                this._createCommentToken();
+                this._createCommentToken($$.CDATA_START.length + 1);
                 (this.currentToken as CommentToken).data = '[CDATA[';
                 this.state = State.BOGUS_COMMENT;
             }
@@ -1975,7 +1975,7 @@ export class Tokenizer {
         //results are no longer valid and we will need to start over.
         else if (!this._ensureHibernation()) {
             this._err(ERR.incorrectlyOpenedComment);
-            this._createCommentToken();
+            this._createCommentToken(2);
             this.state = State.BOGUS_COMMENT;
             this._stateBogusComment(cp);
         }

diff --git a/packages/parse5/lib/tokenizer/preprocessor.ts b/packages/parse5/lib/tokenizer/preprocessor.ts
@@ -17,7 +17,8 @@ const DEFAULT_BUFFER_WATERLINE = 1 << 16;
 export class Preprocessor {
     public html = '';
     private pos = -1;
-    private lastGapPos = -1;
+    // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
+    private lastGapPos = -2;
     private gapStack: number[] = [];
     private skipNextNewLine = false;
     private lastChunkWritten = false;
@@ -106,7 +107,7 @@ export class Preprocessor {
             this.lineStartPos -= this.pos;
             this.droppedBufferSize += this.pos;
             this.pos = 0;
-            this.lastGapPos = -1;
+            this.lastGapPos = -2;
             this.gapStack.length = 0;
         }
     }

diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts
@@ -77,6 +77,24 @@ it('Location Info (Tokenizer)', () => {
             lastStartTagName: 'plaintext',
             htmlChunks: ['Text', ' \n', 'Test</plaintext><div>'],
         },
+        {
+            initialMode: TokenizerMode.DATA,
+            lastStartTagName: '',
+            htmlChunks: [
+                '\n',
+                '<!-- regular comment -->',
+                '<! bogus comment >',
+                '<? another bogus comment >',
+                '</!yet another bogus comment>',
+                '<![CDATA[ cdata as a bogus comment >',
+            ],
+        },
+        {
+            initialMode: TokenizerMode.DATA,
+            lastStartTagName: '',
+            allowCDATA: true,
+            htmlChunks: ['<a>', '<![CDATA[ ', 'CDATA', ' ]]>', '<test>', ' <![CDATA[ ]]>\n'],
+        },
     ];
 
     for (const testCase of testCases) {
@@ -93,6 +111,7 @@ it('Location Info (Tokenizer)', () => {
         tokenizer.preprocessor.bufferWaterline = 8;
         tokenizer.state = testCase.initialMode;
         tokenizer.lastStartTagName = testCase.lastStartTagName;
+        tokenizer.allowCDATA = !!testCase.allowCDATA;
 
         for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) {
             if (token.type === TokenType.HIBERNATION) {