inikulin · fb55 · Feb 24, 2022 · Feb 11, 2022 · Feb 11, 2022 · Feb 12, 2022
diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts
@@ -220,6 +220,7 @@ export class Tokenizer {
 
     private consumedAfterSnapshot = -1;
 
+    private ctLoc: Location | null;
     private currentCharacterToken: CharacterToken | null = null;
     private currentToken: Token | null = null;
     private currentAttr: Attribute = { name: '', value: '' };
@@ -231,6 +232,7 @@ export class Tokenizer {
         this.addLocationInfo = !!options.sourceCodeLocationInfo;
         this.onParseError = options.onParseError ?? null;
         this.preprocessor = new Preprocessor(options);
+        this.ctLoc = this.getCurrentLocation(-1);
     }
 
     //Errors
@@ -239,16 +241,17 @@ export class Tokenizer {
     }
 
     private currentAttrLocation: Location | null = null;
-    private ctLoc: Location | null = null;
-    private _getCurrentLocation(): Location | null {
+
+    // NOTE: `offset` may never run across line boundaries.
+    private getCurrentLocation(offset: number): Location | null {
         if (!this.addLocationInfo) {
             return null;
         }
 
         return {
             startLine: this.preprocessor.line,
-            startCol: this.preprocessor.col,
-            startOffset: this.preprocessor.offset,
+            startCol: this.preprocessor.col - offset,
+            startOffset: this.preprocessor.offset - offset,
             endLine: -1,
             endCol: -1,
             endOffset: -1,
@@ -334,7 +337,7 @@ export class Tokenizer {
             selfClosing: false,
             ackSelfClosing: false,
             attrs: [],
-            location: this.ctLoc,
+            location: this.getCurrentLocation(1),
         };
     }
 
@@ -346,15 +349,15 @@ export class Tokenizer {
             selfClosing: false,
             ackSelfClosing: false,
             attrs: [],
-            location: this.ctLoc,
+            location: this.getCurrentLocation(2),
         };
     }
 
-    private _createCommentToken(): void {
+    private _createCommentToken(offset: number): void {
         this.currentToken = {
             type: TokenType.COMMENT,
             data: '',
-            location: this.ctLoc,
+            location: this.getCurrentLocation(offset),
         };
     }
 
@@ -378,15 +381,15 @@ export class Tokenizer {
     }
 
     private _createEOFToken(): void {
-        const ctLoc = this._getCurrentLocation();
+        const location = this.getCurrentLocation(0);
 
-        if (ctLoc) {
-            ctLoc.endLine = ctLoc.startLine;
-            ctLoc.endCol = ctLoc.startCol;
-            ctLoc.endOffset = ctLoc.startOffset;
+        if (location) {
+            location.endLine = location.startLine;
+            location.endCol = location.startCol;
+            location.endOffset = location.startOffset;
         }
 
-        this.currentToken = { type: TokenType.EOF, location: ctLoc };
+        this.currentToken = { type: TokenType.EOF, location };
     }
 
     //Tag attributes
@@ -395,7 +398,7 @@ export class Tokenizer {
             name: attrNameFirstCh,
             value: '',
         };
-        this.currentAttrLocation = this._getCurrentLocation();
+        this.currentAttrLocation = this.getCurrentLocation(0);
     }
 
     private _leaveAttrName(): void {
@@ -426,10 +429,10 @@ export class Tokenizer {
 
     //Token emission
     private _emitCurrentToken(): void {
-        this._emitCurrentCharacterToken();
-
         const ct = this.currentToken!;
 
+        this._emitCurrentCharacterToken(ct.location);
+
         this.currentToken = null;
 
         //NOTE: store emited start tag's tagName to determine is the following end tag token is appropriate.
@@ -462,16 +465,17 @@ export class Tokenizer {
         }
 
         this.tokenQueue.push(ct);
+        this.ctLoc = this.getCurrentLocation(-1);
     }
 
-    private _emitCurrentCharacterToken(): void {
+    private _emitCurrentCharacterToken(nextLocation: Location | null): void {
         if (this.currentCharacterToken) {
             //NOTE: if we have pending character token make it's end location equal to the
             //current token's start location.
-            if (this.ctLoc && this.currentCharacterToken.location) {
-                this.currentCharacterToken.location.endLine = this.ctLoc.startLine;
-                this.currentCharacterToken.location.endCol = this.ctLoc.startCol;
-                this.currentCharacterToken.location.endOffset = this.ctLoc.startOffset;
+            if (nextLocation && this.currentCharacterToken.location) {
+                this.currentCharacterToken.location.endLine = nextLocation.startLine;
+                this.currentCharacterToken.location.endCol = nextLocation.startCol;
+                this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
             }
 
             this.tokenQueue.push(this.currentCharacterToken);
@@ -496,7 +500,8 @@ export class Tokenizer {
     //3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
     private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void {
         if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
-            this._emitCurrentCharacterToken();
+            this.ctLoc = this.getCurrentLocation(0);
+            this._emitCurrentCharacterToken(this.ctLoc);
         }
 
         if (this.currentCharacterToken) {
@@ -925,7 +930,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateData(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.LESS_THAN_SIGN: {
@@ -956,7 +960,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateRcdata(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.AMPERSAND: {
@@ -987,7 +990,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateRawtext(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.LESS_THAN_SIGN: {
@@ -1013,7 +1015,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateScriptData(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.LESS_THAN_SIGN: {
@@ -1039,7 +1040,6 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _statePlaintext(cp: number): void {
         this.preprocessor.dropParsedChunk();
-        this.ctLoc = this._getCurrentLocation();
 
         switch (cp) {
             case $.NULL: {
@@ -1076,7 +1076,7 @@ export class Tokenizer {
                 }
                 case $.QUESTION_MARK: {
                     this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
-                    this._createCommentToken();
+                    this._createCommentToken(1);
                     this.state = State.BOGUS_COMMENT;
                     this._stateBogusComment(cp);
                     break;
@@ -1118,7 +1118,7 @@ export class Tokenizer {
                 }
                 default: {
                     this._err(ERR.invalidFirstCharacterOfTagName);
-                    this._createCommentToken();
+                    this._createCommentToken(2);
                     this.state = State.BOGUS_COMMENT;
                     this._stateBogusComment(cp);
                 }
@@ -1956,16 +1956,18 @@ export class Tokenizer {
     //------------------------------------------------------------------
     private _stateMarkupDeclarationOpen(cp: number): void {
         if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) {
-            this._createCommentToken();
+            this._createCommentToken($$.DASH_DASH.length + 1);
             this.state = State.COMMENT_START;
         } else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) {
+            // NOTE: Doctypes tokens are created without fixed offsets. We keep track of the moment a doctype *might* start here.
+            this.ctLoc = this.getCurrentLocation($$.DOCTYPE.length + 1);
             this.state = State.DOCTYPE;
         } else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) {
             if (this.allowCDATA) {
                 this.state = State.CDATA_SECTION;
             } else {
                 this._err(ERR.cdataInHtmlContent);
-                this._createCommentToken();
+                this._createCommentToken($$.CDATA_START.length + 1);
                 (this.currentToken as CommentToken).data = '[CDATA[';
                 this.state = State.BOGUS_COMMENT;
             }
@@ -1975,7 +1977,7 @@ export class Tokenizer {
         //results are no longer valid and we will need to start over.
         else if (!this._ensureHibernation()) {
             this._err(ERR.incorrectlyOpenedComment);
-            this._createCommentToken();
+            this._createCommentToken(2);
             this.state = State.BOGUS_COMMENT;
             this._stateBogusComment(cp);
         }

diff --git a/packages/parse5/lib/tokenizer/preprocessor.ts b/packages/parse5/lib/tokenizer/preprocessor.ts
@@ -17,7 +17,8 @@ const DEFAULT_BUFFER_WATERLINE = 1 << 16;
 export class Preprocessor {
     public html = '';
     private pos = -1;
-    private lastGapPos = -1;
+    // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
+    private lastGapPos = -2;
     private gapStack: number[] = [];
     private skipNextNewLine = false;
     private lastChunkWritten = false;
@@ -106,7 +107,7 @@ export class Preprocessor {
             this.lineStartPos -= this.pos;
             this.droppedBufferSize += this.pos;
             this.pos = 0;
-            this.lastGapPos = -1;
+            this.lastGapPos = -2;
             this.gapStack.length = 0;
         }
     }

diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts
@@ -77,6 +77,24 @@ it('Location Info (Tokenizer)', () => {
             lastStartTagName: 'plaintext',
             htmlChunks: ['Text', ' \n', 'Test</plaintext><div>'],
         },
+        {
+            initialMode: TokenizerMode.DATA,
+            lastStartTagName: '',
+            htmlChunks: [
+                '\n',
+                '<!-- regular comment -->',
+                '<! bogus comment >',
+                '<? another bogus comment >',
+                '</!yet another bogus comment>',
+                '<![CDATA[ cdata as a bogus comment >',
+            ],
+        },
+        {
+            initialMode: TokenizerMode.DATA,
+            lastStartTagName: '',
+            allowCDATA: true,
+            htmlChunks: ['<a>', '<![CDATA[ ', 'CDATA', ' ]]>', '<test>', ' <![CDATA[ ]]>\n'],
+        },
     ];
 
     for (const testCase of testCases) {
@@ -93,6 +111,7 @@ it('Location Info (Tokenizer)', () => {
         tokenizer.preprocessor.bufferWaterline = 8;
         tokenizer.state = testCase.initialMode;
         tokenizer.lastStartTagName = testCase.lastStartTagName;
+        tokenizer.allowCDATA = !!testCase.allowCDATA;
 
         for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) {
             if (token.type === TokenType.HIBERNATION) {