// packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts

import * as assert from 'node:assert';
import { Tokenizer, TokenizerMode } from './index.js';
import { TokenType } from '../common/token.js';
import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js';

// Checks the source-code location info attached to tokenizer output.
//
// Each test case lists the input as `htmlChunks`; the chunks are joined into a
// single document and also fed to the tokenizer one by one. The n-th
// non-hibernation token is expected to cover exactly the n-th chunk, both by
// absolute offsets and by line/column coordinates.
it('Location Info (Tokenizer)', () => {
    const testCases = [
        {
            initialMode: TokenizerMode.DATA,
            lastStartTagName: '',
            htmlChunks: [
                '\r\n',
                '<!DOCTYPE html>',
                '\n',
                '<!-- Test -->',
                '\n',
                '<head>',
                '\n   ',
                '<meta charset="utf-8">',
                '<title>',
                '   ',
                'node.js',
                '\u0000',
                '</title>',
                '\n',
                '</head>',
                '\n',
                '<body id="front">',
                '\n',
                '<div id="intro">',
                '\n   ',
                '<p\n>',
                '\n       ',
                'Node.js',
                ' ',
                'is',
                ' ',
                'a',
                '\n       ',
                'platform',
                ' ',
                'built',
                ' ',
                'on',
                '\n       ',
                '<a href="http://code.google.com/p/v8/">',
                '\n       ',
                "Chrome's",
                ' ',
                'JavaScript',
                ' ',
                'runtime',
                '\n       ',
                '</a>',
                '\n',
                '</div>',
                '</body>',
            ],
        },
        {
            initialMode: TokenizerMode.RCDATA,
            lastStartTagName: 'title',
            htmlChunks: ['<div>Test', ' \n   ', 'hey', ' ', 'ya!', '</title>', '<!--Yo-->'],
        },
        {
            initialMode: TokenizerMode.RAWTEXT,
            lastStartTagName: 'style',
            htmlChunks: ['.header{', ' \n   ', 'color:red;', '\n', '}', '</style>', 'Some', ' ', 'text'],
        },
        {
            initialMode: TokenizerMode.SCRIPT_DATA,
            lastStartTagName: 'script',
            htmlChunks: ['var', ' ', 'a=c', ' ', '-', ' ', 'd;', '\n', 'a<--d;', '</script>', '<div>'],
        },
        {
            initialMode: TokenizerMode.PLAINTEXT,
            lastStartTagName: 'plaintext',
            htmlChunks: ['Text', ' \n', 'Test</plaintext><div>'],
        },
        {
            initialMode: TokenizerMode.DATA,
            lastStartTagName: '',
            htmlChunks: [
                '\n',
                '<!-- regular comment -->',
                '<! bogus comment >',
                '<? another bogus comment >',
                '</!yet another bogus comment>',
                '<![CDATA[ cdata as a bogus comment >',
            ],
        },
        {
            initialMode: TokenizerMode.DATA,
            lastStartTagName: '',
            allowCDATA: true,
            htmlChunks: ['<a>', '<![CDATA[ ', 'CDATA', ' ]]>', '<test>', ' <![CDATA[ ]]>\n'],
        },
    ];

    for (const testCase of testCases) {
        const html = testCase.htmlChunks.join('');
        // Line table used to resolve line/col locations back to text.
        const lines = html.split(/\r?\n/g);
        const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true });
        const lastChunkIdx = testCase.htmlChunks.length - 1;

        // Configure the tokenizer before feeding it any input, so the setup
        // does not depend on whether `write` tokenizes eagerly or lazily.
        // NOTE: set small waterline for testing purposes
        tokenizer.preprocessor.bufferWaterline = 8;
        tokenizer.state = testCase.initialMode;
        tokenizer.lastStartTagName = testCase.lastStartTagName;
        tokenizer.allowCDATA = !!testCase.allowCDATA;

        for (let i = 0; i < testCase.htmlChunks.length; i++) {
            tokenizer.write(testCase.htmlChunks[i], i === lastChunkIdx);
        }

        // Number of tokens verified so far; doubles as the index of the chunk
        // the next token is expected to match.
        let checkedTokens = 0;

        // The token is advanced in the loop header so that `continue` on a
        // hibernation token fetches the next token instead of re-testing the
        // same one forever.
        for (
            let token = tokenizer.getNextToken();
            token.type !== TokenType.EOF;
            token = tokenizer.getNextToken()
        ) {
            if (token.type === TokenType.HIBERNATION) {
                continue;
            }

            assert.ok(token.location);

            const expectedChunk = testCase.htmlChunks[checkedTokens];

            //Offsets
            const byOffsets = html.substring(token.location.startOffset, token.location.endOffset);

            assert.strictEqual(byOffsets, expectedChunk);

            //Line/col
            const byLineCol = getSubstringByLineCol(lines, token.location);

            assert.strictEqual(byLineCol, normalizeNewLine(expectedChunk));

            checkedTokens++;
        }

        // Guard against a vacuous pass: if the tokenizer reaches EOF early,
        // the trailing chunks would otherwise never be checked.
        assert.strictEqual(checkedTokens, testCase.htmlChunks.length);
    }
});