Skip to content

Commit

Permalink
refactor: Port Tokenizer to TS
Browse files Browse the repository at this point in the history
  • Loading branch information
fb55 committed Jan 7, 2022
1 parent 66df66b commit dee793b
Show file tree
Hide file tree
Showing 14 changed files with 1,190 additions and 705 deletions.
18 changes: 17 additions & 1 deletion .eslintrc.json
Expand Up @@ -35,5 +35,21 @@
},
"parserOptions": {
"sourceType": "module"
}
},
"overrides": [
{
"files": "*.ts",
"extends": [
"plugin:@typescript-eslint/eslint-recommended",
"plugin:@typescript-eslint/recommended",
"prettier"
],
"rules": {
"@typescript-eslint/no-non-null-assertion": "warn",
"@typescript-eslint/no-explicit-any": "warn",

"@typescript-eslint/no-unused-vars": ["error", { "argsIgnorePattern": "^_" }]
}
}
]
}
375 changes: 375 additions & 0 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Expand Up @@ -4,6 +4,7 @@
"type": "module",
"devDependencies": {
"@types/jest": "^27.0.2",
"@typescript-eslint/eslint-plugin": "^5.9.0",
"dedent": "^0.7.0",
"eslint": "^8.2.0",
"eslint-config-prettier": "^8.3.0",
Expand Down
@@ -1,4 +1,5 @@
import { DOCUMENT_MODE } from './html.js';
import { DoctypeToken } from './token.js';

//Const
const VALID_DOCTYPE_NAME = 'html';
Expand Down Expand Up @@ -83,26 +84,26 @@ const LIMITED_QUIRKS_WITH_SYSTEM_ID_PUBLIC_ID_PREFIXES = [
];

//Utils
/**
 * Wraps a doctype identifier in quotes.
 *
 * Double quotes are used unless the identifier itself contains a double
 * quote, in which case single quotes are used instead.
 *
 * @param id - Identifier text to wrap.
 * @returns `id` surrounded by the chosen quote character on both sides.
 */
function enquoteDoctypeId(id: string): string {
    // Pick the quote character that does not clash with a `"` inside the id.
    const quote = id.includes('"') ? "'" : '"';

    return quote + id + quote;
}

/**
 * Tests whether a public identifier starts with any of the given prefixes.
 *
 * The comparison is an exact, case-sensitive `startsWith` check.
 *
 * @param publicId - Public identifier to test.
 * @param prefixes - Candidate prefixes; the list is only read, never mutated.
 * @returns `true` if at least one prefix matches, `false` otherwise
 *          (including when `prefixes` is empty).
 */
function hasPrefix(publicId: string, prefixes: readonly string[]): boolean {
    return prefixes.some((prefix) => publicId.startsWith(prefix));
}

//API
/**
 * Checks whether a doctype token has the conforming shape.
 *
 * A token conforms when all of the following hold:
 * - its name equals `VALID_DOCTYPE_NAME`;
 * - it has no public identifier (`publicId` is `null`);
 * - its system identifier is either absent (`null`) or equal to `VALID_SYSTEM_ID`.
 *
 * @param token - Doctype token to inspect.
 * @returns `true` if the token satisfies every condition above.
 */
export function isConforming(token: DoctypeToken): boolean {
    return (
        token.name === VALID_DOCTYPE_NAME &&
        token.publicId === null &&
        (token.systemId === null || token.systemId === VALID_SYSTEM_ID)
    );
}

export function getDocumentMode(token) {
export function getDocumentMode(token: DoctypeToken): string | null {
if (token.name !== VALID_DOCTYPE_NAME) {
return DOCUMENT_MODE.QUIRKS;
}
Expand Down Expand Up @@ -139,7 +140,7 @@ export function getDocumentMode(token) {
return DOCUMENT_MODE.NO_QUIRKS;
}

export function serializeContent(name, publicId, systemId) {
export function serializeContent(name: string, publicId: string | null, systemId: string | null): string {
let str = '!DOCTYPE ';

if (name) {
Expand Down
File renamed without changes.
@@ -1,5 +1,6 @@
import { Tokenizer } from '../tokenizer/index.js';
import * as HTML from './html.js';
import { TagToken } from './token.js';

//Aliases
const $ = HTML.TAG_NAMES;
Expand Down Expand Up @@ -180,7 +181,7 @@ const EXITS_FOREIGN_CONTENT = new Set([
]);

//Check exit from foreign content
export function causesExit(startTagToken) {
export function causesExit(startTagToken: TagToken) {
const tn = startTagToken.tagName;
const isFontWithAttrs =
tn === $.FONT &&
Expand All @@ -192,7 +193,7 @@ export function causesExit(startTagToken) {
}

//Token adjustments
export function adjustTokenMathMLAttrs(token) {
export function adjustTokenMathMLAttrs(token: TagToken) {
for (let i = 0; i < token.attrs.length; i++) {
if (token.attrs[i].name === DEFINITION_URL_ATTR) {
token.attrs[i].name = ADJUSTED_DEFINITION_URL_ATTR;
Expand All @@ -201,7 +202,7 @@ export function adjustTokenMathMLAttrs(token) {
}
}

export function adjustTokenSVGAttrs(token) {
export function adjustTokenSVGAttrs(token: TagToken) {
for (let i = 0; i < token.attrs.length; i++) {
const adjustedAttrName = SVG_ATTRS_ADJUSTMENT_MAP.get(token.attrs[i].name);

Expand All @@ -211,7 +212,7 @@ export function adjustTokenSVGAttrs(token) {
}
}

export function adjustTokenXMLAttrs(token) {
export function adjustTokenXMLAttrs(token: TagToken) {
for (let i = 0; i < token.attrs.length; i++) {
const adjustedAttrEntry = XML_ATTRS_ADJUSTMENT_MAP.get(token.attrs[i].name);

Expand All @@ -223,7 +224,7 @@ export function adjustTokenXMLAttrs(token) {
}
}

export function adjustTokenSVGTagName(token) {
export function adjustTokenSVGTagName(token: TagToken) {
const adjustedTagName = SVG_TAG_NAMES_ADJUSTMENT_MAP.get(token.tagName);

if (adjustedTagName) {
Expand All @@ -232,11 +233,11 @@ export function adjustTokenSVGTagName(token) {
}

//Integration points
/**
 * Tests whether an element is a MathML text integration point.
 *
 * @param tn - Tag name of the element.
 * @param ns - Namespace of the element.
 * @returns `true` when `ns` is `NS.MATHML` and `tn` is one of `$.MI`, `$.MO`,
 *          `$.MN`, `$.MS` or `$.MTEXT`.
 */
function isMathMLTextIntegrationPoint(tn: string, ns: string): boolean {
    return ns === NS.MATHML && (tn === $.MI || tn === $.MO || tn === $.MN || tn === $.MS || tn === $.MTEXT);
}

function isHtmlIntegrationPoint(tn, ns, attrs) {
function isHtmlIntegrationPoint(tn: string, ns: string, attrs: TagToken['attrs']) {
if (ns === NS.MATHML && tn === $.ANNOTATION_XML) {
for (let i = 0; i < attrs.length; i++) {
if (attrs[i].name === ATTRS.ENCODING) {
Expand All @@ -250,7 +251,7 @@ function isHtmlIntegrationPoint(tn, ns, attrs) {
return ns === NS.SVG && (tn === $.FOREIGN_OBJECT || tn === $.DESC || tn === $.TITLE);
}

export function isIntegrationPoint(tn, ns, attrs, foreignNS) {
export function isIntegrationPoint(tn: string, ns: string, attrs: TagToken['attrs'], foreignNS: string) {
if ((!foreignNS || foreignNS === NS.HTML) && isHtmlIntegrationPoint(tn, ns, attrs)) {
return true;
}
Expand Down
File renamed without changes.
52 changes: 52 additions & 0 deletions packages/parse5/lib/common/token.ts
@@ -0,0 +1,52 @@
/**
 * Discriminant stored in the `type` field of the token interfaces declared in
 * this module.
 *
 * This is a string enum: each member's value is its own name followed by
 * `_TOKEN`.
 */
export enum TokenType {
// Used as the `type` of CharacterToken (three variants).
CHARACTER = 'CHARACTER_TOKEN',
NULL_CHARACTER = 'NULL_CHARACTER_TOKEN',
WHITESPACE_CHARACTER = 'WHITESPACE_CHARACTER_TOKEN',
// Used as the `type` of TagToken (two variants).
START_TAG = 'START_TAG_TOKEN',
END_TAG = 'END_TAG_TOKEN',
// Used as the `type` of CommentToken.
COMMENT = 'COMMENT_TOKEN',
// Used as the `type` of DoctypeToken.
DOCTYPE = 'DOCTYPE_TOKEN',
// Used by the payload-free EOFToken and HibernationToken.
EOF = 'EOF_TOKEN',
HIBERNATION = 'HIBERNATION_TOKEN',
}

/**
 * Token describing a DOCTYPE, tagged with `TokenType.DOCTYPE`.
 *
 * `name`, `publicId` and `systemId` are each either a string or `null`.
 */
export interface DoctypeToken {
type: TokenType.DOCTYPE;
// Doctype name, or `null`.
name: string | null;
// NOTE(review): presumably the HTML tokenizer's "force-quirks" flag — confirm against the tokenizer.
forceQuirks: boolean;
// Public identifier, or `null`.
publicId: string | null;
// System identifier, or `null`.
systemId: string | null;
}

/**
 * A single attribute carried by a {@link TagToken}.
 *
 * `prefix` and `namespace` are `null` when not set.
 */
export interface Attribute {
    /** Attribute name. */
    name: string;
    /** Attribute value. */
    value: string;
    /** Attribute prefix, or `null`. */
    prefix: string | null;
    /** Attribute namespace, or `null`. */
    namespace: string | null;
}

/**
 * Token describing a start tag or an end tag; `type` tells the two apart.
 *
 * The element type of `attrs` is the named {@link Attribute} interface, so
 * `TagToken['attrs']` and `Attribute[]` are the same type.
 */
export interface TagToken {
    type: TokenType.START_TAG | TokenType.END_TAG;
    /** Name of the tag. */
    tagName: string;
    /** Whether the tag is marked self-closing. */
    selfClosing: boolean;
    // NOTE(review): presumably set once the self-closing flag has been acknowledged — confirm against the parser.
    ackSelfClosing: boolean;
    /** Attributes attached to the tag. */
    attrs: Attribute[];
}

/**
 * Token describing a comment, tagged with `TokenType.COMMENT`.
 */
export interface CommentToken {
type: TokenType.COMMENT;
// Comment payload as a string.
data: string;
}

/**
 * Payload-free token tagged with `TokenType.EOF`.
 *
 * Exported so that consumers of the `Token` union can name this member
 * directly, like the other token interfaces in this module.
 */
export interface EOFToken {
    type: TokenType.EOF;
}

/**
 * Payload-free token tagged with `TokenType.HIBERNATION`.
 *
 * NOTE(review): presumably produced when tokenization pauses pending more
 * input — confirm against the tokenizer.
 *
 * Exported so that consumers of the `Token` union can name this member
 * directly, like the other token interfaces in this module.
 */
export interface HibernationToken {
    type: TokenType.HIBERNATION;
}

/**
 * Token carrying character data. `type` is one of three variants:
 * `CHARACTER`, `NULL_CHARACTER` or `WHITESPACE_CHARACTER`.
 */
export interface CharacterToken {
type: TokenType.CHARACTER | TokenType.NULL_CHARACTER | TokenType.WHITESPACE_CHARACTER;
// Character payload. NOTE(review): presumably may hold a run of several characters — confirm against the tokenizer.
chars: string;
}

/**
 * Union of every token shape declared in this module. Each member has a
 * `type` field typed with `TokenType` members, so `token.type` can be used to
 * narrow the union.
 */
export type Token = DoctypeToken | TagToken | CommentToken | EOFToken | HibernationToken | CharacterToken;
Expand Up @@ -43,35 +43,35 @@ export const CODE_POINTS = {
};

/**
 * Fixed code-unit sequences for the literal strings noted beside each entry.
 *
 * Every entry is a `Uint16Array`, so all values share one element type and
 * support indexed access and `.length`.
 */
export const CODE_POINT_SEQUENCES = {
    DASH_DASH_STRING: new Uint16Array([0x2d, 0x2d]), //--
    DOCTYPE_STRING: new Uint16Array([0x44, 0x4f, 0x43, 0x54, 0x59, 0x50, 0x45]), //DOCTYPE
    CDATA_START_STRING: new Uint16Array([0x5b, 0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), //[CDATA[
    SCRIPT_STRING: new Uint16Array([0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), //script
    PUBLIC_STRING: new Uint16Array([0x50, 0x55, 0x42, 0x4c, 0x49, 0x43]), //PUBLIC
    SYSTEM_STRING: new Uint16Array([0x53, 0x59, 0x53, 0x54, 0x45, 0x4d]), //SYSTEM
};

//Surrogates
/**
 * Tests whether a code point lies in the UTF-16 surrogate range
 * 0xD800–0xDFFF (inclusive), i.e. is either a leading or a trailing surrogate.
 *
 * @param cp - Code point (or UTF-16 code unit) to test.
 * @returns `true` if `cp` is within 0xD800–0xDFFF.
 */
export function isSurrogate(cp: number): boolean {
    return cp >= 0xd8_00 && cp <= 0xdf_ff;
}

/**
 * Tests whether a code unit lies in the range 0xDC00–0xDFFF (inclusive).
 *
 * Despite the name, this inspects a single value only: it reports whether
 * `cp` falls in the trailing (low) surrogate range, i.e. whether it could be
 * the second half of a surrogate pair.
 *
 * @param cp - Code unit to test.
 * @returns `true` if `cp` is within 0xDC00–0xDFFF.
 */
export function isSurrogatePair(cp: number): boolean {
    return cp >= 0xdc_00 && cp <= 0xdf_ff;
}

/**
 * Combines two surrogate code units into a single numeric value.
 *
 * The result is `(cp1 - 0xD800) * 0x400 + 0x2400 + cp2`. The constant 0x2400
 * equals `0x10000 - 0xDC00`, so for a leading surrogate `cp1` and a trailing
 * surrogate `cp2` this yields the supplementary-plane code point they encode.
 * No validation of the inputs is performed.
 *
 * @param cp1 - First (leading) code unit.
 * @param cp2 - Second (trailing) code unit.
 * @returns The combined value computed by the formula above.
 */
export function getSurrogatePairCodePoint(cp1: number, cp2: number): number {
    return (cp1 - 0xd8_00) * 0x4_00 + 0x24_00 + cp2;
}

/**
 * Tests whether a code point is a control character.
 *
 * NOTE: excluding NULL and ASCII whitespace. Concretely, the result is `true`
 * for:
 * - 0x01–0x1F, except 0x09, 0x0A, 0x0C and 0x0D;
 * - 0x7F–0x9F.
 *
 * @param cp - Code point to test.
 * @returns `true` if `cp` is in one of the ranges above.
 */
export function isControlCodePoint(cp: number): boolean {
    return (
        (cp !== 0x20 && cp !== 0x0a && cp !== 0x0d && cp !== 0x09 && cp !== 0x0c && cp >= 0x01 && cp <= 0x1f) ||
        (cp >= 0x7f && cp <= 0x9f)
    );
}

/**
 * Tests whether a code point is treated as undefined.
 *
 * @param cp - Code point to test.
 * @returns `true` if `cp` is within 0xFDD0–0xFDEF (inclusive) or is a member
 *          of `UNDEFINED_CODE_POINTS`.
 */
export function isUndefinedCodePoint(cp: number): boolean {
    return (cp >= 0xfd_d0 && cp <= 0xfd_ef) || UNDEFINED_CODE_POINTS.has(cp);
}

0 comments on commit dee793b

Please sign in to comment.