Skip to content

Commit

Permalink
feat: Use trie data structure to decrease file size (#33)
Browse files Browse the repository at this point in the history
  • Loading branch information
jhnns committed May 30, 2018
1 parent 0143558 commit 59f951b
Show file tree
Hide file tree
Showing 30 changed files with 2,102 additions and 985 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -34,3 +34,6 @@ node_modules

# Optional REPL history
.node_repl_history

# We don't check in the compiled lists. They are created upon npm publish and npm install.
build/tries
4 changes: 3 additions & 1 deletion README.md
Expand Up @@ -10,13 +10,15 @@ parse-domain

Since domains are handled differently across different countries and organizations, splitting a URL into sub-domain, domain and top-level-domain parts is not a simple regexp. **parse-domain** uses a [large list of known top-level domains](https://publicsuffix.org/list/public_suffix_list.dat) from publicsuffix.org to recognize different parts of the domain.

This module uses a [trie](https://en.wikipedia.org/wiki/Trie) data structure under the hood to ensure the smallest possible library size and the fastest lookup. The library is roughly 30KB minified and gzipped. Since publicsuffix.org is frequently updated, the data structure is built on `npm install` as a `postinstall` hook. If something goes wrong during that step, the library falls back to a prebuilt list that has been built at the time of publishing.

<br />

Installation
------------------------------------------------------------------------

```sh
npm install --save parse-domain
npm install parse-domain
```

<br />
Expand Down
50 changes: 0 additions & 50 deletions lib/build/buildRegex.js

This file was deleted.

27 changes: 17 additions & 10 deletions lib/parseDomain.js
@@ -1,29 +1,36 @@
"use strict";

const knownTlds = require("./tld.js");
const normalize = require("./normalize.js");
const lookUp = require("./tries/lookUp");
const icannTrie = require("../lists/icann.complete");
const privateTrie = require("../lists/private.complete");

const urlParts = /^(https?:\/\/)?([^/]*@)?(.+?)(:\d{2,5})?([/?].*)?$/; // 1 = protocol, 2 = auth, 3 = domain, 4 = port, 5 = path
const dot = /\./g;
const emptyArr = [];

function matchTld(domain, options) {
let tld = null;

// for potentially unrecognized tlds, try matching against custom tlds
if (options.customTlds) {
// try matching against a built regexp of custom tlds
tld = domain.match(options.customTlds);
const tld = domain.match(options.customTlds);

if (tld !== null) {
return tld[0];
}
}

// If no custom tlds, check if tld is supported
if (tld === null) {
tld = domain.match(options.privateTlds ? knownTlds : knownTlds.icann);
if (tld === null) {
return null;
const tries = (options.privateTlds ? [privateTrie] : emptyArr).concat(icannTrie);

for (const trie of tries) {
const tld = lookUp(trie, domain);

if (tld !== null) {
return "." + tld;
}
}

return tld[0];
return null;
}

/**
Expand Down
5 changes: 0 additions & 5 deletions lib/tld.js

This file was deleted.

37 changes: 37 additions & 0 deletions lib/tries/lookUp.js
@@ -0,0 +1,37 @@
"use strict";

// Marker keys inside a trie level: "*" flags a wildcard rule, "!<label>" an exception to it.
const WILDCARD = "*";
const EXCEPTION = "!";

/**
 * Walks the given trie from the right-most label of the hostname towards the left
 * and collects every label that belongs to the matching suffix.
 *
 * @param {Map<string, true|Map>} trie nested maps; a value of `true` marks a leaf
 * @param {string} hostname dot-separated hostname
 * @returns {string|null} the matched suffix without a leading dot, or null if nothing matched
 */
function lookUp(trie, hostname) {
    const labels = hostname.split(".");
    const matched = [];
    let node = trie;

    // Walk the labels right to left; prepending keeps `matched` in hostname order.
    for (let pos = labels.length - 1; pos >= 0; pos--) {
        const label = labels[pos];

        if (node.has(WILDCARD)) {
            // A wildcard level accepts any label unless an exception entry exists for it.
            // Either way the walk ends here.
            if (!node.has(EXCEPTION + label)) {
                matched.unshift(label);
            }
            break;
        }
        if (!node.has(label)) {
            break;
        }
        matched.unshift(label);

        const child = node.get(label);

        if (child === true) {
            // Leaf reached, there is nothing deeper to match
            break;
        }
        node = child;
    }

    if (matched.length === 0) {
        return null;
    }

    return matched.join(".");
}

module.exports = lookUp;
52 changes: 52 additions & 0 deletions lib/tries/parsePubSuffixList.js
@@ -0,0 +1,52 @@
"use strict";

const matchNewLine = /\r?\n/;
const matchComments = /^\s*\/\//;
const matchWhitespace = /^\s*$/;
const matchWhitespaceRun = /\s+/;
// The sections of the public suffix list that get extracted, each delimited by marker comments.
const lists = [
    {
        name: "icann",
        markers: {
            start: "// ===BEGIN ICANN DOMAINS===",
            end: "// ===END ICANN DOMAINS===",
        },
    },
    {
        name: "private",
        markers: {
            start: "// ===BEGIN PRIVATE DOMAINS===",
            end: "// ===END PRIVATE DOMAINS===",
        },
    },
];

/**
 * Tells whether a line carries a rule, i.e. is neither a comment nor blank.
 *
 * @param {string} line
 * @returns {boolean}
 */
function isWanted(line) {
    return matchComments.test(line) === false && matchWhitespace.test(line) === false;
}

/**
 * Reduces a rule line to the rule itself.
 * The list format only reads a line up to the first whitespace, so anything after the
 * first token (trailing blanks, trailing remarks) must not end up in the rule.
 * Leading whitespace is tolerated the same way matchComments tolerates it.
 *
 * @param {string} line a line that passed isWanted()
 * @returns {string}
 */
function toRule(line) {
    return line.trim().split(matchWhitespaceRun)[0];
}

/**
 * Extracts the rules of every known section from the raw public suffix list.
 *
 * @param {string} listContent the complete content of public_suffix_list.dat
 * @returns {Object<string, Array<string>>} rules keyed by section name ("icann", "private")
 * @throws {Error} if the start or end marker of a section is missing
 */
function parsePubSuffixList(listContent) {
    const result = {};

    for (const list of lists) {
        const start = listContent.indexOf(list.markers.start);
        const end = listContent.indexOf(list.markers.end);

        if (start === -1) {
            throw new Error(`Missing start marker of ${ list.name } list`);
        }
        if (end === -1) {
            throw new Error(`Missing end marker of ${ list.name } list`);
        }

        // The slice still contains the start marker, but it is a comment and gets filtered out
        result[list.name] = listContent
            .slice(start, end)
            .split(matchNewLine)
            .filter(isWanted)
            .map(line => toRule(line));
    }

    return result;
}

module.exports = parsePubSuffixList;
56 changes: 56 additions & 0 deletions lib/tries/parseTrie.js
@@ -0,0 +1,56 @@
"use strict";

const SEPARATORS = require("./separators");

/**
 * Turns the textual trie representation back into nested maps.
 * Every character that is not one of the SEPARATORS is appended to the label being read;
 * a separator stores the pending label and moves the cursor within the hierarchy.
 *
 * @param {string} input serialized trie
 * @returns {Map<string, true|Map>} the top-level map; leaves carry the value `true`
 */
function parseTrie(input) {
    const root = new Map();
    // Stack of the levels above the cursor. The root stays at index 0 as a sentinel.
    const ancestors = [root];
    let cursor = root;
    let label = "";

    // Stores the pending label (if any) at the cursor position and clears it
    const flush = value => {
        if (label !== "") {
            cursor.set(label, value);
            label = "";
        }
    };

    for (const symbol of input) {
        if (symbol === SEPARATORS.SAME) {
            flush(true);
        } else if (symbol === SEPARATORS.DOWN) {
            const child = new Map();

            flush(child);
            ancestors.push(cursor);
            cursor = child;
        } else if (symbol === SEPARATORS.RESET) {
            flush(true);
            // Drop every remembered level except the top-most one
            ancestors.length = 1;
            cursor = ancestors[0];
        } else if (symbol === SEPARATORS.UP) {
            flush(true);
            cursor = ancestors.pop();
        } else {
            label += symbol;
        }
    }

    // The input usually ends with a label instead of a separator
    flush(true);

    return ancestors[0];
}

module.exports = parseTrie;
8 changes: 8 additions & 0 deletions lib/tries/separators.js
@@ -0,0 +1,8 @@
"use strict";

// Single-character control tokens of the textual trie format.
// parseTrie treats every other character as part of a domain label.
module.exports = {
UP: "<", // one level up
SAME: ",", // same level
DOWN: ">", // one level down
RESET: "|", // reset level index and start new
};
115 changes: 115 additions & 0 deletions lib/tries/serializeTrie.js
@@ -0,0 +1,115 @@
"use strict";

const SEPARATORS = require("./separators");

// Trie types accepted by serializeTrie. "complete" is the default when no type is passed.
const TYPE_COMPLETE = "complete";
const TYPE_LIGHT = "light";
const POSSIBLE_TYPES = [TYPE_COMPLETE, TYPE_LIGHT];
// One predicate per trie type. serializeTrie applies it after splitting each rule on ".",
// so `line` is an array of labels at this point.
const LINE_FILTERS = {
[TYPE_COMPLETE]: () => true, // keep every line
[TYPE_LIGHT]: line => line.length > 1, // drop lines that consist of a single label
};

/**
 * Sort comparator for two lines (arrays of labels), starting the comparison at index i.
 * Labels are compared pairwise with localeCompare; if one line runs out of labels
 * before a difference shows up, the shorter line sorts first.
 *
 * @param {Array<string>} lineA
 * @param {Array<string>} lineB
 * @param {number} i index of the first label to compare
 * @returns {number} negative if lineA sorts first, positive if lineB sorts first, 0 if equal
 */
function compareLinesAt(lineA, lineB, i) {
    for (let pos = i; pos !== lineA.length && pos !== lineB.length; pos++) {
        const order = lineA[pos].localeCompare(lineB[pos]);

        if (order !== 0) {
            return order;
        }
    }

    return lineA.length - lineB.length;
}

/**
 * Finds the first position at which two lines (arrays of labels) differ.
 * A position that only exists in the longer line counts as a difference.
 *
 * @param {Array<string>} lineA
 * @param {Array<string>} lineB
 * @returns {number} index of the first differing label, or -1 if both lines are identical
 */
function findIndexOfDifference(lineA, lineB) {
    // Scanning the longer line covers every index that exists in either of them
    const longerLine = lineA.length >= lineB.length ? lineA : lineB;

    return longerLine.findIndex((label, pos) => lineA[pos] !== lineB[pos]);
}

/**
 * Serializes one line relative to the line before it. Meant to be used as a callback
 * for Array.prototype.map on the sorted list of lines.
 *
 * Only the labels that are not shared with the previous line are written, prefixed with
 * the separator(s) that move the parser from the previous line's last label to the
 * position where the first new label belongs.
 *
 * @param {Array<string>} line labels of the current line, top-most label first
 * @param {number} i index of line within arr
 * @param {Array<Array<string>>} arr all lines, sorted
 * @returns {string} separator(s) plus new labels; an empty string for a duplicate line
 */
function lineToString(line, i, arr) {
    if (i === 0) {
        // Nothing to relate to, write the whole line
        return line.join(SEPARATORS.DOWN);
    }

    const prevLine = arr[i - 1];
    const indexOfDifference = findIndexOfDifference(line, prevLine);

    if (indexOfDifference === -1) {
        // Identical lines
        return "";
    }

    // After prevLine the parser sits at the level of prevLine's last label (prevLine.length - 1).
    // The first new label belongs to level indexOfDifference.
    const levelsUp = prevLine.length - 1 - indexOfDifference;
    let separatorFromPrev;

    if (indexOfDifference === 0) {
        // line and prevLine are completely different
        separatorFromPrev = SEPARATORS.RESET;
    } else if (levelsUp < 0) {
        // we don't need to go up the hierarchy first because prevLine is part of line
        // so let's just start with an initial down separator
        separatorFromPrev = SEPARATORS.DOWN;
    } else if (levelsUp === 0) {
        // The first new label is a sibling of prevLine's last label.
        // This must hold even if line continues deeper afterwards (a.com followed by x.b.com),
        // otherwise both labels would be written without any separator in between.
        separatorFromPrev = SEPARATORS.SAME;
    } else {
        // line and prevLine are different, but share a common root at indexOfDifference - 1
        // we now need to go up the hierarchy to the common root
        separatorFromPrev = SEPARATORS.UP.repeat(levelsUp);
    }

    return separatorFromPrev + line.slice(indexOfDifference).join(SEPARATORS.DOWN);
}

/**
 * Serializes a list of rules into the compact textual trie format.
 *
 * parsedList looks like:
 * [
 *     "com",
 *     "co.uk",
 *     "gov.uk",
 *     "静岡.jp",
 *     "岐阜.jp",
 *     "موقع"
 * ]
 *
 * Every rule is split into labels and reversed so that the top-most label comes first.
 * The lines are then sorted, which puts rules with a common root next to each other:
 *
 *     com      uk        jp       موقع
 *             /  \      /  \
 *           co   gov  静岡  岐阜
 *
 * The textual representation (using SEPARATORS) looks like:
 *     com|uk>co,gov|jp>静岡,岐阜|موقع
 *
 * With type "light", all rules that consist of a single label are left out:
 *     uk>co,gov|jp>静岡,岐阜
 *
 * @param {Array<string>} parsedList rules, labels separated by "."
 * @param {string} [type] one of POSSIBLE_TYPES; any falsy value selects TYPE_COMPLETE
 * @returns {string} the serialized trie
 * @throws {Error} if type is not one of POSSIBLE_TYPES
 */
function serializeTrie(parsedList, type) {
    const trieType = type || TYPE_COMPLETE;

    if (POSSIBLE_TYPES.indexOf(trieType) === -1) {
        const expectedTypes = POSSIBLE_TYPES.map(JSON.stringify).join(", ");

        throw new Error(
            `Cannot serialize trie: Unknown trie type "${ trieType }". Expected type to be one of ${ expectedTypes }`
        );
    }

    const keepLine = LINE_FILTERS[trieType];
    const lines = [];

    for (const rule of parsedList) {
        const labels = rule.split(".");

        if (keepLine(labels)) {
            lines.push(labels.reverse());
        }
    }

    lines.sort((lineA, lineB) => compareLinesAt(lineA, lineB, 0));

    return lines.map(lineToString).join("");
}

serializeTrie.TYPE_COMPLETE = TYPE_COMPLETE;
serializeTrie.TYPE_LIGHT = TYPE_LIGHT;

module.exports = serializeTrie;

0 comments on commit 59f951b

Please sign in to comment.