feat: Use trie data structure to decrease file size (#33)

peerigon · May 30, 2018 · 59f951b · 59f951b
1 parent 0143558
commit 59f951b
Show file tree

Hide file tree

Showing 30 changed files with 2,102 additions and 985 deletions.
diff --git a/.gitignore b/.gitignore
@@ -34,3 +34,6 @@ node_modules
 
 # Optional REPL history
 .node_repl_history
+
+# We don't check in the compiled lists. They are created upon npm publish and npm install.
+build/tries
diff --git a/README.md b/README.md
@@ -10,13 +10,15 @@ parse-domain
 
 Since domains are handled differently across different countries and organizations, splitting a URL into sub-domain, domain and top-level-domain parts is not a simple regexp. **parse-domain** uses a [large list of known top-level domains](https://publicsuffix.org/list/public_suffix_list.dat) from publicsuffix.org to recognize different parts of the domain.
 
+This module uses a [trie](https://en.wikipedia.org/wiki/Trie) data structure under the hood to ensure the smallest possible library size and the fastest lookup. The library is roughly 30 KB minified and gzipped. Since the list on publicsuffix.org is frequently updated, the data structure is built on `npm install` as a `postinstall` hook. If something goes wrong during that step, the library falls back to a prebuilt list that was built at the time of publishing.
+
 <br />
 
 Installation
 ------------------------------------------------------------------------
 
 ```sh
-npm install --save parse-domain
+npm install parse-domain
 ```
 
 <br />

diff --git a/lib/build/buildRegex.js b/lib/build/buildRegex.js
diff --git a/lib/parseDomain.js b/lib/parseDomain.js
@@ -1,29 +1,36 @@
 "use strict";
 
-const knownTlds = require("./tld.js");
 const normalize = require("./normalize.js");
+const lookUp = require("./tries/lookUp");
+const icannTrie = require("../lists/icann.complete");
+const privateTrie = require("../lists/private.complete");
 
 const urlParts = /^(https?:\/\/)?([^/]*@)?(.+?)(:\d{2,5})?([/?].*)?$/; // 1 = protocol, 2 = auth, 3 = domain, 4 = port, 5 = path
 const dot = /\./g;
+const emptyArr = [];
 
 function matchTld(domain, options) {
-    let tld = null;
-
     // for potentially unrecognized tlds, try matching against custom tlds
     if (options.customTlds) {
         // try matching against a built regexp of custom tlds
-        tld = domain.match(options.customTlds);
+        const tld = domain.match(options.customTlds);
+
+        if (tld !== null) {
+            return tld[0]; // custom matches are returned exactly as matched by the regexp
+        }
     }
 
-    // If no custom tlds, check if tld is supported
-    if (tld === null) {
-        tld = domain.match(options.privateTlds ? knownTlds : knownTlds.icann);
-        if (tld === null) {
-            return null;
+    const tries = (options.privateTlds ? [privateTrie] : emptyArr).concat(icannTrie); // the private trie (if enabled) is consulted before the ICANN trie
+
+    for (const trie of tries) {
+        const tld = lookUp(trie, domain);
+
+        if (tld !== null) {
+            return "." + tld; // lookUp returns the suffix without a leading dot
         }
     }
 
-    return tld[0];
+    return null;
 }
 
 /**

diff --git a/lib/tld.js b/lib/tld.js
diff --git a/lib/tries/lookUp.js b/lib/tries/lookUp.js
@@ -0,0 +1,37 @@
+"use strict";
+
+const WILDCARD = "*"; // trie key marking a wildcard rule on that level
+const EXCEPTION = "!"; // prefix of trie keys that exempt a label from the wildcard on that level
+
+function lookUp(trie, hostname) { // Returns the suffix of hostname that is found in trie (no leading dot), or null if not even the last label matches
+    const domains = hostname.split(".").reverse(); // labels ordered from the top-level label downwards
+    const tlds = [];
+    let currentTrie = trie;
+
+    for (let i = 0; i < domains.length; i++) {
+        const domain = domains[i];
+        const isWildcardRule = currentTrie.has(WILDCARD);
+
+        if (isWildcardRule) {
+            if (currentTrie.has(EXCEPTION + domain) === false) { // an exception entry for this label overrides the wildcard
+                tlds.push(domain);
+            }
+            break; // a wildcard level always ends the walk, whether or not the label was accepted
+        }
+        if (currentTrie.has(domain) === false) {
+            break;
+        }
+        tlds.push(domain);
+
+        const value = currentTrie.get(domain);
+
+        if (value === true) { // true marks a leaf; any other value is used as the nested trie for the next label
+            break;
+        }
+        currentTrie = value;
+    }
+
+    return tlds.length === 0 ? null : tlds.reverse().join(".");
+}
+
+module.exports = lookUp;
diff --git a/lib/tries/parsePubSuffixList.js b/lib/tries/parsePubSuffixList.js
@@ -0,0 +1,52 @@
+"use strict";
+
+const matchNewLine = /\r?\n/; // splits on LF as well as CRLF line endings
+const matchComments = /^\s*\/\//;
+const matchWhitespace = /^\s*$/;
+const lists = [ // one entry per section of the list; each section is delimited by its start and end marker comment
+    {
+        name: "icann",
+        markers: {
+            start: "// ===BEGIN ICANN DOMAINS===",
+            end: "// ===END ICANN DOMAINS===",
+        },
+    },
+    {
+        name: "private",
+        markers: {
+            start: "// ===BEGIN PRIVATE DOMAINS===",
+            end: "// ===END PRIVATE DOMAINS===",
+        },
+    },
+];
+
+function isWanted(line) { // true for lines that are neither comment lines nor empty/whitespace-only
+    return matchComments.test(line) === false && matchWhitespace.test(line) === false;
+}
+
+function parsePubSuffixList(listContent) { // Splits the raw list text into an object with one array of rule lines per entry in lists, keyed by list name; throws if a marker is missing
+    return lists
+        .map(list => {
+            const start = listContent.indexOf(list.markers.start);
+            const end = listContent.indexOf(list.markers.end);
+
+            if (start === -1) {
+                throw new Error(`Missing start marker of ${ list.name } list`);
+            }
+            if (end === -1) {
+                throw new Error(`Missing end marker of ${ list.name } list`);
+            }
+
+            return listContent.slice(start, end); // still contains the start marker line, but that is a comment and gets filtered out below
+        })
+        .map(listContent => listContent.split(matchNewLine).filter(isWanted))
+        .reduce((result, lines, i) => {
+            const listName = lists[i].name; // map() keeps the order, so index i still identifies the list
+
+            result[listName] = lines;
+
+            return result;
+        }, {});
+}
+
+module.exports = parsePubSuffixList;
diff --git a/lib/tries/parseTrie.js b/lib/tries/parseTrie.js
@@ -0,0 +1,56 @@
+"use strict";
+
+const SEPARATORS = require("./separators");
+
+function parseTrie(input) { // Rebuilds the nested Map structure from a serialized trie string and returns the top-level Map
+    let map = new Map(); // the map that is currently being filled
+    const parentMaps = [map]; // stack of ancestor maps; index 0 holds the top-level map
+    let domain = ""; // characters of the label read so far
+
+    function setDomain(value) { // stores the pending label (if there is one) in the current map and clears it
+        if (domain === "") {
+            return;
+        }
+        map.set(domain, value);
+        domain = "";
+    }
+
+    for (let i = 0; i < input.length; i++) {
+        const char = input.charAt(i);
+
+        switch (char) {
+            case SEPARATORS.SAME: { // label ends here, the next label goes into the same map
+                setDomain(true);
+                continue;
+            }
+            case SEPARATORS.DOWN: { // label gets a child map which becomes the current map
+                const childMap = new Map();
+
+                setDomain(childMap);
+                parentMaps.push(map);
+                map = childMap;
+                continue;
+            }
+            case SEPARATORS.RESET: {
+                setDomain(true);
+                // Remove all parent maps but the top most
+                parentMaps.length = 1;
+                map = parentMaps[0];
+                continue;
+            }
+            case SEPARATORS.UP: { // label ends here, continue in the map that was current before the last DOWN
+                setDomain(true);
+                map = parentMaps.pop();
+                continue;
+            }
+        }
+
+        domain += char; // any non-separator character belongs to the current label
+    }
+
+    setDomain(true); // flush the last label, which is not followed by a separator
+
+    return parentMaps[0];
+}
+
+module.exports = parseTrie;
diff --git a/lib/tries/separators.js b/lib/tries/separators.js
@@ -0,0 +1,8 @@
+"use strict";
+
+module.exports = {
+    UP: "<", // one level up
+    SAME: ",", // same level
+    DOWN: ">", // one level down
+    RESET: "|", // back to the top level, the next label starts a new top-level entry
+};
diff --git a/lib/tries/serializeTrie.js b/lib/tries/serializeTrie.js
@@ -0,0 +1,115 @@
+"use strict";
+
+const SEPARATORS = require("./separators");
+
+const TYPE_COMPLETE = "complete";
+const TYPE_LIGHT = "light";
+const POSSIBLE_TYPES = [TYPE_COMPLETE, TYPE_LIGHT];
+const LINE_FILTERS = { // per-type predicate that receives a line already split into its labels
+    [TYPE_COMPLETE]: () => true,
+    [TYPE_LIGHT]: line => line.length > 1, // drops entries that consist of a single label
+};
+
+function compareLinesAt(lineA, lineB, i) { // Sort comparator for two label arrays: compares them label by label, starting at index i
+    const endOfLineA = i === lineA.length;
+    const endOfLineB = i === lineB.length;
+
+    if (endOfLineA || endOfLineB) {
+        return lineA.length - lineB.length; // one line ran out of labels: the shorter line sorts first, equal lengths compare as 0
+    }
+
+    return lineA[i].localeCompare(lineB[i]) || compareLinesAt(lineA, lineB, i + 1); // only recurse when the labels at i compare as equal
+}
+
+function findIndexOfDifference(lineA, lineB) { // Returns the first index at which both label arrays differ, or -1 if they are identical
+    const maxLength = Math.max(lineA.length, lineB.length); // go past the end of the shorter line so that a pure prefix still counts as a difference
+    let i;
+
+    for (i = 0; i < maxLength; i++) {
+        if (lineA[i] !== lineB[i]) {
+            return i;
+        }
+    }
+
+    return -1;
+}
+
+function lineToString(line, i, arr) {
+    let indexOfDifference = 0;
+    let separatorFromPrev = "";
+
+    if (i > 0) {
+        const prevLine = arr[i - 1];
+
+        indexOfDifference = findIndexOfDifference(line, prevLine);
+        if (indexOfDifference === -1) {
+            // Identical lines
+            return "";
+        }
+        if (indexOfDifference === 0) {
+            // line and prevLine are completely different
+            separatorFromPrev = SEPARATORS.RESET;
+        } else if (prevLine.length === line.length && indexOfDifference === line.length - 1) {
+            // only the last part of line and prevLine are different
+            separatorFromPrev = SEPARATORS.SAME;
+        } else if (indexOfDifference > prevLine.length - 1) {
+            // we don't need to go up the hierarchy first because prevLine is part of line
+            // so let's just start with an initial down separator
+            separatorFromPrev = SEPARATORS.DOWN;
+        } else {
+            // line and prevLine are different, but share a common root at indexOfDifference - 1
+            // we now need to go up the hierarchy to the common root
+            separatorFromPrev = new Array(prevLine.length - indexOfDifference - 1).fill(SEPARATORS.UP)
+                .join("");
+        }
+    }
+
+    return separatorFromPrev + line.slice(indexOfDifference).join(SEPARATORS.DOWN);
+}
+
+function serializeTrie(parsedList, type) { // Turns an array of dotted rule strings into the compact trie string; throws on an unknown type
+    type = type || TYPE_COMPLETE;
+    /**
+     * parsedList looks like:
+     * [
+     *  "com",
+     *  "co.uk",
+     *  "gov.uk",
+     *  "静岡.jp",
+     *  "岐阜.jp",
+     *  "موقع"
+     * ]
+     *
+     * The resulting tree looks like this:
+     * com      uk          jp         موقع
+     *         /  \        /  \
+     *       co   gov   静岡   岐阜
+     *
+     * And the textual representation of the trie looks like (using SEPARATORS):
+     * com|uk>co,gov|jp>静岡,岐阜|موقع
+     *
+     * With type "light", all domains with no subdomain are excluded from the serialized trie:
+     * uk>co,gov|jp>静岡,岐阜
+     */
+
+    if (POSSIBLE_TYPES.indexOf(type) === -1) {
+        throw new Error(
+            `Cannot serialize trie: Unknown trie type "${ type }". Expected type to be one of ${ POSSIBLE_TYPES.map(
+                JSON.stringify
+            ).join(", ") }`
+        );
+    }
+
+    return parsedList
+        .map(line => line.split("."))
+        .filter(LINE_FILTERS[type])
+        .map(line => line.reverse()) // top-level label first, so lines with a common root share a prefix
+        .sort((lineA, lineB) => compareLinesAt(lineA, lineB, 0)) // lineToString only looks at the previous line, so lines sharing a prefix must be adjacent
+        .map(lineToString)
+        .join("");
+}
+
+serializeTrie.TYPE_COMPLETE = TYPE_COMPLETE;
+serializeTrie.TYPE_LIGHT = TYPE_LIGHT;
+
+module.exports = serializeTrie;