(enh) private __emitTokens callback to allow custom grammar parsers (…

…#3620) * private __emitTokens API * remove addKeyword from Emitter API * use language: namespace scope prefix to handle sublanguages * add emitTokens docs
highlightjs · Mar 19, 2023 · 11bce6a · 11bce6a
1 parent aa58ffa
commit 11bce6a
Show file tree

Hide file tree

Showing 5 changed files with 114 additions and 51 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -3,6 +3,7 @@
 Improvements:
 
 - added a function to default export to generate a fresh highlighter instance to be used by extensions [WisamMechano][]
+- added BETA `__emitTokens` key to grammars to allow them to direct their own parsing, only using Highlight.js for the HTML rendering [Josh Goebel][]
 
 New Grammars:
 
@@ -24,6 +25,7 @@ Parser:
 
 - add removePlugin api [faga295][]
 
+[Josh Goebel]: https://github.com/joshgoebel
 [Timur Kamaev]: https://github.com/doiftrue
 [Leopard20]: https://github.com/Leopard20/
 [WisamMechano]: https://github.com/wisammechano

diff --git a/docs/mode-reference.rst b/docs/mode-reference.rst
@@ -105,6 +105,52 @@ Disables autodetection for this language.
 (defaults to false, meaning auto-detect is enabled)
 
 
+__emitTokens
+^^^^^^^^^^^^
+
+.. warning::
+
+  **This is currently still private/beta API**, though it's expected to be fairly stable.
+
+  It should land in version 12.0.
+
+Allows grammars to bundle custom parsers - bypassing the default parser and grammar mode definitions.  This should be a function that accepts the raw source code as the first argument and an "Emitter" object as the second.
+
+A custom parser may parse the source as it sees fit - making calls to the Emitter along the way - allowing Highlight.js to generate and theme the final HTML.
+
+The **Emitter** API is trivial:
+
+- ``addText(text)``
+- ``startScope(name)``
+- ``endScope()``
+
+Given:
+
+::
+
+  hello beautiful world!
+
+
+Assuming ``beautiful`` is a keyword, our Emitter calls might look something like:
+
+::
+
+  addText("hello ")
+  startScope("keyword")
+  addText("beautiful")
+  endScope()
+  addText(" world!")
+
+Resulting in the following generated HTML:
+
+.. code-block:: html
+
+  hello <span class="hljs-keyword">beautiful</span> world!
+
+.. note::
+
+  The intended use of ``addText`` is larger chunks of plain text, not individual characters.  Custom parsers should buffer plain text output into complete strings rather than sending output one character at a time.
+
 compilerExtensions
 ^^^^^^^^^^^^^^^^^^
 

diff --git a/src/highlight.js b/src/highlight.js
@@ -15,6 +15,7 @@ import * as packageJSON from '../package.json';
 import * as logger from "./lib/logger.js";
 import HTMLInjectionError from "./lib/html_injection_error.js";
 
+
 /**
 @typedef {import('highlight.js').Mode} Mode
 @typedef {import('highlight.js').CompiledMode} CompiledMode
@@ -224,7 +225,7 @@ const HLJS = function(hljs) {
             buf += match[0];
           } else {
             const cssClass = language.classNameAliases[kind] || kind;
-            emitter.addKeyword(match[0], cssClass);
+            emitKeyword(match[0], cssClass);
           }
         } else {
           buf += match[0];
@@ -259,7 +260,7 @@ const HLJS = function(hljs) {
       if (top.relevance > 0) {
         relevance += result.relevance;
       }
-      emitter.addSublanguage(result._emitter, result.language);
+      emitter.__addSublanguage(result._emitter, result.language);
     }
 
     function processBuffer() {
@@ -271,6 +272,18 @@ const HLJS = function(hljs) {
       modeBuffer = '';
     }
 
+    /**
+     * @param {string} keyword - text to emit; an empty string emits nothing
+     * @param {string} scope - scope opened around the text and closed right after
+     */
+    function emitKeyword(keyword, scope) {
+      if (keyword === "") return;
+
+      emitter.startScope(scope);
+      emitter.addText(keyword);
+      emitter.endScope();
+    }
+
     /**
      * @param {CompiledScope} scope
      * @param {RegExpMatchArray} match
@@ -283,7 +296,7 @@ const HLJS = function(hljs) {
         const klass = language.classNameAliases[scope[i]] || scope[i];
         const text = match[i];
         if (klass) {
-          emitter.addKeyword(text, klass);
+          emitKeyword(text, klass);
         } else {
           modeBuffer = text;
           processKeywords();
@@ -304,7 +317,7 @@ const HLJS = function(hljs) {
       if (mode.beginScope) {
         // beginScope just wraps the begin match itself in a scope
         if (mode.beginScope._wrap) {
-          emitter.addKeyword(modeBuffer, language.classNameAliases[mode.beginScope._wrap] || mode.beginScope._wrap);
+          emitKeyword(modeBuffer, language.classNameAliases[mode.beginScope._wrap] || mode.beginScope._wrap);
           modeBuffer = "";
         } else if (mode.beginScope._multi) {
           // at this point modeBuffer should just be the match
@@ -415,7 +428,7 @@ const HLJS = function(hljs) {
       const origin = top;
       if (top.endScope && top.endScope._wrap) {
         processBuffer();
-        emitter.addKeyword(lexeme, top.endScope._wrap);
+        emitKeyword(lexeme, top.endScope._wrap);
       } else if (top.endScope && top.endScope._multi) {
         processBuffer();
         emitMultiClass(top.endScope, match);
@@ -558,30 +571,34 @@ const HLJS = function(hljs) {
     let resumeScanAtSamePosition = false;
 
     try {
-      top.matcher.considerAll();
-
-      for (;;) {
-        iterations++;
-        if (resumeScanAtSamePosition) {
-          // only regexes not matched previously will now be
-          // considered for a potential match
-          resumeScanAtSamePosition = false;
-        } else {
-          top.matcher.considerAll();
-        }
-        top.matcher.lastIndex = index;
+      if (!language.__emitTokens) {
+        top.matcher.considerAll();
+
+        for (;;) {
+          iterations++;
+          if (resumeScanAtSamePosition) {
+            // only regexes not matched previously will now be
+            // considered for a potential match
+            resumeScanAtSamePosition = false;
+          } else {
+            top.matcher.considerAll();
+          }
+          top.matcher.lastIndex = index;
 
-        const match = top.matcher.exec(codeToHighlight);
-        // console.log("match", match[0], match.rule && match.rule.begin)
+          const match = top.matcher.exec(codeToHighlight);
+          // console.log("match", match[0], match.rule && match.rule.begin)
 
-        if (!match) break;
+          if (!match) break;
 
-        const beforeMatch = codeToHighlight.substring(index, match.index);
-        const processedCount = processLexeme(beforeMatch, match);
-        index = match.index + processedCount;
+          const beforeMatch = codeToHighlight.substring(index, match.index);
+          const processedCount = processLexeme(beforeMatch, match);
+          index = match.index + processedCount;
+        }
+        processLexeme(codeToHighlight.substring(index));
+      } else {
+        language.__emitTokens(codeToHighlight, emitter);
       }
-      processLexeme(codeToHighlight.substring(index));
-      emitter.closeAllNodes();
+
       emitter.finalize();
       result = emitter.toHTML();
 

diff --git a/src/lib/html_renderer.js b/src/lib/html_renderer.js
@@ -21,7 +21,7 @@ const SPAN_CLOSE = '</span>';
 const emitsWrappingTags = (node) => {
   // rarely we can have a sublanguage where language is undefined
   // TODO: track down why
-  return !!node.scope || (node.sublanguage && node.language);
+  return !!node.scope;
 };
 
 /**
@@ -30,13 +30,19 @@ const emitsWrappingTags = (node) => {
  * @param {{prefix:string}} options
  */
 const scopeToCSSClass = (name, { prefix }) => {
+  // sub-language: "language:xml" -> "language-xml" (prefix is not applied)
+  if (name.startsWith("language:")) {
+    return name.replace("language:", "language-");
+  }
+  // tiered scope: "comment.line" -> "<prefix>comment line_"
   if (name.includes(".")) {
     const pieces = name.split(".");
     return [
       `${prefix}${pieces.shift()}`,
       ...(pieces.map((x, i) => `${x}${"_".repeat(i + 1)}`))
     ].join(" ");
   }
+  // simple scope: "keyword" -> "<prefix>keyword"
   return `${prefix}${name}`;
 };
 
@@ -69,12 +75,8 @@ export default class HTMLRenderer {
   openNode(node) {
     if (!emitsWrappingTags(node)) return;
 
-    let className = "";
-    if (node.sublanguage) {
-      className = `language-${node.language}`;
-    } else {
-      className = scopeToCSSClass(node.scope, { prefix: this.classPrefix });
-    }
+    const className = scopeToCSSClass(node.scope,
+      { prefix: this.classPrefix });
     this.span(className);
   }
 

diff --git a/src/lib/token_tree.js b/src/lib/token_tree.js
@@ -106,13 +106,11 @@ class TokenTree {
 
   Minimal interface:
 
-  - addKeyword(text, scope)
   - addText(text)
-  - addSublanguage(emitter, subLanguageName)
+  - __addSublanguage(emitter, subLanguageName)
+  - startScope(scope)
+  - endScope()
   - finalize()
-  - openNode(scope)
-  - closeNode()
-  - closeAllNodes()
   - toHTML()
 
 */
@@ -131,34 +129,31 @@ export default class TokenTreeEmitter extends TokenTree {
 
   /**
    * @param {string} text
-   * @param {string} scope
    */
-  addKeyword(text, scope) {
+  addText(text) {
     if (text === "") { return; }
 
-    this.openNode(scope);
-    this.addText(text);
-    this.closeNode();
+    this.add(text);
   }
 
-  /**
-   * @param {string} text
-   */
-  addText(text) {
-    if (text === "") { return; }
+  /** @param {string} scope */
+  startScope(scope) {
+    this.openNode(scope);
+  }
 
-    this.add(text);
+  endScope() {
+    this.closeNode();
   }
 
   /**
    * @param {Emitter & {root: DataNode}} emitter
    * @param {string} name
    */
-  addSublanguage(emitter, name) {
+  __addSublanguage(emitter, name) {
     /** @type DataNode */
     const node = emitter.root;
-    node.sublanguage = true;
-    node.language = name;
+    if (name) node.scope = `language:${name}`;
+
     this.add(node);
   }
 
@@ -168,6 +163,7 @@ export default class TokenTreeEmitter extends TokenTree {
   }
 
   finalize() {
+    this.closeAllNodes();
     return true;
   }
 }