Skip to content

Commit

Permalink
(enh) private __emitTokens callback to allow custom grammar parsers (
Browse files Browse the repository at this point in the history
…#3620)

* private __emitTokens API
* remove addKeyword from Emitter API
* use language: namespace scope prefix to handle sublanguages
* add emitTokens docs
  • Loading branch information
joshgoebel committed Mar 19, 2023
1 parent aa58ffa commit 11bce6a
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 51 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Expand Up @@ -3,6 +3,7 @@
Improvements:

- added a function to default export to generate a fresh highlighter instance to be used by extensions [WisamMechano][]
- added BETA `__emitTokens` key to grammars to allow them to direct their own parsing, only using Highlight.js for the HTML rendering [Josh Goebel][]

New Grammars:

Expand All @@ -24,6 +25,7 @@ Parser:

- add removePlugin api [faga295][]

[Josh Goebel]: https://github.com/joshgoebel
[Timur Kamaev]: https://github.com/doiftrue
[Leopard20]: https://github.com/Leopard20/
[WisamMechano]: https://github.com/wisammechano
Expand Down
46 changes: 46 additions & 0 deletions docs/mode-reference.rst
Expand Up @@ -105,6 +105,52 @@ Disables autodetection for this language.
(defaults to false, meaning auto-detect is enabled)


__emitTokens
^^^^^^^^^^^^

.. warning::

**This is currently still private/beta API**, though it's expected to be fairly stable.

It should land in version 12.0.

Allows grammars to bundle custom parsers - bypassing the default parser and grammar mode definitions. This should be a function that accepts the raw source code as the first argument and an "Emitter" object as the second.

A custom parser may parse the source as it sees fit - making calls to the Emitter along the way - allowing Highlight.js to generate and theme the final HTML.

The **Emitter** API is trivial:

- ``addText(text)``
- ``startScope(name)``
- ``endScope()``

Given:

::

hello beautiful world!


Assuming ``beautiful`` is a keyword, our Emitter calls might look something like:

::

addText("hello ")
startScope("keyword")
addText("beautiful")
endScope()
addText(" world!")

Resulting in the following generated HTML:

.. code-block:: html

hello <span class="hljs-keyword">beautiful</span> world!

.. note::

The intended use of ``addText`` is larger chunks of plain text, not individual characters. Custom parsers should buffer plain text output into complete strings rather than sending output one character at a time.

compilerExtensions
^^^^^^^^^^^^^^^^^^

Expand Down
67 changes: 42 additions & 25 deletions src/highlight.js
Expand Up @@ -15,6 +15,7 @@ import * as packageJSON from '../package.json';
import * as logger from "./lib/logger.js";
import HTMLInjectionError from "./lib/html_injection_error.js";


/**
@typedef {import('highlight.js').Mode} Mode
@typedef {import('highlight.js').CompiledMode} CompiledMode
Expand Down Expand Up @@ -224,7 +225,7 @@ const HLJS = function(hljs) {
buf += match[0];
} else {
const cssClass = language.classNameAliases[kind] || kind;
emitter.addKeyword(match[0], cssClass);
emitKeyword(match[0], cssClass);
}
} else {
buf += match[0];
Expand Down Expand Up @@ -259,7 +260,7 @@ const HLJS = function(hljs) {
if (top.relevance > 0) {
relevance += result.relevance;
}
emitter.addSublanguage(result._emitter, result.language);
emitter.__addSublanguage(result._emitter, result.language);
}

function processBuffer() {
Expand All @@ -271,6 +272,18 @@ const HLJS = function(hljs) {
modeBuffer = '';
}

/**
* Emits `keyword` wrapped in a single scope: starts `scope` on the emitter,
* adds the keyword text, then ends the scope. An empty keyword emits nothing.
*
* @param {string} keyword
* @param {string} scope
*/
function emitKeyword(keyword, scope) {
// nothing to emit, so return before a scope is opened
if (keyword === "") return;

emitter.startScope(scope);
emitter.addText(keyword);
emitter.endScope();
}

/**
* @param {CompiledScope} scope
* @param {RegExpMatchArray} match
Expand All @@ -283,7 +296,7 @@ const HLJS = function(hljs) {
const klass = language.classNameAliases[scope[i]] || scope[i];
const text = match[i];
if (klass) {
emitter.addKeyword(text, klass);
emitKeyword(text, klass);
} else {
modeBuffer = text;
processKeywords();
Expand All @@ -304,7 +317,7 @@ const HLJS = function(hljs) {
if (mode.beginScope) {
// beginScope just wraps the begin match itself in a scope
if (mode.beginScope._wrap) {
emitter.addKeyword(modeBuffer, language.classNameAliases[mode.beginScope._wrap] || mode.beginScope._wrap);
emitKeyword(modeBuffer, language.classNameAliases[mode.beginScope._wrap] || mode.beginScope._wrap);
modeBuffer = "";
} else if (mode.beginScope._multi) {
// at this point modeBuffer should just be the match
Expand Down Expand Up @@ -415,7 +428,7 @@ const HLJS = function(hljs) {
const origin = top;
if (top.endScope && top.endScope._wrap) {
processBuffer();
emitter.addKeyword(lexeme, top.endScope._wrap);
emitKeyword(lexeme, top.endScope._wrap);
} else if (top.endScope && top.endScope._multi) {
processBuffer();
emitMultiClass(top.endScope, match);
Expand Down Expand Up @@ -558,30 +571,34 @@ const HLJS = function(hljs) {
let resumeScanAtSamePosition = false;

try {
top.matcher.considerAll();

for (;;) {
iterations++;
if (resumeScanAtSamePosition) {
// only regexes not matched previously will now be
// considered for a potential match
resumeScanAtSamePosition = false;
} else {
top.matcher.considerAll();
}
top.matcher.lastIndex = index;
if (!language.__emitTokens) {
top.matcher.considerAll();

for (;;) {
iterations++;
if (resumeScanAtSamePosition) {
// only regexes not matched previously will now be
// considered for a potential match
resumeScanAtSamePosition = false;
} else {
top.matcher.considerAll();
}
top.matcher.lastIndex = index;

const match = top.matcher.exec(codeToHighlight);
// console.log("match", match[0], match.rule && match.rule.begin)
const match = top.matcher.exec(codeToHighlight);
// console.log("match", match[0], match.rule && match.rule.begin)

if (!match) break;
if (!match) break;

const beforeMatch = codeToHighlight.substring(index, match.index);
const processedCount = processLexeme(beforeMatch, match);
index = match.index + processedCount;
const beforeMatch = codeToHighlight.substring(index, match.index);
const processedCount = processLexeme(beforeMatch, match);
index = match.index + processedCount;
}
processLexeme(codeToHighlight.substring(index));
} else {
language.__emitTokens(codeToHighlight, emitter);
}
processLexeme(codeToHighlight.substring(index));
emitter.closeAllNodes();

emitter.finalize();
result = emitter.toHTML();

Expand Down
16 changes: 9 additions & 7 deletions src/lib/html_renderer.js
Expand Up @@ -21,7 +21,7 @@ const SPAN_CLOSE = '</span>';
const emitsWrappingTags = (node) => {
// rarely we can have a sublanguage where language is undefined
// TODO: track down why
return !!node.scope || (node.sublanguage && node.language);
return !!node.scope;
};

/**
Expand All @@ -30,13 +30,19 @@ const emitsWrappingTags = (node) => {
* @param {{prefix:string}} options
*/
const scopeToCSSClass = (name, { prefix }) => {
  // Sub-language scopes ("language:xml") become "language-xml"; the prefix
  // is not applied to them.
  const isSublanguage = name.startsWith("language:");
  if (isSublanguage) return name.replace("language:", "language-");

  // Simple scope ("keyword"): one prefixed class.
  if (!name.includes(".")) return `${prefix}${name}`;

  // Tiered scope ("comment.line"): the first piece gets the prefix, and each
  // later piece gets one more trailing underscore than the piece before it.
  const [head, ...tiers] = name.split(".");
  const classes = [`${prefix}${head}`];
  tiers.forEach((tier, depth) => {
    classes.push(`${tier}${"_".repeat(depth + 1)}`);
  });
  return classes.join(" ");
};

Expand Down Expand Up @@ -69,12 +75,8 @@ export default class HTMLRenderer {
openNode(node) {
if (!emitsWrappingTags(node)) return;

let className = "";
if (node.sublanguage) {
className = `language-${node.language}`;
} else {
className = scopeToCSSClass(node.scope, { prefix: this.classPrefix });
}
const className = scopeToCSSClass(node.scope,
{ prefix: this.classPrefix });
this.span(className);
}

Expand Down
34 changes: 15 additions & 19 deletions src/lib/token_tree.js
Expand Up @@ -106,13 +106,11 @@ class TokenTree {
Minimal interface:
- addKeyword(text, scope)
- addText(text)
- addSublanguage(emitter, subLanguageName)
- __addSublanguage(emitter, subLanguageName)
- startScope(scope)
- endScope()
- finalize()
- openNode(scope)
- closeNode()
- closeAllNodes()
- toHTML()
*/
Expand All @@ -131,34 +129,31 @@ export default class TokenTreeEmitter extends TokenTree {

/**
* @param {string} text
* @param {string} scope
*/
addKeyword(text, scope) {
addText(text) {
if (text === "") { return; }

this.openNode(scope);
this.addText(text);
this.closeNode();
this.add(text);
}

/**
* @param {string} text
*/
addText(text) {
if (text === "") { return; }
/** @param {string} scope */
startScope(scope) {
this.openNode(scope);
}

this.add(text);
endScope() {
this.closeNode();
}

/**
* @param {Emitter & {root: DataNode}} emitter
* @param {string} name
*/
addSublanguage(emitter, name) {
__addSublanguage(emitter, name) {
/** @type DataNode */
const node = emitter.root;
node.sublanguage = true;
node.language = name;
if (name) node.scope = `language:${name}`;

this.add(node);
}

Expand All @@ -168,6 +163,7 @@ export default class TokenTreeEmitter extends TokenTree {
}

/**
* Calls `closeAllNodes()` on this emitter, then returns `true`.
*/
finalize() {
this.closeAllNodes();
return true;
}
}

0 comments on commit 11bce6a

Please sign in to comment.