Skip to content

Commit

Permalink
Add pygments2chroma_xml.py
Browse files Browse the repository at this point in the history
This script automatically generates a lexer in xml format from Pygments.
  • Loading branch information
Forest0923 authored and alecthomas committed Oct 6, 2022
1 parent 65c601b commit 739028e
Showing 1 changed file with 191 additions and 0 deletions.
191 changes: 191 additions & 0 deletions _tools/pygments2chroma_xml.py
@@ -0,0 +1,191 @@
import functools
import importlib
import json
import os
import re
import sys
import types
import html

import pystache
from pygments import lexer as pygments_lexer
from pygments.token import _TokenType

# Mustache template for the generated lexer XML document.  The view object
# supplies name/aliases/filenames/mimetypes/tokens plus the re_* flag
# sections.  Per Mustache semantics, double-brace tags are HTML-escaped while
# the triple-brace {{{.}}} inserts each pre-built <rule> element verbatim.
TEMPLATE = r'''
<lexer>
<config>
<name>{{name}}</name>
{{#aliases}}
<alias>{{alias}}</alias>
{{/aliases}}
{{#filenames}}
<filename>{{filename}}</filename>
{{/filenames}}
{{#mimetypes}}
<mime_type>{{mimetype}}</mime_type>
{{/mimetypes}}
{{#re_ignorecase}}
<case_insensitive>true</case_insensitive>
{{/re_ignorecase}}
{{#re_dotall}}
<dot_all>true</dot_all>
{{/re_dotall}}
{{#re_not_multiline}}
<not_multiline>true</not_multiline>
{{/re_not_multiline}}
</config>
<rules>
{{#tokens}}
<state name="{{state}}">
{{#rules}}
{{{.}}}
{{/rules}}
</state>
{{/tokens}}
</rules>
</lexer>
'''


def xml_regex(s):
    """Return regex source *s* as a quoted, XML-escaped attribute value.

    Regex patterns need no treatment beyond ordinary string quoting, so
    this simply defers to ``xml_string``.
    """
    quoted = xml_string(s)
    return quoted

def xml_string(s):
    """Return *s* HTML-escaped and wrapped in double quotes.

    The result is ready to be placed directly after ``attr=`` in an XML tag.
    """
    return '"{}"'.format(html.escape(s))


def to_camel_case(snake_str):
    """Convert ``snake_case`` text to ``CamelCase``.

    The input is split on underscores and every piece is title-cased with
    ``str.title`` before the pieces are concatenated.
    """
    return ''.join(map(str.title, snake_str.split('_')))


def warning(message):
    """Write *message* to standard error as a ``warning: ...`` line."""
    line = 'warning: ' + message
    sys.stderr.write(line + '\n')


def resolve_emitter(emitter):
    """Translate a Pygments rule action into Chroma XML emitter markup.

    Supported inputs:

    * a token type            -> ``<token type="..."/>``
    * a ``bygroups`` callback -> ``<bygroups>...</bygroups>`` (resolved recursively)
    * a ``using`` callback    -> ``<usingself state="..."/>`` when its first
      closure cell is an options dict, or ``<using lexer="..."/>`` when it
      is a lexer class
    * ``None``                -> the literal text ``'None'``

    Any other function is reported through ``warning`` and rendered as a
    ``?? ... ??`` placeholder so the output can be fixed up by hand.

    Raises:
        ValueError: for an unsupported emitter type, a ``using`` target that
            is neither an options dict nor a lexer class, or a ``using``
            options dict carrying keys other than ``stack``.
    """
    if isinstance(emitter, types.FunctionType):
        # bygroups()/using() hand back closures; they are recognised by the
        # qualified name embedded in the function repr.
        description = repr(emitter)

        if description.startswith('<function bygroups.'):
            group_emitters = emitter.__closure__[0].cell_contents
            inner = ''.join(resolve_emitter(e) for e in group_emitters)
            return '<bygroups>%s</bygroups>' % inner

        if description.startswith('<function using.'):
            target = emitter.__closure__[0].cell_contents
            if isinstance(target, dict):
                # Work on a copy: the dict lives in the callback's closure,
                # and popping from it directly would change the result of a
                # second call and alter the callback itself.
                options = dict(target)
                stack = options.pop('stack', None)
                if options:
                    raise ValueError('unsupported "using" options %r' % (options,))
                state = 'root' if stack is None else stack[1]
                return '<usingself state="%s"/>' % state
            # Guard with isinstance(type) so non-class values reach the
            # ValueError below instead of a TypeError from issubclass().
            if isinstance(target, type) and issubclass(target, pygments_lexer.Lexer):
                name = target.__name__
                if name.endswith('Lexer'):
                    name = name[:-5]
                # Delegation to another lexer is keyed by the lexer name.
                return '<using lexer="%s"/>' % name
            raise ValueError('only support "using" with lexer classes, not %r' % (target,))

        warning('unsupported emitter function %r' % emitter)
        return '?? %r ??' % emitter

    if isinstance(emitter, _TokenType):
        # 'Token.Name.Function' -> 'TokenNameFunction' -> 'NameFunction'
        return '<token type="%s"/>' % str(emitter).replace('.', '')[5:]

    if emitter is None:
        return 'None'

    raise ValueError('unsupported emitter type %r' % (emitter,))


def process_state_action(action):
    """Convert a Pygments state transition into a tuple of XML mutators.

    A tuple of actions is converted element by element and the resulting
    tuples are concatenated.  A plain state name becomes a push of that
    state; ``#pop``, ``#pop:N``, ``#push`` and ``#push:NAME`` map onto the
    corresponding ``<pop>`` / ``<push>`` elements.

    Raises:
        ValueError: for a ``#``-prefixed action that is not recognised.
    """
    if isinstance(action, tuple):
        pieces = [process_state_action(item) for item in action]
        return functools.reduce(lambda left, right: left + right, pieces)

    # Anything without the '#' marker is simply the name of a state to enter.
    if not action.startswith('#'):
        return ('<push state="%s"/>' % action,)

    directive = action[1:]
    if directive == 'pop':
        return ('<pop depth="1"/>',)
    if directive.startswith('pop:'):
        return ('<pop depth="%s"/>' % directive[4:],)
    if directive == 'push':
        return ('<push/>',)
    if directive.startswith('push:'):
        return ('<push state="%s"/>' % directive[5:],)
    raise ValueError('unsupported action %r' % (directive,))


def translate_rules(rules):
    """Translate the rules of one Pygments lexer state into ``<rule>`` XML strings.

    Each entry of *rules* may be:

    * a ``(regex, emitter[, new_state])`` tuple, where ``regex`` is a
      pattern string or a ``pygments.lexer.words`` instance,
    * a ``pygments.lexer.include`` naming another state, or
    * a ``pygments.lexer.default`` state transition.

    Returns:
        A list of XML strings, one per input rule, in the original order.

    Raises:
        ValueError: for an unsupported rule, pattern or state-transition type.
    """
    out = []
    for rule in rules:
        if isinstance(rule, tuple):
            pattern = rule[0]
            if isinstance(pattern, str):
                pattern = xml_regex(pattern)
            elif isinstance(pattern, pygments_lexer.words):
                # words() holds literal strings, so regex metacharacters
                # (e.g. '+', '.', '(') must be escaped before the words are
                # joined into an alternation.
                alternatives = '|'.join(re.escape(word) for word in pattern.words)
                pattern = xml_regex('%s(%s)%s' % (pattern.prefix, alternatives, pattern.suffix))
            else:
                raise ValueError('expected regex string but got %r' % (pattern,))

            emitter = resolve_emitter(rule[1])

            if len(rule) == 2:
                modifier = ''
            else:
                transition = rule[2]
                if isinstance(transition, str):
                    modifier = process_state_action(transition)[0]
                # combined is checked before the plain-tuple case so that it
                # gets its own element.
                elif isinstance(transition, pygments_lexer.combined):
                    modifier = '<combined state="%s"/>' % '" state="'.join(transition)
                elif isinstance(transition, tuple):
                    modifier = '<push state="%s"/>' % '" state="'.join(transition)
                else:
                    raise ValueError('unsupported modifier %r' % (transition,))

            out.append('<rule pattern={}>{}{}</rule>'.format(pattern, emitter, modifier))
        elif isinstance(rule, pygments_lexer.include):
            out.append('<rule><include state="{}"/></rule>'.format(rule))
        elif isinstance(rule, pygments_lexer.default):
            mutators = ''.join(process_state_action(rule.state))
            out.append('<rule>{}</rule>'.format(mutators))
        else:
            raise ValueError('unsupported rule %r' % (rule,))
    return out


class TemplateView:
    """Attribute bag used as the view object when rendering the template.

    Every keyword argument becomes an instance attribute.  The ``re_*``
    methods derive values from the ``regex_flags`` attribute.
    """

    def __init__(self, **kwargs):
        # Store each keyword as a plain instance attribute.
        vars(self).update(kwargs)

    def re_not_multiline(self):
        """Return True when ``re.MULTILINE`` is absent from the flags."""
        multiline_bits = self.regex_flags & re.MULTILINE
        return not multiline_bits

    def re_dotall(self):
        """Return the ``re.DOTALL`` bit of the flags (truthy when set)."""
        flags = self.regex_flags
        return flags & re.DOTALL

    def re_ignorecase(self):
        """Return the ``re.IGNORECASE`` bit of the flags (truthy when set)."""
        flags = self.regex_flags
        return flags & re.IGNORECASE


def main():
    """Render the Pygments lexer named on the command line as lexer XML.

    Expects exactly one argument of the form ``package.module.LexerClass``.
    The class is imported, checked to be a ``RegexLexer`` subclass, and its
    metadata and token definitions are rendered through ``TEMPLATE`` to
    standard output.

    Exits with an error message on standard error when the argument is
    missing or malformed, or when the named symbol is not a ``RegexLexer``
    subclass.
    """
    # Validate the command line up front so a missing or dot-less argument
    # gives a usage message rather than an IndexError/ValueError traceback.
    if len(sys.argv) != 2 or '.' not in sys.argv[1]:
        sys.exit('usage: %s <package.module.LexerClass>' % sys.argv[0])

    package_name, symbol_name = sys.argv[1].rsplit(sep=".", maxsplit=1)

    package = importlib.import_module(package_name)

    lexer_cls = getattr(package, symbol_name)

    # Explicit check instead of assert so it still runs under ``python -O``.
    if not (isinstance(lexer_cls, type)
            and issubclass(lexer_cls, pygments_lexer.RegexLexer)):
        sys.exit('error: can only translate from RegexLexer')

    tokens = [
        {'state': state, 'rules': translate_rules(rules)}
        for (state, rules) in lexer_cls.get_tokendefs().items()
    ]

    print(pystache.render(TEMPLATE, TemplateView(
        name=lexer_cls.name,
        regex_flags=lexer_cls.flags,
        aliases=[{'alias': alias} for alias in lexer_cls.aliases],
        filenames=[{'filename': filename} for filename in lexer_cls.filenames],
        mimetypes=[{'mimetype': mimetype} for mimetype in lexer_cls.mimetypes],
        tokens=tokens,
    )))


# Run the converter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()

0 comments on commit 739028e

Please sign in to comment.