Skip to content

Commit

Permalink
Perl: Improved tokenization (#3241)
Browse files Browse the repository at this point in the history
  • Loading branch information
RunDevelopment committed Dec 10, 2021
1 parent a3905c0 commit f22ea9f
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 206 deletions.
341 changes: 153 additions & 188 deletions components/prism-perl.js
@@ -1,191 +1,156 @@
Prism.languages.perl = {
'comment': [
{
// POD
pattern: /(^\s*)=\w[\s\S]*?=cut.*/m,
(function (Prism) {

// Source text of a pattern matching ONE bracket-delimited section of a Perl
// quote-like operator: (...), {...}, [...] or <...>.
//
// Inside a section, a backslash escapes any following character (including
// line breaks), and sections may span multiple lines.
//
// Perl lets bracketing delimiters nest (e.g. `m{a{1,3}}` or `q(a(b)c)`), so
// each alternative also accepts one level of balanced nested brackets of the
// same kind. Deeper nesting is not matched.
//
// Only non-capturing groups are used on purpose: callers concatenate this
// source with patterns that rely on numbered backreferences (\1, \2, \3), so
// adding a capturing group here would shift those numbers.
var brackets = '(?:' + [
	// ( ... ( ... ) ... )
	/\((?:[^()\\]|\\[\s\S]|\((?:[^()\\]|\\[\s\S])*\))*\)/.source,
	// { ... { ... } ... }
	/\{(?:[^{}\\]|\\[\s\S]|\{(?:[^{}\\]|\\[\s\S])*\})*\}/.source,
	// [ ... [ ... ] ... ]
	/\[(?:[^[\]\\]|\\[\s\S]|\[(?:[^[\]\\]|\\[\s\S])*\])*\]/.source,
	// < ... < ... > ... >
	/<(?:[^<>\\]|\\[\s\S]|<(?:[^<>\\]|\\[\s\S])*>)*>/.source
].join('|') + ')';

Prism.languages.perl = {
// Comment tokens: POD documentation blocks and `#` line comments.
'comment': [
{
// POD
// Begins with `=` plus a word character at the start of a line (leading
// whitespace is captured in group 1 and treated as lookbehind), and runs
// lazily up to the first `=cut`, including the rest of that line.
pattern: /(^\s*)=\w[\s\S]*?=cut.*/m,
lookbehind: true,
greedy: true
},
{
// `#` through the end of the line. Group 1 (lookbehind) requires the `#`
// to be at the start of the input or preceded by a character other than
// `\` or `$`, so sequences such as `$#` and `\#` do not start a comment.
pattern: /(^|[^\\$])#.*/,
lookbehind: true,
greedy: true
}
],
// TODO Could be nice to handle Heredoc too.
// String tokens: quote-like operators (q, qq, qw, qx), double-quoted and
// backtick strings, and single-line single-quoted strings.
'string': [
{
// One combined pattern for every delimiter form of q/qq/qw/qx.
// The negative lookahead rejects an alphanumeric character directly after
// the operator name, so an alphanumeric delimiter can only be reached
// through at least one whitespace character matched by `\s*`.
// The alternatives are joined into a single RegExp, so capture groups are
// numbered across alternatives: the first alternative owns group 1, the
// second owns group 2 (hence `\2` there).
pattern: RegExp(
/\b(?:q|qq|qw|qx)(?![a-zA-Z0-9])\s*/.source +
'(?:' +
[
// q/.../
// Any non-alphanumeric, non-whitespace, non-opening-bracket delimiter;
// the same character closes the string. Backslash escapes any character.
/([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1/.source,

// q a...a
// Alphanumeric delimiter (group 2), closed by the same character.
// eslint-disable-next-line regexp/strict
/([a-zA-Z0-9])(?:(?!\2)[^\\]|\\[\s\S])*\2/.source,

// q(...)
// q{...}
// q[...]
// q<...>
brackets,
].join('|') +
')'
),
greedy: true
},

// "...", `...`
// The opening quote character (group 1) also closes the string; the body
// may contain escaped characters and line breaks.
{
pattern: /("|`)(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// '...'
// FIXME Multi-line single-quoted strings are not supported as they would break variables containing '
{
pattern: /'(?:[^'\\\r\n]|\\.)*'/,
greedy: true
}
],
// Regex tokens: match operators (m, qr), substitution/transliteration
// operators (s, tr, y) and bare /.../ literals.
'regex': [
{
// m and qr with any delimiter form, followed by optional flag letters.
// As with the quote-like string operators, an alphanumeric delimiter can
// only follow after whitespace because of the negative lookahead.
// Capture groups are numbered across the joined alternatives: group 1
// belongs to the first alternative, group 2 to the second.
pattern: RegExp(
/\b(?:m|qr)(?![a-zA-Z0-9])\s*/.source +
'(?:' +
[
// m/.../
/([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1/.source,

// m a...a
// eslint-disable-next-line regexp/strict
/([a-zA-Z0-9])(?:(?!\2)[^\\]|\\[\s\S])*\2/.source,

// m(...)
// m{...}
// m[...]
// m<...>
brackets,
].join('|') +
')' +
/[msixpodualngc]*/.source
),
greedy: true
},

// The lookbehinds prevent -s from breaking
// Group 1 is the lookbehind (start of input or any character except `-`),
// which is why the delimiter groups below are numbered 2 and 3.
{
pattern: RegExp(
/(^|[^-])\b(?:s|tr|y)(?![a-zA-Z0-9])\s*/.source +
'(?:' +
[
// s/.../.../
// One delimiter character (group 2) separates and terminates both parts.
// eslint-disable-next-line regexp/strict
/([^a-zA-Z0-9\s{(\[<])(?:(?!\2)[^\\]|\\[\s\S])*\2(?:(?!\2)[^\\]|\\[\s\S])*\2/.source,

// s a...a...a
// Same as above with an alphanumeric delimiter (group 3).
// eslint-disable-next-line regexp/strict
/([a-zA-Z0-9])(?:(?!\3)[^\\]|\\[\s\S])*\3(?:(?!\3)[^\\]|\\[\s\S])*\3/.source,

// s(...)(...)
// s{...}{...}
// s[...][...]
// s<...><...>
// s(...)[...]
// Two bracketed sections, optionally separated by whitespace; the two
// sections may use different bracket kinds.
brackets + /\s*/.source + brackets,
].join('|') +
')' +
/[msixpodualngcer]*/.source
),
lookbehind: true,
greedy: true
},

// /.../
// The look-ahead tries to prevent two divisions on
// the same line from being highlighted as regex.
// This does not support multi-line regex.
// After the optional flags, the look-ahead accepts (following optional
// whitespace) the end of input, a line break, one of the listed punctuation
// characters, or one of the listed word operators.
{
pattern: /\/(?:[^\/\\\r\n]|\\.)*\/[msixpodualngc]*(?=\s*(?:$|[\r\n,.;})&|\-+*~<>!?^]|(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|x|xor)\b))/,
greedy: true
}
],

// FIXME Not sure about the handling of ::, ', and #
// Variable tokens. Each entry is a plain pattern; every form starts with a
// sigil character.
'variable': [
// ${^POSTMATCH}
// Sigil, then `{^`, one or more uppercase letters, and `}`.
/[&*$@%]\{\^[A-Z]+\}/,
// $^V
// Sigil, `^`, and a single uppercase letter or underscore.
/[&*$@%]\^[A-Z_]/,
// ${...}
// Only the sigil (and optional `#`) is matched; the `{` is a look-ahead.
/[&*$@%]#?(?=\{)/,
// $foo
// One or more name segments, each optionally preceded by `::` and/or `'`,
// that do not start with a digit; trailing `::` is included.
/[&*$@%]#?(?:(?:::)*'?(?!\d)[\w$]+(?![\w$]))+(?:::)*/,
// $1
/[&*$@%]\d+/,
// $_, @_, %!
// The negative lookahead prevents from breaking the %= operator
/(?!%=)[$@%][!"#$%&'()*+,\-.\/:;<=>?@[\\\]^_`{|}~]/
],
// Filehandle tokens, aliased to `symbol`.
'filehandle': {
// <>, <FOO>, _
// `<` must not be followed by `<` or `=`; the content is a lazy run of
// non-whitespace characters up to the first `>`. A standalone `_` also
// counts as a filehandle.
pattern: /<(?![<=])\S*?>|\b_\b/,
alias: 'symbol'
},
// Version-string tokens, aliased to `string`.
'v-string': {
// v1.2, 1.2.3
// Either `v` followed by dot-separated integers, or a bare sequence of
// integers with at least two dots (three or more components).
pattern: /v\d+(?:\.\d+)*|\d+(?:\.\d+){2,}/,
alias: 'string'
},
// Name of a subroutine declaration: the word following `sub` and at least
// one space or tab. `sub` and the whitespace are captured in group 1 and
// treated as lookbehind, so only the name itself is matched by this pattern.
'function': {
pattern: /(\bsub[ \t]+)\w+/,
lookbehind: true
},
{
pattern: /(^|[^\\$])#.*/,
lookbehind: true
}
],
// TODO Could be nice to handle Heredoc too.
'string': [
// q/.../
{
pattern: /\b(?:q|qq|qw|qx)\s*([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// q a...a
{
pattern: /\b(?:q|qq|qw|qx)\s+([a-zA-Z0-9])(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// q(...)
{
pattern: /\b(?:q|qq|qw|qx)\s*\((?:[^()\\]|\\[\s\S])*\)/,
greedy: true
},

// q{...}
{
pattern: /\b(?:q|qq|qw|qx)\s*\{(?:[^{}\\]|\\[\s\S])*\}/,
greedy: true
},

// q[...]
{
pattern: /\b(?:q|qq|qw|qx)\s*\[(?:[^[\]\\]|\\[\s\S])*\]/,
greedy: true
},

// q<...>
{
pattern: /\b(?:q|qq|qw|qx)\s*<(?:[^<>\\]|\\[\s\S])*>/,
greedy: true
},

// "...", `...`
{
pattern: /("|`)(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// '...'
// FIXME Multi-line single-quoted strings are not supported as they would break variables containing '
{
pattern: /'(?:[^'\\\r\n]|\\.)*'/,
greedy: true
}
],
'regex': [
// m/.../
{
pattern: /\b(?:m|qr)\s*([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1[msixpodualngc]*/,
greedy: true
},

// m a...a
{
pattern: /\b(?:m|qr)\s+([a-zA-Z0-9])(?:(?!\1)[^\\]|\\[\s\S])*\1[msixpodualngc]*/,
greedy: true
},

// m(...)
{
pattern: /\b(?:m|qr)\s*\((?:[^()\\]|\\[\s\S])*\)[msixpodualngc]*/,
greedy: true
},

// m{...}
{
pattern: /\b(?:m|qr)\s*\{(?:[^{}\\]|\\[\s\S])*\}[msixpodualngc]*/,
greedy: true
},

// m[...]
{
pattern: /\b(?:m|qr)\s*\[(?:[^[\]\\]|\\[\s\S])*\][msixpodualngc]*/,
greedy: true
},

// m<...>
{
pattern: /\b(?:m|qr)\s*<(?:[^<>\\]|\\[\s\S])*>[msixpodualngc]*/,
greedy: true
},

// The lookbehinds prevent -s from breaking
// FIXME We don't handle change of separator like s(...)[...]
// s/.../.../
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*([^a-zA-Z0-9\s{(\[<])(?:(?!\2)[^\\]|\\[\s\S])*\2(?:(?!\2)[^\\]|\\[\s\S])*\2[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s a...a...a
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s+([a-zA-Z0-9])(?:(?!\2)[^\\]|\\[\s\S])*\2(?:(?!\2)[^\\]|\\[\s\S])*\2[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s(...)(...)
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*\((?:[^()\\]|\\[\s\S])*\)\s*\((?:[^()\\]|\\[\s\S])*\)[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s{...}{...}
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*\{(?:[^{}\\]|\\[\s\S])*\}\s*\{(?:[^{}\\]|\\[\s\S])*\}[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s[...][...]
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*\[(?:[^[\]\\]|\\[\s\S])*\]\s*\[(?:[^[\]\\]|\\[\s\S])*\][msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s<...><...>
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*<(?:[^<>\\]|\\[\s\S])*>\s*<(?:[^<>\\]|\\[\s\S])*>[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// /.../
// The look-ahead tries to prevent two divisions on
// the same line from being highlighted as regex.
// This does not support multi-line regex.
{
pattern: /\/(?:[^\/\\\r\n]|\\.)*\/[msixpodualngc]*(?=\s*(?:$|[\r\n,.;})&|\-+*~<>!?^]|(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|x|xor)\b))/,
greedy: true
}
],
// Whole-word keywords.
'keyword': /\b(?:any|break|continue|default|delete|die|do|else|elsif|eval|for|foreach|given|goto|if|last|local|my|next|our|package|print|redo|require|return|say|state|sub|switch|undef|unless|until|use|when|while)\b/,
// Numeric literals: hexadecimal (0x...), binary (0b...), and decimal with an
// optional fraction and exponent. A single `_` is allowed between digits.
'number': /\b(?:0x[\dA-Fa-f](?:_?[\dA-Fa-f])*|0b[01](?:_?[01])*|(?:(?:\d(?:_?\d)*)?\.)?\d(?:_?\d)*(?:[Ee][+-]?\d+)?)\b/,
// Operators: `-X` style single-letter tests, symbolic operators (with their
// assignment forms), the repetition operator `x`/`x=`, and word operators.
'operator': /-[rwxoRWXOezsfdlpSbctugkTBMAC]\b|\+[+=]?|-[-=>]?|\*\*?=?|\/\/?=?|=[=~>]?|~[~=]?|\|\|?=?|&&?=?|<(?:=>?|<=?)?|>>?=?|![~=]?|[%^]=?|\.(?:=|\.\.?)?|[\\?]|\bx(?:=|\b)|\b(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|xor)\b/,
// Brackets, braces, parentheses, semicolon, comma and colon.
'punctuation': /[{}[\];(),:]/
};

// FIXME Not sure about the handling of ::, ', and #
'variable': [
// ${^POSTMATCH}
/[&*$@%]\{\^[A-Z]+\}/,
// $^V
/[&*$@%]\^[A-Z_]/,
// ${...}
/[&*$@%]#?(?=\{)/,
// $foo
/[&*$@%]#?(?:(?:::)*'?(?!\d)[\w$]+(?![\w$]))+(?:::)*/,
// $1
/[&*$@%]\d+/,
// $_, @_, %!
// The negative lookahead prevents from breaking the %= operator
/(?!%=)[$@%][!"#$%&'()*+,\-.\/:;<=>?@[\\\]^_`{|}~]/
],
'filehandle': {
// <>, <FOO>, _
pattern: /<(?![<=])\S*>|\b_\b/,
alias: 'symbol'
},
'vstring': {
// v1.2, 1.2.3
pattern: /v\d+(?:\.\d+)*|\d+(?:\.\d+){2,}/,
alias: 'string'
},
'function': {
pattern: /sub \w+/i,
inside: {
keyword: /sub/
}
},
'keyword': /\b(?:any|break|continue|default|delete|die|do|else|elsif|eval|for|foreach|given|goto|if|last|local|my|next|our|package|print|redo|require|return|say|state|sub|switch|undef|unless|until|use|when|while)\b/,
'number': /\b(?:0x[\dA-Fa-f](?:_?[\dA-Fa-f])*|0b[01](?:_?[01])*|(?:(?:\d(?:_?\d)*)?\.)?\d(?:_?\d)*(?:[Ee][+-]?\d+)?)\b/,
'operator': /-[rwxoRWXOezsfdlpSbctugkTBMAC]\b|\+[+=]?|-[-=>]?|\*\*?=?|\/\/?=?|=[=~>]?|~[~=]?|\|\|?=?|&&?=?|<(?:=>?|<=?)?|>>?=?|![~=]?|[%^]=?|\.(?:=|\.\.?)?|[\\?]|\bx(?:=|\b)|\b(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|xor)\b/,
'punctuation': /[{}[\];(),:]/
};
}(Prism));
2 changes: 1 addition & 1 deletion components/prism-perl.min.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions tests/languages/perl/function_feature.test
Expand Up @@ -4,10 +4,10 @@ sub Foo_Bar42
----------------------------------------------------

[
["function", [["keyword", "sub"], " foo"]],
["function", [["keyword", "sub"], " Foo_Bar42"]]
["keyword", "sub"], ["function", "foo"],
["keyword", "sub"], ["function", "Foo_Bar42"]
]

----------------------------------------------------

Checks for functions.
Checks for functions.
12 changes: 11 additions & 1 deletion tests/languages/perl/regex_feature.test
Expand Up @@ -64,6 +64,12 @@ s<foo
bar><foo
baz>

tr()<>c
y{foo\<bar}[ba\>a]
s<foo
bar>(foo
baz)

//
/foo/gsx
/foo\/bar/n
Expand Down Expand Up @@ -119,11 +125,15 @@ baz>
["regex", "y<foo\\<bar><ba\\>a>"],
["regex", "s<foo\r\nbar><foo\r\nbaz>"],

["regex", "tr()<>c"],
["regex", "y{foo\\<bar}[ba\\>a]"],
["regex", "s<foo\r\nbar>(foo\r\nbaz)"],

["regex", "//"],
["regex", "/foo/gsx"],
["regex", "/foo\\/bar/n"]
]

----------------------------------------------------

Checks for regex and regex quote-like operators.
Checks for regex and regex quote-like operators.

0 comments on commit f22ea9f

Please sign in to comment.