Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perl: Improved tokenization #3241

Merged
merged 2 commits into from Dec 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
341 changes: 153 additions & 188 deletions components/prism-perl.js
@@ -1,191 +1,156 @@
Prism.languages.perl = {
'comment': [
{
// POD
pattern: /(^\s*)=\w[\s\S]*?=cut.*/m,
(function (Prism) {

// Regex source (a string, not a RegExp) for one bracket-delimited section:
// (...), {...}, [...] or <...>. Inside a section, a backslash escapes the next
// character; an unescaped bracket of the same kind is not allowed, so nested
// pairs of the same bracket type are not matched.
var brackets = '(?:' + [
	/\((?:[^()\\]|\\[\s\S])*\)/.source,
	/\{(?:[^{}\\]|\\[\s\S])*\}/.source,
	/\[(?:[^[\]\\]|\\[\s\S])*\]/.source,
	/<(?:[^<>\\]|\\[\s\S])*>/.source
].join('|') + ')';

Prism.languages.perl = {
// Comment tokens. Both entries are greedy and set `lookbehind`, so the first
// capture group of each pattern is context only and not part of the token.
'comment': [
{
// POD
// Starts at `=` followed by a word character, where only whitespace separates
// the `=` from the start of a line, and runs (lazily) through the end of the
// line containing the next `=cut`.
pattern: /(^\s*)=\w[\s\S]*?=cut.*/m,
lookbehind: true,
greedy: true
},
{
// Line comment: `#` to the end of the line, unless the `#` is directly
// preceded by `\` or `$`.
pattern: /(^|[^\\$])#.*/,
lookbehind: true,
greedy: true
}
],
// TODO Could be nice to handle Heredoc too.
// String tokens, listed in this order: quote-like operators (q, qq, qw, qx),
// then "..." and `...`, then '...'. All entries are greedy.
'string': [
{
// The pattern is assembled from regex sources so that the shared `brackets`
// alternative can be spliced in as a string.
pattern: RegExp(
// Operator name not directly followed by an alphanumeric character, then
// optional whitespace. This prefix contains no capture groups.
/\b(?:q|qq|qw|qx)(?![a-zA-Z0-9])\s*/.source +
'(?:' +
[
// q/.../
// Group 1 is the delimiter: any character that is not alphanumeric,
// whitespace or an opening bracket. The body runs to the next unescaped
// occurrence of that same character.
/([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1/.source,

// q a...a
// Alphanumeric delimiter. It is referenced as \2 because it is the second
// capture group of the combined pattern. The lookahead in the prefix means
// at least one whitespace character must precede this delimiter.
// eslint-disable-next-line regexp/strict
/([a-zA-Z0-9])(?:(?!\2)[^\\]|\\[\s\S])*\2/.source,

// q(...)
// q{...}
// q[...]
// q<...>
brackets,
].join('|') +
')'
),
greedy: true
},

// "...", `...`
// Group 1 is the opening quote; the body may span lines and `\` escapes any character.
{
pattern: /("|`)(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// '...'
// FIXME Multi-line single-quoted strings are not supported as they would break variables containing '
// The character class excludes \r and \n, which keeps the match on one line.
{
pattern: /'(?:[^'\\\r\n]|\\.)*'/,
greedy: true
}
],
// Regex tokens, listed in this order: match operators (m, qr), substitution and
// transliteration operators (s, tr, y), then bare /.../ literals. All entries are greedy.
'regex': [
{
// Assembled from regex sources so the shared `brackets` alternative can be
// spliced in as a string.
pattern: RegExp(
// Operator name not directly followed by an alphanumeric character, then
// optional whitespace. This prefix contains no capture groups.
/\b(?:m|qr)(?![a-zA-Z0-9])\s*/.source +
'(?:' +
[
// m/.../
// Group 1 is the delimiter: not alphanumeric, whitespace or an opening bracket.
/([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1/.source,

// m a...a
// Alphanumeric delimiter; it is the second capture group of the combined
// pattern, hence \2.
// eslint-disable-next-line regexp/strict
/([a-zA-Z0-9])(?:(?!\2)[^\\]|\\[\s\S])*\2/.source,

// m(...)
// m{...}
// m[...]
// m<...>
brackets,
].join('|') +
')' +
// Trailing modifier letters, zero or more.
/[msixpodualngc]*/.source
),
greedy: true
},

// The lookbehinds prevent -s from breaking
{
pattern: RegExp(
// Group 1 is the lookbehind context (start of input or any character other
// than `-`), so the delimiter groups below are numbered 2 and 3.
/(^|[^-])\b(?:s|tr|y)(?![a-zA-Z0-9])\s*/.source +
'(?:' +
[
// s/.../.../
// Two bodies separated and terminated by the same delimiter (group 2).
// eslint-disable-next-line regexp/strict
/([^a-zA-Z0-9\s{(\[<])(?:(?!\2)[^\\]|\\[\s\S])*\2(?:(?!\2)[^\\]|\\[\s\S])*\2/.source,

// s a...a...a
// Same shape with an alphanumeric delimiter (group 3).
// eslint-disable-next-line regexp/strict
/([a-zA-Z0-9])(?:(?!\3)[^\\]|\\[\s\S])*\3(?:(?!\3)[^\\]|\\[\s\S])*\3/.source,

// s(...)(...)
// s{...}{...}
// s[...][...]
// s<...><...>
// s(...)[...]
// Two bracketed sections with optional whitespace between them; each section
// may use any of the four bracket kinds independently.
brackets + /\s*/.source + brackets,
].join('|') +
')' +
// Trailing modifier letters; this set additionally allows `e` and `r`.
/[msixpodualngcer]*/.source
),
lookbehind: true,
greedy: true
},

// /.../
// The look-ahead tries to prevent two divisions on
// the same line from being highlighted as regex.
// This does not support multi-line regex.
{
pattern: /\/(?:[^\/\\\r\n]|\\.)*\/[msixpodualngc]*(?=\s*(?:$|[\r\n,.;})&|\-+*~<>!?^]|(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|x|xor)\b))/,
greedy: true
}
],

// FIXME Not sure about the handling of ::, ', and #
// Variable tokens. Each entry is a bare regex; the comment above each one shows
// an example of the form it targets.
'variable': [
// ${^POSTMATCH}
// Sigil, then `{^`, one or more uppercase letters, then `}`.
/[&*$@%]\{\^[A-Z]+\}/,
// $^V
// Sigil, `^`, then a single uppercase letter or underscore.
/[&*$@%]\^[A-Z_]/,
// ${...}
// Matches only the sigil (and an optional `#`); the `{` is required by
// lookahead but is not part of the token.
/[&*$@%]#?(?=\{)/,
// $foo
// One or more name segments, each optionally preceded by `::` separators and a
// `'`; a segment must not start with a digit.
/[&*$@%]#?(?:(?:::)*'?(?!\d)[\w$]+(?![\w$]))+(?:::)*/,
// $1
/[&*$@%]\d+/,
// $_, @_, %!
// The negative lookahead prevents from breaking the %= operator
/(?!%=)[$@%][!"#$%&'()*+,\-.\/:;<=>?@[\\\]^_`{|}~]/
],
// Filehandle token, aliased to 'symbol'.
'filehandle': {
// <>, <FOO>, _
// Either `<` (not followed by `<` or `=`) up to the nearest `>` with no
// whitespace in between (the quantifier is lazy), or a standalone `_` word.
pattern: /<(?![<=])\S*?>|\b_\b/,
alias: 'symbol'
},
// Version-string token, aliased to 'string'.
'v-string': {
// v1.2, 1.2.3
// Either `v` followed by dot-separated digit groups, or a bare number with at
// least two dotted parts after the first group (three or more components).
pattern: /v\d+(?:\.\d+)*|\d+(?:\.\d+){2,}/,
alias: 'string'
},
// Function-name token: the word that follows `sub` and one or more spaces or
// tabs. `sub` and the whitespace are in the lookbehind group, so only the name
// becomes the 'function' token.
'function': {
pattern: /(\bsub[ \t]+)\w+/,
lookbehind: true
},
{
pattern: /(^|[^\\$])#.*/,
lookbehind: true
}
],
// TODO Could be nice to handle Heredoc too.
// String tokens with one entry per delimiter style of the quote-like operators
// (q, qq, qw, qx), followed by "..."/`...` and '...'. All entries are greedy.
'string': [
// q/.../
// Group 1 is the delimiter: not alphanumeric, whitespace or an opening bracket.
{
pattern: /\b(?:q|qq|qw|qx)\s*([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// q a...a
// Alphanumeric delimiter; `\s+` requires whitespace between the operator and it.
{
pattern: /\b(?:q|qq|qw|qx)\s+([a-zA-Z0-9])(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// q(...)
// The body may not contain unescaped parentheses, so nesting is not matched.
{
pattern: /\b(?:q|qq|qw|qx)\s*\((?:[^()\\]|\\[\s\S])*\)/,
greedy: true
},

// q{...}
{
pattern: /\b(?:q|qq|qw|qx)\s*\{(?:[^{}\\]|\\[\s\S])*\}/,
greedy: true
},

// q[...]
{
pattern: /\b(?:q|qq|qw|qx)\s*\[(?:[^[\]\\]|\\[\s\S])*\]/,
greedy: true
},

// q<...>
{
pattern: /\b(?:q|qq|qw|qx)\s*<(?:[^<>\\]|\\[\s\S])*>/,
greedy: true
},

// "...", `...`
// Group 1 is the opening quote; the body may span lines and `\` escapes any character.
{
pattern: /("|`)(?:(?!\1)[^\\]|\\[\s\S])*\1/,
greedy: true
},

// '...'
// FIXME Multi-line single-quoted strings are not supported as they would break variables containing '
// The character class excludes \r and \n, which keeps the match on one line.
{
pattern: /'(?:[^'\\\r\n]|\\.)*'/,
greedy: true
}
],
// Regex tokens with one entry per delimiter style: first the match operators
// (m, qr), then substitution/transliteration (s, tr, y), then bare /.../
// literals. All entries are greedy.
'regex': [
// m/.../
// Group 1 is the delimiter: not alphanumeric, whitespace or an opening bracket.
{
pattern: /\b(?:m|qr)\s*([^a-zA-Z0-9\s{(\[<])(?:(?!\1)[^\\]|\\[\s\S])*\1[msixpodualngc]*/,
greedy: true
},

// m a...a
// Alphanumeric delimiter; `\s+` requires whitespace between the operator and it.
{
pattern: /\b(?:m|qr)\s+([a-zA-Z0-9])(?:(?!\1)[^\\]|\\[\s\S])*\1[msixpodualngc]*/,
greedy: true
},

// m(...)
{
pattern: /\b(?:m|qr)\s*\((?:[^()\\]|\\[\s\S])*\)[msixpodualngc]*/,
greedy: true
},

// m{...}
{
pattern: /\b(?:m|qr)\s*\{(?:[^{}\\]|\\[\s\S])*\}[msixpodualngc]*/,
greedy: true
},

// m[...]
{
pattern: /\b(?:m|qr)\s*\[(?:[^[\]\\]|\\[\s\S])*\][msixpodualngc]*/,
greedy: true
},

// m<...>
{
pattern: /\b(?:m|qr)\s*<(?:[^<>\\]|\\[\s\S])*>[msixpodualngc]*/,
greedy: true
},

// The lookbehinds prevent -s from breaking
// FIXME We don't handle change of separator like s(...)[...]
// s/.../.../
// In each s/tr/y entry, group 1 is the lookbehind context (start of input, or a
// character other than `-` followed by a word boundary).
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*([^a-zA-Z0-9\s{(\[<])(?:(?!\2)[^\\]|\\[\s\S])*\2(?:(?!\2)[^\\]|\\[\s\S])*\2[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s a...a...a
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s+([a-zA-Z0-9])(?:(?!\2)[^\\]|\\[\s\S])*\2(?:(?!\2)[^\\]|\\[\s\S])*\2[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s(...)(...)
// Both sections must use the same bracket kind; optional whitespace may separate them.
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*\((?:[^()\\]|\\[\s\S])*\)\s*\((?:[^()\\]|\\[\s\S])*\)[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s{...}{...}
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*\{(?:[^{}\\]|\\[\s\S])*\}\s*\{(?:[^{}\\]|\\[\s\S])*\}[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s[...][...]
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*\[(?:[^[\]\\]|\\[\s\S])*\]\s*\[(?:[^[\]\\]|\\[\s\S])*\][msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// s<...><...>
{
pattern: /(^|[^-]\b)(?:s|tr|y)\s*<(?:[^<>\\]|\\[\s\S])*>\s*<(?:[^<>\\]|\\[\s\S])*>[msixpodualngcer]*/,
lookbehind: true,
greedy: true
},

// /.../
// The look-ahead tries to prevent two divisions on
// the same line from being highlighted as regex.
// This does not support multi-line regex.
{
pattern: /\/(?:[^\/\\\r\n]|\\.)*\/[msixpodualngc]*(?=\s*(?:$|[\r\n,.;})&|\-+*~<>!?^]|(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|x|xor)\b))/,
greedy: true
}
],
// Whole-word keywords.
'keyword': /\b(?:any|break|continue|default|delete|die|do|else|elsif|eval|for|foreach|given|goto|if|last|local|my|next|our|package|print|redo|require|return|say|state|sub|switch|undef|unless|until|use|when|while)\b/,
// Numbers: `0x` hexadecimal, `0b` binary, or decimal with an optional fractional
// part and exponent. A single `_` is allowed between digits (not in the exponent).
'number': /\b(?:0x[\dA-Fa-f](?:_?[\dA-Fa-f])*|0b[01](?:_?[01])*|(?:(?:\d(?:_?\d)*)?\.)?\d(?:_?\d)*(?:[Ee][+-]?\d+)?)\b/,
// Operators. The first alternative is `-` plus one of the listed letters at a
// word boundary (presumably Perl's file-test operators such as -e and -f); the
// rest are symbolic operators, the `x` repetition operator and word operators.
'operator': /-[rwxoRWXOezsfdlpSbctugkTBMAC]\b|\+[+=]?|-[-=>]?|\*\*?=?|\/\/?=?|=[=~>]?|~[~=]?|\|\|?=?|&&?=?|<(?:=>?|<=?)?|>>?=?|![~=]?|[%^]=?|\.(?:=|\.\.?)?|[\\?]|\bx(?:=|\b)|\b(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|xor)\b/,
// Single-character punctuation: braces, square brackets, parentheses, `;`, `,` and `:`.
'punctuation': /[{}[\];(),:]/
};

// FIXME Not sure about the handling of ::, ', and #
// Variable tokens. Each entry is a bare regex; the comment above each one shows
// an example of the form it targets.
'variable': [
// ${^POSTMATCH}
// Sigil, then `{^`, one or more uppercase letters, then `}`.
/[&*$@%]\{\^[A-Z]+\}/,
// $^V
// Sigil, `^`, then a single uppercase letter or underscore.
/[&*$@%]\^[A-Z_]/,
// ${...}
// Matches only the sigil (and an optional `#`); the `{` is required by
// lookahead but is not part of the token.
/[&*$@%]#?(?=\{)/,
// $foo
// One or more name segments, each optionally preceded by `::` separators and a
// `'`; a segment must not start with a digit.
/[&*$@%]#?(?:(?:::)*'?(?!\d)[\w$]+(?![\w$]))+(?:::)*/,
// $1
/[&*$@%]\d+/,
// $_, @_, %!
// The negative lookahead prevents from breaking the %= operator
/(?!%=)[$@%][!"#$%&'()*+,\-.\/:;<=>?@[\\\]^_`{|}~]/
],
// Filehandle token, aliased to 'symbol'.
'filehandle': {
// <>, <FOO>, _
// Either `<` (not followed by `<` or `=`) through a later `>` with no whitespace
// in between, or a standalone `_` word. The `\S*` here is greedy, so within a
// whitespace-free run the match extends to the last `>`.
pattern: /<(?![<=])\S*>|\b_\b/,
alias: 'symbol'
},
// Version-string token, aliased to 'string'.
'vstring': {
// v1.2, 1.2.3
// Either `v` followed by dot-separated digit groups, or a bare number with at
// least two dotted parts after the first group (three or more components).
pattern: /v\d+(?:\.\d+)*|\d+(?:\.\d+){2,}/,
alias: 'string'
},
// Function token covering `sub`, exactly one space, and the following word
// (matched case-insensitively). The nested grammar re-tags the `sub` part as a
// keyword inside the function token.
'function': {
pattern: /sub \w+/i,
inside: {
keyword: /sub/
}
},
// Whole-word keywords.
'keyword': /\b(?:any|break|continue|default|delete|die|do|else|elsif|eval|for|foreach|given|goto|if|last|local|my|next|our|package|print|redo|require|return|say|state|sub|switch|undef|unless|until|use|when|while)\b/,
// Numbers: `0x` hexadecimal, `0b` binary, or decimal with an optional fractional
// part and exponent. A single `_` is allowed between digits (not in the exponent).
'number': /\b(?:0x[\dA-Fa-f](?:_?[\dA-Fa-f])*|0b[01](?:_?[01])*|(?:(?:\d(?:_?\d)*)?\.)?\d(?:_?\d)*(?:[Ee][+-]?\d+)?)\b/,
// Operators. The first alternative is `-` plus one of the listed letters at a
// word boundary (presumably Perl's file-test operators such as -e and -f); the
// rest are symbolic operators, the `x` repetition operator and word operators.
'operator': /-[rwxoRWXOezsfdlpSbctugkTBMAC]\b|\+[+=]?|-[-=>]?|\*\*?=?|\/\/?=?|=[=~>]?|~[~=]?|\|\|?=?|&&?=?|<(?:=>?|<=?)?|>>?=?|![~=]?|[%^]=?|\.(?:=|\.\.?)?|[\\?]|\bx(?:=|\b)|\b(?:and|cmp|eq|ge|gt|le|lt|ne|not|or|xor)\b/,
// Single-character punctuation: braces, square brackets, parentheses, `;`, `,` and `:`.
'punctuation': /[{}[\];(),:]/
};
}(Prism));
2 changes: 1 addition & 1 deletion components/prism-perl.min.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions tests/languages/perl/function_feature.test
Expand Up @@ -4,10 +4,10 @@ sub Foo_Bar42
----------------------------------------------------

[
["function", [["keyword", "sub"], " foo"]],
["function", [["keyword", "sub"], " Foo_Bar42"]]
["keyword", "sub"], ["function", "foo"],
["keyword", "sub"], ["function", "Foo_Bar42"]
]

----------------------------------------------------

Checks for functions.
Checks for functions.
12 changes: 11 additions & 1 deletion tests/languages/perl/regex_feature.test
Expand Up @@ -64,6 +64,12 @@ s<foo
bar><foo
baz>

tr()<>c
y{foo\<bar}[ba\>a]
s<foo
bar>(foo
baz)

//
/foo/gsx
/foo\/bar/n
Expand Down Expand Up @@ -119,11 +125,15 @@ baz>
["regex", "y<foo\\<bar><ba\\>a>"],
["regex", "s<foo\r\nbar><foo\r\nbaz>"],

["regex", "tr()<>c"],
["regex", "y{foo\\<bar}[ba\\>a]"],
["regex", "s<foo\r\nbar>(foo\r\nbaz)"],

["regex", "//"],
["regex", "/foo/gsx"],
["regex", "/foo\\/bar/n"]
]

----------------------------------------------------

Checks for regex and regex quote-like operators.
Checks for regex and regex quote-like operators.