From d216e602f38e3ff9acec69962b40ada07322e076 Mon Sep 17 00:00:00 2001 From: Michael Schmidt Date: Sun, 12 Sep 2021 20:00:45 +0200 Subject: [PATCH] Tests: Improved dection of empty patterns (#3058) --- components/prism-asciidoc.js | 2 +- components/prism-asciidoc.min.js | 2 +- components/prism-promql.js | 2 +- components/prism-promql.min.js | 2 +- package.json | 1 + tests/pattern-tests.js | 100 ++++++++++++++----------------- 6 files changed, 49 insertions(+), 60 deletions(-) diff --git a/components/prism-asciidoc.js b/components/prism-asciidoc.js index 486ca12fff..8909f14b33 100644 --- a/components/prism-asciidoc.js +++ b/components/prism-asciidoc.js @@ -35,7 +35,7 @@ pattern: /^\|={3,}(?:(?:\r?\n|\r(?!\n)).*)*?(?:\r?\n|\r)\|={3,}$/m, inside: { 'specifiers': { - pattern: /(?!\|)(?:(?:(?:\d+(?:\.\d+)?|\.\d+)[+*])?(?:[<^>](?:\.[<^>])?|\.[<^>])?[a-z]*)(?=\|)/, + pattern: /(?:(?:(?:\d+(?:\.\d+)?|\.\d+)[+*](?:[<^>](?:\.[<^>])?|\.[<^>])?|[<^>](?:\.[<^>])?|\.[<^>])[a-z]*|[a-z]+)(?=\|)/, alias: 'attr-value' }, 'punctuation': { diff --git a/components/prism-asciidoc.min.js b/components/prism-asciidoc.min.js index 9df76136b9..dc59cbf3b0 100644 --- a/components/prism-asciidoc.min.js +++ b/components/prism-asciidoc.min.js @@ -1 +1 @@ -!function(t){var n={pattern:/(^[ \t]*)\[(?!\[)(?:(["'$`])(?:(?!\2)[^\\]|\\.)*\2|\[(?:[^\[\]\\]|\\.)*\]|[^\[\]\\"'$`]|\\.)*\]/m,lookbehind:!0,inside:{quoted:{pattern:/([$`])(?:(?!\1)[^\\]|\\.)*\1/,inside:{punctuation:/^[$`]|[$`]$/}},interpreted:{pattern:/'(?:[^'\\]|\\.)*'/,inside:{punctuation:/^'|'$/}},string:/"(?:[^"\\]|\\.)*"/,variable:/\w+(?==)/,punctuation:/^\[|\]$|,/,operator:/=/,"attr-value":/(?!^\s+$).+/}},a=t.languages.asciidoc={"comment-block":{pattern:/^(\/{4,})(?:\r?\n|\r)(?:[\s\S]*(?:\r?\n|\r))??\1/m,alias:"comment"},table:{pattern:/^\|={3,}(?:(?:\r?\n|\r(?!\n)).*)*?(?:\r?\n|\r)\|={3,}$/m,inside:{specifiers:{pattern:/(?!\|)(?:(?:(?:\d+(?:\.\d+)?|\.\d+)[+*])?(?:[<^>](?:\.[<^>])?|\.[<^>])?[a-z]*)(?=\|)/,alias:"attr-value"},punctuation:{pattern:/(^|[^\\])[|!]=*/,lookbehind:!0}}},"passthrough-block":{pattern:/^(\+{4,})(?:\r?\n|\r)(?:[\s\S]*(?:\r?\n|\r))??\1$/m,inside:{punctuation:/^\++|\++$/}},"literal-block":{pattern:/^(-{4,}|\.{4,})(?:\r?\n|\r)(?:[\s\S]*(?:\r?\n|\r))??\1$/m,inside:{punctuation:/^(?:-+|\.+)|(?:-+|\.+)$/}},"other-block":{pattern:/^(--|\*{4,}|_{4,}|={4,})(?:\r?\n|\r)(?:[\s\S]*(?:\r?\n|\r))??\1$/m,inside:{punctuation:/^(?:-+|\*+|_+|=+)|(?:-+|\*+|_+|=+)$/}},"list-punctuation":{pattern:/(^[ \t]*)(?:-|\*{1,5}|\.{1,5}|(?:[a-z]|\d+)\.|[xvi]+\))(?= )/im,lookbehind:!0,alias:"punctuation"},"list-label":{pattern:/(^[ \t]*)[a-z\d].+(?::{2,4}|;;)(?=\s)/im,lookbehind:!0,alias:"symbol"},"indented-block":{pattern:/((\r?\n|\r)\2)([ \t]+)\S.*(?:(?:\r?\n|\r)\3.+)*(?=\2{2}|$)/,lookbehind:!0},comment:/^\/\/.*/m,title:{pattern:/^.+(?:\r?\n|\r)(?:={3,}|-{3,}|~{3,}|\^{3,}|\+{3,})$|^={1,5} .+|^\.(?![\s.]).*/m,alias:"important",inside:{punctuation:/^(?:\.|=+)|(?:=+|-+|~+|\^+|\++)$/}},"attribute-entry":{pattern:/^:[^:\r\n]+:(?: .*?(?: \+(?:\r?\n|\r).*?)*)?$/m,alias:"tag"},attributes:n,hr:{pattern:/^'{3,}$/m,alias:"punctuation"},"page-break":{pattern:/^<{3,}$/m,alias:"punctuation"},admonition:{pattern:/^(?:TIP|NOTE|IMPORTANT|WARNING|CAUTION):/m,alias:"keyword"},callout:[{pattern:/(^[ \t]*)/m,lookbehind:!0,alias:"symbol"},{pattern:/<\d+>/,alias:"symbol"}],macro:{pattern:/\b[a-z\d][a-z\d-]*::?(?:[^\s\[\]]*\[(?:[^\]\\"']|(["'])(?:(?!\1)[^\\]|\\.)*\1|\\.)*\])/,inside:{function:/^[a-z\d-]+(?=:)/,punctuation:/^::?/,attributes:{pattern:/(?:\[(?:[^\]\\"']|(["'])(?:(?!\1)[^\\]|\\.)*\1|\\.)*\])/,inside:n.inside}}},inline:{pattern:/(^|[^\\])(?:(?:\B\[(?:[^\]\\"']|(["'])(?:(?!\2)[^\\]|\\.)*\2|\\.)*\])?(?:\b_(?!\s)(?: _|[^_\\\r\n]|\\.)+(?:(?:\r?\n|\r)(?: _|[^_\\\r\n]|\\.)+)*_\b|\B``(?!\s).+?(?:(?:\r?\n|\r).+?)*''\B|\B`(?!\s)(?:[^`'\s]|\s+\S)+['`]\B|\B(['*+#])(?!\s)(?: \3|(?!\3)[^\\\r\n]|\\.)+(?:(?:\r?\n|\r)(?: \3|(?!\3)[^\\\r\n]|\\.)+)*\3\B)|(?:\[(?:[^\]\\"']|(["'])(?:(?!\4)[^\\]|\\.)*\4|\\.)*\])?(?:(__|\*\*|\+\+\+?|##|\$\$|[~^]).+?(?:(?:\r?\n|\r).+?)*\5|\{[^}\r\n]+\}|\[\[\[?.+?(?:(?:\r?\n|\r).+?)*\]?\]\]|<<.+?(?:(?:\r?\n|\r).+?)*>>|\(\(\(?.+?(?:(?:\r?\n|\r).+?)*\)?\)\)))/m,lookbehind:!0,inside:{attributes:n,url:{pattern:/^(?:\[\[\[?.+?\]?\]\]|<<.+?>>)$/,inside:{punctuation:/^(?:\[\[\[?|<<)|(?:\]\]\]?|>>)$/}},"attribute-ref":{pattern:/^\{.+\}$/,inside:{variable:{pattern:/(^\{)[a-z\d,+_-]+/,lookbehind:!0},operator:/^[=?!#%@$]|!(?=[:}])/,punctuation:/^\{|\}$|::?/}},italic:{pattern:/^(['_])[\s\S]+\1$/,inside:{punctuation:/^(?:''?|__?)|(?:''?|__?)$/}},bold:{pattern:/^\*[\s\S]+\*$/,inside:{punctuation:/^\*\*?|\*\*?$/}},punctuation:/^(?:``?|\+{1,3}|##?|\$\$|[~^]|\(\(\(?)|(?:''?|\+{1,3}|##?|\$\$|[~^`]|\)?\)\))$/}},replacement:{pattern:/\((?:C|TM|R)\)/,alias:"builtin"},entity:/&#?[\da-z]{1,8};/i,"line-continuation":{pattern:/(^| )\+$/m,lookbehind:!0,alias:"punctuation"}};function i(t){for(var n={},i=0,e=(t=t.split(" ")).length;i](?:\.[<^>])?|\.[<^>])?|[<^>](?:\.[<^>])?|\.[<^>])[a-z]*|[a-z]+)(?=\|)/,alias:"attr-value"},punctuation:{pattern:/(^|[^\\])[|!]=*/,lookbehind:!0}}},"passthrough-block":{pattern:/^(\+{4,})(?:\r?\n|\r)(?:[\s\S]*(?:\r?\n|\r))??\1$/m,inside:{punctuation:/^\++|\++$/}},"literal-block":{pattern:/^(-{4,}|\.{4,})(?:\r?\n|\r)(?:[\s\S]*(?:\r?\n|\r))??\1$/m,inside:{punctuation:/^(?:-+|\.+)|(?:-+|\.+)$/}},"other-block":{pattern:/^(--|\*{4,}|_{4,}|={4,})(?:\r?\n|\r)(?:[\s\S]*(?:\r?\n|\r))??\1$/m,inside:{punctuation:/^(?:-+|\*+|_+|=+)|(?:-+|\*+|_+|=+)$/}},"list-punctuation":{pattern:/(^[ \t]*)(?:-|\*{1,5}|\.{1,5}|(?:[a-z]|\d+)\.|[xvi]+\))(?= )/im,lookbehind:!0,alias:"punctuation"},"list-label":{pattern:/(^[ \t]*)[a-z\d].+(?::{2,4}|;;)(?=\s)/im,lookbehind:!0,alias:"symbol"},"indented-block":{pattern:/((\r?\n|\r)\2)([ \t]+)\S.*(?:(?:\r?\n|\r)\3.+)*(?=\2{2}|$)/,lookbehind:!0},comment:/^\/\/.*/m,title:{pattern:/^.+(?:\r?\n|\r)(?:={3,}|-{3,}|~{3,}|\^{3,}|\+{3,})$|^={1,5} .+|^\.(?![\s.]).*/m,alias:"important",inside:{punctuation:/^(?:\.|=+)|(?:=+|-+|~+|\^+|\++)$/}},"attribute-entry":{pattern:/^:[^:\r\n]+:(?: .*?(?: \+(?:\r?\n|\r).*?)*)?$/m,alias:"tag"},attributes:n,hr:{pattern:/^'{3,}$/m,alias:"punctuation"},"page-break":{pattern:/^<{3,}$/m,alias:"punctuation"},admonition:{pattern:/^(?:TIP|NOTE|IMPORTANT|WARNING|CAUTION):/m,alias:"keyword"},callout:[{pattern:/(^[ \t]*)/m,lookbehind:!0,alias:"symbol"},{pattern:/<\d+>/,alias:"symbol"}],macro:{pattern:/\b[a-z\d][a-z\d-]*::?(?:[^\s\[\]]*\[(?:[^\]\\"']|(["'])(?:(?!\1)[^\\]|\\.)*\1|\\.)*\])/,inside:{function:/^[a-z\d-]+(?=:)/,punctuation:/^::?/,attributes:{pattern:/(?:\[(?:[^\]\\"']|(["'])(?:(?!\1)[^\\]|\\.)*\1|\\.)*\])/,inside:n.inside}}},inline:{pattern:/(^|[^\\])(?:(?:\B\[(?:[^\]\\"']|(["'])(?:(?!\2)[^\\]|\\.)*\2|\\.)*\])?(?:\b_(?!\s)(?: _|[^_\\\r\n]|\\.)+(?:(?:\r?\n|\r)(?: _|[^_\\\r\n]|\\.)+)*_\b|\B``(?!\s).+?(?:(?:\r?\n|\r).+?)*''\B|\B`(?!\s)(?:[^`'\s]|\s+\S)+['`]\B|\B(['*+#])(?!\s)(?: \3|(?!\3)[^\\\r\n]|\\.)+(?:(?:\r?\n|\r)(?: \3|(?!\3)[^\\\r\n]|\\.)+)*\3\B)|(?:\[(?:[^\]\\"']|(["'])(?:(?!\4)[^\\]|\\.)*\4|\\.)*\])?(?:(__|\*\*|\+\+\+?|##|\$\$|[~^]).+?(?:(?:\r?\n|\r).+?)*\5|\{[^}\r\n]+\}|\[\[\[?.+?(?:(?:\r?\n|\r).+?)*\]?\]\]|<<.+?(?:(?:\r?\n|\r).+?)*>>|\(\(\(?.+?(?:(?:\r?\n|\r).+?)*\)?\)\)))/m,lookbehind:!0,inside:{attributes:n,url:{pattern:/^(?:\[\[\[?.+?\]?\]\]|<<.+?>>)$/,inside:{punctuation:/^(?:\[\[\[?|<<)|(?:\]\]\]?|>>)$/}},"attribute-ref":{pattern:/^\{.+\}$/,inside:{variable:{pattern:/(^\{)[a-z\d,+_-]+/,lookbehind:!0},operator:/^[=?!#%@$]|!(?=[:}])/,punctuation:/^\{|\}$|::?/}},italic:{pattern:/^(['_])[\s\S]+\1$/,inside:{punctuation:/^(?:''?|__?)|(?:''?|__?)$/}},bold:{pattern:/^\*[\s\S]+\*$/,inside:{punctuation:/^\*\*?|\*\*?$/}},punctuation:/^(?:``?|\+{1,3}|##?|\$\$|[~^]|\(\(\(?)|(?:''?|\+{1,3}|##?|\$\$|[~^`]|\)?\)\))$/}},replacement:{pattern:/\((?:C|TM|R)\)/,alias:"builtin"},entity:/&#?[\da-z]{1,8};/i,"line-continuation":{pattern:/(^| )\+$/m,lookbehind:!0,alias:"punctuation"}};function i(t){for(var n={},i=0,e=(t=t.split(" ")).length;i=|>|\b(?:and|unless|or)\b/i,punctuation:/[{};()`,.[\]]/}}(Prism); \ No newline at end of file +!function(t){var n=["on","ignoring","group_right","group_left","by","without"],a=["sum","min","max","avg","group","stddev","stdvar","count","count_values","bottomk","topk","quantile"].concat(n,["offset"]);t.languages.promql={comment:{pattern:/(^[ \t]*)#.*/m,lookbehind:!0},"vector-match":{pattern:new RegExp("((?:"+n.join("|")+")\\s*)\\([^)]*\\)"),lookbehind:!0,inside:{"label-key":{pattern:/\b[^,]+\b/,alias:"attr-name"},punctuation:/[(),]/}},"context-labels":{pattern:/\{[^{}]*\}/,inside:{"label-key":{pattern:/\b[a-z_]\w*(?=\s*(?:=|![=~]))/,alias:"attr-name"},"label-value":{pattern:/(["'`])(?:\\[\s\S]|(?!\1)[^\\])*\1/,greedy:!0,alias:"attr-value"},punctuation:/\{|\}|=~?|![=~]|,/}},"context-range":[{pattern:/\[[\w\s:]+\]/,inside:{punctuation:/\[|\]|:/,"range-duration":{pattern:/\b(?:\d+(?:[smhdwy]|ms))+\b/i,alias:"number"}}},{pattern:/(\boffset\s+)\w+/,lookbehind:!0,inside:{"range-duration":{pattern:/\b(?:\d+(?:[smhdwy]|ms))+\b/i,alias:"number"}}}],keyword:new RegExp("\\b(?:"+a.join("|")+")\\b","i"),function:/\b[a-z_]\w*(?=\s*\()/i,number:/[-+]?(?:(?:\b\d+(?:\.\d+)?|\B\.\d+)(?:e[-+]?\d+)?\b|\b(?:0x[0-9a-f]+|nan|inf)\b)/i,operator:/[\^*/%+-]|==|!=|<=|<|>=|>|\b(?:and|unless|or)\b/i,punctuation:/[{};()`,.[\]]/}}(Prism); \ No newline at end of file diff --git a/package.json b/package.json index afea4802c9..2c5190094d 100755 --- a/package.json +++ b/package.json @@ -59,6 +59,7 @@ "npm-run-all": "^4.1.5", "pump": "^3.0.0", "refa": "^0.9.1", + "regexp-ast-analysis": "^0.2.4", "regexpp": "^3.2.0", "scslre": "^0.1.6", "simple-git": "^1.107.0", diff --git a/tests/pattern-tests.js b/tests/pattern-tests.js index 282021a171..5ebaaa64bb 100644 --- a/tests/pattern-tests.js +++ b/tests/pattern-tests.js @@ -12,6 +12,7 @@ const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, const scslre = require('scslre'); const path = require('path'); const { argv } = require('yargs'); +const RAA = require('regexp-ast-analysis'); /** * A map from language id to a list of code snippets in that language. @@ -130,6 +131,7 @@ function testPatterns(Prism, mainLanguage) { * @property {string} name * @property {any} parent * @property {boolean} lookbehind Whether the first capturing group of the pattern is a Prism lookbehind group. + * @property {CapturingGroup | undefined} lookbehindGroup * @property {{ key: string, value: any }[]} path * @property {(message: string) => void} reportError */ @@ -163,6 +165,8 @@ function testPatterns(Prism, mainLanguage) { } const parent = path.length > 1 ? path[path.length - 2].value : undefined; + const lookbehind = key === 'pattern' && parent && !!parent.lookbehind; + const lookbehindGroup = lookbehind ? getFirstCapturingGroup(ast.pattern) : undefined; callback({ pattern: value, ast, @@ -170,7 +174,8 @@ function testPatterns(Prism, mainLanguage) { name: key, parent, path, - lookbehind: key === 'pattern' && parent && !!parent.lookbehind, + lookbehind, + lookbehindGroup, reportError: message => errors.push(message) }); } catch (error) { @@ -231,9 +236,10 @@ function testPatterns(Prism, mainLanguage) { it('- should not match the empty string', function () { - forEachPattern(({ pattern, tokenPath }) => { + forEachPattern(({ ast, pattern, tokenPath }) => { // test for empty string - assert.notMatch('', pattern, `${tokenPath}: ${pattern} should not match the empty string.\n\n` + const empty = RAA.isPotentiallyZeroLength(ast.pattern.alternatives); + assert.isFalse(empty, `${tokenPath}: ${pattern} should not match the empty string.\n\n` + `Patterns that do match the empty string can potentially cause infinitely many empty tokens. ` + `Make sure that all patterns always consume at least one character.`); }); @@ -256,47 +262,37 @@ function testPatterns(Prism, mainLanguage) { }); it('- should not have lookbehind groups that can be preceded by other some characters', function () { - forEachPattern(({ ast, tokenPath, lookbehind }) => { - if (!lookbehind) { - return; + forEachPattern(({ tokenPath, lookbehindGroup }) => { + if (lookbehindGroup && !isFirstMatch(lookbehindGroup)) { + assert.fail(`${tokenPath}: The lookbehind group ${lookbehindGroup.raw} might be preceded by some characters.\n\n` + + `Prism assumes that the lookbehind group, if captured, is the first thing matched by the regex. ` + + `If characters might precede the lookbehind group (e.g. /a?(b)c/), then Prism cannot correctly apply the lookbehind correctly in all cases.\n` + + `To fix this, either remove the preceding characters or include them in the lookbehind group.`); } - forEachCapturingGroup(ast.pattern, ({ group, number }) => { - if (number === 1 && !isFirstMatch(group)) { - assert.fail(`${tokenPath}: The lookbehind group ${group.raw} might be preceded by some characters.\n\n` - + `Prism assumes that the lookbehind group, if captured, is the first thing matched by the regex. ` - + `If characters might precede the lookbehind group (e.g. /a?(b)c/), then Prism cannot correctly apply the lookbehind correctly in all cases.\n` - + `To fix this, either remove the preceding characters or include them in the lookbehind group.`); - } - }); }); }); it('- should not have lookbehind groups that only have zero-width alternatives', function () { - forEachPattern(({ ast, tokenPath, lookbehind, reportError }) => { - if (!lookbehind) { - return; + forEachPattern(({ tokenPath, lookbehindGroup, reportError }) => { + if (lookbehindGroup && RAA.isZeroLength(lookbehindGroup)) { + const groupContent = lookbehindGroup.raw.substr(1, lookbehindGroup.raw.length - 2); + const replacement = lookbehindGroup.alternatives.length === 1 ? groupContent : `(?:${groupContent})`; + reportError(`${tokenPath}: The lookbehind group ${lookbehindGroup.raw} does not consume characters.\n\n` + + `Therefor it is not necessary to use a lookbehind group.\n` + + `To fix this, replace the lookbehind group with ${replacement} and remove the 'lookbehind' property.`); } - forEachCapturingGroup(ast.pattern, ({ group, number }) => { - if (number === 1 && isAlwaysZeroWidth(group)) { - const groupContent = group.raw.substr(1, group.raw.length - 2); - const replacement = group.alternatives.length === 1 ? groupContent : `(?:${groupContent})`; - reportError(`${tokenPath}: The lookbehind group ${group.raw} does not consume characters.\n\n` - + `Therefor it is not necessary to use a lookbehind group.\n` - + `To fix this, replace the lookbehind group with ${replacement} and remove the 'lookbehind' property.`); - } - }); }); }); it('- should not have unused capturing groups', function () { - forEachPattern(({ ast, tokenPath, lookbehind, reportError }) => { + forEachPattern(({ ast, tokenPath, lookbehindGroup, reportError }) => { forEachCapturingGroup(ast.pattern, ({ group, number }) => { - const isLookbehindGroup = lookbehind && number === 1; + const isLookbehindGroup = group === lookbehindGroup; if (group.references.length === 0 && !isLookbehindGroup) { const fixes = []; fixes.push(`Make this group a non-capturing group ('(?:...)' instead of '(...)'). (It's usually this option.)`); fixes.push(`Reference this group with a backreference (use '\\${number}' for this).`); - if (number === 1 && !lookbehind) { + if (number === 1 && !lookbehindGroup) { if (isFirstMatch(group)) { fixes.push(`Add a 'lookbehind: true' declaration.`); } else { @@ -392,28 +388,26 @@ function testPatterns(Prism, mainLanguage) { /** - * Returns whether the given element will always have zero width meaning that it doesn't consume characters. + * Returns the first capturing group in the given pattern. * - * @param {Element} element - * @returns {boolean} + * @param {Pattern} pattern + * @returns {CapturingGroup | undefined} */ -function isAlwaysZeroWidth(element) { - switch (element.type) { - case 'Assertion': - // assertions == ^, $, \b, lookarounds - return true; - case 'Quantifier': - return element.max === 0 || isAlwaysZeroWidth(element.element); - case 'CapturingGroup': - case 'Group': - // every element in every alternative has to be of zero length - return element.alternatives.every(alt => alt.elements.every(isAlwaysZeroWidth)); - case 'Backreference': - // on if the group referred to is of zero length - return isAlwaysZeroWidth(element.resolved); - default: - return false; // what's left are characters +function getFirstCapturingGroup(pattern) { + let cap = undefined; + + try { + visitRegExpAST(pattern, { + onCapturingGroupEnter(node) { + cap = node; + throw new Error('stop'); + } + }); + } catch (error) { + // ignore errors } + + return cap; } /** @@ -427,7 +421,7 @@ function isFirstMatch(element) { switch (parent.type) { case 'Alternative': { // all elements before this element have to of zero length - if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(isAlwaysZeroWidth)) { + if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(RAA.isZeroLength)) { return false; } const grandParent = parent.parent; @@ -457,13 +451,7 @@ function isFirstMatch(element) { * @returns {boolean} */ function underAStar(node) { - if (node.type === 'Quantifier' && node.max > 10) { - return true; - } else if (node.parent) { - return underAStar(node.parent); - } else { - return false; - } + return RAA.getEffectiveMaximumRepetition(node) > 10; } /**