From 9fb431fc4b9c6078acabdfdde58352cef8559743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antti=20Korpim=C3=A4ki?= Date: Wed, 24 Nov 2021 03:04:40 +0100 Subject: [PATCH] Do own HTML comment parsing; drop parse5 dep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Did this myself because I guess hell will freeze over before anyone releases my merged parse5 bugfix PR. (╯°□°)╯︵ ┻━┻ https://github.com/inikulin/parse5/pull/326 So shell heredocs will work inside comments now. Closes #4. It's nice to get rid of a big dependency too, and to avoid the runtime cost of doing full HTML parsing when we only needed the comments. And now we have line numbers and helpful error messages for unclosed comments. --- package-lock.json | 27 ------------ package.json | 1 - src/main.js | 105 +++++++++++++++++++++++++++++++++++++++++----- test.ls | 60 ++++++++++++++++++++++++++ 4 files changed, 154 insertions(+), 39 deletions(-) diff --git a/package-lock.json b/package-lock.json index 5c522b8..e0e6e26 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,7 +12,6 @@ "async": "^3.2.1", "diff-match-patch": "^1.0.5", "kleur": "^4.1.4", - "parse5-sax-parser": "^6.0.1", "remark-parse": "^10.0.1", "supports-color": "^9.1.0", "unified": "^10.1.1" @@ -1804,19 +1803,6 @@ "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/parse5": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz", - "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==" - }, - "node_modules/parse5-sax-parser": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/parse5-sax-parser/-/parse5-sax-parser-6.0.1.tgz", - "integrity": "sha512-kXX+5S81lgESA0LsDuGjAlBybImAChYRMT+/uKCEXFBFOeEhS52qUCydGhU3qLRD8D9DVjaUo821WK7DM4iCeg==", - "dependencies": { - "parse5": "^6.0.1" - } - }, "node_modules/path-exists": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", @@ -3594,19 +3580,6 @@ "is-hexadecimal": "^2.0.0" } }, - "parse5": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz", - "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==" - }, - "parse5-sax-parser": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/parse5-sax-parser/-/parse5-sax-parser-6.0.1.tgz", - "integrity": "sha512-kXX+5S81lgESA0LsDuGjAlBybImAChYRMT+/uKCEXFBFOeEhS52qUCydGhU3qLRD8D9DVjaUo821WK7DM4iCeg==", - "requires": { - "parse5": "^6.0.1" - } - }, "path-exists": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", diff --git a/package.json b/package.json index 2cb5c76..15a4ab5 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,6 @@ "async": "^3.2.1", "diff-match-patch": "^1.0.5", "kleur": "^4.1.4", - "parse5-sax-parser": "^6.0.1", "remark-parse": "^10.0.1", "supports-color": "^9.1.0", "unified": "^10.1.1" diff --git a/src/main.js b/src/main.js index 6e7b6ca..481c3cf 100644 --- a/src/main.js +++ b/src/main.js @@ -5,7 +5,6 @@ import remarkParse from 'remark-parse' import async from 'async' import supportsColor from 'supports-color' import color from 'kleur' -import saxParser from 'parse5-sax-parser' import { exec } from 'child_process' import DMP from 'diff-match-patch' @@ -338,15 +337,99 @@ const runTests = (queue, options) => { } } -const extractHtmlComments = function(input){ - var comments, x$, p; - comments = []; - x$ = p = new saxParser(); - x$.on('comment', function(it){ - return comments.push(it.text); - }); - x$.end(input); - return comments; +const extractHtmlComments = function(input, nodePositionInMarkdown){ + // Reference: https://html.spec.whatwg.org/#comments + // + // Comments are generally ``, where `stuff` is disallowed from + // containing the ending delimiter. However, the comment delimiters may also + // occur inside CDATA blocks, where we do *not* want to parse them. + + const comments = [] + + const CDATA_OPENER = '' + const COMMENT_OPENER = '' + const IN_CDATA = Symbol('parser in CDATA') + const IN_COMMENT = Symbol('parser in comment') + const BASE = Symbol('parser in base state') + let state = BASE + let nextIndex = 0 + let done = false + + while (!done) { + const rest = input.slice(nextIndex) + + switch (state) { + case BASE: + // Parse the rest of whichever we see first. CDATA "swallows" + // comments, and vice-versa. + const cdataIndex = rest.indexOf(CDATA_OPENER) + const commentIndex = rest.indexOf(COMMENT_OPENER) + + if (cdataIndex === -1 && commentIndex === -1) { // No more of either; done + done = true + } else if (cdataIndex === -1 && commentIndex >= 0) { // Comment only + state = IN_COMMENT + nextIndex += commentIndex + } else if (cdataIndex >= 0 && commentIndex === -1) { // CDATA only + state = IN_CDATA + nextIndex += cdataIndex + } else { // Matched both. Go with the earlier one. + if (cdataIndex < commentIndex) { // CDATA earlier + state = IN_CDATA + nextIndex += cdataIndex + } else { // Comment earlier + state = IN_COMMENT + nextIndex += commentIndex + } + } + break + + case IN_COMMENT: { + // Parse end of comment + const closerIndex = rest.indexOf(COMMENT_CLOSER) + if (closerIndex >= 0) { + comments.push(rest.slice(0, closerIndex)) + nextIndex += closerIndex + state = BASE + } else { + // Unterminated comment + const openerIndex = input.slice(nextIndex) + const line = input.slice(0, nextIndex).split('\n').length + + nodePositionInMarkdown.start.line - 1 + parsingError(`'' where appropriate.` + + `\nCheck that '-->' doesn't occur anywhere unexpected.` + }) + } + break + } + + case IN_CDATA: { + // Parse end of CDATA + const closerIndex = rest.indexOf(CDATA_CLOSER) + if (closerIndex >= 0) { + nextIndex += closerIndex + state = BASE + } else { + // Unterminated CDATA + const line = input.slice(0, nextIndex).split('\n').length + + nodePositionInMarkdown.start.line - 1 + parsingError(`''` + + ` where appropriate.` + + `\nCheck that ']]>' doesn't occur anywhere unexpected.` + }) + } + break + } + } + } + + return comments }; /* @@ -462,7 +545,7 @@ const parseAndRunTests = (text, options={jobs: 1}) => { const visitMarkdownNode = (node) => { if (node.type === 'html') { - extractHtmlComments(node.value).forEach((comment) => { + extractHtmlComments(node.value, node.position).forEach((comment) => { // Optional whitespace, followed by '!test', more optional whitespace, // then the commands we actually care about. diff --git a/test.ls b/test.ls index c1c773f..f0df4c9 100644 --- a/test.ls +++ b/test.ls @@ -258,6 +258,66 @@ txm-expect do """ +txm-expect do + name: "using << heredocs in comments works" + input: """ + + + + + doesn't matter + + + + YEAH + + """ + expect-stdout: """ + TAP version 13 + 1..1 + ok 1 test name + + # 1/1 passed + # OK + + """ + +txm-expect do + name: "comment inside CDATA is not parsed" + input: """ + + ]]> + """ + expect-stdout: /1..0\n# no tests/ + +txm-expect do + name: "unterminated comment" + expect-exit: 2 + input: """ + + + true + + """ + expect-stdout: /unterminated HTML CDATA[\s\S]*line 2/ + txm-expect do name: "no program specified" input: """