Skip to content

Commit

Permalink
Do own HTML comment parsing; drop parse5 dep
Browse files Browse the repository at this point in the history
Did this myself because I guess hell will freeze over before
anyone releases my merged parse5 bugfix PR.  (╯°□°)╯︵ ┻━┻
inikulin/parse5#326

So shell heredocs will work inside comments now.  Closes #4.

It's nice to get rid of a big dependency too, and to avoid the runtime
cost of doing full HTML parsing when we only needed the comments.  And
now we have line numbers and helpful error messages for unclosed
comments.
  • Loading branch information
anko committed Nov 24, 2021
1 parent a37e85c commit 9fb431f
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 39 deletions.
27 changes: 0 additions & 27 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion package.json
Expand Up @@ -40,7 +40,6 @@
"async": "^3.2.1",
"diff-match-patch": "^1.0.5",
"kleur": "^4.1.4",
"parse5-sax-parser": "^6.0.1",
"remark-parse": "^10.0.1",
"supports-color": "^9.1.0",
"unified": "^10.1.1"
Expand Down
105 changes: 94 additions & 11 deletions src/main.js
Expand Up @@ -5,7 +5,6 @@ import remarkParse from 'remark-parse'
import async from 'async'
import supportsColor from 'supports-color'
import color from 'kleur'
import saxParser from 'parse5-sax-parser'
import { exec } from 'child_process'
import DMP from 'diff-match-patch'

Expand Down Expand Up @@ -338,15 +337,99 @@ const runTests = (queue, options) => {
}
}

const extractHtmlComments = function(input){
var comments, x$, p;
comments = [];
x$ = p = new saxParser();
x$.on('comment', function(it){
return comments.push(it.text);
});
x$.end(input);
return comments;
const extractHtmlComments = function(input, nodePositionInMarkdown){
  // Returns the text of every HTML comment in `input`, in document order,
  // *without* the `<!--` / `-->` delimiters.
  //
  // Parameters:
  //   input: string of raw HTML (the `value` of a markdown `html` node).
  //   nodePositionInMarkdown: position of that node.  Only `.start.line` is
  //     read, to turn offsets within `input` into markdown line numbers for
  //     error messages.
  //
  // Errors: an unterminated comment or CDATA section is reported through
  // `parsingError`, with the line on which the unterminated construct opened.
  //
  // Reference: https://html.spec.whatwg.org/#comments
  //
  // Comments are generally `<!-- stuff -->`, where `stuff` is disallowed from
  // containing the ending delimiter.  However, the comment delimiters may also
  // occur inside CDATA blocks, where we do *not* want to parse them.

  const CDATA_OPENER = '<![CDATA['
  const CDATA_CLOSER = ']]>'
  const COMMENT_OPENER = '<!--'
  const COMMENT_CLOSER = '-->'

  const comments = []

  // Line number (within the markdown document) of the character at `index`.
  const lineAt = (index) =>
    input.slice(0, index).split('\n').length
      + nodePositionInMarkdown.start.line - 1

  // Everything before `cursor` has already been consumed.
  let cursor = 0

  while (cursor < input.length) {
    // Parse whichever construct opens first.  CDATA "swallows" comments, and
    // vice-versa.  The two openers can never start at the same offset.
    const cdataStart = input.indexOf(CDATA_OPENER, cursor)
    const commentStart = input.indexOf(COMMENT_OPENER, cursor)

    if (cdataStart === -1 && commentStart === -1) {
      break // No more of either; done
    }

    const commentComesFirst = commentStart !== -1
      && (cdataStart === -1 || commentStart < cdataStart)

    if (commentComesFirst) {
      const textStart = commentStart + COMMENT_OPENER.length

      // The closer is allowed to share the opener's dashes, so that `<!-->`
      // and `<!--->` close immediately as empty comments instead of
      // swallowing whatever follows.
      const closerStart = input.indexOf(COMMENT_CLOSER, commentStart + 2)

      if (closerStart === -1) {
        // Unterminated comment
        const line = lineAt(commentStart)
        parsingError(`'<!--'`, 'unterminated HTML comment', {
          location: formatPosition({ start: { line }, end: { line } }),
          'how to fix': `Terminate the comment with '-->' where appropriate.`
            + `\nCheck that '-->' doesn't occur anywhere unexpected.`
        })
        // `parsingError` is defined elsewhere and presumably aborts.  Stop
        // scanning regardless, so this loop can never spin if it returns.
        break
      }

      // `slice` yields '' when the closer overlaps the opener
      // (closerStart < textStart).
      comments.push(input.slice(textStart, closerStart))
      cursor = closerStart + COMMENT_CLOSER.length
    } else {
      const closerStart = input.indexOf(
        CDATA_CLOSER, cdataStart + CDATA_OPENER.length)

      if (closerStart === -1) {
        // Unterminated CDATA
        const line = lineAt(cdataStart)
        parsingError(`'<![CDATA['`, 'unterminated HTML CDATA section', {
          location: formatPosition({ start: { line }, end: { line } }),
          'how to fix': `Terminate the CDATA section with ']]>'`
            + ` where appropriate.`
            + `\nCheck that ']]>' doesn't occur anywhere unexpected.`
        })
        // Same reasoning as for unterminated comments above.
        break
      }

      cursor = closerStart + CDATA_CLOSER.length
    }
  }

  return comments
};

/*
Expand Down Expand Up @@ -462,7 +545,7 @@ const parseAndRunTests = (text, options={jobs: 1}) => {
const visitMarkdownNode = (node) => {

if (node.type === 'html') {
extractHtmlComments(node.value).forEach((comment) => {
extractHtmlComments(node.value, node.position).forEach((comment) => {

// Optional whitespace, followed by '!test', more optional whitespace,
// then the commands we actually care about.
Expand Down
60 changes: 60 additions & 0 deletions test.ls
Expand Up @@ -258,6 +258,66 @@ txm-expect do
"""

# A shell heredoc (`<<X`) inside a `!test program` comment must not confuse
# comment extraction; the test is expected to run and pass.
txm-expect do
name: "using << heredocs in comments works"
input: """
<!-- !test program node <<X
console.log('YEAH')
X -->
<!-- !test in test name -->
doesn't matter
<!-- !test out test name -->
YEAH
"""
expect-stdout: """
TAP version 13
1..1
ok 1 test name
# 1/1 passed
# OK
"""

# A comment opener inside a CDATA section must be ignored, so no tests are
# found.
txm-expect do
name: "comment inside CDATA is not parsed"
input: """
<![CDATA[
This would be an error if it were parsed:
<!-- !test in whatever -->
]]>
"""
expect-stdout: /1..0\n# no tests/

# A comment with no `-->` must exit with code 2 and report the line on which
# the comment opened.
txm-expect do
name: "unterminated comment"
expect-exit: 2
input: """
<!-- !test check whatever
true
"""
expect-stdout: /unterminated HTML comment[\s\S]*line 1/

# A CDATA section with no `]]>` must exit with code 2 and report the line of
# the `<![CDATA[` opener (line 2 here), not of the comment inside it.
txm-expect do
name: "unterminated CDATA section"
expect-exit: 2
input: """
other stuff
<![CDATA[
<!-- !test check whatever -->
true
"""
expect-stdout: /unterminated HTML CDATA[\s\S]*line 2/

txm-expect do
name: "no program specified"
input: """
Expand Down

0 comments on commit 9fb431f

Please sign in to comment.