Do own HTML comment parsing; drop parse5 dep

Did this myself because I guess hell will freeze over before anyone releases my merged parse5 bugfix PR (inikulin/parse5#326). (╯°□°)╯︵ ┻━┻ So shell heredocs will work inside comments now. Closes #4. It's nice to get rid of a big dependency too, and to avoid the runtime cost of doing full HTML parsing when we only needed the comments. And now we have line numbers and helpful error messages for unclosed comments.
anko · Nov 24, 2021 · 9fb431f · 9fb431f
1 parent a37e85c
commit 9fb431f
Show file tree

Hide file tree

Showing 4 changed files with 154 additions and 39 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -40,7 +40,6 @@
     "async": "^3.2.1",
     "diff-match-patch": "^1.0.5",
     "kleur": "^4.1.4",
-    "parse5-sax-parser": "^6.0.1",
     "remark-parse": "^10.0.1",
     "supports-color": "^9.1.0",
     "unified": "^10.1.1"

diff --git a/src/main.js b/src/main.js
@@ -5,7 +5,6 @@ import remarkParse from 'remark-parse'
 import async from 'async'
 import supportsColor from 'supports-color'
 import color from 'kleur'
-import saxParser from 'parse5-sax-parser'
 import { exec } from 'child_process'
 import DMP from 'diff-match-patch'
 
@@ -338,15 +337,99 @@ const runTests = (queue, options) => {
   }
 }
 
-const extractHtmlComments = function(input){
-  var comments, x$, p;
-  comments = [];
-  x$ = p = new saxParser();
-  x$.on('comment', function(it){
-    return comments.push(it.text);
-  });
-  x$.end(input);
-  return comments;
/**
 * Extract the text of every HTML comment in `input`.
 *
 * Reference: https://html.spec.whatwg.org/#comments
 *
 * Comments are generally `<!-- stuff -->`, where `stuff` is disallowed from
 * containing the ending delimiter.  However, the comment delimiters may also
 * occur inside CDATA sections, where we do *not* want to parse them.
 * Whichever construct opens first "swallows" the other: a comment opener
 * inside CDATA is ignored, and a CDATA opener inside a comment is just part
 * of the comment text.
 *
 * @param {string} input - Raw HTML text to scan.
 * @param {{start: {line: number}}} [nodePositionInMarkdown] - Position of
 *   the HTML node within the enclosing document.  Only `start.line` is read,
 *   and only to compute the line reported for an unterminated construct.
 *   When omitted, lines are counted from 1.
 * @returns {string[]} The text between each `<!--` and its matching `-->`
 *   (delimiters excluded), in document order.
 *
 * An unterminated comment or CDATA section is reported through
 * `parsingError` (defined elsewhere in this file).
 */
const extractHtmlComments = function(input, nodePositionInMarkdown){
  const CDATA_OPENER = '<![CDATA['
  const CDATA_CLOSER = ']]>'
  const COMMENT_OPENER = '<!--'
  const COMMENT_CLOSER = '-->'

  const comments = []

  // Line (in the enclosing document) on which the character at `index` sits.
  const lineNumberAt = (index) => {
    const firstLine = nodePositionInMarkdown
      ? nodePositionInMarkdown.start.line
      : 1
    return input.slice(0, index).split('\n').length + firstLine - 1
  }

  let cursor = 0
  while (cursor < input.length) {
    const cdataStart = input.indexOf(CDATA_OPENER, cursor)
    const commentStart = input.indexOf(COMMENT_OPENER, cursor)

    if (cdataStart === -1 && commentStart === -1) break // No more of either

    // Go with whichever opener comes first.  (They can never start at the
    // same index, since they differ in their third character.)
    const commentComesFirst = cdataStart === -1
      || (commentStart !== -1 && commentStart < cdataStart)

    if (commentComesFirst) {
      // Search for the closer only *after* the opener, so that the opener is
      // not part of the returned text and cannot overlap with the closer.
      const textStart = commentStart + COMMENT_OPENER.length
      const closerStart = input.indexOf(COMMENT_CLOSER, textStart)

      if (closerStart === -1) {
        // Unterminated comment
        const line = lineNumberAt(commentStart)
        parsingError(`'<!--'`, 'unterminated HTML comment', {
          location: formatPosition({ start: { line }, end: { line } }),
          'how to fix': `Terminate the comment with '-->' where appropriate.`
            + `\nCheck that '-->' doesn't occur anywhere unexpected.`
        })
        // NOTE(review): parsingError presumably aborts; if it ever returns,
        // stop scanning rather than looping forever on the same opener.
        break
      }

      comments.push(input.slice(textStart, closerStart))
      cursor = closerStart + COMMENT_CLOSER.length
    } else {
      const closerStart = input.indexOf(
        CDATA_CLOSER, cdataStart + CDATA_OPENER.length)

      if (closerStart === -1) {
        // Unterminated CDATA
        const line = lineNumberAt(cdataStart)
        parsingError(`'<![CDATA['`, 'unterminated HTML CDATA section', {
          location: formatPosition({ start: { line }, end: { line } }),
          'how to fix': `Terminate the CDATA section with ']]>'`
            + ` where appropriate.`
            + `\nCheck that ']]>' doesn't occur anywhere unexpected.`
        })
        // NOTE(review): same as above; never spin if parsingError returns.
        break
      }

      // CDATA content is skipped entirely; nothing is collected from it.
      cursor = closerStart + CDATA_CLOSER.length
    }
  }

  return comments
};
 
 /*
@@ -462,7 +545,7 @@ const parseAndRunTests = (text, options={jobs: 1}) => {
   const visitMarkdownNode = (node) => {
 
     if (node.type === 'html') {
-      extractHtmlComments(node.value).forEach((comment) => {
+      extractHtmlComments(node.value, node.position).forEach((comment) => {
 
         // Optional whitespace, followed by '!test', more optional whitespace,
         // then the commands we actually care about.

diff --git a/test.ls b/test.ls
@@ -258,6 +258,66 @@ txm-expect do
 
   """
 
+txm-expect do
+  name: "using << heredocs in comments works"
+  input: """
+  <!-- !test program node <<X
+  console.log('YEAH')
+  X -->
+
+  <!-- !test in test name -->
+
+      doesn't matter
+
+  <!-- !test out test name -->
+
+      YEAH
+
+  """
+  expect-stdout: """
+  TAP version 13
+  1..1
+  ok 1 test name
+
+  # 1/1 passed
+  # OK
+
+  """
+
+txm-expect do
+  name: "comment inside CDATA is not parsed"
+  input: """
+  <![CDATA[
+    This would be an error if it were parsed:
+    <!-- !test in whatever -->
+  ]]>
+  """
+  expect-stdout: /1..0\n# no tests/
+
+txm-expect do
+  name: "unterminated comment"
+  expect-exit: 2
+  input: """
+  <!-- !test check whatever
+
+    true
+
+  """
+  expect-stdout: /unterminated HTML comment[\s\S]*line 1/
+
+txm-expect do
+  name: "unterminated CDATA section"
+  expect-exit: 2
+  input: """
+  other stuff
+  <![CDATA[
+  <!-- !test check whatever -->
+
+      true
+
+  """
+  expect-stdout: /unterminated HTML CDATA[\s\S]*line 2/
+
 txm-expect do
   name: "no program specified"
   input: """