Draft implementation of block begin/end

This is a rough draft for https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore

It includes tests, but not code to trigger it from unknown-words/check-spelling.

Initial file format for `block-delimiters.list`:

```block-delimiters.list
# Description of format 1
<begin token for format 1>
<end token for format 1>
# Description of format 2
<begin token for format 2>
<end token for format 2>
```

Note that this file format has serious drawbacks and is likely to be replaced before this feature ships.

Possible replacement file format under consideration:

```block-ignore.rules
name: format 1
begin-pattern: (regular-expression)
end-pattern: (regular-expression)
file-path-pattern: (regular-expression)
stop-after-pattern: (regular-expression)
```
check-spelling · Apr 12, 2024 · bd16416 · bd16416
1 parent a8f247c
commit bd16416
Show file tree

Hide file tree

Showing 7 changed files with 138 additions and 5 deletions.
diff --git a/.github/actions/spelling/block-delimiters.list b/.github/actions/spelling/block-delimiters.list
@@ -0,0 +1,3 @@
+# block ignore
+<!--no check spelling-->
+<!--/-->
diff --git a/.github/actions/spelling/expect/README.md.txt b/.github/actions/spelling/expect/README.md.txt
@@ -1,6 +1,4 @@
 gsutil
-ikea
-microsoft
 spammed
 timeframe
 workflows
diff --git a/action.yml b/action.yml
@@ -208,7 +208,7 @@ inputs:
   warnings:
     description: "List of events that are warnings (items that are neither warnings nor notices will result in an :x:)"
     required: false
-    default: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration
+    default: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,unclosed-block-ignore-begin,unclosed-block-ignore-end
   notices:
     description: "List of events that are notices (items that are neither warnings nor notices will result in an :x:)"
     required: false

diff --git a/lib/CheckSpelling/UnknownWordSplitter.pm b/lib/CheckSpelling/UnknownWordSplitter.pm
@@ -20,6 +20,9 @@ use CheckSpelling::Util;
 our $VERSION='0.1.0';
 
 my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
+my $begin_block_re = '';
+my @begin_block_list = ();
+my @end_block_list = ();
 my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
 my ($shortest, $longest) = (255, 0);
 my @forbidden_re_list;
@@ -94,6 +97,30 @@ sub not_empty {
   return defined $thing && $thing ne ''
 }
 
+# Read delimiter pairs from file $re, skipping `#` comments and empty lines.
+# Returns the lines in order (begin, end, begin, end, ...), or an empty list
+# if the file cannot be opened or holds an odd number of delimiters; the
+# latter is reported on STDERR at the line of the last (unpaired) delimiter.
+sub parse_block_list {
+  my ($re) = @_;
+  my @file;
+  # lexical handle: the bareword FILE is also the handle split_file reads
+  return @file unless (open(my $fh, '<:utf8', $re));
+  my $file = do { local $/ = undef; <$fh> } // '';
+  close $fh;
+  # $. is 1 after a slurp, so count lines while walking them instead
+  my ($line_number, $last_line) = (0, 0);
+  for (split /\R/, $file) {
+    ++$line_number;
+    next if /^#/ || !/^./;
+    push @file, $_;
+    $last_line = $line_number;
+  }
+  return @file unless @file % 2;
+  print STDERR "$re:$last_line:Block delimiters must come in pairs (uneven-block-delimiters)\n";
+  return ();
+}
+
 sub valid_word {
   # shortest_word is an absolute
   our ($shortest, $longest, $shortest_word, $longest_word);
@@ -178,6 +205,7 @@ sub hunspell_dictionary {
 sub init {
   my ($configuration) = @_;
   our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
+  our ($begin_block_re, @begin_block_list, @end_block_list);
   our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
   our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
   if ($hunspell_dictionary_path) {
@@ -191,6 +219,23 @@ sub init {
       print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
     }
   }
+
+  if (-e "$configuration/block-delimiters.list") {
+    my @block_delimiters = parse_block_list "$configuration/block-delimiters.list";
+    if (@block_delimiters) {
+      @begin_block_list = ();
+      @end_block_list = ();
+
+      while (@block_delimiters) {
+        my ($begin, $end) = splice @block_delimiters, 0, 2;
+        push @begin_block_list, $begin;
+        push @end_block_list, $end;
+      }
+
+      $begin_block_re = join '|', (map { '('.quote_re("\Q$_\E").')' } @begin_block_list);
+    }
+  }
+
   my (@patterns_re_list, %in_patterns_re_list);
   if (-e "$configuration/patterns.txt") {
     @patterns_re_list = file_to_list "$configuration/patterns.txt";
@@ -306,6 +351,7 @@ sub split_file {
     $unrecognized, $shortest, $largest_file, $words,
     $word_match, %unique, %unique_unrecognized, $forbidden_re,
     @forbidden_re_list, $patterns_re, %dictionary,
+    $begin_block_re, @begin_block_list, @end_block_list,
     $candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file
   );
   our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
@@ -370,8 +416,9 @@ sub split_file {
     local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
     alarm $timeout;
 
+    my ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
     my $offset = 0;
-    while (<FILE>) {
+    LINE: while (<FILE>) {
       $_ = decode_utf8($_, FB_DEFAULT);
       if (/[\x{D800}-\x{DFFF}]/) {
         skip_file($temp_dir, "file contains a UTF-16 surrogate. This is not supported. (utf16-surrogate)\n");
@@ -381,6 +428,30 @@ sub split_file {
       s/^\x{FEFF}// if $. == 1;
       next unless /./;
       my $raw_line = $_;
+      my $parsed_block_markers;
+
+      # hook for custom multiline based text exclusions:
+      if ($begin_block_re) {
+        FIND_END_MARKER: while (1) {
+          while ($next_end_marker ne '') {
+            next LINE unless /\Q$next_end_marker\E/;
+            s/.*?\Q$next_end_marker\E//;
+            ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
+            $parsed_block_markers = 1;
+          }
+          my @captured = (/^.*?$begin_block_re/);
+          last unless (@captured);
+          for my $capture (0 .. $#captured) {
+            if ($captured[$capture]) {
+              ($current_begin_marker, $next_end_marker, $start_marker_line) = ($begin_block_list[$capture], $end_block_list[$capture], "$.:1 ... 1");
+              s/^.*?\Q$begin_block_list[$capture]\E//;
+              $parsed_block_markers = 1;
+              next FIND_END_MARKER;
+            }
+          }
+        }
+        next if $parsed_block_markers;
+      }
 
       # hook for custom line based text exclusions:
       if (defined $patterns_re) {
@@ -484,6 +555,12 @@ sub split_file {
         }
       }
     }
+    if ($next_end_marker) {
+      if ($start_marker_line) {
+        print WARNINGS ":$start_marker_line, Warning - failed to find matching end marker for `$current_begin_marker` (unclosed-block-ignore-begin)\n";
+      }
+      print WARNINGS ":$.:1 ... 1, Warning - expected to find end block marker `$next_end_marker` (unclosed-block-ignore-end)\n";
+    }
 
     alarm 0;
   };

diff --git a/sarif.json b/sarif.json
@@ -556,6 +556,58 @@
                   "code-reviews"
                 ]
               }
+            },
+            {
+              "id": "unclosed-block-ignore-begin",
+              "name": "UnclosedBlockIgnoreBegin",
+              "helpUri": "https://github.com/check-spelling/check-spelling/wiki/Event-descriptions#unclosed-block-ignore-begin",
+              "shortDescription": {
+                "text": "Unclosed block ignore (begin)"
+              },
+              "fullDescription": {
+                "text": "A begin block ignore was found but not a corresponding end block ignore. This is associated with the found begin mark."
+              },
+              "help": {
+                "text": "?",
+                "markdown": "**Remediation (click \"Show more\" below)**:\n\n- Check to see if the content has text that should correspond to the end block ignore but does not, if so, correct it.\n- If the begin block ignore is too general, consider making it more specific. See [block ignore examples](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-block-delimiters) and [block ignore feature](https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore)\n\n"
+              },
+              "defaultConfiguration": {
+                "level": "warning"
+              },
+              "properties": {
+                "precision": "low",
+                "problem.severity": "warning",
+                "tags": [
+                  "source-code",
+                  "code-reviews"
+                ]
+              }
+            },
+            {
+              "id": "unclosed-block-ignore-end",
+              "name": "UnclosedBlockIgnoreEnd",
+              "helpUri": "https://github.com/check-spelling/check-spelling/wiki/Event-descriptions#unclosed-block-ignore-end",
+              "shortDescription": {
+                "text": "Unclosed block ignore (end)"
+              },
+              "fullDescription": {
+                "text": "A begin block ignore was found but not a corresponding end block ignore. This is associated with the missing end mark."
+              },
+              "help": {
+                "text": "?",
+                "markdown": "**Remediation (click \"Show more\" below)**:\n\n- Check to see if the content has text that should correspond to the end block ignore but does not, if so, correct it.\n- If the begin block ignore is too general, consider making it more specific. See [block ignore examples](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-block-delimiters) and [block ignore feature](https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore)\n\n"
+              },
+              "defaultConfiguration": {
+                "level": "warning"
+              },
+              "properties": {
+                "precision": "low",
+                "problem.severity": "warning",
+                "tags": [
+                  "source-code",
+                  "code-reviews"
+                ]
+              }
             }
           ]
         }

diff --git a/t/UnknownWordSplitter.t b/t/UnknownWordSplitter.t
@@ -11,7 +11,7 @@ use File::Temp qw/ tempfile tempdir /;
 use IO::Capture::Stderr;
 
 use Test::More;
-plan tests => 42;
+plan tests => 55;
 
 use_ok('CheckSpelling::UnknownWordSplitter');
 

diff --git a/unknown-words.sh b/unknown-words.sh
@@ -1085,6 +1085,7 @@ define_variables() {
   patterns="$splitter_configuration/patterns.txt"
   forbidden_path="$splitter_configuration/forbidden.txt"
   candidates_path="$splitter_configuration/candidates.txt"
+  block_delimiters_path="$splitter_configuration/block-delimiters.list";
   excludes="$spellchecker/excludes.txt"
   excludes_path="$temp/excludes.txt"
   only="$spellchecker/only.txt"
@@ -2016,6 +2017,8 @@ set_up_files() {
     fi
     get_project_files line_forbidden.patterns "$forbidden_path"
     get_project_files candidate.patterns "$candidates_path"
+
+    get_project_files block-delimiters.list "$block_delimiters_path"
   fi
   extra_dictionaries_cover_entries="$(mktemp)"
   get_project_files line_masks.patterns "$patterns_path"