Skip to content

Commit

Permalink
Draft implementation of block begin/end
Browse files Browse the repository at this point in the history
This is a rough draft for https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore

It includes tests, but not code to trigger it from unknown-words/check-spelling.

Initial file format for `block-delimiters.list`:

     ```block-delimiters.list
     # Description of format 1
     <begin token for format 1>
     <end token for format 1>

     # Description of format 2
     <begin token for format 2>
     <end token for format 2>
     ```

Note that this file format has serious drawbacks and is likely to be
replaced before this feature ships.

Possible replacement file format under consideration:

     ```block-ignore.rules
     name: format 1
     begin-pattern: (regular-expression)
     end-pattern: (regular-expression)
     file-path-pattern: (regular-expression)
     stop-after-pattern: (regular-expression)
     ```
  • Loading branch information
jsoref committed Apr 12, 2024
1 parent a8f247c commit bd16416
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .github/actions/spelling/block-delimiters.list
@@ -0,0 +1,3 @@
# block ignore
<!--no check spelling-->
<!--/-->
2 changes: 0 additions & 2 deletions .github/actions/spelling/expect/README.md.txt
@@ -1,6 +1,4 @@
gsutil
ikea
microsoft
spammed
timeframe
workflows
2 changes: 1 addition & 1 deletion action.yml
Expand Up @@ -208,7 +208,7 @@ inputs:
warnings:
description: "List of events that are warnings (items that are neither warnings nor notices will result in an :x:)"
required: false
default: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration
default: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,unclosed-block-ignore-begin,unclosed-block-ignore-end
notices:
description: "List of events that are notices (items that are neither warnings nor notices will result in an :x:)"
required: false
Expand Down
79 changes: 78 additions & 1 deletion lib/CheckSpelling/UnknownWordSplitter.pm
Expand Up @@ -20,6 +20,9 @@ use CheckSpelling::Util;
our $VERSION='0.1.0';

my ($longest_word, $shortest_word, $word_match, $forbidden_re, $patterns_re, $candidates_re, $disable_word_collating, $check_file_names);
my $begin_block_re = '';
my @begin_block_list = ();
my @end_block_list = ();
my ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
my ($shortest, $longest) = (255, 0);
my @forbidden_re_list;
Expand Down Expand Up @@ -94,6 +97,30 @@ sub not_empty {
return defined $thing && $thing ne ''
}

# Read a block delimiter list and return its delimiters as a flat list:
# (begin token, end token, begin token, end token, ...).
#
# Lines starting with `#` and empty lines are skipped; every other line is
# kept verbatim as a delimiter. Any line ending matched by `\R` is accepted.
#
# Returns an empty list when the file cannot be opened, or when it holds an
# odd number of delimiters (an `uneven-block-delimiters` message is then
# printed to STDERR, pointing at the last line that was read).
sub parse_block_list {
    my ($path) = @_;
    my @delimiters;

    # Use a lexical handle: the bareword FILE handle is a package global that
    # split_file also reads from.
    open(my $fh, '<:utf8', $path) or return @delimiters;

    # Slurp the whole file so that `\R` can split on any style of line ending.
    my $content = do { local $/ = undef; <$fh> };
    close $fh;
    $content = '' unless defined $content;

    # `$.` counts records, and a slurped file is a single record, so it cannot
    # provide a line number here; count the lines while walking them instead.
    # (split drops trailing empty fields, so trailing blank lines are not
    # counted.)
    my $last_line = 0;
    for my $line (split /\R/, $content) {
        ++$last_line;
        next if $line =~ /^#/;
        next if $line eq '';
        push @delimiters, $line;
    }

    # Each begin token needs a matching end token; an odd count means the
    # configuration cannot be interpreted, so discard all of it.
    if (@delimiters % 2) {
        print STDERR "$path:$last_line:Block delimiters must come in pairs (uneven-block-delimiters)\n";
        @delimiters = ();
    }

    return @delimiters;
}

sub valid_word {
# shortest_word is an absolute
our ($shortest, $longest, $shortest_word, $longest_word);
Expand Down Expand Up @@ -178,6 +205,7 @@ sub hunspell_dictionary {
sub init {
my ($configuration) = @_;
our ($word_match, %unique, $patterns_re, @forbidden_re_list, $forbidden_re, @candidates_re_list, $candidates_re);
our ($begin_block_re, @begin_block_list, @end_block_list);
our $hunspell_dictionary_path = CheckSpelling::Util::get_file_from_env('hunspell_dictionary_path', '');
our $timeout = CheckSpelling::Util::get_val_from_env('splitter_timeout', 30);
if ($hunspell_dictionary_path) {
Expand All @@ -191,6 +219,23 @@ sub init {
print STDERR "Could not load Text::Hunspell for dictionaries (hunspell-unavailable)\n";
}
}

if (-e "$configuration/block-delimiters.list") {
my @block_delimiters = parse_block_list "$configuration/block-delimiters.list";
if (@block_delimiters) {
@begin_block_list = ();
@end_block_list = ();

while (@block_delimiters) {
my ($begin, $end) = splice @block_delimiters, 0, 2;
push @begin_block_list, $begin;
push @end_block_list, $end;
}

$begin_block_re = join '|', (map { '('.quote_re("\Q$_\E").')' } @begin_block_list);
}
}

my (@patterns_re_list, %in_patterns_re_list);
if (-e "$configuration/patterns.txt") {
@patterns_re_list = file_to_list "$configuration/patterns.txt";
Expand Down Expand Up @@ -306,6 +351,7 @@ sub split_file {
$unrecognized, $shortest, $largest_file, $words,
$word_match, %unique, %unique_unrecognized, $forbidden_re,
@forbidden_re_list, $patterns_re, %dictionary,
$begin_block_re, @begin_block_list, @end_block_list,
$candidates_re, @candidates_re_list, $check_file_names, $use_magic_file, $disable_minified_file
);
our ($ignore_pattern, $upper_pattern, $lower_pattern, $not_lower_pattern, $not_upper_or_lower_pattern, $punctuation_pattern);
Expand Down Expand Up @@ -370,8 +416,9 @@ sub split_file {
local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
alarm $timeout;

my ($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
my $offset = 0;
while (<FILE>) {
LINE: while (<FILE>) {
$_ = decode_utf8($_, FB_DEFAULT);
if (/[\x{D800}-\x{DFFF}]/) {
skip_file($temp_dir, "file contains a UTF-16 surrogate. This is not supported. (utf16-surrogate)\n");
Expand All @@ -381,6 +428,30 @@ sub split_file {
s/^\x{FEFF}// if $. == 1;
next unless /./;
my $raw_line = $_;
my $parsed_block_markers;

# hook for custom multiline based text exclusions:
if ($begin_block_re) {
FIND_END_MARKER: while (1) {
while ($next_end_marker ne '') {
next LINE unless /\Q$next_end_marker\E/;
s/.*?\Q$next_end_marker\E//;
($current_begin_marker, $next_end_marker, $start_marker_line) = ('', '', '');
$parsed_block_markers = 1;
}
my @captured = (/^.*?$begin_block_re/);
last unless (@captured);
for my $capture (0 .. $#captured) {
if ($captured[$capture]) {
($current_begin_marker, $next_end_marker, $start_marker_line) = ($begin_block_list[$capture], $end_block_list[$capture], "$.:1 ... 1");
s/^.*?\Q$begin_block_list[$capture]\E//;
$parsed_block_markers = 1;
next FIND_END_MARKER;
}
}
}
next if $parsed_block_markers;
}

# hook for custom line based text exclusions:
if (defined $patterns_re) {
Expand Down Expand Up @@ -484,6 +555,12 @@ sub split_file {
}
}
}
if ($next_end_marker) {
if ($start_marker_line) {
print WARNINGS ":$start_marker_line, Warning - failed to find matching end marker for `$current_begin_marker` (unclosed-block-ignore-begin)\n";
}
print WARNINGS ":$.:1 ... 1, Warning - expected to find end block marker `$next_end_marker` (unclosed-block-ignore-end)\n";
}

alarm 0;
};
Expand Down
52 changes: 52 additions & 0 deletions sarif.json
Expand Up @@ -556,6 +556,58 @@
"code-reviews"
]
}
},
{
"id": "unclosed-block-ignore-begin",
"name": "UnclosedBlockIgnoreBegin",
"helpUri": "https://github.com/check-spelling/check-spelling/wiki/Event-descriptions#unclosed-block-ignore-begin",
"shortDescription": {
"text": "Unclosed block ignore (begin)"
},
"fullDescription": {
"text": "A begin block ignore was found but not a corresponding end block ignore. This is associated with the found begin mark."
},
"help": {
"text": "?",
"markdown": "**Remediation (click \"Show more\" below)**:\n\n- Check to see if the content has text that should correspond to the end block ignore but does not, if so, correct it.\n- If the begin block ignore is too general, consider making it more specific. See [block ignore examples](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-block-delimiters) and [block ignore feature](https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore)\n\n"
},
"defaultConfiguration": {
"level": "warning"
},
"properties": {
"precision": "low",
"problem.severity": "warning",
"tags": [
"source-code",
"code-reviews"
]
}
},
{
"id": "unclosed-block-ignore-end",
"name": "UnclosedBlockIgnoreEnd",
"helpUri": "https://github.com/check-spelling/check-spelling/wiki/Event-descriptions#unclosed-block-ignore-end",
"shortDescription": {
"text": "Unclosed block ignore (end)"
},
"fullDescription": {
"text": "A begin block ignore was found but not a corresponding end block ignore. This is associated with the missing end mark."
},
"help": {
"text": "?",
"markdown": "**Remediation (click \"Show more\" below)**:\n\n- Check to see if the content has text that should correspond to the end block ignore but does not, if so, correct it.\n- If the begin block ignore is too general, consider making it more specific. See [block ignore examples](https://github.com/check-spelling/check-spelling/wiki/Configuration-Examples:-block-delimiters) and [block ignore feature](https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Block-Ignore)\n\n"
},
"defaultConfiguration": {
"level": "warning"
},
"properties": {
"precision": "low",
"problem.severity": "warning",
"tags": [
"source-code",
"code-reviews"
]
}
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion t/UnknownWordSplitter.t
Expand Up @@ -11,7 +11,7 @@ use File::Temp qw/ tempfile tempdir /;
use IO::Capture::Stderr;

use Test::More;
plan tests => 42;
plan tests => 55;

use_ok('CheckSpelling::UnknownWordSplitter');

Expand Down
3 changes: 3 additions & 0 deletions unknown-words.sh
Expand Up @@ -1085,6 +1085,7 @@ define_variables() {
patterns="$splitter_configuration/patterns.txt"
forbidden_path="$splitter_configuration/forbidden.txt"
candidates_path="$splitter_configuration/candidates.txt"
block_delimiters_path="$splitter_configuration/block-delimiters.list";
excludes="$spellchecker/excludes.txt"
excludes_path="$temp/excludes.txt"
only="$spellchecker/only.txt"
Expand Down Expand Up @@ -2016,6 +2017,8 @@ set_up_files() {
fi
get_project_files line_forbidden.patterns "$forbidden_path"
get_project_files candidate.patterns "$candidates_path"

get_project_files block-delimiters.list "$block_delimiters_path"
fi
extra_dictionaries_cover_entries="$(mktemp)"
get_project_files line_masks.patterns "$patterns_path"
Expand Down

0 comments on commit bd16416

Please sign in to comment.