From 3a03733ec9d76beb574b0aeeae44b3e06c87ef04 Mon Sep 17 00:00:00 2001 From: Braydon Kains <93549768+braydonk@users.noreply.github.com> Date: Sun, 2 Apr 2023 10:21:20 -0400 Subject: [PATCH] yamlfmt: created content analyzer (#106) * yamlfmt: created content analyzer In some use cases the provided patterns of exclusion aren't enough, for example when something generates a lot of yaml files in weird places, or when single files should be ignored out of large directories. This PR adds the following: * The concept of and ability to read metadata, with the first type of metadata being "ignore" * A new ContentAnalyzer interface and a BasicContentAnalyzer which will accept an array of regex pattern strings. This will first read the metadata from the content of each file to find the ignore metadata, then will match the content to the regex patterns provided to determine which patterns will be excluded. * yamlfmt: improve metadata errors Change metadata errors into a wrapped error struct so that the line number and path can be included in the resulting error. Make metadata errors non-fatal to `yamlfmt` as a whole. Also add the docs in this commit. --- Makefile | 18 ++- command/command.go | 27 +++- content_analyzer.go | 76 ++++++++++++ content_analyzer_test.go | 116 ++++++++++++++++++ docs/config-file.md | 1 + docs/metadata.md | 24 ++++ internal/collections/set.go | 19 +++ internal/tempfile/path.go | 44 ++++++- metadata.go | 100 +++++++++++++++ metadata_test.go | 79 ++++++++++++ testdata/content_analyzer/has_ignore/x.yaml | 2 + testdata/content_analyzer/has_ignore/y.yaml | 1 + testdata/content_analyzer/regex_ignore/x.yaml | 2 + testdata/content_analyzer/regex_ignore/y.yaml | 1 + 14 files changed, 501 insertions(+), 9 deletions(-) create mode 100644 content_analyzer.go create mode 100644 content_analyzer_test.go create mode 100644 docs/metadata.md create mode 100644 metadata.go create mode 100644 metadata_test.go create mode 100644 testdata/content_analyzer/has_ignore/x.yaml create mode 100644 testdata/content_analyzer/has_ignore/y.yaml create mode 100644 testdata/content_analyzer/regex_ignore/x.yaml create mode 100644 testdata/content_analyzer/regex_ignore/y.yaml diff --git a/Makefile b/Makefile index c693c40..6b52ce2 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,23 @@ +.PHONY: build build: go build ./cmd/yamlfmt +.PHONY: test +test: + go test ./... + +.PHONY: test_v +test_v: + go test -v ./... + +.PHONY: install install: go install ./cmd/yamlfmt +.PHONY: install_tools install_tools: go install github.com/google/addlicense@latest +.PHONY: addlicense addlicense: addlicense -c "Google LLC" -l apache . - -test_diff: - go test -v -mod=mod github.com/google/yamlfmt/internal/diff - -test_basic_formatter: - go test -v -mod=mod github.com/google/yamlfmt/formatters/basic \ No newline at end of file diff --git a/command/command.go b/command/command.go index e4bdf0b..4cf6680 100644 --- a/command/command.go +++ b/command/command.go @@ -49,6 +49,7 @@ type Config struct { Extensions []string `mapstructure:"extensions"` Include []string `mapstructure:"include"` Exclude []string `mapstructure:"exclude"` + RegexExclude []string `mapstructure:"regex_exclude"` Doublestar bool `mapstructure:"doublestar"` LineEnding yamlfmt.LineBreakStyle `mapstructure:"line_ending"` FormatterConfig *FormatterConfig `mapstructure:"formatter,omitempty"` @@ -110,10 +111,15 @@ func (c *Command) Run() error { Quiet: c.Quiet, } - paths, err := c.collectPaths() + collectedPaths, err := c.collectPaths() if err != nil { return err } + paths, err := c.analyzePaths(collectedPaths) + if err != nil { + log.Printf("path analysis found the following errors:\n%v", err) + log.Println("Continuing...") + } switch c.Operation { case OperationFormat: @@ -137,7 +143,11 @@ func (c *Command) Run() error { if err != nil { return err } - log.Print(out) + if out.Message == "" { + log.Print("No files will be changed.") + } else { + log.Print(out) + } case OperationStdin: stdinYaml, err := readFromStdin() if err != nil { @@ -158,6 +168,15 @@ func (c *Command) collectPaths() ([]string, error) { return collector.CollectPaths() } +func (c *Command) analyzePaths(paths []string) ([]string, error) { + analyzer, err := c.makeAnalyzer() + if err != nil { + return nil, err + } + includePaths, _, err := analyzer.ExcludePathsByContent(paths) + return includePaths, err +} + func (c *Command) makePathCollector() yamlfmt.PathCollector { if c.Config.Doublestar { return &yamlfmt.DoublestarCollector{ @@ -172,6 +191,10 @@ func (c *Command) makePathCollector() yamlfmt.PathCollector { } } +func (c *Command) makeAnalyzer() (yamlfmt.ContentAnalyzer, error) { + return yamlfmt.NewBasicContentAnalyzer(c.Config.RegexExclude) +} + func readFromStdin() ([]byte, error) { stdin := bufio.NewReader(os.Stdin) data := []byte{} diff --git a/content_analyzer.go b/content_analyzer.go new file mode 100644 index 0000000..10da3de --- /dev/null +++ b/content_analyzer.go @@ -0,0 +1,76 @@ +package yamlfmt + +import ( + "os" + "regexp" + + "github.com/google/yamlfmt/internal/collections" +) + +type ContentAnalyzer interface { + ExcludePathsByContent(paths []string) ([]string, []string, error) +} + +type BasicContentAnalyzer struct { + RegexPatterns []*regexp.Regexp +} + +func NewBasicContentAnalyzer(patterns []string) (BasicContentAnalyzer, error) { + analyzer := BasicContentAnalyzer{RegexPatterns: []*regexp.Regexp{}} + compileErrs := collections.Errors{} + for _, pattern := range patterns { + re, err := regexp.Compile(pattern) + if err != nil { + compileErrs = append(compileErrs, err) + continue + } + analyzer.RegexPatterns = append(analyzer.RegexPatterns, re) + } + return analyzer, compileErrs.Combine() +} + +func (a BasicContentAnalyzer) ExcludePathsByContent(paths []string) ([]string, []string, error) { + pathsToFormat := collections.SliceToSet(paths) + pathsExcluded := []string{} + pathErrs := collections.Errors{} + + for _, path := range paths { + content, err := os.ReadFile(path) + if err != nil { + pathErrs = append(pathErrs, err) + continue + } + + // Search metadata for ignore + metadata, mdErrs := ReadMetadata(content, path) + if len(mdErrs) != 0 { + pathErrs = append(pathErrs, mdErrs...) + } + ignoreFound := false + for md := range metadata { + if md.Type == MetadataIgnore { + ignoreFound = true + break + } + } + if ignoreFound { + pathsExcluded = append(pathsExcluded, path) + pathsToFormat.Remove(path) + continue + } + + // Check if content matches any regex + matched := false + for _, pattern := range a.RegexPatterns { + if pattern.Match(content) { + matched = true + } + } + if matched { + pathsExcluded = append(pathsExcluded, path) + pathsToFormat.Remove(path) + } + } + + return pathsToFormat.ToSlice(), pathsExcluded, pathErrs.Combine() +} diff --git a/content_analyzer_test.go b/content_analyzer_test.go new file mode 100644 index 0000000..f31ee50 --- /dev/null +++ b/content_analyzer_test.go @@ -0,0 +1,116 @@ +package yamlfmt_test + +import ( + "path/filepath" + "testing" + + "github.com/google/yamlfmt" + "github.com/google/yamlfmt/internal/collections" + "github.com/google/yamlfmt/internal/tempfile" +) + +const testdataBase = "testdata/content_analyzer" + +func TestBasicContentAnalyzer(t *testing.T) { + testCases := []struct { + name string + testdataDir string + excludePatterns []string + expectedPaths collections.Set[string] + expectedExcluded collections.Set[string] + }{ + { + name: "has ignore metadata", + testdataDir: "has_ignore", + excludePatterns: []string{}, + expectedPaths: collections.Set[string]{ + "y.yaml": {}, + }, + expectedExcluded: collections.Set[string]{ + "x.yaml": {}, + }, + }, + { + name: "matches regex pattern", + testdataDir: "regex_ignore", + excludePatterns: []string{ + ".*generated by.*", + }, + expectedPaths: collections.Set[string]{ + "y.yaml": {}, + }, + expectedExcluded: collections.Set[string]{ + "x.yaml": {}, + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + tempPath := t.TempDir() + testdataDir := filepath.Join(testdataBase, tc.testdataDir) + paths, err := tempfile.ReplicateDirectory(testdataDir, tempPath) + if err != nil { + t.Fatalf("could not replicate testdata directory %s: %v", tc.testdataDir, err) + } + err = paths.CreateAll() + if err != nil { + t.Fatalf("could not create full test directory: %v", err) + } + contentAnalyzer, err := yamlfmt.NewBasicContentAnalyzer(tc.excludePatterns) + if err != nil { + t.Fatalf("could not create content analyzer: %v", err) + } + collector := &yamlfmt.FilepathCollector{ + Include: []string{tempPath}, + Exclude: []string{}, + Extensions: []string{"yaml", "yml"}, + } + collectedPaths, err := collector.CollectPaths() + if err != nil { + t.Fatalf("CollectPaths failed: %v", err) + } + resultPaths, excludedPaths, err := contentAnalyzer.ExcludePathsByContent(collectedPaths) + if err != nil { + t.Fatalf("expected content analyzer to work, got error: %v", err) + } + resultPathsTrimmed, err := pathsTempdirTrimmed(resultPaths, tempPath) + if err != nil { + t.Fatalf("expected trimming tempdir from result not to have error: %v", err) + } + if !tc.expectedPaths.Equals(collections.SliceToSet(resultPathsTrimmed)) { + t.Fatalf("expected files:\n%v\ngot:\n%v", tc.expectedPaths, resultPaths) + } + excludePathsTrimmed, err := pathsTempdirTrimmed(excludedPaths, tempPath) + if err != nil { + t.Fatalf("expected trimming tempdir from excluded not to have error: %v", err) + } + if !tc.expectedExcluded.Equals(collections.SliceToSet(excludePathsTrimmed)) { + t.Fatalf("expected excludsions:\n%v\ngot:\n%v", tc.expectedExcluded, excludedPaths) + } + }) + } +} + +func TestBadNewContentAnalyzer(t *testing.T) { + // Illegal because no closing ) + badPattern := "%^3412098(]fj" + _, err := yamlfmt.NewBasicContentAnalyzer([]string{badPattern}) + if err == nil { + t.Fatalf("expected there to be an error") + } +} + +func pathsTempdirTrimmed(paths []string, tempDir string) ([]string, error) { + trimmedPaths := []string{} + for _, path := range paths { + trimmedPath, err := filepath.Rel(tempDir, path) + if err != nil { + return nil, err + } + trimmedPaths = append(trimmedPaths, trimmedPath) + } + return trimmedPaths, nil +} diff --git a/docs/config-file.md b/docs/config-file.md index 79080ee..0897819 100644 --- a/docs/config-file.md +++ b/docs/config-file.md @@ -22,6 +22,7 @@ The command package defines the main command engine that `cmd/yamlfmt` uses. It | `doublestar` | bool | false | Use [doublestar](https://github.com/bmatcuk/doublestar) for include and exclude paths. (This was the default before 0.7.0) | | `include` | []string | [] | The paths for the command to include for formatting. See [Specifying Paths][] for more details. | | `exclude` | []string | [] | The paths for the command to exclude from formatting. See [Specifying Paths][] for more details. | +| `regex_exclude` | []string | [] | Regex patterns to match file contents for, if the file content matches the regex the file will be excluded. Use [Golang regexes](https://regex101.com/). | | `extensions` | []string | [] | The extensions to use for standard mode path collection. See [Specifying Paths][] for more details. | | `formatter` | map[string]any | default basic formatter | Formatter settings. See [Formatter](#formatter) for more details. | diff --git a/docs/metadata.md b/docs/metadata.md new file mode 100644 index 0000000..86b7a20 --- /dev/null +++ b/docs/metadata.md @@ -0,0 +1,24 @@ +# Metadata + +The `yamlfmt` library supports recognizing a limited amount of metadata from a yaml file. + +## How to specify + +Metadata is specified with a special token, followed by a colon, and then a type. For example, to add `ignore` metadata to a file: +``` +# !yamlfmt!:ignore +``` +If this string `!yamlfmt!:ignore` is anywhere in the file, the file will be dropped from the paths to format. + +The format of `!yamlfmt!:type` is strict; there must be a colon separating the metadata identifier and the type, and there must be no whitespace separating anything within the metadata identifier block. For example either of these will cause an error: +``` +# !yamlfmt!: ignore +# !yamlfmt!ignore +``` +Metadata errors are considered non-fatal, and `yamlfmt` will attempt to continue despite them. + +## Types + +| Type | Example | Description | +|:-------|:-------------------|:------------| +| ignore | `!yamlfmt!:ignore` | If found, `yamlfmt` will exclude the file from formatting. | \ No newline at end of file diff --git a/internal/collections/set.go b/internal/collections/set.go index 4201cbd..4a04ec9 100644 --- a/internal/collections/set.go +++ b/internal/collections/set.go @@ -23,6 +23,25 @@ func (s Set[T]) ToSlice() []T { return sl } +func (s Set[T]) Clone() Set[T] { + newSet := Set[T]{} + for el := range s { + newSet.Add(el) + } + return newSet +} + +func (s Set[T]) Equals(rhs Set[T]) bool { + if len(s) != len(rhs) { + return false + } + rhsClone := rhs.Clone() + for el := range s { + rhsClone.Remove(el) + } + return len(rhsClone) == 0 +} + func SliceToSet[T comparable](sl []T) Set[T] { set := Set[T]{} for _, el := range sl { diff --git a/internal/tempfile/path.go b/internal/tempfile/path.go index f8a3a51..8f129d5 100644 --- a/internal/tempfile/path.go +++ b/internal/tempfile/path.go @@ -1,14 +1,18 @@ package tempfile import ( + "io/fs" "os" "path/filepath" + + "github.com/google/yamlfmt/internal/collections" ) type Path struct { BasePath string FileName string IsDir bool + Content []byte } func (p *Path) Create() error { @@ -17,7 +21,7 @@ func (p *Path) Create() error { if p.IsDir { err = os.Mkdir(file, os.ModePerm) } else { - err = os.WriteFile(file, []byte{}, os.ModePerm) + err = os.WriteFile(file, p.Content, os.ModePerm) } return err } @@ -25,3 +29,41 @@ func (p *Path) Create() error { func (p *Path) fullPath() string { return filepath.Join(p.BasePath, p.FileName) } + +type Paths []Path + +func (ps Paths) CreateAll() error { + errs := collections.Errors{} + for _, path := range ps { + errs = append(errs, path.Create()) + } + return errs.Combine() +} + +func ReplicateDirectory(dir string, newBase string) (Paths, error) { + paths := Paths{} + + err := filepath.Walk(dir, func(path string, info fs.FileInfo, walkErr error) error { + if walkErr != nil { + return walkErr + } + content := []byte{} + + if !info.IsDir() { + readContent, err := os.ReadFile(path) + if err != nil { + return err + } + content = readContent + } + + paths = append(paths, Path{ + BasePath: newBase, + FileName: info.Name(), + IsDir: info.IsDir(), + Content: content, + }) + return nil + }) + return paths, err +} diff --git a/metadata.go b/metadata.go new file mode 100644 index 0000000..deea91a --- /dev/null +++ b/metadata.go @@ -0,0 +1,100 @@ +package yamlfmt + +import ( + "errors" + "fmt" + "strings" + "unicode" + + "github.com/google/yamlfmt/internal/collections" +) + +const MetadataIdentifier = "!yamlfmt!" + +type MetadataType string + +const ( + MetadataIgnore MetadataType = "ignore" +) + +func IsMetadataType(mdValueStr string) bool { + mdTypes := collections.Set[MetadataType]{} + mdTypes.Add(MetadataIgnore) + return mdTypes.Contains(MetadataType(mdValueStr)) +} + +type Metadata struct { + Type MetadataType + LineNum int +} + +var ( + ErrMalformedMetadata = errors.New("metadata: malformed string") + ErrUnrecognizedMetadata = errors.New("metadata: unrecognized type") +) + +type MetadataError struct { + err error + path string + lineNum int + lineStr string +} + +func (e *MetadataError) Error() string { + return fmt.Sprintf( + "%v: %s:%d:%s", + e.err, + e.path, + e.lineNum, + e.lineStr, + ) +} + +func (e *MetadataError) Unwrap() error { + return e.err +} + +func ReadMetadata(content []byte, path string) (collections.Set[Metadata], collections.Errors) { + metadata := collections.Set[Metadata]{} + mdErrs := collections.Errors{} + // This could be `\r\n` but it won't affect the outcome of this operation. + contentLines := strings.Split(string(content), "\n") + for i, line := range contentLines { + mdidIndex := strings.Index(line, MetadataIdentifier) + if mdidIndex == -1 { + continue + } + mdStr := scanMetadata(line, mdidIndex) + mdComponents := strings.Split(mdStr, ":") + if len(mdComponents) != 2 { + mdErrs = append(mdErrs, &MetadataError{ + path: path, + lineNum: i + 1, + err: ErrMalformedMetadata, + lineStr: line, + }) + continue + } + if IsMetadataType(mdComponents[1]) { + metadata.Add(Metadata{LineNum: i + 1, Type: MetadataType(mdComponents[1])}) + } else { + mdErrs = append(mdErrs, &MetadataError{ + path: path, + lineNum: i + 1, + err: ErrUnrecognizedMetadata, + lineStr: line, + }) + } + } + return metadata, mdErrs +} + +func scanMetadata(line string, index int) string { + mdBytes := []byte{} + i := index + for i < len(line) && !unicode.IsSpace(rune(line[i])) { + mdBytes = append(mdBytes, line[i]) + i++ + } + return string(mdBytes) +} diff --git a/metadata_test.go b/metadata_test.go new file mode 100644 index 0000000..9b0c6f1 --- /dev/null +++ b/metadata_test.go @@ -0,0 +1,79 @@ +package yamlfmt_test + +import ( + "errors" + "testing" + + "github.com/google/yamlfmt" + "github.com/google/yamlfmt/internal/collections" +) + +type errorChecker func(t *testing.T, errs collections.Errors) + +func checkErrNil(t *testing.T, errs collections.Errors) { + combinedErr := errs.Combine() + if combinedErr != nil { + t.Fatalf("expected error to be nil, got: %v", combinedErr) + } +} + +func TestReadMetadata(t *testing.T) { + testCases := []struct { + name string + content string + expected collections.Set[yamlfmt.Metadata] + errCheck errorChecker + }{ + { + name: "contains no metadata", + content: "", + expected: collections.Set[yamlfmt.Metadata]{}, + errCheck: checkErrNil, + }, + { + name: "has ignore metadata", + content: "# !yamlfmt!:ignore\na: 1", + expected: collections.Set[yamlfmt.Metadata]{ + {Type: yamlfmt.MetadataIgnore, LineNum: 1}: {}, + }, + errCheck: checkErrNil, + }, + { + name: "has bad metadata", + content: "# !yamlfmt!fjghgh", + expected: collections.Set[yamlfmt.Metadata]{}, + errCheck: func(t *testing.T, errs collections.Errors) { + if len(errs) != 1 { + t.Fatalf("expected 1 error, got %d:\n%v", len(errs), errs.Combine()) + } + if errors.Unwrap(errs[0]) != yamlfmt.ErrMalformedMetadata { + t.Fatalf("expected ErrMalformedMetadata, got: %v", errs[0]) + } + }, + }, + { + name: "has unrecognized metadata type", + content: "# !yamlfmt!:lulsorandom", + expected: collections.Set[yamlfmt.Metadata]{}, + errCheck: func(t *testing.T, errs collections.Errors) { + if len(errs) != 1 { + t.Fatalf("expected 1 error, got %d:\n%v", len(errs), errs.Combine()) + } + if errors.Unwrap(errs[0]) != yamlfmt.ErrUnrecognizedMetadata { + t.Fatalf("expected ErrUnrecognizedMetadata, got: %v", errs[0]) + } + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + md, err := yamlfmt.ReadMetadata([]byte(tc.content), "test.yaml") + if !md.Equals(tc.expected) { + t.Fatalf("Mismatched metadata:\nexpected: %v\ngot: %v", tc.expected, md) + } + t.Logf("got error: %v", err) + tc.errCheck(t, err) + }) + } +} diff --git a/testdata/content_analyzer/has_ignore/x.yaml b/testdata/content_analyzer/has_ignore/x.yaml new file mode 100644 index 0000000..4a76bd3 --- /dev/null +++ b/testdata/content_analyzer/has_ignore/x.yaml @@ -0,0 +1,2 @@ +# !yamlfmt!:ignore +a: 1 diff --git a/testdata/content_analyzer/has_ignore/y.yaml b/testdata/content_analyzer/has_ignore/y.yaml new file mode 100644 index 0000000..a8926a5 --- /dev/null +++ b/testdata/content_analyzer/has_ignore/y.yaml @@ -0,0 +1 @@ +a: 1 diff --git a/testdata/content_analyzer/regex_ignore/x.yaml b/testdata/content_analyzer/regex_ignore/x.yaml new file mode 100644 index 0000000..c0ae45b --- /dev/null +++ b/testdata/content_analyzer/regex_ignore/x.yaml @@ -0,0 +1,2 @@ +# generated by: some tool +a: 1 diff --git a/testdata/content_analyzer/regex_ignore/y.yaml b/testdata/content_analyzer/regex_ignore/y.yaml new file mode 100644 index 0000000..a8926a5 --- /dev/null +++ b/testdata/content_analyzer/regex_ignore/y.yaml @@ -0,0 +1 @@ +a: 1