New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Prefactor package hashing #4816
Changes from 1 commit
d835690
c760457
4720f40
4b1fd4f
be50d8e
2b7e154
7bb65aa
57cd58e
8258d86
a55c004
a5f3b1b
5f3bcbe
269e49b
2df5983
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,12 +31,12 @@ type Tracker struct { | |
globalHash string | ||
pipeline fs.Pipeline | ||
|
||
packageInputsHashes packageFileHashes | ||
packageInputsHashes map[string]string | ||
|
||
// packageInputsExpandedHashes is a map of a hashkey to a list of files that are inputs to the task. | ||
// Writes to this map happen during CalculateFileHash(). Since this happens synchronously | ||
// before walking the task graph, it does not need to be protected by a mutex. | ||
packageInputsExpandedHashes map[packageFileHashKey]map[turbopath.AnchoredUnixPath]string | ||
packageInputsExpandedHashes map[string]map[turbopath.AnchoredUnixPath]string | ||
|
||
// mu is a mutex that we can lock/unlock to read/write from maps | ||
// the fields below should be protected by the mutex. | ||
|
@@ -62,44 +62,13 @@ func NewTracker(rootNode string, globalHash string, pipeline fs.Pipeline) *Track | |
} | ||
} | ||
|
||
// packageFileSpec defines a combination of a package and optional set of input globs | ||
type packageFileSpec struct { | ||
pkg string | ||
inputs []string | ||
// packageFileHashInputs defines a combination of a package and optional set of input globs | ||
type packageFileHashInputs struct { | ||
taskID string | ||
taskDefinition *fs.TaskDefinition | ||
packageName string | ||
} | ||
|
||
func specFromPackageTask(packageTask *nodes.PackageTask) packageFileSpec { | ||
return packageFileSpec{ | ||
pkg: packageTask.PackageName, | ||
inputs: packageTask.TaskDefinition.Inputs, | ||
} | ||
} | ||
|
||
// packageFileHashKey is a hashable representation of a packageFileSpec. | ||
type packageFileHashKey string | ||
|
||
// hashes the inputs for a packageTask | ||
func (pfs packageFileSpec) ToKey() packageFileHashKey { | ||
sort.Strings(pfs.inputs) | ||
return packageFileHashKey(fmt.Sprintf("%v#%v", pfs.pkg, strings.Join(pfs.inputs, "!"))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This probably can't go wrong, but we already have a unique identifier, the taskID. This was to enable multiple tasks in the same package with the same specified inputs to not have to rerun file hashing. It's an optimization, but not actually the ideal one. Now that everything calls into I do not add this feature in this PR. |
||
} | ||
|
||
func (pfs *packageFileSpec) getHashObject(pkg *fs.PackageJSON, repoRoot turbopath.AbsoluteSystemPath) (map[turbopath.AnchoredUnixPath]string, error) { | ||
return hashing.GetPackageFileHashes(repoRoot, pkg.Dir, pfs.inputs) | ||
} | ||
|
||
func (pfs *packageFileSpec) hash(hashObject map[turbopath.AnchoredUnixPath]string) (string, error) { | ||
hashOfFiles, otherErr := fs.HashObject(hashObject) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
From
|
||
if otherErr != nil { | ||
return "", otherErr | ||
} | ||
return hashOfFiles, nil | ||
} | ||
|
||
// packageFileHashes is a map from a package and optional input globs to the hash of | ||
// the matched files in the package. | ||
type packageFileHashes map[packageFileHashKey]string | ||
|
||
// CalculateFileHashes hashes each unique package-inputs combination that is present | ||
// in the task graph. Must be called before calculating task hashes. | ||
func (th *Tracker) CalculateFileHashes( | ||
|
@@ -119,8 +88,9 @@ func (th *Tracker) CalculateFileHashes( | |
if taskID == th.rootNode { | ||
continue | ||
} | ||
pkgName, _ := util.GetPackageTaskFromId(taskID) | ||
if pkgName == th.rootNode { | ||
|
||
packageName, _ := util.GetPackageTaskFromId(taskID) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're not character constrained, so clear names that match structs are good. |
||
if packageName == th.rootNode { | ||
continue | ||
} | ||
|
||
|
@@ -129,45 +99,51 @@ func (th *Tracker) CalculateFileHashes( | |
return fmt.Errorf("missing pipeline entry %v", taskID) | ||
} | ||
|
||
pfs := &packageFileSpec{ | ||
pkg: pkgName, | ||
inputs: taskDefinition.Inputs, | ||
pfs := &packageFileHashInputs{ | ||
taskID, | ||
taskDefinition, | ||
packageName, | ||
} | ||
|
||
hashTasks.Add(pfs) | ||
} | ||
|
||
hashes := make(map[packageFileHashKey]string, len(hashTasks)) | ||
hashObjects := make(map[packageFileHashKey]map[turbopath.AnchoredUnixPath]string, len(hashTasks)) | ||
hashQueue := make(chan *packageFileSpec, workerCount) | ||
hashes := make(map[string]string, len(hashTasks)) | ||
hashObjects := make(map[string]map[turbopath.AnchoredUnixPath]string, len(hashTasks)) | ||
hashQueue := make(chan *packageFileHashInputs, workerCount) | ||
hashErrs := &errgroup.Group{} | ||
|
||
for i := 0; i < workerCount; i++ { | ||
hashErrs.Go(func() error { | ||
for packageFileSpec := range hashQueue { | ||
pkg, ok := workspaceInfos.PackageJSONs[packageFileSpec.pkg] | ||
for packageFileHashInputs := range hashQueue { | ||
pkg, ok := workspaceInfos.PackageJSONs[packageFileHashInputs.packageName] | ||
if !ok { | ||
return fmt.Errorf("cannot find package %v", packageFileSpec.pkg) | ||
return fmt.Errorf("cannot find package %v", packageFileHashInputs.packageName) | ||
} | ||
hashObject, err := packageFileSpec.getHashObject(pkg, repoRoot) | ||
|
||
// Get the hashes of each file, keyed by the path. | ||
hashObject, err := hashing.GetPackageFileHashes(repoRoot, pkg.Dir, packageFileHashInputs.taskDefinition.Inputs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Intentionally keeps the same signature that everywhere else does and doesn't redirect it so that we don't accidentally end up in the "N ways to do things" situation again. |
||
if err != nil { | ||
return err | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And then cause it to break |
||
} | ||
hash, err := packageFileSpec.hash(hashObject) | ||
|
||
// Get the combined hash of all the files. | ||
hash, err := fs.HashObject(hashObject) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No indirection where behavior can be modified spookily at a distance. Optimizing for clarity in hash construction. |
||
if err != nil { | ||
return err | ||
} | ||
|
||
// Save off the hash information, keyed by package task. | ||
th.mu.Lock() | ||
pfsKey := packageFileSpec.ToKey() | ||
hashes[pfsKey] = hash | ||
hashObjects[pfsKey] = hashObject | ||
hashes[packageFileHashInputs.taskID] = hash | ||
hashObjects[packageFileHashInputs.taskID] = hashObject | ||
Comment on lines
+138
to
+139
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yesss. this tripped me up a lot when i was working on this code, thank you There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is going to result in rehashing the same package multiple times if there is more than one task in that package. For instance, if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, I doubled checked what I wrote and it looks like there's a bug where it's not doing what we expect. This change would result in rehashing the same package multiple times if we were already properly deduping sets of files to hash, but we are not. The reason we are not currently properly deduping is because we are adding the addresses of The original author (me) probably should've noticed this given the effort to produce a key for a map later on, and knowing that the set is implemented via a map under the hood. I'm going to file an issue to fix this post-port, since it will be much easier to do the struct value equality in Rust. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The actual deduping I want to do is to push all files into a set, do a set intersection, and only hash files that don't already have a hash. There's no particular reason to tie it to a package or task. |
||
th.mu.Unlock() | ||
} | ||
return nil | ||
}) | ||
} | ||
for ht := range hashTasks { | ||
hashQueue <- ht.(*packageFileSpec) | ||
hashQueue <- ht.(*packageFileHashInputs) | ||
} | ||
close(hashQueue) | ||
err := hashErrs.Wait() | ||
|
@@ -272,12 +248,9 @@ func (th *Tracker) calculateDependencyHashes(dependencySet dag.Set) ([]string, e | |
// that it has previously been called on its task-graph dependencies. File hashes must be calculated | ||
// first. | ||
func (th *Tracker) CalculateTaskHash(packageTask *nodes.PackageTask, dependencySet dag.Set, logger hclog.Logger, args []string, useOldTaskHashable bool) (string, error) { | ||
pfs := specFromPackageTask(packageTask) | ||
pkgFileHashKey := pfs.ToKey() | ||
|
||
hashOfFiles, ok := th.packageInputsHashes[pkgFileHashKey] | ||
hashOfFiles, ok := th.packageInputsHashes[packageTask.TaskID] | ||
if !ok { | ||
return "", fmt.Errorf("cannot find package-file hash for %v", pkgFileHashKey) | ||
return "", fmt.Errorf("cannot find package-file hash for %v", packageTask.TaskID) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This error message is now infinitely better. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm surprised you didn't say "rounds-to-infinity" in true nathan style 😉 😛 |
||
} | ||
|
||
var keyMatchers []string | ||
|
@@ -333,8 +306,7 @@ func (th *Tracker) CalculateTaskHash(packageTask *nodes.PackageTask, dependencyS | |
|
||
// GetExpandedInputs gets the expanded set of inputs for a given PackageTask | ||
func (th *Tracker) GetExpandedInputs(packageTask *nodes.PackageTask) map[turbopath.AnchoredUnixPath]string { | ||
pfs := specFromPackageTask(packageTask) | ||
expandedInputs := th.packageInputsExpandedHashes[pfs.ToKey()] | ||
expandedInputs := th.packageInputsExpandedHashes[packageTask.TaskID] | ||
inputsCopy := make(map[turbopath.AnchoredUnixPath]string, len(expandedInputs)) | ||
|
||
for path, hash := range expandedInputs { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was a "what do we need to calculate the file hash?" object but it was too clever by half.
It is still that, but it stops trying to be clever.