Skip to content

Commit

Permalink
feat: add configurable backoff and retries for Zarf operations
Browse files Browse the repository at this point in the history
Signed-off-by: Eddie Zaneski <eddiezane@gmail.com>
Co-authored-by: Wayne Starr <racer159@live.com>
  • Loading branch information
eddiezane and Racer159 committed Mar 4, 2024
1 parent b807e15 commit 094a0d8
Show file tree
Hide file tree
Showing 18 changed files with 132 additions and 72 deletions.
4 changes: 4 additions & 0 deletions docs/2-the-zarf-cli/100-cli-commands/zarf_dev_deploy.md
Expand Up @@ -14,13 +14,17 @@ zarf dev deploy [flags]
## Options

```
--adopt-existing-resources Adopts any pre-existing K8s resources into the Helm charts managed by Zarf. ONLY use when you have existing deployments you want Zarf to take over.
--components string Comma-separated list of components to deploy. Adding this flag will skip the prompts for selected components. Globbing component names with '*' and deselecting 'default' components with a leading '-' are also supported.
--create-set stringToString Specify package variables to set on the command line (KEY=value) (default [])
--deploy-set stringToString Specify deployment variables to set on the command line (KEY=value) (default [])
-f, --flavor string The flavor of components to include in the resulting package (i.e. have a matching or empty "only.flavor" key)
-h, --help help for deploy
--no-yolo Disable the YOLO mode default override and create / deploy the package as-defined
--registry-override stringToString Specify a map of domains to override on package create when pulling images (e.g. --registry-override docker.io=dockerio-reg.enterprise.intranet) (default [])
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
--skip-webhooks [alpha] Skip waiting for external webhooks to execute as each package component is deployed
--timeout duration Timeout for Helm operations such as installs and rollbacks (default 15m0s)
```

## Options inherited from parent commands
Expand Down
1 change: 1 addition & 0 deletions docs/2-the-zarf-cli/100-cli-commands/zarf_init.md
Expand Up @@ -70,6 +70,7 @@ $ zarf init --artifact-push-password={PASSWORD} --artifact-push-username={USERNA
--registry-push-username string Username to access the registry Zarf is configured to use (default "zarf-push")
--registry-secret string Registry secret value
--registry-url string External registry url address to use for this Zarf cluster
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
--set stringToString Specify deployment variables to set on the command line (KEY=value) (default [])
--skip-webhooks [alpha] Skip waiting for external webhooks to execute as each package component is deployed
--storage-class string Specify the storage class to use for the registry and git server. E.g. --storage-class=standard
Expand Down
Expand Up @@ -23,6 +23,7 @@ zarf package create [ DIRECTORY ] [flags]
-m, --max-package-size int Specify the maximum size of the package in megabytes; packages larger than this will be split into multiple parts to be loaded onto smaller media (i.e. DVDs). Use 0 to disable splitting.
-o, --output string Specify the output (either a directory or an oci:// URL) for the created Zarf package
--registry-override stringToString Specify a map of domains to override on package create when pulling images (e.g. --registry-override docker.io=dockerio-reg.enterprise.intranet) (default [])
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
-s, --sbom View SBOM contents after creating the package
--sbom-out string Specify an output directory for the SBOMs from the created Zarf package
--set stringToString Specify package variables to set on the command line (KEY=value) (default [])
Expand Down
Expand Up @@ -19,6 +19,7 @@ zarf package deploy [ PACKAGE_SOURCE ] [flags]
--components string Comma-separated list of components to deploy. Adding this flag will skip the prompts for selected components. Globbing component names with '*' and deselecting 'default' components with a leading '-' are also supported.
--confirm Confirms package deployment without prompting. ONLY use with packages you trust. Skips prompts to review SBOM, configure variables, select optional components and review potential breaking changes.
-h, --help help for deploy
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
--set stringToString Specify deployment variables to set on the command line (KEY=value) (default [])
--shasum string Shasum of the package to deploy. Required if deploying a remote package and "--insecure" is not provided
--skip-webhooks [alpha] Skip waiting for external webhooks to execute as each package component is deployed
Expand Down
Expand Up @@ -49,6 +49,7 @@ $ zarf package mirror-resources <your-package.tar.zst> \
--registry-push-password string Password for the push-user to connect to the registry
--registry-push-username string Username to access the registry Zarf is configured to use (default "zarf-push")
--registry-url string External registry url address to use for this Zarf cluster
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
```

## Options inherited from parent commands
Expand Down
4 changes: 3 additions & 1 deletion src/cmd/common/viper.go
Expand Up @@ -84,6 +84,7 @@ const (
VPkgDeploySget = "package.deploy.sget"
VPkgDeploySkipWebhooks = "package.deploy.skip_webhooks"
VPkgDeployTimeout = "package.deploy.timeout"
VPkgRetries = "package.deploy.retries"

// Package publish config keys

Expand Down Expand Up @@ -184,7 +185,8 @@ func setDefaults() {

// Package defaults that are non-zero values
v.SetDefault(VPkgOCIConcurrency, 3)
v.SetDefault(VPkgRetries, config.ZarfDefaultRetries)

// Deploy opts that are non-zero values
v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultHelmTimeout)
v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultTimeout)
}
6 changes: 6 additions & 0 deletions src/cmd/dev.go
Expand Up @@ -282,6 +282,12 @@ func bindDevDeployFlags(v *viper.Viper) {

devDeployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "deploy-set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet)

// Always require adopt-existing-resources flag (no viper)
devDeployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)
devDeployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)
devDeployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

devDeployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
devDeployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents)

devDeployFlags.BoolVar(&pkgConfig.CreateOpts.NoYOLO, "no-yolo", v.GetBool(common.VDevDeployNoYolo), lang.CmdDevDeployFlagNoYolo)
Expand Down
3 changes: 1 addition & 2 deletions src/cmd/initialize.go
Expand Up @@ -216,11 +216,10 @@ func init() {
// Flags that control how a deployment proceeds
// Always require adopt-existing-resources flag (no viper)
initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)

initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)

initCmd.Flags().DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

initCmd.Flags().IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
initCmd.Flags().StringVarP(&pkgConfig.PkgOpts.PublicKeyPath, "key", "k", v.GetString(common.VPkgPublicKey), lang.CmdPackageFlagFlagPublicKey)

initCmd.Flags().SortFlags = true
Expand Down
6 changes: 4 additions & 2 deletions src/cmd/package.go
Expand Up @@ -364,6 +364,8 @@ func bindCreateFlags(v *viper.Viper) {
createFlags.StringVarP(&pkgConfig.CreateOpts.SigningKeyPath, "key", "k", v.GetString(common.VPkgCreateSigningKey), lang.CmdPackageCreateFlagDeprecatedKey)
createFlags.StringVar(&pkgConfig.CreateOpts.SigningKeyPassword, "key-pass", v.GetString(common.VPkgCreateSigningKeyPassword), lang.CmdPackageCreateFlagDeprecatedKeyPassword)

createFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)

createFlags.MarkHidden("output-directory")
createFlags.MarkHidden("key")
createFlags.MarkHidden("key-pass")
Expand All @@ -377,11 +379,10 @@ func bindDeployFlags(v *viper.Viper) {

// Always require adopt-existing-resources flag (no viper)
deployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)

deployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)

deployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

deployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
deployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet)
deployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents)
deployFlags.StringVar(&pkgConfig.PkgOpts.Shasum, "shasum", v.GetString(common.VPkgDeployShasum), lang.CmdPackageDeployFlagShasum)
Expand All @@ -403,6 +404,7 @@ func bindMirrorFlags(v *viper.Viper) {

mirrorFlags.BoolVar(&pkgConfig.MirrorOpts.NoImgChecksum, "no-img-checksum", false, lang.CmdPackageMirrorFlagNoChecksum)

mirrorFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
mirrorFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageMirrorFlagComponents)

// Flags for using an external Git server
Expand Down
7 changes: 5 additions & 2 deletions src/config/config.go
Expand Up @@ -99,8 +99,11 @@ var (
operationStartTime = time.Now().Unix()
dataInjectionMarker = ".zarf-injection-%d"

ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache")
ZarfDefaultHelmTimeout = 15 * time.Minute
ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache")

// Default timeout and retry values
ZarfDefaultTimeout = 15 * time.Minute
ZarfDefaultRetries = 3
)

// GetArch returns the arch based on a priority list with options for overriding.
Expand Down
1 change: 1 addition & 0 deletions src/config/lang/english.go
Expand Up @@ -232,6 +232,7 @@ $ zarf init --artifact-push-password={PASSWORD} --artifact-push-username={USERNA
CmdPackageShort = "Zarf package commands for creating, deploying, and inspecting packages"
CmdPackageFlagConcurrency = "Number of concurrent layer operations to perform when interacting with a remote package."
CmdPackageFlagFlagPublicKey = "Path to public key file for validating signed packages"
CmdPackageFlagRetries = "Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs"

CmdPackageCreateShort = "Creates a Zarf package from a given directory or the current directory"
CmdPackageCreateLong = "Builds an archive of resources and dependencies defined by the 'zarf.yaml' in the specified directory.\n" +
Expand Down
96 changes: 46 additions & 50 deletions src/internal/packager/helm/chart.go
Expand Up @@ -5,9 +5,12 @@
package helm

import (
"errors"
"fmt"
"time"

"github.com/defenseunicorns/zarf/src/pkg/utils/helpers"

"github.com/Masterminds/semver/v3"
"github.com/defenseunicorns/zarf/src/config"
"github.com/defenseunicorns/zarf/src/types"
Expand All @@ -24,9 +27,6 @@ import (
"helm.sh/helm/v3/pkg/storage/driver"
)

// Set the default number of Helm install/upgrade attempts to 3
const defaultHelmAttempts = 3

// InstallOrUpgradeChart performs a helm install of the given chart.
func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
fromMessage := h.chart.URL
Expand Down Expand Up @@ -63,49 +63,14 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
return nil, "", fmt.Errorf("unable to create helm renderer: %w", err)
}

attempt := 0
for {
attempt++

histClient := action.NewHistory(h.actionConfig)
histClient := action.NewHistory(h.actionConfig)
tryHelm := func() error {
var err error
releases, histErr := histClient.Run(h.chart.ReleaseName)

if attempt > 3 {
previouslyDeployedVersion := 0

// Check for previous releases that successfully deployed
for _, release := range releases {
if release.Info.Status == "deployed" {
previouslyDeployedVersion = release.Version
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", defaultHelmAttempts, err)
}

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", defaultHelmAttempts)
}

spinner.Updatef("Performing chart uninstall")
_, err = h.uninstallChart(h.chart.ReleaseName)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", defaultHelmAttempts, err)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", defaultHelmAttempts)
}

spinner.Updatef("Attempt %d of %d to install chart", attempt, defaultHelmAttempts)

spinner.Updatef("Checking for existing helm deployment")

if histErr == driver.ErrReleaseNotFound {
if errors.Is(histErr, driver.ErrReleaseNotFound) {
// No prior release, try to install it.
spinner.Updatef("Attempting chart installation")

Expand All @@ -119,19 +84,50 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
output, err = h.upgradeChart(lastRelease, postRender)
} else {
// 😭 things aren't working
return nil, "", fmt.Errorf("unable to verify the chart installation status: %w", histErr)
return fmt.Errorf("unable to verify the chart installation status: %w", histErr)
}

if err != nil {
message.Warnf("Unable to complete helm chart install/upgrade, waiting 10 seconds and trying again: %s", err.Error())
return fmt.Errorf("unable to complete the helm chart install/upgrade: %w", err)
}

// Simply wait for dust to settle and try again.
time.Sleep(10 * time.Second)
} else {
message.Debug(output.Info.Description)
spinner.Success()
break
message.Debug(output.Info.Description)
spinner.Success()
return nil
}

err = helpers.Retry(tryHelm, h.retries, 5*time.Second, message.Warnf)
if err != nil {
// Try to rollback any deployed releases
releases, _ := histClient.Run(h.chart.ReleaseName)
previouslyDeployedVersion := 0

// Check for previous releases that successfully deployed
for _, release := range releases {
if release.Info.Status == "deployed" {
previouslyDeployedVersion = release.Version
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", h.retries, err)
}

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", h.retries)
}

spinner.Updatef("Performing chart uninstall")
_, err = h.uninstallChart(h.chart.ReleaseName)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", h.retries, err)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", h.retries)
}

// return any collected connect strings for zarf connect.
Expand Down
11 changes: 7 additions & 4 deletions src/internal/packager/helm/common.go
Expand Up @@ -33,6 +33,7 @@ type Helm struct {
component types.ZarfComponent
cluster *cluster.Cluster
timeout time.Duration
retries int

kubeVersion string

Expand All @@ -52,7 +53,7 @@ func New(chart types.ZarfChart, chartPath string, valuesPath string, mods ...Mod
chart: chart,
chartPath: chartPath,
valuesPath: valuesPath,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
}

for _, mod := range mods {
Expand All @@ -67,7 +68,8 @@ func NewClusterOnly(cfg *types.PackagerConfig, cluster *cluster.Cluster) *Helm {
return &Helm{
cfg: cfg,
cluster: cluster,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
retries: config.ZarfDefaultRetries,
}
}

Expand Down Expand Up @@ -118,7 +120,7 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName,
NoWait: manifest.NoWait,
},
chartOverride: tmpChart,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
}

for _, mod := range mods {
Expand All @@ -131,13 +133,14 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName,
}

// WithDeployInfo adds the necessary information to deploy a given chart
func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration) Modifier {
func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration, retries int) Modifier {
return func(h *Helm) {
h.component = component
h.cfg = cfg
h.cluster = cluster
h.valuesOverrides = valuesOverrides
h.timeout = timeout
h.retries = retries
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/pkg/packager/create_stages.go
Expand Up @@ -163,8 +163,8 @@ func (p *Packager) assemble() error {
return err
}

if err := helpers.Retry(doPull, 3, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to pull images after 3 attempts: %w", err)
if err := helpers.Retry(doPull, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to pull images after %d attempts: %w", p.cfg.PkgOpts.Retries, err)
}

for _, imgInfo := range pulled {
Expand Down
12 changes: 7 additions & 5 deletions src/pkg/packager/deploy.go
Expand Up @@ -470,7 +470,7 @@ func (p *Packager) pushImagesToRegistry(componentImages []string, noImgChecksum

return helpers.Retry(func() error {
return imgConfig.PushToZarfRegistry()
}, 3, 5*time.Second, message.Warnf)
}, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf)
}

// Push all of the components git repos to the configured git server.
Expand Down Expand Up @@ -511,8 +511,8 @@ func (p *Packager) pushReposToRepository(reposPath string, repos []string) error
return gitClient.PushRepo(repoURL, reposPath)
}

// Try repo push up to 3 times
if err := helpers.Retry(tryPush, 3, 5*time.Second, message.Warnf); err != nil {
// Try repo push up to retry limit
if err := helpers.Retry(tryPush, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to push repo %s to the Git Server: %w", repoURL, err)
}
}
Expand Down Expand Up @@ -549,7 +549,8 @@ func (p *Packager) installChartAndManifests(componentPaths *layout.ComponentPath
p.cfg,
p.cluster,
valuesOverrides,
p.cfg.DeployOpts.Timeout),
p.cfg.DeployOpts.Timeout,
p.cfg.PkgOpts.Retries),
)

addedConnectStrings, installedChartName, err := helmCfg.InstallOrUpgradeChart()
Expand Down Expand Up @@ -596,7 +597,8 @@ func (p *Packager) installChartAndManifests(componentPaths *layout.ComponentPath
p.cfg,
p.cluster,
nil,
p.cfg.DeployOpts.Timeout),
p.cfg.DeployOpts.Timeout,
p.cfg.PkgOpts.Retries),
)
if err != nil {
return installedCharts, err
Expand Down

0 comments on commit 094a0d8

Please sign in to comment.