Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add configurable backoff and retries for Zarf operations #2345

Merged
merged 1 commit into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/2-the-zarf-cli/100-cli-commands/zarf_dev_deploy.md
Expand Up @@ -14,13 +14,17 @@ zarf dev deploy [flags]
## Options

```
--adopt-existing-resources Adopts any pre-existing K8s resources into the Helm charts managed by Zarf. ONLY use when you have existing deployments you want Zarf to take over.
--components string Comma-separated list of components to deploy. Adding this flag will skip the prompts for selected components. Globbing component names with '*' and deselecting 'default' components with a leading '-' are also supported.
--create-set stringToString Specify package variables to set on the command line (KEY=value) (default [])
--deploy-set stringToString Specify deployment variables to set on the command line (KEY=value) (default [])
-f, --flavor string The flavor of components to include in the resulting package (i.e. have a matching or empty "only.flavor" key)
-h, --help help for deploy
--no-yolo Disable the YOLO mode default override and create / deploy the package as-defined
--registry-override stringToString Specify a map of domains to override on package create when pulling images (e.g. --registry-override docker.io=dockerio-reg.enterprise.intranet) (default [])
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
--skip-webhooks [alpha] Skip waiting for external webhooks to execute as each package component is deployed
--timeout duration Timeout for Helm operations such as installs and rollbacks (default 15m0s)
```

## Options inherited from parent commands
Expand Down
1 change: 1 addition & 0 deletions docs/2-the-zarf-cli/100-cli-commands/zarf_init.md
Expand Up @@ -70,6 +70,7 @@ $ zarf init --artifact-push-password={PASSWORD} --artifact-push-username={USERNA
--registry-push-username string Username to access the registry Zarf is configured to use (default "zarf-push")
--registry-secret string Registry secret value
--registry-url string External registry url address to use for this Zarf cluster
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
--set stringToString Specify deployment variables to set on the command line (KEY=value) (default [])
--skip-webhooks [alpha] Skip waiting for external webhooks to execute as each package component is deployed
--storage-class string Specify the storage class to use for the registry and git server. E.g. --storage-class=standard
Expand Down
Expand Up @@ -23,6 +23,7 @@ zarf package create [ DIRECTORY ] [flags]
-m, --max-package-size int Specify the maximum size of the package in megabytes, packages larger than this will be split into multiple parts to be loaded onto smaller media (i.e. DVDs). Use 0 to disable splitting.
-o, --output string Specify the output (either a directory or an oci:// URL) for the created Zarf package
--registry-override stringToString Specify a map of domains to override on package create when pulling images (e.g. --registry-override docker.io=dockerio-reg.enterprise.intranet) (default [])
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
-s, --sbom View SBOM contents after creating the package
--sbom-out string Specify an output directory for the SBOMs from the created Zarf package
--set stringToString Specify package variables to set on the command line (KEY=value) (default [])
Expand Down
Expand Up @@ -19,6 +19,7 @@ zarf package deploy [ PACKAGE_SOURCE ] [flags]
--components string Comma-separated list of components to deploy. Adding this flag will skip the prompts for selected components. Globbing component names with '*' and deselecting 'default' components with a leading '-' are also supported.
--confirm Confirms package deployment without prompting. ONLY use with packages you trust. Skips prompts to review SBOM, configure variables, select optional components and review potential breaking changes.
-h, --help help for deploy
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
--set stringToString Specify deployment variables to set on the command line (KEY=value) (default [])
--shasum string Shasum of the package to deploy. Required if deploying a remote package and "--insecure" is not provided
--skip-webhooks [alpha] Skip waiting for external webhooks to execute as each package component is deployed
Expand Down
Expand Up @@ -49,6 +49,7 @@ $ zarf package mirror-resources <your-package.tar.zst> \
--registry-push-password string Password for the push-user to connect to the registry
--registry-push-username string Username to access the registry Zarf is configured to use (default "zarf-push")
--registry-url string External registry url address to use for this Zarf cluster
--retries int Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs (default 3)
```

## Options inherited from parent commands
Expand Down
4 changes: 3 additions & 1 deletion src/cmd/common/viper.go
Expand Up @@ -84,6 +84,7 @@ const (
VPkgDeploySget = "package.deploy.sget"
VPkgDeploySkipWebhooks = "package.deploy.skip_webhooks"
VPkgDeployTimeout = "package.deploy.timeout"
VPkgRetries = "package.deploy.retries"

// Package publish config keys

Expand Down Expand Up @@ -184,7 +185,8 @@ func setDefaults() {

// Package defaults that are non-zero values
v.SetDefault(VPkgOCIConcurrency, 3)
v.SetDefault(VPkgRetries, config.ZarfDefaultRetries)

// Deploy opts that are non-zero values
v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultHelmTimeout)
v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultTimeout)
}
6 changes: 6 additions & 0 deletions src/cmd/dev.go
Expand Up @@ -297,6 +297,12 @@ func bindDevDeployFlags(v *viper.Viper) {

devDeployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "deploy-set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet)

// Always require adopt-existing-resources flag (no viper)
devDeployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)
devDeployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)
devDeployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

devDeployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
devDeployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents)

devDeployFlags.BoolVar(&pkgConfig.CreateOpts.NoYOLO, "no-yolo", v.GetBool(common.VDevDeployNoYolo), lang.CmdDevDeployFlagNoYolo)
Expand Down
3 changes: 1 addition & 2 deletions src/cmd/initialize.go
Expand Up @@ -217,11 +217,10 @@ func init() {
// Flags that control how a deployment proceeds
// Always require adopt-existing-resources flag (no viper)
initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)

initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)

initCmd.Flags().DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

initCmd.Flags().IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
initCmd.Flags().StringVarP(&pkgConfig.PkgOpts.PublicKeyPath, "key", "k", v.GetString(common.VPkgPublicKey), lang.CmdPackageFlagFlagPublicKey)

initCmd.Flags().SortFlags = true
Expand Down
6 changes: 4 additions & 2 deletions src/cmd/package.go
Expand Up @@ -365,6 +365,8 @@ func bindCreateFlags(v *viper.Viper) {
createFlags.StringVarP(&pkgConfig.CreateOpts.SigningKeyPath, "key", "k", v.GetString(common.VPkgCreateSigningKey), lang.CmdPackageCreateFlagDeprecatedKey)
createFlags.StringVar(&pkgConfig.CreateOpts.SigningKeyPassword, "key-pass", v.GetString(common.VPkgCreateSigningKeyPassword), lang.CmdPackageCreateFlagDeprecatedKeyPassword)

createFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)

createFlags.MarkHidden("output-directory")
createFlags.MarkHidden("key")
createFlags.MarkHidden("key-pass")
Expand All @@ -378,11 +380,10 @@ func bindDeployFlags(v *viper.Viper) {

// Always require adopt-existing-resources flag (no viper)
deployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)

deployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)

deployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

deployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
deployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet)
deployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents)
deployFlags.StringVar(&pkgConfig.PkgOpts.Shasum, "shasum", v.GetString(common.VPkgDeployShasum), lang.CmdPackageDeployFlagShasum)
Expand All @@ -404,6 +405,7 @@ func bindMirrorFlags(v *viper.Viper) {

mirrorFlags.BoolVar(&pkgConfig.MirrorOpts.NoImgChecksum, "no-img-checksum", false, lang.CmdPackageMirrorFlagNoChecksum)

mirrorFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
mirrorFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageMirrorFlagComponents)

// Flags for using an external Git server
Expand Down
7 changes: 5 additions & 2 deletions src/config/config.go
Expand Up @@ -87,8 +87,11 @@ var (
operationStartTime = time.Now().Unix()
dataInjectionMarker = ".zarf-injection-%d"

ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache")
ZarfDefaultHelmTimeout = 15 * time.Minute
ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache")

// Default timeout and retry values
ZarfDefaultTimeout = 15 * time.Minute
ZarfDefaultRetries = 3
)

// GetArch returns the arch based on a priority list with options for overriding.
Expand Down
1 change: 1 addition & 0 deletions src/config/lang/english.go
Expand Up @@ -232,6 +232,7 @@ $ zarf init --artifact-push-password={PASSWORD} --artifact-push-username={USERNA
CmdPackageShort = "Zarf package commands for creating, deploying, and inspecting packages"
CmdPackageFlagConcurrency = "Number of concurrent layer operations to perform when interacting with a remote package."
CmdPackageFlagFlagPublicKey = "Path to public key file for validating signed packages"
CmdPackageFlagRetries = "Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs"

CmdPackageCreateShort = "Creates a Zarf package from a given directory or the current directory"
CmdPackageCreateLong = "Builds an archive of resources and dependencies defined by the 'zarf.yaml' in the specified directory.\n" +
Expand Down
98 changes: 47 additions & 51 deletions src/internal/packager/helm/chart.go
Expand Up @@ -5,9 +5,12 @@
package helm

import (
"errors"
"fmt"
"time"

"github.com/defenseunicorns/zarf/src/pkg/utils/helpers"

"github.com/Masterminds/semver/v3"
"github.com/defenseunicorns/zarf/src/config"
"github.com/defenseunicorns/zarf/src/types"
Expand All @@ -24,9 +27,6 @@ import (
"helm.sh/helm/v3/pkg/storage/driver"
)

// Set the default number of Helm install/upgrade attempts to 3
const defaultHelmAttempts = 3

// InstallOrUpgradeChart performs a helm install of the given chart.
func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
fromMessage := h.chart.URL
Expand All @@ -39,8 +39,6 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
fromMessage)
defer spinner.Stop()

var output *release.Release

// If no release name is specified, use the chart name.
if h.chart.ReleaseName == "" {
h.chart.ReleaseName = h.chart.Name
Expand All @@ -63,49 +61,16 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
return nil, "", fmt.Errorf("unable to create helm renderer: %w", err)
}

attempt := 0
for {
attempt++
histClient := action.NewHistory(h.actionConfig)
tryHelm := func() error {
var err error
var output *release.Release

histClient := action.NewHistory(h.actionConfig)
releases, histErr := histClient.Run(h.chart.ReleaseName)

if attempt > 3 {
previouslyDeployedVersion := 0

// Check for previous releases that successfully deployed
for _, release := range releases {
if release.Info.Status == "deployed" {
previouslyDeployedVersion = release.Version
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", defaultHelmAttempts, err)
}

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", defaultHelmAttempts)
}

spinner.Updatef("Performing chart uninstall")
_, err = h.uninstallChart(h.chart.ReleaseName)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", defaultHelmAttempts, err)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", defaultHelmAttempts)
}

spinner.Updatef("Attempt %d of %d to install chart", attempt, defaultHelmAttempts)

spinner.Updatef("Checking for existing helm deployment")

if histErr == driver.ErrReleaseNotFound {
if errors.Is(histErr, driver.ErrReleaseNotFound) {
// No prior release, try to install it.
spinner.Updatef("Attempting chart installation")

Expand All @@ -119,19 +84,50 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
output, err = h.upgradeChart(lastRelease, postRender)
} else {
// 😭 things aren't working
return nil, "", fmt.Errorf("unable to verify the chart installation status: %w", histErr)
return fmt.Errorf("unable to verify the chart installation status: %w", histErr)
}

if err != nil {
message.Warnf("Unable to complete helm chart install/upgrade, waiting 10 seconds and trying again: %s", err.Error())
return fmt.Errorf("unable to complete the helm chart install/upgrade: %w", err)
}
Noxsios marked this conversation as resolved.
Show resolved Hide resolved

// Simply wait for dust to settle and try again.
time.Sleep(10 * time.Second)
} else {
message.Debug(output.Info.Description)
spinner.Success()
break
message.Debug(output.Info.Description)
spinner.Success()
return nil
}

err = helpers.Retry(tryHelm, h.retries, 5*time.Second, message.Warnf)
if err != nil {
// Try to rollback any deployed releases
releases, _ := histClient.Run(h.chart.ReleaseName)
previouslyDeployedVersion := 0

// Check for previous releases that successfully deployed
for _, release := range releases {
if release.Info.Status == "deployed" {
previouslyDeployedVersion = release.Version
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", h.retries, err)
}

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", h.retries)
}

spinner.Updatef("Performing chart uninstall")
_, err = h.uninstallChart(h.chart.ReleaseName)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", h.retries, err)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", h.retries)
}

// return any collected connect strings for zarf connect.
Expand Down
11 changes: 7 additions & 4 deletions src/internal/packager/helm/common.go
Expand Up @@ -33,6 +33,7 @@ type Helm struct {
component types.ZarfComponent
cluster *cluster.Cluster
timeout time.Duration
retries int

kubeVersion string

Expand All @@ -52,7 +53,7 @@ func New(chart types.ZarfChart, chartPath string, valuesPath string, mods ...Mod
chart: chart,
chartPath: chartPath,
valuesPath: valuesPath,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
}

for _, mod := range mods {
Expand All @@ -67,7 +68,8 @@ func NewClusterOnly(cfg *types.PackagerConfig, cluster *cluster.Cluster) *Helm {
return &Helm{
cfg: cfg,
cluster: cluster,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
retries: config.ZarfDefaultRetries,
}
}

Expand Down Expand Up @@ -118,7 +120,7 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName,
NoWait: manifest.NoWait,
},
chartOverride: tmpChart,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
}

for _, mod := range mods {
Expand All @@ -131,13 +133,14 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName,
}

// WithDeployInfo adds the necessary information to deploy a given chart
func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration) Modifier {
func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration, retries int) Modifier {
return func(h *Helm) {
h.component = component
h.cfg = cfg
h.cluster = cluster
h.valuesOverrides = valuesOverrides
h.timeout = timeout
h.retries = retries
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/pkg/packager/create_stages.go
Expand Up @@ -163,8 +163,8 @@ func (p *Packager) assemble() error {
return err
}

if err := helpers.Retry(doPull, 3, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to pull images after 3 attempts: %w", err)
if err := helpers.Retry(doPull, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to pull images after %d attempts: %w", p.cfg.PkgOpts.Retries, err)
}

for _, imgInfo := range pulled {
Expand Down