Skip to content

Commit dc27102

Browse files
authored Oct 8, 2024
feat: support using exponential backoff between self heal attempts (#20275)
Signed-off-by: Alexander Matyushentsev <AMatyushentsev@gmail.com>
1 parent 6002c7d commit dc27102

17 files changed

+1033
-723
lines changed
 

‎assets/swagger.json

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎cmd/argocd-application-controller/commands/argocd_application_controller.go

+17-1
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/redis/go-redis/v9"
1414
log "github.com/sirupsen/logrus"
1515
"github.com/spf13/cobra"
16+
"k8s.io/apimachinery/pkg/util/wait"
1617
"k8s.io/client-go/kubernetes"
1718
"k8s.io/client-go/tools/clientcmd"
1819

@@ -57,6 +58,9 @@ func NewCommand() *cobra.Command {
5758
repoServerAddress string
5859
repoServerTimeoutSeconds int
5960
selfHealTimeoutSeconds int
61+
selfHealBackoffTimeoutSeconds int
62+
selfHealBackoffFactor int
63+
selfHealBackoffCapSeconds int
6064
statusProcessors int
6165
operationProcessors int
6266
glogLevel int
@@ -156,6 +160,14 @@ func NewCommand() *cobra.Command {
156160
kubectl := kubeutil.NewKubectl()
157161
clusterSharding, err := sharding.GetClusterSharding(kubeClient, settingsMgr, shardingAlgorithm, enableDynamicClusterDistribution)
158162
errors.CheckError(err)
163+
var selfHealBackoff *wait.Backoff
164+
if selfHealBackoffTimeoutSeconds != 0 {
165+
selfHealBackoff = &wait.Backoff{
166+
Duration: time.Duration(selfHealBackoffTimeoutSeconds) * time.Second,
167+
Factor: float64(selfHealBackoffFactor),
168+
Cap: time.Duration(selfHealBackoffCapSeconds) * time.Second,
169+
}
170+
}
159171
appController, err = controller.NewApplicationController(
160172
namespace,
161173
settingsMgr,
@@ -168,6 +180,7 @@ func NewCommand() *cobra.Command {
168180
hardResyncDuration,
169181
time.Duration(appResyncJitter)*time.Second,
170182
time.Duration(selfHealTimeoutSeconds)*time.Second,
183+
selfHealBackoff,
171184
time.Duration(repoErrorGracePeriod)*time.Second,
172185
metricsPort,
173186
metricsCacheExpiration,
@@ -231,7 +244,10 @@ func NewCommand() *cobra.Command {
231244
command.Flags().IntVar(&glogLevel, "gloglevel", 0, "Set the glog logging level")
232245
command.Flags().IntVar(&metricsPort, "metrics-port", common.DefaultPortArgoCDMetrics, "Start metrics server on given port")
233246
command.Flags().DurationVar(&metricsCacheExpiration, "metrics-cache-expiration", env.ParseDurationFromEnv("ARGOCD_APPLICATION_CONTROLLER_METRICS_CACHE_EXPIRATION", 0*time.Second, 0, math.MaxInt64), "Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)")
234-
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 5, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
247+
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 0, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
248+
command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts")
249+
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
250+
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
235251
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
236252
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")
237253
command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server")

‎controller/appcontroller.go

+24-3
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ type ApplicationController struct {
130130
statusHardRefreshTimeout time.Duration
131131
statusRefreshJitter time.Duration
132132
selfHealTimeout time.Duration
133+
selfHealBackOff *wait.Backoff
133134
repoClientset apiclient.Clientset
134135
db db.ArgoDB
135136
settingsMgr *settings_util.SettingsManager
@@ -160,6 +161,7 @@ func NewApplicationController(
160161
appHardResyncPeriod time.Duration,
161162
appResyncJitter time.Duration,
162163
selfHealTimeout time.Duration,
164+
selfHealBackoff *wait.Backoff,
163165
repoErrorGracePeriod time.Duration,
164166
metricsPort int,
165167
metricsCacheExpiration time.Duration,
@@ -201,6 +203,7 @@ func NewApplicationController(
201203
auditLogger: argo.NewAuditLogger(namespace, kubeClientset, common.ApplicationController, enableK8sEvent),
202204
settingsMgr: settingsMgr,
203205
selfHealTimeout: selfHealTimeout,
206+
selfHealBackOff: selfHealBackoff,
204207
clusterSharding: clusterSharding,
205208
projByNameCache: sync.Map{},
206209
applicationNamespaces: applicationNamespaces,
@@ -1985,6 +1988,9 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
19851988
InitiatedBy: appv1.OperationInitiator{Automated: true},
19861989
Retry: appv1.RetryStrategy{Limit: 5},
19871990
}
1991+
if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
1992+
op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
1993+
}
19881994
if app.Spec.SyncPolicy.Retry != nil {
19891995
op.Retry = *app.Spec.SyncPolicy.Retry
19901996
}
@@ -2002,6 +2008,7 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
20022008
return nil, 0
20032009
} else if alreadyAttempted && selfHeal {
20042010
if shouldSelfHeal, retryAfter := ctrl.shouldSelfHeal(app); shouldSelfHeal {
2011+
op.Sync.SelfHealAttemptsCount++
20052012
for _, resource := range resources {
20062013
if resource.Status != appv1.SyncStatusCodeSynced {
20072014
op.Sync.Resources = append(op.Sync.Resources, appv1.SyncOperationResource{
@@ -2120,10 +2127,24 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application) (bool,
21202127
}
21212128

21222129
var retryAfter time.Duration
2123-
if app.Status.OperationState.FinishedAt == nil {
2124-
retryAfter = ctrl.selfHealTimeout
2130+
if ctrl.selfHealBackOff == nil {
2131+
if app.Status.OperationState.FinishedAt == nil {
2132+
retryAfter = ctrl.selfHealTimeout
2133+
} else {
2134+
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
2135+
}
21252136
} else {
2126-
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
2137+
backOff := *ctrl.selfHealBackOff
2138+
backOff.Steps = int(app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount)
2139+
var delay time.Duration
2140+
for backOff.Steps > 0 {
2141+
delay = backOff.Step()
2142+
}
2143+
if app.Status.OperationState.FinishedAt == nil {
2144+
retryAfter = delay
2145+
} else {
2146+
retryAfter = delay - time.Since(app.Status.OperationState.FinishedAt.Time)
2147+
}
21272148
}
21282149
return retryAfter <= 0, retryAfter
21292150
}

‎controller/appcontroller_test.go

+68-2
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,18 @@ import (
44
"context"
55
"encoding/json"
66
"errors"
7+
"fmt"
78
"testing"
89
"time"
910

11+
clustercache "github.com/argoproj/gitops-engine/pkg/cache"
1012
"github.com/argoproj/gitops-engine/pkg/utils/kube/kubetest"
1113
"github.com/sirupsen/logrus"
1214
"github.com/stretchr/testify/require"
1315
"k8s.io/apimachinery/pkg/api/resource"
16+
"k8s.io/apimachinery/pkg/util/wait"
1417
"k8s.io/client-go/rest"
15-
16-
clustercache "github.com/argoproj/gitops-engine/pkg/cache"
18+
"k8s.io/utils/ptr"
1719

1820
"github.com/argoproj/argo-cd/v2/common"
1921
statecache "github.com/argoproj/argo-cd/v2/controller/cache"
@@ -157,6 +159,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
157159
time.Hour,
158160
time.Second,
159161
time.Minute,
162+
nil,
160163
time.Second*10,
161164
common.DefaultPortArgoCDMetrics,
162165
data.metricsCacheExpiration,
@@ -2191,3 +2194,66 @@ func TestAlreadyAttemptSync(t *testing.T) {
21912194
assert.False(t, attempted)
21922195
})
21932196
}
2197+
2198+
func assertDurationAround(t *testing.T, expected time.Duration, actual time.Duration) {
2199+
delta := time.Second / 2
2200+
assert.GreaterOrEqual(t, expected, actual-delta)
2201+
assert.LessOrEqual(t, expected, actual+delta)
2202+
}
2203+
2204+
func TestSelfHealExponentialBackoff(t *testing.T) {
2205+
ctrl := newFakeController(&fakeData{}, nil)
2206+
ctrl.selfHealBackOff = &wait.Backoff{
2207+
Factor: 3,
2208+
Duration: 2 * time.Second,
2209+
Cap: 5 * time.Minute,
2210+
}
2211+
2212+
app := &v1alpha1.Application{
2213+
Status: v1alpha1.ApplicationStatus{
2214+
OperationState: &v1alpha1.OperationState{
2215+
Operation: v1alpha1.Operation{
2216+
Sync: &v1alpha1.SyncOperation{},
2217+
},
2218+
},
2219+
},
2220+
}
2221+
2222+
testCases := []struct {
2223+
attempts int64
2224+
finishedAt *metav1.Time
2225+
expectedDuration time.Duration
2226+
shouldSelfHeal bool
2227+
}{{
2228+
attempts: 0,
2229+
finishedAt: ptr.To(metav1.Now()),
2230+
expectedDuration: 0,
2231+
shouldSelfHeal: true,
2232+
}, {
2233+
attempts: 1,
2234+
finishedAt: ptr.To(metav1.Now()),
2235+
expectedDuration: 2 * time.Second,
2236+
shouldSelfHeal: false,
2237+
}, {
2238+
attempts: 2,
2239+
finishedAt: ptr.To(metav1.Now()),
2240+
expectedDuration: 6 * time.Second,
2241+
shouldSelfHeal: false,
2242+
}, {
2243+
attempts: 3,
2244+
finishedAt: nil,
2245+
expectedDuration: 18 * time.Second,
2246+
shouldSelfHeal: false,
2247+
}}
2248+
2249+
for i := range testCases {
2250+
tc := testCases[i]
2251+
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
2252+
app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = tc.attempts
2253+
app.Status.OperationState.FinishedAt = tc.finishedAt
2254+
ok, duration := ctrl.shouldSelfHeal(app)
2255+
require.Equal(t, ok, tc.shouldSelfHeal)
2256+
assertDurationAround(t, tc.expectedDuration, duration)
2257+
})
2258+
}
2259+
}

‎docs/operator-manual/argocd-cmd-params-cm.yaml

+5-2
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,11 @@ data:
4747
controller.log.level: "info"
4848
# Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)
4949
controller.metrics.cache.expiration: "24h0m0s"
50-
# Specifies timeout between application self heal attempts (default 5)
51-
controller.self.heal.timeout.seconds: "5"
50+
# Specifies exponential backoff timeout parameters between application self heal attempts
51+
controller.self.heal.timeout.seconds: "2"
52+
controller.self.heal.backoff.factor: "3"
53+
controller.self.heal.backoff.cap.seconds: "300"
54+
5255
# Cache expiration for app state (default 1h0m0s)
5356
controller.app.state.cache.expiration: "1h0m0s"
5457
# Specifies if resource health should be persisted in app CRD (default true)

‎docs/operator-manual/server-commands/argocd-application-controller.md

+4-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎manifests/base/application-controller-deployment/argocd-application-controller-deployment.yaml

+18
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,24 @@ spec:
9797
name: argocd-cmd-params-cm
9898
key: controller.self.heal.timeout.seconds
9999
optional: true
100+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
101+
valueFrom:
102+
configMapKeyRef:
103+
name: argocd-cmd-params-cm
104+
key: controller.self.heal.backoff.timeout.seconds
105+
optional: true
106+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
107+
valueFrom:
108+
configMapKeyRef:
109+
name: argocd-cmd-params-cm
110+
key: controller.self.heal.backoff.factor
111+
optional: true
112+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
113+
valueFrom:
114+
configMapKeyRef:
115+
name: argocd-cmd-params-cm
116+
key: controller.self.heal.backoff.cap.seconds
117+
optional: true
100118
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
101119
valueFrom:
102120
configMapKeyRef:

‎manifests/base/application-controller/argocd-application-controller-statefulset.yaml

+18
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,24 @@ spec:
100100
name: argocd-cmd-params-cm
101101
key: controller.self.heal.timeout.seconds
102102
optional: true
103+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
104+
valueFrom:
105+
configMapKeyRef:
106+
name: argocd-cmd-params-cm
107+
key: controller.self.heal.backoff.timeout.seconds
108+
optional: true
109+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
110+
valueFrom:
111+
configMapKeyRef:
112+
name: argocd-cmd-params-cm
113+
key: controller.self.heal.backoff.factor
114+
optional: true
115+
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
116+
valueFrom:
117+
configMapKeyRef:
118+
name: argocd-cmd-params-cm
119+
key: controller.self.heal.backoff.cap.seconds
120+
optional: true
103121
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
104122
valueFrom:
105123
configMapKeyRef:

‎manifests/core-install.yaml

+28
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎manifests/crds/application-crd.yaml

+10
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)
Please sign in to comment.