From 5af67dc4ec367e2ea92b9cd2824083fcc963af24 Mon Sep 17 00:00:00 2001
From: Travis Nielsen
Date: Fri, 5 Nov 2021 12:22:59 -0600
Subject: [PATCH] osd: increase wait timeout for osd prepare cleanup

When a reconcile is started for OSDs, the prepare jobs left over from a
previous reconcile are first deleted. The timeout for the osd prepare job
deletion was only 40s. After that timeout, the reconcile continued and
waited for the prepare pod, but it would never complete since the OSD
prepare job was not running in the first place, causing the reconcile to
wait indefinitely. In the reported issue, the osd prepare jobs were
actually deleted successfully; the timeout just wasn't long enough. Pods
need at least a minute to be forcefully deleted, so we increase the
timeout to 90s to give it some extra buffer.

Signed-off-by: Travis Nielsen
(cherry picked from commit 427996a7c0cf3948580f9132e476e2c387fb15a8)
---
 pkg/daemon/ceph/osd/remove.go           | 2 +-
 pkg/operator/ceph/cluster/osd/create.go | 7 +------
 pkg/operator/k8sutil/job.go             | 8 +++++---
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/pkg/daemon/ceph/osd/remove.go b/pkg/daemon/ceph/osd/remove.go
index a4f0326665f2..12bb49c335b4 100644
--- a/pkg/daemon/ceph/osd/remove.go
+++ b/pkg/daemon/ceph/osd/remove.go
@@ -106,7 +106,7 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
 		logger.Infof("removing the osd prepare job %q", prepareJob.GetName())
 		if err := k8sutil.DeleteBatchJob(clusterdContext.Clientset, clusterInfo.Namespace, prepareJob.GetName(), false); err != nil {
 			if err != nil {
-				// Continue deleting the OSD prepare job even if the deployment fails to be deleted
+				// Continue with the cleanup even if the job fails to be deleted
 				logger.Errorf("failed to delete prepare job for osd %q. %v", prepareJob.GetName(), err)
 			}
 		}
diff --git a/pkg/operator/ceph/cluster/osd/create.go b/pkg/operator/ceph/cluster/osd/create.go
index eaba45506fe0..3ec8ff7e413f 100644
--- a/pkg/operator/ceph/cluster/osd/create.go
+++ b/pkg/operator/ceph/cluster/osd/create.go
@@ -26,7 +26,6 @@ import (
 	opcontroller "github.com/rook/rook/pkg/operator/ceph/controller"
 	"github.com/rook/rook/pkg/operator/k8sutil"
 	v1 "k8s.io/api/core/v1"
-	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/version"
 )
@@ -372,11 +371,7 @@ func (c *Cluster) runPrepareJob(osdProps *osdProperties, config *provisionConfig
 	}
 
 	if err := k8sutil.RunReplaceableJob(c.context.Clientset, job, false); err != nil {
-		if !kerrors.IsAlreadyExists(err) {
-			return errors.Wrapf(err, "failed to run provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
-		}
-		logger.Infof("letting preexisting OSD provisioning job run to completion for %s %q", nodeOrPVC, nodeOrPVCName)
-		return nil
+		return errors.Wrapf(err, "failed to run osd provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
 	}
 
 	logger.Infof("started OSD provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
diff --git a/pkg/operator/k8sutil/job.go b/pkg/operator/k8sutil/job.go
index acfb27045ca0..ad0e50686227 100644
--- a/pkg/operator/k8sutil/job.go
+++ b/pkg/operator/k8sutil/job.go
@@ -51,7 +51,7 @@ func RunReplaceableJob(clientset kubernetes.Interface, job *batch.Job, deleteIfF
 		logger.Infof("Removing previous job %s to start a new one", job.Name)
 		err := DeleteBatchJob(clientset, job.Namespace, existingJob.Name, true)
 		if err != nil {
-			logger.Warningf("failed to remove job %s. %+v", job.Name, err)
+			return fmt.Errorf("failed to remove job %s. %+v", job.Name, err)
 		}
 	}
 
@@ -103,8 +103,10 @@ func DeleteBatchJob(clientset kubernetes.Interface, namespace, name string, wait
 		return nil
 	}
 
-	retries := 20
-	sleepInterval := 2 * time.Second
+	// Retry for the job to be deleted for 90s. A pod can easily take 60s to timeout before
+	// deletion so we add some buffer to that time.
+	retries := 30
+	sleepInterval := 3 * time.Second
 	for i := 0; i < retries; i++ {
 		_, err := clientset.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
 		if err != nil && errors.IsNotFound(err) {