osd: increase wait timeout for osd prepare cleanup
When a reconcile is started for OSDs, the prepare jobs from the previous
reconcile are deleted first. The timeout for the osd prepare job deletion
was only 40s. After that timeout, the reconcile continues by waiting for
the prepare pod, but that wait can never complete since the OSD prepare
pod is not running in the first place, causing the reconcile to wait
indefinitely. In the reported issue, the osd prepare jobs were actually
deleted successfully; the timeout just wasn't long enough. Pods can need
at least a minute to be forcefully deleted, so we increase the timeout to
90s to give some extra buffer.

Signed-off-by: Travis Nielsen <tnielsen@redhat.com>
travisn committed Nov 5, 2021
1 parent 3b85de4 commit 427996a
Showing 3 changed files with 7 additions and 10 deletions.
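
The 90s figure follows directly from the retry parameters changed in pkg/operator/k8sutil/job.go below: 30 retries at a 3s interval give 90s, up from the previous 20 retries at 2s (40s). For orientation, here is a minimal, self-contained sketch of that polling pattern; it assumes a client-go clientset, and the package name, function name, and error text are hypothetical rather than taken from the Rook code:

package waitsketch

import (
    "context"
    "fmt"
    "time"

    kerrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// waitForJobDeletion polls until the named job is gone or the retry budget is
// spent. With the new values, 30 retries * 3s = 90s of total waiting.
func waitForJobDeletion(ctx context.Context, clientset kubernetes.Interface, namespace, name string) error {
    const retries = 30
    sleepInterval := 3 * time.Second
    for i := 0; i < retries; i++ {
        _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
        if kerrors.IsNotFound(err) {
            // The job (and therefore its prepare pod) is gone, so a
            // replacement job can be created safely.
            return nil
        }
        time.Sleep(sleepInterval)
    }
    return fmt.Errorf("timed out after %v waiting for job %s/%s to be deleted", time.Duration(retries)*sleepInterval, namespace, name)
}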
pkg/daemon/ceph/osd/remove.go (1 addition, 1 deletion)
@@ -104,7 +104,7 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
 logger.Infof("removing the osd prepare job %q", prepareJob.GetName())
 if err := k8sutil.DeleteBatchJob(clusterdContext.Clientset, clusterInfo.Namespace, prepareJob.GetName(), false); err != nil {
     if err != nil {
-        // Continue deleting the OSD prepare job even if the deployment fails to be deleted
+        // Continue with the cleanup even if the job fails to be deleted
         logger.Errorf("failed to delete prepare job for osd %q. %v", prepareJob.GetName(), err)
     }
 }
pkg/operator/ceph/cluster/osd/create.go (1 addition, 6 deletions)
@@ -26,7 +26,6 @@ import (
     opcontroller "github.com/rook/rook/pkg/operator/ceph/controller"
     "github.com/rook/rook/pkg/operator/k8sutil"
     v1 "k8s.io/api/core/v1"
-    kerrors "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apimachinery/pkg/util/sets"
     "k8s.io/apimachinery/pkg/util/version"
 )
@@ -368,11 +367,7 @@ func (c *Cluster) runPrepareJob(osdProps *osdProperties, config *provisionConfig
 }
 
 if err := k8sutil.RunReplaceableJob(c.context.Clientset, job, false); err != nil {
-    if !kerrors.IsAlreadyExists(err) {
-        return errors.Wrapf(err, "failed to run provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
-    }
-    logger.Infof("letting preexisting OSD provisioning job run to completion for %s %q", nodeOrPVC, nodeOrPVCName)
-    return nil
+    return errors.Wrapf(err, "failed to run osd provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
 }
 
 logger.Infof("started OSD provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
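
For context on the removed kerrors.IsAlreadyExists branch: k8sutil.RunReplaceableJob deletes any preexisting job with the same name before creating the new one, and with the job.go change below a failed removal now surfaces as an error instead of only a warning, so the caller is left with a single error path. A rough sketch of that delete-then-create shape under those assumptions; the standalone function and the injected deleteAndWait helper (standing in for DeleteBatchJob with wait=true) are hypothetical:

package replacesketch

import (
    "context"
    "fmt"

    batch "k8s.io/api/batch/v1"
    kerrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// runReplaceableJob removes any previous job of the same name, waits for it to
// disappear, then creates the replacement. Because the removal is awaited and
// its failure is returned, the Create below should never hit AlreadyExists.
func runReplaceableJob(ctx context.Context, clientset kubernetes.Interface, job *batch.Job,
    deleteAndWait func(namespace, name string) error) error {
    _, err := clientset.BatchV1().Jobs(job.Namespace).Get(ctx, job.Name, metav1.GetOptions{})
    switch {
    case err == nil:
        // A previous job exists; block (now up to ~90s) until it is removed.
        if err := deleteAndWait(job.Namespace, job.Name); err != nil {
            return fmt.Errorf("failed to remove job %s: %w", job.Name, err)
        }
    case !kerrors.IsNotFound(err):
        return fmt.Errorf("failed to detect job %s: %w", job.Name, err)
    }

    _, err = clientset.BatchV1().Jobs(job.Namespace).Create(ctx, job, metav1.CreateOptions{})
    return err
}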
pkg/operator/k8sutil/job.go (5 additions, 3 deletions)
@@ -51,7 +51,7 @@ func RunReplaceableJob(clientset kubernetes.Interface, job *batch.Job, deleteIfF
 logger.Infof("Removing previous job %s to start a new one", job.Name)
 err := DeleteBatchJob(clientset, job.Namespace, existingJob.Name, true)
 if err != nil {
-    logger.Warningf("failed to remove job %s. %+v", job.Name, err)
+    return fmt.Errorf("failed to remove job %s. %+v", job.Name, err)
 }
 }

@@ -103,8 +103,10 @@ func DeleteBatchJob(clientset kubernetes.Interface, namespace, name string, wait
     return nil
 }
 
-retries := 20
-sleepInterval := 2 * time.Second
+// Retry for the job to be deleted for 90s. A pod can easily take 60s to timeout before
+// deletion so we add some buffer to that time.
+retries := 30
+sleepInterval := 3 * time.Second
 for i := 0; i < retries; i++ {
     _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
     if err != nil && errors.IsNotFound(err) {
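
As a usage note, the two call sites touched by this commit exercise DeleteBatchJob differently: remove.go passes wait=false and only logs a failure, while RunReplaceableJob passes wait=true and now treats a failed removal as fatal. A hedged sketch of both patterns, relying only on the helper signature visible in the hunks above; the wrapper names and logging are made up for illustration:

package cleanupsketch

import (
    "fmt"
    "log"

    "github.com/rook/rook/pkg/operator/k8sutil"
    "k8s.io/client-go/kubernetes"
)

// bestEffortCleanup mirrors the remove.go call site: wait=false, log the
// failure, and continue with the rest of the OSD removal.
func bestEffortCleanup(clientset kubernetes.Interface, namespace, jobName string) {
    if err := k8sutil.DeleteBatchJob(clientset, namespace, jobName, false); err != nil {
        log.Printf("failed to delete prepare job %q. %v", jobName, err)
    }
}

// blockingCleanup mirrors the RunReplaceableJob call site: wait=true, so the
// call may block for up to ~90s, and a failure aborts the current step.
func blockingCleanup(clientset kubernetes.Interface, namespace, jobName string) error {
    if err := k8sutil.DeleteBatchJob(clientset, namespace, jobName, true); err != nil {
        return fmt.Errorf("failed to remove job %s: %w", jobName, err)
    }
    return nil
}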

