osd: increase wait timeout for osd prepare cleanup
When a reconcile is started for OSDs, the prepare jobs from the previous
reconcile are deleted first. The timeout for the osd prepare job deletion
was only 40s. After that timeout, the reconcile continues by waiting for
the prepare pod, but that wait can never complete since the OSD prepare
pod is not running in the first place, causing the reconcile to wait
indefinitely. In the reported issue, the osd prepare jobs were actually
deleted successfully; the timeout just wasn't long enough. Pods can need
at least a minute to be forcefully deleted, so we increase the timeout to
90s to give some extra buffer.

Signed-off-by: Travis Nielsen <tnielsen@redhat.com>
travisn committed Nov 5, 2021
1 parent 3b85de4 commit 427996a
Showing 3 changed files with 7 additions and 10 deletions.
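
The 90s figure follows directly from the retry parameters changed in pkg/operator/k8sutil/job.go below: 30 retries at a 3s interval give 90s, up from the previous 20 retries at 2s (40s). For orientation, here is a minimal, self-contained sketch of that polling pattern; it assumes a client-go clientset, and the package name, function name, and error text are hypothetical rather than taken from the Rook code:

package waitsketch

import (
    "context"
    "fmt"
    "time"

    kerrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// waitForJobDeletion polls until the named job is gone or the retry budget is
// spent. With the new values, 30 retries * 3s = 90s of total waiting.
func waitForJobDeletion(ctx context.Context, clientset kubernetes.Interface, namespace, name string) error {
    const retries = 30
    sleepInterval := 3 * time.Second
    for i := 0; i < retries; i++ {
        _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
        if kerrors.IsNotFound(err) {
            // The job (and therefore its prepare pod) is gone, so a
            // replacement job can be created safely.
            return nil
        }
        time.Sleep(sleepInterval)
    }
    return fmt.Errorf("timed out after %v waiting for job %s/%s to be deleted", time.Duration(retries)*sleepInterval, namespace, name)
}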
pkg/daemon/ceph/osd/remove.go (1 addition, 1 deletion)
@@ -104,7 +104,7 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
 logger.Infof("removing the osd prepare job %q", prepareJob.GetName())
 if err := k8sutil.DeleteBatchJob(clusterdContext.Clientset, clusterInfo.Namespace, prepareJob.GetName(), false); err != nil {
     if err != nil {
-        // Continue deleting the OSD prepare job even if the deployment fails to be deleted
+        // Continue with the cleanup even if the job fails to be deleted
         logger.Errorf("failed to delete prepare job for osd %q. %v", prepareJob.GetName(), err)
     }
 }
pkg/operator/ceph/cluster/osd/create.go (1 addition, 6 deletions)
@@ -26,7 +26,6 @@ import (
     opcontroller "github.com/rook/rook/pkg/operator/ceph/controller"
     "github.com/rook/rook/pkg/operator/k8sutil"
     v1 "k8s.io/api/core/v1"
-    kerrors "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apimachinery/pkg/util/sets"
     "k8s.io/apimachinery/pkg/util/version"
 )
@@ -368,11 +367,7 @@ func (c *Cluster) runPrepareJob(osdProps *osdProperties, config *provisionConfig
 }
 
 if err := k8sutil.RunReplaceableJob(c.context.Clientset, job, false); err != nil {
-    if !kerrors.IsAlreadyExists(err) {
-        return errors.Wrapf(err, "failed to run provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
-    }
-    logger.Infof("letting preexisting OSD provisioning job run to completion for %s %q", nodeOrPVC, nodeOrPVCName)
-    return nil
+    return errors.Wrapf(err, "failed to run osd provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
 }
 
 logger.Infof("started OSD provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
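
For context on the removed kerrors.IsAlreadyExists branch: k8sutil.RunReplaceableJob deletes any preexisting job with the same name before creating the new one, and with the job.go change below a failed removal now surfaces as an error instead of only a warning, so the caller is left with a single error path. A rough sketch of that delete-then-create shape under those assumptions; the standalone function and the injected deleteAndWait helper (standing in for DeleteBatchJob with wait=true) are hypothetical:

package replacesketch

import (
    "context"
    "fmt"

    batch "k8s.io/api/batch/v1"
    kerrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// runReplaceableJob removes any previous job of the same name, waits for it to
// disappear, then creates the replacement. Because the removal is awaited and
// its failure is returned, the Create below should never hit AlreadyExists.
func runReplaceableJob(ctx context.Context, clientset kubernetes.Interface, job *batch.Job,
    deleteAndWait func(namespace, name string) error) error {
    _, err := clientset.BatchV1().Jobs(job.Namespace).Get(ctx, job.Name, metav1.GetOptions{})
    switch {
    case err == nil:
        // A previous job exists; block (now up to ~90s) until it is removed.
        if err := deleteAndWait(job.Namespace, job.Name); err != nil {
            return fmt.Errorf("failed to remove job %s: %w", job.Name, err)
        }
    case !kerrors.IsNotFound(err):
        return fmt.Errorf("failed to detect job %s: %w", job.Name, err)
    }

    _, err = clientset.BatchV1().Jobs(job.Namespace).Create(ctx, job, metav1.CreateOptions{})
    return err
}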
pkg/operator/k8sutil/job.go (5 additions, 3 deletions)
@@ -51,7 +51,7 @@ func RunReplaceableJob(clientset kubernetes.Interface, job *batch.Job, deleteIfF
 logger.Infof("Removing previous job %s to start a new one", job.Name)
 err := DeleteBatchJob(clientset, job.Namespace, existingJob.Name, true)
 if err != nil {
-    logger.Warningf("failed to remove job %s. %+v", job.Name, err)
+    return fmt.Errorf("failed to remove job %s. %+v", job.Name, err)
 }
 }

@@ -103,8 +103,10 @@ func DeleteBatchJob(clientset kubernetes.Interface, namespace, name string, wait
     return nil
 }
 
-retries := 20
-sleepInterval := 2 * time.Second
+// Retry for the job to be deleted for 90s. A pod can easily take 60s to timeout before
+// deletion so we add some buffer to that time.
+retries := 30
+sleepInterval := 3 * time.Second
 for i := 0; i < retries; i++ {
     _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
     if err != nil && errors.IsNotFound(err) {
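
As a usage note, the two call sites touched by this commit exercise DeleteBatchJob differently: remove.go passes wait=false and only logs a failure, while RunReplaceableJob passes wait=true and now treats a failed removal as fatal. A hedged sketch of both patterns, relying only on the helper signature visible in the hunks above; the wrapper names and logging are made up for illustration:

package cleanupsketch

import (
    "fmt"
    "log"

    "github.com/rook/rook/pkg/operator/k8sutil"
    "k8s.io/client-go/kubernetes"
)

// bestEffortCleanup mirrors the remove.go call site: wait=false, log the
// failure, and continue with the rest of the OSD removal.
func bestEffortCleanup(clientset kubernetes.Interface, namespace, jobName string) {
    if err := k8sutil.DeleteBatchJob(clientset, namespace, jobName, false); err != nil {
        log.Printf("failed to delete prepare job %q. %v", jobName, err)
    }
}

// blockingCleanup mirrors the RunReplaceableJob call site: wait=true, so the
// call may block for up to ~90s, and a failure aborts the current step.
func blockingCleanup(clientset kubernetes.Interface, namespace, jobName string) error {
    if err := k8sutil.DeleteBatchJob(clientset, namespace, jobName, true); err != nil {
        return fmt.Errorf("failed to remove job %s: %w", jobName, err)
    }
    return nil
}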

