Skip to content

Commit

Permalink
Merge pull request #9116 from travisn/osd-prepare-cleanup-timeout
Browse files Browse the repository at this point in the history
osd: Increase wait timeout for osd prepare cleanup
  • Loading branch information
travisn committed Nov 5, 2021
2 parents 3b85de4 + 427996a commit 6f4a68c
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 10 deletions.
2 changes: 1 addition & 1 deletion pkg/daemon/ceph/osd/remove.go
Expand Up @@ -104,7 +104,7 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
logger.Infof("removing the osd prepare job %q", prepareJob.GetName())
if err := k8sutil.DeleteBatchJob(clusterdContext.Clientset, clusterInfo.Namespace, prepareJob.GetName(), false); err != nil {
if err != nil {
// Continue deleting the OSD prepare job even if the deployment fails to be deleted
// Continue with the cleanup even if the job fails to be deleted
logger.Errorf("failed to delete prepare job for osd %q. %v", prepareJob.GetName(), err)
}
}
Expand Down
7 changes: 1 addition & 6 deletions pkg/operator/ceph/cluster/osd/create.go
Expand Up @@ -26,7 +26,6 @@ import (
opcontroller "github.com/rook/rook/pkg/operator/ceph/controller"
"github.com/rook/rook/pkg/operator/k8sutil"
v1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/version"
)
Expand Down Expand Up @@ -368,11 +367,7 @@ func (c *Cluster) runPrepareJob(osdProps *osdProperties, config *provisionConfig
}

if err := k8sutil.RunReplaceableJob(c.context.Clientset, job, false); err != nil {
if !kerrors.IsAlreadyExists(err) {
return errors.Wrapf(err, "failed to run provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
}
logger.Infof("letting preexisting OSD provisioning job run to completion for %s %q", nodeOrPVC, nodeOrPVCName)
return nil
return errors.Wrapf(err, "failed to run osd provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
}

logger.Infof("started OSD provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
Expand Down
8 changes: 5 additions & 3 deletions pkg/operator/k8sutil/job.go
Expand Up @@ -51,7 +51,7 @@ func RunReplaceableJob(clientset kubernetes.Interface, job *batch.Job, deleteIfF
logger.Infof("Removing previous job %s to start a new one", job.Name)
err := DeleteBatchJob(clientset, job.Namespace, existingJob.Name, true)
if err != nil {
logger.Warningf("failed to remove job %s. %+v", job.Name, err)
return fmt.Errorf("failed to remove job %s. %+v", job.Name, err)
}
}

Expand Down Expand Up @@ -103,8 +103,10 @@ func DeleteBatchJob(clientset kubernetes.Interface, namespace, name string, wait
return nil
}

retries := 20
sleepInterval := 2 * time.Second
// Retry for the job to be deleted for 90s. A pod can easily take 60s to timeout before
// deletion so we add some buffer to that time.
retries := 30
sleepInterval := 3 * time.Second
for i := 0; i < retries; i++ {
_, err := clientset.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
if err != nil && errors.IsNotFound(err) {
Expand Down

0 comments on commit 6f4a68c

Please sign in to comment.