osd: Increase wait timeout for osd prepare cleanup #9116

Merged · 1 commit merged on Nov 5, 2021
2 changes: 1 addition & 1 deletion in pkg/daemon/ceph/osd/remove.go

@@ -104,7 +104,7 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInfo
 	logger.Infof("removing the osd prepare job %q", prepareJob.GetName())
 	if err := k8sutil.DeleteBatchJob(clusterdContext.Clientset, clusterInfo.Namespace, prepareJob.GetName(), false); err != nil {
 		if err != nil {
-			// Continue deleting the OSD prepare job even if the deployment fails to be deleted
+			// Continue with the cleanup even if the job fails to be deleted
			logger.Errorf("failed to delete prepare job for osd %q. %v", prepareJob.GetName(), err)
 		}
 	}
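The final boolean argument to k8sutil.DeleteBatchJob controls whether the call blocks until the Job object is actually gone. A minimal usage sketch of the two modes, assuming a client-go kubernetes.Interface is at hand (the job name below is hypothetical, not from this PR):

    // Non-blocking, as in removeOSD above: log and move on if the delete
    // fails, since the OSD purge should continue regardless.
    if err := k8sutil.DeleteBatchJob(clientset, namespace, "rook-ceph-osd-prepare-node1", false); err != nil {
        logger.Errorf("failed to delete prepare job. %v", err)
    }

    // Blocking, as in RunReplaceableJob below: poll until the job is deleted
    // (now up to 90s, per the job.go change in this PR) so a replacement job
    // can be created without a name collision.
    if err := k8sutil.DeleteBatchJob(clientset, namespace, "rook-ceph-osd-prepare-node1", true); err != nil {
        return err
    }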
7 changes: 1 addition & 6 deletions in pkg/operator/ceph/cluster/osd/create.go

@@ -26,7 +26,6 @@ import (
 	opcontroller "github.com/rook/rook/pkg/operator/ceph/controller"
 	"github.com/rook/rook/pkg/operator/k8sutil"
 	v1 "k8s.io/api/core/v1"
-	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/version"
 )
@@ -368,11 +367,7 @@ func (c *Cluster) runPrepareJob(osdProps *osdProperties, config *provisionConfig) error {
 	}

 	if err := k8sutil.RunReplaceableJob(c.context.Clientset, job, false); err != nil {
-		if !kerrors.IsAlreadyExists(err) {
-			return errors.Wrapf(err, "failed to run provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
-		}
-		logger.Infof("letting preexisting OSD provisioning job run to completion for %s %q", nodeOrPVC, nodeOrPVCName)
-		return nil
+		return errors.Wrapf(err, "failed to run osd provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
 	}

 	logger.Infof("started OSD provisioning job for %s %q", nodeOrPVC, nodeOrPVCName)
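With the kerrors.IsAlreadyExists special case removed, the caller no longer tolerates a preexisting prepare job; RunReplaceableJob is now expected to replace it and report any failure. A minimal sketch of that replace-then-create shape, reconstructed from the hunks visible in this PR (the lookup and create details below are assumptions, not the verbatim Rook implementation):

    // Sketch only: the deleteIfFound handling and job-completion checks of the
    // real function are elided.
    func RunReplaceableJob(clientset kubernetes.Interface, job *batch.Job, deleteIfFound bool) error {
        ctx := context.TODO()
        existingJob, err := clientset.BatchV1().Jobs(job.Namespace).Get(ctx, job.Name, metav1.GetOptions{})
        if err == nil && existingJob != nil {
            // Remove the previous job and block (wait=true) until it is gone,
            // so the new job can be created under the same name.
            logger.Infof("Removing previous job %s to start a new one", job.Name)
            if err := DeleteBatchJob(clientset, job.Namespace, existingJob.Name, true); err != nil {
                return fmt.Errorf("failed to remove job %s. %+v", job.Name, err)
            }
        }
        _, err = clientset.BatchV1().Jobs(job.Namespace).Create(ctx, job, metav1.CreateOptions{})
        return err
    }

If the blocking delete times out, the error now propagates up through runPrepareJob instead of being logged and ignored, which is what makes the longer wait window below matter.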
8 changes: 5 additions & 3 deletions in pkg/operator/k8sutil/job.go

@@ -51,7 +51,7 @@ func RunReplaceableJob(clientset kubernetes.Interface, job *batch.Job, deleteIfFound bool) error {
 		logger.Infof("Removing previous job %s to start a new one", job.Name)
 		err := DeleteBatchJob(clientset, job.Namespace, existingJob.Name, true)
 		if err != nil {
-			logger.Warningf("failed to remove job %s. %+v", job.Name, err)
+			return fmt.Errorf("failed to remove job %s. %+v", job.Name, err)
 		}
 	}
@@ -103,8 +103,10 @@ func DeleteBatchJob(clientset kubernetes.Interface, namespace, name string, wait bool) error {
 		return nil
 	}

-	retries := 20
-	sleepInterval := 2 * time.Second
+	// Retry for the job to be deleted for 90s. A pod can easily take 60s to timeout before
+	// deletion so we add some buffer to that time.
+	retries := 30
+	sleepInterval := 3 * time.Second
Review comment from a repo member on lines +108 to +109:

The only thing I could suggest is adding a comment here that explains this case so we don't change this in the future.

I'm not sure if it would be truly useful to add unit/integration tests for this given that it is really about the behavior of kubernetes and the code's understanding of it.

 	for i := 0; i < retries; i++ {
 		_, err := clientset.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
 		if err != nil && errors.IsNotFound(err) {
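The new values wait 30 × 3s = 90s in total, covering the roughly 60s a terminating pod can take plus some buffer, as the added comment notes. Picking up the reviewer's point about tests: a unit test can only exercise the happy path, since client-go's fake clientset deletes synchronously, but a hypothetical sketch (not part of this PR; names are illustrative) might look like this:

    package k8sutil

    import (
        "context"
        "testing"

        batchv1 "k8s.io/api/batch/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/client-go/kubernetes/fake"
    )

    func TestDeleteBatchJobWaits(t *testing.T) {
        clientset := fake.NewSimpleClientset(&batchv1.Job{
            ObjectMeta: metav1.ObjectMeta{Name: "prepare-job", Namespace: "rook-ceph"},
        })

        // wait=true: DeleteBatchJob polls until the Job is gone. The fake
        // clientset removes it synchronously, so the first poll succeeds and
        // the 90s budget is never consumed.
        if err := DeleteBatchJob(clientset, "rook-ceph", "prepare-job", true); err != nil {
            t.Fatalf("expected job deletion to succeed: %v", err)
        }

        _, err := clientset.BatchV1().Jobs("rook-ceph").Get(context.TODO(), "prepare-job", metav1.GetOptions{})
        if err == nil {
            t.Fatal("expected job to be deleted")
        }
    }

The real timeout behavior depends on the API server and kubelet rather than this code path, which is presumably why the reviewer doubted that unit or integration tests would add much here.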