wip
Signed-off-by: Sébastien Han <seb@redhat.com>
leseb committed Nov 24, 2021
1 parent 2c96461 commit 0051fc0
Showing 5 changed files with 59 additions and 14 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/canary-integration-test.yml
@@ -69,6 +69,18 @@ jobs:
- name: check-ownerreferences
run: tests/scripts/github-action-helper.sh check_ownerreferences

- name: test osd removal jobs
run: |
kubectl -n rook-ceph delete deploy/rook-ceph-operator
kubectl -n rook-ceph delete deploy/rook-ceph-osd-1 --grace-period=0 --force
sed -i 's/<OSD-IDs>/1/' cluster/examples/kubernetes/ceph/osd-purge.yaml
kubectl -n rook-ceph create -f cluster/examples/kubernetes/ceph/osd-purge.yaml
toolbox=$(kubectl get pod -l app=rook-ceph-tools -n rook-ceph -o jsonpath='{.items[*].metadata.name}')
kubectl -n rook-ceph exec $toolbox -- ceph status
timeout 120 sh -c "while kubectl -n rook-ceph exec $toolbox -- ceph osd tree|grep -q 'osd.1'; do echo 'waiting for ceph osd 1 to be gone'; sleep 1; done"
kubectl -n rook-ceph exec $toolbox -- ceph status
kubectl -n rook-ceph exec $toolbox -- ceph osd tree
- name: collect common logs
if: always()
uses: ./.github/workflows/collect-logs
14 changes: 13 additions & 1 deletion cluster/examples/kubernetes/ceph/osd-purge.yaml
@@ -29,7 +29,19 @@ spec:
# TODO: Insert the ID(s) of the OSD(s) to be removed in the last parameter
# The OSD IDs are a comma-separated list. For example: "0" or "0,2".
# If you want to preserve the OSD PVCs, set `--preserve-pvc true`.
args: ["ceph", "osd", "remove", "--preserve-pvc", "false", "--osd-ids", "<OSD-IDs>"]
#
# A --force-osd-removal option is available if the OSD should be destroyed even though the
# removal could lead to data loss.
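# For example, to remove OSDs 0 and 2 while keeping their PVCs (the IDs here are illustrative):
#   args: ["ceph", "osd", "remove", "--preserve-pvc", "true", "--force-osd-removal", "false", "--osd-ids", "0,2"]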
args:
- "ceph"
- "osd"
- "remove"
- "--preserve-pvc"
- "false"
- "--force-osd-removal"
- "false"
- "--osd-ids"
- "<OSD-IDs>"
env:
- name: POD_NAMESPACE
valueFrom:
5 changes: 4 additions & 1 deletion cmd/rook/ceph/osd.go
@@ -72,6 +72,7 @@ var (
lvBackedPV bool
osdIDsToRemove string
preservePVC bool
forceOSDRemoval bool
)

func addOSDFlags(command *cobra.Command) {
@@ -101,6 +102,7 @@ func addOSDFlags(command *cobra.Command) {
// flags for removing OSDs that are unhealthy or otherwise should be purged from the cluster
osdRemoveCmd.Flags().StringVar(&osdIDsToRemove, "osd-ids", "", "OSD IDs to remove from the cluster")
osdRemoveCmd.Flags().BoolVar(&preservePVC, "preserve-pvc", false, "Whether to preserve the PVCs backing the OSDs when they are removed")
osdRemoveCmd.Flags().BoolVar(&forceOSDRemoval, "force-osd-removal", false, "Whether to force remove the OSD even if it is not safe to destroy")
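// Example invocation of the removal command (flag values here are illustrative):
//   rook ceph osd remove --osd-ids 1,2 --preserve-pvc false --force-osd-removal false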

// add the subcommands to the parent osd command
osdCmd.AddCommand(osdConfigCmd,
@@ -266,10 +268,11 @@ func removeOSDs(cmd *cobra.Command, args []string) error {
clusterInfo.Context = cmd.Context()

// Run OSD remove sequence
err := osddaemon.RemoveOSDs(context, &clusterInfo, strings.Split(osdIDsToRemove, ","), preservePVC)
err := osddaemon.RemoveOSDs(context, &clusterInfo, strings.Split(osdIDsToRemove, ","), preservePVC, forceOSDRemoval)
if err != nil {
rook.TerminateFatal(err)
}

return nil
}

2 changes: 1 addition & 1 deletion pkg/daemon/ceph/client/osd.go
@@ -263,7 +263,7 @@ func OsdSafeToDestroy(context *clusterd.Context, clusterInfo *ClusterInfo, osdID

var output SafeToDestroyStatus
if err := json.Unmarshal(buf, &output); err != nil {
return false, errors.Wrap(err, "failed to unmarshal safe-to-destroy response")
return false, errors.Wrapf(err, "failed to unmarshal safe-to-destroy response. %s", string(buf))
}
if len(output.SafeToDestroy) != 0 && output.SafeToDestroy[0] == osdID {
return true, nil
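For context, the wrapped error above now carries the raw response from ceph, so a malformed payload is visible directly in the log. The standalone sketch below shows the kind of JSON the safe-to-destroy check returns and how the unmarshal-plus-error path behaves; the struct shape, field name, and sample payload are assumptions for illustration, not the exact Rook types.

package main

import (
	"encoding/json"
	"fmt"
)

// safeToDestroyStatus approximates the JSON returned by
// `ceph osd safe-to-destroy <id> --format json` (field name is an assumption).
type safeToDestroyStatus struct {
	SafeToDestroy []int `json:"safe_to_destroy"`
}

func main() {
	osdID := 1

	// Hypothetical payload reported by the mons for OSD 1.
	buf := []byte(`{"safe_to_destroy":[1]}`)

	var output safeToDestroyStatus
	if err := json.Unmarshal(buf, &output); err != nil {
		// Mirrors the change above: print the raw buffer alongside the error so the
		// log shows exactly what ceph returned.
		fmt.Printf("failed to unmarshal safe-to-destroy response. %s: %v\n", string(buf), err)
		return
	}

	if len(output.SafeToDestroy) != 0 && output.SafeToDestroy[0] == osdID {
		fmt.Printf("osd.%d is safe to destroy\n", osdID)
	}
}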
40 changes: 29 additions & 11 deletions pkg/daemon/ceph/osd/remove.go
@@ -32,8 +32,7 @@ import (
)

// RemoveOSDs purges a list of OSDs from the cluster
func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osdsToRemove []string, preservePVC bool) error {

func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osdsToRemove []string, preservePVC, forceOSDRemoval bool) error {
// Generate the ceph config for running ceph commands similar to the operator
if err := client.WriteCephConfig(context, clusterInfo); err != nil {
return errors.Wrap(err, "failed to write the ceph config")
@@ -63,16 +62,35 @@ func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osds
logger.Infof("osd.%d is marked 'DOWN'", osdID)
}

// Check we can remove the OSD in case jobs are fired in parallel
// loop forever until the osd is ok-to-stop (the job might timeout eventually but it's
// better than losing the osd data...)
// Check that we can remove the OSD
// Loop until the OSD reports safe-to-destroy, or force removal lets us proceed anyway
for {
_, err := client.OSDOkToStop(context, clusterInfo, osdID, 1)
isSafeToDestroy, err := client.OsdSafeToDestroy(context, clusterInfo, osdID)
if err != nil {
logger.Errorf("failed to check if osd %d is ok to stop or not ok to stop, retrying in 1m. %v", osdID, err)
time.Sleep(1 * time.Minute)
// If we want to force remove the OSD and the check failed, break out of
// the loop and proceed with the OSD removal
if forceOSDRemoval {
logger.Errorf("failed to check if osd %d is safe to destroy, but force removal is enabled so proceeding with removal. %v", osdID, err)
break
}
logger.Errorf("failed to check if osd %d is safe to destroy, retrying in 1m. %v", osdID, err)
time.Sleep(1 * time.Minute)
continue
}

// If no error and the OSD is safe to destroy, we can proceed with the OSD removal
if isSafeToDestroy {
logger.Infof("osd.%d is ok to destroy", osdID)
break
} else {
logger.Infof("osd.%d is ok to stop", osdID)
// The OSD is not safe to destroy yet. If force removal is enabled, proceed with
// the removal anyway, otherwise wait and retry until it becomes safe to destroy
if forceOSDRemoval {
logger.Infof("osd.%d is NOT ok to destroy, but force removal is enabled so proceeding with removal", osdID)
break
}
logger.Warningf("osd.%d is NOT ok to destroy, retrying in 1m until success", osdID)
time.Sleep(1 * time.Minute)
}
}
@@ -157,8 +175,8 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
}

// purge the osd
logger.Infof("purging osd.%d", osdID)
purgeOSDArgs := []string{"osd", "purge", fmt.Sprintf("osd.%d", osdID), "--force", "--yes-i-really-mean-it"}
logger.Infof("destroying osd.%d", osdID)
purgeOSDArgs := []string{"osd", "destroy", fmt.Sprintf("osd.%d", osdID), "--yes-i-really-mean-it"}
_, err = client.NewCephCommand(clusterdContext, clusterInfo, purgeOSDArgs).Run()
if err != nil {
logger.Errorf("failed to purge osd.%d. %v", osdID, err)
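To summarize the retry logic added above: the removal loop keeps polling safe-to-destroy and only gives up waiting when force removal is enabled. The sketch below distills that control flow into a small helper; the checkFn callback is a made-up stand-in for client.OsdSafeToDestroy, so this is an illustration of the intended behavior rather than the Rook implementation.

package main

import (
	"errors"
	"fmt"
	"time"
)

// waitUntilRemovable polls checkFn until the OSD is reported safe to destroy.
// With force enabled it stops waiting after the first failed or negative check,
// mirroring the --force-osd-removal behavior described above.
func waitUntilRemovable(osdID int, force bool, retryInterval time.Duration, checkFn func(id int) (bool, error)) {
	for {
		safe, err := checkFn(osdID)
		if err != nil {
			if force {
				fmt.Printf("check failed for osd.%d, but force removal is enabled so proceeding: %v\n", osdID, err)
				return
			}
			fmt.Printf("check failed for osd.%d, retrying: %v\n", osdID, err)
			time.Sleep(retryInterval)
			continue
		}
		if safe {
			fmt.Printf("osd.%d is safe to destroy\n", osdID)
			return
		}
		if force {
			fmt.Printf("osd.%d is NOT safe to destroy, but force removal is enabled so proceeding\n", osdID)
			return
		}
		fmt.Printf("osd.%d is NOT safe to destroy, retrying\n", osdID)
		time.Sleep(retryInterval)
	}
}

func main() {
	// Fake checker: fails once, reports not-safe once, then reports safe.
	attempts := 0
	waitUntilRemovable(1, false, 10*time.Millisecond, func(id int) (bool, error) {
		attempts++
		switch attempts {
		case 1:
			return false, errors.New("mon unreachable")
		case 2:
			return false, nil
		default:
			return true, nil
		}
	})
}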
