Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
Signed-off-by: Sébastien Han <seb@redhat.com>
  • Loading branch information
leseb committed Nov 24, 2021
1 parent bdc60ba commit 6cb3645
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 17 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/canary-integration-test.yml
Expand Up @@ -69,6 +69,18 @@ jobs:
- name: check-ownerreferences
run: tests/scripts/github-action-helper.sh check_ownerreferences

- name: test osd removal jobs
  run: |
    # Remove the operator so it does not recreate the OSD deployment we delete below.
    kubectl -n rook-ceph delete deploy/rook-ceph-operator
    # Force-delete the osd.1 deployment to simulate a failed OSD.
    kubectl -n rook-ceph delete deploy/rook-ceph-osd-1 --grace-period=0 --force
    # Substitute the placeholder with the concrete OSD id to purge.
    sed -i 's/${FAILED_OSD_IDS}/1/' cluster/examples/kubernetes/ceph/osd-removal.yaml
    # BUGFIX: resolve the toolbox pod name BEFORE any command that uses $toolbox
    # (it was previously referenced one line earlier than it was assigned,
    # making the first `kubectl exec` run against an empty pod name).
    toolbox=$(kubectl get pod -l app=rook-ceph-tools -n rook-ceph -o jsonpath='{.items[*].metadata.name}')
    # Launch the OSD removal Job.
    kubectl -n rook-ceph create -f cluster/examples/kubernetes/ceph/osd-removal.yaml
    kubectl -n rook-ceph exec $toolbox -- ceph status
    # Wait (up to 2 minutes) for osd.1 to disappear from the CRUSH tree.
    timeout 120 sh -c "while kubectl -n rook-ceph exec $toolbox -- ceph osd tree|grep -q 'osd.1'; do echo 'waiting for ceph osd 1 to be gone'; sleep 1; done"
    kubectl -n rook-ceph exec $toolbox -- ceph status
    kubectl -n rook-ceph exec $toolbox -- ceph osd tree
- name: collect common logs
  # always() makes this step run even when an earlier step failed,
  # so logs are available for debugging failed CI runs.
  if: always()
  uses: ./.github/workflows/collect-logs
Expand Down
56 changes: 56 additions & 0 deletions cluster/examples/kubernetes/ceph/osd-removal.yaml
@@ -0,0 +1,56 @@
# One-shot Job that purges failed OSDs from the Ceph cluster by running
# `rook ceph osd remove --osd-ids=...`. The ${FAILED_OSD_IDS} placeholder
# is substituted with the concrete OSD id(s) before the Job is created
# (see the CI step that runs `sed` on this file).
apiVersion: batch/v1
kind: Job
metadata:
  name: rook-ceph-toolbox-job
  namespace: rook-ceph # namespace:cluster
  labels:
    app: ceph-toolbox-job
spec:
  template:
    spec:
      initContainers:
        # Generates the Ceph config/keyring under /etc/ceph so the main
        # container can talk to the cluster. --skip-watch makes toolbox.sh
        # exit after writing the config instead of watching for changes.
        - name: config-init
          image: rook/ceph:master
          command: ["/usr/local/bin/toolbox.sh"]
          args: ["--skip-watch"]
          imagePullPolicy: IfNotPresent
          # Credentials are read from the rook-ceph-mon secret created by
          # the operator.
          env:
            - name: ROOK_CEPH_USERNAME
              valueFrom:
                secretKeyRef:
                  name: rook-ceph-mon
                  key: ceph-username
            - name: ROOK_CEPH_SECRET
              valueFrom:
                secretKeyRef:
                  name: rook-ceph-mon
                  key: ceph-secret
          volumeMounts:
            # Shared with the main container; receives the generated config.
            - mountPath: /etc/ceph
              name: ceph-config
            # Mon endpoints config map, consumed by toolbox.sh.
            - name: mon-endpoint-volume
              mountPath: /etc/rook
      containers:
        # Runs the actual OSD removal once the init container has
        # written the Ceph config.
        - name: script
          image: rook/ceph:master
          volumeMounts:
            - mountPath: /etc/ceph
              name: ceph-config
              readOnly: true
          command:
            - "rook"
          args:
            - "ceph"
            - "osd"
            - "remove"
            # Placeholder replaced at Job-creation time with the real id(s).
            - "--osd-ids=${FAILED_OSD_IDS}"
      volumes:
        - name: mon-endpoint-volume
          configMap:
            name: rook-ceph-mon-endpoints
            items:
              - key: data
                path: mon-endpoints
        # Scratch space shared between init and main containers.
        - name: ceph-config
          emptyDir: {}
      # One-shot Job: do not restart the pod on completion/failure.
      restartPolicy: Never
5 changes: 4 additions & 1 deletion cmd/rook/ceph/osd.go
Expand Up @@ -72,6 +72,7 @@ var (
lvBackedPV bool
osdIDsToRemove string
preservePVC bool
forceOSDRemoval bool
)

func addOSDFlags(command *cobra.Command) {
Expand Down Expand Up @@ -101,6 +102,7 @@ func addOSDFlags(command *cobra.Command) {
// flags for removing OSDs that are unhealthy or otherwise should be purged from the cluster
osdRemoveCmd.Flags().StringVar(&osdIDsToRemove, "osd-ids", "", "OSD IDs to remove from the cluster")
osdRemoveCmd.Flags().BoolVar(&preservePVC, "preserve-pvc", false, "Whether PVCs for OSDs will be deleted")
osdRemoveCmd.Flags().BoolVar(&forceOSDRemoval, "force-osd-removal", false, "Whether to force remove the OSD")

// add the subcommands to the parent osd command
osdCmd.AddCommand(osdConfigCmd,
Expand Down Expand Up @@ -266,10 +268,11 @@ func removeOSDs(cmd *cobra.Command, args []string) error {
clusterInfo.Context = cmd.Context()

// Run OSD remove sequence
err := osddaemon.RemoveOSDs(context, &clusterInfo, strings.Split(osdIDsToRemove, ","), preservePVC)
err := osddaemon.RemoveOSDs(context, &clusterInfo, strings.Split(osdIDsToRemove, ","), preservePVC, forceOSDRemoval)
if err != nil {
rook.TerminateFatal(err)
}

return nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/daemon/ceph/client/osd.go
Expand Up @@ -263,7 +263,7 @@ func OsdSafeToDestroy(context *clusterd.Context, clusterInfo *ClusterInfo, osdID

var output SafeToDestroyStatus
if err := json.Unmarshal(buf, &output); err != nil {
return false, errors.Wrap(err, "failed to unmarshal safe-to-destroy response")
return false, errors.Wrapf(err, "failed to unmarshal safe-to-destroy response. %s", string(buf))
}
if len(output.SafeToDestroy) != 0 && output.SafeToDestroy[0] == osdID {
return true, nil
Expand Down
33 changes: 18 additions & 15 deletions pkg/daemon/ceph/osd/remove.go
Expand Up @@ -32,8 +32,7 @@ import (
)

// RemoveOSDs purges a list of OSDs from the cluster
func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osdsToRemove []string, preservePVC bool) error {

func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osdsToRemove []string, preservePVC, forceOSDRemoval bool) error {
// Generate the ceph config for running ceph commands similar to the operator
if err := client.WriteCephConfig(context, clusterInfo); err != nil {
return errors.Wrap(err, "failed to write the ceph config")
Expand Down Expand Up @@ -63,18 +62,22 @@ func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osds
logger.Infof("osd.%d is marked 'DOWN'", osdID)
}

// Check we can remove the OSD in case jobs are fired in parallel
// loop forever until the osd is ok-to-stop (the job might timeout eventually but it's
// better than losing the osd data...)
for {
_, err := client.OSDOkToStop(context, clusterInfo, osdID, 1)
if err != nil {
logger.Errorf("failed to check if osd %d is ok to stop or not ok to stop, retrying in 1m. %v", osdID, err)
time.Sleep(1 * time.Minute)
} else {
logger.Infof("osd.%d is ok to stop", osdID)
break
// Check we can remove the OSD
// Loop forever until the osd is safe-to-destroy
if !forceOSDRemoval {
for {
isSafeToDestroy, err := client.OsdSafeToDestroy(context, clusterInfo, osdID)
if err != nil {
logger.Errorf("failed to check if osd %d is ok to stop or not ok to stop, retrying in 1m. %v", osdID, err)
time.Sleep(1 * time.Minute)
}
if isSafeToDestroy {
logger.Infof("osd.%d is ok to destroy", osdID)
break
}
}
} else {
logger.Infof("osd.%d might NOT be ok to destroy but force removal is enabled", osdID)
}

logger.Infof("OSD %d is ready to be removed, proceeding", osdID)
Expand Down Expand Up @@ -157,8 +160,8 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
}

// purge the osd
logger.Infof("purging osd.%d", osdID)
purgeOSDArgs := []string{"osd", "purge", fmt.Sprintf("osd.%d", osdID), "--force", "--yes-i-really-mean-it"}
logger.Infof("destroying osd.%d", osdID)
purgeOSDArgs := []string{"osd", "destroy", fmt.Sprintf("osd.%d", osdID), "--yes-i-really-mean-it"}
_, err = client.NewCephCommand(clusterdContext, clusterInfo, purgeOSDArgs).Run()
if err != nil {
logger.Errorf("failed to purge osd.%d. %v", osdID, err)
Expand Down

0 comments on commit 6cb3645

Please sign in to comment.