From 7402c2cce681103248202e0d09f85d7cc1159642 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Tue, 23 Nov 2021 18:49:43 +0100
Subject: [PATCH] osd: check if osd is ok-to-stop before removal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If multiple removal jobs are fired in parallel, there is a risk of data
loss since we forcefully remove the OSD. The same risk exists for a
single OSD that is not safe to destroy. So we now check whether the OSD
is safe-to-destroy before proceeding with the removal. The check
retries every minute, forever, unless the --force-osd-removal flag is
passed.

Signed-off-by: Sébastien Han
---
 .github/workflows/canary-integration-test.yml | 15 +++++
 .../examples/kubernetes/ceph/osd-purge.yaml   | 14 ++++-
 cmd/rook/ceph/osd.go                          | 27 ++++++++-
 pkg/daemon/ceph/client/crash.go               | 14 +++--
 pkg/daemon/ceph/client/osd.go                 |  2 +-
 pkg/daemon/ceph/osd/remove.go                 | 55 ++++++++++++++---
 tests/scripts/collect-logs.sh                 |  4 ++
 7 files changed, 115 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/canary-integration-test.yml b/.github/workflows/canary-integration-test.yml
index 5821161395de..80e8b9a1745f 100644
--- a/.github/workflows/canary-integration-test.yml
+++ b/.github/workflows/canary-integration-test.yml
@@ -69,6 +69,21 @@ jobs:
       - name: check-ownerreferences
         run: tests/scripts/github-action-helper.sh check_ownerreferences
 
+      - name: test osd removal jobs
+        run: |
+          kubectl -n rook-ceph delete deploy/rook-ceph-operator
+          kubectl -n rook-ceph delete deploy/rook-ceph-osd-1 --grace-period=0 --force
+          sed -i 's/<OSD-IDs>/1/' cluster/examples/kubernetes/ceph/osd-purge.yaml
+          # the CI must force the deletion since we use replica 1 on 2 OSDs
+          sed -i 's/false/true/' cluster/examples/kubernetes/ceph/osd-purge.yaml
+          sed -i 's|rook/ceph:master|rook/ceph:local-build|' cluster/examples/kubernetes/ceph/osd-purge.yaml
+          kubectl -n rook-ceph create -f cluster/examples/kubernetes/ceph/osd-purge.yaml
+          toolbox=$(kubectl get pod -l app=rook-ceph-tools -n rook-ceph -o jsonpath='{.items[*].metadata.name}')
+          kubectl -n rook-ceph exec $toolbox -- ceph status
+          timeout 120 sh -c "until kubectl -n rook-ceph exec $toolbox -- ceph osd tree|grep -qE 'osd.1.*.destroyed'; do echo 'waiting for ceph osd 1 to be destroyed'; sleep 1; done"
+          kubectl -n rook-ceph exec $toolbox -- ceph status
+          kubectl -n rook-ceph exec $toolbox -- ceph osd tree
+
       - name: collect common logs
         if: always()
         uses: ./.github/workflows/collect-logs
diff --git a/cluster/examples/kubernetes/ceph/osd-purge.yaml b/cluster/examples/kubernetes/ceph/osd-purge.yaml
index d00a6ebbec31..cda943794560 100644
--- a/cluster/examples/kubernetes/ceph/osd-purge.yaml
+++ b/cluster/examples/kubernetes/ceph/osd-purge.yaml
@@ -29,7 +29,19 @@ spec:
           # TODO: Insert the OSD ID in the last parameter that is to be removed
           # The OSD IDs are a comma-separated list. For example: "0" or "0,2".
           # If you want to preserve the OSD PVCs, set `--preserve-pvc true`.
-          args: ["ceph", "osd", "remove", "--preserve-pvc", "false", "--osd-ids", "<OSD-IDs>"]
+          #
+          # A --force-osd-removal option is available if the OSD should be destroyed even though the
+          # removal could lead to data loss.
+          args:
+            - "ceph"
+            - "osd"
+            - "remove"
+            - "--preserve-pvc"
+            - "false"
+            - "--force-osd-removal"
+            - "false"
+            - "--osd-ids"
+            - "<OSD-IDs>"
           env:
             - name: POD_NAMESPACE
               valueFrom:
diff --git a/cmd/rook/ceph/osd.go b/cmd/rook/ceph/osd.go
index a57d6d1f47d5..f7adb70b4458 100644
--- a/cmd/rook/ceph/osd.go
+++ b/cmd/rook/ceph/osd.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"encoding/json"
 	"os"
+	"strconv"
 	"strings"
 
 	"k8s.io/client-go/kubernetes"
@@ -71,7 +72,8 @@ var (
 	blockPath       string
 	lvBackedPV      bool
 	osdIDsToRemove  string
-	preservePVC     bool
+	preservePVC     string
+	forceOSDRemoval string
 )
 
 func addOSDFlags(command *cobra.Command) {
@@ -100,7 +102,8 @@ func addOSDFlags(command *cobra.Command) {
 
 	// flags for removing OSDs that are unhealthy or otherwise should be purged from the cluster
 	osdRemoveCmd.Flags().StringVar(&osdIDsToRemove, "osd-ids", "", "OSD IDs to remove from the cluster")
-	osdRemoveCmd.Flags().BoolVar(&preservePVC, "preserve-pvc", false, "Whether PVCs for OSDs will be deleted")
+	osdRemoveCmd.Flags().StringVar(&preservePVC, "preserve-pvc", "false", "Whether to preserve the OSD PVCs when the OSDs are removed")
+	osdRemoveCmd.Flags().StringVar(&forceOSDRemoval, "force-osd-removal", "false", "Whether to force remove the OSD")
 
 	// add the subcommands to the parent osd command
 	osdCmd.AddCommand(osdConfigCmd,
@@ -265,11 +268,29 @@ func removeOSDs(cmd *cobra.Command, args []string) error {
 
 	clusterInfo.Context = cmd.Context()
 
+	// We use strings instead of bools since the flag package has issues parsing bools, or
+	// perhaps it's the translation between YAML and code... It's unclear, but see:
+	// starting Rook v1.7.0-alpha.0.660.gb13faecc8 with arguments '/usr/local/bin/rook ceph osd remove --preserve-pvc false --force-osd-removal false --osd-ids 1'
+	// flag values: --force-osd-removal=true, --help=false, --log-level=DEBUG, --operator-image=,
+	// --osd-ids=1, --preserve-pvc=true, --service-account=
+	//
+	// The bools are passed as false but the flag package interprets them as true.
+
+	forceOSDRemovalBool, err := strconv.ParseBool(forceOSDRemoval)
+	if err != nil {
+		return errors.Wrapf(err, "failed to parse --force-osd-removal flag")
+	}
+	preservePVCBool, err := strconv.ParseBool(preservePVC)
+	if err != nil {
+		return errors.Wrapf(err, "failed to parse --preserve-pvc flag")
+	}
+
 	// Run OSD remove sequence
-	err := osddaemon.RemoveOSDs(context, &clusterInfo, strings.Split(osdIDsToRemove, ","), preservePVC)
+	err = osddaemon.RemoveOSDs(context, &clusterInfo, strings.Split(osdIDsToRemove, ","), preservePVCBool, forceOSDRemovalBool)
 	if err != nil {
 		rook.TerminateFatal(err)
 	}
+
 	return nil
 }
 
diff --git a/pkg/daemon/ceph/client/crash.go b/pkg/daemon/ceph/client/crash.go
index 8f8d478a9f0a..81366134a91e 100644
--- a/pkg/daemon/ceph/client/crash.go
+++ b/pkg/daemon/ceph/client/crash.go
@@ -57,26 +57,31 @@ type CrashList struct {
 
 // GetCrashList gets the list of Crashes.
 func GetCrashList(context *clusterd.Context, clusterInfo *ClusterInfo) ([]CrashList, error) {
-	crashargs := []string{"crash", "ls"}
-	output, err := NewCephCommand(context, clusterInfo, crashargs).Run()
+	crashArgs := []string{"crash", "ls"}
+	output, err := NewCephCommand(context, clusterInfo, crashArgs).Run()
 	if err != nil {
 		return nil, errors.Wrap(err, "failed to list ceph crash")
 	}
+
 	var crash []CrashList
 	err = json.Unmarshal(output, &crash)
 	if err != nil {
 		return nil, errors.Wrapf(err, "failed to unmarshal crash ls response. %s", string(output))
 	}
+
 	return crash, err
 }
 
 // ArchiveCrash archives the crash with respective crashID
 func ArchiveCrash(context *clusterd.Context, clusterInfo *ClusterInfo, crashID string) error {
-	crashsilenceargs := []string{"crash", "archive", crashID}
-	_, err := NewCephCommand(context, clusterInfo, crashsilenceargs).Run()
+	logger.Infof("silencing crash %q", crashID)
+	crashSilenceArgs := []string{"crash", "archive", crashID}
+	_, err := NewCephCommand(context, clusterInfo, crashSilenceArgs).Run()
 	if err != nil {
 		return errors.Wrapf(err, "failed to archive crash %q", crashID)
 	}
+
+	logger.Infof("successfully silenced crash %q", crashID)
 	return nil
 }
 
@@ -86,5 +91,6 @@ func GetCrash(context *clusterd.Context, clusterInfo *ClusterInfo) ([]CrashList,
 	if err != nil {
 		return nil, errors.Wrap(err, "failed to list ceph crash")
 	}
+
 	return crash, nil
 }
diff --git a/pkg/daemon/ceph/client/osd.go b/pkg/daemon/ceph/client/osd.go
index c571ec800057..4df569b07202 100644
--- a/pkg/daemon/ceph/client/osd.go
+++ b/pkg/daemon/ceph/client/osd.go
@@ -263,7 +263,7 @@ func OsdSafeToDestroy(context *clusterd.Context, clusterInfo *ClusterInfo, osdID
 
 	var output SafeToDestroyStatus
 	if err := json.Unmarshal(buf, &output); err != nil {
-		return false, errors.Wrap(err, "failed to unmarshal safe-to-destroy response")
+		return false, errors.Wrapf(err, "failed to unmarshal safe-to-destroy response. %s", string(buf))
 	}
 	if len(output.SafeToDestroy) != 0 && output.SafeToDestroy[0] == osdID {
 		return true, nil
diff --git a/pkg/daemon/ceph/osd/remove.go b/pkg/daemon/ceph/osd/remove.go
index 81ec4863ddb8..afee93a67ef3 100644
--- a/pkg/daemon/ceph/osd/remove.go
+++ b/pkg/daemon/ceph/osd/remove.go
@@ -19,6 +19,7 @@ package osd
 import (
 	"fmt"
 	"strconv"
+	"time"
 
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -31,8 +32,7 @@ import (
 )
 
 // RemoveOSDs purges a list of OSDs from the cluster
-func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osdsToRemove []string, preservePVC bool) error {
-
+func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osdsToRemove []string, preservePVC, forceOSDRemoval bool) error {
 	// Generate the ceph config for running ceph commands similar to the operator
 	if err := client.WriteCephConfig(context, clusterInfo); err != nil {
 		return errors.Wrap(err, "failed to write the ceph config")
@@ -58,8 +58,43 @@ func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osds
 		if status == upStatus {
 			logger.Infof("osd.%d is healthy. It cannot be removed unless it is 'down'", osdID)
 			continue
+		} else {
+			logger.Infof("osd.%d is marked 'DOWN'", osdID)
+		}
+
+		// Check whether we can remove the OSD.
+		// Loop forever until the osd is safe-to-destroy.
+		for {
+			isSafeToDestroy, err := client.OsdSafeToDestroy(context, clusterInfo, osdID)
+			if err != nil {
+				// If we want to force remove the OSD and there was an error, let's break out of
+				// the loop and proceed with the OSD removal
+				if forceOSDRemoval {
+					logger.Errorf("failed to check if osd %d is safe to destroy, but force removal is enabled so proceeding with removal. %v", osdID, err)
+					break
+				} else {
+					logger.Errorf("failed to check if osd %d is safe to destroy, retrying in 1m. %v", osdID, err)
+					time.Sleep(1 * time.Minute)
+					continue
+				}
+			}
+
+			// If no error and the OSD is safe to destroy, we can proceed with the OSD removal
+			if isSafeToDestroy {
+				logger.Infof("osd.%d is safe to destroy, proceeding", osdID)
+				break
+			} else {
+				// If we arrive here and forceOSDRemoval is true, we should proceed with the OSD removal
+				if forceOSDRemoval {
+					logger.Infof("osd.%d is NOT safe to destroy, but force removal is enabled so proceeding with removal", osdID)
+					break
+				}
+				// Else we wait until the OSD can be removed
+				logger.Warningf("osd.%d is NOT safe to destroy, retrying in 1m until success", osdID)
+				time.Sleep(1 * time.Minute)
+			}
 		}
-		logger.Infof("osd.%d is marked 'DOWN'. Removing it", osdID)
+
 		removeOSD(context, clusterInfo, osdID, preservePVC)
 	}
 
@@ -74,6 +109,7 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
 	}
 
 	// Mark the OSD as out.
+	logger.Infof("marking osd.%d out", osdID)
 	args := []string{"osd", "out", fmt.Sprintf("osd.%d", osdID)}
 	_, err = client.NewCephCommand(clusterdContext, clusterInfo, args).Run()
 	if err != nil {
@@ -138,18 +174,21 @@ func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInf
 	}
 
 	// purge the osd
-	purgeosdargs := []string{"osd", "purge", fmt.Sprintf("osd.%d", osdID), "--force", "--yes-i-really-mean-it"}
-	_, err = client.NewCephCommand(clusterdContext, clusterInfo, purgeosdargs).Run()
+	logger.Infof("destroying osd.%d", osdID)
+	purgeOSDArgs := []string{"osd", "destroy", fmt.Sprintf("osd.%d", osdID), "--yes-i-really-mean-it"}
+	_, err = client.NewCephCommand(clusterdContext, clusterInfo, purgeOSDArgs).Run()
 	if err != nil {
 		logger.Errorf("failed to purge osd.%d. %v", osdID, err)
 	}
 
 	// Attempting to remove the parent host. Errors can be ignored if there are other OSDs on the same host
-	hostargs := []string{"osd", "crush", "rm", hostName}
-	_, err = client.NewCephCommand(clusterdContext, clusterInfo, hostargs).Run()
+	logger.Infof("removing osd.%d from ceph", osdID)
+	hostArgs := []string{"osd", "crush", "rm", hostName}
+	_, err = client.NewCephCommand(clusterdContext, clusterInfo, hostArgs).Run()
 	if err != nil {
 		logger.Errorf("failed to remove CRUSH host %q. %v", hostName, err)
 	}
+
 	// call archiveCrash to silence crash warning in ceph health if any
 	archiveCrash(clusterdContext, clusterInfo, osdID)
 
@@ -167,6 +206,7 @@ func archiveCrash(clusterdContext *clusterd.Context, clusterInfo *client.Cluster
 		logger.Info("no ceph crash to silence")
 		return
 	}
+
 	var crashID string
 	for _, c := range crash {
 		if c.Entity == fmt.Sprintf("osd.%d", osdID) {
@@ -174,6 +214,7 @@ func archiveCrash(clusterdContext *clusterd.Context, clusterInfo *client.Cluster
 			break
 		}
 	}
+
 	err = client.ArchiveCrash(clusterdContext, clusterInfo, crashID)
 	if err != nil {
 		logger.Errorf("failed to archive the crash %q. %v", crashID, err)
diff --git a/tests/scripts/collect-logs.sh b/tests/scripts/collect-logs.sh
index f0bf62cebb43..fce61d60f88e 100755
--- a/tests/scripts/collect-logs.sh
+++ b/tests/scripts/collect-logs.sh
@@ -26,9 +26,13 @@ done
 kubectl -n "${OPERATOR_NAMESPACE}" logs deploy/rook-ceph-operator > "${LOG_DIR}"/operator-logs.txt
 kubectl -n "${OPERATOR_NAMESPACE}" get pods -o wide > "${LOG_DIR}"/operator-pods-list.txt
 kubectl -n "${CLUSTER_NAMESPACE}" get pods -o wide > "${LOG_DIR}"/cluster-pods-list.txt
+kubectl -n "${CLUSTER_NAMESPACE}" get jobs -o wide > "${LOG_DIR}"/cluster-jobs-list.txt
 prepare_job="$(kubectl -n "${CLUSTER_NAMESPACE}" get job -l app=rook-ceph-osd-prepare --output name | awk 'FNR <= 1')" # outputs job/<name>
+removal_job="$(kubectl -n "${CLUSTER_NAMESPACE}" get job -l app=rook-ceph-purge-osd --output name | awk 'FNR <= 1')" # outputs job/<name>
 kubectl -n "${CLUSTER_NAMESPACE}" describe "${prepare_job}" > "${LOG_DIR}"/osd-prepare-describe.txt
 kubectl -n "${CLUSTER_NAMESPACE}" logs "${prepare_job}" > "${LOG_DIR}"/osd-prepare-logs.txt
+kubectl -n "${CLUSTER_NAMESPACE}" describe "${removal_job}" > "${LOG_DIR}"/osd-removal-describe.txt
+kubectl -n "${CLUSTER_NAMESPACE}" logs "${removal_job}" > "${LOG_DIR}"/osd-removal-logs.txt
 kubectl -n "${CLUSTER_NAMESPACE}" logs deploy/rook-ceph-osd-0 --all-containers > "${LOG_DIR}"/rook-ceph-osd-0-logs.txt
 kubectl -n "${CLUSTER_NAMESPACE}" logs deploy/rook-ceph-osd-1 --all-containers > "${LOG_DIR}"/rook-ceph-osd-1-logs.txt
 kubectl get all -n "${OPERATOR_NAMESPACE}" -o wide > "${LOG_DIR}"/operator-wide.txt
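
Note for reviewers: a minimal sketch of how the new behaviour can be exercised by hand,
mirroring the CI step added in this patch. It assumes the `rook-ceph` namespace and the
toolbox pod from the Rook examples, and that the purge job pods carry the
`app=rook-ceph-purge-osd` label used by collect-logs.sh above; the manifest must be edited
first, and `--force-osd-removal true` should only be set when the potential data loss is
acceptable.

    # Edit osd-purge.yaml first: set --osd-ids to the OSD(s) to purge (e.g. "1") and,
    # only if data loss is acceptable, set --force-osd-removal to "true".
    kubectl -n rook-ceph create -f cluster/examples/kubernetes/ceph/osd-purge.yaml

    # Follow the removal job; it retries every minute while the OSD is not safe to destroy.
    kubectl -n rook-ceph logs -l app=rook-ceph-purge-osd -f

    # From the toolbox, confirm the OSD eventually shows as 'destroyed' in the CRUSH tree.
    toolbox=$(kubectl -n rook-ceph get pod -l app=rook-ceph-tools -o jsonpath='{.items[0].metadata.name}')
    kubectl -n rook-ceph exec "$toolbox" -- ceph osd tree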