-
Notifications
You must be signed in to change notification settings - Fork 2.7k
/
remove.go
222 lines (198 loc) · 8.54 KB
/
remove.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/*
Copyright 2020 The Rook Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package osd
import (
"fmt"
"strconv"
"time"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/pkg/errors"
"github.com/rook/rook/pkg/clusterd"
"github.com/rook/rook/pkg/daemon/ceph/client"
"github.com/rook/rook/pkg/operator/ceph/cluster/osd"
"github.com/rook/rook/pkg/operator/k8sutil"
)
// RemoveOSDs purges a list of OSDs from the cluster.
// For each OSD ID in osdsToRemove it verifies the OSD is 'down' (up OSDs are
// skipped), then waits in a retry loop until Ceph reports the OSD safe to
// destroy — unless forceOSDRemoval is set, in which case removal proceeds
// immediately on error or unsafe status. preservePVC is forwarded to
// removeOSD and controls whether the backing PVC is detached or deleted.
func RemoveOSDs(context *clusterd.Context, clusterInfo *client.ClusterInfo, osdsToRemove []string, preservePVC, forceOSDRemoval bool) error {
	// Generate the ceph config for running ceph commands similar to the operator
	if err := client.WriteCephConfig(context, clusterInfo); err != nil {
		return errors.Wrap(err, "failed to write the ceph config")
	}

	osdDump, err := client.GetOSDDump(context, clusterInfo)
	if err != nil {
		return errors.Wrap(err, "failed to get osd dump")
	}

	for _, osdIDStr := range osdsToRemove {
		osdID, err := strconv.Atoi(osdIDStr)
		if err != nil {
			// Skip unparseable IDs rather than aborting the whole batch.
			logger.Errorf("invalid OSD ID: %s. %v", osdIDStr, err)
			continue
		}

		logger.Infof("validating status of osd.%d", osdID)
		status, _, err := osdDump.StatusByID(int64(osdID))
		if err != nil {
			return errors.Wrapf(err, "failed to get osd status for osd %d", osdID)
		}

		const upStatus int64 = 1
		if status == upStatus {
			logger.Infof("osd.%d is healthy. It cannot be removed unless it is 'down'", osdID)
			continue
		}
		logger.Infof("osd.%d is marked 'DOWN'", osdID)

		// Check we can remove the OSD.
		// Loop forever until the osd is safe-to-destroy (or force removal short-circuits).
		for {
			isSafeToDestroy, err := client.OsdSafeToDestroy(context, clusterInfo, osdID)
			if err != nil {
				// If we want to force remove the OSD and there was an error, break
				// out of the loop and proceed with the OSD removal anyway.
				if forceOSDRemoval {
					logger.Errorf("failed to check if osd %d is safe to destroy, but force removal is enabled so proceeding with removal. %v", osdID, err)
					break
				}
				logger.Errorf("failed to check if osd %d is safe to destroy, retrying in 1m. %v", osdID, err)
				time.Sleep(1 * time.Minute)
				continue
			}

			// If no error and the OSD is safe to destroy, we can proceed with the OSD removal.
			if isSafeToDestroy {
				logger.Infof("osd.%d is safe to destroy, proceeding", osdID)
				break
			}
			// If we arrive here and forceOSDRemoval is true, we should proceed with the OSD removal.
			if forceOSDRemoval {
				logger.Infof("osd.%d is NOT ok to destroy but force removal is enabled so proceeding with removal", osdID)
				break
			}
			// Else we wait until the OSD can be removed.
			logger.Warningf("osd.%d is NOT ok to destroy, retrying in 1m until success", osdID)
			time.Sleep(1 * time.Minute)
		}
		removeOSD(context, clusterInfo, osdID, preservePVC)
	}

	return nil
}
// removeOSD removes a single OSD from the Ceph cluster and cleans up its
// Kubernetes resources: it marks the OSD 'out', deletes the OSD deployment
// and any osd-prepare job, deletes or detaches the backing PVC (per
// preservePVC), purges the OSD from Ceph, attempts to remove its CRUSH host,
// and archives any matching crash report. All failures are logged and the
// removal continues best-effort; nothing is returned.
func removeOSD(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInfo, osdID int, preservePVC bool) {
	// Get the host where the OSD is found. On failure hostName stays empty and
	// the later "crush rm" will simply fail and be logged.
	hostName, err := client.GetCrushHostName(clusterdContext, clusterInfo, osdID)
	if err != nil {
		logger.Errorf("failed to get the host where osd.%d is running. %v", osdID, err)
	}

	// Mark the OSD as out.
	logger.Infof("marking osd.%d out", osdID)
	args := []string{"osd", "out", fmt.Sprintf("osd.%d", osdID)}
	_, err = client.NewCephCommand(clusterdContext, clusterInfo, args).Run()
	if err != nil {
		logger.Errorf("failed to exclude osd.%d out of the crush map. %v", osdID, err)
	}

	// Remove the OSD deployment
	deploymentName := fmt.Sprintf("rook-ceph-osd-%d", osdID)
	deployment, err := clusterdContext.Clientset.AppsV1().Deployments(clusterInfo.Namespace).Get(clusterInfo.Context, deploymentName, metav1.GetOptions{})
	if err != nil {
		logger.Errorf("failed to fetch the deployment %q. %v", deploymentName, err)
	} else {
		logger.Infof("removing the OSD deployment %q", deploymentName)
		if err := k8sutil.DeleteDeployment(clusterInfo.Context, clusterdContext.Clientset, clusterInfo.Namespace, deploymentName); err != nil {
			// Continue purging the OSD even if the deployment fails to be deleted
			logger.Errorf("failed to delete deployment for OSD %d. %v", osdID, err)
		}
		if pvcName, ok := deployment.GetLabels()[osd.OSDOverPVCLabelKey]; ok {
			labelSelector := fmt.Sprintf("%s=%s", osd.OSDOverPVCLabelKey, pvcName)
			prepareJobList, err := clusterdContext.Clientset.BatchV1().Jobs(clusterInfo.Namespace).List(clusterInfo.Context, metav1.ListOptions{LabelSelector: labelSelector})
			if err != nil && !kerrors.IsNotFound(err) {
				logger.Errorf("failed to list osd prepare jobs with pvc %q. %v ", pvcName, err)
			}
			// Remove osd prepare job
			for _, prepareJob := range prepareJobList.Items {
				logger.Infof("removing the osd prepare job %q", prepareJob.GetName())
				if err := k8sutil.DeleteBatchJob(clusterInfo.Context, clusterdContext.Clientset, clusterInfo.Namespace, prepareJob.GetName(), false); err != nil {
					// Continue with the cleanup even if the job fails to be deleted
					logger.Errorf("failed to delete prepare job for osd %q. %v", prepareJob.GetName(), err)
				}
			}
			if preservePVC {
				// Detach the OSD PVC from Rook. We will continue OSD deletion even if failed to remove PVC label
				logger.Infof("detach the OSD PVC %q from Rook", pvcName)
				if pvc, err := clusterdContext.Clientset.CoreV1().PersistentVolumeClaims(clusterInfo.Namespace).Get(clusterInfo.Context, pvcName, metav1.GetOptions{}); err != nil {
					logger.Errorf("failed to get pvc for OSD %q. %v", pvcName, err)
				} else {
					// Removing the device-set label detaches the PVC from Rook's management.
					labels := pvc.GetLabels()
					delete(labels, osd.CephDeviceSetPVCIDLabelKey)
					pvc.SetLabels(labels)
					if _, err := clusterdContext.Clientset.CoreV1().PersistentVolumeClaims(clusterInfo.Namespace).Update(clusterInfo.Context, pvc, metav1.UpdateOptions{}); err != nil {
						logger.Errorf("failed to remove label %q from pvc for OSD %q. %v", osd.CephDeviceSetPVCIDLabelKey, pvcName, err)
					}
				}
			} else {
				// Remove the OSD PVC
				logger.Infof("removing the OSD PVC %q", pvcName)
				if err := clusterdContext.Clientset.CoreV1().PersistentVolumeClaims(clusterInfo.Namespace).Delete(clusterInfo.Context, pvcName, metav1.DeleteOptions{}); err != nil {
					// Continue deleting the OSD even if PVC deletion fails
					logger.Errorf("failed to delete pvc for OSD %q. %v", pvcName, err)
				}
			}
		} else {
			logger.Infof("did not find a pvc name to remove for osd %q", deploymentName)
		}
	}

	// purge the osd
	logger.Infof("purging osd.%d", osdID)
	purgeOSDArgs := []string{"osd", "purge", fmt.Sprintf("osd.%d", osdID), "--force", "--yes-i-really-mean-it"}
	_, err = client.NewCephCommand(clusterdContext, clusterInfo, purgeOSDArgs).Run()
	if err != nil {
		logger.Errorf("failed to purge osd.%d. %v", osdID, err)
	}

	// Attempting to remove the parent host. Errors can be ignored if there are other OSDs on the same host.
	// NOTE: the original logged the integer osdID here with %q; log the host name instead.
	logger.Infof("attempting to remove host %q from crush map if not in use", hostName)
	hostArgs := []string{"osd", "crush", "rm", hostName}
	_, err = client.NewCephCommand(clusterdContext, clusterInfo, hostArgs).Run()
	if err != nil {
		logger.Infof("failed to remove CRUSH host %q. %v", hostName, err)
	}

	// call archiveCrash to silence crash warning in ceph health if any
	archiveCrash(clusterdContext, clusterInfo, osdID)

	logger.Infof("completed removal of OSD %d", osdID)
}
// archiveCrash silences the ceph health crash warning for the given OSD, if
// any, by archiving the crash entry whose entity matches "osd.<osdID>".
// Failures are logged, not returned.
func archiveCrash(clusterdContext *clusterd.Context, clusterInfo *client.ClusterInfo, osdID int) {
	// The ceph health warning should be silenced by archiving the crash
	crash, err := client.GetCrash(clusterdContext, clusterInfo)
	if err != nil {
		logger.Errorf("failed to list ceph crash. %v", err)
		return
	}
	// BUG FIX: the original condition was `crash != nil`, which returned
	// ("no ceph crash to silence") exactly when crashes WERE present and fell
	// through to archive an empty crash ID when there were none.
	if len(crash) == 0 {
		logger.Info("no ceph crash to silence")
		return
	}

	// Find the crash entry reported by this OSD.
	var crashID string
	for _, c := range crash {
		if c.Entity == fmt.Sprintf("osd.%d", osdID) {
			crashID = c.ID
			break
		}
	}
	// No crash belongs to this OSD; avoid archiving with an empty ID.
	if crashID == "" {
		logger.Infof("no ceph crash to silence for osd.%d", osdID)
		return
	}

	err = client.ArchiveCrash(clusterdContext, clusterInfo, crashID)
	if err != nil {
		logger.Errorf("failed to archive the crash %q. %v", crashID, err)
	}
}