/
osd.go
394 lines (340 loc) · 12.5 KB
/
osd.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/*
Copyright 2016 The Rook Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package client
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"github.com/pkg/errors"
"github.com/rook/rook/pkg/clusterd"
)
// OSDUsage is the decoded form of the "osd df" output fetched by GetOSDUsage.
// Numeric values are kept as json.Number, so callers convert them explicitly.
type OSDUsage struct {
// OSDNodes holds the entries of the "nodes" array.
OSDNodes []OSDNodeUsage `json:"nodes"`
// Summary holds the aggregate values of the "summary" object.
Summary struct {
TotalKB json.Number `json:"total_kb"`
TotalUsedKB json.Number `json:"total_kb_used"`
TotalAvailKB json.Number `json:"total_kb_avail"`
AverageUtil json.Number `json:"average_utilization"`
} `json:"summary"`
}
// OSDNodeUsage is a single entry of OSDUsage.OSDNodes.
// Apart from ID and Name, every value is kept as json.Number and must be
// converted by the caller (Int64/Float64) before use.
type OSDNodeUsage struct {
ID int `json:"id"`
Name string `json:"name"`
CrushWeight json.Number `json:"crush_weight"`
Depth json.Number `json:"depth"`
Reweight json.Number `json:"reweight"`
KB json.Number `json:"kb"`
UsedKB json.Number `json:"kb_used"`
AvailKB json.Number `json:"kb_avail"`
Utilization json.Number `json:"utilization"`
Variance json.Number `json:"var"`
Pgs json.Number `json:"pgs"`
}
// OSDPerfStats is the decoded form of the "osd perf" output fetched by
// GetOSDPerfStats.
type OSDPerfStats struct {
// PerfInfo holds the entries of the "osd_perf_infos" array.
PerfInfo []struct {
ID json.Number `json:"id"`
// Stats holds the latency values of the entry's "perf_stats" object.
Stats struct {
CommitLatency json.Number `json:"commit_latency_ms"`
ApplyLatency json.Number `json:"apply_latency_ms"`
} `json:"perf_stats"`
} `json:"osd_perf_infos"`
}
// OSDDump is the decoded form of the "osd dump" output fetched by GetOSDDump.
type OSDDump struct {
// OSDs holds the entries of the "osds" array; StatusByID reads the
// "osd", "up" and "in" values from it.
OSDs []struct {
OSD json.Number `json:"osd"`
Up json.Number `json:"up"`
In json.Number `json:"in"`
} `json:"osds"`
// Flags is a comma-separated list of flag names (see IsFlagSet).
Flags string `json:"flags"`
// CrushNodeFlags maps a crush unit name to the flags set on it
// (see IsFlagSetOnCrushUnit).
CrushNodeFlags map[string][]string `json:"crush_node_flags"`
}
// IsFlagSet reports whether checkFlag appears as one of the comma-separated
// entries of dump.Flags. The comparison is an exact, case-sensitive match on
// each entry; no whitespace trimming is performed.
func (dump *OSDDump) IsFlagSet(checkFlag string) bool {
	entries := strings.Split(dump.Flags, ",")
	for i := 0; i < len(entries); i++ {
		if entries[i] == checkFlag {
			return true
		}
	}
	return false
}
// IsFlagSetOnCrushUnit reports whether checkFlag is listed in
// dump.CrushNodeFlags for the crush unit named crushUnit.
// It returns false when the unit has no entry or when the map is nil.
func (dump *OSDDump) IsFlagSetOnCrushUnit(checkFlag, crushUnit string) bool {
	// Index the map directly instead of scanning every key; a missing key
	// (or a nil map) yields a nil slice, which the loop simply skips.
	for _, flag := range dump.CrushNodeFlags[crushUnit] {
		if flag == checkFlag {
			return true
		}
	}
	return false
}
// UpdateFlagOnCrushUnit brings flag on crushUnit to the desired state given by
// set, using the state recorded in this dump to avoid redundant commands.
//
// When the dump already matches the desired state nothing is run and
// (false, nil) is returned. Otherwise SetFlagOnCrushUnit or
// UnsetFlagOnCrushUnit is invoked and the boolean is true; note that it is
// true even when that command fails, in which case the error is returned too.
func (dump *OSDDump) UpdateFlagOnCrushUnit(context *clusterd.Context, clusterInfo *ClusterInfo, set bool, crushUnit, flag string) (bool, error) {
	if dump.IsFlagSetOnCrushUnit(flag, crushUnit) == set {
		// Already in the requested state.
		return false, nil
	}

	if set {
		return true, SetFlagOnCrushUnit(context, clusterInfo, crushUnit, flag)
	}
	return true, UnsetFlagOnCrushUnit(context, clusterInfo, crushUnit, flag)
}
// SetFlagOnCrushUnit sets flag on crushUnit by running the
// "osd set-group <flag> <crushUnit>" command built with NewCephCommand.
// It returns a wrapped error naming the flag and the crush unit if the
// command fails, and nil otherwise.
func SetFlagOnCrushUnit(context *clusterd.Context, clusterInfo *ClusterInfo, crushUnit, flag string) error {
	args := []string{"osd", "set-group", flag, crushUnit}
	if _, err := NewCephCommand(context, clusterInfo, args).Run(); err != nil {
		// Argument order follows the message: the flag first, then the unit.
		return errors.Wrapf(err, "failed to set flag %s on %s", flag, crushUnit)
	}
	return nil
}
// UnsetFlagOnCrushUnit clears flag on crushUnit by running the
// "osd unset-group <flag> <crushUnit>" command built with NewCephCommand.
// It returns a wrapped error naming the flag and the crush unit if the
// command fails, and nil otherwise.
func UnsetFlagOnCrushUnit(context *clusterd.Context, clusterInfo *ClusterInfo, crushUnit, flag string) error {
	args := []string{"osd", "unset-group", flag, crushUnit}
	if _, err := NewCephCommand(context, clusterInfo, args).Run(); err != nil {
		// Argument order follows the message: the flag first, then the unit.
		return errors.Wrapf(err, "failed to unset flag %s on %s", flag, crushUnit)
	}
	return nil
}
// SafeToDestroyStatus is the decoded form of the "osd safe-to-destroy"
// response read by OsdSafeToDestroy.
type SafeToDestroyStatus struct {
// SafeToDestroy holds the "safe_to_destroy" array; OsdSafeToDestroy compares
// its first element against the queried OSD ID.
SafeToDestroy []int `json:"safe_to_destroy"`
}
// OsdTree represents the CRUSH hierarchy. It is the decoded form of the
// "osd tree" output fetched by HostTree.
type OsdTree struct {
// Nodes holds the entries of the "nodes" array.
Nodes []struct {
ID int `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
TypeID int `json:"type_id"`
Children []int `json:"children,omitempty"`
// PoolWeights is declared with no fields, so nothing inside
// "pool_weights" is retained when decoding.
PoolWeights struct {
} `json:"pool_weights,omitempty"`
CrushWeight float64 `json:"crush_weight,omitempty"`
Depth int `json:"depth,omitempty"`
Exists int `json:"exists,omitempty"`
Status string `json:"status,omitempty"`
Reweight float64 `json:"reweight,omitempty"`
PrimaryAffinity float64 `json:"primary_affinity,omitempty"`
} `json:"nodes"`
// Stray holds the entries of the "stray" array.
Stray []struct {
ID int `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
TypeID int `json:"type_id"`
CrushWeight float64 `json:"crush_weight"`
Depth int `json:"depth"`
Exists int `json:"exists"`
Status string `json:"status"`
Reweight float64 `json:"reweight"`
PrimaryAffinity float64 `json:"primary_affinity"`
} `json:"stray"`
}
// OsdList is a list of OSD IDs. OsdListNum decodes the "osd ls" output
// into this type.
type OsdList []int
// StatusByID searches dump.OSDs for the entry whose "osd" value equals id and
// returns that entry's "up" and "in" values, in that order.
//
// An error is returned when no entry matches, or when a number cannot be
// parsed as int64. Entries are visited in order, so an unparsable "osd" value
// in an earlier entry aborts the search before later entries are examined.
func (dump *OSDDump) StatusByID(id int64) (int64, int64, error) {
	for idx := range dump.OSDs {
		entry := &dump.OSDs[idx]

		osdID, err := entry.OSD.Int64()
		if err != nil {
			return 0, 0, err
		}
		if osdID != id {
			continue
		}

		// Matching entry: parse "in" first, then "up", as the original did.
		inState, err := entry.In.Int64()
		if err != nil {
			return 0, 0, err
		}
		upState, err := entry.Up.Int64()
		if err != nil {
			return 0, 0, err
		}
		return upState, inState, nil
	}

	return 0, 0, errors.Errorf("not found osd.%d in OSDDump", id)
}
// GetOSDUsage runs the "osd df" command and decodes its JSON output into an
// OSDUsage. It returns nil and a wrapped error if the command fails or the
// output cannot be unmarshalled.
func GetOSDUsage(context *clusterd.Context, clusterInfo *ClusterInfo) (*OSDUsage, error) {
	raw, err := NewCephCommand(context, clusterInfo, []string{"osd", "df"}).Run()
	if err != nil {
		return nil, errors.Wrap(err, "failed to get osd df")
	}

	usage := new(OSDUsage)
	err = json.Unmarshal(raw, usage)
	if err != nil {
		return nil, errors.Wrap(err, "failed to unmarshal osd df response")
	}
	return usage, nil
}
// GetOSDPerfStats runs the "osd perf" command and decodes its JSON output
// into an OSDPerfStats. It returns nil and a wrapped error if the command
// fails or the output cannot be unmarshalled.
func GetOSDPerfStats(context *clusterd.Context, clusterInfo *ClusterInfo) (*OSDPerfStats, error) {
	raw, err := NewCephCommand(context, clusterInfo, []string{"osd", "perf"}).Run()
	if err != nil {
		return nil, errors.Wrap(err, "failed to get osd perf")
	}

	stats := new(OSDPerfStats)
	err = json.Unmarshal(raw, stats)
	if err != nil {
		return nil, errors.Wrap(err, "failed to unmarshal osd perf response")
	}
	return stats, nil
}
// GetOSDDump runs the "osd dump" command and decodes its JSON output into an
// OSDDump. It returns nil and a wrapped error if the command fails or the
// output cannot be unmarshalled.
func GetOSDDump(context *clusterd.Context, clusterInfo *ClusterInfo) (*OSDDump, error) {
	raw, err := NewCephCommand(context, clusterInfo, []string{"osd", "dump"}).Run()
	if err != nil {
		return nil, errors.Wrap(err, "failed to get osd dump")
	}

	dump := new(OSDDump)
	err = json.Unmarshal(raw, dump)
	if err != nil {
		return nil, errors.Wrap(err, "failed to unmarshal osd dump response")
	}
	return dump, nil
}
// OSDOut runs the "osd out <osdID>" command and returns its output as a
// string together with the command's error, unwrapped. The output is returned
// even when the error is non-nil.
func OSDOut(context *clusterd.Context, clusterInfo *ClusterInfo, osdID int) (string, error) {
	idArg := strconv.Itoa(osdID)
	cmd := NewCephCommand(context, clusterInfo, []string{"osd", "out", idArg})
	output, err := cmd.Run()
	return string(output), err
}
// OsdSafeToDestroy runs the "osd safe-to-destroy <osdID>" command and reports
// whether the first entry of the response's "safe_to_destroy" list is osdID.
// It returns false and a wrapped error if the command fails or the output
// cannot be unmarshalled; the unmarshal error includes the raw output.
func OsdSafeToDestroy(context *clusterd.Context, clusterInfo *ClusterInfo, osdID int) (bool, error) {
	idArg := strconv.Itoa(osdID)
	raw, err := NewCephCommand(context, clusterInfo, []string{"osd", "safe-to-destroy", idArg}).Run()
	if err != nil {
		return false, errors.Wrap(err, "failed to get safe-to-destroy status")
	}

	var status SafeToDestroyStatus
	err = json.Unmarshal(raw, &status)
	if err != nil {
		return false, errors.Wrapf(err, "failed to unmarshal safe-to-destroy response. %s", string(raw))
	}

	// Only the first reported ID is compared with the queried one.
	safe := len(status.SafeToDestroy) > 0 && status.SafeToDestroy[0] == osdID
	return safe, nil
}
// HostTree runs the "osd tree" command and decodes its JSON output into an
// OsdTree. On failure it returns a wrapped error together with whatever the
// tree value holds at that point (the zero value if the command itself failed).
func HostTree(context *clusterd.Context, clusterInfo *ClusterInfo) (OsdTree, error) {
	var tree OsdTree

	raw, err := NewCephCommand(context, clusterInfo, []string{"osd", "tree"}).Run()
	if err != nil {
		return tree, errors.Wrap(err, "failed to get osd tree")
	}

	if err := json.Unmarshal(raw, &tree); err != nil {
		return tree, errors.Wrap(err, "failed to unmarshal 'osd tree' response")
	}
	return tree, nil
}
// OsdListNum runs the "osd ls" command and decodes its JSON output into an
// OsdList. On failure it returns a wrapped error together with whatever the
// list holds at that point (nil if the command itself failed).
func OsdListNum(context *clusterd.Context, clusterInfo *ClusterInfo) (OsdList, error) {
	var ids OsdList

	raw, err := NewCephCommand(context, clusterInfo, []string{"osd", "ls"}).Run()
	if err != nil {
		return ids, errors.Wrap(err, "failed to get osd list")
	}

	if err := json.Unmarshal(raw, &ids); err != nil {
		return ids, errors.Wrap(err, "failed to unmarshal 'osd ls' response")
	}
	return ids, nil
}
// OSDDeviceClass reports the device class of a single OSD. It is one element
// of the "osd crush get-device-class" response decoded by OSDDeviceClasses.
type OSDDeviceClass struct {
// ID is read from the "osd" key of the entry.
ID int `json:"osd"`
DeviceClass string `json:"device_class,omitempty"`
}
// OSDDeviceClasses runs "osd crush get-device-class" with osdIds appended as
// arguments and decodes the JSON output into a slice of OSDDeviceClass.
// On failure it returns a wrapped error together with whatever the slice
// holds at that point (nil if the command itself failed).
func OSDDeviceClasses(context *clusterd.Context, clusterInfo *ClusterInfo, osdIds []string) ([]OSDDeviceClass, error) {
	var classes []OSDDeviceClass

	cmdArgs := append([]string{"osd", "crush", "get-device-class"}, osdIds...)
	raw, err := NewCephCommand(context, clusterInfo, cmdArgs).Run()
	if err != nil {
		return classes, errors.Wrap(err, "failed to get device-class info")
	}

	if err := json.Unmarshal(raw, &classes); err != nil {
		return classes, errors.Wrap(err, "failed to unmarshal 'osd crush get-device-class' response")
	}
	return classes, nil
}
// OSDOkToStopStats reports detailed information about which OSDs are okay to
// stop. It is the decoded form of the "osd ok-to-stop" response read by
// OSDOkToStop, which returns the OSDs field to its caller.
type OSDOkToStopStats struct {
OkToStop bool `json:"ok_to_stop"`
// OSDs holds the "osds" array of OSD IDs.
OSDs []int `json:"osds"`
NumOkPGs int `json:"num_ok_pgs"`
NumNotOkPGs int `json:"num_not_ok_pgs"`
BadBecomeInactive []string `json:"bad_become_inactive"`
OkBecomeDegraded []string `json:"ok_become_degraded"`
}
// OSDOkToStop returns a list of OSDs that can be stopped that includes the OSD ID given.
// This is relevant, for example, when checking which OSDs can be updated.
// The number of OSDs returned is limited by the value set in maxReturned.
// maxReturned=0 is the same as maxReturned=1.
//
// When the cluster's Ceph version is older than Pacific, the --max option is
// not passed and a successful command yields a slice holding only osdID.
// A command failure yields an empty slice and a wrapped error. If the command
// succeeds but its output cannot be unmarshalled, both a slice holding osdID
// and a wrapped error are returned, so the caller may choose to recover.
func OSDOkToStop(context *clusterd.Context, clusterInfo *ClusterInfo, osdID, maxReturned int) ([]int, error) {
	// Only Pacific and newer are asked for (and answer with) a list of OSD IDs.
	listSupported := clusterInfo.CephVersion.IsAtLeastPacific()

	cmdArgs := []string{"osd", "ok-to-stop", strconv.Itoa(osdID)}
	if listSupported {
		// NOTE: if the number of OSD IDs given in the CLI arg query is Q and --max=N is given, if
		// N < Q, Ceph treats the query as though max=Q instead, always returning at least Q OSDs.
		cmdArgs = append(cmdArgs, fmt.Sprintf("--max=%d", maxReturned))
	}

	raw, err := NewCephCommand(context, clusterInfo, cmdArgs).Run()
	if err != nil {
		// Either the OSD is not ok to stop, or the command itself failed.
		return []int{}, errors.Wrapf(err, "OSD %d is not ok to stop", osdID)
	}
	if !listSupported {
		// No list in the output: the queried OSD is the only known answer.
		return []int{osdID}, nil
	}

	var result OSDOkToStopStats
	if err := json.Unmarshal(raw, &result); err != nil {
		// The command succeeded, so the given OSD is known to be ok to stop.
		// The error is still surfaced so that breaking changes to the JSON
		// output are noticed, while the OSD ID is returned alongside it for
		// callers that want to carry on.
		return []int{osdID}, errors.Wrapf(err, "failed to unmarshal 'osd ok-to-stop %d' response", osdID)
	}
	return result.OSDs, nil
}
// SetPrimaryAffinity assigns primary-affinity (within range [0.0, 1.0]) to a specific OSD.
// It runs "osd primary-affinity osd.<osdID> <affinity>", logging before the
// command and again after it succeeds. The affinity string is passed through
// as given; this function does not validate it. A wrapped error is returned
// if the command fails.
func SetPrimaryAffinity(context *clusterd.Context, clusterInfo *ClusterInfo, osdID int, affinity string) error {
	logger.Infof("setting osd.%d with primary-affinity %q", osdID, affinity)

	osdName := fmt.Sprintf("osd.%d", osdID)
	cmd := NewCephCommand(context, clusterInfo, []string{"osd", "primary-affinity", osdName, affinity})
	if _, err := cmd.Run(); err != nil {
		return errors.Wrapf(err, "failed to set osd.%d with primary-affinity %q", osdID, affinity)
	}

	logger.Infof("successfully applied osd.%d primary-affinity %q", osdID, affinity)
	return nil
}