From eadcd757b3da1d503f0764163f99a030f7dd3989 Mon Sep 17 00:00:00 2001 From: Blaine Gardner Date: Mon, 4 Oct 2021 13:06:46 -0600 Subject: [PATCH 1/2] rgw: replace period update --commit with function Replace calls to 'radosgw-admin period update --commit' with an idempotent function. Resolves #8879 Signed-off-by: Blaine Gardner --- pkg/operator/ceph/object/admin.go | 115 ++++ pkg/operator/ceph/object/admin_test.go | 572 +++++++++++++++++++ pkg/operator/ceph/object/controller.go | 5 +- pkg/operator/ceph/object/controller_test.go | 83 ++- pkg/operator/ceph/object/objectstore.go | 48 +- pkg/operator/ceph/object/objectstore_test.go | 178 +++--- 6 files changed, 858 insertions(+), 143 deletions(-) diff --git a/pkg/operator/ceph/object/admin.go b/pkg/operator/ceph/object/admin.go index dbc271244168..f1c0c1f3dae0 100644 --- a/pkg/operator/ceph/object/admin.go +++ b/pkg/operator/ceph/object/admin.go @@ -18,13 +18,16 @@ package object import ( "context" + "encoding/json" "fmt" "net/http" "net/http/httputil" "regexp" + "strings" "github.com/ceph/go-ceph/rgw/admin" "github.com/coreos/pkg/capnslog" + "github.com/google/go-cmp/cmp" "github.com/pkg/errors" cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" "github.com/rook/rook/pkg/clusterd" @@ -294,6 +297,118 @@ func isInvalidFlagError(err error) bool { return exitCode == 22 } +// CommitConfigChanges commits changes to RGW configs for realm/zonegroup/zone changes idempotently. +// Under the hood, this updates the RGW config period and commits the change if changes are detected. +func CommitConfigChanges(c *Context) error { + currentPeriod, err := runAdminCommand(c, true, "period", "get") + if err != nil { + return errorOrIsNotFound(err, "failed to get the current RGW configuration period to see if it needs changed") + } + + // this stages the current config changees and returns what the new period config will look like + // without committing the changes + stagedPeriod, err := runAdminCommand(c, true, "period", "update") + if err != nil { + return errorOrIsNotFound(err, "failed to stage the current RGW configuration period") + } + + shouldCommit, err := periodWillChange(currentPeriod, stagedPeriod) + if err != nil { + return errors.Wrap(err, "failed to determine if the staged RGW configuration period is different from current") + } + + // DO NOT MODIFY nsName here. It is part of the integration test checks noted below. + nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name) + if !shouldCommit { + logger.Debugf("not committing changes to RGW configuration period for CephObjectStore %q", nsName) + return nil + } + // don't expect json output since we don't intend to use the output from the command + _, err = runAdminCommand(c, false, "period", "update", "--commit") + if err != nil { + return errorOrIsNotFound(err, "failed to commit RGW configuration period changes") + } + + return nil +} + +// return true if the configuration period will change if the staged period is committed +func periodWillChange(current, staged string) (bool, error) { + // Rook wants to check if there are any differences in the current period versus the period that + // is staged to be applied/committed. If there are differences, then Rook should "commit" the + // staged period changes to instruct RGWs to update their runtime configuration. + // + // For many RGW interactions, Rook often creates a typed struct to unmarshal RGW JSON output + // into. In those cases Rook is able to opt in to only a small subset of specific fields it + // needs. 
This keeps the coupling between Rook and RGW JSON output as loose as possible while + // still being specific enough for Rook to operate. + // + // For this implementation, we could use a strongly-typed struct here to unmarshal into, and we + // could use DisallowUnknownFields() to cause an error if the RGW JSON output changes to flag + // when the existing implementation might be invalidated. This relies on an extremely tight + // coupling between Rook and the JSON output from RGW. The failure mode of this implementation + // is to return an error from the reconcile when there are unmarshalling errors, which results + // in CephObjectStores that could not be updated if a version of Ceph changes the RGW output. + // + // In the chosen implementation, we unmarshal into "dumb" data structures that create a loose + // coupling. With these, we must ignore the fields that we have observed to change between the + // current and staged periods when we should *not* commit an un-changed period. The failure mode + // of this implementation is that if the RGW output changes its structure, Rook may detect + // differences where there are none. This would result in Rook committing the period more often + // than necessary. Committing the period results in a short period of downtime while RGWs reload + // their configuration, but we opt for this inconvenience in lieu of blocking reconciliation. + // + // For any implementation, if the RGW changes the behavior of its output but not the structure, + // Rook could commit unnecessary period changes or fail to commit necessary period changes + // depending on how the RGW output has changed. Rook cannot detect this class of failures, and + // the behavior cannot be specifically known. + var currentJSON map[string]interface{} + var stagedJSON map[string]interface{} + var err error + + err = json.Unmarshal([]byte(current), ¤tJSON) + if err != nil { + return true, errors.Wrap(err, "failed to unmarshal current RGW configuration period") + } + err = json.Unmarshal([]byte(staged), &stagedJSON) + if err != nil { + return true, errors.Wrap(err, "failed to unmarshal staged RGW configuration period") + } + + // There are some values in the periods that we don't care to diff because they are always + // different in the staged period, even when no updates are needed. Sometimes, the values are + // reported as different in the staging output but aren't actually changed upon commit. + ignorePaths := cmp.FilterPath(func(path cmp.Path) bool { + // path.String() outputs nothing for the crude map[string]interface{} JSON struct + // Example of path.GoString() output for a long path in the period JSON: + // root["period_map"].(map[string]interface {})["short_zone_ids"].([]interface {})[0].(map[string]interface {})["val"].(float64) + switch path.GoString() { + case `root["id"]`: + // "id" is always changed in the staged period, but it doesn't always update. + return true + case `root["predecessor_uuid"]`: + // "predecessor_uuid" is always changed in the staged period, but it doesn't always update. + return true + case `root["realm_epoch"]`: + // "realm_epoch" is always incremented in the staged period, but it doesn't always increment. + return true + case `root["epoch"]`: + // Strangely, "epoch" is not incremented in the staged period even though it is always + // incremented upon an actual commit. It could be argued that this behavior is a bug. + // Ignore this value to handle the possibility that the behavior changes in the future. 
+ return true + default: + return false + } + }, cmp.Ignore()) + + diff := cmp.Diff(currentJSON, stagedJSON, ignorePaths) + diff = strings.TrimSpace(diff) + logger.Debugf("RGW config period diff:\n%s", diff) + + return (diff != ""), nil +} + func GetAdminOPSUserCredentials(objContext *Context, spec *cephv1.ObjectStoreSpec) (string, string, error) { ns := objContext.clusterInfo.Namespace diff --git a/pkg/operator/ceph/object/admin_test.go b/pkg/operator/ceph/object/admin_test.go index 5e05060b2342..30b84e23ee51 100644 --- a/pkg/operator/ceph/object/admin_test.go +++ b/pkg/operator/ceph/object/admin_test.go @@ -21,6 +21,7 @@ import ( "testing" "time" + "github.com/pkg/errors" v1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" "github.com/rook/rook/pkg/clusterd" "github.com/rook/rook/pkg/daemon/ceph/client" @@ -158,3 +159,574 @@ func TestRunAdminCommandNoMultisite(t *testing.T) { assert.EqualError(t, err, "no pods found with selector \"rook-ceph-mgr\"") }) } + +func TestCommitConfigChanges(t *testing.T) { + // control the return values from calling get/update on period + type commandReturns struct { + periodGetOutput string // empty implies error + periodUpdateOutput string // empty implies error + periodCommitError bool + } + + // control whether we should expect certain 'get' calls + type expectCommands struct { + // note: always expect period get to be called + periodUpdate bool + periodCommit bool + } + + // vars used to check if commands were called + var ( + periodGetCalled = false + periodUpdateCalled = false + periodCommitCalled = false + ) + + setupTest := func(returns commandReturns) *clusterd.Context { + // reset vars for checking if commands were called + periodGetCalled = false + periodUpdateCalled = false + periodCommitCalled = false + + executor := &exectest.MockExecutor{ + MockExecuteCommandWithTimeout: func(timeout time.Duration, command string, args ...string) (string, error) { + if command == "radosgw-admin" { + if args[0] == "period" { + if args[1] == "get" { + periodGetCalled = true + if returns.periodGetOutput == "" { + return "", errors.New("fake period get error") + } + return returns.periodGetOutput, nil + } + if args[1] == "update" { + if args[2] == "--commit" { + periodCommitCalled = true + if returns.periodCommitError { + return "", errors.New("fake period update --commit error") + } + return "", nil // success + } + periodUpdateCalled = true + if returns.periodUpdateOutput == "" { + return "", errors.New("fake period update (no --commit) error") + } + return returns.periodUpdateOutput, nil + } + } + } + + t.Fatalf("unhandled command: %s %v", command, args) + panic("unhandled command") + }, + } + + return &clusterd.Context{ + Executor: executor, + } + } + + expectNoErr := false // want no error + expectErr := true // want an error + + tests := []struct { + name string + commandReturns commandReturns + expectCommands expectCommands + wantErr bool + }{ + // a bit more background: creating a realm creates the first period epoch. When Rook creates + // zonegroup and zone, it results in many changes to the period. 
+ {"real-world first reconcile (many changes, should commit period)", + commandReturns{ + periodGetOutput: firstPeriodGet, + periodUpdateOutput: firstPeriodUpdate, + }, + expectCommands{ + periodUpdate: true, + periodCommit: true, + }, + expectNoErr, + }, + // note: this also tests that we support the output changing in the future to increment "epoch" + {"real-world second reconcile (no changes, should not commit period)", + commandReturns{ + periodGetOutput: secondPeriodGet, + periodUpdateOutput: secondPeriodUpdateWithoutChanges, + }, + expectCommands{ + periodUpdate: true, + periodCommit: false, + }, + expectNoErr, + }, + {"second reconcile with changes", + commandReturns{ + periodGetOutput: secondPeriodGet, + periodUpdateOutput: secondPeriodUpdateWithChanges, + }, + expectCommands{ + periodUpdate: true, + periodCommit: true, + }, + expectNoErr, + }, + {"invalid get json", + commandReturns{ + periodGetOutput: `{"ids": [}`, // json obj with incomplete array that won't parse + periodUpdateOutput: firstPeriodUpdate, + }, + expectCommands{ + periodUpdate: true, + periodCommit: false, + }, + expectErr, + }, + {"invalid update json", + commandReturns{ + periodGetOutput: firstPeriodGet, + periodUpdateOutput: `{"ids": [}`, + }, + expectCommands{ + periodUpdate: true, + periodCommit: false, + }, + expectErr, + }, + {"fail period get", + commandReturns{ + periodGetOutput: "", // error + periodUpdateOutput: firstPeriodUpdate, + }, + expectCommands{ + periodUpdate: false, + periodCommit: false, + }, + expectErr, + }, + {"fail period update", + commandReturns{ + periodGetOutput: firstPeriodGet, + periodUpdateOutput: "", // error + }, + expectCommands{ + periodUpdate: true, + periodCommit: false, + }, + expectErr, + }, + {"fail period commit", + commandReturns{ + periodGetOutput: firstPeriodGet, + periodUpdateOutput: firstPeriodUpdate, + periodCommitError: true, + }, + expectCommands{ + periodUpdate: true, + periodCommit: true, + }, + expectErr, + }, + {"configs are removed", + commandReturns{ + periodGetOutput: secondPeriodUpdateWithChanges, + periodUpdateOutput: secondPeriodUpdateWithoutChanges, + }, + expectCommands{ + periodUpdate: true, + periodCommit: true, + }, + expectNoErr, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := setupTest(tt.commandReturns) + objCtx := NewContext(ctx, &client.ClusterInfo{Namespace: "my-cluster"}, "my-store") + + err := CommitConfigChanges(objCtx) + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + assert.True(t, periodGetCalled) + assert.Equal(t, tt.expectCommands.periodUpdate, periodUpdateCalled) + assert.Equal(t, tt.expectCommands.periodCommit, periodCommitCalled) + }) + } +} + +// example real-world output from 'radosgw-admin period get' after initial realm, zonegroup, and +// zone creation and before 'radosgw-admin period update --commit' +const firstPeriodGet = `{ + "id": "5338e008-26db-4013-92f5-c51505a917e2", + "epoch": 1, + "predecessor_uuid": "", + "sync_status": [], + "period_map": { + "id": "5338e008-26db-4013-92f5-c51505a917e2", + "zonegroups": [], + "short_zone_ids": [] + }, + "master_zonegroup": "", + "master_zone": "", + "period_config": { + "bucket_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + }, + "user_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + } + }, + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "realm_name": "my-store", + 
"realm_epoch": 1 +}` + +// example real-world output from 'radosgw-admin period update' after initial realm, zonegroup, and +// zone creation and before 'radosgw-admin period update --commit' +const firstPeriodUpdate = `{ + "id": "94ba560d-a560-431d-8ed4-85a2891f9122:staging", + "epoch": 1, + "predecessor_uuid": "5338e008-26db-4013-92f5-c51505a917e2", + "sync_status": [], + "period_map": { + "id": "5338e008-26db-4013-92f5-c51505a917e2", + "zonegroups": [ + { + "id": "1580fd1d-a065-4484-82ff-329e9a779999", + "name": "my-store", + "api_name": "my-store", + "is_master": "true", + "endpoints": [ + "http://10.105.59.166:80" + ], + "hostnames": [], + "hostnames_s3website": [], + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "zones": [ + { + "id": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "name": "my-store", + "endpoints": [ + "http://10.105.59.166:80" + ], + "log_meta": "false", + "log_data": "false", + "bucket_index_max_shards": 11, + "read_only": "false", + "tier_type": "", + "sync_from_all": "true", + "sync_from": [], + "redirect_zone": "" + } + ], + "placement_targets": [ + { + "name": "default-placement", + "tags": [], + "storage_classes": [ + "STANDARD" + ] + } + ], + "default_placement": "default-placement", + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "sync_policy": { + "groups": [] + } + } + ], + "short_zone_ids": [ + { + "key": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "val": 1698422904 + } + ] + }, + "master_zonegroup": "1580fd1d-a065-4484-82ff-329e9a779999", + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "period_config": { + "bucket_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + }, + "user_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + } + }, + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "realm_name": "my-store", + "realm_epoch": 2 +}` + +// example real-world output from 'radosgw-admin period get' after the first period commit +const secondPeriodGet = `{ + "id": "600c23a6-2452-4fc0-96b4-0c78b9b7c439", + "epoch": 1, + "predecessor_uuid": "5338e008-26db-4013-92f5-c51505a917e2", + "sync_status": [], + "period_map": { + "id": "600c23a6-2452-4fc0-96b4-0c78b9b7c439", + "zonegroups": [ + { + "id": "1580fd1d-a065-4484-82ff-329e9a779999", + "name": "my-store", + "api_name": "my-store", + "is_master": "true", + "endpoints": [ + "http://10.105.59.166:80" + ], + "hostnames": [], + "hostnames_s3website": [], + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "zones": [ + { + "id": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "name": "my-store", + "endpoints": [ + "http://10.105.59.166:80" + ], + "log_meta": "false", + "log_data": "false", + "bucket_index_max_shards": 11, + "read_only": "false", + "tier_type": "", + "sync_from_all": "true", + "sync_from": [], + "redirect_zone": "" + } + ], + "placement_targets": [ + { + "name": "default-placement", + "tags": [], + "storage_classes": [ + "STANDARD" + ] + } + ], + "default_placement": "default-placement", + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "sync_policy": { + "groups": [] + } + } + ], + "short_zone_ids": [ + { + "key": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "val": 1698422904 + } + ] + }, + "master_zonegroup": "1580fd1d-a065-4484-82ff-329e9a779999", + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "period_config": { + "bucket_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": 
-1 + }, + "user_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + } + }, + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "realm_name": "my-store", + "realm_epoch": 2 +}` + +// example real-world output from 'radosgw-admin period update' after the first period commit, +// and with no changes since the first commit +// note: output was modified to increment the epoch to make sure this code works in case the "epoch" +// behavior changes in radosgw-admin in the future +const secondPeriodUpdateWithoutChanges = `{ + "id": "94ba560d-a560-431d-8ed4-85a2891f9122:staging", + "epoch": 2, + "predecessor_uuid": "600c23a6-2452-4fc0-96b4-0c78b9b7c439", + "sync_status": [], + "period_map": { + "id": "600c23a6-2452-4fc0-96b4-0c78b9b7c439", + "zonegroups": [ + { + "id": "1580fd1d-a065-4484-82ff-329e9a779999", + "name": "my-store", + "api_name": "my-store", + "is_master": "true", + "endpoints": [ + "http://10.105.59.166:80" + ], + "hostnames": [], + "hostnames_s3website": [], + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "zones": [ + { + "id": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "name": "my-store", + "endpoints": [ + "http://10.105.59.166:80" + ], + "log_meta": "false", + "log_data": "false", + "bucket_index_max_shards": 11, + "read_only": "false", + "tier_type": "", + "sync_from_all": "true", + "sync_from": [], + "redirect_zone": "" + } + ], + "placement_targets": [ + { + "name": "default-placement", + "tags": [], + "storage_classes": [ + "STANDARD" + ] + } + ], + "default_placement": "default-placement", + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "sync_policy": { + "groups": [] + } + } + ], + "short_zone_ids": [ + { + "key": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "val": 1698422904 + } + ] + }, + "master_zonegroup": "1580fd1d-a065-4484-82ff-329e9a779999", + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "period_config": { + "bucket_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + }, + "user_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + } + }, + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "realm_name": "my-store", + "realm_epoch": 3 +}` + +// example output from 'radosgw-admin period update' after the first period commit, +// and with un-committed changes since the first commit (endpoint added to zonegroup and zone) +const secondPeriodUpdateWithChanges = `{ + "id": "94ba560d-a560-431d-8ed4-85a2891f9122:staging", + "epoch": 1, + "predecessor_uuid": "600c23a6-2452-4fc0-96b4-0c78b9b7c439", + "sync_status": [], + "period_map": { + "id": "600c23a6-2452-4fc0-96b4-0c78b9b7c439", + "zonegroups": [ + { + "id": "1580fd1d-a065-4484-82ff-329e9a779999", + "name": "my-store", + "api_name": "my-store", + "is_master": "true", + "endpoints": [ + "http://10.105.59.166:80", + "https://10.105.59.166:443" + ], + "hostnames": [], + "hostnames_s3website": [], + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "zones": [ + { + "id": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "name": "my-store", + "endpoints": [ + "http://10.105.59.166:80", + "https://10.105.59.166:443" + ], + "log_meta": "false", + "log_data": "false", + "bucket_index_max_shards": 11, + "read_only": "false", + "tier_type": "", + "sync_from_all": "true", + "sync_from": [], + "redirect_zone": "" + } + ], + "placement_targets": [ + { + "name": "default-placement", + "tags": [], + "storage_classes": [ + 
"STANDARD" + ] + } + ], + "default_placement": "default-placement", + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "sync_policy": { + "groups": [] + } + } + ], + "short_zone_ids": [ + { + "key": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "val": 1698422904 + } + ] + }, + "master_zonegroup": "1580fd1d-a065-4484-82ff-329e9a779999", + "master_zone": "cea71d3a-9d22-45fb-a4e8-04fc6a494a50", + "period_config": { + "bucket_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + }, + "user_quota": { + "enabled": false, + "check_on_raw": false, + "max_size": -1, + "max_size_kb": 0, + "max_objects": -1 + } + }, + "realm_id": "94ba560d-a560-431d-8ed4-85a2891f9122", + "realm_name": "my-store", + "realm_epoch": 3 +}` diff --git a/pkg/operator/ceph/object/controller.go b/pkg/operator/ceph/object/controller.go index 85e3370f06e0..dab69a5d2dfc 100644 --- a/pkg/operator/ceph/object/controller.go +++ b/pkg/operator/ceph/object/controller.go @@ -76,6 +76,9 @@ var controllerTypeMeta = metav1.TypeMeta{ var currentAndDesiredCephVersion = opcontroller.CurrentAndDesiredCephVersion +// allow this to be overridden for unit tests +var cephObjectStoreDependents = CephObjectStoreDependents + // ReconcileCephObjectStore reconciles a cephObjectStore object type ReconcileCephObjectStore struct { client client.Client @@ -252,7 +255,7 @@ func (r *ReconcileCephObjectStore) reconcile(request reconcile.Request) (reconci return reconcile.Result{}, cephObjectStore, errors.Wrapf(err, "failed to check for object buckets. failed to get admin ops API context") } - deps, err := CephObjectStoreDependents(r.context, r.clusterInfo, cephObjectStore, objCtx, opsCtx) + deps, err := cephObjectStoreDependents(r.context, r.clusterInfo, cephObjectStore, objCtx, opsCtx) if err != nil { return reconcile.Result{}, cephObjectStore, err } diff --git a/pkg/operator/ceph/object/controller_test.go b/pkg/operator/ceph/object/controller_test.go index 8bc9282c39c1..350987946e62 100644 --- a/pkg/operator/ceph/object/controller_test.go +++ b/pkg/operator/ceph/object/controller_test.go @@ -28,12 +28,14 @@ import ( "github.com/pkg/errors" cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1" rookclient "github.com/rook/rook/pkg/client/clientset/versioned/fake" + rookfake "github.com/rook/rook/pkg/client/clientset/versioned/fake" "github.com/rook/rook/pkg/client/clientset/versioned/scheme" "github.com/rook/rook/pkg/clusterd" "github.com/rook/rook/pkg/daemon/ceph/client" cephver "github.com/rook/rook/pkg/operator/ceph/version" "github.com/rook/rook/pkg/operator/k8sutil" "github.com/rook/rook/pkg/operator/test" + "github.com/rook/rook/pkg/util/dependents" exectest "github.com/rook/rook/pkg/util/exec/test" "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" @@ -285,7 +287,20 @@ func TestCephObjectStoreController(t *testing.T) { capnslog.SetGlobalLogLevel(capnslog.DEBUG) os.Setenv("ROOK_LOG_LEVEL", "DEBUG") + commitConfigChangesOrig := commitConfigChanges + defer func() { commitConfigChanges = commitConfigChangesOrig }() + + // make sure joining multisite calls to commit config changes + calledCommitConfigChanges := false + commitConfigChanges = func(c *Context) error { + calledCommitConfigChanges = true + return nil + } + setupNewEnvironment := func(additionalObjects ...runtime.Object) *ReconcileCephObjectStore { + // reset var we use to check if we have called to commit config changes + calledCommitConfigChanges = false + // A Pool resource with metadata and spec. 
objectStore := &cephv1.CephObjectStore{ ObjectMeta: metav1.ObjectMeta{ @@ -360,6 +375,7 @@ func TestCephObjectStoreController(t *testing.T) { res, err := r.Reconcile(ctx, req) assert.NoError(t, err) assert.True(t, res.Requeue) + assert.False(t, calledCommitConfigChanges) }) t.Run("error - ceph cluster not ready", func(t *testing.T) { @@ -381,6 +397,7 @@ func TestCephObjectStoreController(t *testing.T) { res, err := r.Reconcile(ctx, req) assert.NoError(t, err) assert.True(t, res.Requeue) + assert.False(t, calledCommitConfigChanges) }) // set up an environment that has a ready ceph cluster, and return the reconciler for it @@ -483,6 +500,9 @@ func TestCephObjectStoreController(t *testing.T) { // we don't actually care if Requeue is true if there is an error assert.True(t, res.Requeue) assert.Contains(t, err.Error(), "failed to start rgw health checker") assert.Contains(t, err.Error(), "induced error creating admin ops API connection") + + // health checker should start up after committing config changes + assert.True(t, calledCommitConfigChanges) }) t.Run("success - object store is running", func(t *testing.T) { @@ -498,8 +518,8 @@ func TestCephObjectStoreController(t *testing.T) { assert.Equal(t, cephv1.ConditionProgressing, objectStore.Status.Phase, objectStore) assert.NotEmpty(t, objectStore.Status.Info["endpoint"], objectStore) assert.Equal(t, "http://rook-ceph-rgw-my-store.rook-ceph.svc:80", objectStore.Status.Info["endpoint"], objectStore) + assert.True(t, calledCommitConfigChanges) }) - } func TestCephObjectStoreControllerMultisite(t *testing.T) { @@ -635,6 +655,16 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) { }, } + commitConfigChangesOrig := commitConfigChanges + defer func() { commitConfigChanges = commitConfigChangesOrig }() + + // make sure joining multisite calls to commit config changes + calledCommitConfigChanges := false + commitConfigChanges = func(c *Context) error { + calledCommitConfigChanges = true + return nil + } + clientset := test.New(t, 3) c := &clusterd.Context{ Executor: executor, @@ -668,9 +698,50 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) { }, } - res, err := r.Reconcile(ctx, req) - assert.NoError(t, err) - assert.False(t, res.Requeue) - err = r.client.Get(context.TODO(), req.NamespacedName, objectStore) - assert.NoError(t, err) + currentAndDesiredCephVersion = func(rookImage string, namespace string, jobName string, ownerInfo *k8sutil.OwnerInfo, context *clusterd.Context, cephClusterSpec *cephv1.ClusterSpec, clusterInfo *client.ClusterInfo) (*cephver.CephVersion, *cephver.CephVersion, error) { + return &cephver.Pacific, &cephver.Pacific, nil + } + + t.Run("create an object store", func(t *testing.T) { + res, err := r.Reconcile(ctx, req) + assert.NoError(t, err) + assert.False(t, res.Requeue) + assert.True(t, calledCommitConfigChanges) + err = r.client.Get(context.TODO(), req.NamespacedName, objectStore) + assert.NoError(t, err) + }) + + t.Run("delete the same store", func(t *testing.T) { + calledCommitConfigChanges = false + + // no dependents + dependentsChecked := false + cephObjectStoreDependentsOrig := cephObjectStoreDependents + defer func() { cephObjectStoreDependents = cephObjectStoreDependentsOrig }() + cephObjectStoreDependents = func(clusterdCtx *clusterd.Context, clusterInfo *client.ClusterInfo, store *cephv1.CephObjectStore, objCtx *Context, opsCtx *AdminOpsContext) (*dependents.DependentList, error) { + dependentsChecked = true + return &dependents.DependentList{}, nil + } + + err = r.client.Get(context.TODO(), 
req.NamespacedName, objectStore) + assert.NoError(t, err) + objectStore.DeletionTimestamp = &metav1.Time{ + Time: time.Now(), + } + err = r.client.Update(ctx, objectStore) + + // have to also track the same objects in the rook clientset + r.context.RookClientset = rookfake.NewSimpleClientset( + objectRealm, + objectZoneGroup, + objectZone, + objectStore, + ) + + res, err := r.Reconcile(ctx, req) + assert.NoError(t, err) + assert.False(t, res.Requeue) + assert.True(t, dependentsChecked) + assert.True(t, calledCommitConfigChanges) + }) } diff --git a/pkg/operator/ceph/object/objectstore.go b/pkg/operator/ceph/object/objectstore.go index dcb70f3c6767..7356423e3bed 100644 --- a/pkg/operator/ceph/object/objectstore.go +++ b/pkg/operator/ceph/object/objectstore.go @@ -86,6 +86,9 @@ type realmType struct { Realms []string `json:"realms"` } +// allow commitConfigChanges to be overridden for unit testing +var commitConfigChanges = CommitConfigChanges + func deleteRealmAndPools(objContext *Context, spec cephv1.ObjectStoreSpec) error { if spec.IsMultisite() { // since pools for object store are created by the zone, the object store only needs to be removed from the zone @@ -146,12 +149,11 @@ func removeObjectStoreFromMultisite(objContext *Context, spec cephv1.ObjectStore logger.Infof("WARNING: No other zone in realm %q can commit to the period or pull the realm until you create another object-store in zone %q", objContext.Realm, objContext.Zone) } - // the period will help notify other zones of changes if there are multi-zones - _, err = runAdminCommand(objContext, false, "period", "update", "--commit") - if err != nil { - return errors.Wrap(err, "failed to update period after removing an endpoint from the zone") + // this will notify other zones of changes if there are multi-zones + if err := commitConfigChanges(objContext); err != nil { + nsName := fmt.Sprintf("%s/%s", objContext.clusterInfo.Namespace, objContext.Name) + return errors.Wrapf(err, "failed to commit config changes after removing CephObjectStore %q from multi-site", nsName) } - logger.Infof("successfully updated period for realm %q after removal of object-store %q", objContext.Realm, objContext.Name) return nil } @@ -362,13 +364,11 @@ func createMultisite(objContext *Context, endpointArg string) error { realmArg := fmt.Sprintf("--rgw-realm=%s", objContext.Realm) zoneGroupArg := fmt.Sprintf("--rgw-zonegroup=%s", objContext.ZoneGroup) - updatePeriod := false // create the realm if it doesn't exist yet output, err := RunAdminCommandNoMultisite(objContext, true, "realm", "get", realmArg) if err != nil { // ENOENT means “No such file or directory” if code, err := exec.ExtractExitCode(err); err == nil && code == int(syscall.ENOENT) { - updatePeriod = true output, err = RunAdminCommandNoMultisite(objContext, false, "realm", "create", realmArg) if err != nil { return errorOrIsNotFound(err, "failed to create ceph realm %q, for reason %q", objContext.ZoneGroup, output) @@ -384,7 +384,6 @@ func createMultisite(objContext *Context, endpointArg string) error { if err != nil { // ENOENT means “No such file or directory” if code, err := exec.ExtractExitCode(err); err == nil && code == int(syscall.ENOENT) { - updatePeriod = true output, err = RunAdminCommandNoMultisite(objContext, false, "zonegroup", "create", "--master", realmArg, zoneGroupArg, endpointArg) if err != nil { return errorOrIsNotFound(err, "failed to create ceph zone group %q, for reason %q", objContext.ZoneGroup, output) @@ -400,7 +399,6 @@ func createMultisite(objContext *Context, 
endpointArg string) error { if err != nil { // ENOENT means “No such file or directory” if code, err := exec.ExtractExitCode(err); err == nil && code == int(syscall.ENOENT) { - updatePeriod = true output, err = runAdminCommand(objContext, false, "zone", "create", "--master", endpointArg) if err != nil { return errorOrIsNotFound(err, "failed to create ceph zone %q, for reason %q", objContext.Zone, output) @@ -411,27 +409,9 @@ func createMultisite(objContext *Context, endpointArg string) error { } } - // check if the period exists - output, err = runAdminCommand(objContext, false, "period", "get") - if err != nil { - code, err := exec.ExtractExitCode(err) - // ENOENT means “No such file or directory” - if err == nil && code == int(syscall.ENOENT) { - // period does not exist and so needs to be created - logger.Debugf("period must be updated for CephObjectStore %q because it does not exist", objContext.Name) - updatePeriod = true - } else { - return errorOrIsNotFound(err, "'radosgw-admin period get' failed with code %d, for reason %q", strconv.Itoa(code), output) - } - } - - if updatePeriod { - // the period will help notify other zones of changes if there are multi-zones - _, err = runAdminCommand(objContext, false, "period", "update", "--commit") - if err != nil { - return errorOrIsNotFound(err, "failed to update period") - } - logger.Debugf("updated period for realm %q", objContext.Realm) + if err := commitConfigChanges(objContext); err != nil { + nsName := fmt.Sprintf("%s/%s", objContext.clusterInfo.Namespace, objContext.Name) + return errors.Wrapf(err, "failed to commit config changes after creating multisite config for CephObjectStore %q", nsName) } logger.Infof("Multisite for object-store: realm=%s, zonegroup=%s, zone=%s", objContext.Realm, objContext.ZoneGroup, objContext.Zone) @@ -471,11 +451,11 @@ func joinMultisite(objContext *Context, endpointArg, zoneEndpoints, namespace st } logger.Debugf("endpoints for zone %q are now %q", objContext.Zone, zoneEndpoints) - // the period will help notify other zones of changes if there are multi-zones - _, err = RunAdminCommandNoMultisite(objContext, false, "period", "update", "--commit", realmArg, zoneGroupArg, zoneArg) - if err != nil { - return errorOrIsNotFound(err, "failed to update period") + if err := commitConfigChanges(objContext); err != nil { + nsName := fmt.Sprintf("%s/%s", objContext.clusterInfo.Namespace, objContext.Name) + return errors.Wrapf(err, "failed to commit config changes for CephObjectStore %q when joining multisite ", nsName) } + logger.Infof("added object store %q to realm %q, zonegroup %q, zone %q", objContext.Name, objContext.Realm, objContext.ZoneGroup, objContext.Zone) // create system user for realm for master zone in master zonegorup for multisite scenario diff --git a/pkg/operator/ceph/object/objectstore_test.go b/pkg/operator/ceph/object/objectstore_test.go index 5b952340eee0..51cbec7c4053 100644 --- a/pkg/operator/ceph/object/objectstore_test.go +++ b/pkg/operator/ceph/object/objectstore_test.go @@ -301,40 +301,40 @@ func TestMockExecHelperProcess(t *testing.T) { func Test_createMultisite(t *testing.T) { // control the return values from calling get/create/update on resources type commandReturns struct { - realmExists bool - zoneGroupExists bool - zoneExists bool - periodExists bool - failCreateRealm bool - failCreateZoneGroup bool - failCreateZone bool - failUpdatePeriod bool + realmExists bool + zoneGroupExists bool + zoneExists bool + failCreateRealm bool + failCreateZoneGroup bool + failCreateZone bool + 
failCommitConfigChanges bool } // control whether we should expect certain 'get' calls type expectCommands struct { - getRealm bool - createRealm bool - getZoneGroup bool - createZoneGroup bool - getZone bool - createZone bool - getPeriod bool - updatePeriod bool + getRealm bool + createRealm bool + getZoneGroup bool + createZoneGroup bool + getZone bool + createZone bool + commitConfigChanges bool } // vars used for testing if calls were made var ( - calledGetRealm = false - calledGetZoneGroup = false - calledGetZone = false - calledGetPeriod = false - calledCreateRealm = false - calledCreateZoneGroup = false - calledCreateZone = false - calledUpdatePeriod = false + calledGetRealm = false + calledGetZoneGroup = false + calledGetZone = false + calledCreateRealm = false + calledCreateZoneGroup = false + calledCreateZone = false + calledCommitConfigChanges = false ) + commitConfigChangesOrig := commitConfigChanges + defer func() { commitConfigChanges = commitConfigChangesOrig }() + enoentIfNotExist := func(resourceExists bool) (string, error) { if !resourceExists { return "", exectest.MockExecCommandReturns(t, "", "", int(syscall.ENOENT)) @@ -357,8 +357,15 @@ func Test_createMultisite(t *testing.T) { calledCreateZoneGroup = false calledGetZone = false calledCreateZone = false - calledGetPeriod = false - calledUpdatePeriod = false + calledCommitConfigChanges = false + + commitConfigChanges = func(c *Context) error { + calledCommitConfigChanges = true + if env.failCommitConfigChanges { + return errors.New("fake error from CommitConfigChanges") + } + return nil + } return &exectest.MockExecutor{ MockExecuteCommandWithTimeout: func(timeout time.Duration, command string, arg ...string) (string, error) { @@ -391,19 +398,10 @@ func Test_createMultisite(t *testing.T) { calledCreateZone = true return errorIfFail(env.failCreateZone) } - case "period": - switch arg[1] { - case "get": - calledGetPeriod = true - return enoentIfNotExist(env.periodExists) - case "update": - calledUpdatePeriod = true - return errorIfFail(env.failUpdatePeriod) - } } } t.Fatalf("unhandled command: %s %v", command, arg) - return "", nil + panic("unhandled command") }, } } @@ -417,19 +415,18 @@ func Test_createMultisite(t *testing.T) { expectCommands expectCommands wantErr bool }{ - {"create realm, zonegroup, and zone; period update", + {"create realm, zonegroup, and zone; commit config", commandReturns{ // nothing exists, and all should succeed }, expectCommands{ - getRealm: true, - createRealm: true, - getZoneGroup: true, - createZoneGroup: true, - getZone: true, - createZone: true, - getPeriod: true, - updatePeriod: true, + getRealm: true, + createRealm: true, + getZoneGroup: true, + createZoneGroup: true, + getZone: true, + createZone: true, + commitConfigChanges: true, }, expectNoErr}, {"fail creating realm", @@ -468,85 +465,63 @@ func Test_createMultisite(t *testing.T) { // when we fail to create zone, we should not continue }, expectErr}, - {"fail period update", + {"fail commit config", commandReturns{ - failUpdatePeriod: true, + failCommitConfigChanges: true, }, expectCommands{ - getRealm: true, - createRealm: true, - getZoneGroup: true, - createZoneGroup: true, - getZone: true, - createZone: true, - getPeriod: true, - updatePeriod: true, + getRealm: true, + createRealm: true, + getZoneGroup: true, + createZoneGroup: true, + getZone: true, + createZone: true, + commitConfigChanges: true, }, expectErr}, - {"realm exists; create zonegroup and zone; period update", + {"realm exists; create zonegroup and zone; commit 
config", commandReturns{ realmExists: true, }, expectCommands{ - getRealm: true, - createRealm: false, - getZoneGroup: true, - createZoneGroup: true, - getZone: true, - createZone: true, - getPeriod: true, - updatePeriod: true, - }, - expectNoErr}, - {"realm and zonegroup exist; create zone; period update", - commandReturns{ - realmExists: true, - zoneGroupExists: true, - }, - expectCommands{ - getRealm: true, - createRealm: false, - getZoneGroup: true, - createZoneGroup: false, - getZone: true, - createZone: true, - getPeriod: true, - updatePeriod: true, + getRealm: true, + createRealm: false, + getZoneGroup: true, + createZoneGroup: true, + getZone: true, + createZone: true, + commitConfigChanges: true, }, expectNoErr}, - {"realm, zonegroup, and zone exist; period update", + {"realm and zonegroup exist; create zone; commit config", commandReturns{ realmExists: true, zoneGroupExists: true, - zoneExists: true, }, expectCommands{ - getRealm: true, - createRealm: false, - getZoneGroup: true, - createZoneGroup: false, - getZone: true, - createZone: false, - getPeriod: true, - updatePeriod: true, + getRealm: true, + createRealm: false, + getZoneGroup: true, + createZoneGroup: false, + getZone: true, + createZone: true, + commitConfigChanges: true, }, expectNoErr}, - {"realm, zonegroup, zone, and period all exist", + {"realm, zonegroup, and zone exist; commit config", commandReturns{ realmExists: true, zoneGroupExists: true, zoneExists: true, - periodExists: true, }, expectCommands{ - getRealm: true, - createRealm: false, - getZoneGroup: true, - createZoneGroup: false, - getZone: true, - createZone: false, - getPeriod: true, - updatePeriod: false, + getRealm: true, + createRealm: false, + getZoneGroup: true, + createZoneGroup: false, + getZone: true, + createZone: false, + commitConfigChanges: true, }, expectNoErr}, } @@ -566,8 +541,7 @@ func Test_createMultisite(t *testing.T) { assert.Equal(t, tt.expectCommands.createZoneGroup, calledCreateZoneGroup) assert.Equal(t, tt.expectCommands.getZone, calledGetZone) assert.Equal(t, tt.expectCommands.createZone, calledCreateZone) - assert.Equal(t, tt.expectCommands.getPeriod, calledGetPeriod) - assert.Equal(t, tt.expectCommands.updatePeriod, calledUpdatePeriod) + assert.Equal(t, tt.expectCommands.commitConfigChanges, calledCommitConfigChanges) if tt.wantErr { assert.Error(t, err) } else { From 956430826cc2cff0fdd61ca18d96ba8a647eb275 Mon Sep 17 00:00:00 2001 From: Blaine Gardner Date: Tue, 5 Oct 2021 14:26:17 -0600 Subject: [PATCH 2/2] rgw: add integration test for committing period Add to the RGW multisite integration test a verification that the RGW period is committed on the first reconcile and not committed on the second reconcile. Do this in the multisite test so that we verify that this works for both the primary and secondary multi-site cluster. To add this test, the github-action-helper.sh script had to be modified to 1. actually deploy the version of Rook under test 2. adjust how functions are called to not lose the `-e` in a subshell 3. 
fix wait_for_prepare_pod helper that had a failure in the middle of its operation that didn't cause failures in the past Signed-off-by: Blaine Gardner --- .github/workflows/canary-integration-test.yml | 13 +++ pkg/operator/ceph/object/admin.go | 5 +- pkg/operator/ceph/object/controller_test.go | 6 +- tests/scripts/github-action-helper.sh | 106 ++++++++++++------ tests/scripts/validate_cluster.sh | 2 + 5 files changed, 94 insertions(+), 38 deletions(-) diff --git a/.github/workflows/canary-integration-test.yml b/.github/workflows/canary-integration-test.yml index e92f79f35fd7..550160788dd0 100644 --- a/.github/workflows/canary-integration-test.yml +++ b/.github/workflows/canary-integration-test.yml @@ -953,6 +953,19 @@ jobs: - name: write an object to one cluster, read from the other run: tests/scripts/github-action-helper.sh write_object_to_cluster1_read_from_cluster2 + # if this test fails, it could mean the RGW `period get` or `period update` output has changed + - name: RGW configuration period should be committed on first reconcile and not be committed on second reconcile + run: | + ns_name_primary='"rook-ceph/multisite-store"' # double quotes intended + ns_name_secondary='"rook-ceph-secondary/zone-b-multisite-store"' # double quotes intended + committed_msg="committing changes to RGW configuration period for CephObjectStore" + tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_primary}" + tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_secondary}" + tests/scripts/github-action-helper.sh restart_operator + not_committed_msg="there are no changes to commit for RGW configuration period for CephObjectStore" + tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_primary}" 120 + tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_secondary}" 90 + - name: upload test result uses: actions/upload-artifact@v2 if: always() diff --git a/pkg/operator/ceph/object/admin.go b/pkg/operator/ceph/object/admin.go index f1c0c1f3dae0..7e7dc78760cd 100644 --- a/pkg/operator/ceph/object/admin.go +++ b/pkg/operator/ceph/object/admin.go @@ -320,9 +320,12 @@ func CommitConfigChanges(c *Context) error { // DO NOT MODIFY nsName here. It is part of the integration test checks noted below. nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name) if !shouldCommit { - logger.Debugf("not committing changes to RGW configuration period for CephObjectStore %q", nsName) + // DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests. + logger.Infof("there are no changes to commit for RGW configuration period for CephObjectStore %q", nsName) return nil } + // DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests. 
+ logger.Infof("committing changes to RGW configuration period for CephObjectStore %q", nsName) // don't expect json output since we don't intend to use the output from the command _, err = runAdminCommand(c, false, "period", "update", "--commit") if err != nil { diff --git a/pkg/operator/ceph/object/controller_test.go b/pkg/operator/ceph/object/controller_test.go index 350987946e62..6adfd0c3ba23 100644 --- a/pkg/operator/ceph/object/controller_test.go +++ b/pkg/operator/ceph/object/controller_test.go @@ -685,7 +685,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) { context: c, objectStoreContexts: make(map[string]*objectStoreHealth), recorder: k8sutil.NewEventReporter(record.NewFakeRecorder(5)), - opManagerContext: context.TODO(), + opManagerContext: ctx, } _, err := r.context.Clientset.CoreV1().Secrets(namespace).Create(ctx, secret, metav1.CreateOptions{}) @@ -707,7 +707,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) { assert.NoError(t, err) assert.False(t, res.Requeue) assert.True(t, calledCommitConfigChanges) - err = r.client.Get(context.TODO(), req.NamespacedName, objectStore) + err = r.client.Get(ctx, req.NamespacedName, objectStore) assert.NoError(t, err) }) @@ -723,7 +723,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) { return &dependents.DependentList{}, nil } - err = r.client.Get(context.TODO(), req.NamespacedName, objectStore) + err = r.client.Get(ctx, req.NamespacedName, objectStore) assert.NoError(t, err) objectStore.DeletionTimestamp = &metav1.Time{ Time: time.Now(), diff --git a/tests/scripts/github-action-helper.sh b/tests/scripts/github-action-helper.sh index 3f5d913c0e24..ab8323de97f2 100755 --- a/tests/scripts/github-action-helper.sh +++ b/tests/scripts/github-action-helper.sh @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-set -xe +set -xeEo pipefail ############# # VARIABLES # @@ -145,12 +145,12 @@ function validate_yaml() { } function create_cluster_prerequisites() { - cd cluster/examples/kubernetes/ceph - kubectl create -f crds.yaml -f common.yaml + # this might be called from another function that has already done a cd + ( cd cluster/examples/kubernetes/ceph && kubectl create -f crds.yaml -f common.yaml ) } function deploy_manifest_with_local_build() { - sed -i "s|image: rook/ceph:[0-9a-zA-Z.]*|image: rook/ceph:local-build|g" $1 + sed -i "s|image: rook/ceph:.*|image: rook/ceph:local-build|g" $1 kubectl create -f $1 } @@ -169,23 +169,31 @@ function deploy_cluster() { } function wait_for_prepare_pod() { - timeout 180 bash <<-'EOF' - while true; do - if [[ "$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare --field-selector=status.phase=Running)" -gt 1 ]]; then - break - fi - sleep 5 - done - kubectl -n rook-ceph logs --follow pod/$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[0].metadata.name}') -EOF - timeout 60 bash <<-'EOF' - until kubectl -n rook-ceph logs $(kubectl -n rook-ceph get pod -l app=rook-ceph-osd,ceph_daemon_id=0 -o jsonpath='{.items[*].metadata.name}') --all-containers || true; do - echo "waiting for osd container" + get_pod_cmd=(kubectl --namespace rook-ceph get pod --no-headers) + timeout=450 + start_time="${SECONDS}" + while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do + pods="$("${get_pod_cmd[@]}" --selector=rook-ceph-osd-prepare --output custom-columns=NAME:.metadata.name,PHASE:status.phase)" + if echo "$pods" | grep 'Running\|Succeeded\|Failed'; then break; fi + echo 'waiting for at least one osd prepare pod to be running or finished' + sleep 5 + done + pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd-prepare --output name | head -n1)" + kubectl --namespace rook-ceph logs --follow "$pod" + timeout=60 + start_time="${SECONDS}" + while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do + pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 --output custom-columns=NAME:.metadata.name,PHASE:status.phase)" + if echo "$pod" | grep 'Running'; then break; fi + echo 'waiting for OSD 0 pod to be running' sleep 1 done -EOF - kubectl -n rook-ceph describe job/"$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[*].metadata.name}')" || true - kubectl -n rook-ceph describe deploy/rook-ceph-osd-0 || true + # getting the below logs is a best-effort attempt, so use '|| true' to allow failures + pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 --output name)" || true + kubectl --namespace rook-ceph logs "$pod" || true + job="$(kubectl --namespace rook-ceph get job --selector app=rook-ceph-osd-prepare --output name | head -n1)" || true + kubectl -n rook-ceph describe "$job" || true + kubectl -n rook-ceph describe deployment/rook-ceph-osd-0 || true } function wait_for_ceph_to_be_ready() { @@ -217,12 +225,27 @@ function create_LV_on_disk() { function deploy_first_rook_cluster() { BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1) + create_cluster_prerequisites cd cluster/examples/kubernetes/ceph/ - kubectl create -f crds.yaml -f common.yaml -f operator.yaml + + deploy_manifest_with_local_build operator.yaml yq w -i -d1 cluster-test.yaml spec.dashboard.enabled false yq w -i -d1 cluster-test.yaml spec.storage.useAllDevices false yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"1 - kubectl create -f cluster-test.yaml -f toolbox.yaml + kubectl create -f 
cluster-test.yaml + deploy_manifest_with_local_build toolbox.yaml +} + +function deploy_second_rook_cluster() { + BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1) + cd cluster/examples/kubernetes/ceph/ + NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f - + sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml + yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2 + yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external" + kubectl create -f cluster-test.yaml + yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary + deploy_manifest_with_local_build toolbox.yaml toolbox.yaml } function wait_for_rgw_pods() { @@ -237,15 +260,32 @@ function wait_for_rgw_pods() { } -function deploy_second_rook_cluster() { - BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1) - cd cluster/examples/kubernetes/ceph/ - NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f - - sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml - yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2 - yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external" - yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary - kubectl create -f cluster-test.yaml -f toolbox.yaml +function verify_operator_log_message() { + local message="$1" # param 1: the message to verify exists + local namespace="${2:-rook-ceph}" # optional param 2: the namespace of the CephCluster (default: rook-ceph) + kubectl --namespace "$namespace" logs deployment/rook-ceph-operator | grep "$message" +} + +function wait_for_operator_log_message() { + local message="$1" # param 1: the message to look for + local timeout="$2" # param 2: the timeout for waiting for the message to exist + local namespace="${3:-rook-ceph}" # optional param 3: the namespace of the CephCluster (default: rook-ceph) + start_time="${SECONDS}" + while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do + if verify_operator_log_message "$message" "$namespace"; then return 0; fi + sleep 5 + done + echo "timed out" >&2 && return 1 +} + +function restart_operator () { + local namespace="${1:-rook-ceph}" # optional param 1: the namespace of the CephCluster (default: rook-ceph) + kubectl --namespace "$namespace" delete pod --selector app=rook-ceph=operator + # wait for new pod to be running + get_pod_cmd=(kubectl --namespace "$namespace" get pod --selector app=rook-ceph-operator --no-headers) + timeout 20 bash -c \ + "until [[ -n \"\$(${get_pod_cmd[*]} --field-selector=status.phase=Running 2>/dev/null)\" ]] ; do echo waiting && sleep 1; done" + "${get_pod_cmd[@]}" } function write_object_to_cluster1_read_from_cluster2() { @@ -275,7 +315,5 @@ EOF FUNCTION="$1" shift # remove function arg now that we've recorded it # call the function with the remainder of the user-provided args -if ! $FUNCTION "$@"; then - echo "Call to $FUNCTION was not successful" >&2 - exit 1 -fi +# -e, -E, and -o=pipefail will ensure this script returns a failure if a part of the function fails +$FUNCTION "$@" diff --git a/tests/scripts/validate_cluster.sh b/tests/scripts/validate_cluster.sh index 0b60a51feeb6..2900f2154a4b 100755 --- a/tests/scripts/validate_cluster.sh +++ b/tests/scripts/validate_cluster.sh @@ -39,6 +39,8 @@ function wait_for_daemon () { sleep 1 let timeout=timeout-1 done + echo current status: + $EXEC_COMMAND -s return 1 }
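
For readers unfamiliar with go-cmp, the period comparison added in periodWillChange() reduces to the pattern sketched below. This is a minimal standalone sketch, not part of the patch; the sample JSON strings and field values are illustrative stand-ins for real 'radosgw-admin period get' / 'period update' output, and only the go-cmp calls (cmp.Diff, cmp.FilterPath, cmp.Ignore, Path.GoString) mirror what the patch actually uses.

    package main

    import (
        "encoding/json"
        "fmt"

        "github.com/google/go-cmp/cmp"
    )

    func main() {
        // Illustrative stand-ins for the current and staged period JSON.
        current := `{"id": "aaa", "realm_epoch": 1, "master_zone": "zone-a"}`
        staged := `{"id": "bbb", "realm_epoch": 2, "master_zone": "zone-a"}`

        // Unmarshal into loosely-typed maps so the comparison does not depend
        // on the exact structure of radosgw-admin output.
        var cur, stg map[string]interface{}
        if err := json.Unmarshal([]byte(current), &cur); err != nil {
            panic(err)
        }
        if err := json.Unmarshal([]byte(staged), &stg); err != nil {
            panic(err)
        }

        // Ignore top-level fields that are always different in a staged period,
        // mirroring the ignorePaths filter in periodWillChange().
        ignore := cmp.FilterPath(func(p cmp.Path) bool {
            switch p.GoString() {
            case `root["id"]`, `root["epoch"]`, `root["predecessor_uuid"]`, `root["realm_epoch"]`:
                return true
            }
            return false
        }, cmp.Ignore())

        // Any remaining diff means the staged period carries real changes and
        // 'period update --commit' should be run.
        diff := cmp.Diff(cur, stg, ignore)
        fmt.Printf("period will change: %v\n", diff != "")
    }

In this sketch only ignored fields differ, so it prints "period will change: false", which is the case where CommitConfigChanges skips the commit.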