Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rgw: replace period update --commit with function #8911

Merged
merged 2 commits into from Oct 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/canary-integration-test.yml
Expand Up @@ -953,6 +953,19 @@ jobs:
- name: write an object to one cluster, read from the other
run: tests/scripts/github-action-helper.sh write_object_to_cluster1_read_from_cluster2

# if this test fails, it could mean the RGW `period get` or `period update` output has changed
- name: RGW configuration period should be committed on first reconcile and not be committed on second reconcile
run: |
ns_name_primary='"rook-ceph/multisite-store"' # double quotes intended
ns_name_secondary='"rook-ceph-secondary/zone-b-multisite-store"' # double quotes intended
committed_msg="committing changes to RGW configuration period for CephObjectStore"
tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_primary}"
tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_secondary}"
tests/scripts/github-action-helper.sh restart_operator
not_committed_msg="there are no changes to commit for RGW configuration period for CephObjectStore"
tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_primary}" 120
tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_secondary}" 90

- name: upload test result
uses: actions/upload-artifact@v2
if: always()
Expand Down
118 changes: 118 additions & 0 deletions pkg/operator/ceph/object/admin.go
Expand Up @@ -18,13 +18,16 @@ package object

import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httputil"
"regexp"
"strings"

"github.com/ceph/go-ceph/rgw/admin"
"github.com/coreos/pkg/capnslog"
"github.com/google/go-cmp/cmp"
"github.com/pkg/errors"
cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
"github.com/rook/rook/pkg/clusterd"
Expand Down Expand Up @@ -294,6 +297,121 @@ func isInvalidFlagError(err error) bool {
return exitCode == 22
}

// CommitConfigChanges commits changes to RGW configs for realm/zonegroup/zone changes idempotently.
// Under the hood, this updates the RGW config period and commits the change if changes are detected.
func CommitConfigChanges(c *Context) error {
currentPeriod, err := runAdminCommand(c, true, "period", "get")
if err != nil {
return errorOrIsNotFound(err, "failed to get the current RGW configuration period to see if it needs changed")
}

// this stages the current config changees and returns what the new period config will look like
// without committing the changes
stagedPeriod, err := runAdminCommand(c, true, "period", "update")
if err != nil {
return errorOrIsNotFound(err, "failed to stage the current RGW configuration period")
}

shouldCommit, err := periodWillChange(currentPeriod, stagedPeriod)
if err != nil {
return errors.Wrap(err, "failed to determine if the staged RGW configuration period is different from current")
}

// DO NOT MODIFY nsName here. It is part of the integration test checks noted below.
nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name)
if !shouldCommit {
// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
logger.Infof("there are no changes to commit for RGW configuration period for CephObjectStore %q", nsName)
return nil
}
// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
logger.Infof("committing changes to RGW configuration period for CephObjectStore %q", nsName)
// don't expect json output since we don't intend to use the output from the command
_, err = runAdminCommand(c, false, "period", "update", "--commit")
if err != nil {
return errorOrIsNotFound(err, "failed to commit RGW configuration period changes")
}

return nil
}

// return true if the configuration period will change if the staged period is committed
func periodWillChange(current, staged string) (bool, error) {
travisn marked this conversation as resolved.
Show resolved Hide resolved
// Rook wants to check if there are any differences in the current period versus the period that
// is staged to be applied/committed. If there are differences, then Rook should "commit" the
// staged period changes to instruct RGWs to update their runtime configuration.
//
// For many RGW interactions, Rook often creates a typed struct to unmarshal RGW JSON output
// into. In those cases Rook is able to opt in to only a small subset of specific fields it
// needs. This keeps the coupling between Rook and RGW JSON output as loose as possible while
// still being specific enough for Rook to operate.
//
// For this implementation, we could use a strongly-typed struct here to unmarshal into, and we
// could use DisallowUnknownFields() to cause an error if the RGW JSON output changes to flag
// when the existing implementation might be invalidated. This relies on an extremely tight
// coupling between Rook and the JSON output from RGW. The failure mode of this implementation
// is to return an error from the reconcile when there are unmarshalling errors, which results
// in CephObjectStores that could not be updated if a version of Ceph changes the RGW output.
//
// In the chosen implementation, we unmarshal into "dumb" data structures that create a loose
// coupling. With these, we must ignore the fields that we have observed to change between the
// current and staged periods when we should *not* commit an un-changed period. The failure mode
// of this implementation is that if the RGW output changes its structure, Rook may detect
// differences where there are none. This would result in Rook committing the period more often
// than necessary. Committing the period results in a short period of downtime while RGWs reload
// their configuration, but we opt for this inconvenience in lieu of blocking reconciliation.
//
// For any implementation, if the RGW changes the behavior of its output but not the structure,
// Rook could commit unnecessary period changes or fail to commit necessary period changes
// depending on how the RGW output has changed. Rook cannot detect this class of failures, and
// the behavior cannot be specifically known.
var currentJSON map[string]interface{}
var stagedJSON map[string]interface{}
BlaineEXE marked this conversation as resolved.
Show resolved Hide resolved
var err error

err = json.Unmarshal([]byte(current), &currentJSON)
if err != nil {
return true, errors.Wrap(err, "failed to unmarshal current RGW configuration period")
thotz marked this conversation as resolved.
Show resolved Hide resolved
}
err = json.Unmarshal([]byte(staged), &stagedJSON)
if err != nil {
return true, errors.Wrap(err, "failed to unmarshal staged RGW configuration period")
}

// There are some values in the periods that we don't care to diff because they are always
// different in the staged period, even when no updates are needed. Sometimes, the values are
// reported as different in the staging output but aren't actually changed upon commit.
ignorePaths := cmp.FilterPath(func(path cmp.Path) bool {
// path.String() outputs nothing for the crude map[string]interface{} JSON struct
// Example of path.GoString() output for a long path in the period JSON:
// root["period_map"].(map[string]interface {})["short_zone_ids"].([]interface {})[0].(map[string]interface {})["val"].(float64)
switch path.GoString() {
case `root["id"]`:
// "id" is always changed in the staged period, but it doesn't always update.
return true
case `root["predecessor_uuid"]`:
// "predecessor_uuid" is always changed in the staged period, but it doesn't always update.
return true
case `root["realm_epoch"]`:
// "realm_epoch" is always incremented in the staged period, but it doesn't always increment.
return true
case `root["epoch"]`:
// Strangely, "epoch" is not incremented in the staged period even though it is always
// incremented upon an actual commit. It could be argued that this behavior is a bug.
// Ignore this value to handle the possibility that the behavior changes in the future.
return true
default:
return false
}
}, cmp.Ignore())

diff := cmp.Diff(currentJSON, stagedJSON, ignorePaths)
thotz marked this conversation as resolved.
Show resolved Hide resolved
diff = strings.TrimSpace(diff)
logger.Debugf("RGW config period diff:\n%s", diff)

return (diff != ""), nil
}

func GetAdminOPSUserCredentials(objContext *Context, spec *cephv1.ObjectStoreSpec) (string, string, error) {
ns := objContext.clusterInfo.Namespace

Expand Down