From 9b484a9ca0345ea213c99739903add65517c10bf Mon Sep 17 00:00:00 2001
From: Blaine Gardner <blaine.gardner@redhat.com>
Date: Tue, 5 Oct 2021 14:26:17 -0600
Subject: [PATCH] rgw: add integration test for committing period

Add to the RGW multisite integration test a verification that the RGW
period is committed on the first reconcile and not committed on the
second reconcile.

Do this in the multisite test so that we verify that this works for
both the primary and secondary multisite clusters.

Signed-off-by: Blaine Gardner <blaine.gardner@redhat.com>
---
 .github/workflows/canary-integration-test.yml | 19 ++++++++++
 pkg/operator/ceph/object/admin.go             |  7 +++-
 tests/scripts/github-action-helper.sh         | 37 ++++++++++++++++---
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/canary-integration-test.yml b/.github/workflows/canary-integration-test.yml
index e92f79f35fd74..68c5c19906b52 100644
--- a/.github/workflows/canary-integration-test.yml
+++ b/.github/workflows/canary-integration-test.yml
@@ -953,6 +953,25 @@ jobs:
     - name: write an object to one cluster, read from the other
       run: tests/scripts/github-action-helper.sh write_object_to_cluster1_read_from_cluster2
 
+    # if this test fails, it could mean the RGW `period get` or `period update` output has changed
+    - name: verify the RGW configuration period was committed for both clusters
+      run: |
+        committed_msg="committing changes to RGW configuration period for CephObjectStore"
+        tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg}" rook-ceph
+        tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg}" rook-ceph-secondary
+
+    - name: restart operators for both clusters
+      run: |
+        tests/scripts/github-action-helper.sh restart_operator rook-ceph
+        tests/scripts/github-action-helper.sh restart_operator rook-ceph-secondary
+
+    # if this test fails, it could mean the RGW `period get` or `period update` output has changed
+    - name: wait for both clusters to report that the RGW configuration period was NOT committed on the second reconcile
+      run: |
+        not_committed_msg="there are no changes to commit for RGW configuration period for CephObjectStore"
+        tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg}" 60 rook-ceph
+        tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg}" 20 rook-ceph-secondary
+
     - name: upload test result
       uses: actions/upload-artifact@v2
       if: always()
diff --git a/pkg/operator/ceph/object/admin.go b/pkg/operator/ceph/object/admin.go
index bac57005ce118..5fedaa51c29d0 100644
--- a/pkg/operator/ceph/object/admin.go
+++ b/pkg/operator/ceph/object/admin.go
@@ -315,11 +315,14 @@ func CommitConfigChanges(c *Context) error {
 		return errors.Wrap(err, "failed to determine if the staged RGW configuration period is different from current")
 	}
 
+	nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name)
 	if !shouldCommit {
-		nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name)
-		logger.Debugf("not committing changes to RGW configuration period for CephObjectStore %q", nsName)
+		// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+		logger.Infof("there are no changes to commit for RGW configuration period for CephObjectStore %q", nsName)
 		return nil
 	}
+	// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+	logger.Infof("committing changes to RGW configuration period for CephObjectStore %q", nsName)
 	// don't expect json output since we don't intend to use the output from the command
 	_, err = runAdminCommand(c, false, "period", "update", "--commit")
 	if err != nil {
diff --git a/tests/scripts/github-action-helper.sh b/tests/scripts/github-action-helper.sh
index 3f5d913c0e24c..2e5fe2616d7c9 100755
--- a/tests/scripts/github-action-helper.sh
+++ b/tests/scripts/github-action-helper.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -xe
+set -xeEo pipefail
 
 #############
 # VARIABLES #
@@ -237,6 +237,35 @@ function wait_for_rgw_pods() {
 
 }
 
+function verify_operator_log_message() {  # succeeds only if the operator log contains the message
+  local message="$1"  # param 1: the literal message to verify exists
+  local namespace="${2:-rook-ceph}"  # optional param 2: the namespace of the CephCluster (default: rook-ceph)
+  kubectl --namespace "$namespace" logs deployment/rook-ceph-operator | grep -F -- "$message"  # -F: fixed string, not a regex
+}
+
+function wait_for_operator_log_message() {  # rechecks the operator log until the message appears or the timeout passes
+  local message="$1"  # param 1: the message to look for
+  local timeout="$2"  # param 2: the timeout (compared against bash's SECONDS counter) for waiting for the message to exist
+  local namespace="${3:-rook-ceph}"  # optional param 3: the namespace of the CephCluster (default: rook-ceph)
+  local start_time="${SECONDS}"  # local so the start timestamp does not leak into the script's global scope
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    if verify_operator_log_message "$message" "$namespace"; then return 0; fi
+    sleep 5  # pause between log checks
+  done
+  echo "timed out" >&2 && return 1
+}
+
+function restart_operator () {  # deletes the operator pod, then waits up to 20s for a matching pod in phase Running
+  local namespace="${1:-rook-ceph}"  # optional param 1: the namespace of the CephCluster (default: rook-ceph)
+  local -a get_pod_cmd=(kubectl --namespace "$namespace" get pod --selector app=rook-ceph-operator --no-headers)
+  local pod; pod="$("${get_pod_cmd[@]}" --output name)"  # declared separately so a kubectl failure is not masked by 'local'
+  # --output name gives "pod/<name>", so don't specify the kind here
+  kubectl --namespace "$namespace" delete "$pod" # waits for pod to be deleted but not new pod to be running
+  # the kubectl command is handed to the child shell as positional args ("$@") so it is never re-parsed as shell text
+  timeout 20 bash -c 'until [[ -n "$("$@" --field-selector=status.phase=Running 2>/dev/null)" ]] ; do echo waiting && sleep 1; done' _ "${get_pod_cmd[@]}"
+  "${get_pod_cmd[@]}"
+}
+
 function deploy_second_rook_cluster() {
   BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
   cd cluster/examples/kubernetes/ceph/
@@ -275,7 +304,5 @@ EOF
 FUNCTION="$1"
 shift # remove function arg now that we've recorded it
 # call the function with the remainder of the user-provided args
-if ! $FUNCTION "$@"; then
-  echo "Call to $FUNCTION was not successful" >&2
-  exit 1
-fi
+# -e, -E, and -o pipefail will ensure this script returns a failure if a part of the function fails
+"$FUNCTION" "$@"