From 956430826cc2cff0fdd61ca18d96ba8a647eb275 Mon Sep 17 00:00:00 2001
From: Blaine Gardner
Date: Tue, 5 Oct 2021 14:26:17 -0600
Subject: [PATCH] rgw: add integration test for committing period

Add to the RGW multisite integration test a verification that the RGW
period is committed on the first reconcile and not committed on the
second reconcile. Do this in the multisite test so that we verify that
this works for both the primary and secondary multi-site cluster.

To add this test, the github-action-helper.sh script had to be modified to
1. actually deploy the version of Rook under test
2. adjust how functions are called to not lose the `-e` in a subshell
3. fix wait_for_prepare_pod helper that had a failure in the middle of its
   operation that didn't cause failures in the past

Signed-off-by: Blaine Gardner
---
 .github/workflows/canary-integration-test.yml |  13 +++
 pkg/operator/ceph/object/admin.go             |   5 +-
 pkg/operator/ceph/object/controller_test.go   |   6 +-
 tests/scripts/github-action-helper.sh         | 106 ++++++++++++------
 tests/scripts/validate_cluster.sh             |   2 +
 5 files changed, 94 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/canary-integration-test.yml b/.github/workflows/canary-integration-test.yml
index e92f79f35fd7..550160788dd0 100644
--- a/.github/workflows/canary-integration-test.yml
+++ b/.github/workflows/canary-integration-test.yml
@@ -953,6 +953,19 @@ jobs:
       - name: write an object to one cluster, read from the other
         run: tests/scripts/github-action-helper.sh write_object_to_cluster1_read_from_cluster2
 
+      # if this test fails, it could mean the RGW `period get` or `period update` output has changed
+      - name: RGW configuration period should be committed on first reconcile and not be committed on second reconcile
+        run: |
+          ns_name_primary='"rook-ceph/multisite-store"' # double quotes intended
+          ns_name_secondary='"rook-ceph-secondary/zone-b-multisite-store"' # double quotes intended
+          committed_msg="committing changes to RGW configuration period for CephObjectStore"
+          tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_primary}"
+          tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_secondary}"
+          tests/scripts/github-action-helper.sh restart_operator
+          not_committed_msg="there are no changes to commit for RGW configuration period for CephObjectStore"
+          tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_primary}" 120
+          tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_secondary}" 90
+
       - name: upload test result
         uses: actions/upload-artifact@v2
         if: always()
diff --git a/pkg/operator/ceph/object/admin.go b/pkg/operator/ceph/object/admin.go
index f1c0c1f3dae0..7e7dc78760cd 100644
--- a/pkg/operator/ceph/object/admin.go
+++ b/pkg/operator/ceph/object/admin.go
@@ -320,9 +320,12 @@ func CommitConfigChanges(c *Context) error {
 	// DO NOT MODIFY nsName here. It is part of the integration test checks noted below.
 	nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name)
 	if !shouldCommit {
-		logger.Debugf("not committing changes to RGW configuration period for CephObjectStore %q", nsName)
+		// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+		logger.Infof("there are no changes to commit for RGW configuration period for CephObjectStore %q", nsName)
 		return nil
 	}
+	// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+	logger.Infof("committing changes to RGW configuration period for CephObjectStore %q", nsName)
 	// don't expect json output since we don't intend to use the output from the command
 	_, err = runAdminCommand(c, false, "period", "update", "--commit")
 	if err != nil {
diff --git a/pkg/operator/ceph/object/controller_test.go b/pkg/operator/ceph/object/controller_test.go
index 350987946e62..6adfd0c3ba23 100644
--- a/pkg/operator/ceph/object/controller_test.go
+++ b/pkg/operator/ceph/object/controller_test.go
@@ -685,7 +685,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) {
 		context:             c,
 		objectStoreContexts: make(map[string]*objectStoreHealth),
 		recorder:            k8sutil.NewEventReporter(record.NewFakeRecorder(5)),
-		opManagerContext:    context.TODO(),
+		opManagerContext:    ctx,
 	}
 
 	_, err := r.context.Clientset.CoreV1().Secrets(namespace).Create(ctx, secret, metav1.CreateOptions{})
@@ -707,7 +707,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) {
 		assert.NoError(t, err)
 		assert.False(t, res.Requeue)
 		assert.True(t, calledCommitConfigChanges)
-		err = r.client.Get(context.TODO(), req.NamespacedName, objectStore)
+		err = r.client.Get(ctx, req.NamespacedName, objectStore)
 		assert.NoError(t, err)
 	})
 
@@ -723,7 +723,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) {
 			return &dependents.DependentList{}, nil
 		}
 
-		err = r.client.Get(context.TODO(), req.NamespacedName, objectStore)
+		err = r.client.Get(ctx, req.NamespacedName, objectStore)
 		assert.NoError(t, err)
 		objectStore.DeletionTimestamp = &metav1.Time{
 			Time: time.Now(),
diff --git a/tests/scripts/github-action-helper.sh b/tests/scripts/github-action-helper.sh
index 3f5d913c0e24..ab8323de97f2 100755
--- a/tests/scripts/github-action-helper.sh
+++ b/tests/scripts/github-action-helper.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -xe
+set -xeEo pipefail
 
 #############
 # VARIABLES #
 #############
@@ -145,12 +145,12 @@ function validate_yaml() {
 }
 
 function create_cluster_prerequisites() {
-  cd cluster/examples/kubernetes/ceph
-  kubectl create -f crds.yaml -f common.yaml
+  # this might be called from another function that has already done a cd
+  ( cd cluster/examples/kubernetes/ceph && kubectl create -f crds.yaml -f common.yaml )
 }
 
 function deploy_manifest_with_local_build() {
-  sed -i "s|image: rook/ceph:[0-9a-zA-Z.]*|image: rook/ceph:local-build|g" $1
+  sed -i "s|image: rook/ceph:.*|image: rook/ceph:local-build|g" $1
   kubectl create -f $1
 }
@@ -169,23 +169,31 @@ function deploy_cluster() {
 }
 
 function wait_for_prepare_pod() {
-  timeout 180 bash <<-'EOF'
-    while true; do
-      if [[ "$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare --field-selector=status.phase=Running)" -gt 1 ]]; then
-        break
-      fi
-      sleep 5
-    done
-    kubectl -n rook-ceph logs --follow pod/$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[0].metadata.name}')
-EOF
-  timeout 60 bash <<-'EOF'
-    until kubectl -n rook-ceph logs $(kubectl -n rook-ceph get pod -l app=rook-ceph-osd,ceph_daemon_id=0 -o jsonpath='{.items[*].metadata.name}') --all-containers || true; do
-      echo "waiting for osd container"
-      sleep 1
-    done
-EOF
-  kubectl -n rook-ceph describe job/"$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[*].metadata.name}')" || true
-  kubectl -n rook-ceph describe deploy/rook-ceph-osd-0 || true
+  get_pod_cmd=(kubectl --namespace rook-ceph get pod --no-headers)
+  timeout=450
+  start_time="${SECONDS}"
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    pods="$("${get_pod_cmd[@]}" --selector=rook-ceph-osd-prepare --output custom-columns=NAME:.metadata.name,PHASE:status.phase)"
+    if echo "$pods" | grep 'Running\|Succeeded\|Failed'; then break; fi
+    echo 'waiting for at least one osd prepare pod to be running or finished'
+    sleep 5
+  done
+  pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd-prepare --output name | head -n1)"
+  kubectl --namespace rook-ceph logs --follow "$pod"
+  timeout=60
+  start_time="${SECONDS}"
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 --output custom-columns=NAME:.metadata.name,PHASE:status.phase)"
+    if echo "$pod" | grep 'Running'; then break; fi
+    echo 'waiting for OSD 0 pod to be running'
+    sleep 1
+  done
+  # getting the below logs is a best-effort attempt, so use '|| true' to allow failures
+  pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 --output name)" || true
+  kubectl --namespace rook-ceph logs "$pod" || true
+  job="$(kubectl --namespace rook-ceph get job --selector app=rook-ceph-osd-prepare --output name | head -n1)" || true
+  kubectl -n rook-ceph describe "$job" || true
+  kubectl -n rook-ceph describe deployment/rook-ceph-osd-0 || true
 }
 
 function wait_for_ceph_to_be_ready() {
@@ -217,12 +225,27 @@ function create_LV_on_disk() {
 }
 
 function deploy_first_rook_cluster() {
   BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
+  create_cluster_prerequisites
   cd cluster/examples/kubernetes/ceph/
-  kubectl create -f crds.yaml -f common.yaml -f operator.yaml
+
+  deploy_manifest_with_local_build operator.yaml
   yq w -i -d1 cluster-test.yaml spec.dashboard.enabled false
   yq w -i -d1 cluster-test.yaml spec.storage.useAllDevices false
   yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"1
-  kubectl create -f cluster-test.yaml -f toolbox.yaml
+  kubectl create -f cluster-test.yaml
+  deploy_manifest_with_local_build toolbox.yaml
+}
+
+function deploy_second_rook_cluster() {
+  BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
+  cd cluster/examples/kubernetes/ceph/
+  NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f -
+  sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml
+  yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2
+  yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external"
+  kubectl create -f cluster-test.yaml
+  yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary
+  deploy_manifest_with_local_build toolbox.yaml
 }
 
 function wait_for_rgw_pods() {
@@ -237,15 +260,32 @@ function wait_for_rgw_pods() {
 
 }
 
-function deploy_second_rook_cluster() {
-  BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
-  cd cluster/examples/kubernetes/ceph/
-  NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f -
-  sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml
-  yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2
-  yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external"
-  yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary
-  kubectl create -f cluster-test.yaml -f toolbox.yaml
+function verify_operator_log_message() {
+  local message="$1" # param 1: the message to verify exists
+  local namespace="${2:-rook-ceph}" # optional param 2: the namespace of the CephCluster (default: rook-ceph)
+  kubectl --namespace "$namespace" logs deployment/rook-ceph-operator | grep "$message"
+}
+
+function wait_for_operator_log_message() {
+  local message="$1" # param 1: the message to look for
+  local timeout="$2" # param 2: the timeout for waiting for the message to exist
+  local namespace="${3:-rook-ceph}" # optional param 3: the namespace of the CephCluster (default: rook-ceph)
+  start_time="${SECONDS}"
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    if verify_operator_log_message "$message" "$namespace"; then return 0; fi
+    sleep 5
+  done
+  echo "timed out" >&2 && return 1
+}
+
+function restart_operator () {
+  local namespace="${1:-rook-ceph}" # optional param 1: the namespace of the CephCluster (default: rook-ceph)
+  kubectl --namespace "$namespace" delete pod --selector app=rook-ceph-operator
+  # wait for new pod to be running
+  get_pod_cmd=(kubectl --namespace "$namespace" get pod --selector app=rook-ceph-operator --no-headers)
+  timeout 20 bash -c \
+    "until [[ -n \"\$(${get_pod_cmd[*]} --field-selector=status.phase=Running 2>/dev/null)\" ]] ; do echo waiting && sleep 1; done"
+  "${get_pod_cmd[@]}"
 }
 
 function write_object_to_cluster1_read_from_cluster2() {
@@ -275,7 +315,5 @@ EOF
 FUNCTION="$1"
 shift # remove function arg now that we've recorded it
 # call the function with the remainder of the user-provided args
-if ! $FUNCTION "$@"; then
-  echo "Call to $FUNCTION was not successful" >&2
-  exit 1
-fi
+# -e, -E, and -o=pipefail will ensure this script returns a failure if a part of the function fails
+$FUNCTION "$@"
diff --git a/tests/scripts/validate_cluster.sh b/tests/scripts/validate_cluster.sh
index 0b60a51feeb6..2900f2154a4b 100755
--- a/tests/scripts/validate_cluster.sh
+++ b/tests/scripts/validate_cluster.sh
@@ -39,6 +39,8 @@ function wait_for_daemon () {
     sleep 1
     let timeout=timeout-1
   done
+  echo current status:
+  $EXEC_COMMAND -s
  return 1
 }