rgw: add integration test for committing period

Add to the RGW multisite integration test a verification that the RGW period is committed on the first reconcile and not committed on the second reconcile. Do this in the multisite test so that we verify that this works for both the primary and secondary multi-site clusters.

To add this test, the github-action-helper.sh script had to be modified to:
1. actually deploy the version of Rook under test
2. adjust how functions are called so as not to lose the `-e` in a subshell
3. fix the wait_for_prepare_pod helper, which had a failure in the middle of its operation that didn't cause failures in the past

Signed-off-by: Blaine Gardner <blaine.gardner@redhat.com>
rook · Oct 11, 2021 · 9564308 · 9564308
1 parent eadcd75
commit 9564308
Show file tree

Hide file tree

Showing 5 changed files with 94 additions and 38 deletions.
diff --git a/.github/workflows/canary-integration-test.yml b/.github/workflows/canary-integration-test.yml
@@ -953,6 +953,19 @@ jobs:
     - name: write an object to one cluster, read from the other
       run: tests/scripts/github-action-helper.sh write_object_to_cluster1_read_from_cluster2
 
+    # if this test fails, it could mean the RGW `period get` or `period update` output has changed
+    - name: RGW configuration period should be committed on first reconcile and not be committed on second reconcile
+      run: |
+        ns_name_primary='"rook-ceph/multisite-store"' # double quotes intended
+        ns_name_secondary='"rook-ceph-secondary/zone-b-multisite-store"' # double quotes intended
+        committed_msg="committing changes to RGW configuration period for CephObjectStore"
+        tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_primary}"
+        tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_secondary}"
+        tests/scripts/github-action-helper.sh restart_operator
+        not_committed_msg="there are no changes to commit for RGW configuration period for CephObjectStore"
+        tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_primary}" 120
+        tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_secondary}" 90
+
     - name: upload test result
       uses: actions/upload-artifact@v2
       if: always()

diff --git a/pkg/operator/ceph/object/admin.go b/pkg/operator/ceph/object/admin.go
@@ -320,9 +320,12 @@ func CommitConfigChanges(c *Context) error {
 	// DO NOT MODIFY nsName here. It is part of the integration test checks noted below.
 	nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name)
 	if !shouldCommit {
-		logger.Debugf("not committing changes to RGW configuration period for CephObjectStore %q", nsName)
+		// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+		logger.Infof("there are no changes to commit for RGW configuration period for CephObjectStore %q", nsName)
 		return nil
 	}
+	// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+	logger.Infof("committing changes to RGW configuration period for CephObjectStore %q", nsName)
 	// don't expect json output since we don't intend to use the output from the command
 	_, err = runAdminCommand(c, false, "period", "update", "--commit")
 	if err != nil {

diff --git a/pkg/operator/ceph/object/controller_test.go b/pkg/operator/ceph/object/controller_test.go
@@ -685,7 +685,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) {
 		context:             c,
 		objectStoreContexts: make(map[string]*objectStoreHealth),
 		recorder:            k8sutil.NewEventReporter(record.NewFakeRecorder(5)),
-		opManagerContext:    context.TODO(),
+		opManagerContext:    ctx,
 	}
 
 	_, err := r.context.Clientset.CoreV1().Secrets(namespace).Create(ctx, secret, metav1.CreateOptions{})
@@ -707,7 +707,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) {
 		assert.NoError(t, err)
 		assert.False(t, res.Requeue)
 		assert.True(t, calledCommitConfigChanges)
-		err = r.client.Get(context.TODO(), req.NamespacedName, objectStore)
+		err = r.client.Get(ctx, req.NamespacedName, objectStore)
 		assert.NoError(t, err)
 	})
 
@@ -723,7 +723,7 @@ func TestCephObjectStoreControllerMultisite(t *testing.T) {
 			return &dependents.DependentList{}, nil
 		}
 
-		err = r.client.Get(context.TODO(), req.NamespacedName, objectStore)
+		err = r.client.Get(ctx, req.NamespacedName, objectStore)
 		assert.NoError(t, err)
 		objectStore.DeletionTimestamp = &metav1.Time{
 			Time: time.Now(),

diff --git a/tests/scripts/github-action-helper.sh b/tests/scripts/github-action-helper.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -xe
+set -xeEo pipefail
 
 #############
 # VARIABLES #
@@ -145,12 +145,12 @@ function validate_yaml() {
 }
 
 function create_cluster_prerequisites() {
-  cd cluster/examples/kubernetes/ceph
-  kubectl create -f crds.yaml -f common.yaml
+  # this might be called from another function that has already done a cd
+  ( cd cluster/examples/kubernetes/ceph && kubectl create -f crds.yaml -f common.yaml )
 }
 
 function deploy_manifest_with_local_build() {
-  sed -i "s|image: rook/ceph:[0-9a-zA-Z.]*|image: rook/ceph:local-build|g" $1
+  sed -i "s|image: rook/ceph:.*|image: rook/ceph:local-build|g" $1
   kubectl create -f $1
 }
 
@@ -169,23 +169,31 @@ function deploy_cluster() {
 }
 
 function wait_for_prepare_pod() {
-  timeout 180 bash <<-'EOF'
-    while true; do
-      if [[ "$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare --field-selector=status.phase=Running)" -gt 1 ]]; then
-        break
-      fi
-      sleep 5
-    done
-    kubectl -n rook-ceph logs --follow pod/$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[0].metadata.name}')
-EOF
-  timeout 60 bash <<-'EOF'
-  until kubectl -n rook-ceph logs $(kubectl -n rook-ceph get pod -l app=rook-ceph-osd,ceph_daemon_id=0 -o jsonpath='{.items[*].metadata.name}') --all-containers || true; do
-    echo "waiting for osd container"
+  get_pod_cmd=(kubectl --namespace rook-ceph get pod --no-headers)
+  timeout=450
+  start_time="${SECONDS}"
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    pods="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd-prepare --output custom-columns=NAME:.metadata.name,PHASE:status.phase)"
+    if echo "$pods" | grep 'Running\|Succeeded\|Failed'; then break; fi
+    echo 'waiting for at least one osd prepare pod to be running or finished'
+    sleep 5
+  done
+  pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd-prepare --output name | head -n1)"
+  kubectl --namespace rook-ceph logs --follow "$pod"
+  timeout=60
+  start_time="${SECONDS}"
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 --output custom-columns=NAME:.metadata.name,PHASE:status.phase)"
+    if echo "$pod" | grep 'Running'; then break; fi
+    echo 'waiting for OSD 0 pod to be running'
     sleep 1
   done
-EOF
-  kubectl -n rook-ceph describe job/"$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[*].metadata.name}')" || true
-  kubectl -n rook-ceph describe deploy/rook-ceph-osd-0 || true
+  # getting the below logs is a best-effort attempt, so use '|| true' to allow failures
+  pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 --output name)" || true
+  kubectl --namespace rook-ceph logs "$pod" || true
+  job="$(kubectl --namespace rook-ceph get job --selector app=rook-ceph-osd-prepare --output name | head -n1)" || true
+  kubectl -n rook-ceph describe "$job" || true
+  kubectl -n rook-ceph describe deployment/rook-ceph-osd-0 || true
 }
 
 function wait_for_ceph_to_be_ready() {
@@ -217,12 +225,27 @@ function create_LV_on_disk() {
 
 function deploy_first_rook_cluster() {
   BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
+  create_cluster_prerequisites
   cd cluster/examples/kubernetes/ceph/
-  kubectl create -f crds.yaml -f common.yaml -f operator.yaml
+
+  deploy_manifest_with_local_build operator.yaml
   yq w -i -d1 cluster-test.yaml spec.dashboard.enabled false
   yq w -i -d1 cluster-test.yaml spec.storage.useAllDevices false
   yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"1
-  kubectl create -f cluster-test.yaml -f toolbox.yaml
+  kubectl create -f cluster-test.yaml
+  deploy_manifest_with_local_build toolbox.yaml
+}
+
+function deploy_second_rook_cluster() {
+  BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
+  cd cluster/examples/kubernetes/ceph/
+  NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f -
+  sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml
+  yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2
+  yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external"
+  kubectl create -f cluster-test.yaml
+  yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary
+  deploy_manifest_with_local_build toolbox.yaml
 }
 
 function wait_for_rgw_pods() {
@@ -237,15 +260,32 @@ function wait_for_rgw_pods() {
 
 }
 
-function deploy_second_rook_cluster() {
-  BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
-  cd cluster/examples/kubernetes/ceph/
-  NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f -
-  sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml
-  yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2
-  yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external"
-  yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary
-  kubectl create -f cluster-test.yaml -f toolbox.yaml
+function verify_operator_log_message() {
+  local message="$1"  # param 1: the literal message to verify exists (matched as a fixed string, not a regex)
+  local namespace="${2:-rook-ceph}"  # optional param 2: the namespace of the CephCluster (default: rook-ceph)
+  kubectl --namespace "$namespace" logs deployment/rook-ceph-operator | grep --fixed-strings -- "$message"
+}
+
+function wait_for_operator_log_message() {
+  local message="$1"  # param 1: the message to look for
+  local timeout="$2"  # param 2: the timeout (in seconds) for waiting for the message to exist
+  local namespace="${3:-rook-ceph}"  # optional param 3: the namespace of the CephCluster (default: rook-ceph)
+  local start_time="${SECONDS}"  # keep local so other helpers' global start_time is not clobbered
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    if verify_operator_log_message "$message" "$namespace"; then return 0; fi
+    sleep 5
+  done
+  echo "timed out" >&2 && return 1
+}
+
+function restart_operator () {
+  local namespace="${1:-rook-ceph}"  # optional param 1: the namespace of the CephCluster (default: rook-ceph)
+  kubectl --namespace "$namespace" delete pod --selector app=rook-ceph-operator
+  # wait for new pod to be running
+  get_pod_cmd=(kubectl --namespace "$namespace" get pod --selector app=rook-ceph-operator --no-headers)
+  timeout 20 bash -c \
+    "until [[ -n \"\$(${get_pod_cmd[*]} --field-selector=status.phase=Running 2>/dev/null)\" ]] ; do echo waiting && sleep 1; done"
+  "${get_pod_cmd[@]}"
 }
 
 function write_object_to_cluster1_read_from_cluster2() {
@@ -275,7 +315,5 @@ EOF
 FUNCTION="$1"
 shift # remove function arg now that we've recorded it
 # call the function with the remainder of the user-provided args
-if ! $FUNCTION "$@"; then
-  echo "Call to $FUNCTION was not successful" >&2
-  exit 1
-fi
+# -e, -E, and -o=pipefail will ensure this script returns a failure if a part of the function fails
+$FUNCTION "$@"
diff --git a/tests/scripts/validate_cluster.sh b/tests/scripts/validate_cluster.sh
@@ -39,6 +39,8 @@ function wait_for_daemon () {
     sleep 1
     let timeout=timeout-1
   done
+  echo current status:
+  $EXEC_COMMAND -s
 
   return 1
 }