diff --git a/.github/workflows/canary-integration-test.yml b/.github/workflows/canary-integration-test.yml
index e92f79f35fd74..550160788dd0f 100644
--- a/.github/workflows/canary-integration-test.yml
+++ b/.github/workflows/canary-integration-test.yml
@@ -953,6 +953,19 @@ jobs:
       - name: write an object to one cluster, read from the other
         run: tests/scripts/github-action-helper.sh write_object_to_cluster1_read_from_cluster2
 
+      # if this test fails, it could mean the RGW `period get` or `period update` output has changed
+      - name: RGW configuration period should be committed on first reconcile and not be committed on second reconcile
+        run: |
+          ns_name_primary='"rook-ceph/multisite-store"' # double quotes intended
+          ns_name_secondary='"rook-ceph-secondary/zone-b-multisite-store"' # double quotes intended
+          committed_msg="committing changes to RGW configuration period for CephObjectStore"
+          tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_primary}"
+          tests/scripts/github-action-helper.sh verify_operator_log_message "${committed_msg} ${ns_name_secondary}"
+          tests/scripts/github-action-helper.sh restart_operator
+          not_committed_msg="there are no changes to commit for RGW configuration period for CephObjectStore"
+          tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_primary}" 120
+          tests/scripts/github-action-helper.sh wait_for_operator_log_message "${not_committed_msg} ${ns_name_secondary}" 90
+
       - name: upload test result
         uses: actions/upload-artifact@v2
         if: always()
diff --git a/pkg/operator/ceph/object/admin.go b/pkg/operator/ceph/object/admin.go
index 2f8e89491c668..b671832269161 100644
--- a/pkg/operator/ceph/object/admin.go
+++ b/pkg/operator/ceph/object/admin.go
@@ -318,9 +318,12 @@ func CommitConfigChanges(c *Context) error {
 	// DO NOT MODIFY nsName here. It is part of the integration test checks noted below.
 	nsName := fmt.Sprintf("%s/%s", c.clusterInfo.Namespace, c.Name)
 	if !shouldCommit {
-		logger.Debugf("not committing changes to RGW configuration period for CephObjectStore %q", nsName)
+		// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+		logger.Infof("there are no changes to commit for RGW configuration period for CephObjectStore %q", nsName)
 		return nil
 	}
+	// DO NOT MODIFY THE MESSAGE BELOW. It is checked in integration tests.
+	logger.Infof("committing changes to RGW configuration period for CephObjectStore %q", nsName)
 	// don't expect json output since we don't intend to use the output from the command
 	_, err = runAdminCommand(c, false, "period", "update", "--commit")
 	if err != nil {
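The canary step above relies on CommitConfigChanges staging a period update and
committing only when the staged period differs from the current one. A minimal
shell sketch of that decision, assuming the operator compares `radosgw-admin
period get` output against a staged `period update` (the command names come from
the workflow comment above; the comparison detail is illustrative, not the
operator's actual Go implementation):

    current="$(radosgw-admin period get)"
    staged="$(radosgw-admin period update)"  # stages changes without committing
    if [ "$current" != "$staged" ]; then
      radosgw-admin period update --commit   # operator logs "committing changes ..."
    else
      echo "no period changes staged"        # operator logs "there are no changes to commit ..."
    fi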
diff --git a/tests/scripts/github-action-helper.sh b/tests/scripts/github-action-helper.sh
index 3f5d913c0e24c..e67e826615de3 100755
--- a/tests/scripts/github-action-helper.sh
+++ b/tests/scripts/github-action-helper.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -xe
+set -xeEo pipefail
 
 #############
 # VARIABLES #
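`set -xeEo pipefail` tightens the script's failure behavior: `-x` traces each
command, `-e` exits on any command failure, `-E` makes ERR traps fire inside
functions and subshells, and `pipefail` fails a pipeline when any stage fails.
This is what lets the dispatch code at the bottom of this script call
`$FUNCTION "$@"` without an explicit error check. A standalone sketch of the
behavior (not part of the script):

    #!/usr/bin/env bash
    set -eEo pipefail
    trap 'echo "a command failed" >&2' ERR
    false | cat  # pipefail marks the pipeline failed, -e aborts, and the ERR trap fires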
@@ -145,12 +145,12 @@ function validate_yaml() {
 }
 
 function create_cluster_prerequisites() {
-  cd cluster/examples/kubernetes/ceph
-  kubectl create -f crds.yaml -f common.yaml
+  # this might be called from another function that has already done a cd
+  ( cd cluster/examples/kubernetes/ceph && kubectl create -f crds.yaml -f common.yaml )
 }
 
 function deploy_manifest_with_local_build() {
-  sed -i "s|image: rook/ceph:[0-9a-zA-Z.]*|image: rook/ceph:local-build|g" $1
+  sed -i "s|image: rook/ceph:.*|image: rook/ceph:local-build|g" $1
   kubectl create -f $1
 }
 
@@ -169,23 +169,24 @@ function deploy_cluster() {
 }
 
 function wait_for_prepare_pod() {
-  timeout 180 bash <<-'EOF'
-    while true; do
-      if [[ "$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare --field-selector=status.phase=Running)" -gt 1 ]]; then
-        break
-      fi
-      sleep 5
-    done
-    kubectl -n rook-ceph logs --follow pod/$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[0].metadata.name}')
-EOF
-  timeout 60 bash <<-'EOF'
-    until kubectl -n rook-ceph logs $(kubectl -n rook-ceph get pod -l app=rook-ceph-osd,ceph_daemon_id=0 -o jsonpath='{.items[*].metadata.name}') --all-containers || true; do
-      echo "waiting for osd container"
-      sleep 1
-    done
-EOF
-  kubectl -n rook-ceph describe job/"$(kubectl -n rook-ceph get pod -l app=rook-ceph-osd-prepare -o jsonpath='{.items[*].metadata.name}')" || true
-  kubectl -n rook-ceph describe deploy/rook-ceph-osd-0 || true
+  get_pod_cmd=(kubectl --namespace rook-ceph get pod --no-headers --output name)
+  timeout 210 bash -c \
+    "until [[ -n \"\$(${get_pod_cmd[*]} --selector app=rook-ceph-osd-prepare --field-selector=status.phase=Running | head -n1)\" ]]; do \
+      echo waiting for osd prepare pod to be running. current: ; \
+      ${get_pod_cmd[*]} --selector app=rook-ceph-osd-prepare ;\
+      sleep 5 ; \
+    done"
+  pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd-prepare | head -n1)"
+  kubectl --namespace rook-ceph logs --follow "$pod"
+  timeout 60 bash -c \
+    "until [[ -n \"\$(${get_pod_cmd[*]} --selector app=rook-ceph-osd,ceph_daemon_id=0 --field-selector=status.phase=Running | head -n1)\" ]]; do \
+      echo waiting for osd pod to be running && sleep 1; \
+    done"
+  pod="$("${get_pod_cmd[@]}" --selector app=rook-ceph-osd,ceph_daemon_id=0 | head -n1)" || true
+  kubectl --namespace rook-ceph logs "$pod" || true
+  job="$(kubectl --namespace rook-ceph get job --selector app=rook-ceph-osd-prepare --output name | head -n1)" || true
+  kubectl -n rook-ceph describe "$job" || true
+  kubectl -n rook-ceph describe deployment/rook-ceph-osd-0 || true
 }
 
 function wait_for_ceph_to_be_ready() {
@@ -217,12 +218,27 @@ function create_LV_on_disk() {
 
 function deploy_first_rook_cluster() {
   BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
+  create_cluster_prerequisites
   cd cluster/examples/kubernetes/ceph/
-  kubectl create -f crds.yaml -f common.yaml -f operator.yaml
+
+  deploy_manifest_with_local_build operator.yaml
   yq w -i -d1 cluster-test.yaml spec.dashboard.enabled false
   yq w -i -d1 cluster-test.yaml spec.storage.useAllDevices false
   yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"1
-  kubectl create -f cluster-test.yaml -f toolbox.yaml
+  kubectl create -f cluster-test.yaml
+  deploy_manifest_with_local_build toolbox.yaml
+}
+
+function deploy_second_rook_cluster() {
+  BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
+  cd cluster/examples/kubernetes/ceph/
+  NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f -
+  sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml
+  yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2
+  yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external"
+  kubectl create -f cluster-test.yaml
+  yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary
+  deploy_manifest_with_local_build toolbox.yaml
 }
 
 function wait_for_rgw_pods() {
@@ -237,15 +253,32 @@ function wait_for_rgw_pods() {
 }
 
-function deploy_second_rook_cluster() {
-  BLOCK=$(sudo lsblk|awk '/14G/ {print $1}'| head -1)
-  cd cluster/examples/kubernetes/ceph/
-  NAMESPACE=rook-ceph-secondary envsubst < common-second-cluster.yaml | kubectl create -f -
-  sed -i 's/namespace: rook-ceph/namespace: rook-ceph-secondary/g' cluster-test.yaml
-  yq w -i -d1 cluster-test.yaml spec.storage.deviceFilter "${BLOCK}"2
-  yq w -i -d1 cluster-test.yaml spec.dataDirHostPath "/var/lib/rook-external"
-  yq w -i toolbox.yaml metadata.namespace rook-ceph-secondary
-  kubectl create -f cluster-test.yaml -f toolbox.yaml
+function verify_operator_log_message() {
+  local message="$1"                # param 1: the message to verify exists
+  local namespace="${2:-rook-ceph}" # optional param 2: the namespace of the CephCluster (default: rook-ceph)
+  kubectl --namespace "$namespace" logs deployment/rook-ceph-operator | grep "$message"
+}
+
+function wait_for_operator_log_message() {
+  local message="$1"                # param 1: the message to look for
+  local timeout="$2"                # param 2: the timeout for waiting for the message to exist
+  local namespace="${3:-rook-ceph}" # optional param 3: the namespace of the CephCluster (default: rook-ceph)
+  start_time="${SECONDS}"
+  while [[ $(( SECONDS - start_time )) -lt $timeout ]]; do
+    if verify_operator_log_message "$message" "$namespace"; then return 0; fi
+    sleep 5
+  done
+  echo "timed out" >&2 && return 1
+}
+
+function restart_operator () {
+  local namespace="${1:-rook-ceph}" # optional param 1: the namespace of the CephCluster (default: rook-ceph)
+  kubectl --namespace "$namespace" delete pod --selector app=rook-ceph-operator
+  # wait for new pod to be running
+  get_pod_cmd=(kubectl --namespace "$namespace" get pod --selector app=rook-ceph-operator --no-headers)
+  timeout 20 bash -c \
+    "until [[ -n \"\$(${get_pod_cmd[*]} --field-selector=status.phase=Running 2>/dev/null)\" ]] ; do echo waiting && sleep 1; done"
+  "${get_pod_cmd[@]}"
 }
 
 function write_object_to_cluster1_read_from_cluster2() {
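For reference, the new helpers above are invoked from the canary workflow like
this (run from the repository root; the messages must match the operator log
lines added in admin.go above verbatim, including the double quotes):

    tests/scripts/github-action-helper.sh verify_operator_log_message \
      'committing changes to RGW configuration period for CephObjectStore "rook-ceph/multisite-store"'
    tests/scripts/github-action-helper.sh restart_operator
    tests/scripts/github-action-helper.sh wait_for_operator_log_message \
      'there are no changes to commit for RGW configuration period for CephObjectStore "rook-ceph/multisite-store"' 120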
"$namespace"; then return 0; fi + sleep 5 + done + echo "timed out" >&2 && return 1 +} + +function restart_operator () { + local namespace="${1:-rook-ceph}" # optional param 1: the namespace of the CephCluster (default: rook-ceph) + kubectl --namespace "$namespace" delete pod --selector app=rook-ceph=operator + # wait for new pod to be running + get_pod_cmd=(kubectl --namespace "$namespace" get pod --selector app=rook-ceph-operator --no-headers) + timeout 20 bash -c \ + "until [[ -n \"\$(${get_pod_cmd[*]} --field-selector=status.phase=Running 2>/dev/null)\" ]] ; do echo waiting && sleep 1; done" + "${get_pod_cmd[@]}" } function write_object_to_cluster1_read_from_cluster2() { @@ -275,7 +308,5 @@ EOF FUNCTION="$1" shift # remove function arg now that we've recorded it # call the function with the remainder of the user-provided args -if ! $FUNCTION "$@"; then - echo "Call to $FUNCTION was not successful" >&2 - exit 1 -fi +# -e, -E, and -o=pipefail will ensure this script returns a failure if a part of the function fails +$FUNCTION "$@" diff --git a/tests/scripts/validate_cluster.sh b/tests/scripts/validate_cluster.sh index 0b60a51feeb66..bc14afe7571fc 100755 --- a/tests/scripts/validate_cluster.sh +++ b/tests/scripts/validate_cluster.sh @@ -30,7 +30,7 @@ EXEC_COMMAND="kubectl -n rook-ceph exec $(kubectl get pod -l app=rook-ceph-tools trap display_status SIGINT ERR function wait_for_daemon () { - timeout=90 + timeout=120 daemon_to_test=$1 while [ $timeout -ne 0 ]; do if eval $daemon_to_test; then