From d2dbcd0520eb1a63818a3da5d13bb62b2fe38bef Mon Sep 17 00:00:00 2001 From: Charith Ellawala Date: Wed, 14 Jun 2023 13:17:57 +0100 Subject: [PATCH] chore(ci): Downgrade telepresence (#1641) The latest spate of E2E failures with random timeouts seems to be related to the new release of Telepresence. This PR pins the version of Telepresence to the last-known good version. This PR also includes a change to dump pod logs on test failure to help with debugging. Signed-off-by: Charith Ellawala --- .github/workflows/e2e.yaml | 5 +++-- e2e/run.sh | 2 +- internal/server/tests.go | 12 +++++++++--- internal/test/e2e/setup.go | 13 +++++++++++++ internal/test/e2e/tests.go | 16 +++++++++++++--- 5 files changed, 39 insertions(+), 9 deletions(-) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 738af9ab8..a2e26a7b4 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -21,10 +21,10 @@ jobs: - name: Install Helmfile and Telepresence run: | mkdir bin - curl -fL https://github.com/helmfile/helmfile/releases/download/v0.153.1/helmfile_0.153.1_linux_amd64.tar.gz -o bin/helmfile.tar.gz + curl -fL https://github.com/helmfile/helmfile/releases/download/v0.154.0/helmfile_0.154.0_linux_amd64.tar.gz -o bin/helmfile.tar.gz tar -xf bin/helmfile.tar.gz -C bin chmod +x bin/helmfile - curl -fL https://app.getambassador.io/download/tel2/linux/amd64/latest/telepresence -o bin/telepresence + curl -fL https://ambassador-labs.gateway.scarf.sh/telepresenceio/telepresence/releases/download/v2.13.2/telepresence-linux-amd64 -o bin/telepresence chmod +x bin/telepresence echo "$(pwd)/bin" >> $GITHUB_PATH mkdir -p ~/.config/telepresence @@ -33,6 +33,7 @@ jobs: helm: 60s trafficManagerAPI: 30s EOF + go install github.com/stern/stern@latest - name: Initialize Helmfile run: helmfile init --force diff --git a/e2e/run.sh b/e2e/run.sh index e5a7ab516..73007c875 100755 --- a/e2e/run.sh +++ b/e2e/run.sh @@ -42,5 +42,5 @@ if [[ "$#" -gt "0" ]]; then # E.g. 
e2e/run.sh ./mysql/... -args -run-id=xxxxx -no-cleanup run_tests "$@" else - run_tests ./... -args -no-cleanup="$E2E_NO_CLEANUP" -command-timeout=4m + run_tests ./... -args -no-cleanup="$E2E_NO_CLEANUP" -command-timeout=5m fi diff --git a/internal/server/tests.go b/internal/server/tests.go index d5bfd364b..0e7cdd0fa 100644 --- a/internal/server/tests.go +++ b/internal/server/tests.go @@ -133,6 +133,12 @@ func (tr *TestRunner) executeGRPCTestCase(grpcConn *grpc.ClientConn, tc *private ctx, cancelFunc := context.WithTimeout(context.Background(), tr.Timeout) defer cancelFunc() + backoffConf := backoff.WithContext( + backoff.WithMaxRetries( + backoff.NewConstantBackOff(time.Millisecond*retryBackoffDelay), + tr.CerbosClientMaxRetries), + ctx) + switch call := tc.CallKind.(type) { case *privatev1.ServerTestCase_CheckResourceSet: cerbosClient := svcv1.NewCerbosServiceClient(grpcConn) @@ -140,21 +146,21 @@ func (tr *TestRunner) executeGRPCTestCase(grpcConn *grpc.ClientConn, tc *private err = backoff.Retry(func() error { have, err = cerbosClient.CheckResourceSet(ctx, call.CheckResourceSet.Input) return err - }, backoff.WithMaxRetries(backoff.NewConstantBackOff(time.Millisecond*retryBackoffDelay), tr.CerbosClientMaxRetries)) + }, backoffConf) case *privatev1.ServerTestCase_CheckResourceBatch: cerbosClient := svcv1.NewCerbosServiceClient(grpcConn) want = call.CheckResourceBatch.WantResponse err = backoff.Retry(func() error { have, err = cerbosClient.CheckResourceBatch(ctx, call.CheckResourceBatch.Input) return err - }, backoff.WithMaxRetries(backoff.NewConstantBackOff(time.Millisecond*retryBackoffDelay), tr.CerbosClientMaxRetries)) + }, backoffConf) case *privatev1.ServerTestCase_CheckResources: cerbosClient := svcv1.NewCerbosServiceClient(grpcConn) want = call.CheckResources.WantResponse err = backoff.Retry(func() error { have, err = cerbosClient.CheckResources(ctx, call.CheckResources.Input) return err - }, 
backoff.WithMaxRetries(backoff.NewConstantBackOff(time.Millisecond*retryBackoffDelay), tr.CerbosClientMaxRetries)) + }, backoffConf) case *privatev1.ServerTestCase_PlaygroundValidate: playgroundClient := svcv1.NewCerbosPlaygroundServiceClient(grpcConn) want = call.PlaygroundValidate.WantResponse diff --git a/internal/test/e2e/setup.go b/internal/test/e2e/setup.go index 06b1889e3..6d42e027d 100644 --- a/internal/test/e2e/setup.go +++ b/internal/test/e2e/setup.go @@ -36,6 +36,14 @@ func Teardown(ctx Ctx) error { } func Cmd(ctx Ctx, name string, args ...string) error { + return execCmd(ctx, false, name, args...) +} + +func CmdWithOutput(ctx Ctx, name string, args ...string) error { + return execCmd(ctx, true, name, args...) +} + +func execCmd(ctx Ctx, showOutput bool, name string, args ...string) error { c := cmd.NewCmd(name, args...) c.Env = ctx.Environ() @@ -47,6 +55,9 @@ func Cmd(ctx Ctx, name string, args ...string) error { select { case done := <-status: if done.Complete && done.Error == nil && done.Exit == 0 { + if showOutput { + dumpOutput(ctx, done) + } return nil } @@ -83,9 +94,11 @@ func checkCerbosIsUp(ctx Ctx) func() error { ctx.Logf("Checking whether Cerbos is up") resp, err := client.Get(healthURL) if err != nil { + ctx.Logf("Error during healthcheck: %v", err) return err } if resp.StatusCode != http.StatusOK { + ctx.Logf("Received health status: %q", resp.Status) return fmt.Errorf("received status %q", resp.Status) } diff --git a/internal/test/e2e/tests.go b/internal/test/e2e/tests.go index fd48f94c9..dbda9e52b 100644 --- a/internal/test/e2e/tests.go +++ b/internal/test/e2e/tests.go @@ -7,21 +7,24 @@ package e2e import ( "crypto/tls" + "fmt" "testing" "time" - "github.com/cerbos/cerbos/client" - "github.com/cerbos/cerbos/internal/server" "github.com/stretchr/testify/require" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + + "github.com/cerbos/cerbos/client" + 
"github.com/cerbos/cerbos/internal/server" ) const ( AdminSuite = "admin" ChecksSuite = "checks" PlanResourcesSuite = "plan_resources" + testTimeout = 90 * time.Second // Things are slower inside Kind ) type Opt func(*suiteOpt) @@ -101,6 +104,13 @@ func RunSuites(t *testing.T, opts ...Opt) { } require.NoError(t, Setup(ctx)) + t.Cleanup(func() { + if t.Failed() { + if err := CmdWithOutput(ctx, "stern", ".*", fmt.Sprintf("--namespace=%s", ctx.Namespace()), "--no-follow"); err != nil { + t.Logf("Failed to grab logs: %v", err) + } + } + }) if sopt.postSetup != nil { ctx.Logf("Running PostSetup function") @@ -109,7 +119,7 @@ func RunSuites(t *testing.T, opts ...Opt) { } tr := server.LoadTestCases(t, sopt.suites...) - tr.Timeout = 30 * time.Second // Things are slower inside Kind + tr.Timeout = testTimeout if sopt.overlayMaxRetries != 0 { tr.WithCerbosClientRetries(sopt.overlayMaxRetries)