From 67069f011fd3009729df554e8cf97776670407d8 Mon Sep 17 00:00:00 2001 From: Tom Proctor Date: Fri, 8 Oct 2021 11:30:04 +0100 Subject: [PATCH] agent: tolerate partial restore failure from persistent cache (#12718) * agent: tolerate partial restore failure from persistent cache * Review comments: improved consistency, test robustness, comments, assertions --- changelog/12688.txt | 7 +- changelog/12718.txt | 3 + command/agent/cache/lease_cache.go | 57 +++++++---- command/agent/cache/lease_cache_test.go | 129 ++++++++++++++++-------- 4 files changed, 131 insertions(+), 65 deletions(-) create mode 100644 changelog/12718.txt diff --git a/changelog/12688.txt b/changelog/12688.txt index 56624722b98ee..e57e56bf7f32b 100644 --- a/changelog/12688.txt +++ b/changelog/12688.txt @@ -1,4 +1,3 @@ -+```release-note:improvement -+auth/kubernetes: validate JWT against the provided role on alias look ahead operations -+``` - +```release-note:improvement +auth/kubernetes: validate JWT against the provided role on alias look ahead operations +``` diff --git a/changelog/12718.txt b/changelog/12718.txt new file mode 100644 index 0000000000000..1e786d78c569f --- /dev/null +++ b/changelog/12718.txt @@ -0,0 +1,3 @@ +```release-note:improvement +agent/cache: tolerate partial restore failure from persistent cache +``` diff --git a/command/agent/cache/lease_cache.go b/command/agent/cache/lease_cache.go index a8b2d4bd88cea..ad6d71c0c902f 100644 --- a/command/agent/cache/lease_cache.go +++ b/command/agent/cache/lease_cache.go @@ -17,6 +17,7 @@ import ( "time" hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-secure-stdlib/base62" "github.com/hashicorp/vault/api" "github.com/hashicorp/vault/command/agent/cache/cacheboltdb" @@ -969,56 +970,69 @@ func (c *LeaseCache) Flush() error { // tokens first, since restoring a lease's renewal context and watcher requires // looking up the token in the cachememdb. func (c *LeaseCache) Restore(ctx context.Context, storage *cacheboltdb.BoltStorage) error { + var errors *multierror.Error + // Process tokens first tokens, err := storage.GetByType(ctx, cacheboltdb.TokenType) if err != nil { - return err - } - if err := c.restoreTokens(tokens); err != nil { - return err + errors = multierror.Append(errors, err) + } else { + if err := c.restoreTokens(tokens); err != nil { + errors = multierror.Append(errors, err) + } } // Then process auth leases authLeases, err := storage.GetByType(ctx, cacheboltdb.AuthLeaseType) if err != nil { - return err - } - if err := c.restoreLeases(authLeases); err != nil { - return err + errors = multierror.Append(errors, err) + } else { + if err := c.restoreLeases(authLeases); err != nil { + errors = multierror.Append(errors, err) + } } // Then process secret leases secretLeases, err := storage.GetByType(ctx, cacheboltdb.SecretLeaseType) if err != nil { - return err - } - if err := c.restoreLeases(secretLeases); err != nil { - return err + errors = multierror.Append(errors, err) + } else { + if err := c.restoreLeases(secretLeases); err != nil { + errors = multierror.Append(errors, err) + } } - return nil + return errors.ErrorOrNil() } func (c *LeaseCache) restoreTokens(tokens [][]byte) error { + var errors *multierror.Error + for _, token := range tokens { newIndex, err := cachememdb.Deserialize(token) if err != nil { - return err + errors = multierror.Append(errors, err) + continue } newIndex.RenewCtxInfo = c.createCtxInfo(nil) if err := c.db.Set(newIndex); err != nil { - return err + errors = multierror.Append(errors, err) + continue } c.logger.Trace("restored token", "id", newIndex.ID) } - return nil + + return errors.ErrorOrNil() } func (c *LeaseCache) restoreLeases(leases [][]byte) error { + var errors *multierror.Error + for _, lease := range leases { newIndex, err := cachememdb.Deserialize(lease) if err != nil { - return err + errors = multierror.Append(errors, err) + continue } // Check if this lease has already expired @@ -1031,14 +1045,17 @@ func (c *LeaseCache) restoreLeases(leases [][]byte) error { } if err := c.restoreLeaseRenewCtx(newIndex); err != nil { - return err + errors = multierror.Append(errors, err) + continue } if err := c.db.Set(newIndex); err != nil { - return err + errors = multierror.Append(errors, err) + continue } c.logger.Trace("restored lease", "id", newIndex.ID, "path", newIndex.RequestPath) } - return nil + + return errors.ErrorOrNil() } // restoreLeaseRenewCtx re-creates a RenewCtx for an index object and starts diff --git a/command/agent/cache/lease_cache_test.go b/command/agent/cache/lease_cache_test.go index 32842c059d3e1..66fb56be6b202 100644 --- a/command/agent/cache/lease_cache_test.go +++ b/command/agent/cache/lease_cache_test.go @@ -16,6 +16,7 @@ import ( "github.com/go-test/deep" hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/vault/api" "github.com/hashicorp/vault/command/agent/cache/cacheboltdb" "github.com/hashicorp/vault/command/agent/cache/cachememdb" @@ -711,13 +712,20 @@ func setupBoltStorage(t *testing.T) (tempCacheDir string, boltStorage *cachebolt } func TestLeaseCache_PersistAndRestore(t *testing.T) { - // Emulate 4 responses from the api proxy. The first two use the auto-auth - // token, and the last two use another token. + // Emulate responses from the api proxy. The first two use the auto-auth + // token, and the others use another token. + // The test re-sends each request to ensure that the response is cached + // so the number of responses and cacheTests specified should always be equal. responses := []*SendResponse{ newTestSendResponse(200, `{"auth": {"client_token": "testtoken", "renewable": true, "lease_duration": 600}}`), newTestSendResponse(201, `{"lease_id": "foo", "renewable": true, "data": {"value": "foo"}, "lease_duration": 600}`), + // The auth token will get manually deleted from the bolt DB storage, causing both of the following two responses + // to be missing from the cache after a restore, because the lease is a child of the auth token. newTestSendResponse(202, `{"auth": {"client_token": "testtoken2", "renewable": true, "orphan": true, "lease_duration": 600}}`), newTestSendResponse(203, `{"lease_id": "secret2-lease", "renewable": true, "data": {"number": "two"}, "lease_duration": 600}`), + // 204 No content gets special handling - avoid. + newTestSendResponse(250, `{"auth": {"client_token": "testtoken3", "renewable": true, "orphan": true, "lease_duration": 600}}`), + newTestSendResponse(251, `{"lease_id": "secret3-lease", "renewable": true, "data": {"number": "three"}, "lease_duration": 600}`), } tempDir, boltStorage := setupBoltStorage(t) @@ -726,59 +734,82 @@ func TestLeaseCache_PersistAndRestore(t *testing.T) { lc := testNewLeaseCacheWithPersistence(t, responses, boltStorage) // Register an auto-auth token so that the token and lease requests are cached - lc.RegisterAutoAuthToken("autoauthtoken") + err := lc.RegisterAutoAuthToken("autoauthtoken") + require.NoError(t, err) cacheTests := []struct { - token string - method string - urlPath string - body string - wantStatusCode int + token string + method string + urlPath string + body string + deleteFromPersistentStore bool // If true, will be deleted from bolt DB to induce an error on restore + expectMissingAfterRestore bool // If true, the response is not expected to be present in the restored cache }{ { // Make a request. A response with a new token is returned to the // lease cache and that will be cached. - token: "autoauthtoken", - method: "GET", - urlPath: "http://example.com/v1/sample/api", - body: `{"value": "input"}`, - wantStatusCode: responses[0].Response.StatusCode, + token: "autoauthtoken", + method: "GET", + urlPath: "http://example.com/v1/sample/api", + body: `{"value": "input"}`, }, { // Modify the request a little bit to ensure the second response is // returned to the lease cache. - token: "autoauthtoken", - method: "GET", - urlPath: "http://example.com/v1/sample/api", - body: `{"value": "input_changed"}`, - wantStatusCode: responses[1].Response.StatusCode, + token: "autoauthtoken", + method: "GET", + urlPath: "http://example.com/v1/sample/api", + body: `{"value": "input_changed"}`, }, { // Simulate an approle login to get another token - method: "PUT", - urlPath: "http://example.com/v1/auth/approle/login", - body: `{"role_id": "my role", "secret_id": "my secret"}`, - wantStatusCode: responses[2].Response.StatusCode, + method: "PUT", + urlPath: "http://example.com/v1/auth/approle-expect-missing/login", + body: `{"role_id": "my role", "secret_id": "my secret"}`, + deleteFromPersistentStore: true, + expectMissingAfterRestore: true, }, { // Test caching with the token acquired from the approle login - token: "testtoken2", - method: "GET", - urlPath: "http://example.com/v1/sample2/api", - body: `{"second": "input"}`, - wantStatusCode: responses[3].Response.StatusCode, + token: "testtoken2", + method: "GET", + urlPath: "http://example.com/v1/sample-expect-missing/api", + body: `{"second": "input"}`, + // This will be missing from the restored cache because its parent token was deleted + expectMissingAfterRestore: true, + }, + { + // Simulate another approle login to get another token + method: "PUT", + urlPath: "http://example.com/v1/auth/approle/login", + body: `{"role_id": "my role", "secret_id": "my secret"}`, + }, + { + // Test caching with the token acquired from the latest approle login + token: "testtoken3", + method: "GET", + urlPath: "http://example.com/v1/sample3/api", + body: `{"third": "input"}`, }, } - for _, ct := range cacheTests { + var deleteIDs []string + for i, ct := range cacheTests { // Send once to cache sendReq := &SendRequest{ Token: ct.token, Request: httptest.NewRequest(ct.method, ct.urlPath, strings.NewReader(ct.body)), } + if ct.deleteFromPersistentStore { + deleteID, err := computeIndexID(sendReq) + require.NoError(t, err) + deleteIDs = append(deleteIDs, deleteID) + // Now reset the body after calculating the index + sendReq.Request = httptest.NewRequest(ct.method, ct.urlPath, strings.NewReader(ct.body)) + } resp, err := lc.Send(context.Background(), sendReq) require.NoError(t, err) - assert.Equal(t, resp.Response.StatusCode, ct.wantStatusCode, "expected proxied response") + assert.Equal(t, responses[i].Response.StatusCode, resp.Response.StatusCode, "expected proxied response") assert.Nil(t, resp.CacheMeta) // Send again to test cache. If this isn't cached, the response returned @@ -789,24 +820,36 @@ func TestLeaseCache_PersistAndRestore(t *testing.T) { } respCached, err := lc.Send(context.Background(), sendCacheReq) require.NoError(t, err, "failed to send request %+v", ct) - assert.Equal(t, respCached.Response.StatusCode, ct.wantStatusCode, "expected proxied response") + assert.Equal(t, responses[i].Response.StatusCode, respCached.Response.StatusCode, "expected proxied response") require.NotNil(t, respCached.CacheMeta) assert.True(t, respCached.CacheMeta.Hit) } + require.NotEmpty(t, deleteIDs) + for _, deleteID := range deleteIDs { + err = boltStorage.Delete(deleteID) + require.NoError(t, err) + } + // Now we know the cache is working, so try restoring from the persisted - // cache's storage - restoredCache := testNewLeaseCache(t, nil) + // cache's storage. Responses 3 and 4 have been cleared from the cache, so + // re-send those. + restoredCache := testNewLeaseCache(t, responses[2:4]) - err := restoredCache.Restore(context.Background(), boltStorage) - assert.NoError(t, err) + err = restoredCache.Restore(context.Background(), boltStorage) + errors, ok := err.(*multierror.Error) + require.True(t, ok) + assert.Len(t, errors.Errors, 1) + assert.Contains(t, errors.Error(), "could not find parent Token testtoken2") // Now compare before and after beforeDB, err := lc.db.GetByPrefix(cachememdb.IndexNameID) require.NoError(t, err) - assert.Len(t, beforeDB, 5) - + assert.Len(t, beforeDB, 7) for _, cachedItem := range beforeDB { + if strings.Contains(cachedItem.RequestPath, "expect-missing") { + continue + } restoredItem, err := restoredCache.db.Get(cachememdb.IndexNameID, cachedItem.ID) require.NoError(t, err) @@ -838,17 +881,21 @@ func TestLeaseCache_PersistAndRestore(t *testing.T) { assert.Len(t, afterDB, 5) // And finally send the cache requests once to make sure they're all being - // served from the restoredCache - for _, ct := range cacheTests { + // served from the restoredCache unless they were intended to be missing after restore. + for i, ct := range cacheTests { sendCacheReq := &SendRequest{ Token: ct.token, Request: httptest.NewRequest(ct.method, ct.urlPath, strings.NewReader(ct.body)), } respCached, err := restoredCache.Send(context.Background(), sendCacheReq) require.NoError(t, err, "failed to send request %+v", ct) - assert.Equal(t, respCached.Response.StatusCode, ct.wantStatusCode, "expected proxied response") - require.NotNil(t, respCached.CacheMeta) - assert.True(t, respCached.CacheMeta.Hit) + assert.Equal(t, responses[i].Response.StatusCode, respCached.Response.StatusCode, "expected proxied response") + if ct.expectMissingAfterRestore { + require.Nil(t, respCached.CacheMeta) + } else { + require.NotNil(t, respCached.CacheMeta) + assert.True(t, respCached.CacheMeta.Hit) + } } }