Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

agent: tolerate partial restore failure from persistent cache #12718

Merged
merged 4 commits into from Oct 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 3 additions & 4 deletions changelog/12688.txt
@@ -1,4 +1,3 @@
+```release-note:improvement
+auth/kubernetes: validate JWT against the provided role on alias look ahead operations
+```
Comment on lines -1 to -3
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixing a small syntax error in the previous changelog


```release-note:improvement
auth/kubernetes: validate JWT against the provided role on alias look ahead operations
```
3 changes: 3 additions & 0 deletions changelog/12718.txt
@@ -0,0 +1,3 @@
```release-note:improvement
agent/cache: tolerate partial restore failure from persistent cache
```
57 changes: 37 additions & 20 deletions command/agent/cache/lease_cache.go
Expand Up @@ -17,6 +17,7 @@ import (
"time"

hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/go-secure-stdlib/base62"
"github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/command/agent/cache/cacheboltdb"
Expand Down Expand Up @@ -969,56 +970,69 @@ func (c *LeaseCache) Flush() error {
// tokens first, since restoring a lease's renewal context and watcher requires
// looking up the token in the cachememdb.
func (c *LeaseCache) Restore(ctx context.Context, storage *cacheboltdb.BoltStorage) error {
var errors *multierror.Error
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could be worth looking into down the road for how we define custom errors.
https://github.com/uber-go/guide/blob/master/style.md#error-types

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which bit of the guide are you referring to? The only bit I see that refers to multiple errors is wrapping, and that's not appropriate here because the errors we're collecting at this level are "peers" rather than forming part of a stack. In the case that there are errors, we want to log a message about what failed and optionally continue (if exit_on_err is false in the cache config).


// Process tokens first
tokens, err := storage.GetByType(ctx, cacheboltdb.TokenType)
if err != nil {
return err
}
if err := c.restoreTokens(tokens); err != nil {
return err
errors = multierror.Append(errors, err)
} else {
if err := c.restoreTokens(tokens); err != nil {
errors = multierror.Append(errors, err)
}
}

// Then process auth leases
authLeases, err := storage.GetByType(ctx, cacheboltdb.AuthLeaseType)
if err != nil {
return err
}
if err := c.restoreLeases(authLeases); err != nil {
return err
errors = multierror.Append(errors, err)
} else {
if err := c.restoreLeases(authLeases); err != nil {
errors = multierror.Append(errors, err)
}
}

// Then process secret leases
secretLeases, err := storage.GetByType(ctx, cacheboltdb.SecretLeaseType)
if err != nil {
return err
}
if err := c.restoreLeases(secretLeases); err != nil {
return err
errors = multierror.Append(errors, err)
} else {
if err := c.restoreLeases(secretLeases); err != nil {
errors = multierror.Append(errors, err)
}
}

return nil
return errors.ErrorOrNil()
}

func (c *LeaseCache) restoreTokens(tokens [][]byte) error {
var errors *multierror.Error

for _, token := range tokens {
newIndex, err := cachememdb.Deserialize(token)
if err != nil {
return err
errors = multierror.Append(errors, err)
continue
}
newIndex.RenewCtxInfo = c.createCtxInfo(nil)
if err := c.db.Set(newIndex); err != nil {
return err
errors = multierror.Append(errors, err)
continue
}
c.logger.Trace("restored token", "id", newIndex.ID)
}
return nil

return errors.ErrorOrNil()
}

func (c *LeaseCache) restoreLeases(leases [][]byte) error {
var errors *multierror.Error

for _, lease := range leases {
newIndex, err := cachememdb.Deserialize(lease)
if err != nil {
return err
errors = multierror.Append(errors, err)
continue
}

// Check if this lease has already expired
Expand All @@ -1031,14 +1045,17 @@ func (c *LeaseCache) restoreLeases(leases [][]byte) error {
}

if err := c.restoreLeaseRenewCtx(newIndex); err != nil {
return err
errors = multierror.Append(errors, err)
continue
}
if err := c.db.Set(newIndex); err != nil {
return err
errors = multierror.Append(errors, err)
continue
}
c.logger.Trace("restored lease", "id", newIndex.ID, "path", newIndex.RequestPath)
}
return nil

tomhjp marked this conversation as resolved.
Show resolved Hide resolved
return errors.ErrorOrNil()
}

// restoreLeaseRenewCtx re-creates a RenewCtx for an index object and starts
Expand Down
129 changes: 88 additions & 41 deletions command/agent/cache/lease_cache_test.go
Expand Up @@ -16,6 +16,7 @@ import (

"github.com/go-test/deep"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/command/agent/cache/cacheboltdb"
"github.com/hashicorp/vault/command/agent/cache/cachememdb"
Expand Down Expand Up @@ -711,13 +712,20 @@ func setupBoltStorage(t *testing.T) (tempCacheDir string, boltStorage *cachebolt
}

func TestLeaseCache_PersistAndRestore(t *testing.T) {
// Emulate 4 responses from the api proxy. The first two use the auto-auth
// token, and the last two use another token.
// Emulate responses from the api proxy. The first two use the auto-auth
// token, and the others use another token.
tomhjp marked this conversation as resolved.
Show resolved Hide resolved
// The test re-sends each request to ensure that the response is cached
// so the number of responses and cacheTests specified should always be equal.
responses := []*SendResponse{
newTestSendResponse(200, `{"auth": {"client_token": "testtoken", "renewable": true, "lease_duration": 600}}`),
newTestSendResponse(201, `{"lease_id": "foo", "renewable": true, "data": {"value": "foo"}, "lease_duration": 600}`),
// The auth token will get manually deleted from the bolt DB storage, causing both of the following two responses
// to be missing from the cache after a restore, because the lease is a child of the auth token.
newTestSendResponse(202, `{"auth": {"client_token": "testtoken2", "renewable": true, "orphan": true, "lease_duration": 600}}`),
newTestSendResponse(203, `{"lease_id": "secret2-lease", "renewable": true, "data": {"number": "two"}, "lease_duration": 600}`),
// 204 No content gets special handling - avoid.
newTestSendResponse(250, `{"auth": {"client_token": "testtoken3", "renewable": true, "orphan": true, "lease_duration": 600}}`),
newTestSendResponse(251, `{"lease_id": "secret3-lease", "renewable": true, "data": {"number": "three"}, "lease_duration": 600}`),
}

tempDir, boltStorage := setupBoltStorage(t)
Expand All @@ -726,59 +734,82 @@ func TestLeaseCache_PersistAndRestore(t *testing.T) {
lc := testNewLeaseCacheWithPersistence(t, responses, boltStorage)

// Register an auto-auth token so that the token and lease requests are cached
lc.RegisterAutoAuthToken("autoauthtoken")
err := lc.RegisterAutoAuthToken("autoauthtoken")
require.NoError(t, err)

cacheTests := []struct {
token string
method string
urlPath string
body string
wantStatusCode int
token string
method string
urlPath string
body string
deleteFromPersistentStore bool // If true, will be deleted from bolt DB to induce an error on restore
expectMissingAfterRestore bool // If true, the response is not expected to be present in the restored cache
}{
{
// Make a request. A response with a new token is returned to the
// lease cache and that will be cached.
token: "autoauthtoken",
method: "GET",
urlPath: "http://example.com/v1/sample/api",
body: `{"value": "input"}`,
wantStatusCode: responses[0].Response.StatusCode,
token: "autoauthtoken",
method: "GET",
urlPath: "http://example.com/v1/sample/api",
body: `{"value": "input"}`,
},
{
// Modify the request a little bit to ensure the second response is
// returned to the lease cache.
token: "autoauthtoken",
method: "GET",
urlPath: "http://example.com/v1/sample/api",
body: `{"value": "input_changed"}`,
wantStatusCode: responses[1].Response.StatusCode,
token: "autoauthtoken",
method: "GET",
urlPath: "http://example.com/v1/sample/api",
body: `{"value": "input_changed"}`,
},
{
// Simulate an approle login to get another token
method: "PUT",
urlPath: "http://example.com/v1/auth/approle/login",
body: `{"role_id": "my role", "secret_id": "my secret"}`,
wantStatusCode: responses[2].Response.StatusCode,
method: "PUT",
urlPath: "http://example.com/v1/auth/approle-expect-missing/login",
body: `{"role_id": "my role", "secret_id": "my secret"}`,
deleteFromPersistentStore: true,
expectMissingAfterRestore: true,
},
{
// Test caching with the token acquired from the approle login
token: "testtoken2",
method: "GET",
urlPath: "http://example.com/v1/sample2/api",
body: `{"second": "input"}`,
wantStatusCode: responses[3].Response.StatusCode,
token: "testtoken2",
method: "GET",
urlPath: "http://example.com/v1/sample-expect-missing/api",
body: `{"second": "input"}`,
// This will be missing from the restored cache because its parent token was deleted
expectMissingAfterRestore: true,
},
{
// Simulate another approle login to get another token
method: "PUT",
urlPath: "http://example.com/v1/auth/approle/login",
body: `{"role_id": "my role", "secret_id": "my secret"}`,
},
{
// Test caching with the token acquired from the latest approle login
token: "testtoken3",
method: "GET",
urlPath: "http://example.com/v1/sample3/api",
body: `{"third": "input"}`,
},
}

for _, ct := range cacheTests {
var deleteIDs []string
for i, ct := range cacheTests {
// Send once to cache
sendReq := &SendRequest{
Token: ct.token,
Request: httptest.NewRequest(ct.method, ct.urlPath, strings.NewReader(ct.body)),
}
if ct.deleteFromPersistentStore {
deleteID, err := computeIndexID(sendReq)
require.NoError(t, err)
deleteIDs = append(deleteIDs, deleteID)
// Now reset the body after calculating the index
sendReq.Request = httptest.NewRequest(ct.method, ct.urlPath, strings.NewReader(ct.body))
}
resp, err := lc.Send(context.Background(), sendReq)
require.NoError(t, err)
assert.Equal(t, resp.Response.StatusCode, ct.wantStatusCode, "expected proxied response")
assert.Equal(t, responses[i].Response.StatusCode, resp.Response.StatusCode, "expected proxied response")
assert.Nil(t, resp.CacheMeta)

// Send again to test cache. If this isn't cached, the response returned
Expand All @@ -789,24 +820,36 @@ func TestLeaseCache_PersistAndRestore(t *testing.T) {
}
respCached, err := lc.Send(context.Background(), sendCacheReq)
require.NoError(t, err, "failed to send request %+v", ct)
assert.Equal(t, respCached.Response.StatusCode, ct.wantStatusCode, "expected proxied response")
assert.Equal(t, responses[i].Response.StatusCode, respCached.Response.StatusCode, "expected proxied response")
require.NotNil(t, respCached.CacheMeta)
assert.True(t, respCached.CacheMeta.Hit)
}

require.NotEmpty(t, deleteIDs)
for _, deleteID := range deleteIDs {
err = boltStorage.Delete(deleteID)
require.NoError(t, err)
}

// Now we know the cache is working, so try restoring from the persisted
// cache's storage
restoredCache := testNewLeaseCache(t, nil)
// cache's storage. Responses 3 and 4 have been cleared from the cache, so
// re-send those.
restoredCache := testNewLeaseCache(t, responses[2:4])

err := restoredCache.Restore(context.Background(), boltStorage)
assert.NoError(t, err)
err = restoredCache.Restore(context.Background(), boltStorage)
errors, ok := err.(*multierror.Error)
require.True(t, ok)
assert.Len(t, errors.Errors, 1)
assert.Contains(t, errors.Error(), "could not find parent Token testtoken2")

// Now compare before and after
beforeDB, err := lc.db.GetByPrefix(cachememdb.IndexNameID)
require.NoError(t, err)
assert.Len(t, beforeDB, 5)

assert.Len(t, beforeDB, 7)
for _, cachedItem := range beforeDB {
if strings.Contains(cachedItem.RequestPath, "expect-missing") {
continue
}
restoredItem, err := restoredCache.db.Get(cachememdb.IndexNameID, cachedItem.ID)
require.NoError(t, err)

Expand Down Expand Up @@ -838,17 +881,21 @@ func TestLeaseCache_PersistAndRestore(t *testing.T) {
assert.Len(t, afterDB, 5)

// And finally send the cache requests once to make sure they're all being
// served from the restoredCache
for _, ct := range cacheTests {
// served from the restoredCache unless they were intended to be missing after restore.
for i, ct := range cacheTests {
sendCacheReq := &SendRequest{
Token: ct.token,
Request: httptest.NewRequest(ct.method, ct.urlPath, strings.NewReader(ct.body)),
}
respCached, err := restoredCache.Send(context.Background(), sendCacheReq)
require.NoError(t, err, "failed to send request %+v", ct)
assert.Equal(t, respCached.Response.StatusCode, ct.wantStatusCode, "expected proxied response")
require.NotNil(t, respCached.CacheMeta)
assert.True(t, respCached.CacheMeta.Hit)
assert.Equal(t, responses[i].Response.StatusCode, respCached.Response.StatusCode, "expected proxied response")
if ct.expectMissingAfterRestore {
require.Nil(t, respCached.CacheMeta)
} else {
require.NotNil(t, respCached.CacheMeta)
assert.True(t, respCached.CacheMeta.Hit)
}
}
}

Expand Down