Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix early rotation for roles with WALs, handle multiple WALs per role #28

Merged
merged 23 commits into from Sep 21, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 11 additions & 4 deletions path_rotate.go
Expand Up @@ -24,12 +24,12 @@ func (b *backend) pathRotateCredentials() []*framework.Path {
Fields: map[string]*framework.FieldSchema{},
Operations: map[logical.Operation]framework.OperationHandler{
logical.UpdateOperation: &framework.PathOperation{
Callback: b.pathRotateCredentialsUpdate,
Callback: b.pathRotateRootCredentialsUpdate,
ForwardPerformanceStandby: true,
ForwardPerformanceSecondary: true,
},
logical.CreateOperation: &framework.PathOperation{
Callback: b.pathRotateCredentialsUpdate,
Callback: b.pathRotateRootCredentialsUpdate,
ForwardPerformanceStandby: true,
ForwardPerformanceSecondary: true,
},
Expand Down Expand Up @@ -64,7 +64,7 @@ func (b *backend) pathRotateCredentials() []*framework.Path {
}
}

func (b *backend) pathRotateCredentialsUpdate(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) {
func (b *backend) pathRotateRootCredentialsUpdate(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) {
if _, hasTimeout := ctx.Deadline(); !hasTimeout {
var cancel func()
ctx, cancel = context.WithTimeout(ctx, defaultCtxTimeout)
Expand Down Expand Up @@ -149,6 +149,8 @@ func (b *backend) pathRotateRoleCredentialsUpdate(ctx context.Context, req *logi
}
} else {
item.Priority = resp.RotationTime.Add(role.StaticAccount.RotationPeriod).Unix()
// Clear any stored WAL ID as we must have successfully deleted our WAL to get here.
item.Value = ""
}

// Add their rotation to the queue. We use pushErr here to distinguish between
Expand All @@ -159,9 +161,14 @@ func (b *backend) pathRotateRoleCredentialsUpdate(ctx context.Context, req *logi
return nil, pushErr
}

if err != nil {
return nil, fmt.Errorf("unable to finish rotating credentials; retries will "+
"continue in the background but it is also safe to retry manually: %w", err)
}

// We're not returning creds here because we do not know if its been processed
// by the queue.
return nil, err
return nil, nil
}

// rollBackPassword uses naive exponential backoff to retry updating to an old password,
Expand Down
48 changes: 40 additions & 8 deletions path_static_roles.go
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"time"

"github.com/hashicorp/go-multierror"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/helper/locksutil"
"github.com/hashicorp/vault/sdk/logical"
Expand Down Expand Up @@ -146,7 +147,26 @@ func (b *backend) pathStaticRoleDelete(ctx context.Context, req *logical.Request
return nil, err
}

return nil, nil
walIDs, err := framework.ListWAL(ctx, req.Storage)
if err != nil {
return nil, err
}
var merr *multierror.Error
for _, walID := range walIDs {
wal, err := b.findStaticWAL(ctx, req.Storage, walID)
if err != nil {
merr = multierror.Append(merr, err)
continue
}
if wal != nil && name == wal.RoleName {
err = framework.DeleteWAL(ctx, req.Storage, walID)
if err != nil {
merr = multierror.Append(merr, err)
}
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we want to do the WAL delete before the role deletion, that way we have an avenue to retry the operation if the WAL cleanup fails?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried implementing this, but I think I'd prefer to leave WAL cleanup as a best effort, because although it has its own consistency problems, the other order does as well. If we delete WALs before the role and fail, we'll need to push the item back onto the queue, which itself could fail. In practice though, deleting roles and WALs both rely on the same storage layer, so at least these partial failure scenarios should be very rare.

We could address the inconsistency of leaving WALs lying around fairly robustly by keying WALs on a UUID whose lifetime is attached to a role's storage entry instead of keying WALs on the role name. That obviously wouldn't be a backport candidate though. WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, let's leave it as is for now and revisit any improvements in a subsequent PR


return nil, merr.ErrorOrNil()
}

func (b *backend) pathStaticRoleRead(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
Expand Down Expand Up @@ -221,6 +241,7 @@ func (b *backend) pathStaticRoleCreateUpdate(ctx context.Context, req *logical.R

// Only call setStaticAccountPassword if we're creating the role for the
// first time
var item *queue.Item
switch req.Operation {
case logical.CreateOperation:
// setStaticAccountPassword calls Storage.Put and saves the role to storage
Expand All @@ -229,10 +250,22 @@ func (b *backend) pathStaticRoleCreateUpdate(ctx context.Context, req *logical.R
Role: role,
})
if err != nil {
if resp != nil && resp.WALID != "" {
walDeleteErr := framework.DeleteWAL(ctx, req.Storage, resp.WALID)
if walDeleteErr != nil {
var merr *multierror.Error
merr = multierror.Append(merr, err)
merr = multierror.Append(merr, fmt.Errorf("failed to clean up WAL from failed role creation: %w", walDeleteErr))
err = merr.ErrorOrNil()
}
}
return nil, err
}
briankassouf marked this conversation as resolved.
Show resolved Hide resolved
// guard against RotationTime not being set or zero-value
lvr = resp.RotationTime
item = &queue.Item{
Key: name,
}
case logical.UpdateOperation:
// store updated Role
entry, err := logical.StorageEntryJSON(staticRolePath+name, role)
Expand All @@ -244,20 +277,19 @@ func (b *backend) pathStaticRoleCreateUpdate(ctx context.Context, req *logical.R
}

// In case this is an update, remove any previous version of the item from
// the queue

// the queue. The existing item could be tracking a WAL ID for this role,
// so it's important to keep the existing item rather than recreate it.
//TODO: Add retry logic
_, err = b.popFromRotationQueueByKey(name)
item, err = b.popFromRotationQueueByKey(name)
if err != nil {
return nil, err
}
}

item.Priority = lvr.Add(role.StaticAccount.RotationPeriod).Unix()

// Add their rotation to the queue
if err := b.pushItem(&queue.Item{
Key: name,
Priority: lvr.Add(role.StaticAccount.RotationPeriod).Unix(),
}); err != nil {
if err := b.pushItem(item); err != nil {
return nil, err
}

Expand Down
177 changes: 177 additions & 0 deletions path_static_roles_test.go
Expand Up @@ -76,6 +76,15 @@ func TestRoles(t *testing.T) {
if resp.Data["last_vault_rotation"] == nil {
t.Fatal("expected last_vault_rotation to not be empty")
}

// Assert that we cleared the WAL ID from the queue's data in the happy path.
item, err := b.credRotationQueue.PopByKey("hashicorp")
if err != nil {
t.Fatal()
}
if item.Value != "" {
t.Fatal()
}
})
t.Run("happy path with roles", func(t *testing.T) {
b, storage := getBackend(false)
Expand Down Expand Up @@ -452,3 +461,171 @@ func TestListRoles(t *testing.T) {
}
})
}

func TestWALsStillTrackedAfterUpdate(t *testing.T) {
ctx := context.Background()
b, storage := getBackend(false)
defer b.Cleanup(ctx)
configureOpenLDAPMount(t, b, storage)

_, err := b.HandleRequest(ctx, &logical.Request{
Operation: logical.CreateOperation,
Path: staticRolePath + "hashicorp",
Storage: storage,
Data: map[string]interface{}{
"username": "hashicorp",
"dn": "uid=hashicorp,ou=users,dc=hashicorp,dc=com",
"rotation_period": "5s",
},
})
if err != nil {
t.Fatal(err)
}

generateWALFromFailedRotation(t, b, storage, "hashicorp")
requireWALs(t, storage, 1)

_, err = b.HandleRequest(ctx, &logical.Request{
Operation: logical.UpdateOperation,
Path: staticRolePath + "hashicorp",
Storage: storage,
Data: map[string]interface{}{
"username": "hashicorp",
"dn": "uid=hashicorp,ou=users,dc=hashicorp,dc=com",
"rotation_period": "60s",
},
})
if err != nil {
t.Fatal(err)
}
requireWALs(t, storage, 1)

// Check we've still got track of it in the queue as well
item, err := b.credRotationQueue.PopByKey("hashicorp")
if err != nil {
t.Fatal(err)
}
if wal, ok := item.Value.(string); !ok || wal == "" {
t.Fatal("should have a WAL ID in the rotation queue")
}
}

func TestWALsDeletedOnRoleCreationFailed(t *testing.T) {
ctx := context.Background()
b, storage := getBackend(true)
defer b.Cleanup(ctx)
configureOpenLDAPMount(t, b, storage)

for i := 0; i < 3; i++ {
_, err := b.HandleRequest(ctx, &logical.Request{
Operation: logical.CreateOperation,
Path: staticRolePath + "hashicorp",
Storage: storage,
Data: map[string]interface{}{
"username": "hashicorp",
"dn": "uid=hashicorp,ou=users,dc=hashicorp,dc=com",
"rotation_period": "5s",
},
})
if err == nil {
t.Fatal("expected error from OpenLDAP")
}
}

requireWALs(t, storage, 0)
}

func TestWALsDeletedOnRoleDeletion(t *testing.T) {
ctx := context.Background()
b, storage := getBackend(false)
defer b.Cleanup(ctx)
configureOpenLDAPMount(t, b, storage)

// Create the roles
roleNames := []string{"hashicorp", "2"}
for _, roleName := range roleNames {
_, err := b.HandleRequest(ctx, &logical.Request{
Operation: logical.CreateOperation,
Path: "static-role/" + roleName,
Storage: storage,
Data: map[string]interface{}{
"username": roleName,
"dn": "uid=hashicorp,ou=users,dc=hashicorp,dc=com",
"rotation_period": "5s",
},
})
if err != nil {
t.Fatal(err)
}
}

// Fail to rotate the roles
for _, roleName := range roleNames {
generateWALFromFailedRotation(t, b, storage, roleName)
}

// Should have 2 WALs hanging around
requireWALs(t, storage, 2)

// Delete one of the static roles
_, err := b.HandleRequest(ctx, &logical.Request{
Operation: logical.DeleteOperation,
Path: "static-role/hashicorp",
Storage: storage,
})
if err != nil {
t.Fatal(err)
}

// 1 WAL should be cleared by the delete
requireWALs(t, storage, 1)
}

func configureOpenLDAPMount(t *testing.T, b *backend, storage logical.Storage) {
t.Helper()
resp, err := b.HandleRequest(context.Background(), &logical.Request{
Operation: logical.CreateOperation,
Path: configPath,
Storage: storage,
Data: map[string]interface{}{
"binddn": "tester",
"bindpass": "pa$$w0rd",
"url": "ldap://138.91.247.105",
"certificate": validCertificate,
},
})
if err != nil || (resp != nil && resp.IsError()) {
t.Fatalf("err:%s resp:%#v\n", err, resp)
}
}

func generateWALFromFailedRotation(t *testing.T, b *backend, storage logical.Storage, roleName string) {
t.Helper()
// Fail to rotate the roles
ldapClient := b.client.(*fakeLdapClient)
originalValue := ldapClient.throwErrs
ldapClient.throwErrs = true
defer func() {
ldapClient.throwErrs = originalValue
}()

_, err := b.HandleRequest(context.Background(), &logical.Request{
Operation: logical.UpdateOperation,
Path: "rotate-role/" + roleName,
Storage: storage,
})
if err == nil {
t.Fatal("expected error")
}
}

func requireWALs(t *testing.T, storage logical.Storage, count int) {
t.Helper()
wals, err := storage.List(context.Background(), "wal/")
if err != nil {
t.Fatal(err)
}
if len(wals) != count {
t.Fatal("expected WALS", count, "got", len(wals))
}
}