diff --git a/.changelog/6748.txt b/.changelog/6748.txt
new file mode 100644
index 0000000000..1803b8a73b
--- /dev/null
+++ b/.changelog/6748.txt
@@ -0,0 +1,3 @@
+```release-note:enhancement
+container: Added support for concurrent node pool mutations on a cluster. Previously, node pool mutations were serialized client-side. NOTE: While this feature is supported in Terraform from this release onwards, only a limited number of GCP projects will support this behavior initially. The provider will automatically process mutations concurrently as the feature rolls out generally.
+```
diff --git a/google/mutexkv.go b/google/mutexkv.go
index e797104264..cd0c53a4c4 100644
--- a/google/mutexkv.go
+++ b/google/mutexkv.go
@@ -13,7 +13,7 @@ import (
 // their access to individual security groups based on SG ID.
 type MutexKV struct {
 	lock  sync.Mutex
-	store map[string]*sync.Mutex
+	store map[string]*sync.RWMutex
 }
 
 // Locks the mutex for the given key. Caller is responsible for calling Unlock
@@ -31,13 +31,28 @@ func (m *MutexKV) Unlock(key string) {
 	log.Printf("[DEBUG] Unlocked %q", key)
 }
 
+// Acquires a read-lock on the mutex for the given key. Caller is responsible for calling RUnlock
+// for the same key
+func (m *MutexKV) RLock(key string) {
+	log.Printf("[DEBUG] RLocking %q", key)
+	m.get(key).RLock()
+	log.Printf("[DEBUG] RLocked %q", key)
+}
+
+// Releases a read-lock on the mutex for the given key. Caller must have called RLock for the same key first
+func (m *MutexKV) RUnlock(key string) {
+	log.Printf("[DEBUG] RUnlocking %q", key)
+	m.get(key).RUnlock()
+	log.Printf("[DEBUG] RUnlocked %q", key)
+}
+
 // Returns a mutex for the given key, no guarantee of its lock status
-func (m *MutexKV) get(key string) *sync.Mutex {
+func (m *MutexKV) get(key string) *sync.RWMutex {
 	m.lock.Lock()
 	defer m.lock.Unlock()
 	mutex, ok := m.store[key]
 	if !ok {
-		mutex = &sync.Mutex{}
+		mutex = &sync.RWMutex{}
 		m.store[key] = mutex
 	}
 	return mutex
@@ -46,6 +61,6 @@ func (m *MutexKV) get(key string) *sync.Mutex {
 // Returns a properly initialized MutexKV
 func NewMutexKV() *MutexKV {
 	return &MutexKV{
-		store: make(map[string]*sync.Mutex),
+		store: make(map[string]*sync.RWMutex),
 	}
 }
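For orientation, a minimal sketch (not part of this change) of how the switch from `sync.Mutex` to `sync.RWMutex` is intended to be used: node pool mutations share a read-lock on the cluster key and take an exclusive lock on their own node pool key, while cluster-level mutations are assumed to keep taking the exclusive `Lock` on the cluster key (that code lives elsewhere in the provider and is not part of this diff). The key strings below are illustrative, and the sketch assumes the `MutexKV` defined above is compiled into the same package.

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	mkv := NewMutexKV()                                            // MutexKV as defined above
	clusterKey := "projects/p/locations/us-central1-a/clusters/c" // illustrative key

	var wg sync.WaitGroup
	for _, np := range []string{"np1", "np2"} {
		np := np
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Node pool mutation: shared lock on the cluster, exclusive lock on
			// this node pool. Both goroutines can hold the read-lock at once.
			mkv.RLock(clusterKey)
			defer mkv.RUnlock(clusterKey)
			mkv.Lock(clusterKey + "/nodePools/" + np)
			defer mkv.Unlock(clusterKey + "/nodePools/" + np)
			fmt.Println("mutating node pool", np)
		}()
	}
	wg.Wait()

	// Cluster-level mutation: exclusive lock, so it waits for any in-flight
	// node pool mutations (readers) and blocks new ones while it runs.
	mkv.Lock(clusterKey)
	fmt.Println("mutating cluster")
	mkv.Unlock(clusterKey)
}
```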
diff --git a/google/resource_container_node_pool.go b/google/resource_container_node_pool.go
index 6a4b511dd0..395ac5c299 100644
--- a/google/resource_container_node_pool.go
+++ b/google/resource_container_node_pool.go
@@ -364,11 +364,21 @@ func (nodePoolInformation *NodePoolInformation) parent() string {
 	)
 }
 
-func (nodePoolInformation *NodePoolInformation) lockKey() string {
+func (nodePoolInformation *NodePoolInformation) clusterLockKey() string {
 	return containerClusterMutexKey(nodePoolInformation.project,
 		nodePoolInformation.location, nodePoolInformation.cluster)
 }
 
+func (nodePoolInformation *NodePoolInformation) nodePoolLockKey(nodePoolName string) string {
+	return fmt.Sprintf(
+		"projects/%s/locations/%s/clusters/%s/nodePools/%s",
+		nodePoolInformation.project,
+		nodePoolInformation.location,
+		nodePoolInformation.cluster,
+		nodePoolName,
+	)
+}
+
 func extractNodePoolInformation(d *schema.ResourceData, config *Config) (*NodePoolInformation, error) {
 	cluster := d.Get("cluster").(string)
 
@@ -416,8 +426,15 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
 		return err
 	}
 
-	mutexKV.Lock(nodePoolInfo.lockKey())
-	defer mutexKV.Unlock(nodePoolInfo.lockKey())
+	// Acquire read-lock on cluster.
+	clusterLockKey := nodePoolInfo.clusterLockKey()
+	mutexKV.RLock(clusterLockKey)
+	defer mutexKV.RUnlock(clusterLockKey)
+
+	// Acquire write-lock on node pool.
+	npLockKey := nodePoolInfo.nodePoolLockKey(nodePool.Name)
+	mutexKV.Lock(npLockKey)
+	defer mutexKV.Unlock(npLockKey)
 
 	req := &container.CreateNodePoolRequest{
 		NodePool: nodePool,
@@ -501,12 +518,6 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
 		return err
 	}
 
-	//Check cluster is in running state
-	_, err = containerClusterAwaitRestingState(config, nodePoolInfo.project, nodePoolInfo.location, nodePoolInfo.cluster, userAgent, d.Timeout(schema.TimeoutCreate))
-	if err != nil {
-		return err
-	}
-
 	state, err := containerNodePoolAwaitRestingState(config, d.Id(), nodePoolInfo.project, userAgent, d.Timeout(schema.TimeoutCreate))
 	if err != nil {
 		return err
@@ -591,12 +602,6 @@ func resourceContainerNodePoolUpdate(d *schema.ResourceData, meta interface{}) e
 	}
 	name := getNodePoolName(d.Id())
 
-	//Check cluster is in running state
-	_, err = containerClusterAwaitRestingState(config, nodePoolInfo.project, nodePoolInfo.location, nodePoolInfo.cluster, userAgent, d.Timeout(schema.TimeoutCreate))
-	if err != nil {
-		return err
-	}
-
 	_, err = containerNodePoolAwaitRestingState(config, nodePoolInfo.fullyQualifiedName(name), nodePoolInfo.project, userAgent, d.Timeout(schema.TimeoutUpdate))
 	if err != nil {
 		return err
@@ -635,16 +640,6 @@ func resourceContainerNodePoolDelete(d *schema.ResourceData, meta interface{}) e
 
 	name := getNodePoolName(d.Id())
 
-	//Check cluster is in running state
-	_, err = containerClusterAwaitRestingState(config, nodePoolInfo.project, nodePoolInfo.location, nodePoolInfo.cluster, userAgent, d.Timeout(schema.TimeoutCreate))
-	if err != nil {
-		if isGoogleApiErrorWithCode(err, 404) {
-			log.Printf("[INFO] GKE cluster %s doesn't exist, skipping node pool %s deletion", nodePoolInfo.cluster, d.Id())
-			return nil
-		}
-		return err
-	}
-
 	_, err = containerNodePoolAwaitRestingState(config, nodePoolInfo.fullyQualifiedName(name), nodePoolInfo.project, userAgent, d.Timeout(schema.TimeoutDelete))
 	if err != nil {
 		// If the node pool doesn't get created and then we try to delete it, we get an error,
@@ -657,8 +652,15 @@ func resourceContainerNodePoolDelete(d *schema.ResourceData, meta interface{}) e
 		}
 	}
 
-	mutexKV.Lock(nodePoolInfo.lockKey())
-	defer mutexKV.Unlock(nodePoolInfo.lockKey())
+	// Acquire read-lock on cluster.
+	clusterLockKey := nodePoolInfo.clusterLockKey()
+	mutexKV.RLock(clusterLockKey)
+	defer mutexKV.RUnlock(clusterLockKey)
+
+	// Acquire write-lock on node pool.
+	npLockKey := nodePoolInfo.nodePoolLockKey(name)
+	mutexKV.Lock(npLockKey)
+	defer mutexKV.Unlock(npLockKey)
 
 	timeout := d.Timeout(schema.TimeoutDelete)
 	startTime := time.Now()
@@ -1075,13 +1077,19 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 	config := meta.(*Config)
 	name := d.Get(prefix + "name").(string)
 
-	lockKey := nodePoolInfo.lockKey()
-
 	userAgent, err := generateUserAgentString(d, config.userAgent)
 	if err != nil {
 		return err
 	}
 
+	// Acquire read-lock on cluster.
+	clusterLockKey := nodePoolInfo.clusterLockKey()
+	mutexKV.RLock(clusterLockKey)
+	defer mutexKV.RUnlock(clusterLockKey)
+
+	// The node pool write-lock will be acquired when each update function is called.
+	npLockKey := nodePoolInfo.nodePoolLockKey(name)
+
 	if d.HasChange(prefix + "autoscaling") {
 		update := &container.ClusterUpdate{
 			DesiredNodePoolId: name,
@@ -1124,11 +1132,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated autoscaling in Node Pool %s", d.Id())
 	}
 
@@ -1164,8 +1170,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -1219,11 +1224,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated tags for node pool %s", name)
 	}
 
@@ -1258,7 +1261,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 		}
 
 		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -1290,11 +1293,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated image type in Node Pool %s", d.Id())
 	}
 
@@ -1326,11 +1327,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated workload_metadata_config for node pool %s", name)
 	}
 
@@ -1358,12 +1357,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				nodePoolInfo.location, "updating GKE node pool size", userAgent,
 				timeout)
 		}
-
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] GKE node pool %s size has been updated to %d", name, newSize)
 	}
 
@@ -1396,11 +1392,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				nodePoolInfo.location, "updating GKE node pool management", userAgent, timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated management in Node Pool %s", name)
 	}
 
@@ -1425,12 +1419,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				nodePoolInfo.project, nodePoolInfo.location, "updating GKE node pool version", userAgent, timeout)
 		}
-
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated version in Node Pool %s", name)
 	}
 
@@ -1453,11 +1444,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 			return containerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "updating GKE node pool node locations", userAgent, timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated node locations in Node Pool %s", name)
 	}
 
@@ -1534,12 +1523,9 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 			// Wait until it's updated
 			return containerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "updating GKE node pool upgrade settings", userAgent, timeout)
 		}
-
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
-
 		log.Printf("[INFO] Updated upgrade settings in Node Pool %s", name)
 	}
 
@@ -1568,8 +1554,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				timeout)
 		}
 
-		// Call update serially.
-		if err := lockedCall(lockKey, updateF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
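Condensing the hunks above: every node pool create, update, and delete now follows the same locking order. The sketch below summarizes that pattern with a hypothetical wrapper name (`nodePoolMutationPattern` does not exist in the change); it assumes the `mutexKV`, `NodePoolInformation`, and `retryWhileIncompatibleOperation` symbols used throughout this diff.

```go
// Illustrative summary of the locking pattern applied in this file; not code
// from the diff itself.
func nodePoolMutationPattern(info *NodePoolInformation, name string, timeout time.Duration, op func() error) error {
	// 1. Shared (read) lock on the cluster key: concurrent node pool
	//    mutations are allowed, cluster-level mutations are excluded.
	clusterLockKey := info.clusterLockKey()
	mutexKV.RLock(clusterLockKey)
	defer mutexKV.RUnlock(clusterLockKey)

	// 2. Exclusive (write) lock on this node pool's own key, so changes to
	//    the same pool still run one at a time.
	npLockKey := info.nodePoolLockKey(name)

	// 3. The GKE API can still reject a call with FAILED_PRECONDITION while an
	//    incompatible cluster operation is in flight, so each call is retried
	//    under the node pool lock until it is admitted.
	return retryWhileIncompatibleOperation(timeout, npLockKey, op)
}
```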
diff --git a/google/resource_container_node_pool_test.go b/google/resource_container_node_pool_test.go
index 4edbef70b8..74d3f179bc 100644
--- a/google/resource_container_node_pool_test.go
+++ b/google/resource_container_node_pool_test.go
@@ -973,6 +973,48 @@ func TestAccContainerNodePool_shieldedInstanceConfig(t *testing.T) {
 	})
 }
 
+func TestAccContainerNodePool_concurrent(t *testing.T) {
+	t.Parallel()
+
+	cluster := fmt.Sprintf("tf-test-cluster-%s", randString(t, 10))
+	np1 := fmt.Sprintf("tf-test-nodepool-%s", randString(t, 10))
+	np2 := fmt.Sprintf("tf-test-nodepool-%s", randString(t, 10))
+
+	vcrTest(t, resource.TestCase{
+		PreCheck:     func() { testAccPreCheck(t) },
+		Providers:    testAccProviders,
+		CheckDestroy: testAccCheckContainerNodePoolDestroyProducer(t),
+		Steps: []resource.TestStep{
+			{
+				Config: testAccContainerNodePool_concurrentCreate(cluster, np1, np2),
+			},
+			{
+				ResourceName:      "google_container_node_pool.np1",
+				ImportState:       true,
+				ImportStateVerify: true,
+			},
+			{
+				ResourceName:      "google_container_node_pool.np2",
+				ImportState:       true,
+				ImportStateVerify: true,
+			},
+			{
+				Config: testAccContainerNodePool_concurrentUpdate(cluster, np1, np2),
+			},
+			{
+				ResourceName:      "google_container_node_pool.np1",
+				ImportState:       true,
+				ImportStateVerify: true,
+			},
+			{
+				ResourceName:      "google_container_node_pool.np2",
+				ImportState:       true,
+				ImportStateVerify: true,
+			},
+		},
+	})
+}
+
 func TestAccContainerNodePool_gcfsConfig(t *testing.T) {
 	t.Parallel()
 
@@ -2212,3 +2254,53 @@ resource "google_container_node_pool" "np" {
 }
 `, cluster, np)
 }
+
+func testAccContainerNodePool_concurrentCreate(cluster, np1, np2 string) string {
+	return fmt.Sprintf(`
+resource "google_container_cluster" "cluster" {
+  name               = "%s"
+  location           = "us-central1-a"
+  initial_node_count = 3
+}
+
+resource "google_container_node_pool" "np1" {
+  name               = "%s"
+  location           = "us-central1-a"
+  cluster            = google_container_cluster.cluster.name
+  initial_node_count = 2
+}
+
+resource "google_container_node_pool" "np2" {
+  name               = "%s"
+  location           = "us-central1-a"
+  cluster            = google_container_cluster.cluster.name
+  initial_node_count = 2
+}
+`, cluster, np1, np2)
+}
+
+func testAccContainerNodePool_concurrentUpdate(cluster, np1, np2 string) string {
+	return fmt.Sprintf(`
+resource "google_container_cluster" "cluster" {
+  name               = "%s"
+  location           = "us-central1-a"
+  initial_node_count = 3
+}
+
+resource "google_container_node_pool" "np1" {
+  name               = "%s"
+  location           = "us-central1-a"
+  cluster            = google_container_cluster.cluster.name
+  initial_node_count = 2
+  version            = "1.23.13-gke.900"
+}
+
+resource "google_container_node_pool" "np2" {
+  name               = "%s"
+  location           = "us-central1-a"
+  cluster            = google_container_cluster.cluster.name
+  initial_node_count = 2
+  version            = "1.23.13-gke.900"
+}
+`, cluster, np1, np2)
+}
diff --git a/google/utils.go b/google/utils.go
index c04c93bcf6..14d92b562f 100644
--- a/google/utils.go
+++ b/google/utils.go
@@ -13,6 +13,7 @@ import (
 	"time"
 
 	"github.com/hashicorp/errwrap"
+	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource"
 	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
 	"github.com/hashicorp/terraform-plugin-sdk/v2/terraform"
 	"google.golang.org/api/googleapi"
"google_container_node_pool" "np2" { + name = "%s" + location = "us-central1-a" + cluster = google_container_cluster.cluster.name + initial_node_count = 2 + } +`, cluster, np1, np2) +} + +func testAccContainerNodePool_concurrentUpdate(cluster, np1, np2 string) string { + return fmt.Sprintf(` +resource "google_container_cluster" "cluster" { + name = "%s" + location = "us-central1-a" + initial_node_count = 3 +} + +resource "google_container_node_pool" "np1" { + name = "%s" + location = "us-central1-a" + cluster = google_container_cluster.cluster.name + initial_node_count = 2 + version = "1.23.13-gke.900" +} + +resource "google_container_node_pool" "np2" { + name = "%s" + location = "us-central1-a" + cluster = google_container_cluster.cluster.name + initial_node_count = 2 + version = "1.23.13-gke.900" + } +`, cluster, np1, np2) +} diff --git a/google/utils.go b/google/utils.go index c04c93bcf6..14d92b562f 100644 --- a/google/utils.go +++ b/google/utils.go @@ -13,6 +13,7 @@ import ( "time" "github.com/hashicorp/errwrap" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" "github.com/hashicorp/terraform-plugin-sdk/v2/terraform" "google.golang.org/api/googleapi" @@ -573,3 +574,19 @@ func checkGoogleIamPolicy(value string) error { } return nil } + +// Retries an operation while the canonical error code is FAILED_PRECONDTION +// which indicates there is an incompatible operation already running on the +// cluster. This error can be safely retried until the incompatible operation +// completes, and the newly requested operation can begin. +func retryWhileIncompatibleOperation(timeout time.Duration, lockKey string, f func() error) error { + return resource.Retry(timeout, func() *resource.RetryError { + if err := lockedCall(lockKey, f); err != nil { + if isFailedPreconditionError(err) { + return resource.RetryableError(err) + } + return resource.NonRetryableError(err) + } + return nil + }) +}