Add support for GPU sharing on GKE (#6628) (#12733)
Signed-off-by: Modular Magician <magic-modules@google.com>

modular-magician committed Oct 6, 2022
1 parent ecdcf5f commit 626b21d
Showing 4 changed files with 64 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .changelog/6628.txt
@@ -0,0 +1,3 @@
+```release-note:enhancement
+compute: added `node_config.0.guest_accelerator.0.gpu_sharing_config` field to `google_container_node_pool` resource
+```
51 changes: 47 additions & 4 deletions google/node_config.go
@@ -93,6 +93,30 @@ func schemaNodeConfig() *schema.Schema {
                 ForceNew:    true,
                 Description: `Size of partitions to create on the GPU. Valid values are described in the NVIDIA mig user guide (https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning)`,
               },
+              "gpu_sharing_config": {
+                Type:        schema.TypeList,
+                MaxItems:    1,
+                Optional:    true,
+                ForceNew:    true,
+                ConfigMode:  schema.SchemaConfigModeAttr,
+                Description: `Configuration for GPU sharing.`,
+                Elem: &schema.Resource{
+                  Schema: map[string]*schema.Schema{
+                    "gpu_sharing_strategy": {
+                      Type:        schema.TypeString,
+                      Required:    true,
+                      ForceNew:    true,
+                      Description: `The type of GPU sharing strategy to enable on the GPU node. Possible values are described in the API package (https://pkg.go.dev/google.golang.org/api/container/v1#GPUSharingConfig)`,
+                    },
+                    "max_shared_clients_per_gpu": {
+                      Type:        schema.TypeInt,
+                      Required:    true,
+                      ForceNew:    true,
+                      Description: `The maximum number of containers that can share a GPU.`,
+                    },
+                  },
+                },
+              },
             },
           },
         },
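For orientation, this is the configuration shape the new schema accepts. A minimal sketch only, with values borrowed from the acceptance test later in this commit; the enclosing `node_config` block is implied:

```hcl
# Fragment only: this sits inside the node_config block of a
# google_container_cluster or google_container_node_pool resource.
guest_accelerator {
  type  = "nvidia-tesla-a100"
  count = 1

  gpu_sharing_config {
    gpu_sharing_strategy       = "TIME_SHARING" # see GPUSharingConfig in the container/v1 API package
    max_shared_clients_per_gpu = 2              # maximum containers sharing one GPU
  }
}
```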
@@ -370,11 +394,21 @@ func expandNodeConfig(v interface{}) *container.NodeConfig {
       if data["count"].(int) == 0 {
         continue
       }
-      guestAccelerators = append(guestAccelerators, &container.AcceleratorConfig{
+      guestAcceleratorConfig := &container.AcceleratorConfig{
         AcceleratorCount: int64(data["count"].(int)),
         AcceleratorType:  data["type"].(string),
         GpuPartitionSize: data["gpu_partition_size"].(string),
-      })
+      }
+
+      if v, ok := data["gpu_sharing_config"]; ok && len(v.([]interface{})) > 0 {
+        gpuSharingConfig := data["gpu_sharing_config"].([]interface{})[0].(map[string]interface{})
+        guestAcceleratorConfig.GpuSharingConfig = &container.GPUSharingConfig{
+          GpuSharingStrategy:     gpuSharingConfig["gpu_sharing_strategy"].(string),
+          MaxSharedClientsPerGpu: int64(gpuSharingConfig["max_shared_clients_per_gpu"].(int)),
+        }
+      }
+
+      guestAccelerators = append(guestAccelerators, guestAcceleratorConfig)
     }
     nc.Accelerators = guestAccelerators
   }
@@ -573,11 +607,20 @@ func flattenNodeConfig(c *container.NodeConfig) []map[string]interface{} {
 func flattenContainerGuestAccelerators(c []*container.AcceleratorConfig) []map[string]interface{} {
   result := []map[string]interface{}{}
   for _, accel := range c {
-    result = append(result, map[string]interface{}{
+    accelerator := map[string]interface{}{
       "count":              accel.AcceleratorCount,
       "type":               accel.AcceleratorType,
       "gpu_partition_size": accel.GpuPartitionSize,
-    })
+    }
+    if accel.GpuSharingConfig != nil {
+      accelerator["gpu_sharing_config"] = []map[string]interface{}{
+        {
+          "gpu_sharing_strategy":       accel.GpuSharingConfig.GpuSharingStrategy,
+          "max_shared_clients_per_gpu": accel.GpuSharingConfig.MaxSharedClientsPerGpu,
+        },
+      }
+    }
+    result = append(result, accelerator)
   }
   return result
 }
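On the read path, the flattened value surfaces at the list path named in the changelog entry. A hedged sketch of referencing it, reusing the `np_with_gpu` resource name from the test file below:

```hcl
# Illustrative only: the indexed path follows the
# node_config.0.guest_accelerator.0.gpu_sharing_config path from the changelog;
# the resource name comes from the acceptance test in this commit.
output "gpu_sharing_strategy" {
  value = google_container_node_pool.np_with_gpu.node_config[0].guest_accelerator[0].gpu_sharing_config[0].gpu_sharing_strategy
}
```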
4 changes: 4 additions & 0 deletions google/resource_container_node_pool_test.go
@@ -1635,6 +1635,10 @@ resource "google_container_node_pool" "np_with_gpu" {
       type               = "nvidia-tesla-a100"
       gpu_partition_size = "1g.5gb"
       count              = 1
+      gpu_sharing_config {
+        gpu_sharing_strategy       = "TIME_SHARING"
+        max_shared_clients_per_gpu = 2
+      }
     }
   }
 }
10 changes: 10 additions & 0 deletions website/docs/r/container_cluster.html.markdown
@@ -843,6 +843,16 @@ linux_node_config {
 
 * `gpu_partition_size` (Optional) - Size of partitions to create on the GPU. Valid values are described in the NVIDIA mig [user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
 
+* `gpu_sharing_config` (Optional) - Configuration for GPU sharing. Structure is [documented below](#nested_gpu_sharing_config).
+
+<a name="nested_gpu_sharing_config"></a>The `gpu_sharing_config` block supports:
+
+* `gpu_sharing_strategy` (Required) - The type of GPU sharing strategy to enable on the GPU node.
+  Accepted values are:
+  * `"TIME_SHARING"`: Allow multiple containers to have [time-shared](https://cloud.google.com/kubernetes-engine/docs/concepts/timesharing-gpus) access to a single GPU device.
+
+* `max_shared_clients_per_gpu` (Required) - The maximum number of containers that can share a GPU.
+
 <a name="nested_workload_identity_config"></a> The `workload_identity_config` block supports:
 
 * `workload_pool` (Optional) - The workload pool to attach all Kubernetes service accounts to.
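Since this page documents `google_container_cluster`, a complete cluster-level sketch may help; the name, location, and node sizing here are illustrative assumptions, not part of this commit:

```hcl
resource "google_container_cluster" "example" {
  name               = "example-gpu-cluster" # hypothetical
  location           = "us-central1-a"       # hypothetical; pick a zone with A100 capacity
  initial_node_count = 1

  node_config {
    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 1

      # Allow up to two containers to time-share each GPU.
      gpu_sharing_config {
        gpu_sharing_strategy       = "TIME_SHARING"
        max_shared_clients_per_gpu = 2
      }
    }
  }
}
```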
