From 626b21da08fd4de038263a95545e0498e48ac95d Mon Sep 17 00:00:00 2001
From: The Magician
Date: Thu, 6 Oct 2022 10:50:41 -0700
Subject: [PATCH] Add support for GPU sharing on GKE (#6628) (#12733)

Signed-off-by: Modular Magician
Signed-off-by: Modular Magician
---
 .changelog/6628.txt                         |  3 ++
 google/node_config.go                       | 51 +++++++++++++++++--
 google/resource_container_node_pool_test.go |  4 ++
 .../docs/r/container_cluster.html.markdown  | 10 ++++
 4 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 .changelog/6628.txt

diff --git a/.changelog/6628.txt b/.changelog/6628.txt
new file mode 100644
index 0000000000..c21943cd76
--- /dev/null
+++ b/.changelog/6628.txt
@@ -0,0 +1,3 @@
+```release-note:enhancement
+compute: added `node_config.0.guest_accelerator.0.gpu_sharing_config` field to `google_container_node_pool` resource
+```
diff --git a/google/node_config.go b/google/node_config.go
index 2f10952bab..714b6fe847 100644
--- a/google/node_config.go
+++ b/google/node_config.go
@@ -93,6 +93,30 @@ func schemaNodeConfig() *schema.Schema {
 							ForceNew:    true,
 							Description: `Size of partitions to create on the GPU. Valid values are described in the NVIDIA mig user guide (https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning)`,
 						},
+						"gpu_sharing_config": {
+							Type:        schema.TypeList,
+							MaxItems:    1,
+							Optional:    true,
+							ForceNew:    true,
+							ConfigMode:  schema.SchemaConfigModeAttr,
+							Description: `Configuration for GPU sharing.`,
+							Elem: &schema.Resource{
+								Schema: map[string]*schema.Schema{
+									"gpu_sharing_strategy": {
+										Type:        schema.TypeString,
+										Required:    true,
+										ForceNew:    true,
+										Description: `The type of GPU sharing strategy to enable on the GPU node. Possible values are described in the API package (https://pkg.go.dev/google.golang.org/api/container/v1#GPUSharingConfig)`,
+									},
+									"max_shared_clients_per_gpu": {
+										Type:        schema.TypeInt,
+										Required:    true,
+										ForceNew:    true,
+										Description: `The maximum number of containers that can share a GPU.`,
+									},
+								},
+							},
+						},
 					},
 				},
 			},
@@ -370,11 +394,21 @@ func expandNodeConfig(v interface{}) *container.NodeConfig {
 			if data["count"].(int) == 0 {
 				continue
 			}
-			guestAccelerators = append(guestAccelerators, &container.AcceleratorConfig{
+			guestAcceleratorConfig := &container.AcceleratorConfig{
 				AcceleratorCount: int64(data["count"].(int)),
 				AcceleratorType:  data["type"].(string),
 				GpuPartitionSize: data["gpu_partition_size"].(string),
-			})
+			}
+
+			if v, ok := data["gpu_sharing_config"]; ok && len(v.([]interface{})) > 0 {
+				gpuSharingConfig := data["gpu_sharing_config"].([]interface{})[0].(map[string]interface{})
+				guestAcceleratorConfig.GpuSharingConfig = &container.GPUSharingConfig{
+					GpuSharingStrategy:     gpuSharingConfig["gpu_sharing_strategy"].(string),
+					MaxSharedClientsPerGpu: int64(gpuSharingConfig["max_shared_clients_per_gpu"].(int)),
+				}
+			}
+
+			guestAccelerators = append(guestAccelerators, guestAcceleratorConfig)
 		}
 		nc.Accelerators = guestAccelerators
 	}
@@ -573,11 +607,20 @@ func flattenNodeConfig(c *container.NodeConfig) []map[string]interface{} {
 func flattenContainerGuestAccelerators(c []*container.AcceleratorConfig) []map[string]interface{} {
 	result := []map[string]interface{}{}
 	for _, accel := range c {
-		result = append(result, map[string]interface{}{
+		accelerator := map[string]interface{}{
 			"count":              accel.AcceleratorCount,
 			"type":               accel.AcceleratorType,
 			"gpu_partition_size": accel.GpuPartitionSize,
-		})
+		}
+		if accel.GpuSharingConfig != nil {
+			accelerator["gpu_sharing_config"] = []map[string]interface{}{
+				{
+					"gpu_sharing_strategy":       accel.GpuSharingConfig.GpuSharingStrategy,
+					"max_shared_clients_per_gpu": accel.GpuSharingConfig.MaxSharedClientsPerGpu,
+				},
+			}
+		}
+		result = append(result, accelerator)
 	}
 	return result
 }
diff --git a/google/resource_container_node_pool_test.go b/google/resource_container_node_pool_test.go
index 9e3b44ab00..f22f655d0e 100644
--- a/google/resource_container_node_pool_test.go
+++ b/google/resource_container_node_pool_test.go
@@ -1635,6 +1635,10 @@ resource "google_container_node_pool" "np_with_gpu" {
       type               = "nvidia-tesla-a100"
      gpu_partition_size = "1g.5gb"
       count              = 1
+      gpu_sharing_config {
+        gpu_sharing_strategy       = "TIME_SHARING"
+        max_shared_clients_per_gpu = 2
+      }
     }
   }
 }
diff --git a/website/docs/r/container_cluster.html.markdown b/website/docs/r/container_cluster.html.markdown
index 956a15f1cc..4dc0e3a516 100755
--- a/website/docs/r/container_cluster.html.markdown
+++ b/website/docs/r/container_cluster.html.markdown
@@ -843,6 +843,16 @@ linux_node_config {
 
 * `gpu_partition_size` (Optional) - Size of partitions to create on the GPU. Valid values are described in the NVIDIA mig [user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
 
+* `gpu_sharing_config` (Optional) - Configuration for GPU sharing. Structure is [documented below](#nested_gpu_sharing_config).
+
+The `gpu_sharing_config` block supports:
+
+* `gpu_sharing_strategy` (Required) - The type of GPU sharing strategy to enable on the GPU node.
+  Accepted values are:
+  * `"TIME_SHARING"`: Allow multiple containers to have [time-shared](https://cloud.google.com/kubernetes-engine/docs/concepts/timesharing-gpus) access to a single GPU device.
+
+* `max_shared_clients_per_gpu` (Required) - The maximum number of containers that can share a GPU.
+
 The `workload_identity_config` block supports:
 
 * `workload_pool` (Optional) - The workload pool to attach all Kubernetes service accounts to.
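
For reviewers, a minimal usage sketch of the new `gpu_sharing_config` block is below. It is not part of the patch: the node pool name, cluster reference, location, and machine type are illustrative assumptions, while the accelerator values mirror the acceptance test config added above.

```hcl
# Usage sketch only (not part of the patch). Names, cluster reference, location,
# and machine type are assumptions; accelerator settings mirror the acceptance test.
resource "google_container_node_pool" "gpu_sharing_example" {
  name     = "example-gpu-sharing-pool"
  cluster  = google_container_cluster.example.name
  location = "us-central1-c"

  node_config {
    machine_type = "a2-highgpu-1g"

    guest_accelerator {
      type               = "nvidia-tesla-a100"
      gpu_partition_size = "1g.5gb"
      count              = 1

      # New in this patch: allow up to two containers to time-share the GPU.
      gpu_sharing_config {
        gpu_sharing_strategy       = "TIME_SHARING"
        max_shared_clients_per_gpu = 2
      }
    }
  }
}
```

Note that every field in `gpu_sharing_config` is marked `ForceNew` in the schema above, so changing the sharing strategy or the client limit recreates the node pool rather than updating it in place.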