Add support for GPU sharing on GKE (#6628) (#12733)
Signed-off-by: Modular Magician <magic-modules@google.com>

modular-magician committed Oct 6, 2022
1 parent ecdcf5f commit 626b21d
Showing 4 changed files with 64 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .changelog/6628.txt
@@ -0,0 +1,3 @@
+```release-note:enhancement
+compute: added `node_config.0.guest_accelerator.0.gpu_sharing_config` field to `google_container_node_pool` resource
+```
51 changes: 47 additions & 4 deletions google/node_config.go
@@ -93,6 +93,30 @@ func schemaNodeConfig() *schema.Schema {
                 ForceNew:    true,
                 Description: `Size of partitions to create on the GPU. Valid values are described in the NVIDIA mig user guide (https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning)`,
               },
+              "gpu_sharing_config": {
+                Type:        schema.TypeList,
+                MaxItems:    1,
+                Optional:    true,
+                ForceNew:    true,
+                ConfigMode:  schema.SchemaConfigModeAttr,
+                Description: `Configuration for GPU sharing.`,
+                Elem: &schema.Resource{
+                  Schema: map[string]*schema.Schema{
+                    "gpu_sharing_strategy": {
+                      Type:        schema.TypeString,
+                      Required:    true,
+                      ForceNew:    true,
+                      Description: `The type of GPU sharing strategy to enable on the GPU node. Possible values are described in the API package (https://pkg.go.dev/google.golang.org/api/container/v1#GPUSharingConfig)`,
+                    },
+                    "max_shared_clients_per_gpu": {
+                      Type:        schema.TypeInt,
+                      Required:    true,
+                      ForceNew:    true,
+                      Description: `The maximum number of containers that can share a GPU.`,
+                    },
+                  },
+                },
+              },
             },
           },
         },
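For orientation, this is the configuration shape the new schema accepts. A minimal sketch only, with values borrowed from the acceptance test later in this commit; the enclosing `node_config` block is implied:

```hcl
# Fragment only: this sits inside the node_config block of a
# google_container_cluster or google_container_node_pool resource.
guest_accelerator {
  type  = "nvidia-tesla-a100"
  count = 1

  gpu_sharing_config {
    gpu_sharing_strategy       = "TIME_SHARING" # see GPUSharingConfig in the container/v1 API package
    max_shared_clients_per_gpu = 2              # maximum containers sharing one GPU
  }
}
```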
@@ -370,11 +394,21 @@ func expandNodeConfig(v interface{}) *container.NodeConfig {
       if data["count"].(int) == 0 {
         continue
       }
-      guestAccelerators = append(guestAccelerators, &container.AcceleratorConfig{
+      guestAcceleratorConfig := &container.AcceleratorConfig{
         AcceleratorCount: int64(data["count"].(int)),
         AcceleratorType:  data["type"].(string),
         GpuPartitionSize: data["gpu_partition_size"].(string),
-      })
+      }
+
+      if v, ok := data["gpu_sharing_config"]; ok && len(v.([]interface{})) > 0 {
+        gpuSharingConfig := data["gpu_sharing_config"].([]interface{})[0].(map[string]interface{})
+        guestAcceleratorConfig.GpuSharingConfig = &container.GPUSharingConfig{
+          GpuSharingStrategy:     gpuSharingConfig["gpu_sharing_strategy"].(string),
+          MaxSharedClientsPerGpu: int64(gpuSharingConfig["max_shared_clients_per_gpu"].(int)),
+        }
+      }
+
+      guestAccelerators = append(guestAccelerators, guestAcceleratorConfig)
     }
     nc.Accelerators = guestAccelerators
   }
@@ -573,11 +607,20 @@ func flattenNodeConfig(c *container.NodeConfig) []map[string]interface{} {
 func flattenContainerGuestAccelerators(c []*container.AcceleratorConfig) []map[string]interface{} {
   result := []map[string]interface{}{}
   for _, accel := range c {
-    result = append(result, map[string]interface{}{
+    accelerator := map[string]interface{}{
       "count":              accel.AcceleratorCount,
       "type":               accel.AcceleratorType,
       "gpu_partition_size": accel.GpuPartitionSize,
-    })
+    }
+    if accel.GpuSharingConfig != nil {
+      accelerator["gpu_sharing_config"] = []map[string]interface{}{
+        {
+          "gpu_sharing_strategy":       accel.GpuSharingConfig.GpuSharingStrategy,
+          "max_shared_clients_per_gpu": accel.GpuSharingConfig.MaxSharedClientsPerGpu,
+        },
+      }
+    }
+    result = append(result, accelerator)
   }
   return result
 }
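On the read path, the flattened value surfaces at the list path named in the changelog entry. A hedged sketch of referencing it, reusing the `np_with_gpu` resource name from the test file below:

```hcl
# Illustrative only: the indexed path follows the
# node_config.0.guest_accelerator.0.gpu_sharing_config path from the changelog;
# the resource name comes from the acceptance test in this commit.
output "gpu_sharing_strategy" {
  value = google_container_node_pool.np_with_gpu.node_config[0].guest_accelerator[0].gpu_sharing_config[0].gpu_sharing_strategy
}
```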
4 changes: 4 additions & 0 deletions google/resource_container_node_pool_test.go
@@ -1635,6 +1635,10 @@ resource "google_container_node_pool" "np_with_gpu" {
       type               = "nvidia-tesla-a100"
       gpu_partition_size = "1g.5gb"
       count              = 1
+      gpu_sharing_config {
+        gpu_sharing_strategy       = "TIME_SHARING"
+        max_shared_clients_per_gpu = 2
+      }
     }
   }
 }
10 changes: 10 additions & 0 deletions website/docs/r/container_cluster.html.markdown
@@ -843,6 +843,16 @@ linux_node_config {
 
 * `gpu_partition_size` (Optional) - Size of partitions to create on the GPU. Valid values are described in the NVIDIA mig [user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
 
+* `gpu_sharing_config` (Optional) - Configuration for GPU sharing. Structure is [documented below](#nested_gpu_sharing_config).
+
+<a name="nested_gpu_sharing_config"></a>The `gpu_sharing_config` block supports:
+
+* `gpu_sharing_strategy` (Required) - The type of GPU sharing strategy to enable on the GPU node.
+  Accepted values are:
+  * `"TIME_SHARING"`: Allow multiple containers to have [time-shared](https://cloud.google.com/kubernetes-engine/docs/concepts/timesharing-gpus) access to a single GPU device.
+
+* `max_shared_clients_per_gpu` (Required) - The maximum number of containers that can share a GPU.
+
 <a name="nested_workload_identity_config"></a> The `workload_identity_config` block supports:
 
 * `workload_pool` (Optional) - The workload pool to attach all Kubernetes service accounts to.
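Since this page documents `google_container_cluster`, a complete cluster-level sketch may help; the name, location, and node sizing here are illustrative assumptions, not part of this commit:

```hcl
resource "google_container_cluster" "example" {
  name               = "example-gpu-cluster" # hypothetical
  location           = "us-central1-a"       # hypothetical; pick a zone with A100 capacity
  initial_node_count = 1

  node_config {
    guest_accelerator {
      type  = "nvidia-tesla-a100"
      count = 1

      # Allow up to two containers to time-share each GPU.
      gpu_sharing_config {
        gpu_sharing_strategy       = "TIME_SHARING"
        max_shared_clients_per_gpu = 2
      }
    }
  }
}
```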
