Add GPU/Accelerator support to VMs

jwmay2012 · jwmay2012 · commit 2358039d76e9 · 2024-12-19T11:34:59.000-05:00
diff --git a/api/v1beta1/gcpmachine_types.go b/api/v1beta1/gcpmachine_types.go
@@ -346,6 +346,25 @@ type GCPMachineSpec struct {
 	// RootDiskEncryptionKey defines the KMS key to be used to encrypt the root disk.
 	// +optional
 	RootDiskEncryptionKey *CustomerEncryptionKey `json:"rootDiskEncryptionKey,omitempty"`
+
+	// GuestAccelerators is a list of the type and count of accelerator cards
+	// attached to the instance.
+	// +optional
+	GuestAccelerators []Accelerator `json:"guestAccelerators,omitempty"`
+}
+
+// Accelerator is a specification of the type and number of accelerator
+// cards attached to the instance.
+type Accelerator struct {
+	// Count is the number of the guest accelerator cards exposed to this
+	// instance.
+	Count int64 `json:"count,omitempty"`
+	// Type is the full or partial URL of the accelerator type resource to
+	// attach to this instance. For example:
+	// projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
+	// If you are creating an instance template, specify only the accelerator name.
+	// See GPUs on Compute Engine for a full list of accelerator types.
+	Type string `json:"type,omitempty"`
 }
 
 // MetadataItem defines a single piece of metadata associated with an instance.
diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go
diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go
@@ -374,6 +374,22 @@ func (m *MachineScope) InstanceAdditionalMetadataSpec() *compute.Metadata {
 	return metadata
 }
 
+// InstanceGuestAcceleratorsSpec returns one compute.AcceleratorConfig per entry in Spec.GuestAccelerators, or nil if none are set.
+func (m *MachineScope) InstanceGuestAcceleratorsSpec() []*compute.AcceleratorConfig {
+	if len(m.GCPMachine.Spec.GuestAccelerators) == 0 {
+		return nil // no accelerators requested; callers check len() of the result
+	}
+	accelConfigs := make([]*compute.AcceleratorConfig, 0, len(m.GCPMachine.Spec.GuestAccelerators))
+	for _, accel := range m.GCPMachine.Spec.GuestAccelerators {
+		accelConfig := &compute.AcceleratorConfig{
+			AcceleratorType:  accel.Type,  // passed through unchanged from the spec
+			AcceleratorCount: accel.Count, // passed through unchanged from the spec
+		}
+		accelConfigs = append(accelConfigs, accelConfig)
+	}
+	return accelConfigs
+}
+
 // InstanceSpec returns instance spec.
 func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance {
 	instance := &compute.Instance{
@@ -457,6 +473,11 @@ func (m *MachineScope) InstanceSpec(log logr.Logger) *compute.Instance {
 	instance.Metadata = m.InstanceAdditionalMetadataSpec()
 	instance.ServiceAccounts = append(instance.ServiceAccounts, m.InstanceServiceAccountsSpec())
 	instance.NetworkInterfaces = append(instance.NetworkInterfaces, m.InstanceNetworkInterfaceSpec())
+	instance.GuestAccelerators = m.InstanceGuestAcceleratorsSpec()
+	if len(instance.GuestAccelerators) > 0 {
+		instance.Scheduling.OnHostMaintenance = "TERMINATE"
+	}
+
 	return instance
 }
 
diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachines.yaml
@@ -200,6 +200,31 @@ spec:
                 - Enabled
                 - Disabled
                 type: string
+              guestAccelerators:
+                description: |-
+                  GuestAccelerators is a list of the type and count of accelerator cards
+                  attached to the instance.
+                items:
+                  description: |-
+                    Accelerator is a specification of the type and number of accelerator
+                    cards attached to the instance.
+                  properties:
+                    count:
+                      description: |-
+                        Count is the number of the guest accelerator cards exposed to this
+                        instance.
+                      format: int64
+                      type: integer
+                    type:
+                      description: |-
+                        Type is the full or partial URL of the accelerator type resource to
+                        attach to this instance. For example:
+                        projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
+                        If you are creating an instance template, specify only the accelerator name.
+                        See GPUs on Compute Engine for a full list of accelerator types.
+                      type: string
+                  type: object
+                type: array
               image:
                 description: |-
                   Image is the full reference to a valid image to be used for this machine.
diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_gcpmachinetemplates.yaml
@@ -215,6 +215,31 @@ spec:
                         - Enabled
                         - Disabled
                         type: string
+                      guestAccelerators:
+                        description: |-
+                          GuestAccelerators is a list of the type and count of accelerator cards
+                          attached to the instance.
+                        items:
+                          description: |-
+                            Accelerator is a specification of the type and number of accelerator
+                            cards attached to the instance.
+                          properties:
+                            count:
+                              description: |-
+                                Count is the number of the guest accelerator cards exposed to this
+                                instance.
+                              format: int64
+                              type: integer
+                            type:
+                              description: |-
+                                Type is the full or partial URL of the accelerator type resource to
+                                attach to this instance. For example:
+                                projects/my-project/zones/us-central1-c/acceleratorTypes/nvidia-tesla-p100
+                                If you are creating an instance template, specify only the accelerator name.
+                                See GPUs on Compute Engine for a full list of accelerator types.
+                              type: string
+                          type: object
+                        type: array
                       image:
                         description: |-
                           Image is the full reference to a valid image to be used for this machine.
diff --git a/docs/book/src/topics/gpus.md b/docs/book/src/topics/gpus.md
@@ -0,0 +1,26 @@
+# GPUs
+
+Add GPUs via the `guestAccelerators` field in `GCPMachineTemplate`.
+
+```yaml
+---
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: GCPMachineTemplate
+metadata:
+  name: mygcpmachinetemplate
+  namespace: mynamespace
+spec:
+  template:
+    spec:
+      image: projects/myproject/global/images/myimage
+      instanceType: n1-standard-2
+      guestAccelerators:
+      - type: projects/myproject/zones/us-central1-c/acceleratorTypes/nvidia-tesla-t4
+        count: 1
+```
+
+See [GPUs on Compute Engine](https://cloud.google.com/compute/docs/gpus) for the available accelerator types.
+
+NOTE: Instances with accelerators/GPUs do NOT support live migration.
+Therefore, when `guestAccelerators` is set, the instance's `onHostMaintenance` policy is always set to `TERMINATE`.
+See https://cloud.google.com/compute/docs/instances/setting-vm-host-options for details.