Applying dynamic taint and nvidia daemonset #64

Closed · wants to merge 9 commits
49 changes: 25 additions & 24 deletions README.md
@@ -56,7 +56,7 @@ Form input parameters for configuring a bundle for deployment.

- **`fargate`** *(object)*: AWS Fargate provides on-demand, right-sized compute capacity for running containers on EKS without managing node pools or clusters of EC2 instances.
- **`enabled`** *(boolean)*: Enables EKS Fargate. Default: `False`.
- - **`k8s_version`** *(string)*: The version of Kubernetes to run. Must be one of: `['1.22', '1.23', '1.24', '1.25', '1.26', '1.27']`. Default: `1.27`.
+ - **`k8s_version`** *(string)*: The version of Kubernetes to run. Must be one of: `['1.22', '1.23', '1.24', '1.25', '1.26', '1.27', '1.28', '1.29']`. Default: `1.29`.
- **`monitoring`** *(object)*
- **`control_plane_log_retention`** *(integer)*: Duration to retain control plane logs in AWS Cloudwatch (Note: control plane logs do not contain application or container logs). Default: `7`.
- **One of**
@@ -69,35 +69,36 @@ Form input parameters for configuring a bundle for deployment.
- **`prometheus`** *(object)*: Configuration settings for the Prometheus instances that are automatically installed into the cluster to provide monitoring capabilities.
- **`grafana_enabled`** *(boolean)*: Install Grafana into the cluster to provide a metric visualizer. Default: `False`.
- **`persistence_enabled`** *(boolean)*: This setting will enable persistence of Prometheus data via EBS volumes. However, in small clusters (fewer than 5 nodes) this can create pod scheduling and placement problems due to EBS volumes being zonally locked, and thus should be disabled. Default: `True`.
- - **`node_groups`** *(array)*
+ - **`node_groups`** *(array)*: Node groups to provision.
- **Items** *(object)*: Definition of a node group.
- **`advanced_configuration_enabled`** *(boolean)*: Default: `False`.
- **`instance_type`** *(string)*: Instance type to use in the node group.
- **One of**
- C5 High-CPU Large (2 vCPUs, 4.0 GiB)
- - C5 High-CPU Extra Large (4 vCPUs, 8.0 GiB)
- - C5 High-CPU Double Extra Large (8 vCPUs, 16.0 GiB)
- - C5 High-CPU Quadruple Extra Large (16 vCPUs, 32.0 GiB)
- - C5 High-CPU 9xlarge (36 vCPUs, 72.0 GiB)
- - C5 High-CPU 12xlarge (48 vCPUs, 96.0 GiB)
- - C5 High-CPU 18xlarge (72 vCPUs, 144.0 GiB)
- - C5 High-CPU 24xlarge (96 vCPUs, 192.0 GiB)
+ - C5 High-CPU XL (4 vCPUs, 8.0 GiB)
+ - C5 High-CPU 2XL (8 vCPUs, 16.0 GiB)
+ - C5 High-CPU 4XL (16 vCPUs, 32.0 GiB)
+ - C5 High-CPU 9XL (36 vCPUs, 72.0 GiB)
+ - C5 High-CPU 12XL (48 vCPUs, 96.0 GiB)
+ - C5 High-CPU 18XL (72 vCPUs, 144.0 GiB)
+ - C5 High-CPU 24XL (96 vCPUs, 192.0 GiB)
- M5 General Purpose Large (2 vCPUs, 8.0 GiB)
- - M5 General Purpose Extra Large (4 vCPUs, 16.0 GiB)
- - M5 General Purpose Double Extra Large (8 vCPUs, 32.0 GiB)
- - M5 General Purpose Quadruple Extra Large (16 vCPUs, 64.0 GiB)
- - M5 General Purpose Eight Extra Large (32 vCPUs, 128.0 GiB)
- - M5 General Purpose 12xlarge (48 vCPUs, 192.0 GiB)
- - M5 General Purpose 16xlarge (64 vCPUs, 256.0 GiB)
- - M5 General Purpose 24xlarge (96 vCPUs, 384.0 GiB)
+ - M5 General Purpose XL (4 vCPUs, 16.0 GiB)
+ - M5 General Purpose 2XL (8 vCPUs, 32.0 GiB)
+ - M5 General Purpose 4XL (16 vCPUs, 64.0 GiB)
+ - M5 General Purpose 8XL (32 vCPUs, 128.0 GiB)
+ - M5 General Purpose 12XL (48 vCPUs, 192.0 GiB)
+ - M5 General Purpose 16XL (64 vCPUs, 256.0 GiB)
+ - M5 General Purpose 24XL (96 vCPUs, 384.0 GiB)
- T3 Small (2 vCPUs for a 4h 48m burst, 2.0 GiB)
- T3 Medium (2 vCPUs for a 4h 48m burst, 4.0 GiB)
- T3 Large (2 vCPUs for a 7h 12m burst, 8.0 GiB)
- - T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB)
- - T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB)
- - P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB)
- - P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB)
- - P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB)
+ - T3 XL (4 vCPUs for a 9h 36m burst, 16.0 GiB)
+ - T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB)
+ - P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem)
+ - P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 244.0 GiB Mem)
+ - P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB)
+ - P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD)
- **`max_size`** *(integer)*: Maximum number of instances in the node group. Minimum: `0`. Default: `10`.
- **`min_size`** *(integer)*: Minimum number of instances in the node group. Minimum: `0`. Default: `1`.
- **`name_suffix`** *(string)*: The name of the node group. Default: ``.
@@ -114,7 +115,7 @@ Form input parameters for configuring a bundle for deployment.
"fargate": {
"enabled": false
},
"k8s_version": "1.27",
"k8s_version": "1.29",
"monitoring": {
"control_plane_log_retention": 7,
"prometheus": {
@@ -137,7 +138,7 @@ Form input parameters for configuring a bundle for deployment.
```json
{
"__name": "Development",
"k8s_version": "1.27",
"k8s_version": "1.29",
"monitoring": {
"control_plane_log_retention": 7,
"prometheus": {
@@ -159,7 +160,7 @@ Form input parameters for configuring a bundle for deployment.
```json
{
"__name": "Production",
"k8s_version": "1.27",
"k8s_version": "1.29",
"monitoring": {
"control_plane_log_retention": 365,
"prometheus": {
91 changes: 91 additions & 0 deletions core-services/nvidia_gpu.tf
@@ -0,0 +1,91 @@
locals {
  # GPU/accelerator instance families (p*, g*, trn*, inf*, dl*, f*, vt*).
  gpu_regex                  = "^(p[0-9][a-z]*|g[0-9+][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"
  has_gpu_node_groups        = length([for ng in var.node_groups : ng if length(regexall(local.gpu_regex, ng.instance_type)) > 0]) > 0
  gpu_enabled_instance_types = [for ng in var.node_groups : ng.instance_type if length(regexall(local.gpu_regex, ng.instance_type)) > 0]
}
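
For illustration (hypothetical instance types, not part of the bundle), here is how `gpu_regex` classifies a node group's `instance_type`; the same expressions can be checked in `terraform console`:

```hcl
# Sketch: classifying hypothetical instance types with gpu_regex.
locals {
  example_types = ["p3.2xlarge", "g5.xlarge", "c5.2xlarge", "t3.medium"]

  # Evaluates to ["p3.2xlarge", "g5.xlarge"]: the p* and g* accelerator
  # families match, while the c5/t3 CPU families do not, so
  # has_gpu_node_groups would be true for this list.
  example_gpu_types = [
    for t in local.example_types : t
    if length(regexall(local.gpu_regex, t)) > 0
  ]
}
```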

resource "kubernetes_daemonset" "nvidia" {
count = local.has_gpu_node_groups ? 1 : 0
metadata {
name = "nvidia-device-plugin-daemonset"
namespace = kubernetes_namespace_v1.md-core-services.metadata.0.name
labels = merge(var.md_metadata.default_tags, {
k8s-app = "nvidia-device-plugin-daemonset"
})
}
spec {
selector {
match_labels = {
name = "nvidia-device-plugin-ds"
}
}
strategy {
type = "RollingUpdate"
}
template {
metadata {
labels = merge(var.md_metadata.default_tags, {
name = "nvidia-device-plugin-ds"
})
annotations = {
"scheduler.alpha.kubernetes.io/critical-pod" : ""
}
}
spec {
priority_class_name = "system-node-critical"
affinity {
node_affinity {
required_during_scheduling_ignored_during_execution {
node_selector_term {
match_expressions {
key = "node.kubernetes.io/instance-type"
operator = "In"
values = local.gpu_enabled_instance_types
}
}
}
}
}
toleration {
key = "CriticalAddonsOnly"
operator = "Exists"
}
        toleration {
          key      = "nvidia.com/gpu"
          operator = "Exists"
          effect   = "NoSchedule"
        }

> **Contributor:** Why is this toleration needed? We aren't applying it, are we?
>
> **Contributor:** I think they are just using that as the taint in place of the `gpu=true` taint we added. We probably don't need both. We can use theirs instead.
>
> **Author:** To confirm, remove the toleration for `sku=gpu:NoSchedule` and update the dynamic taint in the node group for `nvidia.com/gpu:NoSchedule`?
        toleration {
          key      = "gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }
        container {
          name  = "nvidia-device-plugin-ctr"
          image = "nvcr.io/nvidia/k8s-device-plugin:v0.15.0"
          env {
            # Don't exit on initialization errors (e.g. driver not ready yet).
            name  = "FAIL_ON_INIT_ERROR"
            value = "false"
          }
          security_context {
            privileged = true
            capabilities {
              drop = ["all"]
            }
          }
          volume_mount {
            name       = "device-plugin"
            mount_path = "/var/lib/kubelet/device-plugins"
          }
        }
        volume {
          name = "device-plugin"
          host_path {
            path = "/var/lib/kubelet/device-plugins"
          }
        }
      }
    }
  }
}
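
For context on the tolerations above: the PR pairs this daemonset with a taint applied dynamically to GPU node groups, so ordinary pods stay off the expensive GPU nodes while the device plugin can still schedule onto them. The node-group side of that pairing is not shown in this diff; a minimal sketch, assuming a standard `aws_eks_node_group` resource with hypothetical references, would look like:

```hcl
# Hypothetical sketch only; resource names and references are illustrative.
resource "aws_eks_node_group" "gpu_example" {
  cluster_name    = aws_eks_cluster.main.name # assumed cluster resource
  node_group_name = "gpu-example"
  node_role_arn   = aws_iam_role.node.arn    # assumed IAM role
  subnet_ids      = aws_subnet.private[*].id # assumed subnets
  instance_types  = ["p3.2xlarge"]

  scaling_config {
    desired_size = 1
    max_size     = 2
    min_size     = 0
  }

  # The taint that the daemonset's nvidia.com/gpu toleration bypasses.
  taint {
    key    = "nvidia.com/gpu"
    value  = "true"
    effect = "NO_SCHEDULE"
  }
}
```

Once the plugin is running, GPU nodes advertise an `nvidia.com/gpu` allocatable resource that pods can request through resource limits.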
62 changes: 34 additions & 28 deletions massdriver.yaml
@@ -16,7 +16,7 @@ steps:
params:
examples:
- __name: Wizard
k8s_version: "1.27"
k8s_version: "1.29"
fargate:
enabled: false
node_groups:
@@ -35,7 +35,7 @@ params:
persistence_enabled: false
grafana_enabled: false
- __name: Development
k8s_version: "1.27"
k8s_version: "1.29"
node_groups:
- name_suffix: shared
instance_type: t3.medium
@@ -47,7 +47,7 @@ params:
persistence_enabled: false
grafana_enabled: false
- __name: Production
k8s_version: "1.27"
k8s_version: "1.29"
node_groups:
- name_suffix: shared
instance_type: c5.2xlarge
@@ -68,15 +68,17 @@ params:
k8s_version:
type: string
title: Kubernetes Version
- description: The version of Kubernetes to run
- default: "1.27"
+ description: The version of Kubernetes to run.
+ default: "1.29"
enum:
- "1.22"
- "1.23"
- "1.24"
- "1.25"
- "1.26"
- "1.27"
- "1.28"
mclacore marked this conversation as resolved.
Show resolved Hide resolved
- "1.29"
fargate:
type: object
title: Fargate
@@ -118,7 +120,7 @@ params:
node_groups:
type: array
title: Node Groups
- descrition: Node groups to provision
+ description: Node groups to provision
minItems: 1
items:
type: object
@@ -158,52 +160,54 @@ params:
oneOf:
- title: C5 High-CPU Large (2 vCPUs, 4.0 GiB)
const: c5.large
- - title: C5 High-CPU Extra Large (4 vCPUs, 8.0 GiB)
+ - title: C5 High-CPU XL (4 vCPUs, 8.0 GiB)
const: c5.xlarge
- - title: C5 High-CPU Double Extra Large (8 vCPUs, 16.0 GiB)
+ - title: C5 High-CPU 2XL (8 vCPUs, 16.0 GiB)
const: c5.2xlarge
- - title: C5 High-CPU Quadruple Extra Large (16 vCPUs, 32.0 GiB)
+ - title: C5 High-CPU 4XL (16 vCPUs, 32.0 GiB)
const: c5.4xlarge
- - title: C5 High-CPU 9xlarge (36 vCPUs, 72.0 GiB)
+ - title: C5 High-CPU 9XL (36 vCPUs, 72.0 GiB)
const: c5.9xlarge
- - title: C5 High-CPU 12xlarge (48 vCPUs, 96.0 GiB)
+ - title: C5 High-CPU 12XL (48 vCPUs, 96.0 GiB)
const: c5.12xlarge
- - title: C5 High-CPU 18xlarge (72 vCPUs, 144.0 GiB)
+ - title: C5 High-CPU 18XL (72 vCPUs, 144.0 GiB)
const: c5.18xlarge
- - title: C5 High-CPU 24xlarge (96 vCPUs, 192.0 GiB)
+ - title: C5 High-CPU 24XL (96 vCPUs, 192.0 GiB)
const: c5.24xlarge
- title: M5 General Purpose Large (2 vCPUs, 8.0 GiB)
const: m5.large
- - title: M5 General Purpose Extra Large (4 vCPUs, 16.0 GiB)
+ - title: M5 General Purpose XL (4 vCPUs, 16.0 GiB)
const: m5.xlarge
- - title: M5 General Purpose Double Extra Large (8 vCPUs, 32.0 GiB)
+ - title: M5 General Purpose 2XL (8 vCPUs, 32.0 GiB)
const: m5.2xlarge
- - title: M5 General Purpose Quadruple Extra Large (16 vCPUs, 64.0 GiB)
+ - title: M5 General Purpose 4XL (16 vCPUs, 64.0 GiB)
const: m5.4xlarge
- - title: M5 General Purpose Eight Extra Large (32 vCPUs, 128.0 GiB)
+ - title: M5 General Purpose 8XL (32 vCPUs, 128.0 GiB)
const: m5.8xlarge
- - title: M5 General Purpose 12xlarge (48 vCPUs, 192.0 GiB)
+ - title: M5 General Purpose 12XL (48 vCPUs, 192.0 GiB)
const: m5.12xlarge
- - title: M5 General Purpose 16xlarge (64 vCPUs, 256.0 GiB)
+ - title: M5 General Purpose 16XL (64 vCPUs, 256.0 GiB)
const: m5.16xlarge
- - title: M5 General Purpose 24xlarge (96 vCPUs, 384.0 GiB)
+ - title: M5 General Purpose 24XL (96 vCPUs, 384.0 GiB)
const: m5.24xlarge
- title: T3 Small (2 vCPUs for a 4h 48m burst, 2.0 GiB)
const: t3.small
- title: T3 Medium (2 vCPUs for a 4h 48m burst, 4.0 GiB)
const: t3.medium
- title: T3 Large (2 vCPUs for a 7h 12m burst, 8.0 GiB)
const: t3.large
- - title: T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB)
+ - title: T3 XL (4 vCPUs for a 9h 36m burst, 16.0 GiB)
const: t3.xlarge
- - title: T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB)
+ - title: T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB)
const: t3.2xlarge
- - title: P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB)
- const: p2.xlarge
- - title: P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB)
- const: p2.8xlarge
- - title: P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB)
- const: p2.16xlarge
+ - title: P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem)
+ const: p3.2xlarge
+ - title: P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 244.0 GiB Mem)
+ const: p3.8xlarge
+ - title: P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB)
+ const: p3.16xlarge
+ - title: P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD)
+ const: p3dn.24xlarge
advanced_configuration_enabled:
type: boolean
title: Advanced Configuration Enabled
@@ -372,6 +376,8 @@ ui:
- core_services
- monitoring
- "*"
+ k8s_version:
+ ui:field: versioningDropdown
node_groups:
items:
ui:order: