Applying dynamic taint and nvidia daemonset #64

Closed · wants to merge 9 commits (diff shown from 7 commits)
19 changes: 10 additions & 9 deletions README.md
@@ -56,7 +56,7 @@ Form input parameters for configuring a bundle for deployment.

- **`fargate`** *(object)*: AWS Fargate provides on-demand, right-sized compute capacity for running containers on EKS without managing node pools or clusters of EC2 instances.
- **`enabled`** *(boolean)*: Enables EKS Fargate. Default: `False`.
- - **`k8s_version`** *(string)*: The version of Kubernetes to run. Must be one of: `['1.22', '1.23', '1.24', '1.25', '1.26', '1.27']`. Default: `1.27`.
+ - **`k8s_version`** *(string)*: The version of Kubernetes to run. Must be one of: `['1.22', '1.23', '1.24', '1.25', '1.26', '1.27', '1.28', '1.29']`. Default: `1.29`.
- **`monitoring`** *(object)*
- **`control_plane_log_retention`** *(integer)*: Duration to retain control plane logs in AWS Cloudwatch (Note: control plane logs do not contain application or container logs). Default: `7`.
- **One of**
@@ -69,7 +69,7 @@ Form input parameters for configuring a bundle for deployment.
- **`prometheus`** *(object)*: Configuration settings for the Prometheus instances that are automatically installed into the cluster to provide monitoring capabilities.
- **`grafana_enabled`** *(boolean)*: Install Grafana into the cluster to provide a metric visualizer. Default: `False`.
- **`persistence_enabled`** *(boolean)*: This setting will enable persistence of Prometheus data via EBS volumes. However, in small clusters (fewer than 5 nodes) this can create pod scheduling and placement problems due to EBS volumes being zonally-locked, and thus should be disabled. Default: `True`.
- - **`node_groups`** *(array)*
+ - **`node_groups`** *(array)*: Node groups to provision.
- **Items** *(object)*: Definition of a node group.
- **`advanced_configuration_enabled`** *(boolean)*: Default: `False`.
- **`instance_type`** *(string)*: Instance type to use in the node group.
@@ -94,10 +94,11 @@ Form input parameters for configuring a bundle for deployment.
- T3 Medium (2 vCPUs for a 4h 48m burst, 4.0 GiB)
- T3 Large (2 vCPUs for a 7h 12m burst, 8.0 GiB)
- T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB)
- - T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB)
- - P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB)
- - P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB)
- - P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB)
+ - T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB)
+ - P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem)
+ - P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 244.0 GiB Mem)
+ - P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB)
+ - P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD)
- **`max_size`** *(integer)*: Maximum number of instances in the node group. Minimum: `0`. Default: `10`.
- **`min_size`** *(integer)*: Minimum number of instances in the node group. Minimum: `0`. Default: `1`.
- **`name_suffix`** *(string)*: The name of the node group. Default: ``.
@@ -114,7 +115,7 @@ Form input parameters for configuring a bundle for deployment.
"fargate": {
"enabled": false
},
"k8s_version": "1.27",
"k8s_version": "1.29",
"monitoring": {
"control_plane_log_retention": 7,
"prometheus": {
@@ -137,7 +138,7 @@ Form input parameters for configuring a bundle for deployment.
```json
{
"__name": "Development",
"k8s_version": "1.27",
"k8s_version": "1.29",
"monitoring": {
"control_plane_log_retention": 7,
"prometheus": {
@@ -159,7 +160,7 @@ Form input parameters for configuring a bundle for deployment.
```json
{
"__name": "Production",
"k8s_version": "1.27",
"k8s_version": "1.29",
"monitoring": {
"control_plane_log_retention": 365,
"prometheus": {
91 changes: 91 additions & 0 deletions core-services/nvidia_gpu.tf
@@ -0,0 +1,91 @@
locals {
gpu_regex = "^(p[0-9][a-z]*|g[0-9+][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"
has_gpu_node_groups = length([for ng in var.node_groups : ng if length(regexall(local.gpu_regex, ng.instance_type)) > 0]) > 0
gpu_enabled_instance_types = [for ng in var.node_groups : ng.instance_type if length(regexall(local.gpu_regex, ng.instance_type)) > 0]
}

resource "kubernetes_daemonset" "nvidia" {
count = local.has_gpu_node_groups ? 1 : 0
metadata {
name = "nvidia-device-plugin-daemonset"
namespace = kubernetes_namespace_v1.md-core-services.metadata.0.name
labels = merge(var.md_metadata.default_tags, {
k8s-app = "nvidia-device-plugin-daemonset"
})
}
spec {
selector {
match_labels = {
name = "nvidia-device-plugin-ds"
}
}
strategy {
type = "RollingUpdate"
}
template {
metadata {
labels = merge(var.md_metadata.default_tags, {
name = "nvidia-device-plugin-ds"
})
annotations = {
"scheduler.alpha.kubernetes.io/critical-pod" : ""
}
}
spec {
priority_class_name = "system-node-critical"
affinity {
node_affinity {
required_during_scheduling_ignored_during_execution {
node_selector_term {
match_expressions {
key = "node.kubernetes.io/instance-type"
operator = "In"
values = local.gpu_enabled_instance_types
}
}
}
}
}
toleration {
key = "CriticalAddonsOnly"
operator = "Exists"
}
toleration {
  key      = "nvidia.com/gpu"
  operator = "Exists"
  effect   = "NoSchedule"
}

[Review thread]
Contributor: Why is this toleration needed? We aren't applying it, are we?
Contributor: I think they are just using that as the taint in place of the gpu=true taint we added. We probably don't need both. We can use theirs instead.
Contributor Author: To confirm, remove the toleration for sku=gpu:NoSchedule and update the dynamic taint in the node group for nvidia.com/gpu:NoSchedule?
toleration {
key = "sku"
operator = "Equal"
value = "gpu"
effect = "NoSchedule"
}
container {
name = "nvidia-device-plugin-ctr"
image = "nvcr.io/nvidia/k8s-device-plugin:v0.15.0"
env {
name = "FAIL_ON_INIT_ERROR"
value = "false"
}
security_context {
privileged = true
capabilities {
drop = ["all"]
}
}
volume_mount {
name = "device-plugin"
mount_path = "/var/lib/kubelet/device-plugins"
}
}
volume {
name = "device-plugin"
host_path {
path = "/var/lib/kubelet/device-plugins"
}
}
}
}
}
}
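As a quick way to exercise the new daemonset, a throwaway pod that requests a GPU and tolerates the taints above can confirm the device plugin is advertising `nvidia.com/gpu`. This is a minimal sketch, not part of this PR; the resource name, namespace, and CUDA image are placeholders:

```hcl
# Hypothetical smoke test (not in this PR): schedules onto a GPU node if the
# device plugin is healthy, runs nvidia-smi, and exits.
resource "kubernetes_pod_v1" "gpu_smoke_test" {
  metadata {
    name      = "gpu-smoke-test" # placeholder name
    namespace = "default"
  }

  spec {
    restart_policy = "Never"

    # Mirror the tolerations used by the daemonset so the pod can land on
    # nodes tainted either way (see the review thread above).
    toleration {
      key      = "nvidia.com/gpu"
      operator = "Exists"
      effect   = "NoSchedule"
    }
    toleration {
      key      = "sku"
      operator = "Equal"
      value    = "gpu"
      effect   = "NoSchedule"
    }

    container {
      name    = "cuda"
      image   = "nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04" # placeholder image
      command = ["nvidia-smi"]

      resources {
        limits = {
          # Satisfied only if the device plugin has registered the resource.
          "nvidia.com/gpu" = "1"
        }
      }
    }
  }
}
```

If the plugin is not running, the pod stays Pending with an unsatisfiable `nvidia.com/gpu` request, which makes regressions easy to spot.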
34 changes: 20 additions & 14 deletions massdriver.yaml
@@ -16,7 +16,7 @@ steps:
params:
examples:
- __name: Wizard
k8s_version: "1.27"
k8s_version: "1.29"
fargate:
enabled: false
node_groups:
@@ -35,7 +35,7 @@ params:
persistence_enabled: false
grafana_enabled: false
- __name: Development
k8s_version: "1.27"
k8s_version: "1.29"
node_groups:
- name_suffix: shared
instance_type: t3.medium
@@ -47,7 +47,7 @@ params:
persistence_enabled: false
grafana_enabled: false
- __name: Production
k8s_version: "1.27"
k8s_version: "1.29"
node_groups:
- name_suffix: shared
instance_type: c5.2xlarge
@@ -68,15 +68,17 @@ params:
k8s_version:
type: string
title: Kubernetes Version
- description: The version of Kubernetes to run
- default: "1.27"
+ description: The version of Kubernetes to run.
+ default: "1.29"
enum:
- "1.22"
- "1.23"
- "1.24"
- "1.25"
- "1.26"
- "1.27"
- "1.28"
mclacore marked this conversation as resolved.
Show resolved Hide resolved
- "1.29"
fargate:
type: object
title: Fargate
@@ -118,7 +120,7 @@ params:
node_groups:
type: array
title: Node Groups
- descrition: Node groups to provision
+ description: Node groups to provision
minItems: 1
items:
type: object
@@ -196,14 +198,16 @@ params:
const: t3.large
- title: T3 Extra Large (4 vCPUs for a 9h 36m burst, 16.0 GiB)
const: t3.xlarge
- - title: T3 Double Extra Large (8 vCPUs for a 9h 36m burst, 32.0 GiB)
+ - title: T3 2XL (8 vCPUs for a 9h 36m burst, 32.0 GiB)
  const: t3.2xlarge
- - title: P2 General Purpose GPU Extra Large (4 vCPUs, 61.0 GiB)
-   const: p2.xlarge
- - title: P2 General Purpose GPU Eight Extra Large (32 vCPUs, 488.0 GiB)
-   const: p2.8xlarge
- - title: P2 General Purpose GPU 16xlarge (64 vCPUs, 732.0 GiB)
-   const: p2.16xlarge
+ - title: P3 2XL (1 GPU, 16 GiB GPU Mem, 8 vCPUs, 61.0 GiB Mem)
+   const: p3.2xlarge
+ - title: P3 8XL (4 GPUs, 64 GiB GPU Mem, 32 vCPUs, 244.0 GiB Mem)
+   const: p3.8xlarge
+ - title: P3 16XL (8 GPUs, 128 GiB GPU Mem, 64 vCPUs, 488.0 GiB)
+   const: p3.16xlarge
+ - title: P3dn 24XL (8 GPUs, 256 GiB GPU Mem, 96 vCPUs, 768.0 GiB, 2 x 900 NVMe SSD)
+   const: p3dn.24xlarge
advanced_configuration_enabled:
type: boolean
title: Advanced Configuration Enabled
@@ -263,7 +267,7 @@ params:
storage_class_to_efs_map:
type: array
title: Storage Classes
- description: You may optionally specify a storage class name for each EFS file system you would like to use for persistent volumes. In addition, by specifying any EFS volumes here you will limit the provisioner to only be able to create peristent volumes use the listed EFS file systems. By leaving this blank, all file systems are usable.
+ description: You may optionally specify a storage class name for each EFS file system you would like to use for persistent volumes. In addition, by specifying any EFS volumes here you will limit the provisioner to only be able to create persistent volumes using the listed EFS file systems. By leaving this blank, all file systems are usable.
default: []
items:
type: object
@@ -372,6 +376,8 @@ ui:
- core_services
- monitoring
- "*"
+ k8s_version:
+   ui:field: versioningDropdown
node_groups:
items:
ui:order:
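For orientation, the `node_groups` param above is what src/main.tf consumes as `var.node_groups`. The variable definition is not part of this diff; a rough sketch of its shape, inferred from the fields the schema and main.tf use, might look like:

```hcl
# Assumed shape only — the bundle's real variables.tf is not shown in this PR.
variable "node_groups" {
  description = "Node groups to provision"
  type = list(object({
    name_suffix                    = string
    instance_type                  = string
    min_size                       = number
    max_size                       = number
    advanced_configuration_enabled = optional(bool, false)
    # advanced_configuration (custom taints, etc.) omitted for brevity
  }))
}
```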
45 changes: 38 additions & 7 deletions src/main.tf
@@ -3,9 +3,22 @@ locals {
private_subnet_ids = [for subnet in var.vpc.data.infrastructure.private_subnets : element(split("/", subnet["arn"]), 1)]
subnet_ids = concat(local.public_subnet_ids, local.private_subnet_ids)

+ gpu_regex = "^(p[0-9][a-z]*|g[0-9+][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"
+ gpu_enabled_instance_types = { for ng in var.node_groups : ng.name_suffix => length(regexall(local.gpu_regex, ng.instance_type)) > 0 }
+ has_gpu_node_groups = contains(values(local.gpu_enabled_instance_types), true)

cluster_name = var.md_metadata.name_prefix
}

data "aws_ssm_parameter" "eks_ami" {
name = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2/recommended/image_id"
}

data "aws_ssm_parameter" "eks_gpu_ami" {
count = local.has_gpu_node_groups ? 1 : 0
name = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2-gpu/recommended/image_id"
}

resource "aws_eks_cluster" "cluster" {
name = local.cluster_name
role_arn = aws_iam_role.cluster.arn
@@ -39,13 +52,12 @@ resource "aws_eks_cluster" "cluster" {
}

resource "aws_eks_node_group" "node_group" {
- for_each        = { for ng in var.node_groups : ng.name_suffix => ng }
- node_group_name = "${local.cluster_name}-${each.value.name_suffix}"
- cluster_name    = local.cluster_name
- version         = var.k8s_version
- subnet_ids      = local.private_subnet_ids
- node_role_arn   = aws_iam_role.node.arn
- instance_types  = [each.value.instance_type]
+ for_each               = { for ng in var.node_groups : ng.name_suffix => ng }
+ node_group_name_prefix = "${local.cluster_name}-${each.value.name_suffix}"
+ cluster_name           = local.cluster_name
+ subnet_ids             = local.private_subnet_ids
+ node_role_arn          = aws_iam_role.node.arn
+ instance_types         = [each.value.instance_type]

[Review thread]
Contributor: This change is probably what is causing the recreation of all the nodes. Why are we switching from node_group_name to node_group_name_prefix?
Contributor Author: Because of node group name collision. Prior to this change, when updating the launch template and AMI, etc., I would receive an error saying the node group name already existed. By using prefix instead, the same node group name can be used but each one will be unique due to an added suffix.

launch_template {
id = aws_launch_template.nodes[each.key].id
@@ -58,6 +70,15 @@ resource "aws_eks_node_group" "node_group" {
min_size = each.value.min_size
}

dynamic "taint" {
for_each = length(regexall(local.gpu_regex, each.value.instance_type)) > 0 ? toset(["gpu"]) : toset([])
content {
key = "sku"
mclacore marked this conversation as resolved.
Show resolved Hide resolved
value = "gpu"
effect = "NO_SCHEDULE"
}
}

dynamic "taint" {
for_each = lookup(each.value, "advanced_configuration_enabled", false) ? [each.value.advanced_configuration.taint] : []
content {
@@ -86,6 +107,16 @@ resource "aws_launch_template" "nodes" {

update_default_version = true

+ image_id = local.gpu_enabled_instance_types[each.key] ? data.aws_ssm_parameter.eks_gpu_ami[0].value : data.aws_ssm_parameter.eks_ami.value

+ user_data = base64encode(
+   <<EOF
+ #!/bin/bash
+ set -o xtrace
+ /etc/eks/bootstrap.sh ${local.cluster_name} --kubelet-extra-args '--node-labels=node.kubernetes.io/instancegroup=${each.key}'
+ EOF
+ )

[Review thread]
Contributor: Is anything else needed in this file? Did you check to see what this file looked like on a default node before this change?
Contributor Author: I did look and it's a massive file. I'll paste the contents of it in here after deploying a main branch cluster.

metadata_options {
http_endpoint = "enabled"
http_tokens = "required"
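To make the GPU wiring concrete end to end: the same `gpu_regex` drives the node-group taint, the AMI choice, and the daemonset's node affinity. Below is a standalone sketch with hypothetical node groups (names and instance types invented for illustration) showing how the classification evaluates:

```hcl
# Standalone illustration of the classification logic in src/main.tf.
locals {
  gpu_regex = "^(p[0-9][a-z]*|g[0-9+][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"

  # Hypothetical inputs, standing in for var.node_groups.
  node_groups = [
    { name_suffix = "shared", instance_type = "c5.2xlarge" },
    { name_suffix = "gpu", instance_type = "p3.2xlarge" },
  ]

  # Evaluates to { shared = false, gpu = true }: only "gpu" is tainted
  # sku=gpu:NO_SCHEDULE and boots from the GPU-optimized AMI.
  gpu_enabled_instance_types = {
    for ng in local.node_groups :
    ng.name_suffix => length(regexall(local.gpu_regex, ng.instance_type)) > 0
  }

  has_gpu_node_groups = contains(values(local.gpu_enabled_instance_types), true)
}

output "gpu_enabled_instance_types" {
  value = local.gpu_enabled_instance_types
}
```

With `k8s_version = "1.29"`, the two SSM lookups then resolve `/aws/service/eks/optimized-ami/1.29/amazon-linux-2-gpu/recommended/image_id` for the "gpu" group and `/aws/service/eks/optimized-ami/1.29/amazon-linux-2/recommended/image_id` for "shared".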