Skip to content

Commit 9ec1d47

Browse files
authored
chore: Update GPU patterns to use new AL2023 NVIDIA AMI variant and latest EKS 1.31 (#2031)
1 parent e7863cf commit 9ec1d47

File tree

15 files changed

+118
-111
lines changed

15 files changed

+118
-111
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
repos:
22
- repo: https://github.com/streetsidesoftware/cspell-cli
3-
rev: v8.13.3
3+
rev: v8.15.1
44
hooks:
55
- id: cspell
66
args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*', --exclude, '/patterns/vpc-lattice/cross-cluster-pod-communication/*', --exclude, 'patterns/bottlerocket/*', --exclude, 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh']
@@ -10,7 +10,7 @@ repos:
1010
- id: pretty-format-yaml
1111
args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes]
1212
- repo: https://github.com/pre-commit/pre-commit-hooks
13-
rev: v4.6.0
13+
rev: v5.0.0
1414
hooks:
1515
- id: trailing-whitespace
1616
- id: end-of-file-fixer

patterns/fargate-serverless/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started
4545
3. Validate the `aws-logging` configMap for Fargate Fluentbit was created:
4646

4747
```sh
48-
kubectl -n aws-observability get configmap aws-logging
48+
kubectl -n aws-observability get configmap aws-logging
4949
```
5050

5151
```yaml

patterns/ml-capacity-block/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations
1313

1414
## Code
1515

16-
```terraform hl_lines="5-11 80-94 106-109 138-151"
16+
```terraform hl_lines="5-11 93-107 119-122 161-174"
1717
{% include "../../patterns/ml-capacity-block/eks.tf" %}
1818
```
1919

patterns/ml-capacity-block/eks.tf

Lines changed: 50 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,10 @@ variable "capacity_reservation_id" {
1616

1717
module "eks" {
1818
source = "terraform-aws-modules/eks/aws"
19-
version = "~> 20.17"
19+
version = "~> 20.26"
2020

2121
cluster_name = local.name
22-
cluster_version = "1.30"
22+
cluster_version = "1.31"
2323

2424
# Give the Terraform identity admin access to the cluster
2525
# which will allow it to deploy resources into the cluster
@@ -30,7 +30,9 @@ module "eks" {
3030
coredns = {}
3131
eks-pod-identity-agent = {}
3232
kube-proxy = {}
33-
vpc-cni = {}
33+
vpc-cni = {
34+
most_recent = true
35+
}
3436
}
3537

3638
# Add security group rules on the node group security group to
@@ -42,16 +44,27 @@ module "eks" {
4244

4345
eks_managed_node_groups = {
4446
cbr = {
45-
# The EKS AL2 GPU AMI provides all of the necessary components
47+
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
4648
# for accelerated workloads w/ EFA
47-
ami_type = "AL2_x86_64_GPU"
48-
instance_types = ["p5.48xlarge"]
49-
50-
pre_bootstrap_user_data = <<-EOT
51-
# Mount instance store volumes in RAID-0 for kubelet and containerd
52-
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
53-
/bin/setup-local-disks raid0
54-
EOT
49+
ami_type = "AL2023_x86_64_NVIDIA"
50+
instance_types = ["p5e.48xlarge"]
51+
52+
# Mount instance store volumes in RAID-0 for kubelet and containerd
53+
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
54+
cloudinit_pre_nodeadm = [
55+
{
56+
content_type = "application/node.eks.aws"
57+
content = <<-EOT
58+
---
59+
apiVersion: node.eks.aws/v1alpha1
60+
kind: NodeConfig
61+
spec:
62+
instance:
63+
localStorage:
64+
strategy: RAID0
65+
EOT
66+
}
67+
]
5568

5669
min_size = 2
5770
max_size = 2
@@ -97,7 +110,7 @@ module "eks" {
97110
default = {
98111
instance_types = ["m5.large"]
99112

100-
min_size = 1
113+
min_size = 2
101114
max_size = 2
102115
desired_size = 2
103116
}
@@ -109,21 +122,31 @@ module "eks" {
109122
# the one that works for their use case.
110123
self_managed_node_groups = {
111124
cbr2 = {
112-
# The EKS AL2 GPU AMI provides all of the necessary components
125+
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
113126
# for accelerated workloads w/ EFA
114-
ami_type = "AL2_x86_64_GPU"
115-
instance_type = "p5.48xlarge"
116-
117-
pre_bootstrap_user_data = <<-EOT
118-
# Mount instance store volumes in RAID-0 for kubelet and containerd
119-
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
120-
/bin/setup-local-disks raid0
121-
122-
# Ensure only GPU workloads are scheduled on this node group
123-
export KUBELET_EXTRA_ARGS='--node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true \
124-
--register-with-taints=nvidia.com/gpu=true:NoSchedule'
125-
126-
EOT
127+
ami_type = "AL2023_x86_64_NVIDIA"
128+
instance_type = "p5e.48xlarge"
129+
130+
# Mount instance store volumes in RAID-0 for kubelet and containerd
131+
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
132+
cloudinit_pre_nodeadm = [
133+
{
134+
content_type = "application/node.eks.aws"
135+
content = <<-EOT
136+
---
137+
apiVersion: node.eks.aws/v1alpha1
138+
kind: NodeConfig
139+
spec:
140+
instance:
141+
localStorage:
142+
strategy: RAID0
143+
kubelet:
144+
flags:
145+
- --node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true
146+
- --register-with-taints=nvidia.com/gpu=true:NoSchedule
147+
EOT
148+
}
149+
]
127150

128151
min_size = 2
129152
max_size = 2

patterns/ml-capacity-block/helm.tf

Lines changed: 2 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" {
66
name = "nvidia-device-plugin"
77
repository = "https://nvidia.github.io/k8s-device-plugin"
88
chart = "nvidia-device-plugin"
9-
version = "0.14.5"
9+
version = "0.16.2"
1010
namespace = "nvidia-device-plugin"
1111
create_namespace = true
1212
wait = false
13-
14-
values = [
15-
<<-EOT
16-
affinity:
17-
nodeAffinity:
18-
requiredDuringSchedulingIgnoredDuringExecution:
19-
nodeSelectorTerms:
20-
- matchExpressions:
21-
- key: 'nvidia.com/gpu.present'
22-
operator: In
23-
values:
24-
- 'true'
25-
EOT
26-
]
2713
}
2814

2915
resource "helm_release" "aws_efa_device_plugin" {
3016
name = "aws-efa-k8s-device-plugin"
3117
repository = "https://aws.github.io/eks-charts"
3218
chart = "aws-efa-k8s-device-plugin"
33-
version = "v0.5.2"
19+
version = "v0.5.5"
3420
namespace = "kube-system"
3521
wait = false
3622

patterns/ml-capacity-block/main.tf

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,11 @@ terraform {
44
required_providers {
55
aws = {
66
source = "hashicorp/aws"
7-
version = ">= 5.57"
7+
version = ">= 5.70"
88
}
99
helm = {
1010
source = "hashicorp/helm"
11-
version = ">= 2.9"
11+
version = ">= 2.16"
1212
}
1313
}
1414

patterns/nvidia-gpu-efa/README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,11 @@ The following components are demonstrated in this pattern:
1717

1818
## Code
1919

20-
```terraform hl_lines="24-26 32-67"
20+
```terraform hl_lines="26-28 34-80"
2121
{% include "../../patterns/nvidia-gpu-efa/eks.tf" %}
2222
```
2323

24-
```terraform hl_lines="5-47"
24+
```terraform hl_lines="5-33"
2525
{% include "../../patterns/nvidia-gpu-efa/helm.tf" %}
2626
```
2727

patterns/nvidia-gpu-efa/eks.tf

Lines changed: 23 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,10 @@
44

55
module "eks" {
66
source = "terraform-aws-modules/eks/aws"
7-
version = "~> 20.17"
7+
version = "~> 20.26"
88

99
cluster_name = local.name
10-
cluster_version = "1.30"
10+
cluster_version = "1.31"
1111

1212
# Give the Terraform identity admin access to the cluster
1313
# which will allow it to deploy resources into the cluster
@@ -18,7 +18,9 @@ module "eks" {
1818
coredns = {}
1919
eks-pod-identity-agent = {}
2020
kube-proxy = {}
21-
vpc-cni = {}
21+
vpc-cni = {
22+
most_recent = true
23+
}
2224
}
2325

2426
# Add security group rules on the node group security group to
@@ -30,16 +32,27 @@ module "eks" {
3032

3133
eks_managed_node_groups = {
3234
nvidia-efa = {
33-
# The EKS AL2 GPU AMI provides all of the necessary components
35+
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
3436
# for accelerated workloads w/ EFA
35-
ami_type = "AL2_x86_64_GPU"
37+
ami_type = "AL2023_x86_64_NVIDIA"
3638
instance_types = ["p5.48xlarge"]
3739

38-
pre_bootstrap_user_data = <<-EOT
39-
# Mount instance store volumes in RAID-0 for kubelet and containerd
40-
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
41-
/bin/setup-local-disks raid0
42-
EOT
40+
# Mount instance store volumes in RAID-0 for kubelet and containerd
41+
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
42+
cloudinit_pre_nodeadm = [
43+
{
44+
content_type = "application/node.eks.aws"
45+
content = <<-EOT
46+
---
47+
apiVersion: node.eks.aws/v1alpha1
48+
kind: NodeConfig
49+
spec:
50+
instance:
51+
localStorage:
52+
strategy: RAID0
53+
EOT
54+
}
55+
]
4356

4457
min_size = 2
4558
max_size = 2

patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
export MPI_JOB_NAME=efa-nccl-test
44
export IMAGE_URI=public.ecr.aws/hpc-cloud/nccl-tests:latest
5-
export INSTANCE_TYPE=p5.48xlarge
5+
export INSTANCE_TYPE=p5e.48xlarge
66
export NUM_WORKERS=2
77
export GPU_PER_WORKER=8
88
export EFA_PER_WORKER=32

patterns/nvidia-gpu-efa/helm.tf

Lines changed: 2 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" {
66
name = "nvidia-device-plugin"
77
repository = "https://nvidia.github.io/k8s-device-plugin"
88
chart = "nvidia-device-plugin"
9-
version = "0.14.5"
9+
version = "0.16.2"
1010
namespace = "nvidia-device-plugin"
1111
create_namespace = true
1212
wait = false
13-
14-
values = [
15-
<<-EOT
16-
affinity:
17-
nodeAffinity:
18-
requiredDuringSchedulingIgnoredDuringExecution:
19-
nodeSelectorTerms:
20-
- matchExpressions:
21-
- key: 'nvidia.com/gpu.present'
22-
operator: In
23-
values:
24-
- 'true'
25-
EOT
26-
]
2713
}
2814

2915
resource "helm_release" "aws_efa_device_plugin" {
3016
name = "aws-efa-k8s-device-plugin"
3117
repository = "https://aws.github.io/eks-charts"
3218
chart = "aws-efa-k8s-device-plugin"
33-
version = "v0.5.2"
19+
version = "v0.5.5"
3420
namespace = "kube-system"
3521
wait = false
3622

0 commit comments

Comments
 (0)