@@ -28,26 +28,14 @@ provider "helm" {
28
28
}
29
29
}
30
30
31
- provider "kubectl" {
32
- apply_retry_count = 5
33
- host = module. eks . cluster_endpoint
34
- cluster_ca_certificate = base64decode (module. eks . cluster_certificate_authority_data )
35
- load_config_file = false
36
-
37
- exec {
38
- api_version = " client.authentication.k8s.io/v1beta1"
39
- command = " aws"
40
- # This requires the awscli to be installed locally where Terraform is executed
41
- args = [" eks" , " get-token" , " --cluster-name" , module . eks . cluster_name ]
42
- }
43
- }
44
-
45
31
data "aws_availability_zones" "available" {}
46
32
47
33
locals {
48
34
name = basename (path. cwd )
49
35
region = " us-west-2"
50
36
37
+ cluster_version = " 1.27"
38
+
51
39
vpc_cidr = " 10.0.0.0/16"
52
40
azs = slice (data. aws_availability_zones . available . names , 0 , 3 )
53
41
@@ -66,7 +54,7 @@ module "eks" {
66
54
version = " ~> 19.16"
67
55
68
56
cluster_name = local. name
69
- cluster_version = " 1.27 "
57
+ cluster_version = local . cluster_version
70
58
cluster_endpoint_public_access = true
71
59
72
60
cluster_addons = {
@@ -98,6 +86,13 @@ module "eks" {
98
86
}
99
87
}
100
88
89
+ eks_managed_node_group_defaults = {
90
+ iam_role_additional_policies = {
91
+ # Not required; attached in this example so the nodes can be accessed through SSM to inspect drivers and devices
92
+ AmazonSSMManagedInstanceCore = " arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
93
+ }
94
+ }
95
+
101
96
eks_managed_node_groups = {
102
97
# For running services that do not require GPUs
103
98
default = {
@@ -113,7 +108,7 @@ module "eks" {
113
108
instance_types = [" g5.8xlarge" ]
114
109
115
110
min_size = 1
116
- max_size = 3
111
+ max_size = 1
117
112
desired_size = 1
118
113
119
114
subnet_ids = slice (module. vpc . private_subnets , 0 , 1 )
@@ -133,14 +128,26 @@ module "eks" {
133
128
}
134
129
135
130
pre_bootstrap_user_data = <<- EOT
136
- # Install EFA
137
- curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz
138
- tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer
139
- ./efa_installer.sh -y --minimal
140
- fi_info -p efa -t FI_EP_RDM
141
-
142
- # Disable ptrace
143
- sysctl -w kernel.yama.ptrace_scope=0
131
+ EFA_BIN='/opt/amazon/efa/bin'
132
+
133
+ # EFA driver is installed by default on EKS GPU AMI starting on EKS 1.28, so only install when its bin directory is missing
134
+ if [ ! -d "$EFA_BIN" ]; then
135
+
136
+ # Install EFA
137
+ # Note: It is recommended to install the EFA driver on a custom AMI and
138
+ # not rely on dynamic installation during instance provisioning in user data
139
+ curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz
140
+ tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer
141
+ ./efa_installer.sh -y --minimal
142
+ cd .. && rm -rf aws-efa-installer*
143
+
144
+ # Not required - just displays info on the EFA interfaces
145
+ "$EFA_BIN/fi_info" -p efa
146
+
147
+ fi
148
+
149
+ # Disable ptrace protection on every node, whether or not the installer ran above
150
+ sysctl -w kernel.yama.ptrace_scope=0
144
151
EOT
145
152
146
153
taints = {
@@ -211,8 +218,20 @@ module "eks_blueprints_addons" {
211
218
repository = " https://nvidia.github.io/gpu-operator"
212
219
values = [
213
220
<<- EOT
221
+ dcgmExporter:
222
+ enabled: false
223
+ driver:
224
+ enabled: false
225
+ toolkit:
226
+ version: v1.13.5-centos7
214
227
operator:
215
228
defaultRuntime: containerd
229
+ validator:
230
+ driver:
231
+ env:
232
+ # https://github.com/NVIDIA/gpu-operator/issues/569
233
+ - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
234
+ value: "true"
216
235
EOT
217
236
]
218
237
}
@@ -225,14 +244,96 @@ module "eks_blueprints_addons" {
225
244
# Amazon Elastic Fabric Adapter (EFA)
226
245
# ###############################################################################
227
246
228
- data "http" "efa_device_plugin_yaml" {
229
- url = " https://raw.githubusercontent.com/aws-samples/aws-efa-eks/main/manifest/efa-k8s-device-plugin.yml"
230
- }
247
+ resource "kubernetes_daemonset" "aws_efa_k8s_device_plugin" {
248
+ metadata {
249
+ name = " aws-efa-k8s-device-plugin-daemonset"
250
+ namespace = " kube-system"
251
+ }
231
252
232
- resource "kubectl_manifest" "efa_device_plugin" {
233
- yaml_body = <<- YAML
234
- ${ data . http . efa_device_plugin_yaml . response_body }
235
- YAML
253
+ spec {
254
+ selector {
255
+ match_labels = {
256
+ name = " aws-efa-k8s-device-plugin"
257
+ }
258
+ }
259
+
260
+ template {
261
+ metadata {
262
+ labels = {
263
+ name = " aws-efa-k8s-device-plugin"
264
+ }
265
+ }
266
+
267
+ spec {
268
+ volume {
269
+ name = " device-plugin"
270
+
271
+ host_path {
272
+ path = " /var/lib/kubelet/device-plugins"
273
+ }
274
+ }
275
+
276
+ container {
277
+ name = " aws-efa-k8s-device-plugin"
278
+ image = " 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.3.3"
279
+
280
+ volume_mount {
281
+ name = " device-plugin"
282
+ mount_path = " /var/lib/kubelet/device-plugins"
283
+ }
284
+
285
+ image_pull_policy = " Always"
286
+
287
+ security_context {
288
+ capabilities {
289
+ drop = [" ALL" ]
290
+ }
291
+ }
292
+ }
293
+
294
+ host_network = true
295
+
296
+ affinity {
297
+ node_affinity {
298
+ required_during_scheduling_ignored_during_execution {
299
+ node_selector_term {
300
+ match_expressions {
301
+ key = " beta.kubernetes.io/instance-type"
302
+ operator = " In"
303
+ values = ["c5n.18xlarge", "c5n.9xlarge", "c5n.metal", "c6a.48xlarge", "c6a.metal", "c6gn.16xlarge", "c6i.32xlarge", "c6i.metal", "c6id.32xlarge", "c6id.metal", "c6in.32xlarge", "c6in.metal", "c7g.16xlarge", "c7g.metal", "c7gd.16xlarge", "c7gn.16xlarge", "c7i.48xlarge", "dl1.24xlarge", "g4dn.12xlarge", "g4dn.16xlarge", "g4dn.8xlarge", "g4dn.metal", "g5.12xlarge", "g5.16xlarge", "g5.24xlarge", "g5.48xlarge", "g5.8xlarge", "hpc7g.16xlarge", "hpc7g.4xlarge", "hpc7g.8xlarge", "i3en.12xlarge", "i3en.24xlarge", "i3en.metal", "i4g.16xlarge", "i4i.32xlarge", "i4i.metal", "im4gn.16xlarge", "inf1.24xlarge", "m5dn.24xlarge", "m5dn.metal", "m5n.24xlarge", "m5n.metal", "m5zn.12xlarge", "m5zn.metal", "m6a.48xlarge", "m6a.metal", "m6i.32xlarge", "m6i.metal", "m6id.32xlarge", "m6id.metal", "m6idn.32xlarge", "m6idn.metal", "m6in.32xlarge", "m6in.metal", "m7a.48xlarge", "m7a.metal-48xl", "m7g.16xlarge", "m7g.metal", "m7gd.16xlarge", "m7i.48xlarge", "p3dn.24xlarge", "p4d.24xlarge", "p5.48xlarge", "r5dn.24xlarge", "r5dn.metal", "r5n.24xlarge", "r5n.metal", "r6a.48xlarge", "r6a.metal", "r6i.32xlarge", "r6i.metal", "r6id.32xlarge", "r6id.metal", "r6idn.32xlarge", "r6idn.metal", "r6in.32xlarge", "r6in.metal", "r7a.48xlarge", "r7g.16xlarge", "r7g.metal", "r7gd.16xlarge", "r7iz.32xlarge", "trn1.32xlarge", "trn1n.32xlarge", "vt1.24xlarge", "x2idn.32xlarge", "x2idn.metal", "x2iedn.32xlarge", "x2iedn.metal", "x2iezn.12xlarge", "x2iezn.metal"]
304
+ }
305
+ }
306
+
307
+ node_selector_term {
308
+ match_expressions {
309
+ key = " node.kubernetes.io/instance-type"
310
+ operator = " In"
311
+ values = ["c5n.18xlarge", "c5n.9xlarge", "c5n.metal", "c6a.48xlarge", "c6a.metal", "c6gn.16xlarge", "c6i.32xlarge", "c6i.metal", "c6id.32xlarge", "c6id.metal", "c6in.32xlarge", "c6in.metal", "c7g.16xlarge", "c7g.metal", "c7gd.16xlarge", "c7gn.16xlarge", "c7i.48xlarge", "dl1.24xlarge", "g4dn.12xlarge", "g4dn.16xlarge", "g4dn.8xlarge", "g4dn.metal", "g5.12xlarge", "g5.16xlarge", "g5.24xlarge", "g5.48xlarge", "g5.8xlarge", "hpc7g.16xlarge", "hpc7g.4xlarge", "hpc7g.8xlarge", "i3en.12xlarge", "i3en.24xlarge", "i3en.metal", "i4g.16xlarge", "i4i.32xlarge", "i4i.metal", "im4gn.16xlarge", "inf1.24xlarge", "m5dn.24xlarge", "m5dn.metal", "m5n.24xlarge", "m5n.metal", "m5zn.12xlarge", "m5zn.metal", "m6a.48xlarge", "m6a.metal", "m6i.32xlarge", "m6i.metal", "m6id.32xlarge", "m6id.metal", "m6idn.32xlarge", "m6idn.metal", "m6in.32xlarge", "m6in.metal", "m7a.48xlarge", "m7a.metal-48xl", "m7g.16xlarge", "m7g.metal", "m7gd.16xlarge", "m7i.48xlarge", "p3dn.24xlarge", "p4d.24xlarge", "p5.48xlarge", "r5dn.24xlarge", "r5dn.metal", "r5n.24xlarge", "r5n.metal", "r6a.48xlarge", "r6a.metal", "r6i.32xlarge", "r6i.metal", "r6id.32xlarge", "r6id.metal", "r6idn.32xlarge", "r6idn.metal", "r6in.32xlarge", "r6in.metal", "r7a.48xlarge", "r7g.16xlarge", "r7g.metal", "r7gd.16xlarge", "r7iz.32xlarge", "trn1.32xlarge", "trn1n.32xlarge", "vt1.24xlarge", "x2idn.32xlarge", "x2idn.metal", "x2iedn.32xlarge", "x2iedn.metal", "x2iezn.12xlarge", "x2iezn.metal"]
312
+ }
313
+ }
314
+ }
315
+ }
316
+ }
317
+
318
+ toleration {
319
+ key = " CriticalAddonsOnly"
320
+ operator = " Exists"
321
+ }
322
+
323
+ toleration {
324
+ key = " aws.amazon.com/efa"
325
+ operator = " Exists"
326
+ effect = " NoSchedule"
327
+ }
328
+
329
+ priority_class_name = " system-node-critical"
330
+ }
331
+ }
332
+
333
+ strategy {
334
+ type = " RollingUpdate"
335
+ }
336
+ }
236
337
}
237
338
238
339
# ###############################################################################
0 commit comments