Skip to content

Commit 9e407f0

Browse files
authored
fix: Ensure EFA installer uses full path to EFA bin and does not install if already present (aws-ia#1780)
1 parent 2f57644 commit 9e407f0

File tree

2 files changed

+132
-39
lines changed

2 files changed

+132
-39
lines changed

patterns/elastic-fabric-adapter/main.tf

Lines changed: 132 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,26 +28,14 @@ provider "helm" {
2828
}
2929
}
3030

31-
provider "kubectl" {
32-
apply_retry_count = 5
33-
host = module.eks.cluster_endpoint
34-
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
35-
load_config_file = false
36-
37-
exec {
38-
api_version = "client.authentication.k8s.io/v1beta1"
39-
command = "aws"
40-
# This requires the awscli to be installed locally where Terraform is executed
41-
args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
42-
}
43-
}
44-
4531
data "aws_availability_zones" "available" {}
4632

4733
locals {
4834
name = basename(path.cwd)
4935
region = "us-west-2"
5036

37+
cluster_version = "1.27"
38+
5139
vpc_cidr = "10.0.0.0/16"
5240
azs = slice(data.aws_availability_zones.available.names, 0, 3)
5341

@@ -66,7 +54,7 @@ module "eks" {
6654
version = "~> 19.16"
6755

6856
cluster_name = local.name
69-
cluster_version = "1.27"
57+
cluster_version = local.cluster_version
7058
cluster_endpoint_public_access = true
7159

7260
cluster_addons = {
@@ -98,6 +86,13 @@ module "eks" {
9886
}
9987
}
10088

89+
eks_managed_node_group_defaults = {
90+
iam_role_additional_policies = {
91+
# Not required, but used in the example to access the nodes to inspect drivers and devices
92+
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
93+
}
94+
}
95+
10196
eks_managed_node_groups = {
10297
# For running services that do not require GPUs
10398
default = {
@@ -113,7 +108,7 @@ module "eks" {
113108
instance_types = ["g5.8xlarge"]
114109

115110
min_size = 1
116-
max_size = 3
111+
max_size = 1
117112
desired_size = 1
118113

119114
subnet_ids = slice(module.vpc.private_subnets, 0, 1)
@@ -133,14 +128,26 @@ module "eks" {
133128
}
134129

135130
pre_bootstrap_user_data = <<-EOT
136-
# Install EFA
137-
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz
138-
tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer
139-
./efa_installer.sh -y --minimal
140-
fi_info -p efa -t FI_EP_RDM
141-
142-
# Disable ptrace
143-
sysctl -w kernel.yama.ptrace_scope=0
131+
EFA_BIN='/opt/amazon/efa/bin/'
132+
133+
# EFA driver is installed by default on EKS GPU AMI starting on EKS 1.28, so only
134+
# install when the EFA bin directory is missing (-d tests for a directory; -s tests size)
135+
if [ ! -d "$EFA_BIN" ]; then
136+
# Install EFA
137+
# Note: It is recommended to install the EFA driver on a custom AMI and
138+
# not rely on dynamic installation during instance provisioning in user data
139+
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz
140+
tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer
141+
./efa_installer.sh -y --minimal
142+
cd .. && rm -rf aws-efa-installer*
143+
144+
# Not required - just displays info on the EFA interfaces
145+
$EFA_BIN/fi_info -p efa
146+
fi
147+
148+
# Disable ptrace - done outside the install check so the setting is also
149+
# applied on AMIs that already ship with EFA installed
150+
sysctl -w kernel.yama.ptrace_scope=0
144151
EOT
145152

146153
taints = {
@@ -211,8 +218,20 @@ module "eks_blueprints_addons" {
211218
repository = "https://nvidia.github.io/gpu-operator"
212219
values = [
213220
<<-EOT
221+
dcgmExporter:
222+
enabled: false
223+
driver:
224+
enabled: false
225+
toolkit:
226+
version: v1.13.5-centos7
214227
operator:
215228
defaultRuntime: containerd
229+
validator:
230+
driver:
231+
env:
232+
# https://github.com/NVIDIA/gpu-operator/issues/569
233+
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
234+
value: "true"
216235
EOT
217236
]
218237
}
@@ -225,14 +244,96 @@ module "eks_blueprints_addons" {
225244
# Amazon Elastic Fabric Adapter (EFA)
226245
################################################################################
227246

228-
data "http" "efa_device_plugin_yaml" {
229-
url = "https://raw.githubusercontent.com/aws-samples/aws-efa-eks/main/manifest/efa-k8s-device-plugin.yml"
230-
}
247+
# DaemonSet "aws-efa-k8s-device-plugin-daemonset" in kube-system, running the
# aws-efa-k8s-device-plugin container on the host network.
# Pods are limited by node affinity to the listed instance types and tolerate
# the CriticalAddonsOnly and aws.amazon.com/efa taints.
# NOTE(review): in this diff rendering, the rows gutter-marked 232- through 235-
# inside this span are the deleted kubectl_manifest resource, not part of the DaemonSet.
resource "kubernetes_daemonset" "aws_efa_k8s_device_plugin" {
248+
metadata {
249+
name = "aws-efa-k8s-device-plugin-daemonset"
250+
namespace = "kube-system"
251+
}
231252

232-
resource "kubectl_manifest" "efa_device_plugin" {
233-
yaml_body = <<-YAML
234-
${data.http.efa_device_plugin_yaml.response_body}
235-
YAML
253+
spec {
254+
# Selector label must match the pod template label below
selector {
255+
match_labels = {
256+
name = "aws-efa-k8s-device-plugin"
257+
}
258+
}
259+
260+
template {
261+
metadata {
262+
labels = {
263+
name = "aws-efa-k8s-device-plugin"
264+
}
265+
}
266+
267+
spec {
268+
# Host directory exposed to the container at the same path (see volume_mount)
volume {
269+
name = "device-plugin"
270+
271+
host_path {
272+
path = "/var/lib/kubelet/device-plugins"
273+
}
274+
}
275+
276+
container {
277+
name = "aws-efa-k8s-device-plugin"
278+
# Image tag is pinned to v0.3.3; the registry hostname is hard-coded to us-west-2
image = "602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.3.3"
279+
280+
volume_mount {
281+
name = "device-plugin"
282+
mount_path = "/var/lib/kubelet/device-plugins"
283+
}
284+
285+
image_pull_policy = "Always"
286+
287+
# Drop every Linux capability from the container
security_context {
288+
capabilities {
289+
drop = ["ALL"]
290+
}
291+
}
292+
}
293+
294+
host_network = true
295+
296+
# Two node selector terms, one per instance-type label key
# NOTE(review): both value lists appear identical - keep them in sync when editing
affinity {
297+
node_affinity {
298+
required_during_scheduling_ignored_during_execution {
299+
node_selector_term {
300+
match_expressions {
301+
key = "beta.kubernetes.io/instance-type"
302+
operator = "In"
303+
values = ["c5n.18xlarge", "c5n.9xlarge", "c5n.metal", "c6a.48xlarge", "c6a.metal", "c6gn.16xlarge", "c6i.32xlarge", "c6i.metal", "c6id.32xlarge", "c6id.metal", "c6in.32xlarge", "c6in.metal", "c7g.16xlarge", "c7g.metal", "c7gd.16xlarge", "c7gn.16xlarge", "c7i.48xlarge", "dl1.24xlarge", "g4dn.12xlarge", "g4dn.16xlarge", "g4dn.8xlarge", "g4dn.metal", "g5.12xlarge", "g5.16xlarge", "g5.24xlarge", "g5.48xlarge", "g5.8xlarge", "hpc7g.16xlarge", "hpc7g.4xlarge", "hpc7g.8xlarge", "i3en.12xlarge", "i3en.24xlarge", "i3en.metal", "i4g.16xlarge", "i4i.32xlarge", "i4i.metal", "im4gn.16xlarge", "inf1.24xlarge", "m5dn.24xlarge", "m5dn.metal", "m5n.24xlarge", "m5n.metal", "m5zn.12xlarge", "m5zn.metal", "m6a.48xlarge", "m6a.metal", "m6i.32xlarge", "m6i.metal", "m6id.32xlarge", "m6id.metal", "m6idn.32xlarge", "m6idn.metal", "m6in.32xlarge", "m6in.metal", "m7a.48xlarge", "m7a.metal-48xl", "m7g.16xlarge", "m7g.metal", "m7gd.16xlarge", "m7i.48xlarge", "p3dn.24xlarge", "p4d.24xlarge", "p5.48xlarge", "r5dn.24xlarge", "r5dn.metal", "r5n.24xlarge", "r5n.metal", "r6a.48xlarge", "r6a.metal", "r6i.32xlarge", "r6i.metal", "r6id.32xlarge", "r6id.metal", "r6idn.32xlarge", "r6idn.metal", "r6in.32xlarge", "r6in.metal", "r7a.48xlarge", "r7g.16xlarge", "r7g.metal", "r7gd.16xlarge", "r7iz.32xlarge", "trn1.32xlarge", "trn1n.32xlarge", "vt1.24xlarge", "x2idn.32xlarge", "x2idn.metal", "x2iedn.32xlarge", "x2iedn.metal", "x2iezn.12xlarge", "x2iezn.metal"]
304+
}
305+
}
306+
307+
node_selector_term {
308+
match_expressions {
309+
key = "node.kubernetes.io/instance-type"
310+
operator = "In"
311+
values = ["c5n.18xlarge", "c5n.9xlarge", "c5n.metal", "c6a.48xlarge", "c6a.metal", "c6gn.16xlarge", "c6i.32xlarge", "c6i.metal", "c6id.32xlarge", "c6id.metal", "c6in.32xlarge", "c6in.metal", "c7g.16xlarge", "c7g.metal", "c7gd.16xlarge", "c7gn.16xlarge", "c7i.48xlarge", "dl1.24xlarge", "g4dn.12xlarge", "g4dn.16xlarge", "g4dn.8xlarge", "g4dn.metal", "g5.12xlarge", "g5.16xlarge", "g5.24xlarge", "g5.48xlarge", "g5.8xlarge", "hpc7g.16xlarge", "hpc7g.4xlarge", "hpc7g.8xlarge", "i3en.12xlarge", "i3en.24xlarge", "i3en.metal", "i4g.16xlarge", "i4i.32xlarge", "i4i.metal", "im4gn.16xlarge", "inf1.24xlarge", "m5dn.24xlarge", "m5dn.metal", "m5n.24xlarge", "m5n.metal", "m5zn.12xlarge", "m5zn.metal", "m6a.48xlarge", "m6a.metal", "m6i.32xlarge", "m6i.metal", "m6id.32xlarge", "m6id.metal", "m6idn.32xlarge", "m6idn.metal", "m6in.32xlarge", "m6in.metal", "m7a.48xlarge", "m7a.metal-48xl", "m7g.16xlarge", "m7g.metal", "m7gd.16xlarge", "m7i.48xlarge", "p3dn.24xlarge", "p4d.24xlarge", "p5.48xlarge", "r5dn.24xlarge", "r5dn.metal", "r5n.24xlarge", "r5n.metal", "r6a.48xlarge", "r6a.metal", "r6i.32xlarge", "r6i.metal", "r6id.32xlarge", "r6id.metal", "r6idn.32xlarge", "r6idn.metal", "r6in.32xlarge", "r6in.metal", "r7a.48xlarge", "r7g.16xlarge", "r7g.metal", "r7gd.16xlarge", "r7iz.32xlarge", "trn1.32xlarge", "trn1n.32xlarge", "vt1.24xlarge", "x2idn.32xlarge", "x2idn.metal", "x2iedn.32xlarge", "x2iedn.metal", "x2iezn.12xlarge", "x2iezn.metal"]
312+
}
313+
}
314+
}
315+
}
316+
}
317+
318+
# Tolerates the CriticalAddonsOnly taint key (no effect specified)
toleration {
319+
key = "CriticalAddonsOnly"
320+
operator = "Exists"
321+
}
322+
323+
# Tolerates the aws.amazon.com/efa taint key with effect NoSchedule
toleration {
324+
key = "aws.amazon.com/efa"
325+
operator = "Exists"
326+
effect = "NoSchedule"
327+
}
328+
329+
priority_class_name = "system-node-critical"
330+
}
331+
}
332+
333+
strategy {
334+
type = "RollingUpdate"
335+
}
336+
}
236337
}
237338

238339
################################################################################

patterns/elastic-fabric-adapter/versions.tf

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,6 @@ terraform {
1414
source = "hashicorp/kubernetes"
1515
version = ">= 2.20"
1616
}
17-
kubectl = {
18-
source = "gavinbunney/kubectl"
19-
version = ">= 1.14"
20-
}
21-
http = {
22-
source = "hashicorp/http"
23-
version = ">= 3.3"
24-
}
2517
}
2618

2719
# ## Used for end-to-end testing on project; update to suit your needs

0 commit comments

Comments
 (0)