Applying dynamic taint and nvidia daemonset #64
@@ -0,0 +1,91 @@
```hcl
locals {
  gpu_regex                  = "^(p[0-9][a-z]*|g[0-9][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"
  has_gpu_node_groups        = length([for ng in var.node_groups : ng if length(regexall(local.gpu_regex, ng.instance_type)) > 0]) > 0
  gpu_enabled_instance_types = [for ng in var.node_groups : ng.instance_type if length(regexall(local.gpu_regex, ng.instance_type)) > 0]
}
```
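For intuition about what `gpu_regex` matches, here is a hypothetical snippet (not part of the module; the sample instance types are illustrative) that evaluates the pattern against a few types:

```hcl
locals {
  # Expected result: p3.2xlarge, g4dn.xlarge, and inf1.xlarge map to true
  # (GPU/accelerator families); m5.large maps to false.
  gpu_regex_examples = {
    for t in ["p3.2xlarge", "g4dn.xlarge", "inf1.xlarge", "m5.large"] :
    t => length(regexall(local.gpu_regex, t)) > 0
  }
}
```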
resource "kubernetes_daemonset" "nvidia" { | ||
count = local.has_gpu_node_groups ? 1 : 0 | ||
metadata { | ||
name = "nvidia-device-plugin-daemonset" | ||
namespace = kubernetes_namespace_v1.md-core-services.metadata.0.name | ||
labels = merge(var.md_metadata.default_tags, { | ||
k8s-app = "nvidia-device-plugin-daemonset" | ||
}) | ||
} | ||
spec { | ||
selector { | ||
match_labels = { | ||
name = "nvidia-device-plugin-ds" | ||
} | ||
} | ||
strategy { | ||
type = "RollingUpdate" | ||
} | ||
template { | ||
metadata { | ||
labels = merge(var.md_metadata.default_tags, { | ||
name = "nvidia-device-plugin-ds" | ||
}) | ||
annotations = { | ||
"scheduler.alpha.kubernetes.io/critical-pod" : "" | ||
} | ||
} | ||
spec { | ||
priority_class_name = "system-node-critical" | ||
affinity { | ||
node_affinity { | ||
required_during_scheduling_ignored_during_execution { | ||
node_selector_term { | ||
match_expressions { | ||
key = "node.kubernetes.io/instance-type" | ||
operator = "In" | ||
values = local.gpu_enabled_instance_types | ||
} | ||
} | ||
} | ||
} | ||
} | ||
toleration { | ||
key = "CriticalAddonsOnly" | ||
operator = "Exists" | ||
} | ||
toleration { | ||
key = "nvidia.com/gpu" | ||
operator = "Exists" | ||
effect = "NoSchedule" | ||
} | ||
toleration { | ||
mclacore marked this conversation as resolved.
Show resolved
Hide resolved
|
||
key = "sku" | ||
operator = "Equal" | ||
value = "gpu" | ||
effect = "NoSchedule" | ||
} | ||
container { | ||
name = "nvidia-device-plugin-ctr" | ||
image = "nvcr.io/nvidia/k8s-device-plugin:v0.15.0" | ||
env { | ||
name = "FAIL_ON_INIT_ERROR" | ||
value = "false" | ||
} | ||
security_context { | ||
privileged = true | ||
capabilities { | ||
drop = ["all"] | ||
} | ||
} | ||
volume_mount { | ||
name = "device-plugin" | ||
mount_path = "/var/lib/kubelet/device-plugins" | ||
} | ||
} | ||
volume { | ||
name = "device-plugin" | ||
host_path { | ||
path = "/var/lib/kubelet/device-plugins" | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} |
@@ -3,9 +3,22 @@ locals {
```hcl
  private_subnet_ids = [for subnet in var.vpc.data.infrastructure.private_subnets : element(split("/", subnet["arn"]), 1)]
  subnet_ids         = concat(local.public_subnet_ids, local.private_subnet_ids)

  gpu_regex                  = "^(p[0-9][a-z]*|g[0-9][a-z]*|trn[0-9][a-z]*|inf[0-9]|dl[0-9][a-z]*|f[0-9]|vt[0-9])\\..*"
  gpu_enabled_instance_types = { for ng in var.node_groups : ng.name_suffix => length(regexall(local.gpu_regex, ng.instance_type)) > 0 }
  has_gpu_node_groups        = contains(values(local.gpu_enabled_instance_types), true)

  cluster_name = var.md_metadata.name_prefix
}

data "aws_ssm_parameter" "eks_ami" {
  name = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2/recommended/image_id"
}

data "aws_ssm_parameter" "eks_gpu_ami" {
  count = local.has_gpu_node_groups ? 1 : 0
  name  = "/aws/service/eks/optimized-ami/${var.k8s_version}/amazon-linux-2-gpu/recommended/image_id"
}
```
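As a review aid, a hypothetical output (not part of this PR) could surface which AMI IDs the two SSM lookups resolved:

```hcl
# Debugging aid only. Marked sensitive because the provider treats
# aws_ssm_parameter values as sensitive.
output "resolved_eks_amis" {
  sensitive = true
  value = {
    default = data.aws_ssm_parameter.eks_ami.value
    gpu     = local.has_gpu_node_groups ? data.aws_ssm_parameter.eks_gpu_ami[0].value : null
  }
}
```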
```hcl
resource "aws_eks_cluster" "cluster" {
  name     = local.cluster_name
  role_arn = aws_iam_role.cluster.arn
```
@@ -39,13 +52,12 @@ resource "aws_eks_cluster" "cluster" {
```diff
 }

 resource "aws_eks_node_group" "node_group" {
-  for_each        = { for ng in var.node_groups : ng.name_suffix => ng }
-  node_group_name = "${local.cluster_name}-${each.value.name_suffix}"
-  cluster_name    = local.cluster_name
-  version         = var.k8s_version
-  subnet_ids      = local.private_subnet_ids
-  node_role_arn   = aws_iam_role.node.arn
-  instance_types  = [each.value.instance_type]
+  for_each               = { for ng in var.node_groups : ng.name_suffix => ng }
+  node_group_name_prefix = "${local.cluster_name}-${each.value.name_suffix}"
+  cluster_name           = local.cluster_name
+  subnet_ids             = local.private_subnet_ids
+  node_role_arn          = aws_iam_role.node.arn
+  instance_types         = [each.value.instance_type]

   launch_template {
     id = aws_launch_template.nodes[each.key].id
```

**Review thread on `node_group_name_prefix`:**

**Comment:** This change is probably what is causing the recreation of all the nodes. Why are we switching from `node_group_name` to `node_group_name_prefix`?

**Reply:** Because of node group name collision. Prior to this change, when updating the launch template and AMI, etc., I would receive an error saying the node group name already existed. By using `node_group_name_prefix`, the replacement group gets a unique generated name instead of colliding with the existing one.
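A minimal sketch of the pattern under discussion, with placeholder names and ARNs that are not from this module: pairing `node_group_name_prefix` with `create_before_destroy` lets AWS generate a fresh suffix for the replacement node group, which is what avoids the "name already exists" error.

```hcl
resource "aws_eks_node_group" "example" {
  # AWS appends a unique suffix to the prefix, so the replacement group
  # never collides with the group that is still being drained.
  node_group_name_prefix = "my-cluster-workers-"
  cluster_name           = "my-cluster"                                  # placeholder
  node_role_arn          = "arn:aws:iam::123456789012:role/example-node" # placeholder
  subnet_ids             = ["subnet-0123456789abcdef0"]                  # placeholder

  scaling_config {
    desired_size = 1
    max_size     = 2
    min_size     = 1
  }

  lifecycle {
    create_before_destroy = true
  }
}
```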
@@ -58,6 +70,15 @@ resource "aws_eks_node_group" "node_group" {
```hcl
    min_size     = each.value.min_size
  }

  dynamic "taint" {
    for_each = length(regexall(local.gpu_regex, each.value.instance_type)) > 0 ? toset(["gpu"]) : toset([])
    content {
      key    = "sku"
      value  = "gpu"
      effect = "NO_SCHEDULE"
    }
  }

  dynamic "taint" {
    for_each = lookup(each.value, "advanced_configuration_enabled", false) ? [each.value.advanced_configuration.taint] : []
    content {
```
@@ -86,6 +107,16 @@ resource "aws_launch_template" "nodes" {
```hcl
  update_default_version = true

  image_id = local.gpu_enabled_instance_types[each.key] ? data.aws_ssm_parameter.eks_gpu_ami[0].value : data.aws_ssm_parameter.eks_ami.value

  user_data = base64encode(
    <<EOF
#!/bin/bash
set -o xtrace
/etc/eks/bootstrap.sh ${local.cluster_name} --kubelet-extra-args '--node-labels=node.kubernetes.io/instancegroup=${each.key}'
EOF
  )

  metadata_options {
    http_endpoint = "enabled"
    http_tokens   = "required"
```

**Review thread on `user_data`:**

**Comment:** Is anything else needed in this file? Did you check to see what this file looked like on a default node before this change?

**Reply:** I did look and it's a massive file. I'll paste the contents of it in here after deploying a main branch cluster.
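To show what the node label and the device plugin enable together, here is a hypothetical workload (not part of this PR; the names and image are assumptions) that pins itself to a GPU node group via the label written by the bootstrap script, tolerates the `sku=gpu` taint, and requests a GPU through the plugin's `nvidia.com/gpu` resource:

```hcl
resource "kubernetes_deployment" "gpu_example" {
  metadata {
    name = "gpu-example" # hypothetical
  }
  spec {
    replicas = 1
    selector {
      match_labels = { app = "gpu-example" }
    }
    template {
      metadata {
        labels = { app = "gpu-example" }
      }
      spec {
        # Matches the label set via kubelet-extra-args in user_data above;
        # "gpu" stands in for an actual node group name_suffix.
        node_selector = {
          "node.kubernetes.io/instancegroup" = "gpu"
        }
        # Tolerate the taint applied by the dynamic "taint" block.
        toleration {
          key      = "sku"
          operator = "Equal"
          value    = "gpu"
          effect   = "NoSchedule"
        }
        container {
          name  = "cuda"
          image = "nvcr.io/nvidia/cuda:12.3.1-base-ubuntu22.04" # assumed image
          resources {
            limits = {
              "nvidia.com/gpu" = "1" # advertised by the device plugin
            }
          }
        }
      }
    }
  }
}
```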
**Review thread on the `sku = "gpu"` toleration (marked resolved):**

**Comment:** Why is this toleration needed? We aren't applying it, are we?

**Reply:** It comes with the NVIDIA daemonset plugin: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.0/nvidia-device-plugin.yml. And it is then used by GPU pods, it seems: https://github.com/NVIDIA/k8s-device-plugin?tab=readme-ov-file#running-gpu-jobs

**Reply:** I think they are just using that as the taint in place of the `gpu=true` taint we added. We probably don't need both. We can use theirs instead.

**Reply:** To confirm: remove the toleration for `sku=gpu:NoSchedule` and update the dynamic taint in the node group to `nvidia.com/gpu:NoSchedule`?
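If the thread's proposal were adopted, the dynamic taint block would look roughly like this; a sketch of the suggested follow-up, not code that is part of this PR:

```hcl
dynamic "taint" {
  for_each = length(regexall(local.gpu_regex, each.value.instance_type)) > 0 ? toset(["gpu"]) : toset([])
  content {
    # Reuse the device plugin's taint key instead of the custom sku=gpu taint.
    key    = "nvidia.com/gpu"
    value  = "true"
    effect = "NO_SCHEDULE"
  }
}
```

The daemonset already tolerates this key with `operator = "Exists"`, which matches any value, so the separate `sku` toleration could then be removed.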