Adding GPUs to Kind cluster (#494)

metalcycling · web-flow · commit a4d16e190d22 · 2023-07-25T11:04:48.000-04:00
* Triggering different build

* Checking if nodes can be listed

* Trigger build

* Resource patching. Extending resources of Kubernetes nodes to include 'fake' GPUs.

* Fixed command to describe nodes

* Kuttl tests for checking if GPUs were added correctly to the nodes

* Fixed namespace issue and node name issue.

* Adding all tests again now that the resource extension passes

* Changed where the extended resources are tested

* Added error checks for 'curl' calls. Rename variables to use lower case.

* Fixed 'if' equal operator
diff --git a/hack/run-e2e-kind.sh b/hack/run-e2e-kind.sh
@@ -373,7 +373,63 @@ function setup-mcad-env {
   do
     echo -n "." && sleep 1; 
   done
+}
+
+function extend-resources {
+    # Patch nodes to provide GPUs resources without physical GPUs.
+    # This is intended to allow testing of GPU specific features such as histograms.
+
+    # Start communication with cluster
+    kubectl proxy > /dev/null 2>&1 &
+    proxy_pid=$!
+
+    echo "Starting background proxy connection (pid=${proxy_pid})..."
+
+    curl 127.0.0.1:8001 > /dev/null 2>&1
+
+    if [[ ! $? -eq 0 ]]; then
+        echo "Calling 'kubectl proxy' did not create a successful connection to the kubelet needed to patch the nodes. Exiting."
+        exit 1
+    else
+        echo "Connected to the kubelet for patching the nodes"
+    fi
+
+
+    # Variables
+    resource_name="nvidia.com~1gpu"
+    resource_count="8"
+
+    # Patch nodes
+    for node_name in $(kubectl get nodes --no-headers -o custom-columns=":metadata.name")
+    do
+        echo "- Patching node (add): ${node_name}"
 
+        patching_status=$(curl --header "Content-Type: application/json-patch+json" \
+                                --request PATCH \
+                                --data '[{"op": "add", "path": "/status/capacity/'${resource_name}'", "value": "'${resource_count}'"}]' \
+                                http://localhost:8001/api/v1/nodes/${node_name}/status | jq -r '.status')
+
+        if [[ ${patching_status} == "Failure" ]]; then
+            echo "Failed to patch node '${node_name}' with GPU resources"
+            exit 1
+        fi
+
+        echo
+    done
+
+    # Stop communication with cluster
+    echo "Killing proxy (pid=${proxy_pid})..."
+    kill -9 ${proxy_pid}
+
+    # Run kuttl tests to confirm GPUs were added correctly
+    kuttl_test="${ROOT_DIR}/test/kuttl-test-extended-resources.yaml"
+    echo "kubectl kuttl test --config ${kuttl_test}"
+    kubectl kuttl test --config ${kuttl_test}
+    if [ $? -ne 0 ]
+    then
+      echo "kuttl e2e test '${kuttl_test}' failure, exiting."
+      exit 1
+    fi
 }
 
 function kuttl-tests {
@@ -402,6 +458,7 @@ trap cleanup EXIT
 update_test_host
 check-prerequisites 
 kind-up-cluster
+extend-resources  # patch the nodes with GPU resources once the cluster is up, before the MCAD environment is set up
 setup-mcad-env
 # MCAD with quotamanagement options is started by kuttl-tests
 kuttl-tests
diff --git a/test/e2e-kuttl-extended-resources/steps/00-assert.yaml b/test/e2e-kuttl-extended-resources/steps/00-assert.yaml
@@ -0,0 +1,10 @@
+---
+# Verify that GPUs are a resource for the node
+apiVersion: v1
+kind: Node
+metadata:
+    name: test-worker  # NOTE(review): presumably the worker node of the Kind cluster; confirm against the cluster config
+status:
+    allocatable:  # NOTE(review): hack/run-e2e-kind.sh patches /status/capacity; assumes allocatable follows it - confirm
+        nvidia.com/gpu: "8"  # same count that extend-resources patches in (resource_count="8")
+
diff --git a/test/e2e-kuttl-extended-resources/steps/01-assert.yaml b/test/e2e-kuttl-extended-resources/steps/01-assert.yaml
@@ -0,0 +1,5 @@
+# Verify that the namespace was created
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: extended-resources
diff --git a/test/e2e-kuttl-extended-resources/steps/01-install.yaml b/test/e2e-kuttl-extended-resources/steps/01-install.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: extended-resources  # namespace that the gpu-job Job of step 02 is placed in
diff --git a/test/e2e-kuttl-extended-resources/steps/02-assert.yaml b/test/e2e-kuttl-extended-resources/steps/02-assert.yaml
@@ -0,0 +1,8 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+    name: gpu-job  # the Job defined in 02-install.yaml
+    namespace: extended-resources
+status:
+    conditions:
+        - type: Complete  # NOTE(review): only the condition type is listed, not its status; confirm this is sufficient
diff --git a/test/e2e-kuttl-extended-resources/steps/02-install.yaml b/test/e2e-kuttl-extended-resources/steps/02-install.yaml
@@ -0,0 +1,19 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+    name: gpu-job
+    namespace: extended-resources  # namespace defined in 01-install.yaml
+spec:
+    template:
+        spec:
+            restartPolicy: Never
+            containers:
+                - name: gpu-job
+                  image: ubuntu:latest  # NOTE(review): unpinned tag; consider pinning for reproducible runs
+                  command: [ "/bin/bash", "-c", "--" ]
+                  args: [ "sleep 10;" ]  # placeholder workload: only sleeps for 10 seconds
+                  resources:
+                      requests:
+                          nvidia.com/gpu: 8  # the full count that extend-resources adds to each node
+                      limits:
+                          nvidia.com/gpu: 8  # same value as the request
diff --git a/test/kuttl-test-extended-resources.yaml b/test/kuttl-test-extended-resources.yaml
@@ -0,0 +1,7 @@
+apiVersion: kuttl.dev/v1beta1
+kind: TestSuite
+testDirs:
+  - test/e2e-kuttl-extended-resources/  # NOTE(review): relative path; presumably resolved from the repository root - confirm
+timeout: 60  # NOTE(review): presumably seconds; must cover the image pull plus the 10 second sleep of gpu-job
+artifactsDir: _output/logs
+commands:  # no commands are listed for this suite