Skip to content

Commit a4d16e1

Browse files
authored
Adding GPUs to Kind cluster (#494)
* Triggering different build * Checking if nodes can be listed * Trigger build * Resource patching. Extending resources of Kubernetes nodes to include 'fake' GPUs. * Fixed command to describe nodes * Kuttl tests for checking if GPUs were added correctly to the nodes * Fixed namespace issue and node name issue. * Adding all tests again now that the resource extension passes * Changed where the extended resources are tested * Added error checks for 'curl' calls. Rename variables to use lower case. * Fixed 'if' equal operator
1 parent 19ded5c commit a4d16e1

File tree

7 files changed

+110
-0
lines changed

7 files changed

+110
-0
lines changed

hack/run-e2e-kind.sh

+57
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,63 @@ function setup-mcad-env {
373373
do
374374
echo -n "." && sleep 1;
375375
done
376+
}
377+
378+
function extend-resources {
  # Patch nodes to provide GPUs resources without physical GPUs.
  # This is intended to allow testing of GPU specific features such as histograms.
  #
  # Globals:
  #   ROOT_DIR (read) - base directory used to locate
  #                     test/kuttl-test-extended-resources.yaml
  # Arguments:
  #   None
  # Outputs:
  #   Progress and error messages on stdout.
  # Exits:
  #   With status 1 when the proxy cannot be reached, the nodes cannot be
  #   listed, a node cannot be patched, or the kuttl verification fails.
  #   The background proxy is stopped on every one of these paths.

  # Default listen address of 'kubectl proxy'; the same address is used for
  # the readiness probe and for the PATCH requests.
  local proxy_url="http://127.0.0.1:8001"
  # "~1" is the JSON Pointer escape for "/", i.e. this names "nvidia.com/gpu".
  local resource_name="nvidia.com~1gpu"
  local resource_count="8"
  local proxy_pid attempt connected node_names node_name
  local patch_body patch_response patching_status kuttl_test

  # Start communication with cluster
  kubectl proxy > /dev/null 2>&1 &
  proxy_pid=$!

  echo "Starting background proxy connection (pid=${proxy_pid})..."

  # The proxy is started asynchronously, so probe it a few times instead of
  # failing on the first attempt while it is still starting up.
  connected="false"
  for attempt in 1 2 3 4 5 6 7 8 9 10; do
    if curl "${proxy_url}" > /dev/null 2>&1; then
      connected="true"
      break
    fi
    sleep 1
  done

  if [[ "${connected}" != "true" ]]; then
    echo "Calling 'kubectl proxy' did not create a successful connection to the API server needed to patch the nodes. Exiting."
    _stop-kubectl-proxy "${proxy_pid}"
    exit 1
  fi
  echo "Connected to the API server for patching the nodes"

  # Patch nodes
  if ! node_names=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name"); then
    echo "Failed to list the cluster nodes"
    _stop-kubectl-proxy "${proxy_pid}"
    exit 1
  fi

  patch_body="[{\"op\": \"add\", \"path\": \"/status/capacity/${resource_name}\", \"value\": \"${resource_count}\"}]"

  # Word splitting is intended here: one node name per whitespace-separated word.
  # shellcheck disable=SC2086
  for node_name in ${node_names}; do
    echo "- Patching node (add): ${node_name}"

    # Treat a transport error (curl) or an unparsable reply (jq) as a failure
    # too, not only an API reply whose '.status' field is "Failure".
    patching_status="Failure"
    if patch_response=$(curl --silent --show-error \
      --header "Content-Type: application/json-patch+json" \
      --request PATCH \
      --data "${patch_body}" \
      "${proxy_url}/api/v1/nodes/${node_name}/status"); then
      patching_status=$(jq -r '.status' <<< "${patch_response}") || patching_status="Failure"
    fi

    if [[ "${patching_status}" == "Failure" ]]; then
      echo "Failed to patch node '${node_name}' with GPU resources"
      _stop-kubectl-proxy "${proxy_pid}"
      exit 1
    fi

    echo
  done

  # Stop communication with cluster
  echo "Killing proxy (pid=${proxy_pid})..."
  _stop-kubectl-proxy "${proxy_pid}"

  # Run kuttl tests to confirm GPUs were added correctly
  kuttl_test="${ROOT_DIR}/test/kuttl-test-extended-resources.yaml"
  echo "kubectl kuttl test --config ${kuttl_test}"
  if ! kubectl kuttl test --config "${kuttl_test}"; then
    echo "kuttl e2e test '${kuttl_test}' failure, exiting."
    exit 1
  fi
}

function _stop-kubectl-proxy {
  # Private helper of extend-resources: stop the background 'kubectl proxy'.
  # Arguments:
  #   $1 - pid of the background proxy process
  # Sends SIGTERM (not SIGKILL) and reaps the job; never fails, so it is safe
  # to call right before an 'exit'.
  kill "$1" > /dev/null 2>&1 || true
  wait "$1" 2> /dev/null || true
}
378434

379435
function kuttl-tests {
@@ -402,6 +458,7 @@ trap cleanup EXIT
402458
update_test_host
403459
check-prerequisites
404460
kind-up-cluster
461+
extend-resources
405462
setup-mcad-env
406463
# MCAD with quotamanagement options is started by kuttl-tests
407464
kuttl-tests
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
# Verify that GPUs are a resource for the node
# Expected state: the Node named "test-worker" lists the resource
# "nvidia.com/gpu" under status.allocatable with the string value "8".
3+
apiVersion: v1
4+
kind: Node
5+
metadata:
6+
name: test-worker
7+
status:
8+
allocatable:
9+
nvidia.com/gpu: "8"
10+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Verify that the namespace was created
# Expected object: a v1 Namespace whose metadata.name is "extended-resources".
2+
apiVersion: v1
3+
kind: Namespace
4+
metadata:
5+
name: extended-resources
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Namespace "extended-resources".
# NOTE(review): presumably the step that creates the namespace for this test
# case — confirm against the file name, which is not visible here.
apiVersion: v1
2+
kind: Namespace
3+
metadata:
4+
name: extended-resources
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Expected state of the Job "gpu-job" in namespace "extended-resources":
# its status.conditions contain an entry of type "Complete".
# NOTE(review): presumably a kuttl assert file — confirm against the file name.
apiVersion: batch/v1
2+
kind: Job
3+
metadata:
4+
name: gpu-job
5+
namespace: extended-resources
6+
status:
7+
conditions:
8+
- type: Complete
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Job "gpu-job" in namespace "extended-resources".
# Its single container runs "sleep 10;" through /bin/bash in ubuntu:latest and
# asks for 8 "nvidia.com/gpu" in both requests and limits.
apiVersion: batch/v1
2+
kind: Job
3+
metadata:
4+
name: gpu-job
5+
namespace: extended-resources
6+
spec:
7+
template:
8+
spec:
9+
# Pods of this Job are not restarted.
restartPolicy: Never
10+
containers:
11+
- name: gpu-job
12+
image: ubuntu:latest
13+
command: [ "/bin/bash", "-c", "--" ]
14+
args: [ "sleep 10;" ]
15+
resources:
16+
requests:
17+
nvidia.com/gpu: 8
18+
limits:
19+
nvidia.com/gpu: 8
+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# kuttl TestSuite that runs the test cases under
# test/e2e-kuttl-extended-resources/.
apiVersion: kuttl.dev/v1beta1
2+
kind: TestSuite
3+
testDirs:
4+
- test/e2e-kuttl-extended-resources/
5+
# NOTE(review): presumably seconds — confirm against the kuttl TestSuite docs.
timeout: 60
6+
# Directory where kuttl writes its artifacts.
artifactsDir: _output/logs
7+
# No entries are visible under this key in this view.
commands:

0 commit comments

Comments
 (0)