From 488c4166e373338c4d39051a8eff1f4483dadde7 Mon Sep 17 00:00:00 2001 From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:42:57 +0100 Subject: [PATCH] Backport "Update AMD Operator and NFD install scripts (#2139)" in releases/2.13.0 (#2169) Update AMD Operator and NFD install scripts (#2139) * workaround for amd certified operator in ocp < 4.16 * add NFD installation script and use it in AMD script * use NFD install script in NVIDIA script * minor change * update warn msg * rm unused function --- .../Provisioning/GPU/AMD/amd_operator.sh | 31 ++++++++++++++++--- .../Provisioning/GPU/NFD/install_nfd.sh | 29 +++++++++++++++++ .../GPU/{ => NFD}/nfd_deploy.yaml | 4 +-- .../GPU/{ => NFD}/nfd_operator.yaml | 0 .../Provisioning/GPU/NVIDIA/gpu_deploy.sh | 6 ++-- 5 files changed, 60 insertions(+), 10 deletions(-) create mode 100644 ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NFD}/nfd_deploy.yaml (85%) rename ods_ci/tasks/Resources/Provisioning/GPU/{ => NFD}/nfd_operator.yaml (100%) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh index 4ff4b4d88..aa27ef97b 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh @@ -157,6 +157,32 @@ EOF fi } +function applyWorkaroundForOlderOCPVersions () { + # workaround for OCP versions less than 4.16 + # AMD certified operator is published starting from OCP v4.16 + ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"') + IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion" + if [ "${ocpVersionSplit[1]}" -lt 16 ]; then + echo "OCP Version: $ocpVersion" + echo "AMD Operator is not available for versions < 4.16, hence creating custom catalog source as workaround" + oc apply -f - </$imageUrl/g" $NFD_INSTANCE +oc apply -f "$NFD_INSTANCE" diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml similarity index 85% rename from ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml rename to ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml index 113980150..ffa3dd1f9 100644 --- a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml +++ b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml @@ -7,8 +7,8 @@ spec: instance: "" # instance is empty by default topologyupdater: false # False by default operand: - # Image digest for registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 - image: registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:d6242132d2ddec00c46d22b63015a33af821eace0150ba47d185cd992fee317d + # Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11 + image: imagePullPolicy: Always workerConfig: configData: | diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_operator.yaml similarity index 100% rename from ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml rename to ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_operator.yaml diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh index 5edbff0b3..8ccf4f290 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh @@ -12,10 +12,9 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla sed -i'' -e "0,/v1.11/s//$CHANNEL/g" "$GPU_INSTALL_DIR/gpu_install.yaml" oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml" -oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml" -echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete" +/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh -oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd +echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete" oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified @@ -80,7 +79,6 @@ function rerun_accelerator_migration() { } wait_until_pod_ready_status "gpu-operator" -oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml" oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json oc apply -f clusterpolicy.json wait_until_pod_ready_status "nvidia-device-plugin-daemonset"