Skip to content

Commit

Permalink
Merge branch 'releases/2.13.0' into bkp_cert_amd_213
Browse files Browse the repository at this point in the history
  • Loading branch information
bdattoma authored Jan 31, 2025
2 parents 6e77896 + 10a5e62 commit efb01d3
Show file tree
Hide file tree
Showing 9 changed files with 175 additions and 67 deletions.
31 changes: 27 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,32 @@ EOF
fi
}

#######################################
# Workaround for OCP versions older than 4.16.
#
# The AMD certified operator is published starting from OCP v4.16. On older
# clusters this function creates a custom CatalogSource backed by the v4.16
# certified operator index, waits for its pod to become ready and points
# "$GPU_INSTALL_DIR/amd_gpu_install.yaml" at that catalog.
#
# Globals:
#   GPU_INSTALL_DIR  (read)    directory containing amd_gpu_install.yaml
#   ocpVersion       (written) full cluster version as reported by `oc version`
#   ocpVersionSplit  (written) ocpVersion split on '.'
# Arguments: none
# Outputs:   progress messages on stdout, a warning on stderr when the
#            cluster version cannot be parsed
# Returns:   0 when no workaround is needed or the version is unparsable,
#            otherwise the status of the final sed command
#######################################
function applyWorkaroundForOlderOCPVersions () {
  local amd_catalog="certified-operators-416-amd"
  local major minor

  # NOTE(review): ocpVersion/ocpVersionSplit are left global on purpose, in
  # case code elsewhere in the script reads them - confirm and make them
  # local if nothing does.
  ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"')
  IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
  major="${ocpVersionSplit[0]:-}"
  minor="${ocpVersionSplit[1]:-}"

  # Without a numeric major.minor there is nothing to compare; skip the
  # workaround (as before) but say so clearly instead of tripping over a
  # "[: integer expression expected" error.
  if ! [[ "$major" =~ ^[0-9]+$ && "$minor" =~ ^[0-9]+$ ]]; then
    echo "WARNING: unable to parse OCP version '$ocpVersion', skipping AMD catalog workaround" >&2
    return 0
  fi

  # Compare major AND minor so that e.g. 5.2 is not mistaken for "< 4.16".
  # The 10# prefix forces base 10 in case a component has a leading zero.
  if (( 10#$major > 4 || (10#$major == 4 && 10#$minor >= 16) )); then
    return 0
  fi

  echo "OCP Version: $ocpVersion"
  echo "AMD Operator is not available for versions < 4.16, hence creating custom catalog source as workaround"
  oc apply -f - <<EOF
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: ${amd_catalog}
  namespace: openshift-marketplace
spec:
  displayName: Certfied operator
  image: 'registry.redhat.io/redhat/certified-operator-index:v4.16'
  publisher: RHOAI QE
  sourceType: grpc
EOF
  oc wait --timeout="120s" --for=condition=ready=true pod -n openshift-marketplace -l "olm.catalogSource=${amd_catalog}"

  # The optional "-416-amd" group makes the rewrite idempotent: a manifest
  # that was already patched by a previous run is matched as a whole instead
  # of getting the suffix appended a second time.
  sed -i'' -e "s/certified-operators\(-416-amd\)\{0,1\}/${amd_catalog}/g" "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
}

applyWorkaroundForOlderOCPVersions
check_registry
status=$?

Expand All @@ -182,10 +208,7 @@ fi
sleep 120
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
Expand Down
29 changes: 29 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Installs the Node Feature Discovery (NFD) operator and applies its instance.
#
# Steps:
#   1. apply nfd_operator.yaml (located next to this script) and wait for the
#      "nfd" subscription in openshift-nfd to reach state AtLatestKnown;
#   2. pick the NFD operand image matching the cluster's OCP x.y version;
#   3. substitute the <imageUrl> placeholder in nfd_deploy.yaml (in place)
#      and apply it.
#
# Requires `oc` (logged in to the target cluster) and `jq` on the PATH.
set -e

NFD_INSTALL_DIR="$(dirname "$0")"
NFD_INSTANCE="$NFD_INSTALL_DIR/nfd_deploy.yaml"
echo "Installing NFD operator"
oc apply -f "$NFD_INSTALL_DIR/nfd_operator.yaml"
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"')
IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
xyVersion="${ocpVersionSplit[0]:-}.${ocpVersionSplit[1]:-}"

# Operand image per OCP x.y version. The pullspecs are stored verbatim (no
# sed escaping) so that the log lines below show the real image reference.
declare -A images=(
  ["4.14"]="registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:2977e67a413882efbfb90b52facf65d38a5cb2cd7a232ca3a69476e5dec33319"
  ["4.15"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:661b6697dee34626a3a98b50cdba787402ab214d2807b8460df92e3c79cdfcc5"
  ["4.16"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:bb95bc317ab78e8af4ef34dd66f9f62c2f8c261dfb5eab40918142812802f8b7"
  ["4.17"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:154cf3f1ddaf895d7ecd04947bd455a930132f72acc6e8bde8c26bc123184ace"
  # 4.18 is a pre-release image. We need to update it later
  ["4.18"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:510cb4351253492455664b6c323f54dc2f6f2f8791c5e92ba6b7e60b8adb357c"
)

if [[ -n "${images[$xyVersion]:-}" ]]; then
  imageUrl="${images[$xyVersion]}"
  echo "Using image SHA for $xyVersion: $imageUrl"
else
  # Unknown (or unparsable) version: fall back to the 4.17 image.
  imageUrl="${images["4.17"]}"
  echo "WARNING: I don't know the sha for $xyVersion. Re-using default 4.17 $imageUrl. It might not work!"
fi

# '|' is used as the sed delimiter because the pullspec contains '/'.
# The manifest path is quoted so a checkout path with spaces does not break.
sed -i'' -e "s|<imageUrl>|$imageUrl|g" "$NFD_INSTANCE"
oc apply -f "$NFD_INSTANCE"
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ spec:
instance: "" # instance is empty by default
topologyupdater: false # False by default
operand:
# Image digest for registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
image: registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:d6242132d2ddec00c46d22b63015a33af821eace0150ba47d185cd992fee317d
# Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
image: <imageUrl>
imagePullPolicy: Always
workerConfig:
configData: |
Expand Down
6 changes: 2 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla
sed -i'' -e "0,/v1.11/s//$CHANNEL/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

Expand Down Expand Up @@ -80,7 +79,6 @@ function rerun_accelerator_migration() {
}

wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,9 @@ Clean Up Server
Should Match "${ls_server}" "${EMPTY}"

Get User Notebook Pod Name
[Documentation] Returns notebook pod name for given username (e.g. for user ldap-admin1 it will be jupyterhub-nb-ldap-2dadmin1)
[Documentation] Returns notebook pod name for given username (e.g. for user ldap-admin1 it will be jupyter-nb-ldap-2dadmin1-0)
[Arguments] ${username}
${safe_username}= Get Safe Username ${username}
#${notebook_pod_name}= Set Variable jupyterhub-nb-${safe_username}
${notebook_pod_name}= Set Variable jupyter-nb-${safe_username}-0
RETURN ${notebook_pod_name}

Expand Down
100 changes: 74 additions & 26 deletions ods_ci/tests/Tests/0200__rhoai_upgrade/0201__pre_upgrade.robot
Original file line number Diff line number Diff line change
@@ -1,34 +1,40 @@
*** Settings ***
Documentation Test Suite for Upgrade testing, to be run before the upgrade
Library OpenShiftLibrary
Resource ../../Resources/RHOSi.resource
Resource ../../Resources/ODS.robot
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDashboard.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDashboardResources.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHModelServing.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/DataConnections.resource
Resource ../../Resources/Page/ODH/JupyterHub/HighAvailability.robot
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Projects.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource
Resource ../../Resources/Page/ODH/AiApps/Anaconda.resource
Resource ../../Resources/Page/LoginPage.robot
Resource ../../Resources/Page/OCPLogin/OCPLogin.robot
Resource ../../Resources/Common.robot
Resource ../../Resources/Page/OCPDashboard/Pods/Pods.robot
Resource ../../Resources/Page/OCPDashboard/Builds/Builds.robot
Resource ../../Resources/Page/HybridCloudConsole/OCM.robot
Resource ../../Resources/CLI/ModelServing/modelmesh.resource
Resource ../../Resources/Page/DistributedWorkloads/DistributedWorkloads.resource
Resource ../../Resources/Page/DistributedWorkloads/WorkloadMetricsUI.resource
Suite Setup Dashboard Suite Setup
Suite Teardown RHOSi Teardown
Test Tags PreUpgrade
Documentation Test Suite for Upgrade testing, to be run before the upgrade
Library OpenShiftLibrary
Resource ../../Resources/RHOSi.resource
Resource ../../Resources/ODS.robot
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDashboard.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDashboardResources.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHModelServing.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/DataConnections.resource
Resource ../../Resources/Page/ODH/JupyterHub/HighAvailability.robot
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Projects.resource
Resource ../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/ModelServer.resource
Resource ../../Resources/Page/ODH/AiApps/Anaconda.resource
Resource ../../Resources/Page/LoginPage.robot
Resource ../../Resources/Page/OCPLogin/OCPLogin.robot
Resource ../../Resources/Common.robot
Resource ../../Resources/Page/OCPDashboard/Pods/Pods.robot
Resource ../../Resources/Page/OCPDashboard/Builds/Builds.robot
Resource ../../Resources/Page/HybridCloudConsole/OCM.robot
Resource ../../Resources/CLI/ModelServing/modelmesh.resource
Resource ../../Resources/Page/DistributedWorkloads/DistributedWorkloads.resource
Resource ../../Resources/Page/DistributedWorkloads/WorkloadMetricsUI.resource

Suite Setup Upgrade Suite Setup
Suite Teardown RHOSi Teardown

Test Tags PreUpgrade


*** Variables ***
${CUSTOM_CULLER_TIMEOUT} 60000
${S_SIZE} 25
${DW_PROJECT_CREATED} False
${CODE} while True: import time ; time.sleep(10); print ("Hello")
${UPGRADE_NS} upgrade
${UPGRADE_CONFIG_MAP} upgrade-config-map


*** Test Cases ***
Expand Down Expand Up @@ -204,12 +210,54 @@ Run Training Operator ODH Setup Sleep PyTorchJob Test Use Case
Run Training Operator ODH Upgrade Test TestSetupSleepPytorchjob
[Teardown] Teardown Training Operator E2E Upgrade Test Suite

Long Running Jupyter Notebook
    [Documentation]    Starts a long running notebook before the upgrade and stores the
    ...    creation timestamp of its pod in the upgrade ConfigMap
    [Tags]    Upgrade
    Launch Notebook
    Add And Run JupyterLab Code Cell In Active Notebook    ${CODE}

    # Read the creation timestamp of the notebook pod
    ${pod_name}=    Get User Notebook Pod Name    ${TEST_USER2.USERNAME}
    ${get_timestamp_cmd}=    Catenate
    ...    oc get pod -n ${NOTEBOOKS_NAMESPACE} ${pod_name}
    ...    --no-headers
    ...    --output='custom-columns=TIMESTAMP:.metadata.creationTimestamp'
    ${rc}    ${pod_created_at} =    Run And Return Rc And Output    ${get_timestamp_cmd}
    Should Be Equal As Integers    ${rc}    0    msg=${pod_created_at}

    # Persist the timestamp in an OpenShift ConfigMap so the next upgrade phase can read it
    ${create_configmap_cmd}=    Catenate
    ...    oc create configmap ${UPGRADE_CONFIG_MAP} -n ${UPGRADE_NS}
    ...    --from-literal=ntb_creation_timestamp=${pod_created_at}
    ${rc}    ${create_output} =    Run And Return Rc And Output    ${create_configmap_cmd}
    Should Be Equal As Integers    ${rc}    0    msg=${create_output}

    Close Browser


*** Keywords ***
Dashboard Suite Setup
[Documentation] Basic suite setup
Launch Notebook
    [Documentation]    Begins a web test as the given user, launches Jupyter from the RHODS
    ...    dashboard link and spawns a notebook with the requested image
    [Arguments]    ${notebook_image}=minimal-notebook
    ...    ${username}=${TEST_USER2.USERNAME}
    ...    ${password}=${TEST_USER2.PASSWORD}
    ...    ${auth_type}=${TEST_USER2.AUTH_TYPE}
    Begin Web Test
    ...    username=${username}
    ...    password=${password}
    ...    auth_type=${auth_type}
    Launch Jupyter From RHODS Dashboard Link
    Spawn Notebook With Arguments    image=${notebook_image}    username=${username}
    ...    password=${password}    auth_type=${auth_type}

Upgrade Suite Setup
    [Documentation]    Suite setup: runs the RHOSi setup and (re)creates the namespace used to
    ...    share values between the different upgrade test phases
    Set Library Search Order    SeleniumLibrary
    RHOSi Setup
    # The shared namespace is recreated from scratch:
    # first remove it (a missing namespace is not an error), then create it again
    FOR    ${oc_command}    IN
    ...    oc delete namespace --wait --ignore-not-found ${UPGRADE_NS}
    ...    oc create namespace ${UPGRADE_NS}
        ${rc}    ${output} =    Run And Return Rc And Output    ${oc_command}
        Should Be Equal As Integers    ${rc}    0    msg=${output}
    END

Dashboard Test Teardown
[Documentation] Basic suite teardown
Expand Down
35 changes: 11 additions & 24 deletions ods_ci/tests/Tests/0200__rhoai_upgrade/0202__during_upgrade.robot
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,7 @@ Library JupyterLibrary
Test Tags DuringUpgrade


*** Variables ***
${CODE} while True: import time ; time.sleep(10); print ("Hello")


*** Test Cases ***
Long Running Jupyter Notebook
[Documentation] Launch a long running notebook before the upgrade
[Tags] Upgrade
Launch Notebook
Add And Run JupyterLab Code Cell In Active Notebook ${CODE}
${return_code} ${timestamp} Run And Return Rc And Output oc get pod -n ${NOTEBOOKS_NAMESPACE} jupyter-nb-ldap-2dadmin2-0 --no-headers --output='custom-columns=TIMESTAMP:.metadata.creationTimestamp' #robocop:disable
Should Be Equal As Integers ${return_code} 0
Set Global Variable ${timestamp} #robocop: disable
Close Browser

Upgrade RHODS
[Documentation] Approve the install plan for the upgrade and make sure that upgrade has completed
[Tags] ODS-1766
Expand Down Expand Up @@ -58,17 +44,18 @@ PyTorch Image Workload Test

*** Keywords ***
Launch Notebook
[Documentation] Launch notebook for the suite
[Arguments] ${notbook_image}=minimal-notebook ${username}=${TEST_USER2.USERNAME} ${password}=${TEST_USER2.PASSWORD} ${auth_type}=${TEST_USER2.AUTH_TYPE} #robocop: disable
Begin Web Test username=${username} password=${password} auth_type=${auth_type}
Login To RHODS Dashboard ${username} ${password} ${auth_type}
Wait For RHODS Dashboard To Load
[Documentation] Launch notebook for the suite
[Arguments] ${notebook_image}=minimal-notebook
... ${username}=${TEST_USER2.USERNAME}
... ${password}=${TEST_USER2.PASSWORD}
... ${auth_type}=${TEST_USER2.AUTH_TYPE}
Begin Web Test username=${username} password=${password} auth_type=${auth_type}
Launch Jupyter From RHODS Dashboard Link
Login To Jupyterhub ${username} ${password} ${auth_type}
${authorization_required} Is Service Account Authorization Required
IF ${authorization_required} Authorize Jupyterhub Service Account
Fix Spawner Status
Spawn Notebook With Arguments image=${notbook_image} username=${username} password=${password} auth_type=${auth_type} #robocop: disable
Spawn Notebook With Arguments
... image=${notebook_image}
... username=${username}
... password=${password}
... auth_type=${auth_type}

Upgrade Test Teardown
End Web Test
Expand Down
34 changes: 29 additions & 5 deletions ods_ci/tests/Tests/0200__rhoai_upgrade/0203__post_upgrade.robot
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Test Tags PostUpgrade
*** Variables ***
${S_SIZE} 25
${DW_PROJECT_CREATED} False
${UPGRADE_NS} upgrade
${UPGRADE_CONFIG_MAP} upgrade-config-map


*** Test Cases ***
Expand Down Expand Up @@ -63,11 +65,26 @@ Verify Culler is Enabled
END

Verify Notebook Has Not Restarted
[Documentation] Verify Notbook pod has not restarted after the upgrade
[Tags] Upgrade
${return_code} ${new_timestamp} Run And Return Rc And Output oc get pod -n ${NOTEBOOKS_NAMESPACE} jupyter-nb-ldap-2dadmin2-0 --no-headers --output='custom-columns=TIMESTAMP:.metadata.creationTimestamp' #robocop:disable
Should Be Equal As Integers ${return_code} 0
Should Be Equal ${timestamp} ${new_timestamp} msg=Running notebook pod has restarted
[Documentation] Verify Notebook pod has not restarted after the upgrade
[Tags] Upgrade
${notebook_name}= Get User CR Notebook Name ${TEST_USER2.USERNAME}
${notebook_pod_name}= Get User Notebook Pod Name ${TEST_USER2.USERNAME}

# Get the running notebook creation timestamp
${return_code} ${new_timestamp} Run And Return Rc And Output
... oc get pod -n ${NOTEBOOKS_NAMESPACE} ${notebook_pod_name} --no-headers --output='custom-columns=TIMESTAMP:.metadata.creationTimestamp' # robocop: disable: line-too-long
Should Be Equal As Integers ${return_code} 0 msg=${new_timestamp}

# Get the running notebook creation timestamp from the upgrade ConfigMap saved in the previous
# phase (before the actual RHOAI upgrade)
${return_code} ${ntb_creation_timestamp} Run And Return Rc And Output
... oc get configmap ${UPGRADE_CONFIG_MAP} -n ${UPGRADE_NS} -o jsonpath='{.data.ntb_creation_timestamp}'
Should Be Equal As Integers ${return_code} 0 msg=${ntb_creation_timestamp}

# The timestamps should be equal
Should Be Equal ${ntb_creation_timestamp} ${new_timestamp} msg=Running notebook pod has restarted

[Teardown] Terminate Running Notebook ${notebook_name}

Verify Custom Image Is Present
[Tags] Upgrade
Expand Down Expand Up @@ -276,6 +293,13 @@ Delete OOTB Image
IF not ${status} Fail Notebook image is deleted after the upgrade
IF not ${IS_SELF_MANAGED} Managed RHOAI Upgrade Test Teardown

Terminate Running Notebook
    [Documentation]    Deletes the Notebook custom resource with the given name
    ...    from the notebooks namespace
    [Arguments]    ${notebook_name}
    ${rc}    ${delete_output}    Run And Return Rc And Output
    ...    oc delete Notebook.kubeflow.org -n ${NOTEBOOKS_NAMESPACE} ${notebook_name}
    Should Be Equal As Integers    ${rc}    0    msg=${delete_output}

Managed RHOAI Upgrade Test Teardown
[Documentation] Check rhods_aggregate_availability metric when RHOAI is installed as managed
${expression} = Set Variable rhods_aggregate_availability&step=1
Expand Down

0 comments on commit efb01d3

Please sign in to comment.