Skip to content

Commit

Permalink
Merge branch 'fix_behavior_for_present_but_failing_nvidiasmi' into merge_pr908_pr910_pr911
Browse files Browse the repository at this point in the history
  • Loading branch information
Caspar van Leeuwen committed Feb 12, 2025
2 parents c8c0ce1 + e6f89cc commit 660984a
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 7 deletions.
13 changes: 10 additions & 3 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -271,11 +271,18 @@ fi

# Install NVIDIA drivers in host_injections (if they exist).
# Finding 'nvidia-smi' on the PATH is not treated as proof of a usable GPU:
# 'nvidia-smi --version' is probed first, and the host libraries are only
# linked when that probe exits with status 0.
if command_exists "nvidia-smi"; then
    # Capture the probe's exit status through '||' so that a failing probe is
    # handled below and cannot abort the script should 'errexit' be active.
    ec=0
    nvidia-smi --version || ec=$?
    if [ "${ec}" -eq 0 ]; then
        echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
        "${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh"
    else
        # Deliberately non-fatal: the node is treated as a non-GPU node.
        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
        echo "This script now assumes this is NOT a GPU node."
        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
    fi
fi


if [ ! -z "${shared_fs_path}" ]; then
shared_eb_sourcepath=${shared_fs_path}/easybuild/sources
echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path"
Expand Down
18 changes: 16 additions & 2 deletions bot/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -252,14 +252,28 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}
# Arguments for eessi_container.sh that only apply to the build step:
# pass TARBALL_TMP_BUILD_STEP_DIR via --save and STORAGE via --storage.
BUILD_STEP_ARGS+=(
    "--save" "${TARBALL_TMP_BUILD_STEP_DIR}"
    "--storage" "${STORAGE}"
)

# add options required to handle NVIDIA support
# Exactly one '--nvidia <mode>' pair is appended to BUILD_STEP_ARGS:
#   - 'all'     when 'nvidia-smi' exists and 'nvidia-smi --version' succeeds
#   - 'install' when 'nvidia-smi' is missing, or present but failing
if command_exists "nvidia-smi"; then
    # Accept that this may fail: errexit is switched off around the probe so
    # its exit status can be inspected, then switched back on.
    set +e
    nvidia-smi --version
    ec=$?
    set -e
    if [ "${ec}" -eq 0 ]; then
        echo "Command 'nvidia-smi' found, using available GPU"
        BUILD_STEP_ARGS+=("--nvidia" "all")
    else
        # Deliberately non-fatal: fall back to the same mode as when
        # 'nvidia-smi' is not found at all.
        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
        echo "This script now assumes this is NOT a GPU node."
        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
        BUILD_STEP_ARGS+=("--nvidia" "install")
    fi
else
    echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
    BUILD_STEP_ARGS+=("--nvidia" "install")
fi

# Retain location for host injections so we don't reinstall CUDA
# (Always need to run the driver installation as available driver may change)
if [[ ! -z ${SHARED_FS_PATH} ]]; then
Expand Down
15 changes: 13 additions & 2 deletions bot/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,19 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")

# add options required to handle NVIDIA support
# '--nvidia run' is appended to TEST_STEP_ARGS only when 'nvidia-smi' exists
# and 'nvidia-smi --version' succeeds; otherwise no NVIDIA option is added.
if command_exists "nvidia-smi"; then
    # Accept that this may fail: errexit is switched off around the probe so
    # its exit status can be inspected, then switched back on.
    set +e
    nvidia-smi --version
    ec=$?
    set -e
    if [ "${ec}" -eq 0 ]; then
        echo "Command 'nvidia-smi' found, using available GPU"
        TEST_STEP_ARGS+=("--nvidia" "run")
    else
        # Deliberately non-fatal: the node is treated as a non-GPU node.
        echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
        echo "This script now assumes this is NOT a GPU node."
        echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
    fi
fi

# prepare arguments to test_suite.sh (specific to test step)
Expand Down

0 comments on commit 660984a

Please sign in to comment.