Skip to content

Commit 660984a

Browse files
author
Caspar van Leeuwen
committed
Merge branch 'fix_behavior_for_present_but_failing_nvidiasmi' into merge_pr908_pr910_pr911
2 parents c8c0ce1 + e6f89cc commit 660984a

File tree

3 files changed

+39
-7
lines changed

3 files changed

+39
-7
lines changed

EESSI-install-software.sh

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -271,11 +271,18 @@ fi
271271

272272
# Install NVIDIA drivers in host_injections (if they exist)
273273
if command_exists "nvidia-smi"; then
274-
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
275-
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
274+
nvidia-smi --version
275+
ec=$?
276+
if [ ${ec} -eq 0 ]; then
277+
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
278+
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
279+
else
280+
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
281+
echo "This script now assumes this is NOT a GPU node."
282+
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
283+
fi
276284
fi
277285

278-
279286
if [ ! -z "${shared_fs_path}" ]; then
280287
shared_eb_sourcepath=${shared_fs_path}/easybuild/sources
281288
echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path"

bot/build.sh

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,28 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}
252252
# prepare arguments to eessi_container.sh specific to build step
253253
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
254254
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
255+
255256
# add options required to handle NVIDIA support
256257
if command_exists "nvidia-smi"; then
257-
echo "Command 'nvidia-smi' found, using available GPU"
258-
BUILD_STEP_ARGS+=("--nvidia" "all")
258+
# Accept that this may fail
259+
set +e
260+
nvidia-smi --version
261+
ec=$?
262+
set -e
263+
if [ ${ec} -eq 0 ]; then
264+
echo "Command 'nvidia-smi' found, using available GPU"
265+
BUILD_STEP_ARGS+=("--nvidia" "all")
266+
else
267+
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
268+
echo "This script now assumes this is NOT a GPU node."
269+
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
270+
BUILD_STEP_ARGS+=("--nvidia" "install")
271+
fi
259272
else
260273
echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
261274
BUILD_STEP_ARGS+=("--nvidia" "install")
262275
fi
276+
263277
# Retain location for host injections so we don't reinstall CUDA
264278
# (Always need to run the driver installation as available driver may change)
265279
if [[ ! -z ${SHARED_FS_PATH} ]]; then

bot/test.sh

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,19 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")
215215

216216
# add options required to handle NVIDIA support
217217
if command_exists "nvidia-smi"; then
218-
echo "Command 'nvidia-smi' found, using available GPU"
219-
TEST_STEP_ARGS+=("--nvidia" "run")
218+
# Accept that this may fail
219+
set +e
220+
nvidia-smi --version
221+
ec=$?
222+
set -e
223+
if [ ${ec} -eq 0 ]; then
224+
echo "Command 'nvidia-smi' found, using available GPU"
225+
TEST_STEP_ARGS+=("--nvidia" "run")
226+
else
227+
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run successfully."
228+
echo "This script now assumes this is NOT a GPU node."
229+
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
230+
fi
220231
fi
221232

222233
# prepare arguments to test_suite.sh (specific to test step)

0 commit comments

Comments
 (0)