@@ -72,7 +72,7 @@ function compare_versions_lt() {
72
72
73
73
#######################################
# Print the value of an instance metadata attribute, falling back to a
# default when the lookup command exits non-zero.
# Arguments:
#   $1 - attribute name; looked up as "attributes/<name>"
#   $2 - optional fallback value; defaults to the empty string
# Outputs:
#   Writes the attribute value (or the fallback) to stdout. The fallback is
#   written without a trailing newline.
# Returns:
#   Exit status of the lookup, or of the fallback print when the lookup fails.
#######################################
function get_metadata_attribute() {
  local -r attribute_name="$1"
  # ${2:-} keeps the second argument optional, including under `set -u`.
  local -r default_value="${2:-}"
  # printf '%s' emits the fallback literally; `echo -n` would misread a
  # value such as "-e" or "-n" as an option.
  /usr/share/google/get_metadata_value "attributes/${attribute_name}" \
    || printf '%s' "${default_value}"
}
78
78
@@ -81,7 +81,7 @@ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
81
81
readonly OS_NAME
82
82
83
83
# node role
84
- ROLE=" $( /usr/share/google/get_metadata_value attributes/ dataproc-role) "
84
+ ROLE=" $( get_metadata_attribute dataproc-role) "
85
85
readonly ROLE
86
86
87
87
# CUDA version and Driver version
@@ -104,7 +104,7 @@ readonly -A NCCL_FOR_CUDA=( [10.1]="2.4.8" [10.2]="2.5.6"
104
104
readonly -A CUDA_SUBVER=( [10.1]=" 10.1.243" [10.2]=" 10.2.89"
105
105
[11.0]=" 11.0.3" [11.1]=" 11.1.0" [11.2]=" 11.2.2"
106
106
[11.5]=" 11.5.2" [11.6]=" 11.6.2" [11.7]=" 11.7.1"
107
- [11.8]=" 11.8.0" [12.4]=" 12.4.0 "
107
+ [11.8]=" 11.8.0" [12.4]=" 12.4.1 "
108
108
)
109
109
110
110
RUNTIME=$( get_metadata_attribute ' rapids-runtime' ' SPARK' )
@@ -268,7 +268,7 @@ function install_nvidia_nccl() {
268
268
execute_with_retries " apt-get update"
269
269
270
270
execute_with_retries \
271
- " apt-get install -y --allow-unauthenticated libnccl2=${nccl_version} libnccl-dev=${nccl_version} "
271
+ " apt-get install -y --allow-unauthenticated libnccl2=${nccl_version} libnccl-dev=${nccl_version} "
272
272
elif [[ ${OS_NAME} == debian ]]; then
273
273
echo " nccl not packaged for debian"
274
274
else
@@ -299,25 +299,25 @@ function install_nvidia_cudnn() {
299
299
" apt-get install -y --no-install-recommends ${packages[*]} "
300
300
elif [[ ${OS_NAME} == debian ]]; then
301
301
if is_debian12; then
302
- apt-get -y install nvidia-cudnn
302
+ apt-get -y install nvidia-cudnn
303
303
elif is_debian11; then
304
- apt-get -y install cudnn9-cuda-12
304
+ apt-get -y install cudnn9-cuda-12
305
305
else
306
- local tmp_dir
307
- tmp_dir=$( mktemp -d -t gpu-init-action-cudnn-XXXX)
306
+ local tmp_dir
307
+ tmp_dir=$( mktemp -d -t gpu-init-action-cudnn-XXXX)
308
308
309
- curl -fSsL --retry-connrefused --retry 10 --retry-max-time 30 \
310
- " ${CUDNN_TARBALL_URL} " -o " ${tmp_dir} /${CUDNN_TARBALL} "
309
+ curl -fSsL --retry-connrefused --retry 10 --retry-max-time 30 \
310
+ " ${CUDNN_TARBALL_URL} " -o " ${tmp_dir} /${CUDNN_TARBALL} "
311
311
312
- if ( compare_versions_lte " ${CUDNN_VERSION} " " 8.3.0.98" ); then
313
- tar -xzf " ${tmp_dir} /${CUDNN_TARBALL} " -C /usr/local
314
- else
315
- ln -sf /usr/local/cuda/targets/x86_64-linux/lib /usr/local/cuda/lib
316
- tar -h --no-same-owner --strip-components=1 \
317
- -xJf " ${tmp_dir} /${CUDNN_TARBALL} " -C /usr/local/cuda
318
- fi
312
+ if ( compare_versions_lte " ${CUDNN_VERSION} " " 8.3.0.98" ); then
313
+ tar -xzf " ${tmp_dir} /${CUDNN_TARBALL} " -C /usr/local
314
+ else
315
+ ln -sf /usr/local/cuda/targets/x86_64-linux/lib /usr/local/cuda/lib
316
+ tar -h --no-same-owner --strip-components=1 \
317
+ -xJf " ${tmp_dir} /${CUDNN_TARBALL} " -C /usr/local/cuda
318
+ fi
319
319
320
- cat << 'EOF ' >>/etc/profile.d/cudnn.sh
320
+ cat << 'EOF ' >>/etc/profile.d/cudnn.sh
321
321
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
322
322
EOF
323
323
fi
333
333
334
334
CA_TMPDIR=" $( mktemp -u -d -p /run/tmp -t ca_dir-XXXX) "
335
335
function configure_dkms_certs() {
336
- if [[ -z " $( /usr/share/google/get_metadata_value attributes/ private_secret_name) " ]]; then
336
+ if [[ -z " $( get_metadata_attribute private_secret_name) " ]]; then
337
337
echo " No signing secret provided. skipping" ;
338
338
return 0
339
339
fi
@@ -345,11 +345,11 @@ function configure_dkms_certs() {
345
345
echo " Private key material exists"
346
346
347
347
local expected_modulus_md5sum
348
- expected_modulus_md5sum=$( /usr/share/google/get_metadata_value attributes/ cert_modulus_md5sum)
348
+ expected_modulus_md5sum=$( get_metadata_attribute cert_modulus_md5sum)
349
349
if [[ -n " ${expected_modulus_md5sum} " ]]; then
350
- modulus_md5sum=" ${expected_modulus_md5sum} "
350
+ modulus_md5sum=" ${expected_modulus_md5sum} "
351
351
else
352
- modulus_md5sum=" bd40cf5905c7bba4225d330136fdbfd3"
352
+ modulus_md5sum=" bd40cf5905c7bba4225d330136fdbfd3"
353
353
fi
354
354
355
355
# Verify that cert md5sum matches expected md5sum
@@ -369,13 +369,13 @@ function configure_dkms_certs() {
369
369
370
370
# Retrieve cloud secrets keys
371
371
local sig_priv_secret_name
372
- sig_priv_secret_name=$( /usr/share/google/get_metadata_value attributes/ private_secret_name)
372
+ sig_priv_secret_name=$( get_metadata_attribute private_secret_name)
373
373
local sig_pub_secret_name
374
- sig_pub_secret_name=$( /usr/share/google/get_metadata_value attributes/ public_secret_name)
374
+ sig_pub_secret_name=$( get_metadata_attribute public_secret_name)
375
375
local sig_secret_project
376
- sig_secret_project=$( /usr/share/google/get_metadata_value attributes/ secret_project)
376
+ sig_secret_project=$( get_metadata_attribute secret_project)
377
377
local sig_secret_version
378
- sig_secret_version=$( /usr/share/google/get_metadata_value attributes/ secret_version)
378
+ sig_secret_version=$( get_metadata_attribute secret_version)
379
379
380
380
# If metadata values are not set, do not write mok keys
381
381
if [[ -z " ${sig_priv_secret_name} " ]]; then return 0 ; fi
@@ -400,7 +400,7 @@ function configure_dkms_certs() {
400
400
}
401
401
402
402
function clear_dkms_key {
403
- if [[ -z " $( /usr/share/google/get_metadata_value attributes/ private_secret_name) " ]]; then
403
+ if [[ -z " $( get_metadata_attribute private_secret_name) " ]]; then
404
404
echo " No signing secret provided. skipping" ;
405
405
return 0
406
406
fi
@@ -428,11 +428,11 @@ function add_repo_nvidia_container_toolkit() {
428
428
if is_debian ; then
429
429
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
430
430
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
431
- | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
431
+ | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
432
432
433
433
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
434
- | sed ' s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
435
- | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
434
+ | sed ' s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
435
+ | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
436
436
437
437
apt-get update
438
438
fi
@@ -441,19 +441,22 @@ function add_repo_nvidia_container_toolkit() {
441
441
442
442
# Install NVIDIA GPU driver provided by NVIDIA
443
443
function install_nvidia_gpu_driver() {
444
+ workdir=/opt/install-nvidia-driver
445
+ mkdir -p " ${workdir} "
446
+ pushd " ${workdir} "
444
447
445
448
if is_debian12 ; then
446
449
add_nonfree_components
447
450
add_repo_nvidia_container_toolkit
448
451
configure_dkms_certs
449
452
apt-get -yq install \
450
- nvidia-container-toolkit \
451
- dkms \
452
- nvidia-open-kernel-dkms \
453
- nvidia-open-kernel-support \
454
- nvidia-smi \
455
- libglvnd0 \
456
- libcuda1
453
+ nvidia-container-toolkit \
454
+ dkms \
455
+ nvidia-open-kernel-dkms \
456
+ nvidia-open-kernel-support \
457
+ nvidia-smi \
458
+ libglvnd0 \
459
+ libcuda1
457
460
clear_dkms_key
458
461
459
462
elif is_debian ; then
@@ -467,23 +470,28 @@ function install_nvidia_gpu_driver() {
467
470
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
468
471
" ${NVIDIA_DEBIAN_GPU_DRIVER_URL} " -o driver.run
469
472
bash " ./driver.run" --no-kernel-modules --silent --install-libglvnd
470
- rm -rf driver.run
473
+ rm -f driver.run
471
474
git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git --branch " ${NVIDIA_DEBIAN_GPU_DRIVER_VERSION} " --single-branch
472
475
pushd open-gpu-kernel-modules
473
476
make -j$( nproc) modules
474
- configure_dkms_certs
475
- for module in $( find kernel-open -name ' *.ko' ) ; do
476
- /lib/modules/$( uname -r) /build/scripts/sign-file sha256 /var/lib/dkms/mok.key /var/lib/dkms/mok.pub " ${module} "
477
- done
478
- clear_dkms_key
477
+ if [[ -n " $( get_metadata_attribute private_secret_name) " ]]; then
478
+ configure_dkms_certs
479
+ for module in $( find kernel-open -name ' *.ko' ) ; do
480
+ /lib/modules/$( uname -r) /build/scripts/sign-file sha256 \
481
+ /var/lib/dkms/mok.key \
482
+ /var/lib/dkms/mok.pub \
483
+ " ${module} "
484
+ done
485
+ clear_dkms_key
486
+ fi
479
487
make modules_install
480
488
depmod -a
481
489
popd
482
490
483
491
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
484
492
" ${NVIDIA_DEBIAN_CUDA_URL} " -o cuda.run
485
493
bash " ./cuda.run" --silent --toolkit --no-opengl-libs
486
- rm -rf cuda.run
494
+ rm -f cuda.run
487
495
elif [[ ${OS_NAME} == ubuntu ]]; then
488
496
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
489
497
" ${NVIDIA_UBUNTU_REPO_KEY_PACKAGE} " -o /tmp/cuda-keyring.deb
@@ -522,6 +530,7 @@ function install_nvidia_gpu_driver() {
522
530
fi
523
531
ldconfig
524
532
echo " NVIDIA GPU driver provided by NVIDIA was installed successfully"
533
+ popd
525
534
}
526
535
527
536
# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
0 commit comments