Skip to content

Commit 94e3f19

Browse files
authored
[gpu] do not require signing metadata (#1193)
* wrapped signing code in conditional that checks for private key
* replaced tabs with spaces
* made use of get_metadata_attribute more frequently
* using default value of "" unless otherwise supplied
* using CUDA subversion of 12.4.1
* building source in /opt/install-nvidia-driver
1 parent f5816e0 commit 94e3f19

File tree

1 file changed

+53
-44
lines changed

1 file changed

+53
-44
lines changed

gpu/install_gpu_driver.sh

Lines changed: 53 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ function compare_versions_lt() {
7272

7373
function get_metadata_attribute() {
7474
local -r attribute_name=$1
75-
local -r default_value=$2
75+
local -r default_value="${2:-}"
7676
/usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
7777
}
7878

@@ -81,7 +81,7 @@ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
8181
readonly OS_NAME
8282

8383
# node role
84-
ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
84+
ROLE="$(get_metadata_attribute dataproc-role)"
8585
readonly ROLE
8686

8787
# CUDA version and Driver version
@@ -104,7 +104,7 @@ readonly -A NCCL_FOR_CUDA=( [10.1]="2.4.8" [10.2]="2.5.6"
104104
readonly -A CUDA_SUBVER=( [10.1]="10.1.243" [10.2]="10.2.89"
105105
[11.0]="11.0.3" [11.1]="11.1.0" [11.2]="11.2.2"
106106
[11.5]="11.5.2" [11.6]="11.6.2" [11.7]="11.7.1"
107-
[11.8]="11.8.0" [12.4]="12.4.0"
107+
[11.8]="11.8.0" [12.4]="12.4.1"
108108
)
109109

110110
RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
@@ -268,7 +268,7 @@ function install_nvidia_nccl() {
268268
execute_with_retries "apt-get update"
269269

270270
execute_with_retries \
271-
"apt-get install -y --allow-unauthenticated libnccl2=${nccl_version} libnccl-dev=${nccl_version}"
271+
"apt-get install -y --allow-unauthenticated libnccl2=${nccl_version} libnccl-dev=${nccl_version}"
272272
elif [[ ${OS_NAME} == debian ]]; then
273273
echo "nccl not packaged for debian"
274274
else
@@ -299,25 +299,25 @@ function install_nvidia_cudnn() {
299299
"apt-get install -y --no-install-recommends ${packages[*]}"
300300
elif [[ ${OS_NAME} == debian ]]; then
301301
if is_debian12; then
302-
apt-get -y install nvidia-cudnn
302+
apt-get -y install nvidia-cudnn
303303
elif is_debian11; then
304-
apt-get -y install cudnn9-cuda-12
304+
apt-get -y install cudnn9-cuda-12
305305
else
306-
local tmp_dir
307-
tmp_dir=$(mktemp -d -t gpu-init-action-cudnn-XXXX)
306+
local tmp_dir
307+
tmp_dir=$(mktemp -d -t gpu-init-action-cudnn-XXXX)
308308

309-
curl -fSsL --retry-connrefused --retry 10 --retry-max-time 30 \
310-
"${CUDNN_TARBALL_URL}" -o "${tmp_dir}/${CUDNN_TARBALL}"
309+
curl -fSsL --retry-connrefused --retry 10 --retry-max-time 30 \
310+
"${CUDNN_TARBALL_URL}" -o "${tmp_dir}/${CUDNN_TARBALL}"
311311

312-
if ( compare_versions_lte "${CUDNN_VERSION}" "8.3.0.98" ); then
313-
tar -xzf "${tmp_dir}/${CUDNN_TARBALL}" -C /usr/local
314-
else
315-
ln -sf /usr/local/cuda/targets/x86_64-linux/lib /usr/local/cuda/lib
316-
tar -h --no-same-owner --strip-components=1 \
317-
-xJf "${tmp_dir}/${CUDNN_TARBALL}" -C /usr/local/cuda
318-
fi
312+
if ( compare_versions_lte "${CUDNN_VERSION}" "8.3.0.98" ); then
313+
tar -xzf "${tmp_dir}/${CUDNN_TARBALL}" -C /usr/local
314+
else
315+
ln -sf /usr/local/cuda/targets/x86_64-linux/lib /usr/local/cuda/lib
316+
tar -h --no-same-owner --strip-components=1 \
317+
-xJf "${tmp_dir}/${CUDNN_TARBALL}" -C /usr/local/cuda
318+
fi
319319

320-
cat <<'EOF' >>/etc/profile.d/cudnn.sh
320+
cat <<'EOF' >>/etc/profile.d/cudnn.sh
321321
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
322322
EOF
323323
fi
@@ -333,7 +333,7 @@ EOF
333333

334334
CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
335335
function configure_dkms_certs() {
336-
if [[ -z "$(/usr/share/google/get_metadata_value attributes/private_secret_name)" ]]; then
336+
if [[ -z "$(get_metadata_attribute private_secret_name)" ]]; then
337337
echo "No signing secret provided. skipping";
338338
return 0
339339
fi
@@ -345,11 +345,11 @@ function configure_dkms_certs() {
345345
echo "Private key material exists"
346346

347347
local expected_modulus_md5sum
348-
expected_modulus_md5sum=$(/usr/share/google/get_metadata_value attributes/cert_modulus_md5sum)
348+
expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
349349
if [[ -n "${expected_modulus_md5sum}" ]]; then
350-
modulus_md5sum="${expected_modulus_md5sum}"
350+
modulus_md5sum="${expected_modulus_md5sum}"
351351
else
352-
modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
352+
modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
353353
fi
354354

355355
# Verify that cert md5sum matches expected md5sum
@@ -369,13 +369,13 @@ function configure_dkms_certs() {
369369

370370
# Retrieve cloud secrets keys
371371
local sig_priv_secret_name
372-
sig_priv_secret_name=$(/usr/share/google/get_metadata_value attributes/private_secret_name)
372+
sig_priv_secret_name=$(get_metadata_attribute private_secret_name)
373373
local sig_pub_secret_name
374-
sig_pub_secret_name=$(/usr/share/google/get_metadata_value attributes/public_secret_name)
374+
sig_pub_secret_name=$(get_metadata_attribute public_secret_name)
375375
local sig_secret_project
376-
sig_secret_project=$(/usr/share/google/get_metadata_value attributes/secret_project)
376+
sig_secret_project=$(get_metadata_attribute secret_project)
377377
local sig_secret_version
378-
sig_secret_version=$(/usr/share/google/get_metadata_value attributes/secret_version)
378+
sig_secret_version=$(get_metadata_attribute secret_version)
379379

380380
# If metadata values are not set, do not write mok keys
381381
if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
@@ -400,7 +400,7 @@ function configure_dkms_certs() {
400400
}
401401

402402
function clear_dkms_key {
403-
if [[ -z "$(/usr/share/google/get_metadata_value attributes/private_secret_name)" ]]; then
403+
if [[ -z "$(get_metadata_attribute private_secret_name)" ]]; then
404404
echo "No signing secret provided. skipping";
405405
return 0
406406
fi
@@ -428,11 +428,11 @@ function add_repo_nvidia_container_toolkit() {
428428
if is_debian ; then
429429
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
430430
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
431-
| gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
431+
| gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
432432

433433
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
434-
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
435-
| tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
434+
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
435+
| tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
436436

437437
apt-get update
438438
fi
@@ -441,19 +441,22 @@ function add_repo_nvidia_container_toolkit() {
441441

442442
# Install NVIDIA GPU driver provided by NVIDIA
443443
function install_nvidia_gpu_driver() {
444+
workdir=/opt/install-nvidia-driver
445+
mkdir -p "${workdir}"
446+
pushd "${workdir}"
444447

445448
if is_debian12 ; then
446449
add_nonfree_components
447450
add_repo_nvidia_container_toolkit
448451
configure_dkms_certs
449452
apt-get -yq install \
450-
nvidia-container-toolkit \
451-
dkms \
452-
nvidia-open-kernel-dkms \
453-
nvidia-open-kernel-support \
454-
nvidia-smi \
455-
libglvnd0 \
456-
libcuda1
453+
nvidia-container-toolkit \
454+
dkms \
455+
nvidia-open-kernel-dkms \
456+
nvidia-open-kernel-support \
457+
nvidia-smi \
458+
libglvnd0 \
459+
libcuda1
457460
clear_dkms_key
458461

459462
elif is_debian ; then
@@ -467,23 +470,28 @@ function install_nvidia_gpu_driver() {
467470
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
468471
"${NVIDIA_DEBIAN_GPU_DRIVER_URL}" -o driver.run
469472
bash "./driver.run" --no-kernel-modules --silent --install-libglvnd
470-
rm -rf driver.run
473+
rm -f driver.run
471474
git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git --branch "${NVIDIA_DEBIAN_GPU_DRIVER_VERSION}" --single-branch
472475
pushd open-gpu-kernel-modules
473476
make -j$(nproc) modules
474-
configure_dkms_certs
475-
for module in $(find kernel-open -name '*.ko'); do
476-
/lib/modules/$(uname -r)/build/scripts/sign-file sha256 /var/lib/dkms/mok.key /var/lib/dkms/mok.pub "${module}"
477-
done
478-
clear_dkms_key
477+
if [[ -n "$(get_metadata_attribute private_secret_name)" ]]; then
478+
configure_dkms_certs
479+
for module in $(find kernel-open -name '*.ko'); do
480+
/lib/modules/$(uname -r)/build/scripts/sign-file sha256 \
481+
/var/lib/dkms/mok.key \
482+
/var/lib/dkms/mok.pub \
483+
"${module}"
484+
done
485+
clear_dkms_key
486+
fi
479487
make modules_install
480488
depmod -a
481489
popd
482490

483491
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
484492
"${NVIDIA_DEBIAN_CUDA_URL}" -o cuda.run
485493
bash "./cuda.run" --silent --toolkit --no-opengl-libs
486-
rm -rf cuda.run
494+
rm -f cuda.run
487495
elif [[ ${OS_NAME} == ubuntu ]]; then
488496
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
489497
"${NVIDIA_UBUNTU_REPO_KEY_PACKAGE}" -o /tmp/cuda-keyring.deb
@@ -522,6 +530,7 @@ function install_nvidia_gpu_driver() {
522530
fi
523531
ldconfig
524532
echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
533+
popd
525534
}
526535

527536
# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics

0 commit comments

Comments
 (0)