Skip to content

Commit 490a9ee

Browse files
authored
Merge pull request #772 from trz42/2023.06-software.eessi.io-cuDNN-8.9.2.26-part-1
{2023.06}[foss/2023a] cuDNN 8.9.2.26 w/ CUDA 12.1.1 (part 1)
2 parents 411128a + 77f3bc9 commit 490a9ee

File tree

7 files changed

+448
-62
lines changed

7 files changed

+448
-62
lines changed

Diff for: EESSI-install-software.sh

+26-8
Original file line numberDiff line numberDiff line change
@@ -161,19 +161,21 @@ _eessi_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_
161161
# Make sure the Lmod configuration files for this software subdirectory exist.
# Both files live in the '.lmod' directory below ${_eessi_software_path} and are
# only generated when they are missing. All expansions are quoted so that paths
# containing whitespace or glob characters are passed through unchanged.
_lmod_cfg_dir="${_eessi_software_path}/.lmod"

_lmod_rc_file="${_lmod_cfg_dir}/lmodrc.lua"
if [ ! -f "${_lmod_rc_file}" ]; then
    echo "Lmod file '${_lmod_rc_file}' does not exist yet; creating it..."
    # report which python3 is going to be used
    command -V python3
    python3 "${TOPDIR}/create_lmodrc.py" "${_eessi_software_path}"
fi

_lmod_sitepackage_file="${_lmod_cfg_dir}/SitePackage.lua"
if [ ! -f "${_lmod_sitepackage_file}" ]; then
    echo "Lmod file '${_lmod_sitepackage_file}' does not exist yet; creating it..."
    # report which python3 is going to be used
    command -V python3
    python3 "${TOPDIR}/create_lmodsitepackage.py" "${_eessi_software_path}"
fi
172174

173175
# Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE)
174-
# $EESSI_SILENT - don't print any messages
175-
# $EESSI_BASIC_ENV - give a basic set of environment variables
176-
EESSI_SILENT=1 EESSI_BASIC_ENV=1 source $TOPDIR/init/eessi_environment_variables
176+
# $EESSI_SILENT - don't print any messages if set (use 'unset EESSI_SILENT' to let script show messages)
177+
# $EESSI_BASIC_ENV - give a basic set of environment variables if set (use 'EESSI_BASIC_ENV=' to let script initialise a full environment)
178+
EESSI_SILENT=1 EESSI_BASIC_ENV= source $TOPDIR/init/eessi_environment_variables
177179

178180
if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then
179181
fatal_error "Failed to determine software subdirectory?!"
@@ -243,12 +245,13 @@ if [[ "${EESSI_CVMFS_REPO}" != /cvmfs/dev.eessi.io ]]; then
243245
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}
244246
fi
245247

246-
# Install full CUDA SDK in host_injections
248+
# Install full CUDA SDK and cu* libraries in host_injections
247249
# Hardcode this for now, see if it works
248250
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
249251
# Allow skipping CUDA SDK install in e.g. CI environments
250252
# The install_cuda... script uses EasyBuild. So, we need to check if we have EB
251253
# or skip this step.
254+
echo "Going to install full CUDA SDK and cu* libraries under host_injections if necessary"
252255
module_avail_out=$TMPDIR/ml.out
253256
module avail 2>&1 | grep EasyBuild &> ${module_avail_out}
254257
if [[ $? -eq 0 ]]; then
@@ -258,10 +261,15 @@ else
258261
export skip_cuda_install=True
259262
fi
260263

264+
# Directory below ${TMPDIR} that is handed to the install script via '-t'.
temp_install_storage="${TMPDIR}/temp_install_storage"
mkdir -p "${temp_install_storage}"
# Only run the installation when skip_cuda_install is unset or empty
# (the former second test '[ ! "${skip_cuda_install}" ]' was equivalent to '-z').
if [[ -z "${skip_cuda_install}" ]]; then
    "${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh" \
        -t "${temp_install_storage}" \
        --accept-cuda-eula \
        --accept-cudnn-eula
else
    echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi
266274

267275
# Install NVIDIA drivers in host_injections (if they exist)
@@ -318,20 +326,30 @@ else
318326
done
319327
fi
320328

321-
echo ">> Creating/updating Lmod RC file..."
322329
# Create or refresh the Lmod RC file (lmodrc.lua).
export LMOD_CONFIG_DIR="${EASYBUILD_INSTALLPATH}/.lmod"
lmod_rc_file="${LMOD_CONFIG_DIR}/lmodrc.lua"
if [[ -n "${EESSI_ACCELERATOR_TARGET}" ]]; then
    # EESSI_ACCELERATOR_TARGET is set, so let's remove the accelerator path from $lmod_rc_file
    # (quoted pattern => matched literally, first occurrence only)
    accel_subpath="/accel/${EESSI_ACCELERATOR_TARGET}"
    lmod_rc_file=${lmod_rc_file/"${accel_subpath}"/}
    echo "Path to lmodrc.lua changed to '${lmod_rc_file}'"
fi
# lmodrc_changed is '0' when the PR diff touches create_lmodrc.py, non-zero otherwise.
# ${pr_diff} is quoted so that an empty value yields an error instead of reading stdin.
lmodrc_changed=$(grep '^+++' "${pr_diff}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc\.py$' > /dev/null; echo $?)
if [[ ! -f "${lmod_rc_file}" ]] || [[ "${lmodrc_changed}" == '0' ]]; then
    echo ">> Creating/updating Lmod RC file (${lmod_rc_file})..."
    python3 "${TOPDIR}/create_lmodrc.py" "${EASYBUILD_INSTALLPATH}"
    check_exit_code $? "$lmod_rc_file created" "Failed to create $lmod_rc_file"
fi
329342

330-
echo ">> Creating/updating Lmod SitePackage.lua ..."
331343
# Create or refresh the Lmod SitePackage.lua file.
export LMOD_PACKAGE_PATH="${EASYBUILD_INSTALLPATH}/.lmod"
lmod_sitepackage_file="${LMOD_PACKAGE_PATH}/SitePackage.lua"
if [[ -n "${EESSI_ACCELERATOR_TARGET}" ]]; then
    # EESSI_ACCELERATOR_TARGET is set, so let's remove the accelerator path from $lmod_sitepackage_file
    # (quoted pattern => matched literally, first occurrence only)
    accel_subpath="/accel/${EESSI_ACCELERATOR_TARGET}"
    lmod_sitepackage_file=${lmod_sitepackage_file/"${accel_subpath}"/}
    echo "Path to SitePackage.lua changed to '${lmod_sitepackage_file}'"
fi
# sitepackage_changed is '0' when the PR diff touches create_lmodsitepackage.py, non-zero otherwise.
# ${pr_diff} is quoted so that an empty value yields an error instead of reading stdin.
sitepackage_changed=$(grep '^+++' "${pr_diff}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage\.py$' > /dev/null; echo $?)
if [[ ! -f "${lmod_sitepackage_file}" ]] || [[ "${sitepackage_changed}" == '0' ]]; then
    echo ">> Creating/updating Lmod SitePackage.lua (${lmod_sitepackage_file})..."
    python3 "${TOPDIR}/create_lmodsitepackage.py" "${EASYBUILD_INSTALLPATH}"
    check_exit_code $? "$lmod_sitepackage_file created" "Failed to create $lmod_sitepackage_file"
fi
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
easyconfigs:
2+
- CUDA-12.1.1.eb
3+
- cuDNN-8.9.2.26-CUDA-12.1.1.eb

Diff for: eb_hooks.py

+156-49
Original file line numberDiff line numberDiff line change
@@ -756,64 +756,170 @@ def post_postproc_cuda(self, *args, **kwargs):
756756
if 'libcudart' not in allowlist:
757757
raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist)
758758

759-
# iterate over all files in the CUDA installation directory
760-
for dir_path, _, files in os.walk(self.installdir):
761-
for filename in files:
762-
full_path = os.path.join(dir_path, filename)
763-
# we only really care about real files, i.e. not symlinks
764-
if not os.path.islink(full_path):
765-
# check if the current file name stub is part of the allowlist
766-
basename = filename.split('.')[0]
767-
if basename in allowlist:
768-
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
769-
else:
770-
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
771-
basename, full_path)
772-
# if it is not in the allowlist, delete the file and create a symlink to host_injections
773-
774-
# the host_injections path is under a fixed repo/location for CUDA
775-
host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path)
776-
# CUDA itself doesn't care about compute capability so remove this duplication from
777-
# under host_injections (symlink to a single CUDA installation for all compute
778-
# capabilities)
779-
accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET")
780-
if accel_subdir:
781-
host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '')
782-
# make sure source and target of symlink are not the same
783-
if full_path == host_inj_path:
784-
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
785-
"are using this hook for an EESSI installation?",
786-
full_path, host_inj_path)
787-
remove_file(full_path)
788-
symlink(host_inj_path, full_path)
759+
# replace files that are not distributable with symlinks into
760+
# host_injections
761+
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
789762
else:
790763
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")
791764

792765

766+
def post_postproc_cudnn(self, *args, **kwargs):
    """
    Remove files from cuDNN installation that we are not allowed to ship,
    and replace them with a symlink to a corresponding installation under host_injections.

    The allowlist consists of 'LICENSE' plus the file suffixes (words starting with a dot)
    found on the line of the installed 'LICENSE' file that starts with the
    "2. Distribution. ..." clause.

    :param self: object providing the 'name', 'installdir' and 'log' attributes used below
    :param args: accepted for hook compatibility, not used
    :param kwargs: accepted for hook compatibility, not used
    :raises EasyBuildError: if the distribution clause is not found in the LICENSE file, or if
                            the hook is triggered for something that is not an EESSI installation
                            of cuDNN
    """

    # We need to check if we are doing an EESSI-distributed installation
    eessi_installation = bool(re.search(EESSI_INSTALLATION_REGEX, self.installdir))

    if self.name == 'cuDNN' and eessi_installation:
        print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...")

        allowlist = ['LICENSE']

        # read cuDNN LICENSE, construct allowlist based on section "2. Distribution"
        # that specifies list of files that can be shipped
        license_path = os.path.join(self.installdir, 'LICENSE')
        search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:"
        found_search_string = False
        with open(license_path) as infile:
            for line in infile:
                # match AND slice on the stripped line, so leading whitespace
                # in the LICENSE file cannot shift the slice
                stripped_line = line.strip()
                if stripped_line.startswith(search_string):
                    found_search_string = True
                    # remove search string, split into words, remove trailing
                    # dots '.' and only retain words starting with a dot '.'
                    distributable = stripped_line[len(search_string):]
                    # distributable looks like ' the runtime files .so and .dll.'
                    # note the '.' after '.dll'
                    for word in distributable.split():
                        if word.startswith('.'):
                            # rstrip is used to remove the '.' after '.dll'
                            suffix = word.rstrip('.')
                            # a word consisting only of dots would yield an empty entry; skip it
                            if suffix:
                                allowlist.append(suffix)
        if not found_search_string:
            # search string wasn't found in LICENSE file
            raise EasyBuildError("search string '%s' was not found in license file '%s'; "
                                 "hence installation may be replaced by symlinks only",
                                 search_string, license_path)

        allowlist = sorted(set(allowlist))
        self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

        # replace files that are not distributable with symlinks into
        # host_injections
        replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
    else:
        raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")
811+
812+
813+
def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist):
    """
    Replace files that cannot be distributed with symlinks into host_injections

    :param log: logger; only its 'debug' method is used
    :param install_dir: installation directory that is walked recursively
    :param pkg_name: name of the package, must be a key of 'extension_based' below
    :param allowlist: file name stubs and/or suffixes (like '.so') of files that are kept
    :raises EasyBuildError: for an unknown package, or when a file would be symlinked to itself
    """
    # How the allowlist of a package has to be interpreted:
    #   False -> it holds file name stubs (CUDA: 'EULA.txt' lists full file names)
    #   True  -> it (also) holds file suffixes (cuDNN: 'LICENSE' lists endings such as '.so')
    extension_based = {
        "CUDA": False,
        "cuDNN": True,
    }
    if pkg_name not in extension_based:
        raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", pkg_name)
    match_on_suffix = extension_based[pkg_name]

    # visit every file below the installation directory
    for dir_path, _, files in os.walk(install_dir):
        for filename in files:
            full_path = os.path.join(dir_path, filename)

            # symlinks are left alone, only real files are of interest
            if os.path.islink(full_path):
                continue

            name_parts = filename.split('.')
            # the stub is everything in front of the first dot
            stub = name_parts[0]
            # the suffix is taken to be the second dot-separated element
            # (e.g., '.so' for 'libcudnn_adv_infer.so.8.9.2'); it is only
            # considered for suffix-based packages and names containing a dot
            suffix = None
            if match_on_suffix and len(name_parts) > 1:
                suffix = '.' + name_parts[1]

            if stub in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", stub, full_path)
                continue
            if suffix is not None and suffix in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", suffix, full_path)
                continue

            reported_name = filename if match_on_suffix else stub
            log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                      reported_name, full_path)

            # map the path onto the fixed repo/location used for host_injections
            host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path)
            # a single installation under host_injections serves all compute
            # capabilities, hence the accelerator subdirectory is dropped
            accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET")
            if accel_subdir:
                host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '')

            # refuse to create a symlink that points to itself
            if full_path == host_inj_path:
                raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
                                     "are using this hook for an EESSI installation?",
                                     full_path, host_inj_path)
            remove_file(full_path)
            symlink(host_inj_path, full_path)
872+
873+
793874
def inject_gpu_property(ec):
    """
    Add 'gpu' property and EESSI<PACKAGE>VERSION envvars via modluafooter
    easyconfig parameter, and drop dependencies to build dependencies

    For each of the packages CUDA and cuDNN found in the dependencies:
    - the dependency is moved to the build dependencies,
    - the 'gpu' Lmod property is added to the module footer,
    - an envvar holding the package version is added to the module footer.

    :param ec: easyconfig object providing 'asdict()', 'log' and item assignment
    :return: the same 'ec' object
    """
    gpu_packages = ("CUDA", "cuDNN")
    gpu_property_line = 'add_property("arch","gpu")'
    footer_key = 'modluafooter'

    # NOTE(review): the dependency lists are modified via this dict; whether that
    # is visible through 'ec' depends on what asdict() returns - confirm
    ec_dict = ec.asdict()
    found_versions = {}

    for pkg_name in gpu_packages:
        # collect the matching entries first, so that we can safely use
        # '.remove' on the original list while going over them
        matching_deps = [dep for dep in ec_dict['dependencies'] if dep[0] == pkg_name]
        for dep in matching_deps:
            # make pkg_name a build dependency only (rpathing saves us from link errors)
            ec.log.info("Dropping dependency on %s to build dependency" % pkg_name)
            ec_dict['dependencies'].remove(dep)
            if dep not in ec_dict['builddependencies']:
                ec_dict['builddependencies'].append(dep)
            # remember the version, it ends up in the modluafooter
            found_versions[pkg_name] = dep[1]

    # nothing to inject when none of the packages is a dependency
    if not found_versions:
        return ec

    ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version")
    footer_lines = [gpu_property_line]
    for name, version in found_versions.items():
        footer_lines.append('setenv("%s","%s")' % ("EESSI%sVERSION" % name.upper(), version))

    if footer_key in ec_dict:
        # extend an existing footer, but only with lines it does not contain yet
        footer = ec_dict[footer_key]
        for footer_line in footer_lines:
            if footer_line not in footer:
                footer = '\n'.join([footer, footer_line])
    else:
        footer = '\n'.join(footer_lines)
    ec[footer_key] = footer

    return ec
818924

819925

@@ -873,4 +979,5 @@ def inject_gpu_property(ec):
873979

874980
POST_POSTPROC_HOOKS = {
875981
'CUDA': post_postproc_cuda,
982+
'cuDNN': post_postproc_cudnn,
876983
}

Diff for: init/eessi_environment_variables

+4-4
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,10 @@ if [ -d $EESSI_PREFIX ]; then
153153
fi
154154

155155
# Fix wrong path for RHEL >=8 libcurl
156-
# This is required here because we ship curl in our compat layer. If we only provided
157-
# curl as a module file we could instead do this via a `modluafooter` in an EasyBuild
158-
# hook (or via an Lmod hook)
159-
rhel_libcurl_file="/etc/pki/tls/certs/ca-bundle.crt"
156+
# This is required here because we ship curl in our compat layer. If we only provided
157+
# curl as a module file we could instead do this via a `modluafooter` in an EasyBuild
158+
# hook (or via an Lmod hook)
159+
rhel_libcurl_file="/etc/pki/tls/certs/ca-bundle.crt"
160160
if [ -f $rhel_libcurl_file ]; then
161161
show_msg "Found libcurl CAs file at RHEL location, setting CURL_CA_BUNDLE"
162162
export CURL_CA_BUNDLE=$rhel_libcurl_file

Diff for: install_scripts.sh

+10-1
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,19 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@
123123

124124
# Copy files for the scripts/gpu_support/nvidia directory
125125
# Scripts that are copied into the scripts/gpu_support/nvidia directory
# (one entry per line; directory arguments are quoted to survive whitespace)
nvidia_files=(
    install_cuda_and_libraries.sh
    install_cuda_host_injections.sh
    link_nvidia_host_libraries.sh
)
copy_files_by_list "${TOPDIR}/scripts/gpu_support/nvidia" "${INSTALL_PREFIX}/scripts/gpu_support/nvidia" "${nvidia_files[@]}"
129131

132+
# Easystacks to be used to install software in host injections
133+
# List of easystack files that are copied into scripts/gpu_support/nvidia/easystacks
# (directory arguments are quoted to survive whitespace)
host_injections_easystacks=(
    eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml
)
copy_files_by_list "${TOPDIR}/scripts/gpu_support/nvidia/easystacks" \
    "${INSTALL_PREFIX}/scripts/gpu_support/nvidia/easystacks" "${host_injections_easystacks[@]}"
138+
130139
# Copy over EasyBuild hooks file used for installations
131140
hook_files=(
132141
eb_hooks.py
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# This EasyStack provides a list of all the EasyConfigs that should be installed in host_injections
2+
# for nvidia GPU support, because they cannot (fully) be shipped as part of EESSI due to license constraints
3+
easyconfigs:
4+
- CUDA-12.1.1.eb
5+
- cuDNN-8.9.2.26-CUDA-12.1.1.eb:
6+
options:
7+
# needed to enforce acceptance of EULA in cuDNN easyblock,
8+
# see https://github.com/easybuilders/easybuild-easyblocks/pull/3473
9+
include-easyblocks-from-commit: 11afb88ec55e0ca431cbe823696aa43e2a9bfca8

0 commit comments

Comments
 (0)