Skip to content

Commit 901a944

Browse files
authored
Merge pull request #781 from ocaisa/lmod_gpu
Add accelerator detection to Lmod version of EESSI initialisation
2 parents 8232a60 + 04c2573 commit 901a944

File tree

3 files changed

+241
-32
lines changed

3 files changed

+241
-32
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
setenv("INSIDE_GITHUB_ACTIONS", "true")
2+
-- Interfere with PATH so Lmod keeps a record
3+
prepend_path("PATH", "/snap/bin")

.github/workflows/tests_eessi_module.yml

Lines changed: 134 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ on:
77
permissions:
88
contents: read # to fetch code (actions/checkout)
99
jobs:
10-
build:
10+
basic_checks:
1111
runs-on: ubuntu-latest
1212
strategy:
1313
fail-fast: false
1414
matrix:
1515
EESSI_VERSION:
16-
- 2023.06
16+
- 2023.06
1717
steps:
1818
- name: Check out software-layer repository
1919
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -45,10 +45,11 @@ jobs:
4545
4646
- name: Test for archdetect_cpu functionality with invalid path
4747
run: |
48-
. /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash # Initialise Lmod
48+
# Initialise Lmod
49+
. /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
4950
export MODULEPATH=init/modules
5051
set +e # Do not exit immediately if a command exits with a non-zero status
51-
export EESSI_ARCHDETECT_OPTIONS="dummy/cpu"
52+
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu"
5253
outfile="outfile.txt"
5354
module load EESSI/${{matrix.EESSI_VERSION}} > "${outfile}" 2>&1
5455
cat "${outfile}"
@@ -58,29 +59,149 @@ jobs:
5859
echo "Test for picking up invalid path on \${archdetect_cpu} FAILED" >&2
5960
exit 1
6061
fi
61-
unset EESSI_ARCHDETECT_OPTIONS
62+
unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
6263
set -e # Re-enable exit on non-zero status
64+
65+
lmod_and_init_script_comparison:
66+
runs-on: ubuntu-latest
67+
strategy:
68+
fail-fast: false
69+
matrix:
70+
EESSI_VERSION:
71+
- 2023.06
72+
EESSI_SOFTWARE_SUBDIR_OVERRIDE:
73+
- x86_64/amd/zen3
74+
- x86_64/amd/zen4
75+
EESSI_ACCELERATOR_TARGET_OVERRIDE:
76+
- accel/nvidia/cc80
77+
steps:
78+
- name: Check out software-layer repository
79+
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
80+
81+
- name: Mount EESSI CernVM-FS pilot repository
82+
uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
83+
with:
84+
cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
85+
cvmfs_http_proxy: DIRECT
86+
cvmfs_repositories: software.eessi.io
6387

64-
- name: Test for expected variables while adding dummy cpu archs and loading EESSI module
88+
- name: Test for expected variables match between Lmod init script and original bash script
6589
run: |
66-
. /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash # Initialise Lmod
67-
export MODULEPATH=init/modules
68-
CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
69-
export EESSI_ARCHDETECT_OPTIONS="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
90+
# Initialise Lmod
91+
. /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
92+
93+
# Set our path overrides according to our matrix
94+
export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}
95+
export EESSI_ACCELERATOR_TARGET_OVERRIDE=${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}
96+
7097
moduleoutfile="moduleout.txt"
7198
sourceoutfile="sourceout.txt"
99+
100+
# First do (and undo) the Lmod initialisation
101+
export MODULEPATH=init/modules
102+
# Turn on debug output in case we want to take a look
103+
export EESSI_DEBUG_INIT=true
104+
CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
105+
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
72106
module load EESSI/${{matrix.EESSI_VERSION}}
73-
env | grep -E '^(EESSI_S|EESSI_C)' | sort > "${moduleoutfile}"
107+
# EESSI_ARCHDETECT_OPTIONS_OVERRIDE/EESSI_DEBUG_INIT only relevant for Lmod init
108+
unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
109+
unset EESSI_DEBUG_INIT
110+
# Store all relevant environment variables
111+
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${moduleoutfile}"
74112
module unload EESSI/${{matrix.EESSI_VERSION}}
113+
114+
# Now do the init script initialisation
75115
source ./init/bash
76-
env | grep -E '^(EESSI_S|EESSI_C)' | sort > "${sourceoutfile}"
116+
# source script version sets environment variables to force archdetect, ignore these
117+
unset EESSI_USE_ARCHSPEC
118+
unset EESSI_USE_ARCHDETECT
119+
env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${sourceoutfile}"
120+
121+
# Now compare the two results
122+
echo ""
123+
echo "Lmod initialisation:"
77124
cat "${moduleoutfile}"
125+
echo ""
126+
echo "Source script initialisation:"
78127
cat "${sourceoutfile}"
128+
echo ""
129+
echo ""
79130
if (diff "${moduleoutfile}" "${sourceoutfile}" > /dev/null); then
80131
echo "Test for checking env variables PASSED"
81132
else
82133
echo "Test for checking env variables FAILED" >&2
83-
diff "${moduleoutfile}" "${sourceoutfile}"
134+
diff --unified=0 "${moduleoutfile}" "${sourceoutfile}"
84135
exit 1
85136
fi
86137
138+
make_sure_load_and_unload_work:
139+
runs-on: ubuntu-latest
140+
strategy:
141+
fail-fast: false
142+
matrix:
143+
EESSI_VERSION:
144+
- 2023.06
145+
EESSI_SOFTWARE_SUBDIR_OVERRIDE:
146+
- none
147+
- x86_64/amd/zen2
148+
- x86_64/amd/zen4
149+
EESSI_ACCELERATOR_TARGET_OVERRIDE:
150+
- none
151+
- accel/nvidia/cc80
152+
steps:
153+
- name: Check out software-layer repository
154+
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
155+
156+
- name: Mount EESSI CernVM-FS pilot repository
157+
uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
158+
with:
159+
cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
160+
cvmfs_http_proxy: DIRECT
161+
cvmfs_repositories: software.eessi.io
162+
163+
- name: Test for identical environment after loading and unloading the EESSI module
164+
run: |
165+
# Initialise Lmod
166+
. /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
167+
168+
# Set our cpu path overrides according to our matrix
169+
if [[ "${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}" != "none" ]]; then
170+
export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}
171+
fi
172+
173+
# Set our accelerator path overrides according to our matrix
174+
if [[ "${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}" != "none" ]]; then
175+
export EESSI_ACCELERATOR_TARGET_OVERRIDE=${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}
176+
fi
177+
178+
# Turn on debug output in case we want to take a look
179+
export EESSI_DEBUG_INIT=true
180+
181+
initial_env_file="initial_env.txt"
182+
module_cycled_file="load_unload_cycle.txt"
183+
184+
# Prepare Lmod, resetting it in a roundabout way, given we don't want defaults set
185+
export MODULEPATH=init/modules:.github/workflows/modules
186+
module load fake_module
187+
module purge
188+
module unuse .github/workflows/modules
189+
module avail
190+
191+
# Store the initial environment (ignoring Lmod tables)
192+
env | grep -v _ModuleTable | sort > "${initial_env_file}"
193+
194+
# Do (and undo) loading the EESSI module
195+
# Load and immediately unload the module, then snapshot the environment again
# (still ignoring Lmod's own _ModuleTable bookkeeping variables) so it can be
# compared with the snapshot taken before the cycle.
# No archdetect call is made here: this step never used its result.
module load EESSI/${{matrix.EESSI_VERSION}}
module unload EESSI/${{matrix.EESSI_VERSION}}
env | grep -v _ModuleTable | sort > "${module_cycled_file}"
199+
200+
# Now compare the two results (do not expose the files, as they contain the full environment!)
201+
if (diff "${initial_env_file}" "${module_cycled_file}" > /dev/null); then
202+
echo "Test for checking env variables PASSED"
203+
else
204+
echo "Test for checking env variables FAILED" >&2
205+
diff --unified=0 "${initial_env_file}" "${module_cycled_file}"
206+
exit 1
207+
fi

init/modules/EESSI/2023.06.lua

Lines changed: 104 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,56 +17,141 @@ local eessi_os_type = "linux"
1717
setenv("EESSI_VERSION", eessi_version)
1818
setenv("EESSI_CVMFS_REPO", eessi_repo)
1919
setenv("EESSI_OS_TYPE", eessi_os_type)
20+
-- Print `text` via LmodMessage, but only while the module is being loaded and
-- the EESSI_DEBUG_INIT environment variable is set; otherwise do nothing.
function eessiDebug(text)
    if mode() ~= "load" then
        return
    end
    if os.getenv("EESSI_DEBUG_INIT") then
        LmodMessage(text)
    end
end
2025
-- Determine the CPU software subdirectory (e.g., x86_64/amd/zen3) to use for this EESSI version.
--
-- The candidate list is taken from EESSI_ARCHDETECT_OPTIONS_OVERRIDE when that is set; otherwise
-- the archdetect wrapper script is sourced (which needs source_sh, i.e. Lmod >= 8.6) and the list
-- is read from EESSI_ARCHDETECT_OPTIONS.
--
-- Returns the first candidate for which a software directory exists below eessi_prefix.
-- Calls LmodError when Lmod is too old to source the wrapper and no override was given, or when
-- none of the candidates has a software directory.
function archdetect_cpu()
    local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper.sh')
    -- make sure that we grab the value for architecture before the module unsets the environment variable (in unload mode)
    local archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or (os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE") or "")
    if not os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE") then
        if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ARCHDETECT_OPTIONS_OVERRIDE to the available cpu architecture in the form of: x86_64/intel/haswell:x86_64/generic or aarch64/neoverse_v1:aarch64/generic")
        end
        source_sh("bash", script)
    end
    -- EESSI_ARCHDETECT_OPTIONS is set by the script (_if_ it was called)
    -- Note: archdetect_options is always a string here (possibly empty); an empty list simply
    -- yields no candidates below and ends in the LmodError at the bottom.
    archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or archdetect_options
    eessiDebug("Got archdetect CPU options: " .. archdetect_options)
    -- archdetect_options is a colon-separated list of CPU architectures that are compatible with
    -- the host CPU and ordered from most specific to least specific, e.g.,
    -- x86_64/intel/skylake_avx512:x86_64/intel/haswell:x86_64/generic
    -- We loop over the list, and return the highest matching arch for which a directory exists for this EESSI version
    for archdetect_filter_cpu in string.gmatch(archdetect_options, "([^:]+)") do
        if isDir(pathJoin(eessi_prefix, "software", eessi_os_type, archdetect_filter_cpu, "software")) then
            -- use x86_64/amd/zen3 for now when AMD Genoa (Zen4) CPU is detected,
            -- since optimized software installations for Zen4 are a work-in-progress,
            -- see https://gitlab.com/eessi/support/-/issues/37
            -- The fallback is skipped when Zen4 was explicitly requested via EESSI_SOFTWARE_SUBDIR_OVERRIDE.
            -- (`~=` is required: `not a == b` parses as `(not a) == b`, which is never true here.)
            if archdetect_filter_cpu == "x86_64/amd/zen4" and os.getenv("EESSI_SOFTWARE_SUBDIR_OVERRIDE") ~= "x86_64/amd/zen4" then
                archdetect_filter_cpu = "x86_64/amd/zen3"
                if mode() == "load" then
                    LmodMessage("Sticking to " .. archdetect_filter_cpu .. " for now, since optimized installations for AMD Genoa (Zen4) are a work in progress.")
                end
            end
            eessiDebug("Selected archdetect CPU: " .. archdetect_filter_cpu)
            return archdetect_filter_cpu
        end
    end
    LmodError("Software directory check for the detected architecture failed")
end
64+
-- Determine the accelerator subdirectory (e.g., accel/nvidia/cc80), if any.
--
-- Returns the value of EESSI_ACCEL_SUBDIR when it is available (before or after sourcing the
-- accelerator archdetect wrapper), else EESSI_ACCELERATOR_TARGET_OVERRIDE, else an empty string.
-- Calls LmodError only when Lmod is too old to source the wrapper (< 8.6) AND no
-- EESSI_ACCELERATOR_TARGET_OVERRIDE was exported.
function archdetect_accel()
    local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper_accel.sh')
    -- The variable name must not carry a trailing space, otherwise the lookup always yields nil
    -- and the override advertised in the error message below can never take effect.
    local accel_override = os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE")
    -- for unload mode, we need to grab the value before it is unset
    local accel_subdir = os.getenv("EESSI_ACCEL_SUBDIR") or (accel_override or "")
    if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
        -- source_sh is not usable here; the override is the only way forward
        if not accel_override then
            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ACCELERATOR_TARGET_OVERRIDE to the available accelerator architecture in the form of: accel/nvidia/cc80")
        end
    else
        -- Source the wrapper whether or not an override is set: this is what the module has
        -- effectively always done, and EESSI_ACCEL_SUBDIR is read back from it right below.
        -- NOTE(review): presumably the wrapper itself honours the override - confirm.
        source_sh("bash", script)
    end
    accel_subdir = os.getenv("EESSI_ACCEL_SUBDIR") or accel_subdir
    eessiDebug("Got archdetect accel option: " .. accel_subdir)
    return accel_subdir
end
78+
-- archdetect finds the best compatible architecture, e.g., x86_64/amd/zen3
4579
local archdetect = archdetect_cpu()
80+
-- archdetect_accel() attempts to identify an accelerator, e.g., accel/nvidia/cc80
81+
local archdetect_accel = archdetect_accel()
82+
-- eessi_cpu_family is derived from the archdetect match, e.g., x86_64
4683
local eessi_cpu_family = archdetect:match("([^/]+)")
4784
local eessi_software_subdir = archdetect
85+
-- eessi_eprefix is the base location of the compat layer, e.g., /cvmfs/software.eessi.io/versions/2023.06/compat/linux/x86_64
4886
local eessi_eprefix = pathJoin(eessi_prefix, "compat", eessi_os_type, eessi_cpu_family)
87+
-- eessi_software_path is the location of the software installations, e.g.,
88+
-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3
4989
local eessi_software_path = pathJoin(eessi_prefix, "software", eessi_os_type, eessi_software_subdir)
50-
local eessi_module_path = pathJoin(eessi_software_path, "modules", "all")
90+
local eessi_modules_subdir = pathJoin("modules", "all")
91+
-- eessi_module_path is the location of the _CPU_ module files, e.g.,
92+
-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/modules/all
93+
local eessi_module_path = pathJoin(eessi_software_path, eessi_modules_subdir)
5194
local eessi_site_software_path = string.gsub(eessi_software_path, "versions", "host_injections")
52-
local eessi_site_module_path = pathJoin(eessi_site_software_path, "modules", "all")
95+
-- Site module path is the same as the EESSI one, but with `versions` changed to `host_injections`, e.g.,
96+
-- /cvmfs/software.eessi.io/host_injections/2023.06/software/linux/x86_64/amd/zen3/modules/all
97+
local eessi_site_module_path = pathJoin(eessi_site_software_path, eessi_modules_subdir)
5398
setenv("EPREFIX", eessi_eprefix)
99+
eessiDebug("Setting EPREFIX to " .. eessi_eprefix)
54100
setenv("EESSI_CPU_FAMILY", eessi_cpu_family)
101+
eessiDebug("Setting EESSI_CPU_FAMILY to " .. eessi_cpu_family)
55102
setenv("EESSI_SITE_SOFTWARE_PATH", eessi_site_software_path)
103+
eessiDebug("Setting EESSI_SITE_SOFTWARE_PATH to " .. eessi_site_software_path)
56104
setenv("EESSI_SITE_MODULEPATH", eessi_site_module_path)
105+
eessiDebug("Setting EESSI_SITE_MODULEPATH to " .. eessi_site_module_path)
57106
setenv("EESSI_SOFTWARE_SUBDIR", eessi_software_subdir)
107+
eessiDebug("Setting EESSI_SOFTWARE_SUBDIR to " .. eessi_software_subdir)
58108
setenv("EESSI_PREFIX", eessi_prefix)
109+
eessiDebug("Setting EESSI_PREFIX to " .. eessi_prefix)
59110
setenv("EESSI_EPREFIX", eessi_eprefix)
111+
-- Name the variable that was just set (EESSI_EPREFIX, not EPREFIX)
eessiDebug("Setting EESSI_EPREFIX to " .. eessi_eprefix)
60112
prepend_path("PATH", pathJoin(eessi_eprefix, "bin"))
61-
prepend_path("PATH", pathJoin(eessi_eprefix, "usr/bin"))
113+
eessiDebug("Adding " .. pathJoin(eessi_eprefix, "bin") .. " to PATH")
114+
prepend_path("PATH", pathJoin(eessi_eprefix, "usr", "bin"))
115+
eessiDebug("Adding " .. pathJoin(eessi_eprefix, "usr", "bin") .. " to PATH")
62116
setenv("EESSI_SOFTWARE_PATH", eessi_software_path)
117+
eessiDebug("Setting EESSI_SOFTWARE_PATH to " .. eessi_software_path)
63118
setenv("EESSI_MODULEPATH", eessi_module_path)
119+
eessiDebug("Setting EESSI_MODULEPATH to " .. eessi_module_path)
120+
-- We ship our spider cache, so this location does not need to be spider-ed
64121
if ( mode() ~= "spider" ) then
65122
prepend_path("MODULEPATH", eessi_module_path)
123+
eessiDebug("Adding " .. eessi_module_path .. " to MODULEPATH")
66124
end
67-
prepend_path("LMOD_RC", pathJoin(eessi_software_path, "/.lmod/lmodrc.lua"))
125+
prepend_path("LMOD_RC", pathJoin(eessi_software_path, ".lmod", "lmodrc.lua"))
126+
eessiDebug("Adding " .. pathJoin(eessi_software_path, ".lmod", "lmodrc.lua") .. " to LMOD_RC")
127+
-- Use pushenv for LMOD_PACKAGE_PATH as this may be set locally by the site
128+
pushenv("LMOD_PACKAGE_PATH", pathJoin(eessi_software_path, ".lmod"))
129+
eessiDebug("Setting LMOD_PACKAGE_PATH to " .. pathJoin(eessi_software_path, ".lmod"))
130+
131+
-- the accelerator may have an empty value and we need to give some flexibility
132+
-- * construct the path we expect to find
133+
-- * then check it exists
134+
-- * then update the modulepath
135+
-- Only look for accelerator modules when an accelerator target was detected or overridden.
-- All helpers below are declared `local` so they do not leak out of this modulefile as globals.
if not (archdetect_accel == nil or archdetect_accel == '') then
    -- The CPU subdirectory of the accelerator installations is _usually_ the same as host CPU, but this can be overridden
    local eessi_accel_software_subdir = os.getenv("EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE") or eessi_software_subdir
    -- CPU location of the accelerator installations, e.g.,
    -- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3
    local eessi_accel_software_path = pathJoin(eessi_prefix, "software", eessi_os_type, eessi_accel_software_subdir)
    -- location of the accelerator modules, e.g.,
    -- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all
    local eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
    eessiDebug("Checking if " .. eessi_module_path_accel .. " exists")
    -- Silently skip accelerator modules when the directory does not exist
    if isDir(eessi_module_path_accel) then
        setenv("EESSI_MODULEPATH_ACCEL", eessi_module_path_accel)
        prepend_path("MODULEPATH", eessi_module_path_accel)
        eessiDebug("Using accelerator modules at: " .. eessi_module_path_accel)
    end
end
151+
152+
-- prepend the site module path last so it has priority
68153
prepend_path("MODULEPATH", eessi_site_module_path)
69-
setenv("LMOD_PACKAGE_PATH", pathJoin(eessi_software_path, ".lmod"))
154+
eessiDebug("Adding " .. eessi_site_module_path .. " to MODULEPATH")
70155
if mode() == "load" then
71156
LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
72157
end

0 commit comments

Comments
 (0)