Skip to content

Commit ae0c367

Browse files
authored
Merge pull request #8014 from fstagni/cherry-pick-2-78d20afae-integration
[sweep:integration] Use apptainer for SingularityComputingElement and enhance debugging
2 parents 327e158 + 45557f8 commit ae0c367

File tree

3 files changed

+39
-76
lines changed

3 files changed

+39
-76
lines changed

dirac.cfg

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -647,10 +647,6 @@ Resources
647647
# Default: /cvmfs/cernvm-prod.cern.ch/cvm4
648648
ContainerRoot = /cvmfs/cernvm-prod.cern.ch/cvm4
649649

650-
# The binary to start the container
651-
# default: singularity
652-
ContainerBin = /opt/extras/bin/singularity
653-
654650
# List of directories to bind
655651
ContainerBind = /etc/grid-security,someDir:::BoundHere
656652

src/DIRAC/Resources/Computing/SingularityComputingElement.py

Lines changed: 28 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import shutil
1818
import sys
1919
import tempfile
20+
from pathlib import Path
2021

2122
import DIRAC
2223
from DIRAC import S_ERROR, S_OK, gConfig, gLogger
@@ -62,10 +63,6 @@
6263
echo "Finishing inner container wrapper scripts at `date`."
6364
6465
"""
65-
# Path to a directory on CVMFS to use as a fallback if no
66-
# other version found: Only used if node has user namespaces
67-
FALLBACK_SINGULARITY = "/cvmfs/oasis.opensciencegrid.org/mis/singularity/current/bin"
68-
6966
CONTAINER_WRAPPER_NO_INSTALL = """#!/bin/bash
7067
7168
echo "Starting inner container wrapper scripts (no install) at `date`."
@@ -110,7 +107,6 @@ def __init__(self, ceUniqueID):
110107
self.__root = self.ceParameters["ContainerRoot"]
111108
self.__workdir = CONTAINER_WORKDIR
112109
self.__innerdir = CONTAINER_INNERDIR
113-
self.__singularityBin = "singularity"
114110
self.__installDIRACInContainer = self.ceParameters.get("InstallDIRACInContainer", False)
115111
if isinstance(self.__installDIRACInContainer, str) and self.__installDIRACInContainer.lower() in (
116112
"false",
@@ -120,47 +116,6 @@ def __init__(self, ceUniqueID):
120116

121117
self.processors = int(self.ceParameters.get("NumberOfProcessors", 1))
122118

123-
def __hasUserNS(self):
124-
"""Detect if this node has user namespaces enabled.
125-
Returns True if they are enabled, False otherwise.
126-
"""
127-
try:
128-
with open("/proc/sys/user/max_user_namespaces") as proc_fd:
129-
maxns = int(proc_fd.readline().strip())
130-
# Any "reasonable number" of namespaces is sufficient
131-
return maxns > 100
132-
except Exception:
133-
# Any failure, missing file, doesn't contain a number, etc. and we
134-
# assume they are disabled.
135-
return False
136-
137-
def __hasSingularity(self):
138-
"""Search the current PATH for an executable named singularity.
139-
Returns True if it is found, False otherwise.
140-
"""
141-
if self.ceParameters.get("ContainerBin"):
142-
binPath = self.ceParameters["ContainerBin"]
143-
if os.path.isfile(binPath) and os.access(binPath, os.X_OK):
144-
self.__singularityBin = binPath
145-
self.log.debug(f'Use singularity from "{self.__singularityBin}"')
146-
return True
147-
if "PATH" not in os.environ:
148-
return False # Hmm, PATH not set? How unusual...
149-
searchPaths = os.environ["PATH"].split(os.pathsep)
150-
# We can use CVMFS as a last resort if userNS is enabled
151-
if self.__hasUserNS():
152-
searchPaths.append(FALLBACK_SINGULARITY)
153-
for searchPath in searchPaths:
154-
binPath = os.path.join(searchPath, "singularity")
155-
if os.path.isfile(binPath):
156-
# File found, check it's executable to be certain:
157-
if os.access(binPath, os.X_OK):
158-
self.log.debug(f'Found singularity at "{binPath}"')
159-
self.__singularityBin = binPath
160-
return True
161-
# No suitable binaries found
162-
return False
163-
164119
@staticmethod
165120
def __findInstallBaseDir():
166121
"""Find the path to root of the current DIRAC installation"""
@@ -321,11 +276,12 @@ def __getEnv(self):
321276
We blank almost everything to prevent contamination from the host system.
322277
"""
323278

324-
if not self.__installDIRACInContainer:
325-
payloadEnv = {k: v for k, v in os.environ.items() if ENV_VAR_WHITELIST.match(k)}
326-
else:
279+
if self.__installDIRACInContainer:
327280
payloadEnv = {}
281+
else:
282+
payloadEnv = {k: v for k, v in os.environ.items() if ENV_VAR_WHITELIST.match(k)}
328283

284+
payloadEnv["PATH"] = str(Path(sys.executable).parent)
329285
payloadEnv["TMP"] = "/tmp"
330286
payloadEnv["TMPDIR"] = "/tmp"
331287
payloadEnv["X509_USER_PROXY"] = os.path.join(self.__innerdir, "proxy")
@@ -356,10 +312,6 @@ def submitJob(self, executableFile, proxy=None, **kwargs):
356312
"""
357313
rootImage = self.__root
358314
renewTask = None
359-
# Check that singularity is available
360-
if not self.__hasSingularity():
361-
self.log.error("Singularity is not installed on PATH.")
362-
return S_ERROR("Failed to find singularity")
363315

364316
self.log.info("Creating singularity container")
365317

@@ -391,19 +343,19 @@ def submitJob(self, executableFile, proxy=None, **kwargs):
391343
# Mount /cvmfs in if it exists on the host
392344
withCVMFS = os.path.isdir("/cvmfs")
393345
innerCmd = os.path.join(self.__innerdir, "dirac_container.sh")
394-
cmd = [self.__singularityBin, "exec"]
395-
cmd.extend(["--contain"]) # use minimal /dev and empty other directories (e.g. /tmp and $HOME)
396-
cmd.extend(["--ipc"]) # run container in a new IPC namespace
397-
cmd.extend(["--workdir", baseDir]) # working directory to be used for /tmp, /var/tmp and $HOME
398-
cmd.extend(["--home", "/tmp"]) # Avoid using small tmpfs for default $HOME and use scratch /tmp instead
399-
if self.__hasUserNS():
400-
cmd.append("--userns")
346+
outerCmd = ["apptainer", "exec"]
347+
outerCmd.extend(["--contain"]) # use minimal /dev and empty other directories (e.g. /tmp and $HOME)
348+
outerCmd.extend(["--ipc"]) # run container in a new IPC namespace
349+
outerCmd.extend(["--workdir", baseDir]) # working directory to be used for /tmp, /var/tmp and $HOME
350+
outerCmd.extend(["--home", "/tmp"]) # Avoid using small tmpfs for default $HOME and use scratch /tmp instead
351+
outerCmd.append("--userns")
401352
if withCVMFS:
402-
cmd.extend(["--bind", "/cvmfs"])
353+
outerCmd.extend(["--bind", "/cvmfs"])
403354
if not self.__installDIRACInContainer:
404-
cmd.extend(["--bind", "{0}:{0}:ro".format(self.__findInstallBaseDir())])
355+
outerCmd.extend(["--bind", "{0}:{0}:ro".format(self.__findInstallBaseDir())])
405356

406-
bindPaths = self.ceParameters.get("ContainerBind", "").split(",")
357+
rawBindPaths = self.ceParameters.get("ContainerBind", "")
358+
bindPaths = rawBindPaths.split(",") if rawBindPaths else []
407359
siteName = gConfig.getValue("/LocalSite/Site", "")
408360
ceName = gConfig.getValue("/LocalSite/GridCE", "")
409361
if siteName and ceName:
@@ -436,20 +388,20 @@ def submitJob(self, executableFile, proxy=None, **kwargs):
436388

437389
for bindPath in bindPaths:
438390
if len(bindPath.split(":::")) == 1:
439-
cmd.extend(["--bind", bindPath.strip()])
391+
outerCmd.extend(["--bind", bindPath.strip()])
440392
elif len(bindPath.split(":::")) in [2, 3]:
441-
cmd.extend(["--bind", ":".join([bp.strip() for bp in bindPath.split(":::")])])
393+
outerCmd.extend(["--bind", ":".join([bp.strip() for bp in bindPath.split(":::")])])
442394

443395
if "ContainerOptions" in self.ceParameters:
444396
containerOpts = self.ceParameters["ContainerOptions"].split(",")
445397
for opt in containerOpts:
446-
cmd.extend([opt.strip()])
447-
if os.path.isdir(rootImage) or os.path.isfile(rootImage):
448-
cmd.extend([rootImage, innerCmd])
449-
else:
398+
outerCmd.extend([opt.strip()])
399+
if not (os.path.isdir(rootImage) or os.path.isfile(rootImage)):
450400
# If we are here, it is because there is no image, or it is not accessible (e.g. not on CVMFS)
451401
self.log.error("Singularity image to exec not found: ", rootImage)
452402
return S_ERROR("Failed to find singularity image to exec")
403+
outerCmd.append(rootImage)
404+
cmd = outerCmd + [innerCmd]
453405

454406
self.log.debug(f"Execute singularity command: {cmd}")
455407
self.log.debug(f"Execute singularity env: {self.__getEnv()}")
@@ -459,6 +411,13 @@ def submitJob(self, executableFile, proxy=None, **kwargs):
459411

460412
if not result["OK"]:
461413
self.log.error("Fail to run Singularity", result["Message"])
414+
# If we fail to run the container try to run it again with verbose output
415+
# to help with debugging.
416+
self.log.error("Singularity command was: ", cmd)
417+
self.log.error(f"Singularity env was: {self.__getEnv()}")
418+
debugCmd = [outerCmd[0], "--debug"] + outerCmd[1:] + ["echo", "All okay"]
419+
self.log.error("Running with debug output to facilitate debugging", debugCmd)
420+
result = systemCall(0, debugCmd, callbackFunction=self.sendOutput, env=self.__getEnv())
462421
if proxy and renewTask:
463422
gThreadScheduler.removeTask(renewTask)
464423
self.__deleteWorkArea(baseDir)

src/DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobAgent.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
""" Test class for Job Agent
22
"""
3+
import multiprocessing
34
import os
45
from pathlib import Path
56
import pytest
67
import time
7-
from unittest.mock import MagicMock
8+
from concurrent.futures import ProcessPoolExecutor
9+
from functools import partial
810

911
from DIRAC import gLogger, S_OK, S_ERROR
1012
from DIRAC.Core.Security.X509Chain import X509Chain # pylint: disable=import-error
@@ -635,8 +637,14 @@ def test_submitAndCheckJob(mocker, manageJobFiles, localCE, job, expectedResult1
635637
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobAgent.JobAgent._sendFailoverRequest", return_value=S_OK())
636638
mocker.patch("DIRAC.Core.Security.X509Chain.X509Chain.dumpAllToString", return_value=S_OK())
637639
mocker.patch(
638-
"DIRAC.Resources.Computing.SingularityComputingElement.SingularityComputingElement._SingularityComputingElement__hasSingularity",
639-
return_value=False,
640+
"DIRAC.Resources.Computing.SingularityComputingElement.SingularityComputingElement.submitJob",
641+
return_value=S_ERROR("Failed to find singularity"),
642+
)
643+
# We need to force ProcessPoolExecutor to use the fork context to enable the
644+
# mocks to propagate to the subprocesses used by PoolComputingElement
645+
mocker.patch(
646+
"concurrent.futures.ProcessPoolExecutor",
647+
side_effect=partial(ProcessPoolExecutor, mp_context=multiprocessing.get_context("fork")),
640648
)
641649

642650
jobAgent = JobAgent("JobAgent", "Test")

0 commit comments

Comments
 (0)