
Commit 3d2d119

Cuda request gpu device expr (#1629)
* Allow expressions to dynamically request min/max number of GPUs.
* Add test coverage for CUDA checks.
1 parent 4c65bdf commit 3d2d119

15 files changed (+279 -47 lines)

cwltool/cuda.py

Lines changed: 4 additions & 6 deletions
@@ -18,7 +18,7 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
     return (cv.data, int(ag.data))


-def cuda_check(cuda_req: CWLObjectType) -> int:
+def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:
     try:
         vmin = float(str(cuda_req["cudaVersionMin"]))
         version, devices = cuda_version_and_device_count()
@@ -31,14 +31,12 @@ def cuda_check(cuda_req: CWLObjectType) -> int:
                 "CUDA version '%s' is less than minimum version '%s'", version, vmin
             )
             return 0
-        dmin = cast(int, cuda_req.get("deviceCountMin", 1))
-        dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
-        if devices < dmin:
+        if requestCount > devices:
             _logger.warning(
-                "Requested at least %d GPU devices but only %d available", dmin, devices
+                "Requested %d GPU devices but only %d available", requestCount, devices
            )
             return 0
-        return min(dmax, devices)
+        return requestCount
     except Exception as e:
         _logger.warning("Error checking CUDA requirements: %s", e)
         return 0
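
With this change the min/max bookkeeping moves out of cuda_check(): callers resolve the requested device count first (see evalResources in cwltool/process.py below) and pass it in, and cuda_check() only validates that count against what cuda_version_and_device_count() reports. A minimal behaviour sketch, with a made-up detected count (not cwltool's actual code):

    # Illustration only -- the detected count below is hypothetical.
    detected_devices = 2  # e.g. what cuda_version_and_device_count() might report

    def check(request_count: int) -> int:
        # Mirrors the new cuda_check() contract: return the requested count when
        # enough GPUs exist, otherwise warn (omitted here) and return 0.
        return request_count if request_count <= detected_devices else 0

    assert check(1) == 1
    assert check(2) == 2
    assert check(4) == 0  # more GPUs requested than detected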

cwltool/docker.py

Lines changed: 2 additions & 7 deletions
@@ -397,13 +397,8 @@ def create_runtime(
         if runtimeContext.rm_container:
             runtime.append("--rm")

-        cuda_req, _ = self.builder.get_requirement(
-            "http://commonwl.org/cwltool#CUDARequirement"
-        )
-        if cuda_req:
-            # Checked earlier that the device count is non-zero in _setup
-            count = cuda_check(cuda_req)
-            runtime.append("--gpus=" + str(count))
+        if self.builder.resources.get("cudaDeviceCount"):
+            runtime.append("--gpus=" + str(self.builder.resources["cudaDeviceCount"]))

         cidfile_path = None  # type: Optional[str]
         # add parameters to docker to write a container ID file
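
The container backends now read the resolved count straight from builder.resources instead of calling cuda_check() again. A rough sketch of the resulting flag, using a hypothetical resources dict (values made up):

    # Hypothetical resources dict, as evalResources()/select_resources() would
    # leave it; only the cudaDeviceCount key matters here.
    resources = {"cores": 1, "ram": 1024, "cudaDeviceCount": 2}

    docker_args = []
    if resources.get("cudaDeviceCount"):
        docker_args.append("--gpus=" + str(resources["cudaDeviceCount"]))
    assert docker_args == ["--gpus=2"]  # Singularity appends "--nv" instead (see below)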

cwltool/executors.py

Lines changed: 3 additions & 0 deletions
@@ -305,6 +305,9 @@ def select_resources(
         result["tmpdirSize"] = math.ceil(request["tmpdirMin"])
         result["outdirSize"] = math.ceil(request["outdirMin"])

+        if "cudaDeviceCount" in request:
+            result["cudaDeviceCount"] = request["cudaDeviceCount"]
+
         return result

     def _runner(self, job, runtime_context, TMPDIR_LOCK):

cwltool/extensions-v1.1.yml

Lines changed: 25 additions & 9 deletions
@@ -93,13 +93,29 @@ $graph:

       See https://docs.nvidia.com/deploy/cuda-compatibility/ for
       details.
-  cudaComputeCapabilityMin:
-    type: string
-    doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
-  deviceCountMin:
-    type: int?
+  cudaComputeCapability:
+    type:
+      - 'string'
+      - 'string[]'
+    doc: |
+      CUDA hardware capability required to run the software, in X.Y
+      format.
+
+      * If this is a single value, it defines only the minimum
+        compute capability. GPUs with higher capability are also
+        accepted.
+
+      * If it is an array value, then only select GPUs with compute
+        capabilities that explicitly appear in the array.
+  cudaDeviceCountMin:
+    type: ['null', int, cwl:Expression]
     default: 1
-    doc: Minimum number of GPU devices to request, default 1.
-  deviceCountMax:
-    type: int?
-    doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
+    doc: |
+      Minimum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMax`. If neither are specified,
+      default 1.
+  cudaDeviceCountMax:
+    type: ['null', int, cwl:Expression]
+    doc: |
+      Maximum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMin`.
cwltool/extensions.yml

Lines changed: 25 additions & 9 deletions
@@ -203,13 +203,29 @@ $graph:

       See https://docs.nvidia.com/deploy/cuda-compatibility/ for
       details.
-  cudaComputeCapabilityMin:
-    type: string
-    doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
-  deviceCountMin:
-    type: int?
+  cudaComputeCapability:
+    type:
+      - 'string'
+      - 'string[]'
+    doc: |
+      CUDA hardware capability required to run the software, in X.Y
+      format.
+
+      * If this is a single value, it defines only the minimum
+        compute capability. GPUs with higher capability are also
+        accepted.
+
+      * If it is an array value, then only select GPUs with compute
+        capabilities that explicitly appear in the array.
+  cudaDeviceCountMin:
+    type: ['null', int, cwl:Expression]
     default: 1
-    doc: Minimum number of GPU devices to request, default 1.
-  deviceCountMax:
-    type: int?
-    doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
+    doc: |
+      Minimum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMax`. If neither are specified,
+      default 1.
+  cudaDeviceCountMax:
+    type: ['null', int, cwl:Expression]
+    doc: |
+      Maximum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMin`.

cwltool/job.py

Lines changed: 4 additions & 1 deletion
@@ -2,6 +2,7 @@
 import functools
 import itertools
 import logging
+import math
 import os
 import re
 import shutil
@@ -180,7 +181,9 @@ def _setup(self, runtimeContext: RuntimeContext) -> None:
             "http://commonwl.org/cwltool#CUDARequirement"
         )
         if cuda_req:
-            count = cuda_check(cuda_req)
+            count = cuda_check(
+                cuda_req, math.ceil(self.builder.resources["cudaDeviceCount"])
+            )
             if count == 0:
                 raise WorkflowException("Could not satisfy CUDARequirement")

cwltool/process.py

Lines changed: 26 additions & 6 deletions
@@ -980,6 +980,7 @@ def evalResources(
         resourceReq, _ = self.get_requirement("ResourceRequirement")
         if resourceReq is None:
             resourceReq = {}
+
         cwl_version = self.metadata.get(ORIGINAL_CWLVERSION, None)
         if cwl_version == "v1.0":
             ram = 1024
@@ -995,20 +996,34 @@ def evalResources(
                 "outdirMin": 1024,
                 "outdirMax": 1024,
             }
-        for a in ("cores", "ram", "tmpdir", "outdir"):
+
+        cudaReq, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement")
+        if cudaReq:
+            request["cudaDeviceCountMin"] = 1
+            request["cudaDeviceCountMax"] = 1
+
+        for rsc, a in (
+            (resourceReq, "cores"),
+            (resourceReq, "ram"),
+            (resourceReq, "tmpdir"),
+            (resourceReq, "outdir"),
+            (cudaReq, "cudaDeviceCount"),
+        ):
+            if rsc is None:
+                continue
             mn = mx = None  # type: Optional[Union[int, float]]
-            if resourceReq.get(a + "Min"):
+            if rsc.get(a + "Min"):
                 mn = cast(
                     Union[int, float],
                     eval_resource(
-                        builder, cast(Union[str, int, float], resourceReq[a + "Min"])
+                        builder, cast(Union[str, int, float], rsc[a + "Min"])
                     ),
                 )
-            if resourceReq.get(a + "Max"):
+            if rsc.get(a + "Max"):
                 mx = cast(
                     Union[int, float],
                     eval_resource(
-                        builder, cast(Union[str, int, float], resourceReq[a + "Max"])
+                        builder, cast(Union[str, int, float], rsc[a + "Max"])
                     ),
                 )
             if mn is None:
@@ -1022,13 +1037,18 @@ def evalResources(

         request_evaluated = cast(Dict[str, Union[int, float]], request)
         if runtimeContext.select_resources is not None:
+            # Call select resources hook
             return runtimeContext.select_resources(request_evaluated, runtimeContext)
-        return {
+
+        defaultReq = {
             "cores": request_evaluated["coresMin"],
             "ram": math.ceil(request_evaluated["ramMin"]),
             "tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]),
             "outdirSize": math.ceil(request_evaluated["outdirMin"]),
         }
+        if cudaReq:
+            defaultReq["cudaDeviceCount"] = request_evaluated["cudaDeviceCountMin"]
+        return defaultReq

     def validate_hints(
         self, avsc_names: Names, hints: List[CWLObjectType], strict: bool
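
evalResources() now treats cudaDeviceCount exactly like cores/ram: both bounds are evaluated (expressions included) and, absent a select_resources hook, the default request takes the minimum. A toy sketch of the documented defaulting rules (illustrative only, not cwltool's code); the expected values line up with test_cuda_eval_resource_range and test_cuda_eval_resource_max below:

    def resolve_device_count(dmin=None, dmax=None):
        # If only one bound is given, the other defaults to it; if neither, 1.
        if dmin is None and dmax is None:
            return 1
        if dmin is None:
            return dmax   # only cudaDeviceCountMax given, e.g. 4 -> request 4
        return dmin       # default request takes the minimum bound

    assert resolve_device_count() == 1
    assert resolve_device_count(dmin=2, dmax=4) == 2  # cf. test_cuda_eval_resource_range
    assert resolve_device_count(dmax=4) == 4          # cf. test_cuda_eval_resource_max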

cwltool/singularity.py

Lines changed: 1 addition & 5 deletions
@@ -434,11 +434,7 @@ def create_runtime(
         else:
             runtime.extend(["--net", "--network", "none"])

-        cuda_req, _ = self.builder.get_requirement(
-            "http://commonwl.org/cwltool#CUDARequirement"
-        )
-        if cuda_req:
-            # Checked earlier that the device count is non-zero in _setup
+        if self.builder.resources.get("cudaDeviceCount"):
             runtime.append("--nv")

         for name, value in self.environment.items():

mypy-requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@ mypy==0.931
 types-requests
 types-setuptools
 types-psutil
+types-mock

tests/test_cuda.py

Lines changed: 135 additions & 1 deletion
@@ -1,10 +1,24 @@
+import mock
 import pytest
+from schema_salad.avro import schema

+from cwltool.builder import Builder
+from cwltool.context import LoadingContext, RuntimeContext
 from cwltool.cuda import cuda_version_and_device_count
+from cwltool.errors import WorkflowException
+from cwltool.job import CommandLineJob
+from cwltool.load_tool import load_tool
 from cwltool.main import main
+from cwltool.pathmapper import MapperEnt, PathMapper
+from cwltool.process import use_custom_schema, use_standard_schema
+from cwltool.stdfsaccess import StdFsAccess
+from cwltool.update import INTERNAL_VERSION, ORIGINAL_CWLVERSION
+from cwltool.utils import CWLObjectType

 from .util import get_data, needs_docker, needs_singularity_3_or_newer

+from unittest.mock import MagicMock
+
 cuda_version = cuda_version_and_device_count()


@@ -39,7 +53,127 @@ def test_cuda_singularity() -> None:
 def test_cuda_no_container() -> None:
     params = [
         "--enable-ext",
-        "--singularity",
         get_data("tests/wf/nvidia-smi.cwl"),
     ]
     assert main(params) == 0
+
+
+@pytest.mark.skipif(
+    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
+)
+def test_cuda_cc_list() -> None:
+    params = [
+        "--enable-ext",
+        get_data("tests/wf/nvidia-smi-cc.cwl"),
+    ]
+    assert main(params) == 0
+
+
+def _makebuilder(cudaReq: CWLObjectType) -> Builder:
+    return Builder(
+        {},
+        [],
+        [],
+        {},
+        schema.Names(),
+        [cudaReq],
+        [],
+        {"cudaDeviceCount": 1},
+        None,
+        None,
+        StdFsAccess,
+        StdFsAccess(""),
+        None,
+        0.1,
+        False,
+        False,
+        False,
+        "",
+        "",
+        "",
+        "",
+        INTERNAL_VERSION,
+        "docker",
+    )
+
+
+@mock.patch("subprocess.check_output")
+@mock.patch("os.makedirs")
+def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
+
+    runtime_context = RuntimeContext({})
+
+    cudaReq: CWLObjectType = {
+        "class": "http://commonwl.org/cwltool#CUDARequirement",
+        "cudaVersionMin": "1.0",
+        "cudaComputeCapability": "1.0",
+    }
+    builder = _makebuilder(cudaReq)
+
+    check_output.return_value = """
+<nvidia>
+<attached_gpus>1</attached_gpus>
+<cuda_version>1.0</cuda_version>
+</nvidia>
+"""
+
+    jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
+    jb._setup(runtime_context)
+
+
+@mock.patch("subprocess.check_output")
+@mock.patch("os.makedirs")
+def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
+
+    runtime_context = RuntimeContext({})
+
+    cudaReq: CWLObjectType = {
+        "class": "http://commonwl.org/cwltool#CUDARequirement",
+        "cudaVersionMin": "2.0",
+        "cudaComputeCapability": "1.0",
+    }
+    builder = _makebuilder(cudaReq)
+
+    check_output.return_value = """
+<nvidia>
+<attached_gpus>1</attached_gpus>
+<cuda_version>1.0</cuda_version>
+</nvidia>
+"""
+    jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
+    with pytest.raises(WorkflowException):
+        jb._setup(runtime_context)
+
+
+def test_cuda_eval_resource_range() -> None:
+    with open(get_data("cwltool/extensions-v1.1.yml")) as res:
+        use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())
+
+    joborder = {}  # type: CWLObjectType
+    loadingContext = LoadingContext({"do_update": True})
+    runtime_context = RuntimeContext({})
+
+    tool = load_tool(get_data("tests/wf/nvidia-smi-range.cwl"), loadingContext)
+    builder = _makebuilder(tool.requirements[0])
+    builder.job = joborder
+
+    resources = tool.evalResources(builder, runtime_context)
+
+    assert resources["cudaDeviceCount"] == 2
+
+
+def test_cuda_eval_resource_max() -> None:
+    with open(get_data("cwltool/extensions-v1.1.yml")) as res:
+        use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())
+
+    joborder = {}  # type: CWLObjectType
+    loadingContext = LoadingContext({"do_update": True})
+    runtime_context = RuntimeContext({})
+
+    tool = load_tool(get_data("tests/wf/nvidia-smi-max.cwl"), loadingContext)
+    builder = _makebuilder(tool.requirements[0])
+    builder.job = joborder
+
+    resources = tool.evalResources(builder, runtime_context)
+
+    assert resources["cudaDeviceCount"] == 4
