Skip to content

Commit 51d91cf

Browse files
committed
cuda: better check our expectations of the nvidia-smi -x XML output
1 parent 96c711a commit 51d91cf

File tree

2 files changed

+177
-3
lines changed

2 files changed

+177
-3
lines changed

cwltool/cuda.py

Lines changed: 29 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,35 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
1616
_logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
1717
return ("", 0)
1818
dm = xml.dom.minidom.parseString(out) # nosec
19-
ag = dm.getElementsByTagName("attached_gpus")[0].firstChild
20-
cv = dm.getElementsByTagName("cuda_version")[0].firstChild
21-
return (cv.data, int(ag.data))
19+
20+
ag = dm.getElementsByTagName("attached_gpus")
21+
if len(ag) < 1 or ag[0].firstChild is None:
22+
_logger.warning(
23+
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s",
24+
out,
25+
)
26+
return ("", 0)
27+
ag_element = ag[0].firstChild
28+
29+
cv = dm.getElementsByTagName("cuda_version")
30+
if len(cv) < 1 or cv[0].firstChild is None:
31+
_logger.warning(
32+
"Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty.: %s",
33+
out,
34+
)
35+
return ("", 0)
36+
cv_element = cv[0].firstChild
37+
38+
if isinstance(cv_element, xml.dom.minidom.Text) and isinstance(
39+
ag_element, xml.dom.minidom.Text
40+
):
41+
return (cv_element.data, int(ag_element.data))
42+
_logger.warning(
43+
"Error checking CUDA version with nvidia-smi. "
44+
"Either 'attached_gpus' or 'cuda_version' was not a text node: %s",
45+
out,
46+
)
47+
return ("", 0)
2248

2349

2450
def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:

tests/test_cuda.py

Lines changed: 148 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,154 @@ def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock)
135135
jb._setup(runtime_context)
136136

137137

138+
@mock.patch("subprocess.check_output")
139+
@mock.patch("os.makedirs")
140+
def test_cuda_job_setup_check_err_empty_attached_gpus(
141+
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
142+
) -> None:
143+
runtime_context = RuntimeContext({})
144+
145+
cudaReq: CWLObjectType = {
146+
"class": "http://commonwl.org/cwltool#CUDARequirement",
147+
"cudaVersionMin": "1.0",
148+
"cudaComputeCapability": "1.0",
149+
}
150+
builder = _makebuilder(cudaReq)
151+
152+
check_output.return_value = """
153+
<nvidia>
154+
<attached_gpus></attached_gpus>
155+
<cuda_version>1.0</cuda_version>
156+
</nvidia>
157+
"""
158+
159+
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
160+
with pytest.raises(WorkflowException):
161+
jb._setup(runtime_context)
162+
assert (
163+
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
164+
in caplog.text
165+
)
166+
167+
168+
@mock.patch("subprocess.check_output")
169+
@mock.patch("os.makedirs")
170+
def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
171+
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
172+
) -> None:
173+
runtime_context = RuntimeContext({})
174+
175+
cudaReq: CWLObjectType = {
176+
"class": "http://commonwl.org/cwltool#CUDARequirement",
177+
"cudaVersionMin": "1.0",
178+
"cudaComputeCapability": "1.0",
179+
}
180+
builder = _makebuilder(cudaReq)
181+
182+
check_output.return_value = """
183+
<nvidia>
184+
<cuda_version>1.0</cuda_version>
185+
</nvidia>
186+
"""
187+
188+
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
189+
with pytest.raises(WorkflowException):
190+
jb._setup(runtime_context)
191+
assert (
192+
"Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
193+
in caplog.text
194+
)
195+
196+
197+
@mock.patch("subprocess.check_output")
198+
@mock.patch("os.makedirs")
199+
def test_cuda_job_setup_check_err_empty_cuda_version(
200+
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
201+
) -> None:
202+
runtime_context = RuntimeContext({})
203+
204+
cudaReq: CWLObjectType = {
205+
"class": "http://commonwl.org/cwltool#CUDARequirement",
206+
"cudaVersionMin": "1.0",
207+
"cudaComputeCapability": "1.0",
208+
}
209+
builder = _makebuilder(cudaReq)
210+
211+
check_output.return_value = """
212+
<nvidia>
213+
<attached_gpus>1</attached_gpus>
214+
<cuda_version></cuda_version>
215+
</nvidia>
216+
"""
217+
218+
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
219+
with pytest.raises(WorkflowException):
220+
jb._setup(runtime_context)
221+
assert (
222+
"Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
223+
in caplog.text
224+
)
225+
226+
227+
@mock.patch("subprocess.check_output")
228+
@mock.patch("os.makedirs")
229+
def test_cuda_job_setup_check_err_missing_cuda_version(
230+
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
231+
) -> None:
232+
runtime_context = RuntimeContext({})
233+
234+
cudaReq: CWLObjectType = {
235+
"class": "http://commonwl.org/cwltool#CUDARequirement",
236+
"cudaVersionMin": "1.0",
237+
"cudaComputeCapability": "1.0",
238+
}
239+
builder = _makebuilder(cudaReq)
240+
241+
check_output.return_value = """
242+
<nvidia>
243+
<attached_gpus>1</attached_gpus>
244+
</nvidia>
245+
"""
246+
247+
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
248+
with pytest.raises(WorkflowException):
249+
jb._setup(runtime_context)
250+
assert (
251+
"Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
252+
in caplog.text
253+
)
254+
255+
256+
@mock.patch("subprocess.check_output")
257+
@mock.patch("os.makedirs")
258+
def test_cuda_job_setup_check_err_wrong_type_cuda_version(
259+
makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
260+
) -> None:
261+
runtime_context = RuntimeContext({})
262+
263+
cudaReq: CWLObjectType = {
264+
"class": "http://commonwl.org/cwltool#CUDARequirement",
265+
"cudaVersionMin": "1.0",
266+
"cudaComputeCapability": "1.0",
267+
}
268+
builder = _makebuilder(cudaReq)
269+
270+
check_output.return_value = """
271+
<nvidia>
272+
<attached_gpus>1</attached_gpus>
273+
<cuda_version><subelement /></cuda_version>
274+
</nvidia>
275+
"""
276+
277+
jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
278+
with pytest.raises(WorkflowException):
279+
jb._setup(runtime_context)
280+
assert (
281+
"Error checking CUDA version with nvidia-smi. "
282+
"Either 'attached_gpus' or 'cuda_version' was not a text node" in caplog.text
283+
)
284+
285+
138286
def test_cuda_eval_resource_range() -> None:
139287
with open(get_data("cwltool/extensions-v1.1.yml")) as res:
140288
use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())

0 commit comments

Comments
 (0)