
Commit 3d2d119

Cuda request gpu device expr (#1629)
* Allow expressions to dynamically request min/max number of GPUs.
* Add test coverage for CUDA checks.
1 parent 4c65bdf commit 3d2d119

15 files changed (+279 -47 lines)

cwltool/cuda.py

Lines changed: 4 additions & 6 deletions
@@ -18,7 +18,7 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
     return (cv.data, int(ag.data))


-def cuda_check(cuda_req: CWLObjectType) -> int:
+def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:
     try:
         vmin = float(str(cuda_req["cudaVersionMin"]))
         version, devices = cuda_version_and_device_count()
@@ -31,14 +31,12 @@ def cuda_check(cuda_req: CWLObjectType) -> int:
                 "CUDA version '%s' is less than minimum version '%s'", version, vmin
             )
             return 0
-        dmin = cast(int, cuda_req.get("deviceCountMin", 1))
-        dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
-        if devices < dmin:
+        if requestCount > devices:
             _logger.warning(
-                "Requested at least %d GPU devices but only %d available", dmin, devices
+                "Requested %d GPU devices but only %d available", requestCount, devices
            )
             return 0
-        return min(dmax, devices)
+        return requestCount
     except Exception as e:
         _logger.warning("Error checking CUDA requirements: %s", e)
         return 0
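
With this change the min/max bookkeeping moves out of cuda_check(): callers resolve the requested device count first (see evalResources in cwltool/process.py below) and pass it in, and cuda_check() only validates that count against what cuda_version_and_device_count() reports. A minimal behaviour sketch, with a made-up detected count (not cwltool's actual code):

    # Illustration only -- the detected count below is hypothetical.
    detected_devices = 2  # e.g. what cuda_version_and_device_count() might report

    def check(request_count: int) -> int:
        # Mirrors the new cuda_check() contract: return the requested count when
        # enough GPUs exist, otherwise warn (omitted here) and return 0.
        return request_count if request_count <= detected_devices else 0

    assert check(1) == 1
    assert check(2) == 2
    assert check(4) == 0  # more GPUs requested than detected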

cwltool/docker.py

Lines changed: 2 additions & 7 deletions
@@ -397,13 +397,8 @@ def create_runtime(
         if runtimeContext.rm_container:
             runtime.append("--rm")

-        cuda_req, _ = self.builder.get_requirement(
-            "http://commonwl.org/cwltool#CUDARequirement"
-        )
-        if cuda_req:
-            # Checked earlier that the device count is non-zero in _setup
-            count = cuda_check(cuda_req)
-            runtime.append("--gpus=" + str(count))
+        if self.builder.resources.get("cudaDeviceCount"):
+            runtime.append("--gpus=" + str(self.builder.resources["cudaDeviceCount"]))

         cidfile_path = None  # type: Optional[str]
         # add parameters to docker to write a container ID file
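
The container backends now read the resolved count straight from builder.resources instead of calling cuda_check() again. A rough sketch of the resulting flag, using a hypothetical resources dict (values made up):

    # Hypothetical resources dict, as evalResources()/select_resources() would
    # leave it; only the cudaDeviceCount key matters here.
    resources = {"cores": 1, "ram": 1024, "cudaDeviceCount": 2}

    docker_args = []
    if resources.get("cudaDeviceCount"):
        docker_args.append("--gpus=" + str(resources["cudaDeviceCount"]))
    assert docker_args == ["--gpus=2"]  # Singularity appends "--nv" instead (see below)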

cwltool/executors.py

Lines changed: 3 additions & 0 deletions
@@ -305,6 +305,9 @@ def select_resources(
         result["tmpdirSize"] = math.ceil(request["tmpdirMin"])
         result["outdirSize"] = math.ceil(request["outdirMin"])

+        if "cudaDeviceCount" in request:
+            result["cudaDeviceCount"] = request["cudaDeviceCount"]
+
         return result

     def _runner(self, job, runtime_context, TMPDIR_LOCK):

cwltool/extensions-v1.1.yml

Lines changed: 25 additions & 9 deletions
@@ -93,13 +93,29 @@ $graph:

       See https://docs.nvidia.com/deploy/cuda-compatibility/ for
       details.
-  cudaComputeCapabilityMin:
-    type: string
-    doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
-  deviceCountMin:
-    type: int?
+  cudaComputeCapability:
+    type:
+      - 'string'
+      - 'string[]'
+    doc: |
+      CUDA hardware capability required to run the software, in X.Y
+      format.
+
+      * If this is a single value, it defines only the minimum
+        compute capability. GPUs with higher capability are also
+        accepted.
+
+      * If it is an array value, then only select GPUs with compute
+        capabilities that explicitly appear in the array.
+  cudaDeviceCountMin:
+    type: ['null', int, cwl:Expression]
     default: 1
-    doc: Minimum number of GPU devices to request, default 1.
-  deviceCountMax:
-    type: int?
-    doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
+    doc: |
+      Minimum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMax`. If neither are specified,
+      default 1.
+  cudaDeviceCountMax:
+    type: ['null', int, cwl:Expression]
+    doc: |
+      Maximum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMin`.
cwltool/extensions.yml

Lines changed: 25 additions & 9 deletions
@@ -203,13 +203,29 @@ $graph:

       See https://docs.nvidia.com/deploy/cuda-compatibility/ for
       details.
-  cudaComputeCapabilityMin:
-    type: string
-    doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
-  deviceCountMin:
-    type: int?
+  cudaComputeCapability:
+    type:
+      - 'string'
+      - 'string[]'
+    doc: |
+      CUDA hardware capability required to run the software, in X.Y
+      format.
+
+      * If this is a single value, it defines only the minimum
+        compute capability. GPUs with higher capability are also
+        accepted.
+
+      * If it is an array value, then only select GPUs with compute
+        capabilities that explicitly appear in the array.
+  cudaDeviceCountMin:
+    type: ['null', int, cwl:Expression]
     default: 1
-    doc: Minimum number of GPU devices to request, default 1.
-  deviceCountMax:
-    type: int?
-    doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
+    doc: |
+      Minimum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMax`. If neither are specified,
+      default 1.
+  cudaDeviceCountMax:
+    type: ['null', int, cwl:Expression]
+    doc: |
+      Maximum number of GPU devices to request. If not specified,
+      same as `cudaDeviceCountMin`.

cwltool/job.py

Lines changed: 4 additions & 1 deletion
@@ -2,6 +2,7 @@
 import functools
 import itertools
 import logging
+import math
 import os
 import re
 import shutil
@@ -180,7 +181,9 @@ def _setup(self, runtimeContext: RuntimeContext) -> None:
             "http://commonwl.org/cwltool#CUDARequirement"
         )
         if cuda_req:
-            count = cuda_check(cuda_req)
+            count = cuda_check(
+                cuda_req, math.ceil(self.builder.resources["cudaDeviceCount"])
+            )
             if count == 0:
                 raise WorkflowException("Could not satisfy CUDARequirement")

cwltool/process.py

Lines changed: 26 additions & 6 deletions
@@ -980,6 +980,7 @@ def evalResources(
         resourceReq, _ = self.get_requirement("ResourceRequirement")
         if resourceReq is None:
             resourceReq = {}
+
         cwl_version = self.metadata.get(ORIGINAL_CWLVERSION, None)
         if cwl_version == "v1.0":
             ram = 1024
@@ -995,20 +996,34 @@ def evalResources(
                 "outdirMin": 1024,
                 "outdirMax": 1024,
             }
-        for a in ("cores", "ram", "tmpdir", "outdir"):
+
+        cudaReq, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement")
+        if cudaReq:
+            request["cudaDeviceCountMin"] = 1
+            request["cudaDeviceCountMax"] = 1
+
+        for rsc, a in (
+            (resourceReq, "cores"),
+            (resourceReq, "ram"),
+            (resourceReq, "tmpdir"),
+            (resourceReq, "outdir"),
+            (cudaReq, "cudaDeviceCount"),
+        ):
+            if rsc is None:
+                continue
             mn = mx = None  # type: Optional[Union[int, float]]
-            if resourceReq.get(a + "Min"):
+            if rsc.get(a + "Min"):
                 mn = cast(
                     Union[int, float],
                     eval_resource(
-                        builder, cast(Union[str, int, float], resourceReq[a + "Min"])
+                        builder, cast(Union[str, int, float], rsc[a + "Min"])
                     ),
                 )
-            if resourceReq.get(a + "Max"):
+            if rsc.get(a + "Max"):
                 mx = cast(
                     Union[int, float],
                     eval_resource(
-                        builder, cast(Union[str, int, float], resourceReq[a + "Max"])
+                        builder, cast(Union[str, int, float], rsc[a + "Max"])
                     ),
                 )
             if mn is None:
@@ -1022,13 +1037,18 @@ def evalResources(

         request_evaluated = cast(Dict[str, Union[int, float]], request)
         if runtimeContext.select_resources is not None:
+            # Call select resources hook
             return runtimeContext.select_resources(request_evaluated, runtimeContext)
-        return {
+
+        defaultReq = {
             "cores": request_evaluated["coresMin"],
             "ram": math.ceil(request_evaluated["ramMin"]),
             "tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]),
             "outdirSize": math.ceil(request_evaluated["outdirMin"]),
         }
+        if cudaReq:
+            defaultReq["cudaDeviceCount"] = request_evaluated["cudaDeviceCountMin"]
+        return defaultReq

     def validate_hints(
         self, avsc_names: Names, hints: List[CWLObjectType], strict: bool
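
evalResources() now treats cudaDeviceCount exactly like cores/ram: both bounds are evaluated (expressions included) and, absent a select_resources hook, the default request takes the minimum. A toy sketch of the documented defaulting rules (illustrative only, not cwltool's code); the expected values line up with test_cuda_eval_resource_range and test_cuda_eval_resource_max below:

    def resolve_device_count(dmin=None, dmax=None):
        # If only one bound is given, the other defaults to it; if neither, 1.
        if dmin is None and dmax is None:
            return 1
        if dmin is None:
            return dmax   # only cudaDeviceCountMax given, e.g. 4 -> request 4
        return dmin       # default request takes the minimum bound

    assert resolve_device_count() == 1
    assert resolve_device_count(dmin=2, dmax=4) == 2  # cf. test_cuda_eval_resource_range
    assert resolve_device_count(dmax=4) == 4          # cf. test_cuda_eval_resource_max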

cwltool/singularity.py

Lines changed: 1 addition & 5 deletions
@@ -434,11 +434,7 @@ def create_runtime(
         else:
             runtime.extend(["--net", "--network", "none"])

-        cuda_req, _ = self.builder.get_requirement(
-            "http://commonwl.org/cwltool#CUDARequirement"
-        )
-        if cuda_req:
-            # Checked earlier that the device count is non-zero in _setup
+        if self.builder.resources.get("cudaDeviceCount"):
             runtime.append("--nv")

         for name, value in self.environment.items():

mypy-requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@ mypy==0.931
 types-requests
 types-setuptools
 types-psutil
+types-mock

tests/test_cuda.py

Lines changed: 135 additions & 1 deletion
@@ -1,10 +1,24 @@
+import mock
 import pytest
+from schema_salad.avro import schema

+from cwltool.builder import Builder
+from cwltool.context import LoadingContext, RuntimeContext
 from cwltool.cuda import cuda_version_and_device_count
+from cwltool.errors import WorkflowException
+from cwltool.job import CommandLineJob
+from cwltool.load_tool import load_tool
 from cwltool.main import main
+from cwltool.pathmapper import MapperEnt, PathMapper
+from cwltool.process import use_custom_schema, use_standard_schema
+from cwltool.stdfsaccess import StdFsAccess
+from cwltool.update import INTERNAL_VERSION, ORIGINAL_CWLVERSION
+from cwltool.utils import CWLObjectType

 from .util import get_data, needs_docker, needs_singularity_3_or_newer

+from unittest.mock import MagicMock
+
 cuda_version = cuda_version_and_device_count()


@@ -39,7 +53,127 @@ def test_cuda_singularity() -> None:
 def test_cuda_no_container() -> None:
     params = [
         "--enable-ext",
-        "--singularity",
         get_data("tests/wf/nvidia-smi.cwl"),
     ]
     assert main(params) == 0
+
+
+@pytest.mark.skipif(
+    cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
+)
+def test_cuda_cc_list() -> None:
+    params = [
+        "--enable-ext",
+        get_data("tests/wf/nvidia-smi-cc.cwl"),
+    ]
+    assert main(params) == 0
+
+
+def _makebuilder(cudaReq: CWLObjectType) -> Builder:
+    return Builder(
+        {},
+        [],
+        [],
+        {},
+        schema.Names(),
+        [cudaReq],
+        [],
+        {"cudaDeviceCount": 1},
+        None,
+        None,
+        StdFsAccess,
+        StdFsAccess(""),
+        None,
+        0.1,
+        False,
+        False,
+        False,
+        "",
+        "",
+        "",
+        "",
+        INTERNAL_VERSION,
+        "docker",
+    )
+
+
+@mock.patch("subprocess.check_output")
+@mock.patch("os.makedirs")
+def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
+
+    runtime_context = RuntimeContext({})
+
+    cudaReq: CWLObjectType = {
+        "class": "http://commonwl.org/cwltool#CUDARequirement",
+        "cudaVersionMin": "1.0",
+        "cudaComputeCapability": "1.0",
+    }
+    builder = _makebuilder(cudaReq)
+
+    check_output.return_value = """
+<nvidia>
+<attached_gpus>1</attached_gpus>
+<cuda_version>1.0</cuda_version>
+</nvidia>
+"""
+
+    jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
+    jb._setup(runtime_context)
+
+
+@mock.patch("subprocess.check_output")
+@mock.patch("os.makedirs")
+def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
+
+    runtime_context = RuntimeContext({})
+
+    cudaReq: CWLObjectType = {
+        "class": "http://commonwl.org/cwltool#CUDARequirement",
+        "cudaVersionMin": "2.0",
+        "cudaComputeCapability": "1.0",
+    }
+    builder = _makebuilder(cudaReq)
+
+    check_output.return_value = """
+<nvidia>
+<attached_gpus>1</attached_gpus>
+<cuda_version>1.0</cuda_version>
+</nvidia>
+"""
+    jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
+    with pytest.raises(WorkflowException):
+        jb._setup(runtime_context)
+
+
+def test_cuda_eval_resource_range() -> None:
+    with open(get_data("cwltool/extensions-v1.1.yml")) as res:
+        use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())
+
+    joborder = {}  # type: CWLObjectType
+    loadingContext = LoadingContext({"do_update": True})
+    runtime_context = RuntimeContext({})
+
+    tool = load_tool(get_data("tests/wf/nvidia-smi-range.cwl"), loadingContext)
+    builder = _makebuilder(tool.requirements[0])
+    builder.job = joborder
+
+    resources = tool.evalResources(builder, runtime_context)
+
+    assert resources["cudaDeviceCount"] == 2
+
+
+def test_cuda_eval_resource_max() -> None:
+    with open(get_data("cwltool/extensions-v1.1.yml")) as res:
+        use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())
+
+    joborder = {}  # type: CWLObjectType
+    loadingContext = LoadingContext({"do_update": True})
+    runtime_context = RuntimeContext({})
+
+    tool = load_tool(get_data("tests/wf/nvidia-smi-max.cwl"), loadingContext)
+    builder = _makebuilder(tool.requirements[0])
+    builder.job = joborder
+
+    resources = tool.evalResources(builder, runtime_context)
+
+    assert resources["cudaDeviceCount"] == 4
