Merge pull request #104 from ahendriksen/fea-multi-target-regression
scanpy_funcs: Use multi-target regression
cjnolet authored Jan 24, 2023
2 parents b07da11 + a6f2cd2 commit 5ba1227
Showing 8 changed files with 482 additions and 321 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -25,12 +25,12 @@ docker run --gpus all --rm -v /mnt/data:/data claraparabricks/single-cell-exampl
All dependencies for these examples can be installed with conda.

```bash
conda env create --name rapidgenomics -f conda/rapidgenomics_cuda11.0.yml
conda env create --name rapidgenomics -f conda/rapidgenomics_cuda11.5.yml
conda activate rapidgenomics
python -m ipykernel install --user --display-name "Python (rapidgenomics)"
```

After installing the necessary dependencies, you can just run `jupyter lab`.
After installing the necessary dependencies, you can just run `jupyter lab`. There are a few different conda environment files which correspond to different notebooks. In addition to the one listed above, there is one for the CPU notebooks, one for the real-time visualization notebook, and one for the AtacSeq notebook.
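The additional environments mentioned above can be created the same way. A sketch using the two environment files added or updated in this PR (the environment names here are arbitrary; only the file paths come from this repository):

```bash
# Real-time visualization notebook environment
conda env create --name rapidgenomics-viz -f conda/rapidgenomics_cuda11.5.viz.yml

# AtacSeq notebook environment
conda env create --name rapidgenomics-atac -f conda/rapidgenomics_cuda11.5.atacworks.yml

# Register each environment as a Jupyter kernel after activating it
conda activate rapidgenomics-viz
python -m ipykernel install --user --display-name "Python (rapidgenomics-viz)"
```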


## Configuration
28 changes: 28 additions & 0 deletions conda/rapidgenomics_cuda11.5.atacworks.yml
@@ -0,0 +1,28 @@
channels:
- rapidsai
- rapidsai-nightly
- nvidia
- conda-forge
- bioconda
- defaults
dependencies:
- python=3.8
- cuml=22.08*
- cudf=22.08*
- cugraph=22.08*
- cudatoolkit=11.5
- python-wget
- cupy=9*
- tabix
- pytabix
- cmake
- jupyterlab
- ipykernel
- wget
- scanpy==1.9.1
- pyBigWig
- ipykernel
- pip
- pip:
- jupyter-server-proxy
- atacworks==0.3.4
6 changes: 3 additions & 3 deletions conda/rapidgenomics_cuda11.5.viz.yml
@@ -7,9 +7,9 @@ channels:
- defaults
dependencies:
- python=3.8
- cuml=22.08*
- cudf=22.08*
- cugraph=22.08*
- cuml=22.12*
- cudf=22.12*
- cugraph=22.12*
- tabix
- pytabix
- cudatoolkit=11.5
7 changes: 3 additions & 4 deletions conda/rapidgenomics_cuda11.5.yml
@@ -7,9 +7,9 @@ channels:
- defaults
dependencies:
- python=3.8
- cuml=22.08*
- cudf=22.08*
- cugraph=22.08*
- cuml=22.12*
- cudf=22.12*
- cugraph=22.12*
- cudatoolkit=11.5
- python-wget
- cupy=9*
@@ -25,4 +25,3 @@ dependencies:
- pip
- pip:
- jupyter-server-proxy
- atacworks==0.3.4
272 changes: 153 additions & 119 deletions notebooks/1M_brain_gpu_analysis_multigpu.ipynb

Large diffs are not rendered by default.

260 changes: 160 additions & 100 deletions notebooks/1M_brain_gpu_analysis_uvm.ipynb

Large diffs are not rendered by default.

163 changes: 81 additions & 82 deletions notebooks/hlca_lung_gpu_analysis.ipynb

Large diffs are not rendered by default.

63 changes: 52 additions & 11 deletions notebooks/rapids_scanpy_funcs.py
@@ -150,7 +150,7 @@ def normalize_total(csr_arr, target_sum):
return csr_arr


def regress_out(normalized, n_counts, percent_mito, verbose=False):
def regress_out(normalized, n_counts, percent_mito, batchsize=100, verbose=False):

"""
Use linear regression to adjust for the effects of unwanted noise
@@ -169,6 +169,12 @@ def regress_out(normalized, n_counts, percent_mito, verbose=False):
percent_mito : cupy.ndarray of shape (n_cells,)
Percentage of genes that each cell needs to adjust for
batchsize : Union[int, Literal["all"], None] (default: 100)
Number of genes that should be processed together.
If `"all"`, all genes are processed together when `normalized.shape[0]` < 100000.
If `None`, each gene is analysed separately.
Ignored if the installed cuML version is < 22.12.
verbose : bool
Print debugging information
@@ -186,19 +192,54 @@ def regress_out(normalized, n_counts, percent_mito, verbose=False):

outputs = cp.empty(normalized.shape, dtype=normalized.dtype, order="F")

if n_counts.shape[0] < 100000 and cp.sparse.issparse(normalized):
normalized = normalized.todense()

for i in range(normalized.shape[1]):
if verbose and i % 500 == 0:
print("Regressed %s out of %s" %(i, normalized.shape[1]))
X = regressors
y = normalized[:,i]
outputs[:, i] = _regress_out_chunk(X, y)


# cuML gained support for multi-target regression in version 22.12. This
# removes the need for a Python for loop and speeds up the code
# significantly.
cuml_supports_multi_target = LinearRegression._get_tags()['multioutput']

if cuml_supports_multi_target and batchsize:
if batchsize == "all" and normalized.shape[0] < 100000:
if cp.sparse.issparse(normalized):
normalized = normalized.todense()
X = regressors
# Use SVD algorithm as this is the only algorithm supported in the
# multi-target regression. In addition, it is more numerically stable
# than the default 'eig' algorithm.
lr = LinearRegression(fit_intercept=False, output_type="cupy", algorithm='svd')
lr.fit(X, normalized, convert_dtype=True)
outputs[:] = normalized - lr.predict(X)
else:
if batchsize == "all":
batchsize = 100
n_batches = math.ceil(normalized.shape[1] / batchsize)
for batch in range(n_batches):
start_idx = batch * batchsize
stop_idx = min(batch * batchsize + batchsize, normalized.shape[1])
if cp.sparse.issparse(normalized):
arr_batch = normalized[:,start_idx:stop_idx].todense()
else:
arr_batch = normalized[:,start_idx:stop_idx].copy()
X = regressors
lr = LinearRegression(fit_intercept=False, output_type="cupy", algorithm='svd')
lr.fit(X, arr_batch, convert_dtype=True)
# Instead of returning `y - lr.predict(X)`, we write into `outputs`
# to maintain "F" ordering like in the else branch.
outputs[:, start_idx:stop_idx] = arr_batch - lr.predict(X)
else:
if normalized.shape[0] < 100000 and cp.sparse.issparse(normalized):
normalized = normalized.todense()
for i in range(normalized.shape[1]):
if verbose and i % 500 == 0:
print("Regressed %s out of %s" %(i, normalized.shape[1]))
X = regressors
y = normalized[:,i]
outputs[:, i] = _regress_out_chunk(X, y)

return outputs



def filter_cells(sparse_gpu_array, min_genes, max_genes, rows_per_batch=10000, barcodes=None):
"""
Filter cells that have genes greater than a max number of genes or less than
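The multi-target trick introduced above can be sketched on CPU with plain NumPy: a single least-squares solve fits every gene column at once, and the residuals are what `regress_out` writes into `outputs`. This is a toy illustration with random data, where `np.linalg.lstsq` stands in for cuML's SVD-based multi-target `LinearRegression`:

```python
import numpy as np

# Toy stand-ins for the notebook's inputs (shapes only; values are random).
rng = np.random.default_rng(0)
n_cells, n_genes = 6, 3
n_counts = rng.uniform(1e3, 1e4, size=n_cells)       # counts per cell
percent_mito = rng.uniform(0.0, 0.2, size=n_cells)   # mito fraction per cell

# Regressor matrix: intercept column plus the two unwanted covariates.
X = np.column_stack([np.ones(n_cells), n_counts, percent_mito])
Y = rng.normal(size=(n_cells, n_genes))              # all genes at once

# One SVD-based least-squares solve handles every gene column simultaneously,
# mirroring what a single multi-target fit(X, Y) does in cuML >= 22.12,
# instead of looping over genes one at a time.
beta, *_ = np.linalg.lstsq(X, Y, rcond=None)
residuals = Y - X @ beta   # the adjusted expression values

# Least-squares residuals are orthogonal to the regressors by construction.
print(np.allclose(X.T @ residuals, 0.0, atol=1e-6))
```

Batching, as in the `batchsize` path of the diff, would simply apply the same solve to column slices `Y[:, start_idx:stop_idx]` to bound peak memory.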
