Skip to content

Commit eda0149

Browse files
authored
Merge pull request #182 from jr-leary7/dev
added parallel processing to geneProgramDrivers
2 parents f7662dd + 2e0eec5 commit eda0149

File tree

5 files changed

+73
-7
lines changed

5 files changed

+73
-7
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Description: This package uses truncated power basis spline models to build flex
99
Downstream analysis functionalities include model comparison, dynamic gene clustering, smoothed counts generation, gene set enrichment testing, & visualization.
1010
License: MIT + file LICENSE
1111
Encoding: UTF-8
12-
RoxygenNote: 7.2.1
12+
RoxygenNote: 7.2.3
1313
Depends:
1414
glm2,
1515
magrittr,

R/geneProgramDrivers.R

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
#' @author Jack Leary
55
#' @importFrom Matrix Matrix
66
#' @importFrom purrr map reduce
7+
#' @importFrom foreach foreach registerDoSEQ
8+
#' @importFrom parallel makeCluster clusterEvalQ stopCluster
9+
#' @importFrom withr with_output_sink
10+
#' @importFrom utils txtProgressBar setTxtProgressBar
11+
#' @importFrom doSNOW registerDoSNOW
712
#' @importFrom stats cor.test p.adjust
813
#' @importFrom dplyr arrange desc mutate filter
914
#' @description This function computes the correlation
@@ -13,6 +18,8 @@
1318
#' @param cor.method (Optional) The correlation method to be used. Defaults to "spearman".
1419
#' @param fdr.cutoff (Optional) The FDR threshold for determining statistical significance. Defaults to 0.01.
1520
#' @param p.adj.method (Optional) The method used to adjust \emph{p}-values for multiple hypothesis testing. Defaults to "holm".
21+
#' @param n.cores (Optional) The number of cores used when iterating over genes to perform testing. Defaults to 2.
22+
#' @param verbose (Optional) Should a progress bar be printed to the console during processing? Defaults to TRUE.
1623
#' @return Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix.
1724
#' @seealso \code{\link{geneProgramScoring}}
1825
#' @seealso \code{\link[stats]{cor.test}}
@@ -39,7 +46,9 @@ geneProgramDrivers <- function(expr.mat = NULL,
3946
gene.program = NULL,
4047
cor.method = "spearman",
4148
fdr.cutoff = 0.01,
42-
p.adj.method = "holm") {
49+
p.adj.method = "holm",
50+
n.cores = 2L,
51+
verbose = TRUE) {
4352
# check inputs
4453
if (is.null(expr.mat) || is.null(genes) || is.null(gene.program)) { stop("Arguments to geneProgramDrivers() are missing.") }
4554
cor.method <- tolower(cor.method)
@@ -54,16 +63,50 @@ geneProgramDrivers <- function(expr.mat = NULL,
5463
} else if (inherits(expr.mat, "dgCMatrix")) {
5564
counts_matrix <- Matrix::Matrix(expr.mat, sparse = FALSE)
5665
}
66+
# set up parallel processing
67+
if (n.cores > 1L) {
68+
cl <- parallel::makeCluster(n.cores)
69+
doSNOW::registerDoSNOW(cl)
70+
} else {
71+
cl <- foreach::registerDoSEQ()
72+
}
73+
# set up progress bar
74+
if (verbose) {
75+
withr::with_output_sink(tempfile(), {
76+
pb <- utils::txtProgressBar(0, length(genes), style = 3)
77+
})
78+
progress_fun <- function(n) utils::setTxtProgressBar(pb, n)
79+
snow_opts <- list(progress = progress_fun)
80+
} else {
81+
snow_opts <- list()
82+
}
5783
# iteratively compute correlations & p-values
58-
cor_tests <- purrr::map(genes, \(g) {
59-
cor_res <- stats::cor.test(counts_matrix[g, ],
84+
cor_tests <- foreach::foreach(g = seq(genes),
85+
.combine = "list",
86+
.multicombine = ifelse(length(genes) > 1, TRUE, FALSE),
87+
.maxcombine = ifelse(length(genes) > 1, length(genes), 2),
88+
.packages = c("stats"),
89+
.errorhandling = "pass",
90+
.inorder = TRUE,
91+
.verbose = FALSE,
92+
.options.snow = snow_opts) %dopar% {
93+
cor_res <- stats::cor.test(counts_matrix[genes[g], ],
6094
gene.program,
6195
method = cor.method,
6296
exact = FALSE)
63-
cor_df <- data.frame(gene = g,
97+
cor_df <- data.frame(gene = genes[g],
6498
corr = unname(cor_res$estimate),
6599
pvalue = cor_res$p.value)
66-
return(cor_df)
100+
cor_df
101+
}
102+
# end parallelization & clean up each worker node
103+
withr::with_output_sink(tempfile(), {
104+
if (n.cores > 1L) {
105+
parallel::clusterEvalQ(cl, expr = {
106+
gc(verbose = FALSE, full = TRUE)
107+
})
108+
parallel::stopCluster(cl)
109+
}
67110
})
68111
cor_tests <- purrr::reduce(cor_tests, rbind)
69112
rownames(cor_tests) <- genes

R/geneProgramScoring.R

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#' @param genes A character vector of gene IDs. Defaults to NULL.
99
#' @param gene.clusters A factor containing the cluster assignment of each gene in \code{genes}. Defaults to NULL.
1010
#' @param program.labels (Optional) A character vector specifying a label for each gene cluster. Defaults to NULL.
11+
#' @param minmax.norm (Optional) Should each program's score be min-max normalized to be on [0, 1]? Defaults to FALSE.
1112
#' @param n.cores (Optional) The number of cores used under the hood in \code{\link[UCell]{ScoreSignatures_UCell}}. Defaults to 2.
1213
#' @return Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix.
1314
#' @seealso \code{\link[UCell]{ScoreSignatures_UCell}}
@@ -30,6 +31,7 @@ geneProgramScoring <- function(expr.mat = NULL,
3031
genes = NULL,
3132
gene.clusters = NULL,
3233
program.labels = NULL,
34+
minmax.norm = FALSE,
3335
n.cores = 2L) {
3436
# check inputs
3537
if (is.null(expr.mat) || is.null(genes) || is.null(gene.clusters)) { stop("Arguments to geneProgramScoring() are missing.") }
@@ -42,6 +44,9 @@ geneProgramScoring <- function(expr.mat = NULL,
4244
program.labels <- paste0("cluster_", cluster.labels)
4345
} else {
4446
program.labels <- gsub(" ", "_", program.labels)
47+
if (length(program.labels) != length(levels(gene.clusters))) {
48+
stop("Each cluster must have a label.")
49+
}
4550
}
4651
# set up query matrix
4752
if (inherits(expr.mat, "SingleCellExperiment")) {
@@ -60,6 +65,15 @@ geneProgramScoring <- function(expr.mat = NULL,
6065
program_scores <- UCell::ScoreSignatures_UCell(counts_matrix,
6166
features = program_list,
6267
ncores = n.cores)
68+
# min-max normalize if desired
69+
if (minmax.norm) {
70+
program_scores <- purrr::map(seq(ncol(program_scores)), \(i) {
71+
scores <- program_scores[, i]
72+
normed_scores <- (scores - min(scores)) / (max(scores) - min(scores))
73+
return(normed_scores)
74+
})
75+
program_scores <- purrr::reduce(program_scores, cbind)
76+
}
6377
# reformat program scores depending on input format
6478
if (inherits(expr.mat, "matrix") || inherits(expr.mat, "array") || inherits(expr.mat, "dgCMatrix")) {
6579
colnames(program_scores) <- program.labels

man/geneProgramDrivers.Rd

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/geneProgramScoring.Rd

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)