Merge pull request #184 from jr-leary7/dev

jr-leary7 · web-flow · commit 09e6daffd4c0 · 2024-01-04T01:40:19.000-05:00
Dev
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,7 @@ export(extractBreakpoints)
 export(fitGLMM)
 export(geneProgramDrivers)
 export(geneProgramScoring)
+export(geneProgramSignificance)
 export(getFittedValues)
 export(getKnotDist)
 export(getResultsDE)
@@ -71,6 +72,7 @@ importFrom(furrr,future_map)
 importFrom(future,multisession)
 importFrom(future,plan)
 importFrom(future,sequential)
+importFrom(gamlss,LR.test)
 importFrom(gamlss,gamlss)
 importFrom(gamlss,gamlss.control)
 importFrom(gamlss,pb)
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,7 @@
 
 + Added `geneProgramDrivers()` function to compute & test correlations of expression with gene module scores.
 + Updated documentation & unit tests.
++ Added `geneProgramSignificance` function to estimate associations between gene program module scores and pseudotime. 
 
 # Changes in version 0.7.8
 
diff --git a/R/geneProgramDrivers.R b/R/geneProgramDrivers.R
@@ -11,7 +11,7 @@
 #' @importFrom doSNOW registerDoSNOW
 #' @importFrom stats cor.test p.adjust
 #' @importFrom dplyr arrange desc mutate filter
-#' @description This function computes the correlation
+#' @description This function computes the correlation between smoothed gene expression and gene program scores in order to identify genes are significantly associated with program scores i.e., the "drivers" of the gene program.
 #' @param expr.mat Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of normalized counts with genes as rows & cells as columns. Defaults to NULL.
 #' @param genes A character vector of genes to test. Defaults to NULL.
 #' @param gene.program A vector of program scores as returned by \code{\link{geneProgramScoring}}. Defaults to NULL.
@@ -39,7 +39,8 @@
 #' program_drivers <- geneProgramDrivers(sim_counts,
 #'                                       genes = gene_embed$gene,
 #'                                       gene.program = sim_counts$cluster_0,
-#'                                       fdr.cutoff = 0.05)
+#'                                       fdr.cutoff = 0.05,
+#'                                       n.cores = 1L)
 
 geneProgramDrivers <- function(expr.mat = NULL,
                                genes = NULL,
diff --git a/R/geneProgramScoring.R b/R/geneProgramScoring.R
@@ -8,7 +8,8 @@
 #' @param genes A character vector of gene IDs. Defaults to NULL.
 #' @param gene.clusters A factor containing the cluster assignment of each gene in \code{genes}. Defaults to NULL.
 #' @param program.labels (Optional) A character vector specifying a label for each gene cluster. Defaults to NULL.
-#' @param minmax.norm (Optional) Should each program's score be min-max normalized to be on [0, 1]? Defaults to FALSE.
+#' @param minmax.norm (Optional) Should each program's score be min-max normalized to be on (0, 1)? Defaults to TRUE.
+#' @param minmax.epsilon (Optional) The tolerance used to ensure that program scores equal to 0 or 1 do not occur. Defaults to 0.01.
 #' @param n.cores (Optional) The number of cores used under the hood in \code{\link[UCell]{ScoreSignatures_UCell}}. Defaults to 2.
 #' @return Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix.
 #' @seealso \code{\link[UCell]{ScoreSignatures_UCell}}
@@ -31,7 +32,8 @@ geneProgramScoring <- function(expr.mat = NULL,
                                genes = NULL,
                                gene.clusters = NULL,
                                program.labels = NULL,
-                               minmax.norm = FALSE,
+                               minmax.norm = TRUE,
+                               minmax.epsilon = 1e-2,
                                n.cores = 2L) {
   # check inputs
   if (is.null(expr.mat) || is.null(genes) || is.null(gene.clusters)) { stop("Arguments to geneProgramScoring() are missing.") }
@@ -65,11 +67,11 @@ geneProgramScoring <- function(expr.mat = NULL,
   program_scores <- UCell::ScoreSignatures_UCell(counts_matrix,
                                                  features = program_list,
                                                  ncores = n.cores)
-  # min-max normalize if desired
+  # min-max normalize with tol to (0, 1) if desired
   if (minmax.norm) {
     program_scores <- purrr::map(seq(ncol(program_scores)), \(i) {
       scores <- program_scores[, i]
-      normed_scores <- (scores - min(scores)) / (max(scores) - min(scores))
+      normed_scores <- minmax.epsilon + (((scores - min(scores)) * (1 - 2 * minmax.epsilon)) / (max(scores) - min(scores)))
       return(normed_scores)
     })
     program_scores <- purrr::reduce(program_scores, cbind)
diff --git a/R/geneProgramSignificance.R b/R/geneProgramSignificance.R
@@ -0,0 +1,71 @@
+#' Test significance of gene program enrichment across a trajectory.
+#'
+#' @name geneProgramSignificance
+#' @author Jack Leary
+#' @import magrittr
+#' @importFrom purrr map reduce
+#' @importFrom gamlss gamlss LR.test
+#' @importFrom dplyr arrange desc mutate
+#' @importFrom stats p.adjust
+#' @description This function fits a Beta GAM with pseudotime as a covariate and gene program score as the response. The fitted model is then compared to a null, intercept-only model in order to determine the significance of the relationship between gene program and pseudotime.
+#' @param gene.programs A list of vectors of program scores as returned by \code{\link{geneProgramScoring}}. Defaults to NULL.
+#' @param pt A vector of pseudotime values for each cell. May contain NAs, which are handled internally. Defaults to NULL.
+#' @param program.labels (Optional) A character vector specifying a label for each gene cluster. Defaults to NULL.
+#' @param p.adj.method (Optional) The method used to adjust \emph{p}-values for multiple hypothesis testing. Defaults to "holm".
+#' @return A table of statistical output showing the significance of the association between pseudotime and program scores.
+#' @seealso \code{\link{geneProgramScoring}}
+#' @export
+#' @examples
+#' data(sim_counts)
+#' data(scLANE_models)
+#' data(sim_pseudotime)
+#' smoothed_dynamics <- smoothedCountsMatrix(scLANE_models,
+#'                                           pt = sim_pseudotime,
+#'                                           n.cores = 1L)
+#' gene_embed <- embedGenes(smoothed_dynamics$Lineage_A, n.cores = 1L)
+#' sim_counts <- geneProgramScoring(sim_counts,
+#'                                  genes = gene_embed$gene,
+#'                                  gene.clusters = gene_embed$leiden,
+#'                                  n.cores = 1L)
+#' program_enrichment_stats <- geneProgramSignificance(list(sim_counts$cluster_0),
+#'                                                     pt = sim_pseudotime$PT,
+#'                                                     program.labels = c("Program Name"))
+
+geneProgramSignificance <- function(gene.programs = NULL,
+                                    pt = NULL,
+                                    program.labels = NULL,
+                                    p.adj.method = "holm") {
+  # check inputs
+  if (is.null(gene.programs) || is.null(pt)) { stop("Inputs to geneProgramSignificance() are missing.") }
+  if (is.null(program.labels)) {
+    program.labels <- paste0("cluster_", seq(length(gene.programs)) - 1)
+  }
+  if (length(gene.programs) != length(program.labels)) { stop("program.labels must have the same length as gene.programs.") }
+  # iterate over gene programs and fit Beta GAMs
+  program_models <- purrr::map(seq(gene.programs), \(g) {
+    pseudotime_vec <- pt[!is.na(pt)]
+    program_score_vec <- gene.programs[[g]][!is.na(pt)]
+    alt_model <- gamlss::gamlss(program_score_vec ~ pseudotime_vec,
+                                family = "BE",
+                                data = NULL,
+                                control = gamlss::gamlss.control(trace = FALSE))
+    null_model <- gamlss::gamlss(program_score_vec ~ 1,
+                                 family = "BE",
+                                 data = NULL,
+                                 control = gamlss::gamlss.control(trace = FALSE))
+    lr_test_res <- gamlss::LR.test(null = null_model,
+                                   alternative = alt_model,
+                                   print = FALSE)
+    res <- data.frame(Program = program.labels[g],
+                      LRT_Stat = lr_test_res$chi,
+                      DF = lr_test_res$df,
+                      P_Value = lr_test_res$p.val)
+    return(res)
+  })
+  # prepare test results
+  program_sig_df <- purrr::reduce(program_models, rbind) %>%
+                    dplyr::arrange(P_Value, dplyr::desc(LRT_Stat)) %>%
+                    dplyr::mutate(P_Value = dplyr::if_else(P_Value == 0, 2e-16, P_Value),
+                                  P_Value_Adj = stats::p.adjust(P_Value, method = p.adj.method))
+  return(program_sig_df)
+}
diff --git a/man/geneProgramDrivers.Rd b/man/geneProgramDrivers.Rd
diff --git a/man/geneProgramScoring.Rd b/man/geneProgramScoring.Rd
diff --git a/man/geneProgramSignificance.Rd b/man/geneProgramSignificance.Rd
diff --git a/tests/testthat/test_scLANE.R b/tests/testthat/test_scLANE.R
@@ -227,6 +227,10 @@ withr::with_output_sink(tempfile(), {
   sim_data_seu <- geneProgramScoring(sim_data_seu,
                                      genes = gene_embedding$gene,
                                      gene.clusters = gene_embedding$leiden)
+  # gene program significance
+  program_significance <- geneProgramSignificance(list(sim_data$cluster_0),
+                                                  pt = pt_test$PT,
+                                                  program.labels = c("Cluster0"))
   # gene program drivers
   program_drivers <- geneProgramDrivers(sim_data,
                                         genes = gene_embedding$gene,
@@ -407,6 +411,11 @@ test_that("geneProgramScoring() output", {
   expect_equal(colnames(sim_data_seu@meta.data)[11], "cluster_1")
 })
 
+test_that("geneProgramSignificance() output", {
+  expect_s3_class(program_significance, "data.frame")
+  expect_equal(ncol(program_significance), 5)
+})
+
 test_that("geneProgramDrivers() output", {
   expect_s3_class(program_drivers, "data.frame")
   expect_s3_class(program_drivers_seu, "data.frame")