From ee83bd3931911f0dc796f2d054b115e512be76a5 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:06:07 -0700 Subject: [PATCH] Reorganize datasets, separate out subset --- R/datasets_nf.R | 122 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 R/datasets_nf.R diff --git a/R/datasets_nf.R b/R/datasets_nf.R new file mode 100644 index 00000000..b4d00445 --- /dev/null +++ b/R/datasets_nf.R @@ -0,0 +1,122 @@ +#' Create datasets for Sarek-called somatic or germline variants results +#' +#' Organize variant call files from Nextflow Sarek into 3-4 datasets, +#' grouping files by variant type and workflow with titles having the format: +#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". +#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. +#' This makes sense for NF because Germline calls can be treated differently. +#' This uses latest version of all files and creates a Draft version of the dataset. +#' +#' Since we basically just need the syn entity id, variant type, and workflow to group the files. +#' Instead of getting this info through running `map_*` as in the example, +#' you may prefer using a fileview, in which case you just need to download a table from a fileview +#' that has `id` => `output_id` + the `dataType` and `workflow` annotations. +#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_ +#' files are annotated, then you have to use `map_*`. +#' +#' Finally, datasets cannot use the same name if stored in the same project, +#' so if there are multiple batches, the names will have to be made unique by adding +#' the batch number, source data id, processing date, or whatever makes sense. +#' +#' @inheritParams new_dataset +#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives. +#' @param workflow One of workflows used. +#' @param verbose Optional, whether to be verbose -- defaults to TRUE. +#' @import data.table +#' @return A list of dataset objects. +#' @export +#' @examples +#'\dontrun{ +#' syn_out <- "syn26648589" +#' m <- map_sample_output_sarek(syn_out) +#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project +#'} +nf_sarek_datasets <- function(output_map, + parent, + workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"), + verbose = TRUE, + dry_run = TRUE) { + + output_map <- as.data.table(output_map) + if(!is.null(output_map$dataType)) { + data_type <- unique(output_map$dataType) + if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.") + gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T) + if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.") + gvtype <- switch(gvtype, + SomaticVariants = "Somatic", + GermlineVariants = "Germline") + + } else { + # Detect genomic variants type from first path name + gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) { + "Somatic" + } else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) { + "Germline" + } else { + stop("Could not assign either Germline or Somatic labels based on main output folder. + Check whether folder contains mixed types or is not the right one.") + } + } + pattern <- "vcf.gz(.tbi)?$" + workflow <- match.arg(workflow) + datasets <- list() + for(i in workflow) { + dataset <- output_map[workflow == i & grepl(pattern, output_name)] + if(nrow(dataset)) { + if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files") + name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline") + dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE) + if(dry_run) datasets[[i]] <- syn_dataset else datasets[[i]] <- .syn$store(syn_dataset) + } + } + + return(datasets) + +} + + +#' Create dataset for STAR-Salmon expression quantification results +#' +#' With a level-3 manifest that is created from `annotate_expression`, +#' calls `new_dataset` to make quantification files (.sf) into dataset. +#' Uses latest version of the files and creates a "Draft" dataset. +#' See `nf_sarek_datasets`. +#' +#' @inheritParams new_dataset +#' @inheritParams nf_sarek_datasets +#' @param manifest A table of annotated data manifest from `annotate_expression`. +#' @export +nf_star_salmon_datasets <- function(manifest, + parent, + dry_run = TRUE) { + + items <- manifest$entityId + new_dataset(name = "Gene Expression Quantification from RNA-seq", + parent = parent, + items = items, + dry_run = dry_run) +} + +#' Create dataset for CNVKit results +#' +#' Create dataset from all files in CNVKit output +#' +#' @inheritParams new_dataset +#' @param syn_out Output folder called 'cnvkit' +#' @export +nf_cnv_dataset <- function(syn_out, + parent, + dry_run = TRUE) { + + files <- walk(syn_out) + files <- unlist(files) + df <- as.data.frame(matrix(files, ncol = 2, byrow = TRUE)) + names(df) <- c("Filename", "id") + df <- df[grepl("cnr$|cns$|cnn$|bed$|pdf$|png$", df$Filename), ] + items <- df$id + new_dataset(name = "Copy Number Variant - CNVkit", + parent = parent, + items = items, + dry_run = dry_run) +} \ No newline at end of file