Merge pull request #103 from mskcc/enhancement/produce_metafusion_ref

Enhancement/produce metafusion ref
mskcc · Apr 30, 2024 · cbd2d0f · cbd2d0f
2 parents a1bdc7e + b093ecb
commit cbd2d0f
Show file tree

Hide file tree

Showing 24 changed files with 803 additions and 89 deletions.
diff --git a/bin/final_generate_v75_gene_bed.R b/bin/final_generate_v75_gene_bed.R
@@ -1,3 +1,4 @@
+#!/usr/local/bin/Rscript
 
 # __author__      = "Alexandria Dymun"
 # __email__       = "[email protected]"
@@ -6,42 +7,59 @@
 # __status__      = "Dev"
 
 
+suppressPackageStartupMessages({
+#    library(plyr)
+    library(dplyr)
+    library(data.table)
+    library(stringr)
+})
 
-library(dplyr)
-library(data.table)
-library(stringr)
-gtf <- rtracklayer::import('/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf')
-gtf_df <- as.data.frame(gtf)
+usage <- function() {
+    message("Usage:")
+    message("final_generate_v75_gene_bed.R <in.gff> <out.bed>")
+}
 
+args = commandArgs(TRUE)
+
+if (length(args)!=2) {
+    usage()
+    quit()
+}
 
 # Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75
 # Add introns to gtf, convert to gff3
 # bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\
 # -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0  \\
 # /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\
 # -o genes.INTRONS.gff3"
-# gff2bed < genes.INTRONS.gff3 > genes.INTRONS.agat.bed
 
+gtf <- rtracklayer::import(args[1])
+gtf_df <- as.data.frame(gtf)
 
-total.introns.bed <- fread(file="/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/genes.INTRONS.agat.bed", header = FALSE, stringsAsFactors = F, sep="\t", na.strings = "",data.table = F)
-colnames(total.introns.bed) <- c("chr","start","end","gene_id","tmp","strand","gene_biotype","type","V9","description")
-total.introns.bed$transcript_id <- gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"transcript_id=",n=2)[,2])
-total.introns.bed$gene_name <-gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"gene_name=",n=2)[,2])
+file.to_write <- args[2]
 
-transcript_ids <- unique(total.introns.bed$transcript_id)
-file.to_write <- "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/cleaned_metafusion_v75_gene.bed"
+gtf_df <- gtf_df %>%
+    rename(
+        chr = seqnames
+    ) %>%
+    select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
+    filter(type %in% c("exon","intron","UTR","CDS","cds","utr"))
 
-if(file.exists(file.to_write) ) {file.remove(file.to_write)}
 
-#START CLOCK: THE INDEXING TAKES A LONG TIME, LIKE 5 HOURS
+#START CLOCK
 ptm <- proc.time()
+print(ptm)
 
 # Index each transcript feature, incrementing when an intron is passed
 ## metafusion expects exon count 0 to (N(exons)-1)
 ## Forward strand: Exon 0 == Exon 1
 ### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT
-for (id in transcript_ids){
-    transcript <- total.introns.bed[total.introns.bed$transcript_id == id,]
+
+print(dim(gtf_df))
+print(length(unique(gtf_df$transcript_id)))
+
+modify_transcript <- function(transcript){
+
     # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
     if ("CDS" %in% transcript$type){
         transcript <- transcript[!transcript$type == "exon",]
@@ -51,7 +69,7 @@ for (id in transcript_ids){
     # Index features
     idx <- 0
     for (i in 1:nrow(transcript)){
-        transcript$idx [i]<- idx
+        transcript$idx[i]<- idx
         if (transcript$type[i] == "intron"){
             idx <- idx + 1
         }
@@ -68,7 +86,8 @@ for (id in transcript_ids){
     #Add "chr" prefix to chromosomes
     transcript$chr <- sapply("chr", paste0,  transcript$chr)
     #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
-    if ("CDS" %in% unique(transcript$type)){transcript[transcript$type == "CDS",]$type <- "cds"}
+    transcript <- transcript %>% mutate(type = as.character(type))
+    transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type))
     ## DETERMING UTR3 and UTR5
     ### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION.....
     if ("UTR" %in% unique(transcript$type)){
@@ -79,35 +98,42 @@ for (id in transcript_ids){
             transcript$type[transcript$end <= start_coding &  transcript$type == "UTR"] <- "utr5"
             transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3"
         }else {
+            #Reverse strand
             start_coding <- max(transcript[transcript$type == "cds","end"])
             stop_coding <- min(transcript[transcript$type == "cds","start"])
             transcript$type[transcript$end <= start_coding &  transcript$type == "UTR"] <- "utr3"
             transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
         }
     }
-    transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
-    write.table(transcript, file.to_write, append=TRUE, sep="\t", quote=F,  row.names=F, col.names=F)
+    #### Any exon that remains after teh cds change, is likely and untranslated region. change below
+
+    # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
+    #Forward strand
+    transcript$type[transcript$strand == "f" &  transcript$type == "exon" ] <- "utr5"
+    #Reverse strand
+    transcript$type[transcript$strand == "r" &  transcript$type == "exon"]<- "utr3"
+    #transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
+    expected_types <- c("cds","intron","utr3","utr5")
+    transcript <- transcript[transcript$type %in% c(expected_types),]
+    return(transcript)
 }
 
-time <- proc.time() - ptm
-time
-#
-# user    system   elapsed
-# 16657.116    32.227 16741.382
-
-
-new.bed <- fread(file.to_write,data.table = F)
-colnames(new.bed) <- c("chr","start","end","transcript_id","type","idx","strand","gene_name","gene_id")
-
-#### Any exon that remains after teh cds change, is likely and untranslated region. change below
-
-# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
-#Forward strand
-new.bed$type[new.bed$strand == "f" &  new.bed$type == "exon" ] <- "utr5"
-#Reverse strand
-new.bed$type[new.bed$strand == "r" &  new.bed$type == "exon"]<- "utr3"
+if(file.exists(file.to_write) ) {file.remove(file.to_write)}
 
-expected_types <- c("cds","intron","utr3","utr5")
-new.bed.ready <- new.bed[new.bed$type %in% c(expected_types),]
+gtf_df_modified <- gtf_df %>%
+    group_by(transcript_id,.drop = FALSE) %>%
+    group_modify(~ modify_transcript(.x)) %>%
+    select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>%
+    arrange(chr,start,end)
 
-write.table(new.bed.ready, "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/v75_gene.bed",  sep="\t", quote=F,  row.names=F, col.names=F)
+time <- proc.time() - ptm
+print(time)
+
+write.table(
+    gtf_df_modified,
+    file.to_write,
+    sep="\t",
+    quote=F,
+    row.names=F,
+    col.names=F
+)
diff --git a/bin/make_gene_info_for_forte.R b/bin/make_gene_info_for_forte.R
@@ -1,27 +1,58 @@
-
+#!/usr/local/bin/Rscript
 # __author__      = "Alexandria Dymun"
 # __email__       = "[email protected]"
 # __contributor__ = "Anne Marie Noronha ([email protected])"
 # __version__     = "0.0.1"
 # __status__      = "Dev"
 
 
+suppressPackageStartupMessages({
+    library(dplyr)
+    library(stringr)
+})
 
-library(dplyr)
-library(stringr)
-library(argparse)
-
-opt = commandArgs(TRUE)
-
-parser=ArgumentParser()
-parser$add_argument("-p",'--primary_gtf',type="character",default = NULL,help = "Primary GTF, should match your bed file and arriba. Assumes ARRIBA is on primary gtf")
-parser$add_argument("-c",'--fc_custom_bed_gene_names',type="character",default = NULL,help = "Fusioncatcher custom genes bed file")
-parser$add_argument("-s",'--star_fusion_ref',type="character",default = NULL,help = "StarFusion GTF")
-parser$add_argument("-f",'--fusioncatcher_ref',type="character",default = NULL,help = "Fusioncatcher GTF")
-parser$add_argument("-o",'--outputDir',type="character",default = NULL,help = "outputDirectory to write gene_info and excess gene list")
-
-opt=parser$parse_args()
-
+usage <- function() {
+    message("Usage:")
+    message("make_gene_info_for_forte.R --primary_gtf <file.gtf> --fc_custom_bed_gene_names <custom_genes.bed> --star_fusion_ref <ref_annot.gtf> --fusioncatcher_ref <organism.gtf> --outputDir <outputpath>")
+}
+
+args = commandArgs(TRUE)
+
+if (is.null(args) | length(args)<1) {
+    usage()
+    quit()
+}
+
+#' Parse out options from a string without recourse to optparse
+#'
+#' @param x Long-form argument list like --opt1 val1 --opt2 val2
+#'
+#' @return named list of options and values similar to optparse
+
+parse_args <- function(x){
+    args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1]
+    args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE))
+    # Ensure the option vectors are length 2 (key/ value) to catch empty ones
+    args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z})
+
+    parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1])))
+    parsed_args[! is.na(parsed_args)]
+}
+
+opt <- parse_args(paste(args,collapse=" "))
+
+required_args <- c(
+    "primary_gtf",
+    "fc_custom_bed_gene_names",
+    "star_fusion_ref",
+    "fusioncatcher_ref",
+    "outputDir"
+)
+if (length(setdiff(required_args,names(opt))) > 0) {
+    message("Missing required arguments")
+    usage()
+    quit()
+}
 
 ### primary gtf is v75, also used in arriba
 primary_gtf <-  as.data.frame(rtracklayer::import(opt$primary_gtf))
@@ -59,7 +90,7 @@ versioned_gtf <-unlist(sapply(names(unique_id_to_names)[names(unique_id_to_names
 }))
 
 
-add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
+add_these_excess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
     add_symbols_and_ids <- unique_id_to_names[[name]]
     add_symbols_and_ids <- add_symbols_and_ids[!add_symbols_and_ids$gene_id %in% gene_info$gene_id,]
     if(name %in% versioned_gtf){
@@ -70,7 +101,7 @@ add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names
 
 }))
 # Excess genes being added (genes will be flagged as gene not in v75)
-gene_info <- rbind(gene_info,add_these_exess_gene_ids)
+gene_info <- rbind(gene_info,add_these_excess_gene_ids)
 
 gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F)
 
@@ -79,5 +110,18 @@ gene_info$Symbol <- gene_info$gene_name
 
 gene_info <- gene_info[,c("Symbol","Synonyms")]
 
-write.table(gene_info,paste0(opt$outputDir,"/gene.info"),sep ="\t",quote = F,row.names = F)
-write.table(add_these_exess_gene_ids,paste0(opt$outputDir,"/excess_gene_ids.txt",sep ="\t",quote = F,row.names = F)
+write.table(
+    gene_info,
+    paste0(opt$outputDir,"/gene.info"),
+    sep ="\t",
+    quote = F,
+    row.names = F
+
+)
+write.table(
+    add_these_excess_gene_ids,
+    paste0(opt$outputDir,"/excess_gene_ids.txt"),
+    sep ="\t",
+    quote = F,
+    row.names = F
+)
diff --git a/conf/igenomes.config b/conf/igenomes.config
@@ -31,8 +31,6 @@ params {
                 }
             }
             metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/blocklist_breakpoints.bedpe.gz"
-            metafusion_gene_bed  = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/v75_gene.bed.gz"
-            metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/gene_info_20230714.txt"
             ensembl_version = 75
         }
         'GRCh38' {
@@ -49,8 +47,6 @@ params {
             starfusion_url = null
             cdna           = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz"
             metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe"
-            metafusion_gene_bed  = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/v75_gene.bed"
-            metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/gene_info_20230714.txt"
             ensembl_version = 75
 
         }

diff --git a/conf/modules.config b/conf/modules.config
@@ -187,7 +187,7 @@ process {
         ]
     }
 
-    withName: METAFUSION {
+    withName: METAFUSION_RUN {
         ext.args = "--num_tools=${params.fusion_tool_cutoff}"
         publishDir = [
             path: { "$params.outdir/analysis/${meta.id}/metafusion/intermediates" },
@@ -197,6 +197,24 @@ process {
 
         ]
     }
+    withName: 'METAFUSION_GENEBED' {
+        storeDir = { "${params.reference_base}/${params.genome}/metafusion/genebed" }
+        publishDir = [
+            enabled: false,
+        ]
+    }
+    withName: 'METAFUSION_GENEINFO' {
+        storeDir = { "${params.reference_base}/${params.genome}/metafusion/geneinfo" }
+        publishDir = [
+            enabled: false,
+        ]
+    }
+    withName: 'AGAT_SPADDINTRONS' {
+        storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" }
+        publishDir = [
+            enabled: false,
+        ]
+    }
 
     withName: ADD_FLAG {
         publishDir = [

diff --git a/modules.json b/modules.json
@@ -16,6 +16,11 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "agat/spaddintrons": {
+                        "branch": "master",
+                        "git_sha": "6898156da3604a6bdf26c36036053a970050fea0",
+                        "installed_by": ["modules"]
+                    },
                     "arriba": {
                         "branch": "master",
                         "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",

diff --git a/modules/local/metafusion/container/Dockerfile b/modules/local/metafusion/container/Dockerfile
@@ -21,3 +21,5 @@ RUN conda clean -afy \
     && find /opt/conda/ -follow -type f -name '*.pyc' -delete \
     && find /opt/conda/ -follow -type f -name '*.js.map' -delete
 
+ENTRYPOINT []
+
diff --git a/modules/local/metafusion/genebed/environment.yml b/modules/local/metafusion/genebed/environment.yml
@@ -0,0 +1,16 @@
+name: metafusion_genebed
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+  - r
+dependencies:
+  - "conda-forge::r-base=4.3.3"
+  - "bioconda::bioconductor-rtracklayer=1.62.0"
+  - "conda-forge::r-optparse=1.7.4"
+  - "conda-forge::r-dplyr=1.1.4"
+  - "conda-forge::r-stringr=1.5.1"
+  - "conda-forge::r-plyr=1.8.9"
+  - "conda-forge::r-data.table=1.15.2"
+  - "conda-forge::coreutils=9.5"
+  - "r::r-argparse=2.2.2"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -21,3 +21,5 @@ RUN conda clean -afy \
		&& find /opt/conda/ -follow -type f -name '*.pyc' -delete \
		&& find /opt/conda/ -follow -type f -name '*.js.map' -delete

		ENTRYPOINT []