-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #103 from mskcc/enhancement/produce_metafusion_ref
Enhancement/produce metafusion ref
- Loading branch information
Showing
24 changed files
with
803 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
#!/usr/local/bin/Rscript | ||
|
||
# __author__ = "Alexandria Dymun" | ||
# __email__ = "[email protected]" | ||
|
@@ -6,42 +7,59 @@ | |
# __status__ = "Dev" | ||
|
||
|
||
suppressPackageStartupMessages({ | ||
# library(plyr) | ||
library(dplyr) | ||
library(data.table) | ||
library(stringr) | ||
}) | ||
|
||
library(dplyr) | ||
library(data.table) | ||
library(stringr) | ||
gtf <- rtracklayer::import('/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf') | ||
gtf_df <- as.data.frame(gtf) | ||
usage <- function() { | ||
message("Usage:") | ||
message("final_generate_v75_gene_bed.R <in.gff> <out.bed>") | ||
} | ||
|
||
args = commandArgs(TRUE) | ||
|
||
if (length(args)!=2) { | ||
usage() | ||
quit() | ||
} | ||
|
||
# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75 | ||
# Add introns to gtf, convert to gff3 | ||
# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\ | ||
# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\ | ||
# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\ | ||
# -o genes.INTRONS.gff3" | ||
# gff2bed < genes.INTRONS.gff3 > genes.INTRONS.agat.bed | ||
|
||
gtf <- rtracklayer::import(args[1]) | ||
gtf_df <- as.data.frame(gtf) | ||
|
||
total.introns.bed <- fread(file="/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/genes.INTRONS.agat.bed", header = FALSE, stringsAsFactors = F, sep="\t", na.strings = "",data.table = F) | ||
colnames(total.introns.bed) <- c("chr","start","end","gene_id","tmp","strand","gene_biotype","type","V9","description") | ||
total.introns.bed$transcript_id <- gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"transcript_id=",n=2)[,2]) | ||
total.introns.bed$gene_name <-gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"gene_name=",n=2)[,2]) | ||
file.to_write <- args[2] | ||
|
||
transcript_ids <- unique(total.introns.bed$transcript_id) | ||
file.to_write <- "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/cleaned_metafusion_v75_gene.bed" | ||
gtf_df <- gtf_df %>% | ||
rename( | ||
chr = seqnames | ||
) %>% | ||
select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% | ||
filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) | ||
|
||
if(file.exists(file.to_write) ) {file.remove(file.to_write)} | ||
|
||
#START CLOCK: THE INDEXING TAKES A LONG TIME, LIKE 5 HOURS | ||
#START CLOCK | ||
ptm <- proc.time() | ||
print(ptm) | ||
|
||
# Index each transcript feature, incrementing when an intron is passed | ||
## metafusion expects exon count 0 to (N(exons)-1) | ||
## Forward strand: Exon 0 == Exon 1 | ||
### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT | ||
for (id in transcript_ids){ | ||
transcript <- total.introns.bed[total.introns.bed$transcript_id == id,] | ||
|
||
print(dim(gtf_df)) | ||
print(length(unique(gtf_df$transcript_id))) | ||
|
||
modify_transcript <- function(transcript){ | ||
|
||
# Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another | ||
if ("CDS" %in% transcript$type){ | ||
transcript <- transcript[!transcript$type == "exon",] | ||
|
@@ -51,7 +69,7 @@ for (id in transcript_ids){ | |
# Index features | ||
idx <- 0 | ||
for (i in 1:nrow(transcript)){ | ||
transcript$idx [i]<- idx | ||
transcript$idx[i]<- idx | ||
if (transcript$type[i] == "intron"){ | ||
idx <- idx + 1 | ||
} | ||
|
@@ -68,7 +86,8 @@ for (id in transcript_ids){ | |
#Add "chr" prefix to chromosomes | ||
transcript$chr <- sapply("chr", paste0, transcript$chr) | ||
#Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later | ||
if ("CDS" %in% unique(transcript$type)){transcript[transcript$type == "CDS",]$type <- "cds"} | ||
transcript <- transcript %>% mutate(type = as.character(type)) | ||
transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) | ||
## DETERMING UTR3 and UTR5 | ||
### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION..... | ||
if ("UTR" %in% unique(transcript$type)){ | ||
|
@@ -79,35 +98,42 @@ for (id in transcript_ids){ | |
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr5" | ||
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3" | ||
}else { | ||
#Reverse strand | ||
start_coding <- max(transcript[transcript$type == "cds","end"]) | ||
stop_coding <- min(transcript[transcript$type == "cds","start"]) | ||
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr3" | ||
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5" | ||
} | ||
} | ||
transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )] | ||
write.table(transcript, file.to_write, append=TRUE, sep="\t", quote=F, row.names=F, col.names=F) | ||
#### Any exon that remains after teh cds change, is likely and untranslated region. change below | ||
|
||
# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) | ||
#Forward strand | ||
transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" | ||
#Reverse strand | ||
transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" | ||
#transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )] | ||
expected_types <- c("cds","intron","utr3","utr5") | ||
transcript <- transcript[transcript$type %in% c(expected_types),] | ||
return(transcript) | ||
} | ||
|
||
time <- proc.time() - ptm | ||
time | ||
# | ||
# user system elapsed | ||
# 16657.116 32.227 16741.382 | ||
|
||
|
||
new.bed <- fread(file.to_write,data.table = F) | ||
colnames(new.bed) <- c("chr","start","end","transcript_id","type","idx","strand","gene_name","gene_id") | ||
|
||
#### Any exon that remains after teh cds change, is likely and untranslated region. change below | ||
|
||
# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) | ||
#Forward strand | ||
new.bed$type[new.bed$strand == "f" & new.bed$type == "exon" ] <- "utr5" | ||
#Reverse strand | ||
new.bed$type[new.bed$strand == "r" & new.bed$type == "exon"]<- "utr3" | ||
if(file.exists(file.to_write) ) {file.remove(file.to_write)} | ||
|
||
expected_types <- c("cds","intron","utr3","utr5") | ||
new.bed.ready <- new.bed[new.bed$type %in% c(expected_types),] | ||
gtf_df_modified <- gtf_df %>% | ||
group_by(transcript_id,.drop = FALSE) %>% | ||
group_modify(~ modify_transcript(.x)) %>% | ||
select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% | ||
arrange(chr,start,end) | ||
|
||
write.table(new.bed.ready, "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/v75_gene.bed", sep="\t", quote=F, row.names=F, col.names=F) | ||
time <- proc.time() - ptm | ||
print(time) | ||
|
||
write.table( | ||
gtf_df_modified, | ||
file.to_write, | ||
sep="\t", | ||
quote=F, | ||
row.names=F, | ||
col.names=F | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,58 @@ | ||
|
||
#!/usr/local/bin/Rscript | ||
# __author__ = "Alexandria Dymun" | ||
# __email__ = "[email protected]" | ||
# __contributor__ = "Anne Marie Noronha ([email protected])" | ||
# __version__ = "0.0.1" | ||
# __status__ = "Dev" | ||
|
||
|
||
suppressPackageStartupMessages({ | ||
library(dplyr) | ||
library(stringr) | ||
}) | ||
|
||
library(dplyr) | ||
library(stringr) | ||
library(argparse) | ||
|
||
opt = commandArgs(TRUE) | ||
|
||
parser=ArgumentParser() | ||
parser$add_argument("-p",'--primary_gtf',type="character",default = NULL,help = "Primary GTF, should match your bed file and arriba. Assumes ARRIBA is on primary gtf") | ||
parser$add_argument("-c",'--fc_custom_bed_gene_names',type="character",default = NULL,help = "Fusioncatcher custom genes bed file") | ||
parser$add_argument("-s",'--star_fusion_ref',type="character",default = NULL,help = "StarFusion GTF") | ||
parser$add_argument("-f",'--fusioncatcher_ref',type="character",default = NULL,help = "Fusioncatcher GTF") | ||
parser$add_argument("-o",'--outputDir',type="character",default = NULL,help = "outputDirectory to write gene_info and excess gene list") | ||
|
||
opt=parser$parse_args() | ||
|
||
usage <- function() { | ||
message("Usage:") | ||
message("make_gene_info_for_forte.R --primary_gtf <file.gtf> --fc_custom_bed_gene_names <custom_genes.bed> --star_fusion_ref <ref_annot.gtf> --fusioncatcher_ref <organism.gtf> --outputDir <outputpath>") | ||
} | ||
|
||
args = commandArgs(TRUE) | ||
|
||
if (is.null(args) | length(args)<1) { | ||
usage() | ||
quit() | ||
} | ||
|
||
#' Parse out options from a string without recourse to optparse | ||
#' | ||
#' @param x Long-form argument list like --opt1 val1 --opt2 val2 | ||
#' | ||
#' @return named list of options and values similar to optparse | ||
|
||
parse_args <- function(x){ | ||
args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] | ||
args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) | ||
# Ensure the option vectors are length 2 (key/ value) to catch empty ones | ||
args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) | ||
|
||
parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1]))) | ||
parsed_args[! is.na(parsed_args)] | ||
} | ||
|
||
opt <- parse_args(paste(args,collapse=" ")) | ||
|
||
required_args <- c( | ||
"primary_gtf", | ||
"fc_custom_bed_gene_names", | ||
"star_fusion_ref", | ||
"fusioncatcher_ref", | ||
"outputDir" | ||
) | ||
if (length(setdiff(required_args,names(opt))) > 0) { | ||
message("Missing required arguments") | ||
usage() | ||
quit() | ||
} | ||
|
||
### primary gtf is v75, also used in arriba | ||
primary_gtf <- as.data.frame(rtracklayer::import(opt$primary_gtf)) | ||
|
@@ -59,7 +90,7 @@ versioned_gtf <-unlist(sapply(names(unique_id_to_names)[names(unique_id_to_names | |
})) | ||
|
||
|
||
add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){ | ||
add_these_excess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){ | ||
add_symbols_and_ids <- unique_id_to_names[[name]] | ||
add_symbols_and_ids <- add_symbols_and_ids[!add_symbols_and_ids$gene_id %in% gene_info$gene_id,] | ||
if(name %in% versioned_gtf){ | ||
|
@@ -70,7 +101,7 @@ add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names | |
|
||
})) | ||
# Excess genes being added (genes will be flagged as gene not in v75) | ||
gene_info <- rbind(gene_info,add_these_exess_gene_ids) | ||
gene_info <- rbind(gene_info,add_these_excess_gene_ids) | ||
|
||
gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F) | ||
|
||
|
@@ -79,5 +110,18 @@ gene_info$Symbol <- gene_info$gene_name | |
|
||
gene_info <- gene_info[,c("Symbol","Synonyms")] | ||
|
||
write.table(gene_info,paste0(opt$outputDir,"/gene.info"),sep ="\t",quote = F,row.names = F) | ||
write.table(add_these_exess_gene_ids,paste0(opt$outputDir,"/excess_gene_ids.txt",sep ="\t",quote = F,row.names = F) | ||
write.table( | ||
gene_info, | ||
paste0(opt$outputDir,"/gene.info"), | ||
sep ="\t", | ||
quote = F, | ||
row.names = F | ||
|
||
) | ||
write.table( | ||
add_these_excess_gene_ids, | ||
paste0(opt$outputDir,"/excess_gene_ids.txt"), | ||
sep ="\t", | ||
quote = F, | ||
row.names = F | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
name: metafusion_genebed | ||
channels: | ||
- conda-forge | ||
- bioconda | ||
- defaults | ||
- r | ||
dependencies: | ||
- "conda-forge::r-base=4.3.3" | ||
- "bioconda::bioconductor-rtracklayer=1.62.0" | ||
- "conda-forge::r-optparse=1.7.4" | ||
- "conda-forge::r-dplyr=1.1.4" | ||
- "conda-forge::r-stringr=1.5.1" | ||
- "conda-forge::r-plyr=1.8.9" | ||
- "conda-forge::r-data.table=1.15.2" | ||
- "conda-forge::coreutils=9.5" | ||
- "r::r-argparse=2.2.2" |
Oops, something went wrong.