Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement/produce metafusion ref #103

Merged
merged 19 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 66 additions & 40 deletions bin/final_generate_v75_gene_bed.R
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/local/bin/Rscript

# __author__ = "Alexandria Dymun"
# __email__ = "[email protected]"
Expand All @@ -6,42 +7,59 @@
# __status__ = "Dev"


suppressPackageStartupMessages({
# library(plyr)
library(dplyr)
library(data.table)
library(stringr)
})

library(dplyr)
library(data.table)
library(stringr)
gtf <- rtracklayer::import('/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf')
gtf_df <- as.data.frame(gtf)
usage <- function() {
message("Usage:")
message("final_generate_v75_gene_bed.R <in.gff> <out.bed>")
}

args = commandArgs(TRUE)

if (length(args)!=2) {
usage()
quit()
}

# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75
# Add introns to gtf, convert to gff3
# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\
# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\
# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\
# -o genes.INTRONS.gff3"
# gff2bed < genes.INTRONS.gff3 > genes.INTRONS.agat.bed

gtf <- rtracklayer::import(args[1])
gtf_df <- as.data.frame(gtf)

total.introns.bed <- fread(file="/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/genes.INTRONS.agat.bed", header = FALSE, stringsAsFactors = F, sep="\t", na.strings = "",data.table = F)
colnames(total.introns.bed) <- c("chr","start","end","gene_id","tmp","strand","gene_biotype","type","V9","description")
total.introns.bed$transcript_id <- gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"transcript_id=",n=2)[,2])
total.introns.bed$gene_name <-gsub("\\;.*","",str_split_fixed(total.introns.bed$description,"gene_name=",n=2)[,2])
file.to_write <- args[2]

transcript_ids <- unique(total.introns.bed$transcript_id)
file.to_write <- "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/cleaned_metafusion_v75_gene.bed"
gtf_df <- gtf_df %>%
rename(
chr = seqnames
) %>%
select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>%
filter(type %in% c("exon","intron","UTR","CDS","cds","utr"))

if(file.exists(file.to_write) ) {file.remove(file.to_write)}

#START CLOCK: THE INDEXING TAKES A LONG TIME, LIKE 5 HOURS
#START CLOCK
ptm <- proc.time()
print(ptm)

# Index each transcript feature, incrementing when an intron is passed
## metafusion expects exon count 0 to (N(exons)-1)
## Forward strand: Exon 0 == Exon 1
### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT
for (id in transcript_ids){
transcript <- total.introns.bed[total.introns.bed$transcript_id == id,]

print(dim(gtf_df))
print(length(unique(gtf_df$transcript_id)))

modify_transcript <- function(transcript){

# Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another
if ("CDS" %in% transcript$type){
transcript <- transcript[!transcript$type == "exon",]
Expand All @@ -51,7 +69,7 @@ for (id in transcript_ids){
# Index features
idx <- 0
for (i in 1:nrow(transcript)){
transcript$idx [i]<- idx
transcript$idx[i]<- idx
if (transcript$type[i] == "intron"){
idx <- idx + 1
}
Expand All @@ -68,7 +86,8 @@ for (id in transcript_ids){
#Add "chr" prefix to chromosomes
transcript$chr <- sapply("chr", paste0, transcript$chr)
#Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later
if ("CDS" %in% unique(transcript$type)){transcript[transcript$type == "CDS",]$type <- "cds"}
transcript <- transcript %>% mutate(type = as.character(type))
transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type))
## DETERMING UTR3 and UTR5
### INSTEAD OF START AND STOP, USE CDS LOCATIONS AND STRAND INFORMATION.....
if ("UTR" %in% unique(transcript$type)){
Expand All @@ -79,35 +98,42 @@ for (id in transcript_ids){
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr5"
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr3"
}else {
#Reverse strand
start_coding <- max(transcript[transcript$type == "cds","end"])
stop_coding <- min(transcript[transcript$type == "cds","start"])
transcript$type[transcript$end <= start_coding & transcript$type == "UTR"] <- "utr3"
transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5"
}
}
transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
write.table(transcript, file.to_write, append=TRUE, sep="\t", quote=F, row.names=F, col.names=F)
#### Any exon that remains after teh cds change, is likely and untranslated region. change below

# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
#Forward strand
transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5"
#Reverse strand
transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3"
#transcript <- transcript[,c("chr", "start", "end", "transcript_id", "type", "idx", "strand", "gene_name", "gene_id" )]
expected_types <- c("cds","intron","utr3","utr5")
transcript <- transcript[transcript$type %in% c(expected_types),]
return(transcript)
}

time <- proc.time() - ptm
time
#
# user system elapsed
# 16657.116 32.227 16741.382


new.bed <- fread(file.to_write,data.table = F)
colnames(new.bed) <- c("chr","start","end","transcript_id","type","idx","strand","gene_name","gene_id")

#### Any exon that remains after teh cds change, is likely and untranslated region. change below

# Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)
#Forward strand
new.bed$type[new.bed$strand == "f" & new.bed$type == "exon" ] <- "utr5"
#Reverse strand
new.bed$type[new.bed$strand == "r" & new.bed$type == "exon"]<- "utr3"
if(file.exists(file.to_write) ) {file.remove(file.to_write)}

expected_types <- c("cds","intron","utr3","utr5")
new.bed.ready <- new.bed[new.bed$type %in% c(expected_types),]
gtf_df_modified <- gtf_df %>%
group_by(transcript_id,.drop = FALSE) %>%
group_modify(~ modify_transcript(.x)) %>%
select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>%
arrange(chr,start,end)

write.table(new.bed.ready, "/work/ccs/pintoa1/metafusion_refs/meta_fusion_bed_generation/v75_gene.bed", sep="\t", quote=F, row.names=F, col.names=F)
time <- proc.time() - ptm
print(time)

write.table(
gtf_df_modified,
file.to_write,
sep="\t",
quote=F,
row.names=F,
col.names=F
)
84 changes: 64 additions & 20 deletions bin/make_gene_info_for_forte.R
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,27 +1,58 @@

#!/usr/local/bin/Rscript
# __author__ = "Alexandria Dymun"
# __email__ = "[email protected]"
# __contributor__ = "Anne Marie Noronha ([email protected])"
# __version__ = "0.0.1"
# __status__ = "Dev"


suppressPackageStartupMessages({
library(dplyr)
library(stringr)
})

library(dplyr)
library(stringr)
library(argparse)

opt = commandArgs(TRUE)

parser=ArgumentParser()
parser$add_argument("-p",'--primary_gtf',type="character",default = NULL,help = "Primary GTF, should match your bed file and arriba. Assumes ARRIBA is on primary gtf")
parser$add_argument("-c",'--fc_custom_bed_gene_names',type="character",default = NULL,help = "Fusioncatcher custom genes bed file")
parser$add_argument("-s",'--star_fusion_ref',type="character",default = NULL,help = "StarFusion GTF")
parser$add_argument("-f",'--fusioncatcher_ref',type="character",default = NULL,help = "Fusioncatcher GTF")
parser$add_argument("-o",'--outputDir',type="character",default = NULL,help = "outputDirectory to write gene_info and excess gene list")

opt=parser$parse_args()

usage <- function() {
message("Usage:")
message("make_gene_info_for_forte.R --primary_gtf <file.gtf> --fc_custom_bed_gene_names <custom_genes.bed> --star_fusion_ref <ref_annot.gtf> --fusioncatcher_ref <organism.gtf> --outputDir <outputpath>")
}

args = commandArgs(TRUE)

if (is.null(args) | length(args)<1) {
usage()
quit()
}

#' Parse out options from a string without recourse to optparse
#'
#' @param x Long-form argument list like --opt1 val1 --opt2 val2
#'
#' @return named list of options and values similar to optparse

parse_args <- function(x){
args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1]
args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE))
# Ensure the option vectors are length 2 (key/ value) to catch empty ones
args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z})

parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) gsub('-','_',x[1])))
parsed_args[! is.na(parsed_args)]
}

opt <- parse_args(paste(args,collapse=" "))

required_args <- c(
"primary_gtf",
"fc_custom_bed_gene_names",
"star_fusion_ref",
"fusioncatcher_ref",
"outputDir"
)
if (length(setdiff(required_args,names(opt))) > 0) {
message("Missing required arguments")
usage()
quit()
}

### primary gtf is v75, also used in arriba
primary_gtf <- as.data.frame(rtracklayer::import(opt$primary_gtf))
Expand Down Expand Up @@ -59,7 +90,7 @@ versioned_gtf <-unlist(sapply(names(unique_id_to_names)[names(unique_id_to_names
}))


add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
add_these_excess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names(unique_id_to_names) != "primary"],function(name){
add_symbols_and_ids <- unique_id_to_names[[name]]
add_symbols_and_ids <- add_symbols_and_ids[!add_symbols_and_ids$gene_id %in% gene_info$gene_id,]
if(name %in% versioned_gtf){
Expand All @@ -70,7 +101,7 @@ add_these_exess_gene_ids <- do.call(rbind,lapply(names(unique_id_to_names)[names

}))
# Excess genes being added (genes will be flagged as gene not in v75)
gene_info <- rbind(gene_info,add_these_exess_gene_ids)
gene_info <- rbind(gene_info,add_these_excess_gene_ids)

gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F)

Expand All @@ -79,5 +110,18 @@ gene_info$Symbol <- gene_info$gene_name

gene_info <- gene_info[,c("Symbol","Synonyms")]

write.table(gene_info,paste0(opt$outputDir,"/gene.info"),sep ="\t",quote = F,row.names = F)
write.table(add_these_exess_gene_ids,paste0(opt$outputDir,"/excess_gene_ids.txt",sep ="\t",quote = F,row.names = F)
write.table(
gene_info,
paste0(opt$outputDir,"/gene.info"),
sep ="\t",
quote = F,
row.names = F

)
write.table(
add_these_excess_gene_ids,
paste0(opt$outputDir,"/excess_gene_ids.txt"),
sep ="\t",
quote = F,
row.names = F
)
4 changes: 0 additions & 4 deletions conf/igenomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ params {
}
}
metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/blocklist_breakpoints.bedpe.gz"
metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/v75_gene.bed.gz"
metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/gene_info_20230714.txt"
ensembl_version = 75
}
'GRCh38' {
Expand All @@ -49,8 +47,6 @@ params {
starfusion_url = null
cdna = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz"
metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe"
metafusion_gene_bed = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/v75_gene.bed"
metafusion_gene_info = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/gene_info_20230714.txt"
ensembl_version = 75

}
Expand Down
20 changes: 19 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ process {
]
}

withName: METAFUSION {
withName: METAFUSION_RUN {
ext.args = "--num_tools=${params.fusion_tool_cutoff}"
publishDir = [
path: { "$params.outdir/analysis/${meta.id}/metafusion/intermediates" },
Expand All @@ -197,6 +197,24 @@ process {

]
}
withName: 'METAFUSION_GENEBED' {
storeDir = { "${params.reference_base}/${params.genome}/metafusion/genebed" }
publishDir = [
enabled: false,
]
}
withName: 'METAFUSION_GENEINFO' {
storeDir = { "${params.reference_base}/${params.genome}/metafusion/geneinfo" }
publishDir = [
enabled: false,
]
}
withName: 'AGAT_SPADDINTRONS' {
storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" }
publishDir = [
enabled: false,
]
}

withName: ADD_FLAG {
publishDir = [
Expand Down
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"agat/spaddintrons": {
"branch": "master",
"git_sha": "6898156da3604a6bdf26c36036053a970050fea0",
"installed_by": ["modules"]
},
"arriba": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
Expand Down
2 changes: 2 additions & 0 deletions modules/local/metafusion/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ RUN conda clean -afy \
&& find /opt/conda/ -follow -type f -name '*.pyc' -delete \
&& find /opt/conda/ -follow -type f -name '*.js.map' -delete

ENTRYPOINT []

16 changes: 16 additions & 0 deletions modules/local/metafusion/genebed/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: metafusion_genebed
channels:
- conda-forge
- bioconda
- defaults
- r
dependencies:
- "conda-forge::r-base=4.3.3"
- "bioconda::bioconductor-rtracklayer=1.62.0"
- "conda-forge::r-optparse=1.7.4"
- "conda-forge::r-dplyr=1.1.4"
- "conda-forge::r-stringr=1.5.1"
- "conda-forge::r-plyr=1.8.9"
- "conda-forge::r-data.table=1.15.2"
- "conda-forge::coreutils=9.5"
- "r::r-argparse=2.2.2"
Loading
Loading