-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmakeCompiled.R
executable file
·239 lines (193 loc) · 12.7 KB
/
makeCompiled.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env Rscript
library("dplyr")
library(ggplot2)
library(reshape2)
setwd("/scratch/amak/varCalls")
# Change these directories to fit your file system to where outDir was for post-process-wrap.sh
platDir <- "/scratch/amak/varCalls/Platypus/"
varDir <- "/scratch/amak/varCalls/VarDict/"
mutDir <- "/scratch/amak/varCalls/Mutect/"
#
platCounts <- read.delim(paste(platDir, 'countBreakdown.txt', sep=""), sep="\t", header=TRUE, stringsAsFactors=FALSE)
varCounts <- read.delim(paste(varDir, 'countBreakdown.txt', sep=""), sep="\t", header=TRUE, stringsAsFactors=FALSE)
mutCounts <- read.delim(paste(mutDir, 'countBreakdown.txt', sep=""), sep="\t", header=TRUE, stringsAsFactors=FALSE)
# total number of variants in whitelist
varTotal <- read.delim(paste(varDir, 'totalBreakdown.txt', sep=""), sep="\t", header=TRUE, stringsAsFactors=FALSE)
platSens <- read.delim(paste(platDir, 'sensitivityFPKM.txt', sep=""), sep="\t", header=TRUE, stringsAsFactors=FALSE)
varSens <- read.delim(paste(varDir, 'sensitivityFPKM.txt', sep=""), sep="\t", header=TRUE, stringsAsFactors=FALSE)
mutSens <- read.delim(paste(mutDir, 'sensitivityFPKM.txt', sep=""), sep="\t", header=TRUE, stringsAsFactors=FALSE)
platCovFound <- read.delim(paste(platDir, 'found.cov.txt', sep=""), sep="\t", header=FALSE, stringsAsFactors=FALSE)
platCovNotFound <- read.delim(paste(platDir, 'notFound.cov.txt', sep=""), sep="\t", header=FALSE, stringsAsFactors=FALSE)
varCovFound <- read.delim(paste(varDir, 'found.cov.txt', sep=""), sep="\t", header=FALSE, stringsAsFactors=FALSE)
varCovNotFound <- read.delim(paste(varDir, 'notFound.cov.txt', sep=""), sep="\t", header=FALSE, stringsAsFactors=FALSE)
mutCovFound <- read.delim(paste(mutDir, 'found.cov.txt', sep=""), sep="\t", header=FALSE, stringsAsFactors=FALSE)
mutCovNotFound <- read.delim(paste(mutDir, 'notFound.cov.txt', sep=""), sep="\t", header=FALSE, stringsAsFactors=FALSE)
## Process for Platypus ##
platCounts[is.na(platCounts)] <- 0
platSensitivity <- as.numeric(platCounts %>% summarize(Mean=mean(Sensitivity)))
platFiltGermline <- as.numeric(platCounts %>% summarize(Mean=mean(numFiltGermline)))
platFiltEdits <- as.numeric(platCounts %>% summarize(Mean=mean(numFiltEdits)))
# add label column for tool type so can concat later
platCovFound <- platCovFound %>% mutate(Tool="Platypus")
platCovNotFound <- platCovNotFound %>% mutate(Tool="Platypus")
## Print overall stats ##
print("Platypus Average Sensitivity")
print(platSensitivity)
print("Platypus Average Filtered PON")
print(platFiltGermline)
print("Platypus Filtered RNAEdits")
print(platFiltEdits)
## Process for VarDict ##
varCounts[is.na(varCounts)] <- 0
varSensitivity <- as.numeric(varCounts %>% summarize(Mean=mean(Sensitivity)))
varFiltGermline <- as.numeric(varCounts %>% summarize(Mean=mean(numFiltGermline)))
varFiltEdits <- as.numeric(varCounts %>% summarize(Mean=mean(numFiltEdits)))
varCovFound <- varCovFound %>% mutate(Tool="VarDict")
varCovNotFound <- varCovNotFound %>% mutate(Tool="VarDict")
print("VarDict Average Sensitivity")
print(varSensitivity)
print("VarDict Average Filtered PON")
print(varFiltGermline)
print("VarDict Average Filtered RNAEdits")
print(varFiltEdits)
## Process for Mutect ##
mutCounts[is.na(mutCounts)] <- 0
mutSensitivity <- as.numeric(mutCounts %>% summarize(Mean=mean(Sensitivity)))
mutFiltGermline <- as.numeric(mutCounts %>% summarize(Mean=mean(numFiltGermline)))
mutFiltEdits <- as.numeric(mutCounts %>% summarize(Mean=mean(numFiltEdits)))
mutCovFound <- mutCovFound %>% mutate(Tool="Mutect2")
mutCovNotFound <- mutCovNotFound %>% mutate(Tool="Mutect2")
print("Mutect Average Sensitivity")
print(mutSensitivity)
print("Mutect Average Filtered PON")
print(mutFiltGermline)
print("Mutect Average Filtered RNAEdits")
print(mutFiltEdits)
## Remove columns and add label column to each dataset ##
platCounts <- select(platCounts, -Donor.ID, -Sensitivity, -numFiltGermline, -numFiltEdits) #%>% mutate(Tool="Platypus")
varCounts <- select(varCounts, -Donor.ID, -Sensitivity, -numFiltGermline, -numFiltEdits) #%>% mutate(Tool="VarDict")
mutCounts <- select(mutCounts, -Donor.ID, -Sensitivity, -numFiltGermline, -numFiltEdits) #%>% mutate(Tool="Mutect2")
## sum down each column(variant type) to get total not found for all bams
platCounts <- platCounts %>% summarize_all(sum)
varCounts <- varCounts %>% summarize_all(sum)
mutCounts <- mutCounts %>% summarize_all(sum)
varTotal <- varTotal %>% mutate_all(funs(ifelse(is.na(.), 0, .))) %>% select(-Donor.ID, -Total)
print("varCounts")
print(head(varCounts))
## sum down each column (variant type) to get total variants in whitelist for all bams
varTotal <- varTotal %>% summarize_all(sum)
print("varTotal")
print(varTotal)
library(tidyr)
## sort by lowest to highest sum
platCounts <- platCounts %>% gather(Variant.Type, Sum) %>% arrange(desc(Sum))
varCounts <- varCounts %>% gather(Variant.Type, Sum) %>% arrange(desc(Sum))
mutCounts <- mutCounts %>% gather(Variant.Type, Sum) %>% arrange(desc(Sum))
varTotal <- varTotal %>% gather(Variant.Type, Total) %>% arrange(desc(Total))
positions <- varCounts$Variant.Type
## to print total num of variants in same order as varTotal and add column to retain tool type before bind
platCounts <- platCounts %>% full_join(varTotal, by="Variant.Type") %>% mutate(Tool="Platypus")
varCounts <- varCounts %>% full_join(varTotal, by="Variant.Type") %>% mutate(Tool="VarDict")
mutCounts <- mutCounts %>% full_join(varTotal, by="Variant.Type") %>% mutate(Tool="Mutect2")
## Bind data frames together (append) ##
sumBreakdown <- bind_rows(platCounts, varCounts, mutCounts)
#sumBreakdown$Tool <- factor(sumBreakdown$Tool, levels = c("VarDict", "Platypus", "Mutect2"))
maxSum=max(sumBreakdown$Sum)
print(sumBreakdown)
######## Plot sum with IGR, 5'Flank, and Introns ##########
withIGRHisto <- ggplot(sumBreakdown, aes(x=Variant.Type, y=Sum, fill=Tool)) + geom_bar(stat="identity", position="dodge") + coord_flip() + geom_text(aes(label=paste(Sum, " of ", Total)), hjust=-0.3, size=3, position = position_dodge(width = 1)) + theme_classic() + guides(fill = guide_legend(reverse = TRUE)) + scale_x_discrete(limits=positions) + scale_y_continuous("Count", expand = c(0, 0), limits=c(0,maxSum+6000))
ggsave(withIGRHisto, filename="HistoWithIGR_4.jpg")
########## Plot Sum without IGR, 5'Flank, and Introns ###########
sumBreakdown <- sumBreakdown %>% filter(Variant.Type != "IGR" & Variant.Type != "Intron" & Variant.Type !="X5.Flank")
print(sumBreakdown)
#sumBreakdown <- sumBreakdown %>% gather(Variant.Type, Mean) %>% arrange(desc(Sum))
print(positions)
positions<- positions[!is.element(positions, c("IGR", "Intron", "X5.Flank"))] #%>% filter(Variant.Type != "IGR" & Variant.Type != "Intron" & Variant.Type != "X5.Flank")
maxSum=max(sumBreakdown$Sum)
print('done with IGR')
noIGRHisto <- ggplot(sumBreakdown, aes(x=Variant.Type, y=Sum, fill=Tool, order=-as.numeric(Tool))) + geom_bar(stat="identity", position="dodge") + coord_flip() + geom_text(aes(label=paste(Sum, " of ", Total)), hjust=-0.3, size=3, position = position_dodge(width = 1)) + theme_classic() + guides(fill = guide_legend(reverse = TRUE)) + scale_x_discrete(limits=positions) + scale_y_continuous("Count", expand = c(0, 0), limits=c(0,maxSum+6000))
ggsave(noIGRHisto, filename="HistoNoIGR_4.jpg")
############ Plot FPKM vs sensitivity ###############
library(grid)
library(gtable)
library(gridExtra)
colnames(platSens) <- c("bin", "total", "found", "sensitivity")
colnames(varSens) <- c("bin", "total", "found", "sensitivity")
colnames(mutSens) <- c("bin", "total", "found", "sensitivity")
## Get counts by FPKM bin for not found (sensFPKM) and found (foundsens) variants ##
print(platSens)
platsensFPKM <- as.data.frame(platSens %>% group_by(bin) %>% summarise(totalAll=sum(total)))
platfoundsens <- as.data.frame(platSens %>% group_by(bin) %>% summarise(foundAll=sum(found)))
varsensFPKM <- as.data.frame(varSens %>% group_by(bin) %>% summarise(totalAll=sum(total)))
varfoundsens <- as.data.frame(varSens %>% group_by(bin) %>% summarise(foundAll=sum(found)))
mutsensFPKM <- as.data.frame(mutSens %>% group_by(bin) %>% summarise(totalAll=sum(total)))
mutfoundsens <- as.data.frame(mutSens %>% group_by(bin) %>% summarise(foundAll=sum(found)))
## Join counts of found and not found variants by FPKM bin ##
platsensFPKM <- platsensFPKM %>% left_join(platfoundsens, by="bin")
varsensFPKM <- varsensFPKM %>% left_join(varfoundsens, by="bin")
mutsensFPKM <- mutsensFPKM %>% left_join(mutfoundsens, by="bin")
## Add sensitivity column and replace NAs with 0s ##
platsensFPKM <- as.data.frame(platsensFPKM %>% group_by(bin) %>% mutate(Sensitivity=foundAll / (totalAll)) %>% mutate_all(funs(ifelse(is.na(.), 0, .))))
platsensFPKM <- platsensFPKM %>% mutate(Tool="Platypus")
varsensFPKM <- as.data.frame(varsensFPKM %>% group_by(bin) %>% mutate(Sensitivity=foundAll / (totalAll)) %>% mutate_all(funs(ifelse(is.na(.), 0, .))))
varsensFPKM <- varsensFPKM %>% mutate(Tool="VarDict")
mutsensFPKM <- as.data.frame(mutsensFPKM %>% group_by(bin) %>% mutate(Sensitivity=foundAll / (totalAll)) %>% mutate_all(funs(ifelse(is.na(.), 0, .))))
mutsensFPKM <- mutsensFPKM %>% mutate(Tool="Mutect2")
## Bind Plat, Var, Mut ##
sensFPKM <- bind_rows(platsensFPKM, varsensFPKM, mutsensFPKM)
print(sensFPKM)
maxCount=max(platsensFPKM$totalAll)
hist_top <- ggplot(platsensFPKM, aes(x=bin, y=totalAll)) + geom_bar(stat="identity") + theme_classic() + scale_x_continuous(limits=c(-13,13), breaks = seq(-13, 13, by = 1)) + scale_y_continuous(trans='log2', breaks=c(5,30,250,2050))
#log10(breaks=seq(0,4000, by=1000))
jpeg(filename="sensitivityFPKM_4.jpg", width=10, height=6, units="in", res=300)
hist_top <- hist_top + labs(y="# Known Variants", x="")
sensPlot <- ggplot(sensFPKM, aes(x=bin, y=Sensitivity, color=Tool, group=Tool)) + geom_point() + geom_line() + theme_classic() + scale_x_continuous(limits=c(-13,13), breaks = seq(-13, 13, by = 1)) + scale_y_continuous(limits=c(0,1), breaks = seq(0, 1, by = 0.1))
sensPlot <- sensPlot + labs(x="log2(FPKM)", y="Sensitivity")
#totalPlot <- grid.arrange(hist_top, sensPlot, nrow=2, heights=c(3,8))
g1 <- ggplotGrob(hist_top)
g2 <- ggplotGrob(sensPlot)
colnames(g1) <- paste0(seq_len(ncol(g1)))
colnames(g2) <- paste0(seq_len(ncol(g2)))
grid.draw(gtable_combine(g1, g2, along=2))
dev.off()
#ggsave(totalPlot, filename="sensitivityFPKM.jpg", width=10, height=6, units="in", dpi=300)
######################## Plot Coverage vs Sensitivity ##################
## concat coverage files for each tool ##
covFound <- bind_rows(platCovFound, varCovFound, mutCovFound)
covNotFound <- bind_rows(platCovNotFound, varCovNotFound, mutCovNotFound)
print(head(covFound))
## set bins by total coverage ##
# set max read coverage bin to show on plot (goes >15,000 but no variants in most bins)
binMax <- 450 #max(covNotFound$V7)
print(binMax)
covFound$bin <- cut(covFound$V7, breaks=seq(0, binMax+25, by=25), labels=seq(0, binMax, by=25))
covNotFound$bin <- cut(covNotFound$V7, breaks=seq(0, binMax+25, by=25), labels=seq(0, binMax, by=25))
covFound <- as.data.frame(covFound %>% group_by(Tool, bin) %>% summarise(found=n()))
covNotFound <- as.data.frame(covNotFound %>% group_by(Tool, bin) %>% summarise(notFound=n()))
## join found and not found df's by bin and tool
totCov <- covNotFound %>% full_join(covFound, by=c("bin", "Tool"))
totCov <- totCov %>% drop_na() #mutate_all(funs(ifelse(is.na(.), 0, .)))
## add total counts column
totCov <- as.data.frame(totCov %>% group_by(Tool, bin) %>% mutate(total=as.numeric(found+notFound)))
print("totCov")
print(head(totCov))
## calculate sensitivity and add as column for each bin
totCov <- as.data.frame(totCov %>% group_by(Tool, bin) %>% mutate(sensitivity=(as.numeric(found) / total)) %>% mutate_all(funs(ifelse(is.na(.), 0, .))))
print(totCov)
## open jpg file for writing
jpeg(filename="sensitivityCoverage_4.jpg", width=10, height=6, units="in", res=300)
maxCount=max(totCov$total)
histCov <- totCov %>% distinct(totCov, total, .keep_all=TRUE)
## plot histogram of known variants in each bin
hist_top <- ggplot(histCov, aes(x=bin, y=total)) + geom_bar(stat="identity", position=position_dodge(width=1)) + theme_classic() + scale_y_continuous(trans='log2', breaks=c(5,30,250,2050)) + scale_x_discrete(breaks = seq(0, binMax, by=25)) #+ scale_y_continuous(limits=c(0, maxCount=50), trans='log2')
hist_top <- hist_top + labs(y="# Known Variants", x="")
## plot sensitivity vs read coverage
sensPlot <- ggplot(totCov, aes(x=bin, y=sensitivity, color=Tool, group=Tool)) + geom_point() + geom_line() + theme_classic() + scale_x_discrete(breaks = seq(0, binMax+25, by = 25)) + scale_y_continuous(limits=seq(0,1.0), breaks = seq(0, 1, by = 0.1))
sensPlot <- sensPlot + labs(x="Read Coverage", y="Sensitivity")
## save both plots as grid on same jpg
g1 <- ggplotGrob(hist_top)
g2 <- ggplotGrob(sensPlot)
colnames(g1) <- paste0(seq_len(ncol(g1)))
colnames(g2) <- paste0(seq_len(ncol(g2)))
grid.draw(gtable_combine(g1, g2, along=2))
dev.off()