-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathrun_eVIP.py
executable file
·800 lines (632 loc) · 39.5 KB
/
run_eVIP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
#!/usr/bin/python
import sys
import argparse
import os
import errno
import csv
import itertools
import rpy2.robjects as robjects
import json
#importing eVIP
from bin import eVIP_corr
from bin import eVIP_predict
from bin import eVIP_sparkler
from bin import eVIP_viz
from bin import eVIP_compare
from bin import eVIPP_sparkler
########
# MAIN #
########
def main(infile=None, zscore_gct = None, out_directory=None, sig_info =None, c=None, r=None, num_reps=None,
ie_filter=None,ie_col=None, i=None, allele_col=None, conn_null=None, conn_thresh=None,
mut_wt_rep_rank_diff=None, use_c_pval=None, cell_id=None, plate_id=None, ref_allele_mode=None,
x_thresh=None, y_thresh=None, annotate=None, by_gene_color=None, pdf=None, xmin=None,
xmax=None, ymin=None, ymax=None, viz_ymin=None, viz_ymax=None, corr_val=None):
parser = argparse.ArgumentParser()
#from corr
parser.add_argument("--infile", help="Input txt file (filtered and log transformed data).")
parser.add_argument("-zscore_gct", help="Zscore input gct file (use instead of --infile)")
parser.add_argument("-out_directory",required=True, help="Path to directory for eVIP output files")
#from compare
parser.add_argument("-sig_info",required=True, help = "sig info file with gene information and distil information")
parser.add_argument("-c",required=True, help = ".grp file containing allele names of control perturbations. If this file is given, a null will be calculated from these")
parser.add_argument("-r", required=True, help = "File explicitly indicating which comparisons to do. Assumes the file has a header and it is ignored. The first column is the reference allele and second column is test allele. If this file is not given, then the reference alleles are assumed to be WT and inferred from the allele names.")
parser.add_argument("-num_reps",required=True, help = "Number of replicates expected for each allele. DEF=3")
parser.add_argument("-ie_filter", help = "Threshold for infection efficiency. Any wildtype or mutant alleles having an ie below this threshold, will be removed")
parser.add_argument("-ie_col", help = "Name of the column in the sig_info file with infection efficiency information. DEF=x_ie_a549")
parser.add_argument("-i", help = "Number of iterations to run. DEF=1000")
parser.add_argument("-allele_col", help = "Column name in sig_info file that indicates the allele names.DEF=x_mutation_status")
parser.add_argument("-conn_null", help = " Optional file containing connectivity null values from a previous run. Should end in _conn_null.txt")
#from predict
parser.add_argument("-conn_thresh",help = "P-value threshold for connectivity vs null. DEFAULT=0.05")
parser.add_argument("-mut_wt_rep_thresh", help = "P-value threshold for comparison of WT and mut robustness. DEFAULT=0.05")
parser.add_argument("-disting_thresh", help = "P-value threshold that tests if mut and wt reps are indistinguishable from each other. DEFAULT=0.05")
parser.add_argument("-mut_wt_rep_rank_diff", help = "The minimum difference in median rankpoint WT and mut to consider a difference. DEF=0")
parser.add_argument("-use_c_pval", action ="store_true", help = "Will use corrected p-value instead of raw p-val")
parser.add_argument("-cell_id", help = "Optional: Will only look at signatures from this cell line. Helps to filter sig_info file.")
parser.add_argument("-plate_id", help = "Optional: Will only look at signatures from this plate")
#from sparkler
parser.add_argument("-ref_allele_mode", action ="store_true", help = "Sparkler+Viz: Instead of organizing plots by gene, will use the wt column to determine what are the reference alleles." )
parser.add_argument("-x_thresh" , help = "Sparkler: Threshold of significance")
parser.add_argument("-y_thresh", help = "Sparkler: Threshold of impact direction")
parser.add_argument("-annotate", action ="store_true", help = "Sparkler: Will add allele labels to points.")
parser.add_argument("-by_gene_color", help = "Sparkler: File containing labels and colors for gene-centric plot.")
parser.add_argument("-pdf", help = "Sparkler + Viz: Will print plots in pdf format instead of png.")
parser.add_argument("-xmin", help = "Sparkler: Min value of x-axis. DEF=0")
parser.add_argument("-xmax", help = "Sparkler: Max value of x-axis. DEF=4")
parser.add_argument("-ymin", help = "Sparkler: Min value of y-axis. DEF=-3")
parser.add_argument("-ymax", help = "Sparkler: Min value of y-axis. DEF=3")
#from viz
parser.add_argument("-viz_ymin", help = "Viz: Minimum y-value of rep value. DEF=-100")
parser.add_argument("-viz_ymax", help = "Viz: Maximum y-value of rep value. DEF=100")
parser.add_argument("-corr_val", help = "Viz: String used to label the correlation value. DEF= 'row median rankpoints' ")
#eVIPP
parser.add_argument("-eVIPP", action ="store_true", help="Use this option when doing pathway analysis, must also have JSON file ")
parser.add_argument("-JSON", help= "JSON file created by create_pathway_JSON.py. Contains dictionary of pathways and the associated ids")
parser.add_argument("-min_genes", help = "Minimum amount of pathway genes found in data to run eVIPP on. DEF = 5")
parser.add_argument("-viz_off", action ="store_true", help = "Will not perform eVIP viz step")
parser.add_argument("-sparkler_off", action ="store_true",help = "Will not perform eVIP sparkler step")
global args
args = parser.parse_args()
#make eVIP output directory
global out_dir
out_dir = args.out_directory
if not os.path.exists(out_dir):
os.makedirs(out_dir)
# running eVIP using JSON (running eVIP using only genes from JSON pathways)
if args.JSON:
print "JSON"
print "Extracting pathway genes from data..."
#making output for eVIP on the JSON genes
eVIP_JSON_dir = out_dir + "/eVIP_out_JSON/"
if not os.path.exists(eVIP_JSON_dir):
os.makedirs(eVIP_JSON_dir)
JSON_extracted_loc = out_dir + "/eVIP_out_JSON/JSON_extracted_data.gct"
#extracting the JSON genes from the data
JSON_extraction()
print "running eVIP on pathway genes..."
#if input is expression data (infile option)
if args.infile:
run_eVIP(JSON_extracted_loc, args.zscore_gct, out_dir + "/eVIP_out_JSON/" , args.sig_info, args.c, args.r, args.num_reps,
args.ie_filter, args.ie_col, args.i, args.allele_col, args.conn_null, args.conn_thresh,
args.mut_wt_rep_rank_diff, args.use_c_pval, args.cell_id, args.plate_id, args.ref_allele_mode,
args.x_thresh, args.y_thresh, args.annotate, args.by_gene_color, args.pdf, args.xmin,
args.xmax, args.ymin, args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
#if input is z scores (zscore_gct option)
if args.zscore_gct:
run_eVIP(args.infile, JSON_extracted_loc, out_dir + "/eVIP_out_JSON/" , args.sig_info, args.c, args.r, args.num_reps,
args.ie_filter, args.ie_col, args.i, args.allele_col, args.conn_null, args.conn_thresh,
args.mut_wt_rep_rank_diff, args.use_c_pval, args.cell_id, args.plate_id, args.ref_allele_mode,
args.x_thresh, args.y_thresh, args.annotate, args.by_gene_color, args.pdf, args.xmin,
args.xmax, args.ymin, args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
#running using just the input file (original way to run)
else:
eVIP_dir = out_dir + "/eVIP_out"
if not os.path.exists(eVIP_dir):
os.makedirs(eVIP_dir)
run_eVIP(args.infile, args.zscore_gct, eVIP_dir, args.sig_info, args.c, args.r, args.num_reps,
args.ie_filter, args.ie_col, args.i, args.allele_col, args.conn_null, args.conn_thresh,
args.mut_wt_rep_rank_diff, args.use_c_pval, args.cell_id, args.plate_id, args.ref_allele_mode,
args.x_thresh, args.y_thresh, args.annotate, args.by_gene_color, args.pdf, args.xmin,
args.xmax, args.ymin, args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
#if using JSON to test pathways
if args.JSON and args.eVIPP:
print "Running eVIPP using JSON..."
summary_file = open(out_dir + "/eVIPP_summary.txt", "w")
summary_eVIPP_vals = open(out_dir + "/eVIPP_combined_predict_files.txt", "w")
#getting used pathways (removing pathways in JSON that there isn't data for)
pway_dict, used_pathways = JSON_pway()
#running eVIP for each pathway
JSON_eVIP(used_pathways)
summarize(used_pathways, summary_file)
summarize_predict_files(used_pathways, summary_eVIPP_vals)
#eVIPP sparkler
print "Making allele pathway sparkler plots..."
run_eVIPP_sparkler = eVIPP_sparkler.main(pred_file= out_dir + "/eVIPP_combined_predict_files.txt", ref_allele_mode=args.ref_allele_mode, y_thresh=args.y_thresh, x_thresh=args.x_thresh, use_c_pval=args.use_c_pval,
annotate=args.annotate, by_gene_color=args.by_gene_color, pdf=args.pdf, xmin=args.xmin, xmax=args.xmax, ymin=args.ymin, ymax=args.ymax,
out_dir= out_dir + "/eVIPP_sparkler_plots")
############
# END_MAIN #
############
#############
# FUNCTIONS #
#############
def run_eVIP(infile=None, zscore_gct = None, out_directory=None, sig_info =None, c=None, r=None, num_reps=None,
ie_filter=None,ie_col=None, i=None, allele_col=None, conn_null=None, conn_thresh=None,
mut_wt_rep_rank_diff=None, use_c_pval=None, cell_id=None, plate_id=None, ref_allele_mode=None,
x_thresh=None, y_thresh=None, annotate=None, by_gene_color=None, pdf=None, xmin=None,
xmax=None, ymin=None, ymax=None, viz_ymin=None, viz_ymax=None, corr_val=None):
#different sig_gctx for exp an z inputs used in viz
if args.infile :
sig_gctx_val = out_directory+ "/z_scores.gct"
if args.zscore_gct :
sig_gctx_val = args.zscore_gct
# run eVIP_corr.py
print('calculating correlations...')
run_corr = eVIP_corr.run_main(input=infile,zscore_gct=zscore_gct, out_dir= out_directory)
print('comparing...')
run_compare = eVIP_compare.run_main(sig_info=sig_info, gctx = out_directory+"/spearman_rank_matrix.gct",
allele_col = args.allele_col, o= out_directory+"/compare", r = args.r,
c = args.c, i = args.i, conn_null = args.conn_null, ie_col = args.ie_col,
ie_filter = args.ie_filter, num_reps = args.num_reps, cell_id = args.cell_id, plate_id = args.plate_id)
print('predicting...')
run_predict = eVIP_predict.run_main(i= out_directory+"/compare.txt", o= out_directory+"/predict", conn_thresh=args.conn_thresh,
mut_wt_rep_thresh=args.mut_wt_rep_thresh, mut_wt_rep_rank_diff=args.mut_wt_rep_rank_diff,
disting_thresh=args.disting_thresh, use_c_pval=args.use_c_pval)
if not args.sparkler_off:
print "making sparkler plots..."
run_sparkler = eVIP_sparkler.eVIP_run_main(pred_file = out_directory+"/predict.txt", ref_allele_mode=args.ref_allele_mode,
y_thresh = args.y_thresh , x_thresh = args.x_thresh,
use_c_pval= args.use_c_pval,annotate=args.annotate, by_gene_color= args.by_gene_color, pdf= args.pdf,
xmin= args.xmin, xmax = args.xmax, ymin = args.ymin, ymax = args.ymax, out_dir = out_directory+"/sparkler_plots")
if not args.viz_off:
print "making visualizations..."
if args.conn_null:
null_conn = args.conn_null
else:
null_conn = out_directory + "/compare_conn_null.txt"
run_viz = eVIP_viz.eVIP_run_main(pred_file= out_directory+"/predict.txt", sig_info = args.sig_info, gctx=out_directory+"/spearman_rank_matrix.gct",
sig_gctx = sig_gctx_val, ref_allele_mode = args.ref_allele_mode, null_conn = null_conn,
out_dir = out_directory+"/viz",ymin = args.viz_ymin, ymax= args.viz_ymax, allele_col = args.allele_col, use_c_pval = args.use_c_pval,
pdf = args.pdf, cell_id = args.cell_id, plate_id = args.plate_id, corr_val_str= args.corr_val)
# print "eVIP is DONE"
def make_adj_compare_file(pathway_list, out_directory,eVIPP_adj_pways_mut_wt_rep_c_pvals_from_compare,eVIPP_adj_pways_mut_wt_conn_null_c_pvals_from_compare,eVIPP_adj_pways_wt_mut_rep_vs_wt_mut_conn_c_pvals_from_compare):
column_headers = ["gene","mut","mut_rep", "wt_rep", "mut_wt_connectivity",
"wt", "cell_line", "mut_wt_rep_pval","mut_wt_conn_null_pval",
"wt_mut_rep_vs_wt_mut_conn_pval", "mut_wt_rep_c_pval",
"mut_wt_conn_null_c_pval","wt_mut_rep_vs_wt_mut_conn_c_pval"]
for pathway in pathway_list:
#new file
adj_compare = open(out_directory + "/" + pathway + "_eVIPP_outputs/adj_compare.txt", "w")
#writing header to new file
file_writer = csv.DictWriter(adj_compare, delimiter="\t", fieldnames=column_headers)
file_writer.writeheader()
#opening the original compare file
compare_file = open(out_directory + "/" +pathway+ "_eVIPP_outputs/compare.txt", "r")
file_reader = csv.DictReader(compare_file, delimiter="\t")
n = 0
m = 0
l = 0
for line in file_reader:
for item in eVIPP_adj_pways_mut_wt_rep_c_pvals_from_compare:
if pathway in item:
line['mut_wt_rep_c_pval'] = item[0][n]
n+=1
for item in eVIPP_adj_pways_mut_wt_conn_null_c_pvals_from_compare:
if pathway in item:
line['mut_wt_conn_null_c_pval'] = item[0][m]
m += 1
for item in eVIPP_adj_pways_wt_mut_rep_vs_wt_mut_conn_c_pvals_from_compare:
if pathway in item:
l += 1
file_writer.writerow(line)
def run_eVIP_multiple_testing_pt1(infile=None, zscore_gct = None, out_directory=None, sig_info =None, c=None, r=None, num_reps=None,
ie_filter=None,ie_col=None, i=None, allele_col=None, conn_null=None, conn_thresh=None,
mut_wt_rep_rank_diff=None, use_c_pval=None, cell_id=None, plate_id=None, ref_allele_mode=None,
x_thresh=None, y_thresh=None, annotate=None, by_gene_color=None, pdf=None, xmin=None,
xmax=None, ymin=None, ymax=None, viz_ymin=None, viz_ymax=None, corr_val=None):
#runs eVIP_corr and compare and returns the p values from compare
#different sig_gctx for exp an z inputs used in viz
if args.infile :
sig_gctx_val = out_directory+ "/z_scores.gct"
if args.zscore_gct :
sig_gctx_val = args.zscore_gct
#run eVIP_corr.py
print('calculating correlations...')
run_corr = eVIP_corr.run_main(input=infile,zscore_gct=zscore_gct, out_dir= out_directory)
#run eVIP_compare.py
print('comparing...')
run_compare = eVIP_compare.run_main(sig_info=sig_info, gctx = out_directory+"/spearman_rank_matrix.gct",
allele_col = args.allele_col, o= out_directory+"/compare", r = args.r,
c = args.c, i = args.i, conn_null = args.conn_null, ie_col = args.ie_col,
ie_filter = args.ie_filter, num_reps = args.num_reps, cell_id = args.cell_id, plate_id = args.plate_id)
#getting the p values from the pathway compare file
mut_wt_rep_pvals_from_compare = []
mut_wt_conn_null_pvals_from_compare = []
wt_mut_rep_vs_wt_mut_conn_pvals_from_compare = []
num_test_alleles = None
with open(out_directory+"/compare.txt", "r") as compare_file:
file_reader = csv.DictReader(compare_file, delimiter="\t")
for row in file_reader:
mut_wt_rep_pvals_from_compare.append(row['mut_wt_rep_pval'])
mut_wt_conn_null_pvals_from_compare.append(row['mut_wt_conn_null_pval'])
wt_mut_rep_vs_wt_mut_conn_pvals_from_compare.append(row['wt_mut_rep_vs_wt_mut_conn_pval'])
#counting the number of mutations being tested
with open(out_directory + "/compare.txt", "r") as compare_file:
file_reader = csv.DictReader(compare_file, delimiter="\t")
each_line = list(file_reader)
num_test_alleles=len(each_line)
return mut_wt_rep_pvals_from_compare, mut_wt_conn_null_pvals_from_compare, wt_mut_rep_vs_wt_mut_conn_pvals_from_compare,num_test_alleles
def run_eVIP_multiple_testing_pt2(infile=None, zscore_gct = None, out_directory=None, sig_info =None, c=None, r=None, num_reps=None,
ie_filter=None,ie_col=None, i=None, allele_col=None, conn_null=None, conn_thresh=None,
mut_wt_rep_rank_diff=None, use_c_pval=None, cell_id=None, plate_id=None, ref_allele_mode=None,
x_thresh=None, y_thresh=None, annotate=None, by_gene_color=None, pdf=None, xmin=None,
xmax=None, ymin=None, ymax=None, viz_ymin=None, viz_ymax=None, corr_val=None):
if args.infile :
sig_gctx_val = out_directory+ "/z_scores.gct"
if args.zscore_gct :
sig_gctx_val = args.zscore_gct
print('predicting...')
run_predict = eVIP_predict.run_main(i= out_directory+"/adj_compare.txt", o= out_directory+"/predict", conn_thresh=args.conn_thresh,
mut_wt_rep_thresh=args.mut_wt_rep_thresh, mut_wt_rep_rank_diff=args.mut_wt_rep_rank_diff,
disting_thresh=args.disting_thresh, use_c_pval=args.use_c_pval)
if not args.sparkler_off:
print "making sparkler plots..."
run_sparkler = eVIP_sparkler.eVIP_run_main(pred_file = out_directory+"/predict.txt", ref_allele_mode=args.ref_allele_mode,
y_thresh = args.y_thresh , x_thresh = args.x_thresh,
use_c_pval= args.use_c_pval,annotate=args.annotate, by_gene_color= args.by_gene_color, pdf= args.pdf,
xmin= args.xmin, xmax = args.xmax, ymin = args.ymin, ymax = args.ymax, out_dir = out_directory+"/sparkler_plots")
if not args.viz_off:
if args.conn_null:
null_conn = args.conn_null
else:
null_conn = out_directory + "/compare_conn_null.txt"
print "making visualizations..."
run_viz = eVIP_viz.eVIP_run_main(pred_file= out_directory+"/predict.txt", sig_info = args.sig_info, gctx=out_directory+"/spearman_rank_matrix.gct",
sig_gctx = sig_gctx_val, ref_allele_mode = args.ref_allele_mode, null_conn = null_conn,
out_dir = out_directory+"/viz",ymin = args.viz_ymin, ymax= args.viz_ymax, allele_col = args.allele_col, use_c_pval = args.use_c_pval,
pdf = args.pdf, cell_id = args.cell_id, plate_id = args.plate_id, corr_val_str= args.corr_val)
print "Pathway is DONE"
def mkdir_p(path):
try:
os.makedirs(path)
except OSError as exc:
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else:
raise
def formatLine(line):
line = line.replace("\r", "")
line = line.replace("\n", "")
return line
def summarize(pathway_list, output_file):
#creates a simple summary file of just variant predictions for each pathway
mut_list = []
pred_list = []
#getting the list of tested mutations using the first pathway in the list
with open(out_dir + "/" + pathway_list[0] + "_eVIPP_outputs/predict.txt") as first_pathway:
for line in first_pathway:
if line.startswith("gene"):
continue
else:
splitLine = line.split()
mut_list.append(str(splitLine[1]))
header = "pathway" + "\t" + ("\t").join(mut_list) + "\n"
output_file.write(header)
for pathway in pathway_list:
with open(out_dir + "/" + pathway + "_eVIPP_outputs/predict.txt") as this_predict:
for line in this_predict:
if line.startswith("gene"):
continue
else:
splitLine = line.split()
pred_list.append(splitLine[13])
output_file.write(pathway + "\t" + ("\t").join(pred_list)+"\n")
#clearing predict list
pred_list = []
def summarize_predict_files(pathway_list, output_file):
#combines predict files from each pathway
# getting header using the first pathway in the list
with open(out_dir + "/" + pathway_list[0] + "_eVIPP_outputs/predict.txt") as first_pathway:
for line in first_pathway:
if line.startswith("gene"):
output_file.write("Pathway" +"\t"+ line)
for pathway in pathway_list:
with open(out_dir + "/" + pathway + "_eVIPP_outputs/predict.txt") as this_predict:
for line in this_predict:
if line.startswith("gene"):
continue
else:
output_file.write(pathway +"\t"+ line)
output_file.close()
def grouper(iterable, n, fillvalue=None):
args = [iter(iterable)] * n
return izip_longest(fillvalue=fillvalue, *args)
class ZipExhausted(Exception):
pass
def izip_longest(*args, **kwds):
fillvalue = kwds.get('fillvalue')
counter = [len(args) - 1]
def sentinel():
if not counter[0]:
raise ZipExhausted
counter[0] -= 1
yield fillvalue
fillers = repeat(fillvalue)
iterators = [chain(it, sentinel(), fillers) for it in args]
try:
while iterators:
yield tuple(map(next, iterators))
except ZipExhausted:
pass
def repeat(object, times=None):
# repeat(10, 3) --> 10 10 10
if times is None:
while True:
yield object
else:
for i in xrange(times):
yield object
def chain(*iterables):
# chain('ABC', 'DEF') --> A B C D E F
for it in iterables:
for element in it:
yield element
def padj(c_pval_type, num_test_alleles, pathway_list):
# converting list of lists into one list
c_pval_type = list(itertools.chain(*c_pval_type))
#multiple testing corrections
eVIPP_c_pval_type = robjects.r['p.adjust'](robjects.FloatVector(c_pval_type), "BH")
new_cpval_list = str(eVIPP_c_pval_type).split()
fixed_cpval_list = []
for item in new_cpval_list:
if item.startswith("["):
pass
else:
fixed_cpval_list.append(item)
#grouping pvalues from the same pathway together
grouped_new_cpval_list = list(grouper(fixed_cpval_list, num_test_alleles, fillvalue=None))
#zipping the pathway with the pvalues
zipped = zip(grouped_new_cpval_list, pathway_list)
return zipped
def JSON_extraction():
all_pway_genes = []
with open(args.JSON, "rb") as JSON:
dict = json.load(JSON)
# creating new dict to remove pathways with None values (no genes were in the pathway)
new_dict = {}
new_dict = {k: v for k, v in dict.iteritems() if v is not None}
# creating list of unique ensembl ids
for a in new_dict.values():
for item in a:
if item in all_pway_genes:
continue
else:
all_pway_genes.append(item)
if args.infile:
with open(args.infile, "rb") as infile, open(out_dir + "/eVIP_out_JSON/JSON_extracted_data.gct",
"w") as JSON_extracted:
matched_ids = []
for line in infile:
if line.startswith("#gene_id"):
JSON_extracted.write(line)
sline = line.split()
for id in all_pway_genes:
if sline[0]== id:
matched_ids.append(id)
JSON_extracted.write("\t".join(sline) + "\n")
if args.zscore_gct:
with open(args.zscore_gct, "rb") as zscores, open(out_dir + "/eVIP_out_JSON/JSON_extracted_data.gct", "w") as JSON_extracted:
matched_ids = []
outlines = []
meta_row_names = []
meta_rows = []
# saving first line
first_line = next(zscores)
# saving second line (#data rows, # data columns, #row metadata fields, #col metadata fields)
second_line = next(zscores)
spl = second_line.split()
ncols = spl[1]
num_meta_lines= int(spl[3])
# saving third line (sample names)
sample_names = next(zscores)
for n in range(num_meta_lines):
# getting first element (row name)
row = (next(zscores).split("\t"))
meta_rows.append(row)
meta_row_names.append(row[0])
for line in zscores:
sline = line.split()
for id in all_pway_genes:
if sline[0]==id:
matched_ids.append(id)
outline = "\t".join(sline) + "\n"
outlines.append(outline)
JSON_extracted.write(first_line)
JSON_extracted.write(str(len(matched_ids)) + "\t" + str(ncols) + "\t" + "0" + "\t" + str(num_meta_lines) + "\n")
JSON_extracted.write(sample_names)
for row in meta_rows:
JSON_extracted.write(str(row[0]) +"\t"+ str("\t".join(row[1:])).strip("\n"))
JSON_extracted.write("\n")
for outline in outlines:
JSON_extracted.write(outline)
def JSON_pway():
pway_dict = {}
with open(args.JSON, "rb") as JSON:
old_dict = json.load(JSON)
#creating new dict to remove pathways with None values (no genes were in the pathway)
pway_dict = {k: v for k, v in old_dict.iteritems() if v is not None}
used_pathways = []
#for each pathway
for pathway, genes in pway_dict.iteritems():
print pathway
pathway_out_file_txt = out_dir + "/" + pathway + "_eVIPP_outputs/" + pathway + "_expression.txt"
p_out_directory = out_dir + "/" + pathway + "_eVIPP_outputs"
mkdir_p(p_out_directory)
pathway_genes = []
for gene in genes:
gene = str(gene)
pathway_genes.append(gene)
#running eVIP when the input is expression data
if args.infile:
with open(out_dir + "/" + pathway + "_eVIPP_outputs/" + pathway + "_expression.txt", "w+") as pathway_out_file, open(args.infile, "r") as all_data:
matched_ids = []
for line in all_data:
if line.startswith("#gene_id"):
pathway_out_file.write(line)
sline = line.split()
for id in pathway_genes:
if sline[0]==id:
matched_ids.append(id)
pathway_out_file.write("\t".join(sline) + "\n")
matched_length = str(len(matched_ids))
ensembl_length = str(len(pathway_genes))
print matched_length + " of " + ensembl_length + " IDs were found in the data. "
percent_matched = (float(len(matched_ids))/(float(len(pathway_genes))))*100
print "percent matched: " + str(percent_matched)
if float(matched_length) > (int(args.min_genes)-1 if args.min_genes else 4):
used_pathways.append(pathway)
else:
print "Not enough matched genes...skipping eVIP for pathway"
# running eVIP when the input is expression data
if args.zscore_gct:
with open(out_dir + "/" + pathway + "_eVIPP_outputs/" + pathway + "_zscores.gct","w+") as pathway_out_file, open(args.zscore_gct, "r") as all_data:
matched_ids = []
outlines = []
meta_row_names = []
meta_rows = []
# saving first line
first_line = next(all_data)
# saving second line (#data rows, # data columns, #row metadata fields, #col metadata fields)
second_line = next(all_data)
spl = second_line.split()
ncols = spl[1]
num_meta_lines = int(spl[3])
# saving third line (sample names)
sample_names = next(all_data)
for n in range(num_meta_lines):
# getting first element (row name)
row = (next(all_data).split("\t"))
meta_rows.append(row)
meta_row_names.append(row[0])
for line in all_data:
line = formatLine(line)
for id in pathway_genes:
if line[0]==id:
matched_ids.append(id)
sline = line.split()
if sline[0] == "id":
header = line
ncol_vals = int(ncols) - 1
for id in pathway_genes:
if sline[0]==id:
matched_ids.append(id)
outline = "\t".join(sline) + "\n"
outlines.append(outline)
pathway_out_file.write(first_line)
pathway_out_file.write(str(len(matched_ids)) + "\t" + str(ncols) + "\t" + "0" + "\t" + str(num_meta_lines) + "\n")
pathway_out_file.write(sample_names)
for row in meta_rows:
pathway_out_file.write(str(row[0]) + "\t" + str("\t".join(row[1:])).strip("\n"))
pathway_out_file.write("\n")
for outline in outlines:
pathway_out_file.write(outline)
matched_length = str(len(matched_ids))
ensembl_length = str(len(pathway_genes))
print matched_length + " of " + ensembl_length + " IDs were found in the data. "
percent_matched = (float(len(matched_ids))/(float(len(pathway_genes))))*100
print "percent matched: " + str(percent_matched)
if float(matched_length) > (int(args.min_genes)-1 if args.min_genes else 4):
used_pathways.append(pathway)
else:
print "Less than 2 matched genes...skipping eVIP for pathway"
return pway_dict, used_pathways
def JSON_eVIP(used_pathways):
pways_mut_wt_rep_pvals_from_compare = []
pways_mut_wt_conn_null_pvals_from_compare = []
pways_wt_mut_rep_vs_wt_mut_conn_pvals_from_compare = []
for pathway in used_pathways:
pathway_out_file_txt = out_dir + "/" + pathway + "_eVIPP_outputs/" + pathway + "_expression.txt"
z_pathway_out_file_txt = out_dir + "/" + pathway + "_eVIPP_outputs/" + pathway + "_zscores.gct"
p_out_directory = out_dir + "/" + pathway + "_eVIPP_outputs"
mkdir_p(p_out_directory)
if args.infile:
with open(out_dir + "/" + pathway + "_eVIPP_outputs/" + pathway + "_expression.txt","r") as pathway_out_file:
if args.use_c_pval:
print "Running with multiple testing..."
# running eVIP corr and compare to get pvals
mut_wt_rep_pvals_from_compare, mut_wt_conn_null_pvals_from_compare, wt_mut_rep_vs_wt_mut_conn_pvals_from_compare,num_test_alleles \
= run_eVIP_multiple_testing_pt1(pathway_out_file_txt, None, p_out_directory, args.sig_info, args.c, args.r, args.num_reps,
args.ie_filter, args.ie_col, args.i, args.allele_col, args.conn_null, args.conn_thresh,
args.mut_wt_rep_rank_diff, args.use_c_pval, args.cell_id, args.plate_id, args.ref_allele_mode,
args.x_thresh, args.y_thresh, args.annotate, args.by_gene_color, args.pdf, args.xmin,
args.xmax, args.ymin, args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
# appending values from each pathway to one list
pways_mut_wt_rep_pvals_from_compare.append(mut_wt_rep_pvals_from_compare)
pways_mut_wt_conn_null_pvals_from_compare.append(mut_wt_conn_null_pvals_from_compare)
pways_wt_mut_rep_vs_wt_mut_conn_pvals_from_compare.append(wt_mut_rep_vs_wt_mut_conn_pvals_from_compare)
# if the pathway lists are done being collected
if len(pways_mut_wt_rep_pvals_from_compare) == len(used_pathways):
eVIPP_mut_wt_rep_pvals = padj(pways_mut_wt_rep_pvals_from_compare, num_test_alleles, used_pathways)
if len(pways_mut_wt_conn_null_pvals_from_compare) == len(used_pathways):
eVIPP_pways_mut_wt_conn_null_pvals = padj(pways_mut_wt_conn_null_pvals_from_compare,num_test_alleles, used_pathways)
if len(pways_wt_mut_rep_vs_wt_mut_conn_pvals_from_compare) == len(used_pathways):
eVIPP_pways_wt_mut_rep_vs_wt_mut_conn_pvals = padj(pways_wt_mut_rep_vs_wt_mut_conn_pvals_from_compare,num_test_alleles, used_pathways)
#adding the new calculated eVIPP pathways to a new compare file
make_adj_compare_file(used_pathways, out_dir, eVIPP_mut_wt_rep_pvals,eVIPP_pways_mut_wt_conn_null_pvals,eVIPP_pways_wt_mut_rep_vs_wt_mut_conn_pvals)
for pathway in used_pathways:
print "pathway running pt2 of eVIP"
print pathway
pathway_out_file_txt = out_dir + "/" + pathway + "_expression.txt"
p_out_directory = out_dir + "/" + pathway + "_eVIPP_outputs"
#getting predictions and finisihin eVIP with the new corrected values
run_eVIP_multiple_testing_pt2(pathway_out_file_txt, None, p_out_directory, args.sig_info, args.c,
args.r, args.num_reps, args.ie_filter, args.ie_col, args.i, args.allele_col,
args.conn_null, args.conn_thresh,args.mut_wt_rep_rank_diff, args.use_c_pval,
args.cell_id,args.plate_id, args.ref_allele_mode,args.x_thresh, args.y_thresh,
args.annotate, args.by_gene_color,args.pdf, args.xmin,args.xmax, args.ymin,
args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
else:
run_eVIP(pathway_out_file_txt, None, p_out_directory, args.sig_info, args.c, args.r,
args.num_reps,
args.ie_filter, args.ie_col, args.i, args.allele_col, args.conn_null, args.conn_thresh,
args.mut_wt_rep_rank_diff, args.use_c_pval, args.cell_id, args.plate_id,
args.ref_allele_mode,
args.x_thresh, args.y_thresh, args.annotate, args.by_gene_color, args.pdf, args.xmin,
args.xmax, args.ymin, args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
if args.zscore_gct:
with open(out_dir + "/" + pathway + "_eVIPP_outputs/" + pathway + "_zscores.gct","r") as pathway_out_file:
print pathway
if args.use_c_pval:
print "Running with multiple testing..."
# running eVIP corr and compare to get c pvals
mut_wt_rep_pvals_from_compare, mut_wt_conn_null_pvals_from_compare, wt_mut_rep_vs_wt_mut_conn_pvals_from_compare, num_test_alleles \
= run_eVIP_multiple_testing_pt1(None, z_pathway_out_file_txt, p_out_directory, args.sig_info,
args.c, args.r, args.num_reps,
args.ie_filter, args.ie_col, args.i, args.allele_col,
args.conn_null, args.conn_thresh,
args.mut_wt_rep_rank_diff, args.use_c_pval, args.cell_id,
args.plate_id, args.ref_allele_mode,
args.x_thresh, args.y_thresh, args.annotate, args.by_gene_color,
args.pdf, args.xmin,
args.xmax, args.ymin, args.ymax, args.viz_ymin, args.viz_ymax,
args.corr_val)
# appending values from each pathway to one list
pways_mut_wt_rep_pvals_from_compare.append(mut_wt_rep_pvals_from_compare)
pways_mut_wt_conn_null_pvals_from_compare.append(mut_wt_conn_null_pvals_from_compare)
pways_wt_mut_rep_vs_wt_mut_conn_pvals_from_compare.append(
wt_mut_rep_vs_wt_mut_conn_pvals_from_compare)
# if the pathway lists are done being collected
if len(pways_mut_wt_rep_pvals_from_compare) == len(used_pathways):
eVIPP_mut_wt_rep_pvals = padj(pways_mut_wt_rep_pvals_from_compare, num_test_alleles, used_pathways)
if len(pways_mut_wt_conn_null_pvals_from_compare) == len(used_pathways):
eVIPP_pways_mut_wt_conn_null_pvals = padj(pways_mut_wt_conn_null_pvals_from_compare,num_test_alleles, used_pathways)
if len(pways_wt_mut_rep_vs_wt_mut_conn_pvals_from_compare) == len(used_pathways):
eVIPP_pways_wt_mut_rep_vs_wt_mut_conn_pvals = padj(pways_wt_mut_rep_vs_wt_mut_conn_pvals_from_compare,num_test_alleles, used_pathways)
#adding the new calculated eVIPP pathways to a new compare file
make_adj_compare_file(used_pathways, out_dir, eVIPP_mut_wt_rep_pvals,eVIPP_pways_mut_wt_conn_null_pvals,eVIPP_pways_wt_mut_rep_vs_wt_mut_conn_pvals)
for pathway in used_pathways:
print "pathway running pt2"
print pathway
pathway_out_file_txt = out_dir + "/" + pathway + "_expression.txt"
p_out_directory = out_dir + "/" + pathway + "_eVIPP_outputs"
# getting predictions and finisihin eVIP with the new corrected values
run_eVIP_multiple_testing_pt2(None, z_pathway_out_file_txt, p_out_directory, args.sig_info,
args.c, args.r, args.num_reps, args.ie_filter, args.ie_col, args.i,
args.allele_col,args.conn_null, args.conn_thresh, args.mut_wt_rep_rank_diff,
args.use_c_pval,args.cell_id, args.plate_id, args.ref_allele_mode, args.x_thresh,
args.y_thresh,args.annotate, args.by_gene_color, args.pdf, args.xmin, args.xmax,
args.ymin, args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
else:
run_eVIP(None, z_pathway_out_file_txt, p_out_directory, args.sig_info, args.c, args.r,
args.num_reps,
args.ie_filter, args.ie_col, args.i, args.allele_col, args.conn_null, args.conn_thresh,
args.mut_wt_rep_rank_diff, args.use_c_pval, args.cell_id, args.plate_id,
args.ref_allele_mode,
args.x_thresh, args.y_thresh, args.annotate, args.by_gene_color, args.pdf, args.xmin,
args.xmax, args.ymin, args.ymax, args.viz_ymin, args.viz_ymax, args.corr_val)
return(used_pathways)
#################
# END FUNCTIONS #
#################
if __name__ == "__main__": main()