-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmutexnetwork.py
940 lines (676 loc) · 34.8 KB
/
mutexnetwork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
__author__ = 'jlu96'
import mutex
from comet import permute
import time
import mutex as mex
import mutex_triangles as met
import csv
import random
import os
import parallel_compute_working as pac
import numpy as np
import math
import line_profiler
class TwoWayDict(dict):
def __setitem__(self, key, value):
# Remove any previous connections with these values
if key in self:
del self[key]
if value in self:
del self[value]
dict.__setitem__(self, key, value)
dict.__setitem__(self, value, key)
def __delitem__(self, key):
dict.__delitem__(self, self[key])
dict.__delitem__(self, key)
def __len__(self):
"""Returns the number of connections"""
return dict.__len__(self) // 2
class PermutationMatrix:
def __init__(self, geneToCases, patientToGenes):
# Each is (gene, ID, sum)
self.geneToCases = geneToCases
self.patientToGenes = patientToGenes
self.genes = geneToCases.keys()
self.patients = patientToGenes.keys()
self.numGenes = len(geneToCases)
self.numCases = len(patientToGenes)
self.gene_ID = TwoWayDict()
self.patient_ID = TwoWayDict()
self.gene_ID_sums = {}
self.patient_ID_sums = {}
# Each entry is (gene_ID, patient_ID)
self.ones = set()
self.zeros = set()
ID = 0
for gene in geneToCases:
self.gene_ID[gene] = ID
sum = len(geneToCases[gene])
self.gene_ID_sums[ID] = sum
ID += 1
ID = 0
for patient in patientToGenes:
self.patient_ID[patient] = ID
sum = len(patientToGenes[patient])
self.patient_ID_sums[ID] = sum
ID += 1
#@profile
def permute_data(self):
# Take gene IDs
binary_numbers = self.random_binary_numbers()
conformed_binary_numbers = self.conform_binary_numbers(binary_numbers)
self.update_gene_patient_dicts(conformed_binary_numbers)
# FUNCTIONS TO GENERATE THE RANDOM MATRIX---------------------------------------------------------------------------
##@profile
def random_binary_matrix_ones_zeros(self):
new_zeros = set()
new_ones = set()
# Generate random binary matrix for each gene ID
for g in self.gene_ID_sums:
sum = self.gene_ID_sums[g]
binary_array = random_binary_array(sum, self.numCases)
for p in range(len(binary_array)):
if binary_array[p]:
new_ones.add((g,p))
else:
new_zeros.add((g,p))
return new_ones, new_zeros
##@profile
def random_binary_matrix(self):
binary_matrix = []
for g in self.gene_ID_sums:
sum = self.gene_ID_sums[g]
binary_array = random_binary_array(sum, self.numCases)
binary_matrix.append(binary_array)
return binary_matrix
#@profile
def random_binary_numbers(self):
binary_numbers = []
for g in self.gene_ID_sums:
g_sum = self.gene_ID_sums[g]
binary_array = random_binary_array(g_sum, self.numCases)
binary_number = 0
for i in binary_array:
binary_number += i
binary_number <<= 1
binary_number >>= 1
binary_numbers.append(binary_number)
return binary_numbers
# FUNCTIONS TO CONFORM THE RANDOM MATRIX----------------------------------------------------------------------------
#@profile
def conform_binary_numbers(self, binary_numbers):
for p in range(self.numCases):
conform_sum = self.patient_ID_sums[p]
p_mask = 1 << (self.numCases - p - 1)
p_sum = 0
p_ones = []
p_zeros = []
for g in range(self.numGenes):
# print bin(binary_numbers[g])
# print bin(p_mask)
if (binary_numbers[g] & p_mask):
p_sum += 1
p_ones.append(g)
else:
p_zeros.append(g)
ones_to_add = conform_sum - len(p_ones)
if ones_to_add > 0:
# Shuffle the rows to search
random.shuffle(p_zeros)
# Make a new p_mask to look at later columns
latter_mask = p_mask
# print
for i in range(self.numCases - p):
latter_mask >>= 1
# print "####################################################################################"
# print "BEGINNING!!!!-----------------------"
for g in p_zeros:
# If the later column has a 1, switch that column to 0 and p_column to 1
if (binary_numbers[g] & latter_mask):
binary_numbers[g] -= latter_mask
binary_numbers[g] += p_mask
p_zeros.remove(g)
# print "SWITCH HAPPENING FOR # ", g
# orig = binary_numbers[g] - p_mask + latter_mask
# p_array = binary_number_to_array(p_mask, self.numCases)
# latter_array = binary_number_to_array(latter_mask, self.numCases)
# orig_array = binary_number_to_array(orig, self.numCases)
# new_array = binary_number_to_array(binary_numbers[g], self.numCases)
#
# for index in range(self.numCases):
# if p_array[index]:
# print "p_array ", index
# if latter_array[index]:
# print "latter_array ", index
# if orig_array[index]:
# print "orig_array ", index
# if new_array[index]:
# print "new_array ", index
#
if sum(binary_number_to_array(binary_numbers[g], self.numCases)) != self.gene_ID_sums[g]:
print "in ones to add"
print "Sum violated for column ", i
# print bin(p_mask), bin(latter_mask), bin(binary_numbers[g]), self.gene_ID_sums[g]
# print p_mask
# orig = binary_numbers[g] - p_mask + latter_mask
# p_array = binary_number_to_array(p_mask, self.numCases)
# latter_array = binary_number_to_array(latter_mask, self.numCases)
# orig_array = binary_number_to_array(orig, self.numCases)
# new_array = binary_number_to_array(binary_numbers[g], self.numCases)
#
# for i in range(self.numCases):
# if p_array[i]:
# print "p_array ", i
# if latter_array[i]:
# print "latter_array ", i
# if orig_array[i]:
# print "orig_array ", i
# if new_array[i]:
# print "new_array ", i
# p_digit = math.log(p_mask, 2)
#
# latter_dig = math.log(latter_mask, 2)
# orig_dig = math.log(orig, 2)
# now_dig = math.log(binary_numbers[g], 2)
# print "p_digit", p_digit
# print "latter_dig", latter_dig
# print "orig_dig", orig_dig
# print "now_dig", now_dig
exit(1)
ones_to_add -= 1
if ones_to_add == 0:
break
if ones_to_add == 0:
break
if ones_to_add != 0:
print "Error: ", ones_to_add, " unfinished added ones"
exit(1)
elif ones_to_add < 0:
zeros_to_add = -ones_to_add
# Shuffle the rows to search
random.shuffle(p_ones)
# Make a new mask to look at later columns
latter_mask = p_mask
for i in range(self.numCases - p):
latter_mask >>= 1
for g in p_ones:
# If the later column has a 0, switch that column to 1 and p_column to 0
if not (binary_numbers[g] & latter_mask):
binary_numbers[g] += latter_mask
binary_numbers[g] -= p_mask
# This one can no longer be switched
p_ones.remove(g)
if sum(binary_number_to_array(binary_numbers[g], self.numCases)) != self.gene_ID_sums[g]:
print "Sum violated for column ", i
print bin(p_mask), bin(latter_mask), bin(binary_numbers[g]), self.gene_ID_sums[g]
exit(1)
zeros_to_add -= 1
if zeros_to_add == 0:
break
if zeros_to_add == 0:
break
if zeros_to_add != 0:
print "Error: ", zeros_to_add, " unfinished added zeros"
exit(1)
return binary_numbers
#@profile
def update_gene_patient_dicts(self, binary_numbers):
self.geneToCases = {}
self.patientToGenes = {}
for g in range(len(binary_numbers)):
binary_array = binary_number_to_array(binary_numbers[g], self.numCases)
for p in range(len(binary_array)):
if binary_array[p]:
gene = self.gene_ID[g]
patient = self.patient_ID[p]
if gene not in self.geneToCases:
self.geneToCases[gene] = set()
if patient not in self.patientToGenes:
self.patientToGenes[patient] = set()
self.geneToCases[gene].add(patient)
self.patientToGenes[patient].add(gene)
def getGenePatientDicts(self):
return self.geneToCases, self.patientToGenes
def test_conformity(self):
"""
Test if the current dictionaries satisfy the gene row sums and patient column sums
"""
conform_gene_sums = [self.gene_ID_sums[g] for g in range(self.numGenes)]
conform_patient_sums = [self.patient_ID_sums[p] for p in range(self.numCases)]
for g in range(self.numGenes):
gene = self.gene_ID[g]
g_sum = len(self.geneToCases[gene])
if g_sum != conform_gene_sums[g]:
print "Sum for g ", g, " is ", g_sum, ", not ", conform_gene_sums[g]
return False
for p in range(self.numCases):
patient = self.patient_ID[p]
p_sum = len(self.patientToGenes[patient])
if p_sum != conform_patient_sums[p]:
print "Sum for patient ", self.patient_ID[p], " is ", p_sum, ", not ", conform_patient_sums[p]
return False
return True
##@profile
def conform_binary_matrix_ones_zeros(self, ones, zeros):
"""
Make the binary matrix fit the given patient column sums.
"""
decided_ones, decided_zeros = set(), set()
for p in range(self.numCases):
# Get the ones and zeros of these patients
p_ones = set([entry for entry in ones if entry[1] == p])
ones = ones.difference(p_ones)
p_zeros = set([entry for entry in zeros if entry[1] == p])
zeros = zeros.difference(p_zeros)
ones_to_add = self.patient_ID_sums[p] - len(p_ones)
# If ones are missing from the patient column, switch some number of the zeros with a one, and
# correspondingly switch some number of ones with zeros in the other
if ones_to_add > 0:
new_p_ones = random.sample(p_zeros, ones_to_add)
new_zeros = random.sample(ones, ones_to_add)
p_ones = p_ones.union(new_p_ones)
p_zeros = p_zeros.difference(new_p_ones)
ones = ones.difference(new_zeros)
zeros = zeros.union(new_zeros)
elif ones_to_add < 0:
zeros_to_add = -ones_to_add
new_p_zeros = random.sample(p_ones, zeros_to_add)
new_ones = random.sample(zeros, zeros_to_add)
p_ones = p_ones.difference(new_p_zeros)
p_zeros = p_zeros.union(new_p_zeros)
ones = ones.union(new_ones)
zeros = zeros.difference(new_ones)
else:
pass
decided_ones = decided_ones.union(p_ones)
decided_zeros = decided_zeros.union(p_zeros)
return decided_ones, decided_zeros
def getGenePatientDicts_ones_zeros(self):
geneToCases = {}
patientToGenes = {}
for g,p in self.ones:
gene = self.gene_ID[g]
patient = self.patient_ID[p]
if gene not in geneToCases:
geneToCases[gene] = set()
geneToCases[gene].add(patient)
if patient not in patientToGenes:
patientToGenes[patient] = set()
patientToGenes[patient].add(gene)
return geneToCases, patientToGenes
##@profile
def test_conformity_ones_zeros(self):
"""
Test if the current matrix satisfy the gene row sums and patient column sums
"""
conform_gene_sums = [self.gene_ID_sums[g] for g in range(self.numGenes)]
conform_patient_sums = [self.patient_ID_sums[p] for p in range(self.numCases)]
for g in range(self.numGenes):
g_sum = sum([1 for entry in self.ones if entry[0] == g])
if g_sum != conform_gene_sums[g]:
print "Sum for gene ", self.gene_ID[g], " is ", g_sum, ", not ", conform_gene_sums[g]
return False
for p in range(self.numCases):
p_sum = sum([1 for entry in self.ones if entry[1] == p])
if p_sum != conform_patient_sums[p]:
print "Sum for patient ", self.patient_ID[p], " is ", p_sum, ", not ", conform_patient_sums[p]
return False
return True
def random_binary_array(num_ones, length):
random_row = [1] * num_ones + [0] * (length - num_ones)
random.shuffle(random_row)
return random_row
def binary_array_to_number(array):
string = "".join([str(i) for i in array])
return int(string, 2)
def binary_number_to_array(number, length):
bin_string = str(bin(number))[2:]
leading_zeros = [0] * (length - len(bin_string))
return leading_zeros + [int(i) for i in bin_string]
# Matrix class that takes input matrix, holds all relevant matrices
class PermutationMatrices:
# generate matrices
#@profile
def __init__(self, geneToCases, patientToGenes, num_permutations, seeds=[], Q=100, matrixdirectory=None, binary_perm_method=False,
write_matrices=False, load_directory=None, geneFile=None, patientFile=None, minFreq=0):
if not seeds:
for i in range(num_permutations):
seeds.append(random.random())
self.seeds = seeds
self.genes = geneToCases.keys()
self.patients = patientToGenes.keys()
self.num_permutations = num_permutations
# The original matrix
self.geneToCases_orig = geneToCases
self.patientToGenes_orig = patientToGenes
self.numCases = len(self.geneToCases_orig)
self.numGenes = len(self.patientToGenes_orig)
# A dictionary that holds each gene to a list of sets of cases. Each entry in the list is the gene's cases
# for one of the permutations
self.geneToCases_perm = {}
for gene in self.genes:
self.geneToCases_perm[gene] = []
# Same as above, but from patient to genes.
self.patientToGenes_perm = {}
for patient in self.patients:
self.patientToGenes_perm[patient] = []
#-jlu
if load_directory:
self.load_matrices(load_directory, geneFile, patientFile, minFreq)
else:
self.generate_matrices(geneToCases, patientToGenes, Q, matrixdirectory, binary_perm_method, write_matrices)
def generate_matrices(self, geneToCases, patientToGenes, Q=100, matrixdirectory=None, binary_perm_method=False,
write_matrices=False):
t_start = time.time()
# Generate output directory to write matrix
if not os.path.exists(os.path.dirname(matrixdirectory)) and write_matrices:
os.makedirs(os.path.dirname(matrixdirectory))
if binary_perm_method:
PM = PermutationMatrix(geneToCases, patientToGenes)
print "Using matrix permutation method"
else:
G = permute.construct_mutation_graph(geneToCases, patientToGenes)
print '\t- Graph has', len( G.edges() ), 'edges among', len( G.nodes() ), 'nodes.'
for i in range(self.num_permutations):
t = time.time()
if binary_perm_method:
PM.permute_data()
if not PM.test_conformity():
print "Permuted matrix failed test"
exit(1)
newGeneToCases, newPatientToGenes = PM.getGenePatientDicts()
print i + 1, " matrices generated in ", time.time() - t
print " number of same alterations is ", sum([len(patientToGenes[patient].intersection(newPatientToGenes[patient])) for patient in self.patients])
# Matrix method---------------------------------------------------------------------
else:
_, _, _, _, newGeneToCases, newPatientToGenes = permute.permute_mutation_data(G, self.genes, self.patients, self.seeds[i], Q)
print i + 1, " matrices generated in ", time.time() - t
print "For Q=", Q, " number of same alterations is ", sum([len(patientToGenes[patient].intersection(newPatientToGenes[patient])) for patient in self.patients])
# Old graph method------------------------------------------------------------------
# Old graph method------------------------------------------------------------------
for gene in newGeneToCases:
self.geneToCases_perm[gene].append(newGeneToCases[gene])
for patient in newPatientToGenes:
self.patientToGenes_perm[patient].append(newPatientToGenes[patient])
# Write permuted matrices out
if write_matrices and matrixdirectory:
adj_list = [ p + "\t" + "\t".join( sorted(newPatientToGenes[p]) ) for p in self.patients ]
permutation_file = "{}/permuted-matrix-{}.m2".format(matrixdirectory, i+1)
with open(permutation_file, 'w') as outfile: outfile.write('\n'.join(adj_list))
# Clear dictionaries from stack once they're not used
newGeneToCases.clear()
newPatientToGenes.clear()
print "Time to generate mutation matrices ", time.time() - t_start
def load_matrices(self, load_directory, geneFile=None, patientFile=None, minFreq=0):
matrices = os.listdir(load_directory)
for file in matrices:
_, _, newGenes, newPatients, newGeneToCases, newPatientToGenes = mex.load_mutation_data(load_directory + '/' + file, geneFile=geneFile,
patientFile=patientFile, minFreq=minFreq)
# check for differences in loaded genes/patient
if set.difference(set(newGenes), set(self.genes)):
print "Error: loaded genes different from original matrix"
print "in file ", file, " in matrix directory ", load_directory
exit(1)
if set.difference(set(newPatients), set(self.patients)):
print "Error: loaded genes different from original matrix"
print "in file ", file, " in matrix directory ", load_directory
exit(1)
# load the new ones in
for gene in newGeneToCases:
self.geneToCases_perm[gene].append(newGeneToCases[gene])
for patient in newPatientToGenes:
self.patientToGenes_perm[patient].append(newPatientToGenes[patient])
print "Number of loaded matrices: ", len(matrices)
self.num_permutations = len(matrices)
return
# # Method to iterate over its matrices. Return value: number of satisfying matrices
# def pvalue(self, condition_function):
#
# num_satisfy = 0
#
# for i in range(self.num_permutations):
# geneToCases = self.geneToCases_perm.copy()
# for gene in geneToCases:
# geneToCases[gene] = self.geneToCases_perm[gene][i]
#
#
# if condition_function.apply(geneToCases):
# num_satisfy += 1
#
# return num_satisfy * 1.0 / self.num_permutations
def set_to_pvalue(self, set_condition_function_list):
set_to_pvalue = {}
for set, condition_function in set_condition_function_list:
set_to_pvalue[set] = 0
for i in range(self.num_permutations):
geneToCases = {}
for gene in self.geneToCases_perm:
geneToCases[gene] = self.geneToCases_perm[gene][i]
for set, condition_function in set_condition_function_list:
if condition_function.apply(geneToCases):
set_to_pvalue[set] += 1
for set in set_to_pvalue:
set_to_pvalue[set] /= (1.0 * self.num_permutations)
return set_to_pvalue
def complete_cooccurpairs(self, genepairs, p=0.05, minCooccur=1, min_cooccurrence_ratio=0.0, parallel_compute_number=0,
compute_scores=True):
"""
:param genepairs:
:param cprob:
:param minCooccur:
:param min_cooccurrence_ratio:
:param parallel_compute_number:
:return: cpairsdict, cgenedict
"""
print "Generating list of", len(genepairs), " co-occurring hypotheses to test on permutation matrices..."
# Generate condition functions after analyzing each gene pair for Co-occurrence/min
cooccur_set_condition_function_list = []
# Generate list of condition functions to test the permutation matrix for
for genepair in genepairs:
ConditionFunction = Condition(None)
condition_dict = {}
condition_dict['Genes'] = tuple(genepair)
condition_dict['Overlap'] = len(set.intersection(*[self.geneToCases_orig[gene] for gene in condition_dict['Genes']]))
condition_dict['Mutex'] = False
ConditionFunction.set_params([condition_dict])
cooccur_set_condition_function_list.append((genepair, ConditionFunction))
print "Done. Now, calulating p-values of hypotheses..."
# Generate co-occurring pairs
if parallel_compute_number:
cooccur_pair_to_pvalue = pac.parallel_compute_new(self.set_to_pvalue, [cooccur_set_condition_function_list],
cooccur_set_condition_function_list, 0, pac.partition_inputs, {0: pac.combine_dictionaries},
number=parallel_compute_number,
procnumber=parallel_compute_number)
else:
cooccur_pair_to_pvalue = self.set_to_pvalue(cooccur_set_condition_function_list)
print "Done. Now, finding co-occurring pairs"
# Generate dictionary for each pair. Optionally analyze each cooccur set as well.
cpairsdict = {}
cgenedict = {}
for genepair in cooccur_pair_to_pvalue:
if cooccur_pair_to_pvalue[genepair] < p:
cstats = mex.analyze_cooccur_set_new(self.numCases, self.geneToCases_orig, self.patientToGenes_orig,
geneset=tuple(genepair), compute_scores=compute_scores)
if cstats['Overlap'] >= minCooccur and cstats['CooccurrenceRatio'] >= min_cooccurrence_ratio:
cstats['PermutationProbability'] = cooccur_pair_to_pvalue[genepair]
cpairsdict[genepair] = cstats
gene1, gene2 = tuple(genepair)
if gene1 not in cgenedict:
cgenedict[gene1] = set()
cgenedict[gene1].add(gene2)
else:
cgenedict[gene1].add(gene2)
if gene2 not in cgenedict:
cgenedict[gene2] = set()
cgenedict[gene2].add(gene1)
else:
cgenedict[gene2].add(gene1)
return cpairsdict, cgenedict
def complete_mutexpairs(self, genepairs, p=0.05, maxOverlap=200, parallel_compute_number=0):
print "Generating list of", len(genepairs), " mutually exclusive hypotheses to test on permutation matrices..."
# Generate condition functions after analyzing each gene pair for Co-occurrence/min
mutex_set_condition_function_list = []
# Generate list of condition functions to test the permutation matrix for
for genepair in genepairs:
ConditionFunction = Condition(None)
condition_dict = {}
condition_dict['Genes'] = tuple(genepair)
condition_dict['Overlap'] = len(set.intersection(*[self.geneToCases_orig[gene] for gene in condition_dict['Genes']]))
condition_dict['Mutex'] = True
ConditionFunction.set_params([condition_dict])
mutex_set_condition_function_list.append((genepair, ConditionFunction))
print "Done. Now, calulating p-values of hypotheses..."
# Generate co-occurring pairs
if parallel_compute_number:
mutex_pair_to_pvalue = pac.parallel_compute_new(self.set_to_pvalue, [mutex_set_condition_function_list],
mutex_set_condition_function_list, 0, pac.partition_inputs, {0: pac.combine_dictionaries},
number=parallel_compute_number,
procnumber=parallel_compute_number)
else:
mutex_pair_to_pvalue = self.set_to_pvalue(mutex_set_condition_function_list)
print "Done. Now, finding mutually exclusive pairs"
# Generate dictionary for each pair. Optionally analyze each mutex set as well.
mpairsdict = {}
mgenedict = {}
for genepair in mutex_pair_to_pvalue:
if mutex_pair_to_pvalue[genepair] < p:
mstats = mex.analyze_mutex_set_new(self.numCases, self.geneToCases_orig, self.patientToGenes_orig,
geneset=tuple(genepair))
if mstats['Overlap'] <= maxOverlap:
mstats['PermutationProbability'] = mutex_pair_to_pvalue[genepair]
mpairsdict[genepair] = mstats
gene1, gene2 = tuple(genepair)
if gene1 not in mgenedict:
mgenedict[gene1] = set()
mgenedict[gene1].add(gene2)
else:
mgenedict[gene1].add(gene2)
if gene2 not in mgenedict:
mgenedict[gene2] = set()
mgenedict[gene2].add(gene1)
else:
mgenedict[gene2].add(gene1)
return mpairsdict, mgenedict
# A class of test functions for each pair, to asses for mutual exclusivity/co-occurrence. Used for the permutation
# matrices.
class Condition:
def __init__(self, condition_dicts):
self.set_params(condition_dicts)
def set_params(self, condition_dicts):
self.conditions = condition_dicts
def apply(self, geneToCases):
"""
:param geneToCases: Input matrix
:return: Whether the input matrix satisfies all the conditions.
"""
satisfies = True
for condition in self.conditions:
genes = condition['Genes']
overlap = condition['Overlap']
mutex = condition['Mutex']
found_overlap = len(set.intersection(*[geneToCases[gene] for gene in genes]))
satisfies = satisfies and (found_overlap <= overlap if mutex else found_overlap >= overlap)
# print "Genes are ", genes, " found overlap is ", found_overlap, " overlap is ", overlap
# if found_overlap != overlap:
# print "Genes are ", genes, " found overlap is ", found_overlap, " overlap is ", overlap
return satisfies
def main():
mutationmatrix = '/Users/jlu96/maf/new/PRAD_broad/PRAD_broad-som.m2'
patientFile = None #'/Users/jlu96/maf/new/PRAD_broad/shared_patients.plst'
geneFile = None #'/Users/jlu96/conte/jlu/REQUIREDFILES_OnlyLoss2/COSMICGenes_OnlyLoss.txt'
load_directory = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LoadMatrices'
minFreq = 0
num_permutations = 20
binary_perm_method = False
Q = 100
write_matrices = True
matrixdirectory = '/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/LoadMatrices'
#'/Users/jlu96/conte/jlu/Analyses/CooccurImprovement/SARC_broad-som-jl-' + ('matrix' if binary_perm_method else 'network')
outmutexfile = matrixdirectory + '/mutex' + str(num_permutations) + str(time.time()) + '.tsv'
outcooccurfile = matrixdirectory + '/cooccur' + str(num_permutations) + str(time.time()) + '.tsv'
outseedsfile = matrixdirectory + '/seeds' + str(time.time()) + '.tsv'
if not os.path.exists(os.path.dirname(matrixdirectory)):
os.makedirs(os.path.dirname(matrixdirectory))
numGenes, numCases, genes, patients, geneToCases, patientToGenes = mex.load_mutation_data(mutationmatrix, patientFile, geneFile, minFreq)
print "numGenes ", numGenes, " and numCases ", numCases
for patient in patients:
if not patientToGenes[patient]:
patientToGenes.pop(patient)
print patient, "popped"
# Generate Permutation Matrices
pm = PermutationMatrices(geneToCases, patientToGenes, num_permutations, Q=Q, matrixdirectory=matrixdirectory,
binary_perm_method=binary_perm_method, write_matrices=write_matrices, load_directory=load_directory,
geneFile=geneFile, patientFile=patientFile, minFreq=minFreq)
# Make list of pairs from highly mutated genes
test_genes = [gene for gene in genes if len(geneToCases[gene]) > 5]
# for test_gene in test_genes:
# print test_gene
genepairs = met.getgenepairs(geneToCases, test_genes)
print "Number of pairs to test ", len(genepairs)
# CALCULATE MUTEX
# Create a list of ConditionFunctions that you must later initialize...
ConditionFunctions = range(len(genepairs))
mutex_set_condition_function_list = []
# Generate set_condition_function_list
for i in range(len(genepairs)):
genepair = genepairs[i]
condition_dict = {}
condition_dict['Genes'] = tuple(genepair)
condition_dict['Overlap'] = len(set.intersection(*[geneToCases[gene] for gene in condition_dict['Genes']]))
condition_dict['Mutex'] = True
ConditionFunctions[i] = Condition([condition_dict])
if [condition_dict] != ConditionFunctions[i].conditions:
print condition_dict, ConditionFunctions[i].conditions
mutex_set_condition_function_list.append((genepair, ConditionFunctions[i]))
print "Finished mutex condition function list"
t= time.time()
# Calculate pvalues for mutual exclusivity
pair_to_mutex = {}
pair_to_mutex_network_pvalue = pm.set_to_pvalue(mutex_set_condition_function_list)
print "mutex pair network pvalues finished in ", time.time() - t
for genepair in genepairs:
pair_to_mutex[genepair] = mex.analyze_mutex_set_new(numCases, geneToCases, patientToGenes, genepair)
pair_to_mutex[genepair]['NetworkProbability'] = pair_to_mutex_network_pvalue[genepair]
# Write to output
with open(outmutexfile, 'w') as csvfile:
fieldnames = pair_to_mutex[genepairs[0]].keys()
writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=fieldnames)
writer.writeheader()
for genepair in pair_to_mutex:
writer.writerow(pair_to_mutex[genepair])
# CALCULATE COOCCUR
cooccur_set_condition_function_list = []
# Generate set_condition_function_list
for genepair in genepairs:
ConditionFunction = Condition(None)
condition_dict = {}
condition_dict['Genes'] = tuple(genepair)
condition_dict['Overlap'] = len(set.intersection(*[geneToCases[gene] for gene in condition_dict['Genes']]))
condition_dict['Mutex'] = False
ConditionFunction.set_params([condition_dict])
cooccur_set_condition_function_list.append((genepair, ConditionFunction))
t= time.time()
# Calculate pvalues for mutual exclusivity
pair_to_cooccur = {}
pair_to_cooccur_network_pvalue = pm.set_to_pvalue(cooccur_set_condition_function_list)
print "cooccur pair network pvalues finished in ", time.time() - t
for genepair in genepairs:
pair_to_cooccur[genepair] = mex.analyze_cooccur_set_new(numCases, geneToCases, patientToGenes, genepair)
pair_to_cooccur[genepair]['NetworkProbability'] = pair_to_cooccur_network_pvalue[genepair]
# Write to output
with open(outcooccurfile, 'w') as csvfile:
fieldnames = pair_to_cooccur[genepairs[0]].keys()
writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=fieldnames)
writer.writeheader()
for genepair in pair_to_cooccur:
writer.writerow(pair_to_cooccur[genepair])
# Write seeds to output
with open(outseedsfile, 'w') as csvfile:
writer = csv.writer(csvfile, delimiter='\t')
for seed in pm.seeds:
writer.writerow([seed])
if __name__ == '__main__':
main()
# and check for certain gene overlaps (certain conditions. Return list of tuples?)
# Method to return pvalue of mutexprob. Input: genes and overlap
# Ultimately want this to work with sample-specific permutations...?