-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscorecooccur.py
171 lines (132 loc) · 6.27 KB
/
scorecooccur.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
__author__ = 'jlu96'
import csv
import operator
from scipy.misc import comb
from scipy.misc import factorial
import math
def load_gene_lengths(filename):
geneToLength = {}
with open(filename, 'rU') as geneToLengthFile:
lengthreader = csv.reader(geneToLengthFile, delimiter='\t')
for row in lengthreader:
geneToLength[row[0]] = eval(row[1])
return geneToLength
# def score_cooccur_set(genes, geneToCases, patientToGenes):
# # Find all the patients that have all the genes cooccurring
# patients = set.intersection(*[set(geneToCases[gene]) for gene in genes])
#
# # print patients
# # print [len(patientToGenes[patient]) for patient in patients]
#
# patient_scores = set()
#
# # For each patient,
# for patient in patients:
# # Get the number of mutations.
# patient_mutation_num = len(patientToGenes[patient])
#
# # Calculate a proxy for the probability of cooccurrence: Product of the k gene lengths * nPk
# patient_score = patient_cooccur_score(patient_mutation_num, genes)
# if not patient_score:
# print patient_mutation_num
# print genes
# print [gene_to_length(gene) for gene in genes]
# patient_scores.add(patient_score)
#
# # The total score is the logarithm of the reciprocal of the sum of the reciprocals of the individual scores.
# # Think the logarithm of parallel circuits.
#
# #print sum([1.0/score for score in patient_scores])
# # [1.0/score for score in patient_scores]
# total_score = math.log(1.0 / sum([1.0/score for score in patient_scores]))
#
# return total_score
def score_cooccur_set(genes, geneToCases, patientToGenes):
if len(genes) == 3: return score_cooccur_set_three(genes, geneToCases, patientToGenes)
# Find all the patients that have all the genes cooccurring
patients = set.intersection(*[set(geneToCases[gene]) for gene in genes])
gene0, gene1 = tuple(genes)
l0, l1 = gene_to_length(gene0), gene_to_length(gene1)
# print patients
# print [len(patientToGenes[patient]) for patient in patients]
patient_scores = set()
# For each patient,
for patient in patients:
# Get the number of mutations.
patient_mutation_num = len(patientToGenes[patient])
# Calculate a proxy for the probability of cooccurrence: Product of the k gene lengths * nPk
patient_score = patient_mutation_num * (patient_mutation_num - 1) * l0 * l1
patient_scores.add(patient_score)
# The total score is the logarithm of the reciprocal of the sum of the reciprocals of the individual scores.
# Think the logarithm of parallel circuits.
total_score = math.log(1.0 / sum([1.0/score for score in patient_scores]))
return total_score
def score_cooccur_set_three(genes, geneToCases, patientToGenes):
# Find all the patients that have all the genes cooccurring
patients = set.intersection(*[set(geneToCases[gene]) for gene in genes])
gene0, gene1, gene2 = tuple(genes)
l0, l1, l2 = gene_to_length(gene0), gene_to_length(gene1), gene_to_length(gene2)
# print patients
# print [len(patientToGenes[patient]) for patient in patients]
patient_scores = set()
# For each patient,
for patient in patients:
# Get the number of mutations.
patient_mutation_num = len(patientToGenes[patient])
# Calculate a proxy for the probability of cooccurrence: Product of the k gene lengths * nPk
patient_score = patient_mutation_num * (patient_mutation_num - 1) * (patient_mutation_num - 2) * l0 * l1 * l2
patient_scores.add(patient_score)
# The total score is the logarithm of the reciprocal of the sum of the reciprocals of the individual scores.
# Think the logarithm of parallel circuits.
total_score = math.log(1.0 / sum([1.0/score for score in patient_scores]))
return total_score
def patient_cooccur_score(patient_mutation_num, genes):
# try:
# geneToLength = patient_cooccur_score.geneToLength
# except AttributeError:
# patient_cooccur_score.geneToLength = load_gene_lengths(filename)
# geneToLength = patient_cooccur_score.geneToLength
lengths = [gene_to_length(gene) for gene in genes]
k = len(genes)
# Calculate a proxy for the probability of cooccurrence: Product of the k gene lengths * nPk
return prod(lengths) * comb(patient_mutation_num, k) * factorial(k)
# Average gene length is taken from: wikipedia of coding sequence length
# https://en.wikipedia.org/wiki/Human_genome#Coding_sequences_.28protein-coding_genes.29
# 20200 proteins,
# "Over the whole genome, the median size of an exon is 122 bp (mean = 145 bp),
# the median number of exons is 7 (mean = 8.8), and the median coding sequence encodes 367 amino acids (mean = 447 amino acids)
# Use median = 7 * 122 = 854
def gene_in_file(gene, filename='geneToLength_all_firstseen.txt'):
try:
geneToLength = gene_to_length.geneToLength
except AttributeError:
gene_to_length.geneToLength = load_gene_lengths(filename)
geneToLength = gene_to_length.geneToLength
return (gene in geneToLength)
def gene_to_length(gene, filename='geneToLength_all_firstseen.txt',
median_gene_length=854):
try:
geneToLength = gene_to_length.geneToLength
except AttributeError:
gene_to_length.geneToLength = load_gene_lengths(filename)
geneToLength = gene_to_length.geneToLength
if gene in geneToLength:
return geneToLength[gene]
else:
#print "Length of ", gene, "not found in file. Using median length instead:", median_gene_length
return median_gene_length
# def gene_to_length_gainloss(gene, filename='/Users/jlu96/conte/jlu/geneToLength_gainloss_firstseen.txt',
# median_gene_length=854):
# try:
# geneToLength = gene_to_length.geneToLength
# except AttributeError:
# gene_to_length.geneToLength = load_gene_lengths(filename)
# geneToLength = gene_to_length.geneToLength
#
# if gene in geneToLength:
# return geneToLength[gene]
# else:
# #print "Length of ", gene, "not found in file. Using median length instead:", median_gene_length
# return median_gene_length
def prod(iterable):
return reduce(operator.mul, iterable, 1)