Skip to content

Commit abd7de9

Browse files
author
amnorman
committed
adds the updated version of the hypergeometric functions for different species and adds a difference function.
1 parent 7d739df commit abd7de9

11 files changed

+618
-75
lines changed

classes/hypergeometric_distribution_class.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,15 @@ def predict(
7676
)
7777
)
7878

79+
c = 0
80+
if G.has_edge(positive_protein, positive_protein):
81+
c = 1 #Removes extra node if there is an edge to self
82+
7983
N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
80-
pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
84+
pos_n = len(positive_pro_pro_neighbor) - c #Number of protein neighbors the protein of interest has
8185
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
82-
pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
83-
86+
pos_k = positive_go_annotated_pro_pro_neighbor_count - c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
87+
8488
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
8589
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
8690

@@ -95,11 +99,16 @@ def predict(
9599
)
96100
)
97101

98-
neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
102+
c = 0
103+
if G.has_edge(negative_protein, negative_protein):
104+
c = 1
105+
106+
neg_n = len(negative_pro_pro_neighbor) - c #Negative protein of interest neighbors
99107
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
100108

101109
negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
102110

111+
103112
# input positive and negative score to data
104113
data["protein"].append(positive_protein)
105114
data["go_term"].append(positive_go)

classes/hypergeometric_distribution_class_V2.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,15 @@ def predict(
7575
G, positive_pro_pro_neighbor, positive_go
7676
)
7777
)
78-
78+
79+
c = 1
80+
if G.has_edge(positive_protein, positive_protein):
81+
c = 0
82+
7983
N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
80-
pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
84+
pos_n = len(positive_pro_pro_neighbor) + c #Number of protein neighbors the protein of interest has (includes the protein of interest)
8185
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
82-
pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
86+
pos_k = positive_go_annotated_pro_pro_neighbor_count + c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
8387

8488
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
8589
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
@@ -95,7 +99,11 @@ def predict(
9599
)
96100
)
97101

98-
neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
102+
c = 1
103+
if G.has_edge(negative_protein, negative_protein):
104+
c = 0
105+
106+
neg_n = len(negative_pro_pro_neighbor) + c #Negative protein of interest neighbors (includes self)
99107
neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
100108

101109
negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
@@ -159,4 +167,4 @@ def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
159167
for element in nodeList:
160168
if G.has_edge(element[0], goTerm):
161169
count += 1
162-
return count
170+
return count

classes/hypergeometric_distribution_class_V3.py renamed to classes/removed_edge/overlapping_neighbors_class_no_pi.py

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,11 @@
44
from colorama import init as colorama_init
55
from colorama import Fore, Back, Style
66
from pathlib import Path
7-
import math
87
from tools.helper import print_progress, normalize, import_graph_from_pickle
98
from tools.workflow import get_datasets
109

1110

12-
class HypergeometricDistributionV3(BaseAlgorithm):
11+
class OverlappingNeighborsNoPI(BaseAlgorithm):
1312
def __init__(self):
1413
self.y_score = []
1514
self.y_true = []
@@ -33,16 +32,15 @@ def predict(
3332
output_path,
3433
):
3534
"""
36-
Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
37-
interest and a GO term. Only uses proteins inside the sub-network (comprised of proteins linked with the protein
38-
of interest and/or the GO term). Does not include the protein of interest.
35+
evaluate overlapping neighbors method on a protein protein interaction network with go term annotation.
3936
"""
4037
colorama_init()
4138

4239
# have two sets of positive and negative protein-go_term pairs
4340
# for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term.
4441
# 50% of the data are proteins that are annotated to a GO term
4542
# 50% of the data are proteins that are not annotated to a GO term
43+
# score equation (1 + number of ProProNeighbor that are annotated to the go term) / (number of ProProNeighbor + number of GoNeighbor)
4644

4745
data = {
4846
"protein": [],
@@ -65,7 +63,7 @@ def predict(
6563
negative_dataset["protein"],
6664
negative_dataset["go"],
6765
):
68-
66+
G.remove_edge(positive_protein, positive_go)
6967
# calculate the score for the positive set
7068
positive_pro_pro_neighbor = get_neighbors(
7169
G, positive_protein, "protein_protein"
@@ -76,14 +74,9 @@ def predict(
7674
G, positive_pro_pro_neighbor, positive_go
7775
)
7876
)
79-
80-
pos_N = len(positive_pro_pro_neighbor) + len(positive_go_neighbor) -positive_go_annotated_pro_pro_neighbor_count - 1 #Sample size is only the neighbors of the protein & GO term of interest
81-
pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
82-
K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include the protein of interest
83-
pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO term and the protein of interest's neighbor proteins
84-
print("pos_N: ", pos_N, "pos_n: ", pos_n, "K: ", K, "pos_k: ", pos_k)
85-
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
86-
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(pos_N-K,pos_n-pos_k))/math.comb(pos_N,pos_n))
77+
positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
78+
len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
79+
)
8780

8881
# calculate the score for the negative set
8982
negative_pro_pro_neighbor = get_neighbors(
@@ -95,12 +88,9 @@ def predict(
9588
G, negative_pro_pro_neighbor, negative_go
9689
)
9790
)
98-
99-
neg_N = len(negative_pro_pro_neighbor) + len(negative_go_neighbor) - negative_go_annotated_protein_neighbor_count
100-
neg_n = len(negative_pro_pro_neighbor)
101-
neg_k = negative_go_annotated_protein_neighbor_count
102-
103-
negative_score = 1 - ((math.comb(K,neg_k)*math.comb(neg_N-K,neg_n-neg_k))/math.comb(neg_N,neg_n))
91+
negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
92+
len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
93+
)
10494

10595
# input positive and negative score to data
10696
data["protein"].append(positive_protein)
@@ -124,6 +114,7 @@ def predict(
124114
data["true_label"].append(0)
125115

126116
print_progress(i, len(positive_dataset["protein"]))
117+
G.add_edge(positive_protein, positive_go, type="protein_go_term")
127118
i += 1
128119

129120
normalized_data = normalize(data["score"])
@@ -134,7 +125,7 @@ def predict(
134125
df = df.sort_values(by="norm_score", ascending=False)
135126

136127
df.to_csv(
137-
Path(output_path, "hypergeometric_distribution_v3.csv"),
128+
Path(output_path, "overlapping_neighbor_data.csv"),
138129
index=False,
139130
sep="\t",
140131
)

classes/hypergeometric_distribution_class_V4.py renamed to classes/removed_edge/overlapping_neighbors_v2_class_no_pi.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,22 @@
11
from classes.base_algorithm_class import BaseAlgorithm
22
import networkx as nx
33
import pandas as pd
4-
from colorama import init as colorama_init
5-
from colorama import Fore, Back, Style
4+
from tools.helper import normalize, print_progress, import_graph_from_pickle
65
from pathlib import Path
7-
import math
8-
from tools.helper import print_progress, normalize, import_graph_from_pickle
96
from tools.workflow import get_datasets
107

118

12-
class HypergeometricDistributionV4(BaseAlgorithm):
9+
class OverlappingNeighborsV2NoPI(BaseAlgorithm):
1310
def __init__(self):
1411
self.y_score = []
1512
self.y_true = []
1613

1714
def get_y_score(self):
1815
return self.y_score
19-
16+
2017
def get_y_true(self):
2118
return self.y_true
22-
19+
2320
def set_y_score(self, y_score):
2421
self.y_score = y_score
2522

@@ -33,16 +30,14 @@ def predict(
3330
output_path,
3431
):
3532
"""
36-
Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
37-
interest and a GO term. Only uses proteins inside the sub-network (comprised of proteins linked with the protein
38-
of interest and/or the GO term). Accounts for protein of interest.
33+
evaluate overlapping neighbors method on a protein protein interaction network with go term annotation.
3934
"""
40-
colorama_init()
4135

4236
# have two sets of positive and negative protein-go_term pairs
4337
# for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term.
4438
# 50% of the data are proteins that are annotated to a GO term
4539
# 50% of the data are proteins that are not annotated to a GO term
40+
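# score equation: (number of ProProNeighbor that are annotated to the go term) + (1 + number of ProProNeighbor * number of ProProNeighbor that are annotated to the go term) / (number of GoNeighbor / 2); the score is 0 when the go term has no GoNeighbor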
4641

4742
data = {
4843
"protein": [],
@@ -56,16 +51,16 @@ def predict(
5651
}
5752

5853
positive_dataset, negative_dataset = get_datasets(input_directory_path)
59-
G = import_graph_from_pickle(graph_file_path)
6054

55+
G = import_graph_from_pickle(graph_file_path)
6156
i = 1
6257
for positive_protein, positive_go, negative_protein, negative_go in zip(
6358
positive_dataset["protein"],
6459
positive_dataset["go"],
6560
negative_dataset["protein"],
6661
negative_dataset["go"],
6762
):
68-
63+
G.remove_edge(positive_protein, positive_go)
6964
# calculate the score for the positive set
7065
positive_pro_pro_neighbor = get_neighbors(
7166
G, positive_protein, "protein_protein"
@@ -76,32 +71,35 @@ def predict(
7671
G, positive_pro_pro_neighbor, positive_go
7772
)
7873
)
79-
80-
#Protein of interest neighbors + go term of protein neighbors - overlap
81-
pos_N = len(positive_pro_pro_neighbor) + len(positive_go_neighbor) - positive_go_annotated_pro_pro_neighbor_count #Sample size is only the neighbors of the protein & GO term of interest
82-
pos_n = len(positive_pro_pro_neighbor)+1 #Number of protein neighbors the protein of interest has (includes self)
83-
K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
84-
pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO and protein neighbor proteins (includes self)
85-
86-
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
87-
positive_score = 1 - ((math.comb(K,pos_k)*math.comb(pos_N-K,pos_n-pos_k))/math.comb(pos_N,pos_n))
74+
75+
#Maybe not the best assumption, but if the GO term has no neighbors, we treat the protein as having no chance of being annotated to it (score 0); this doesn't seem to change the ROC or PR AUC much
76+
if len(positive_go_neighbor) == 0:
77+
positive_score = 0
78+
else:
79+
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
80+
1
81+
+ len(positive_pro_pro_neighbor)
82+
* positive_go_annotated_pro_pro_neighbor_count
83+
) / (len(positive_go_neighbor) / 2)
8884

8985
# calculate the score for the negative set
9086
negative_pro_pro_neighbor = get_neighbors(
9187
G, negative_protein, "protein_protein"
9288
)
9389
negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
94-
negative_go_annotated_protein_neighbor_count = (
90+
negative_go_annotated_pro_pro_neighbor_count = (
9591
get_go_annotated_pro_pro_neighbor_count(
9692
G, negative_pro_pro_neighbor, negative_go
9793
)
9894
)
99-
100-
neg_N = len(negative_pro_pro_neighbor) + len(negative_go_neighbor) - negative_go_annotated_protein_neighbor_count + 1 #Self is not accounted for by GO term since there is no connection
101-
neg_n = len(negative_pro_pro_neighbor) + 1 #Include self
102-
neg_k = negative_go_annotated_protein_neighbor_count
103-
104-
negative_score = 1 - ((math.comb(K,neg_k)*math.comb(neg_N-K,neg_n-neg_k))/math.comb(neg_N,neg_n))
95+
if len(negative_go_neighbor) == 0:
96+
negative_score = 0
97+
else:
98+
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
99+
1
100+
+ len(negative_pro_pro_neighbor)
101+
* negative_go_annotated_pro_pro_neighbor_count
102+
) / (len(negative_go_neighbor) / 2)
105103

106104
# input positive and negative score to data
107105
data["protein"].append(positive_protein)
@@ -119,12 +117,13 @@ def predict(
119117
data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
120118
data["go_neighbor"].append(len(negative_go_neighbor))
121119
data["go_annotated_pro_pro_neighbors"].append(
122-
negative_go_annotated_protein_neighbor_count
120+
negative_go_annotated_pro_pro_neighbor_count
123121
)
124122
data["score"].append(negative_score)
125123
data["true_label"].append(0)
126124

127125
print_progress(i, len(positive_dataset["protein"]))
126+
G.add_edge(positive_protein, positive_go, type="protein_go_term")
128127
i += 1
129128

130129
normalized_data = normalize(data["score"])
@@ -135,7 +134,7 @@ def predict(
135134
df = df.sort_values(by="norm_score", ascending=False)
136135

137136
df.to_csv(
138-
Path(output_path, "hypergeometric_distribution_v4_data.csv"),
137+
Path(output_path, "overlapping_neighbor_v2_data.csv"),
139138
index=False,
140139
sep="\t",
141140
)

0 commit comments

Comments
 (0)