You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: classes/hypergeometric_distribution_class.py
+13-4Lines changed: 13 additions & 4 deletions
Original file line number
Diff line number
Diff line change
@@ -76,11 +76,15 @@ def predict(
76
76
)
77
77
)
78
78
79
+
c=0
80
+
ifG.has_edge(positive_protein, positive_protein):
81
+
c=1#Removes extra node if there is an edge to self
82
+
79
83
N=len([xforx,yinG.nodes(data=True) ify['type']=="protein"]) #Total number of protein nodes in the entire graph
80
-
pos_n=len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
84
+
pos_n=len(positive_pro_pro_neighbor) -c#Number of protein neighbors the protein of interest has
81
85
K=len(positive_go_neighbor) -1#Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
82
-
pos_k=positive_go_annotated_pro_pro_neighbor_count#The overlap between the GO protein neighbors and protein neighbors of the protein of interest
83
-
86
+
pos_k=positive_go_annotated_pro_pro_neighbor_count-c#The overlap between the GO protein neighbors and protein neighbors of the protein of interest
87
+
84
88
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
Copy file name to clipboardExpand all lines: classes/hypergeometric_distribution_class_V2.py
+13-5Lines changed: 13 additions & 5 deletions
Original file line number
Diff line number
Diff line change
@@ -75,11 +75,15 @@ def predict(
75
75
G, positive_pro_pro_neighbor, positive_go
76
76
)
77
77
)
78
-
78
+
79
+
c=1
80
+
ifG.has_edge(positive_protein, positive_protein):
81
+
c=0
82
+
79
83
N=len([xforx,yinG.nodes(data=True) ify['type']=="protein"]) #Total number of protein nodes in the entire graph
80
-
pos_n=len(positive_pro_pro_neighbor) +1#Number of protein neighbors the protein of interest has (includes the protein of interest)
84
+
pos_n=len(positive_pro_pro_neighbor) +c#Number of protein neighbors the protein of interest has (includes the protein of interest)
81
85
K=len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
82
-
pos_k=positive_go_annotated_pro_pro_neighbor_count+1#The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
86
+
pos_k=positive_go_annotated_pro_pro_neighbor_count+c#The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
83
87
84
88
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of
37
-
interest and a GO term. Only uses proteins inside the sub-network (comprised of proteins linked with the protein
38
-
of interest and/or the GO term). Does not include the protein of interest.
35
+
Evaluate the overlapping-neighbors method on a protein-protein interaction network with GO term annotations.
39
36
"""
40
37
colorama_init()
41
38
42
39
# have two sets of positive and negative protein-go_term pairs
43
40
# for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term.
44
41
# 50% of the data are proteins that are annotated to a GO term
45
42
# 50% of the data are proteins that are not annotated to a GO term
43
+
# score equation: (1 + number of ProProNeighbor that are annotated to the GO term) / (number of ProProNeighbor + number of GoNeighbor)
46
44
47
45
data= {
48
46
"protein": [],
@@ -65,7 +63,7 @@ def predict(
65
63
negative_dataset["protein"],
66
64
negative_dataset["go"],
67
65
):
68
-
66
+
G.remove_edge(positive_protein, positive_go)
69
67
# calculate the score for the positive set
70
68
positive_pro_pro_neighbor=get_neighbors(
71
69
G, positive_protein, "protein_protein"
@@ -76,14 +74,9 @@ def predict(
76
74
G, positive_pro_pro_neighbor, positive_go
77
75
)
78
76
)
79
-
80
-
pos_N=len(positive_pro_pro_neighbor) +len(positive_go_neighbor) -positive_go_annotated_pro_pro_neighbor_count-1#Sample size is only the neighbors of the protein & GO term of interest
81
-
pos_n=len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
82
-
K=len(positive_go_neighbor) -1#Number of protein neighbors the GO term of interest has, same for pos & neg, does not include the protein of interest
83
-
pos_k=positive_go_annotated_pro_pro_neighbor_count#The overlap between the GO term and the protein of interest's neighbor proteins
#Protein of interest neighbors + go term of protein neighbors - overlap
81
-
pos_N=len(positive_pro_pro_neighbor) +len(positive_go_neighbor) -positive_go_annotated_pro_pro_neighbor_count#Sample size is only the neighbors of the protein & GO term of interest
82
-
pos_n=len(positive_pro_pro_neighbor)+1#Number of protein neighbors the protein of interest has (includes self)
83
-
K=len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
84
-
pos_k=positive_go_annotated_pro_pro_neighbor_count+1#The overlap between the GO and protein neighbor proteins (includes self)
85
-
86
-
#The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
#Maybe not the best assumption, but if the number of GO neighbors is 0, then there is no chance of the protein being annotated to the GO term; this doesn't seem to change the ROC or PR AUC much
neg_N=len(negative_pro_pro_neighbor) +len(negative_go_neighbor) -negative_go_annotated_protein_neighbor_count+1#Self is not accounted for by GO term since there is no connection
0 commit comments