Skip to content

Commit 7dbcd90

Browse files
author
amnorman
committed
added pytests for zfish and bsub with
self edges, also changed overlapping neighbors and protein degree algorithms to take self edges into account
1 parent abd7de9 commit 7dbcd90

13 files changed

+224
-42
lines changed

classes/overlapping_neighbors_class.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,30 +55,37 @@ def predict(
5555

5656
positive_dataset, negative_dataset = get_datasets(input_directory_path)
5757
G = import_graph_from_pickle(graph_file_path)
58-
5958
i = 1
6059
for positive_protein, positive_go, negative_protein, negative_go in zip(
6160
positive_dataset["protein"],
6261
positive_dataset["go"],
6362
negative_dataset["protein"],
6463
negative_dataset["go"],
6564
):
66-
65+
c = 0
66+
if G.has_edge(positive_protein, positive_protein):
67+
c = 1
6768
# calculate the score for the positive set
6869
positive_pro_pro_neighbor = get_neighbors(
6970
G, positive_protein, "protein_protein"
7071
)
72+
73+
# print("\nPositive protein neighbors: " + str(positive_pro_pro_neighbor))
7174
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
7275
positive_go_annotated_pro_pro_neighbor_count = (
7376
get_go_annotated_pro_pro_neighbor_count(
7477
G, positive_pro_pro_neighbor, positive_go
7578
)
76-
)
79+
) - c
80+
7781
positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
78-
len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
82+
len(positive_pro_pro_neighbor) -c + len(positive_go_neighbor)
7983
)
8084

8185
# calculate the score for the negative set
86+
c = 0
87+
if G.has_edge(negative_protein, negative_protein):
88+
c = 1
8289
negative_pro_pro_neighbor = get_neighbors(
8390
G, negative_protein, "protein_protein"
8491
)
@@ -89,7 +96,7 @@ def predict(
8996
)
9097
)
9198
negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
92-
len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
99+
len(negative_pro_pro_neighbor) - c + len(negative_go_neighbor)
93100
)
94101

95102
# input positive and negative score to data
@@ -141,7 +148,7 @@ def get_neighbors(G: nx.Graph, node, edgeType):
141148
for edge in res:
142149
if edge[2]["type"] == edgeType:
143150
neighborNode = [edge[1], edge[2]]
144-
neighbors.append(neighborNode)
151+
neighbors.append(neighborNode)
145152

146153
return neighbors
147154

classes/overlapping_neighbors_v2_class.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,9 @@ def predict(
6060
negative_dataset["protein"],
6161
negative_dataset["go"],
6262
):
63-
63+
c = 0
64+
if G.has_edge(positive_protein, positive_protein):
65+
c = 1
6466
# calculate the score for the positive set
6567
positive_pro_pro_neighbor = get_neighbors(
6668
G, positive_protein, "protein_protein"
@@ -70,13 +72,16 @@ def predict(
7072
get_go_annotated_pro_pro_neighbor_count(
7173
G, positive_pro_pro_neighbor, positive_go
7274
)
73-
)
75+
) - c
7476
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
7577
1
76-
+ len(positive_pro_pro_neighbor)
78+
+ (len(positive_pro_pro_neighbor) - c)
7779
* positive_go_annotated_pro_pro_neighbor_count
7880
) / (len(positive_go_neighbor) / 2)
7981

82+
c = 0
83+
if G.has_edge(negative_protein, negative_protein):
84+
c = 1
8085
# calculate the score for the negative set
8186
negative_pro_pro_neighbor = get_neighbors(
8287
G, negative_protein, "protein_protein"
@@ -89,7 +94,7 @@ def predict(
8994
)
9095
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
9196
1
92-
+ len(negative_pro_pro_neighbor)
97+
+ (len(negative_pro_pro_neighbor) - c)
9398
* negative_go_annotated_pro_pro_neighbor_count
9499
) / (len(negative_go_neighbor) / 2)
95100

classes/overlapping_neighbors_v3_class.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ def predict(
6060
negative_dataset["protein"],
6161
negative_dataset["go"],
6262
):
63+
c = 0
64+
if G.has_edge(positive_protein, positive_protein):
65+
c = 1
6366
# calculate the score for the positive set
6467
positive_pro_pro_neighbor = get_neighbors(
6568
G, positive_protein, "protein_protein"
@@ -69,7 +72,7 @@ def predict(
6972
get_go_annotated_pro_pro_neighbor_count(
7073
G, positive_pro_pro_neighbor, positive_go
7174
)
72-
)
75+
) - c
7376
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
7477
1 + positive_go_annotated_pro_pro_neighbor_count
7578
) / (len(positive_go_neighbor))
@@ -83,7 +86,7 @@ def predict(
8386
get_go_annotated_pro_pro_neighbor_count(
8487
G, negative_pro_pro_neighbor, negative_go
8588
)
86-
)
89+
)
8790
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
8891
1 + negative_go_annotated_pro_pro_neighbor_count
8992
) / (len(negative_go_neighbor))

classes/protein_degree_class.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,20 @@ def predict(
5252
negative_dataset["go"],
5353
):
5454

55+
c = 0
56+
if G.has_edge(positive_protein, positive_protein):
57+
c = 1
5558
data["protein"].append(positive_protein)
5659
data["go_term"].append(positive_go)
57-
data["degree"].append(G.degree(positive_protein))
60+
data["degree"].append(G.degree(positive_protein) - c)
5861
data["true_label"].append(1)
5962

63+
c = 0
64+
if G.has_edge(negative_protein, negative_protein):
65+
c = 1
6066
data["protein"].append(negative_protein)
6167
data["go_term"].append(negative_go)
62-
data["degree"].append(G.degree(negative_protein))
68+
data["degree"].append(G.degree(negative_protein) - c)
6369
data["true_label"].append(0)
6470
print_progress(i, len(positive_dataset["protein"]))
6571
i += 1

classes/protein_degree_v2_class.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,23 @@ def predict(
5454
negative_dataset["go"],
5555
):
5656

57+
c = 0
58+
if G.has_edge(positive_protein, positive_protein):
59+
c = 1
5760
data["protein"].append(positive_protein)
5861
data["go_term"].append(positive_go)
5962
data["degree"].append(
60-
len(get_neighbors(G, positive_protein, "protein_protein"))
63+
len(get_neighbors(G, positive_protein, "protein_protein")) - c
6164
)
6265
data["true_label"].append(1)
6366

67+
c = 0
68+
if G.has_edge(negative_protein, negative_protein):
69+
c = 1
6470
data["protein"].append(negative_protein)
6571
data["go_term"].append(negative_go)
6672
data["degree"].append(
67-
len(get_neighbors(G, negative_protein, "protein_protein"))
73+
len(get_neighbors(G, negative_protein, "protein_protein")) - c
6874
)
6975
data["true_label"].append(0)
7076
print_progress(i, len(positive_dataset["protein"]))

main.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -61,37 +61,37 @@ def main():
6161
go_protein_pairs = read_specific_columns(
6262
fly_go_association_path, go_inferred_columns, ","
6363
)
64-
64+
6565
protein_list = []
6666

6767
# if there is no graph.pickle file in the output/dataset directory, uncomment the following lines
68-
G, protein_list = create_ppi_network(interactome, go_protein_pairs)
69-
export_graph_to_pickle(G, graph_file_path)
68+
# G, protein_list = create_ppi_network(interactome, go_protein_pairs)
69+
# export_graph_to_pickle(G, testing_graph_file_path)
7070

7171
# if there is no sample dataset, uncomment the following lines. otherwise, the dataset in outputs will be used
72-
positive_dataset, negative_dataset = sample_data(
73-
go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
74-
)
72+
# positive_dataset, negative_dataset = sample_data(
73+
# go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
74+
# )
7575

7676
# Define algorithm classes and their names
7777
algorithm_classes = {
78-
# "OverlappingNeighbors": OverlappingNeighbors,
79-
# "OverlappingNeighborsV2": OverlappingNeighborsV2,
80-
# "OverlappingNeighborsV3": OverlappingNeighborsV3,
81-
# "ProteinDegree": ProteinDegree,
82-
# "ProteinDegreeV2": ProteinDegreeV2,
83-
# "ProteinDegreeV3": ProteinDegreeV3,
84-
# "SampleAlgorithm": SampleAlgorithm,
85-
# "HypergeometricDistribution": HypergeometricDistribution,
78+
"OverlappingNeighbors": OverlappingNeighbors,
79+
"OverlappingNeighborsV2": OverlappingNeighborsV2,
80+
"OverlappingNeighborsV3": OverlappingNeighborsV3,
81+
"ProteinDegree": ProteinDegree,
82+
"ProteinDegreeV2": ProteinDegreeV2,
83+
"ProteinDegreeV3": ProteinDegreeV3,
84+
"SampleAlgorithm": SampleAlgorithm,
85+
"HypergeometricDistribution": HypergeometricDistribution,
8686
"HypergeometricDistributionV2": HypergeometricDistributionV2,
8787
}
8888

8989
results = run_workflow(
9090
algorithm_classes,
91-
dataset_directory_path,
92-
graph_file_path,
93-
output_data_path,
94-
output_image_path,
91+
testing_input_directory_path,
92+
testing_graph_file_path,
93+
testing_output_data_path,
94+
testing_output_image_path,
9595
True,
9696
True,
9797
)

tests/test_pytest.py

Lines changed: 119 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
from classes.protein_degree_v3_class import ProteinDegreeV3
99
from classes.sample_algorithm import SampleAlgorithm
1010
from classes.base_algorithm_class import BaseAlgorithm
11+
from classes.hypergeometric_distribution_class import HypergeometricDistribution
12+
from classes.hypergeometric_distribution_class_V2 import HypergeometricDistributionV2
13+
1114
from pathlib import Path
1215
from tools.helper import (
1316
read_specific_columns,
@@ -70,6 +73,8 @@ def test_algorithm_workflow():
7073
"ProteinDegree": ProteinDegree,
7174
"ProteinDegreeV2": ProteinDegreeV2,
7275
"ProteinDegreeV3": ProteinDegreeV3,
76+
"HypergeometricDistribution": HypergeometricDistribution,
77+
"HypergeometricDistributionV2": HypergeometricDistributionV2
7378
}
7479

7580
results = run_workflow(
@@ -88,10 +93,8 @@ def test_algorithm_workflow():
8893
"ProteinDegree": 0.825,
8994
"ProteinDegreeV2": 0.675,
9095
"ProteinDegreeV3": 0.89,
91-
"HypergeometricDistribution": 0.78,
92-
"HypergeometricDistributionV2": 0.89,
93-
"HypergeometricDistributionV3": 0.675,
94-
"HypergeometricDistributionV4": 0.6
96+
"HypergeometricDistribution": 0.76,
97+
"HypergeometricDistributionV2": 0.86,
9598
}
9699

97100
pr_results = {
@@ -102,14 +105,122 @@ def test_algorithm_workflow():
102105
"ProteinDegreeV2": 0.6367757242757243,
103106
"OverlappingNeighbors": 0.5329058916229968,
104107
"SampleAlgorithm": 0.4093791854859966,
105-
"HypergeometricDistribution": 0.7899246806,
106-
"HypergeometricDistributionV2": 0.8519169719,
107-
"HypergeometricDistributionV3": 0.7142573629,
108-
"HypergeometricDistributionV4": 0.6967847007,
108+
"HypergeometricDistribution": 0.7899246805825753,
109+
"HypergeometricDistributionV2": 0.8519169719169718,
110+
}
111+
112+
for algorithm, metrics in results.items():
113+
assert metrics["roc_auc"] == roc_results[algorithm]
114+
115+
for algorithm, metrics in results.items():
116+
assert metrics["pr_auc"] == pr_results[algorithm]
117+
118+
119+
def test_self_edge_case(): #Redundant but mostly for the sake of separation, I can add it to the above section
120+
if not os.path.exists("output"):
121+
os.makedirs("output")
122+
if not os.path.exists("output/dataset"):
123+
os.makedirs("output/dataset")
124+
if not os.path.exists("output/data"):
125+
os.makedirs("output/data")
126+
if not os.path.exists("output/images"):
127+
os.makedirs("output/images")
128+
129+
output_data_path = Path("./output/data/")
130+
output_image_path = Path("./output/images/")
131+
input_directory_path = Path("./tests/testing-dataset/zfish")
132+
graph_file_path = Path(input_directory_path, "graph.pickle")
133+
134+
algorithm_classes = {
135+
"OverlappingNeighbors": OverlappingNeighbors,
136+
"OverlappingNeighborsV2": OverlappingNeighborsV2,
137+
"OverlappingNeighborsV3": OverlappingNeighborsV3,
138+
"ProteinDegree": ProteinDegree,
139+
"ProteinDegreeV2": ProteinDegreeV2,
140+
"ProteinDegreeV3": ProteinDegreeV3,
141+
"HypergeometricDistribution": HypergeometricDistribution,
142+
"HypergeometricDistributionV2": HypergeometricDistributionV2
143+
}
144+
145+
#For zfish
146+
results = run_workflow(
147+
algorithm_classes,
148+
input_directory_path,
149+
graph_file_path,
150+
output_data_path,
151+
output_image_path,
152+
False,
153+
False,
154+
)
155+
roc_results = {
156+
"OverlappingNeighbors": 0.715,
157+
"OverlappingNeighborsV2": 0.8,
158+
"OverlappingNeighborsV3": 0.7899999999999999,
159+
"ProteinDegree": 0.9650000000000001,
160+
"ProteinDegreeV2": 0.775,
161+
"ProteinDegreeV3": 0.9750000000000001,
162+
"HypergeometricDistribution": 0.5449999999999999,
163+
"HypergeometricDistributionV2": 0.8300000000000001,
164+
}
165+
166+
pr_results = {
167+
"ProteinDegreeV3": 0.9754545454545455,
168+
"ProteinDegree": 0.9675757575757575,
169+
"OverlappingNeighborsV3": 0.8179265873015872,
170+
"OverlappingNeighborsV2": 0.8292361111111111,
171+
"ProteinDegreeV2": 0.7573318322544329,
172+
"OverlappingNeighbors": 0.5794961247902424,
173+
"SampleAlgorithm": 0.43900023737872035,
174+
"HypergeometricDistribution": 0.5095882374849092,
175+
"HypergeometricDistributionV2": 0.674983904983905,
176+
}
177+
178+
for algorithm, metrics in results.items():
179+
assert metrics["roc_auc"] == roc_results[algorithm]
180+
181+
for algorithm, metrics in results.items():
182+
assert metrics["pr_auc"] == pr_results[algorithm]
183+
184+
185+
#For Bsub
186+
input_directory_path = Path("./tests/testing-dataset/bsub")
187+
graph_file_path = Path(input_directory_path, "graph.pickle")
188+
189+
results = run_workflow(
190+
algorithm_classes,
191+
input_directory_path,
192+
graph_file_path,
193+
output_data_path,
194+
output_image_path,
195+
False,
196+
False,
197+
)
198+
roc_results = {
199+
"OverlappingNeighbors": 0.575,
200+
"OverlappingNeighborsV2": 0.6399999999999999,
201+
"OverlappingNeighborsV3": 0.6399999999999999,
202+
"ProteinDegree": 0.7050000000000001,
203+
"ProteinDegreeV2": 0.54,
204+
"ProteinDegreeV3": 0.71,
205+
"HypergeometricDistribution": 0.51,
206+
"HypergeometricDistributionV2": 0.8499999999999999,
207+
}
208+
209+
pr_results = {
210+
"ProteinDegreeV3": 0.6918311998459057,
211+
"ProteinDegree": 0.6560890253537313,
212+
"OverlappingNeighborsV3": 0.5933333333333334,
213+
"OverlappingNeighborsV2": 0.5933333333333334,
214+
"ProteinDegreeV2": 0.588080808080808,
215+
"OverlappingNeighbors": 0.5224841799067805,
216+
"SampleAlgorithm": 0.5922520550055379,
217+
"HypergeometricDistribution": 0.5001244588744589,
218+
"HypergeometricDistributionV2": 0.7131783494283495,
109219
}
110220

111221
for algorithm, metrics in results.items():
112222
assert metrics["roc_auc"] == roc_results[algorithm]
113223

114224
for algorithm, metrics in results.items():
115225
assert metrics["pr_auc"] == pr_results[algorithm]
226+
2.06 MB
Binary file not shown.

0 commit comments

Comments
 (0)