Skip to content

Commit

Permalink
added pytests for zfish and bsub with self edges; also changed overlapping neighbors and protein degree algorithms to take self edges into account
Browse files Browse the repository at this point in the history
  • Loading branch information
amnorman committed Jun 14, 2024
1 parent abd7de9 commit 7dbcd90
Show file tree
Hide file tree
Showing 13 changed files with 224 additions and 42 deletions.
19 changes: 13 additions & 6 deletions classes/overlapping_neighbors_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,30 +55,37 @@ def predict(

positive_dataset, negative_dataset = get_datasets(input_directory_path)
G = import_graph_from_pickle(graph_file_path)

i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
)

# print("\nPositive protein neighbors: " + str(positive_pro_pro_neighbor))
positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
positive_go_annotated_pro_pro_neighbor_count = (
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
)
) - c

positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
len(positive_pro_pro_neighbor) -c + len(positive_go_neighbor)
)

# calculate the score for the negative set
c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
negative_pro_pro_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
)
Expand All @@ -89,7 +96,7 @@ def predict(
)
)
negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
len(negative_pro_pro_neighbor) - c + len(negative_go_neighbor)
)

# input positive and negative score to data
Expand Down Expand Up @@ -141,7 +148,7 @@ def get_neighbors(G: nx.Graph, node, edgeType):
for edge in res:
if edge[2]["type"] == edgeType:
neighborNode = [edge[1], edge[2]]
neighbors.append(neighborNode)
neighbors.append(neighborNode)

return neighbors

Expand Down
13 changes: 9 additions & 4 deletions classes/overlapping_neighbors_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
Expand All @@ -70,13 +72,16 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
)
) - c
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1
+ len(positive_pro_pro_neighbor)
+ (len(positive_pro_pro_neighbor) - c)
* positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor) / 2)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
# calculate the score for the negative set
negative_pro_pro_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -89,7 +94,7 @@ def predict(
)
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1
+ len(negative_pro_pro_neighbor)
+ (len(negative_pro_pro_neighbor) - c)
* negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor) / 2)

Expand Down
7 changes: 5 additions & 2 deletions classes/overlapping_neighbors_v3_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def predict(
negative_dataset["protein"],
negative_dataset["go"],
):
c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
# calculate the score for the positive set
positive_pro_pro_neighbor = get_neighbors(
G, positive_protein, "protein_protein"
Expand All @@ -69,7 +72,7 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, positive_pro_pro_neighbor, positive_go
)
)
) - c
positive_score = positive_go_annotated_pro_pro_neighbor_count + (
1 + positive_go_annotated_pro_pro_neighbor_count
) / (len(positive_go_neighbor))
Expand All @@ -83,7 +86,7 @@ def predict(
get_go_annotated_pro_pro_neighbor_count(
G, negative_pro_pro_neighbor, negative_go
)
)
)
negative_score = negative_go_annotated_pro_pro_neighbor_count + (
1 + negative_go_annotated_pro_pro_neighbor_count
) / (len(negative_go_neighbor))
Expand Down
10 changes: 8 additions & 2 deletions classes/protein_degree_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,20 @@ def predict(
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["degree"].append(G.degree(positive_protein))
data["degree"].append(G.degree(positive_protein) - c)
data["true_label"].append(1)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(G.degree(negative_protein))
data["degree"].append(G.degree(negative_protein) - c)
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
i += 1
Expand Down
10 changes: 8 additions & 2 deletions classes/protein_degree_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,23 @@ def predict(
negative_dataset["go"],
):

c = 0
if G.has_edge(positive_protein, positive_protein):
c = 1
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["degree"].append(
len(get_neighbors(G, positive_protein, "protein_protein"))
len(get_neighbors(G, positive_protein, "protein_protein")) - c
)
data["true_label"].append(1)

c = 0
if G.has_edge(negative_protein, negative_protein):
c = 1
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(
len(get_neighbors(G, negative_protein, "protein_protein"))
len(get_neighbors(G, negative_protein, "protein_protein")) - c
)
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
Expand Down
36 changes: 18 additions & 18 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,37 +61,37 @@ def main():
go_protein_pairs = read_specific_columns(
fly_go_association_path, go_inferred_columns, ","
)

protein_list = []

# if there is no graph.pickle file in the output/dataset directory, uncomment the following lines
G, protein_list = create_ppi_network(interactome, go_protein_pairs)
export_graph_to_pickle(G, graph_file_path)
# G, protein_list = create_ppi_network(interactome, go_protein_pairs)
# export_graph_to_pickle(G, testing_graph_file_path)

# if there is no sample dataset, uncomment the following lines. otherwise, the dataset in outputs will be used
positive_dataset, negative_dataset = sample_data(
go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
)
# positive_dataset, negative_dataset = sample_data(
# go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
# )

# Define algorithm classes and their names
algorithm_classes = {
# "OverlappingNeighbors": OverlappingNeighbors,
# "OverlappingNeighborsV2": OverlappingNeighborsV2,
# "OverlappingNeighborsV3": OverlappingNeighborsV3,
# "ProteinDegree": ProteinDegree,
# "ProteinDegreeV2": ProteinDegreeV2,
# "ProteinDegreeV3": ProteinDegreeV3,
# "SampleAlgorithm": SampleAlgorithm,
# "HypergeometricDistribution": HypergeometricDistribution,
"OverlappingNeighbors": OverlappingNeighbors,
"OverlappingNeighborsV2": OverlappingNeighborsV2,
"OverlappingNeighborsV3": OverlappingNeighborsV3,
"ProteinDegree": ProteinDegree,
"ProteinDegreeV2": ProteinDegreeV2,
"ProteinDegreeV3": ProteinDegreeV3,
"SampleAlgorithm": SampleAlgorithm,
"HypergeometricDistribution": HypergeometricDistribution,
"HypergeometricDistributionV2": HypergeometricDistributionV2,
}

results = run_workflow(
algorithm_classes,
dataset_directory_path,
graph_file_path,
output_data_path,
output_image_path,
testing_input_directory_path,
testing_graph_file_path,
testing_output_data_path,
testing_output_image_path,
True,
True,
)
Expand Down
127 changes: 119 additions & 8 deletions tests/test_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from classes.protein_degree_v3_class import ProteinDegreeV3
from classes.sample_algorithm import SampleAlgorithm
from classes.base_algorithm_class import BaseAlgorithm
from classes.hypergeometric_distribution_class import HypergeometricDistribution
from classes.hypergeometric_distribution_class_V2 import HypergeometricDistributionV2

from pathlib import Path
from tools.helper import (
read_specific_columns,
Expand Down Expand Up @@ -70,6 +73,8 @@ def test_algorithm_workflow():
"ProteinDegree": ProteinDegree,
"ProteinDegreeV2": ProteinDegreeV2,
"ProteinDegreeV3": ProteinDegreeV3,
"HypergeometricDistribution": HypergeometricDistribution,
"HypergeometricDistributionV2": HypergeometricDistributionV2
}

results = run_workflow(
Expand All @@ -88,10 +93,8 @@ def test_algorithm_workflow():
"ProteinDegree": 0.825,
"ProteinDegreeV2": 0.675,
"ProteinDegreeV3": 0.89,
"HypergeometricDistribution": 0.78,
"HypergeometricDistributionV2": 0.89,
"HypergeometricDistributionV3": 0.675,
"HypergeometricDistributionV4": 0.6
"HypergeometricDistribution": 0.76,
"HypergeometricDistributionV2": 0.86,
}

pr_results = {
Expand All @@ -102,14 +105,122 @@ def test_algorithm_workflow():
"ProteinDegreeV2": 0.6367757242757243,
"OverlappingNeighbors": 0.5329058916229968,
"SampleAlgorithm": 0.4093791854859966,
"HypergeometricDistribution": 0.7899246806,
"HypergeometricDistributionV2": 0.8519169719,
"HypergeometricDistributionV3": 0.7142573629,
"HypergeometricDistributionV4": 0.6967847007,
"HypergeometricDistribution": 0.7899246805825753,
"HypergeometricDistributionV2": 0.8519169719169718,
}

for algorithm, metrics in results.items():
assert metrics["roc_auc"] == roc_results[algorithm]

for algorithm, metrics in results.items():
assert metrics["pr_auc"] == pr_results[algorithm]


def test_self_edge_case():
    """Check ROC/PR AUC values on the zfish and bsub testing datasets.

    Runs every algorithm in ``algorithm_classes`` through ``run_workflow``
    once per dataset directory and compares the returned ``roc_auc`` and
    ``pr_auc`` metrics with recorded reference values.

    This overlaps with the workflow test above; it is kept separate for the
    sake of separation and could be merged into it later.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import math

    # exist_ok=True replaces the check-then-create pairs: it is shorter and
    # cannot fail if the directory appears between the check and the create.
    for directory in ("output", "output/dataset", "output/data", "output/images"):
        os.makedirs(directory, exist_ok=True)

    output_data_path = Path("./output/data/")
    output_image_path = Path("./output/images/")

    algorithm_classes = {
        "OverlappingNeighbors": OverlappingNeighbors,
        "OverlappingNeighborsV2": OverlappingNeighborsV2,
        "OverlappingNeighborsV3": OverlappingNeighborsV3,
        "ProteinDegree": ProteinDegree,
        "ProteinDegreeV2": ProteinDegreeV2,
        "ProteinDegreeV3": ProteinDegreeV3,
        "HypergeometricDistribution": HypergeometricDistribution,
        "HypergeometricDistributionV2": HypergeometricDistributionV2,
    }

    # Reference metrics keyed by the dataset sub-directory name.
    # NOTE(review): "SampleAlgorithm" appears in the pr_auc tables but is not
    # in algorithm_classes, so it is presumably never looked up; kept for
    # parity with the recorded values - confirm and drop if unused.
    expected_by_dataset = {
        "zfish": {
            "roc_auc": {
                "OverlappingNeighbors": 0.715,
                "OverlappingNeighborsV2": 0.8,
                "OverlappingNeighborsV3": 0.7899999999999999,
                "ProteinDegree": 0.9650000000000001,
                "ProteinDegreeV2": 0.775,
                "ProteinDegreeV3": 0.9750000000000001,
                "HypergeometricDistribution": 0.5449999999999999,
                "HypergeometricDistributionV2": 0.8300000000000001,
            },
            "pr_auc": {
                "ProteinDegreeV3": 0.9754545454545455,
                "ProteinDegree": 0.9675757575757575,
                "OverlappingNeighborsV3": 0.8179265873015872,
                "OverlappingNeighborsV2": 0.8292361111111111,
                "ProteinDegreeV2": 0.7573318322544329,
                "OverlappingNeighbors": 0.5794961247902424,
                "SampleAlgorithm": 0.43900023737872035,
                "HypergeometricDistribution": 0.5095882374849092,
                "HypergeometricDistributionV2": 0.674983904983905,
            },
        },
        "bsub": {
            "roc_auc": {
                "OverlappingNeighbors": 0.575,
                "OverlappingNeighborsV2": 0.6399999999999999,
                "OverlappingNeighborsV3": 0.6399999999999999,
                "ProteinDegree": 0.7050000000000001,
                "ProteinDegreeV2": 0.54,
                "ProteinDegreeV3": 0.71,
                "HypergeometricDistribution": 0.51,
                "HypergeometricDistributionV2": 0.8499999999999999,
            },
            "pr_auc": {
                "ProteinDegreeV3": 0.6918311998459057,
                "ProteinDegree": 0.6560890253537313,
                "OverlappingNeighborsV3": 0.5933333333333334,
                "OverlappingNeighborsV2": 0.5933333333333334,
                "ProteinDegreeV2": 0.588080808080808,
                "OverlappingNeighbors": 0.5224841799067805,
                "SampleAlgorithm": 0.5922520550055379,
                "HypergeometricDistribution": 0.5001244588744589,
                "HypergeometricDistributionV2": 0.7131783494283495,
            },
        },
    }

    for dataset_name, expected_metrics in expected_by_dataset.items():
        input_directory_path = Path("./tests/testing-dataset", dataset_name)
        graph_file_path = Path(input_directory_path, "graph.pickle")

        # The two trailing positional flags are passed as False, exactly as
        # before; their meaning is defined by run_workflow (not visible here).
        results = run_workflow(
            algorithm_classes,
            input_directory_path,
            graph_file_path,
            output_data_path,
            output_image_path,
            False,
            False,
        )

        # All ROC values are checked before any PR value, as in the original.
        for metric_name in ("roc_auc", "pr_auc"):
            for algorithm, metrics in results.items():
                actual = metrics[metric_name]
                expected = expected_metrics[metric_name][algorithm]
                # Tolerant comparison: exact float equality breaks on
                # last-bit differences between platforms/library versions.
                assert math.isclose(actual, expected, rel_tol=1e-9), (
                    f"{dataset_name}/{algorithm} {metric_name}: "
                    f"expected {expected!r}, got {actual!r}"
                )

Binary file added tests/testing-dataset/bsub/graph.pickle
Binary file not shown.
Loading

0 comments on commit 7dbcd90

Please sign in to comment.