Reed-CompBio
diff --git a/‎classes/overlapping_neighbors_class.py
Lines changed: 13 additions & 6 deletions b/‎classes/overlapping_neighbors_class.py
Lines changed: 13 additions & 6 deletions
diff --git a/‎classes/overlapping_neighbors_v2_class.py
Lines changed: 9 additions & 4 deletions b/‎classes/overlapping_neighbors_v2_class.py
Lines changed: 9 additions & 4 deletions
diff --git a/‎classes/overlapping_neighbors_v3_class.py
Lines changed: 5 additions & 2 deletions b/‎classes/overlapping_neighbors_v3_class.py
Lines changed: 5 additions & 2 deletions
diff --git a/‎classes/protein_degree_class.py
Lines changed: 8 additions & 2 deletions b/‎classes/protein_degree_class.py
Lines changed: 8 additions & 2 deletions
diff --git a/‎classes/protein_degree_v2_class.py
Lines changed: 8 additions & 2 deletions b/‎classes/protein_degree_v2_class.py
Lines changed: 8 additions & 2 deletions
diff --git a/‎main.py
Lines changed: 18 additions & 18 deletions b/‎main.py
Lines changed: 18 additions & 18 deletions
diff --git a/‎tests/test_pytest.py
Lines changed: 119 additions & 8 deletions b/‎tests/test_pytest.py
Lines changed: 119 additions & 8 deletions
diff --git a/‎tests/testing-dataset/bsub/graph.pickle
2.06 MB b/‎tests/testing-dataset/bsub/graph.pickle
2.06 MB
@@ -55,30 +55,37 @@ def predict(
 
         positive_dataset, negative_dataset = get_datasets(input_directory_path)
         G = import_graph_from_pickle(graph_file_path)
-
         i = 1
         for positive_protein, positive_go, negative_protein, negative_go in zip(
             positive_dataset["protein"],
             positive_dataset["go"],
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
             )
+            
+            # print("\nPositive protein neighbors: " + str(positive_pro_pro_neighbor))
             positive_go_neighbor = get_neighbors(G, positive_go, "protein_go_term")
             positive_go_annotated_pro_pro_neighbor_count = (
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            )
+            ) - c
+        
             positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
-                len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
+                len(positive_pro_pro_neighbor) -c + len(positive_go_neighbor)
             )
 
             # calculate the score for the negative set
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
@@ -89,7 +96,7 @@ def predict(
                 )
             )
             negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
-                len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
+                len(negative_pro_pro_neighbor) - c + len(negative_go_neighbor)
             )
 
             # input positive and negative score to data
@@ -141,7 +148,7 @@ def get_neighbors(G: nx.Graph, node, edgeType):
     for edge in res:
         if edge[2]["type"] == edgeType:
             neighborNode = [edge[1], edge[2]]
-            neighbors.append(neighborNode)
+            neighbors.append(neighborNode) 
 
     return neighbors
 
 
@@ -60,7 +60,9 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -70,13 +72,16 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            )
+            ) - c
             positive_score = positive_go_annotated_pro_pro_neighbor_count + (
                 1
-                + len(positive_pro_pro_neighbor)
+                + (len(positive_pro_pro_neighbor) - c)
                 * positive_go_annotated_pro_pro_neighbor_count
             ) / (len(positive_go_neighbor) / 2)
 
+            c = 0 
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             # calculate the score for the negative set
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
@@ -89,7 +94,7 @@ def predict(
             )
             negative_score = negative_go_annotated_pro_pro_neighbor_count + (
                 1
-                + len(negative_pro_pro_neighbor)
+                + (len(negative_pro_pro_neighbor) - c)
                 * negative_go_annotated_pro_pro_neighbor_count
             ) / (len(negative_go_neighbor) / 2)
 
 
@@ -60,6 +60,9 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -69,7 +72,7 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, positive_pro_pro_neighbor, positive_go
                 )
-            )
+            ) - c
             positive_score = positive_go_annotated_pro_pro_neighbor_count + (
                 1 + positive_go_annotated_pro_pro_neighbor_count
             ) / (len(positive_go_neighbor))
@@ -83,7 +86,7 @@ def predict(
                 get_go_annotated_pro_pro_neighbor_count(
                     G, negative_pro_pro_neighbor, negative_go
                 )
-            )
+            ) 
             negative_score = negative_go_annotated_pro_pro_neighbor_count + (
                 1 + negative_go_annotated_pro_pro_neighbor_count
             ) / (len(negative_go_neighbor))
 
@@ -52,14 +52,20 @@ def predict(
             negative_dataset["go"],
         ):
 
+            c = 0 
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
-            data["degree"].append(G.degree(positive_protein))
+            data["degree"].append(G.degree(positive_protein) - c)
             data["true_label"].append(1)
 
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
-            data["degree"].append(G.degree(negative_protein))
+            data["degree"].append(G.degree(negative_protein) - c)
             data["true_label"].append(0)
             print_progress(i, len(positive_dataset["protein"]))
             i += 1
 
@@ -54,17 +54,23 @@ def predict(
             negative_dataset["go"],
         ):
 
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
             data["degree"].append(
-                len(get_neighbors(G, positive_protein, "protein_protein"))
+                len(get_neighbors(G, positive_protein, "protein_protein")) - c
             )
             data["true_label"].append(1)
 
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
             data["protein"].append(negative_protein)
             data["go_term"].append(negative_go)
             data["degree"].append(
-                len(get_neighbors(G, negative_protein, "protein_protein"))
+                len(get_neighbors(G, negative_protein, "protein_protein")) - c
             )
             data["true_label"].append(0)
             print_progress(i, len(positive_dataset["protein"]))
 
@@ -61,37 +61,37 @@ def main():
     go_protein_pairs = read_specific_columns(
         fly_go_association_path, go_inferred_columns, ","
     )
-
+        
     protein_list = []
 
     # if there is no graph.pickle file in the output/dataset directory, uncomment the following lines
-    G, protein_list = create_ppi_network(interactome, go_protein_pairs)
-    export_graph_to_pickle(G, graph_file_path)
+    # G, protein_list = create_ppi_network(interactome, go_protein_pairs)
+    # export_graph_to_pickle(G, testing_graph_file_path)
 
     # if there is no sample dataset, uncomment the following lines. otherwise, the dataset in outputs will be used
-    positive_dataset, negative_dataset = sample_data(
-        go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
-    )
+    # positive_dataset, negative_dataset = sample_data(
+    #     go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
+    # )
 
     # Define algorithm classes and their names
     algorithm_classes = {
-        # "OverlappingNeighbors": OverlappingNeighbors,
-        # "OverlappingNeighborsV2": OverlappingNeighborsV2,
-        # "OverlappingNeighborsV3": OverlappingNeighborsV3,
-        # "ProteinDegree": ProteinDegree,
-        # "ProteinDegreeV2": ProteinDegreeV2,
-        # "ProteinDegreeV3": ProteinDegreeV3,
-        # "SampleAlgorithm": SampleAlgorithm,
-        # "HypergeometricDistribution": HypergeometricDistribution,
+        "OverlappingNeighbors": OverlappingNeighbors,
+        "OverlappingNeighborsV2": OverlappingNeighborsV2,
+        "OverlappingNeighborsV3": OverlappingNeighborsV3,
+        "ProteinDegree": ProteinDegree,
+        "ProteinDegreeV2": ProteinDegreeV2,
+        "ProteinDegreeV3": ProteinDegreeV3,
+        "SampleAlgorithm": SampleAlgorithm,
+        "HypergeometricDistribution": HypergeometricDistribution,
         "HypergeometricDistributionV2": HypergeometricDistributionV2,
     }
 
     results = run_workflow(
         algorithm_classes,
-        dataset_directory_path,
-        graph_file_path,
-        output_data_path,
-        output_image_path,
+        testing_input_directory_path,
+        testing_graph_file_path,
+        testing_output_data_path,
+        testing_output_image_path,
         True,
         True,
     )
 
@@ -8,6 +8,9 @@
 from classes.protein_degree_v3_class import ProteinDegreeV3
 from classes.sample_algorithm import SampleAlgorithm
 from classes.base_algorithm_class import BaseAlgorithm
+from classes.hypergeometric_distribution_class import HypergeometricDistribution
+from classes.hypergeometric_distribution_class_V2 import HypergeometricDistributionV2
+
 from pathlib import Path
 from tools.helper import (
     read_specific_columns,
@@ -70,6 +73,8 @@ def test_algorithm_workflow():
         "ProteinDegree": ProteinDegree,
         "ProteinDegreeV2": ProteinDegreeV2,
         "ProteinDegreeV3": ProteinDegreeV3,
+        "HypergeometricDistribution": HypergeometricDistribution,
+        "HypergeometricDistributionV2": HypergeometricDistributionV2
     }
 
     results = run_workflow(
@@ -88,10 +93,8 @@ def test_algorithm_workflow():
         "ProteinDegree": 0.825,
         "ProteinDegreeV2": 0.675,
         "ProteinDegreeV3": 0.89,
-        "HypergeometricDistribution": 0.78,
-        "HypergeometricDistributionV2": 0.89,
-        "HypergeometricDistributionV3": 0.675,
-        "HypergeometricDistributionV4": 0.6
+        "HypergeometricDistribution": 0.76,
+        "HypergeometricDistributionV2": 0.86,
     }
 
     pr_results = {
@@ -102,14 +105,122 @@ def test_algorithm_workflow():
         "ProteinDegreeV2": 0.6367757242757243,
         "OverlappingNeighbors": 0.5329058916229968,
         "SampleAlgorithm": 0.4093791854859966,
-        "HypergeometricDistribution": 0.7899246806,
-        "HypergeometricDistributionV2": 0.8519169719,
-        "HypergeometricDistributionV3": 0.7142573629,
-        "HypergeometricDistributionV4": 0.6967847007,
+        "HypergeometricDistribution": 0.7899246805825753,
+        "HypergeometricDistributionV2":	0.8519169719169718,
+    }
+
+    for algorithm, metrics in results.items():
+        assert metrics["roc_auc"] == roc_results[algorithm]
+
+    for algorithm, metrics in results.items():
+        assert metrics["pr_auc"] == pr_results[algorithm]
+        
+
+def test_self_edge_case():  # Largely redundant with the workflow test above; kept as its own test for the sake of separation (it could be merged into that test).
+    if not os.path.exists("output"):  # create the output directory tree if it is missing
+        os.makedirs("output")
+    if not os.path.exists("output/dataset"):
+        os.makedirs("output/dataset")
+    if not os.path.exists("output/data"):
+        os.makedirs("output/data")
+    if not os.path.exists("output/images"):
+        os.makedirs("output/images")
+
+    output_data_path = Path("./output/data/")
+    output_image_path = Path("./output/images/")
+    input_directory_path = Path("./tests/testing-dataset/zfish")
+    graph_file_path = Path(input_directory_path, "graph.pickle")
+
+    algorithm_classes = {
+        "OverlappingNeighbors": OverlappingNeighbors,
+        "OverlappingNeighborsV2": OverlappingNeighborsV2,
+        "OverlappingNeighborsV3": OverlappingNeighborsV3,
+        "ProteinDegree": ProteinDegree,
+        "ProteinDegreeV2": ProteinDegreeV2,
+        "ProteinDegreeV3": ProteinDegreeV3,
+        "HypergeometricDistribution": HypergeometricDistribution,
+        "HypergeometricDistributionV2": HypergeometricDistributionV2
+    }
+
+    # zfish dataset: run the workflow on it, then compare each algorithm's ROC AUC and PR AUC with the stored values below.
+    results = run_workflow(
+        algorithm_classes,
+        input_directory_path,
+        graph_file_path,
+        output_data_path,
+        output_image_path,
+        False,
+        False,
+    )
+    roc_results = {
+        "OverlappingNeighbors": 0.715,
+        "OverlappingNeighborsV2": 0.8,
+        "OverlappingNeighborsV3": 0.7899999999999999,
+        "ProteinDegree": 0.9650000000000001,
+        "ProteinDegreeV2": 0.775,
+        "ProteinDegreeV3": 0.9750000000000001,
+        "HypergeometricDistribution": 0.5449999999999999,
+        "HypergeometricDistributionV2": 0.8300000000000001,
+    }
+
+    pr_results = {
+        "ProteinDegreeV3": 0.9754545454545455,
+        "ProteinDegree": 0.9675757575757575,
+        "OverlappingNeighborsV3": 0.8179265873015872,
+        "OverlappingNeighborsV2": 0.8292361111111111,
+        "ProteinDegreeV2": 0.7573318322544329,
+        "OverlappingNeighbors": 0.5794961247902424,
+        "SampleAlgorithm": 0.43900023737872035,  # NOTE(review): SampleAlgorithm is not in algorithm_classes above, so this value is presumably never compared - confirm
+        "HypergeometricDistribution": 0.5095882374849092,
+        "HypergeometricDistributionV2":	0.674983904983905,
+    }
+
+    for algorithm, metrics in results.items():
+        assert metrics["roc_auc"] == roc_results[algorithm]  # NOTE(review): exact float equality; a tolerance-based comparison may be more robust
+
+    for algorithm, metrics in results.items():
+        assert metrics["pr_auc"] == pr_results[algorithm]
+
+
+    # bsub dataset: repeat the same workflow run and AUC comparison with its own graph and stored values.
+    input_directory_path = Path("./tests/testing-dataset/bsub")
+    graph_file_path = Path(input_directory_path, "graph.pickle")
+    
+    results = run_workflow(
+        algorithm_classes,
+        input_directory_path,
+        graph_file_path,
+        output_data_path,
+        output_image_path,
+        False,
+        False,
+    )
+    roc_results = {
+        "OverlappingNeighbors": 0.575,
+        "OverlappingNeighborsV2": 0.6399999999999999,
+        "OverlappingNeighborsV3": 0.6399999999999999,
+        "ProteinDegree": 0.7050000000000001,
+        "ProteinDegreeV2": 0.54,
+        "ProteinDegreeV3": 0.71,
+        "HypergeometricDistribution": 0.51,
+        "HypergeometricDistributionV2": 0.8499999999999999,
+    }
+
+    pr_results = {
+        "ProteinDegreeV3": 0.6918311998459057,
+        "ProteinDegree": 0.6560890253537313,
+        "OverlappingNeighborsV3": 0.5933333333333334,
+        "OverlappingNeighborsV2": 0.5933333333333334,
+        "ProteinDegreeV2": 0.588080808080808,
+        "OverlappingNeighbors": 0.5224841799067805,
+        "SampleAlgorithm": 0.5922520550055379,  # NOTE(review): SampleAlgorithm is not in algorithm_classes above, so this value is presumably never compared - confirm
+        "HypergeometricDistribution": 0.5001244588744589,
+        "HypergeometricDistributionV2":	0.7131783494283495,
     }
 
     for algorithm, metrics in results.items():
         assert metrics["roc_auc"] == roc_results[algorithm]
 
     for algorithm, metrics in results.items():
         assert metrics["pr_auc"] == pr_results[algorithm]
+