Reed-CompBio
diff --git a/‎classes/hypergeometric_distribution_class.py
Lines changed: 13 additions & 4 deletions b/‎classes/hypergeometric_distribution_class.py
Lines changed: 13 additions & 4 deletions
diff --git a/‎classes/hypergeometric_distribution_class_V2.py
Lines changed: 13 additions & 5 deletions b/‎classes/hypergeometric_distribution_class_V2.py
Lines changed: 13 additions & 5 deletions
diff --git a/‎classes/hypergeometric_distribution_class_V3.py renamed to ‎classes/removed_edge/overlapping_neighbors_class_no_pi.py
Lines changed: 12 additions & 21 deletions b/‎classes/hypergeometric_distribution_class_V3.py renamed to ‎classes/removed_edge/overlapping_neighbors_class_no_pi.py
Lines changed: 12 additions & 21 deletions
diff --git a/‎classes/hypergeometric_distribution_class_V4.py renamed to ‎classes/removed_edge/overlapping_neighbors_v2_class_no_pi.py
Lines changed: 30 additions & 31 deletions b/‎classes/hypergeometric_distribution_class_V4.py renamed to ‎classes/removed_edge/overlapping_neighbors_v2_class_no_pi.py
Lines changed: 30 additions & 31 deletions
@@ -76,11 +76,15 @@ def predict(
                 )
             )
 
+            c = 0
+            if G.has_edge(positive_protein, positive_protein):
+                c = 1 #Self-edge present: subtract 1 below so the protein itself is not counted among its own neighbors
+            
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
+            pos_n = len(positive_pro_pro_neighbor) - c #Number of protein neighbors the protein of interest has
             K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include protein of interest (but does not change significantly if protein is included)
-            pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
-
+            pos_k = positive_go_annotated_pro_pro_neighbor_count - c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest
+            
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
 
@@ -95,11 +99,16 @@ def predict(
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) #Negative protein of interest neighbors
+            c = 0
+            if G.has_edge(negative_protein, negative_protein):
+                c = 1
+
+            neg_n = len(negative_pro_pro_neighbor) - c #Negative protein of interest neighbors
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap between go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
 
+            
             # input positive and negative score to data
             data["protein"].append(positive_protein)
             data["go_term"].append(positive_go)
 
@@ -75,11 +75,15 @@ def predict(
                     G, positive_pro_pro_neighbor, positive_go
                 )
             )
-            
+
+            c = 1
+            if G.has_edge(positive_protein, positive_protein):
+                c = 0
+                
             N = len([x for x,y in G.nodes(data=True) if y['type']=="protein"]) #Total number of protein nodes in the entire graph
-            pos_n = len(positive_pro_pro_neighbor) + 1 #Number of protein neighbors the protein of interest has (includes the protein of interest)
+            pos_n = len(positive_pro_pro_neighbor) + c #Number of protein neighbors the protein of interest has (includes the protein of interest)
             K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
-            pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
+            pos_k = positive_go_annotated_pro_pro_neighbor_count + c #The overlap between the GO protein neighbors and protein neighbors of the protein of interest (includes the protein of interest)
 
             #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
             positive_score = 1 - ((math.comb(K,pos_k)*math.comb(N-K,pos_n-pos_k))/math.comb(N,pos_n))
@@ -95,7 +99,11 @@ def predict(
                 )
             )
 
-            neg_n = len(negative_pro_pro_neighbor) + 1 #Negative protein of interest neighbors (includes self)
+            c = 1
+            if G.has_edge(negative_protein, negative_protein):
+                c = 0
+                
+            neg_n = len(negative_pro_pro_neighbor) + c #Negative protein of interest neighbors (includes self)
             neg_k = negative_go_annotated_protein_neighbor_count #Overlap betweesn go neighbors and protein neighbors (should be fewer for neg than pos)
 
             negative_score = 1 - ((math.comb(K,neg_k)*math.comb(N-K,neg_n-neg_k))/math.comb(N,neg_n))
@@ -159,4 +167,4 @@ def get_go_annotated_pro_pro_neighbor_count(G: nx.Graph, nodeList, goTerm):
     for element in nodeList:
         if G.has_edge(element[0], goTerm):
             count += 1
-    return count
+    return count
@@ -4,12 +4,11 @@
 from colorama import init as colorama_init
 from colorama import Fore, Back, Style
 from pathlib import Path
-import math
 from tools.helper import print_progress, normalize, import_graph_from_pickle
 from tools.workflow import get_datasets
 
 
-class HypergeometricDistributionV3(BaseAlgorithm):
+class OverlappingNeighborsNoPI(BaseAlgorithm):
     def __init__(self):
         self.y_score = []
         self.y_true = []
@@ -33,16 +32,15 @@ def predict(
         output_path,
     ):
         """
-        Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of 
-        interest and a GO term. Only uses proteins inside the sub-network (comprised of proteins linked with the protein 
-        of interest and/or the GO term). Does not include the protein of interest.
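+        Evaluate the overlapping neighbors method on a protein-protein interaction network with GO term annotations.
+        The edge between each positive protein and its GO term is removed before scoring and added back afterwards.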
         """
         colorama_init()
 
         # have two sets of positive and negative protein-go_term pairs
         # for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term.
         # 50% of the data are proteins that are annotated to a GO term
         # 50% of the data are proteins that are not annotated to a GO term
+        # score equation (1 + number of ProProNeighbor that are annotated to the go term) / (number of ProProNeighbor + number of GoNeighbor)
 
         data = {
             "protein": [],
@@ -65,7 +63,7 @@ def predict(
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-
+            G.remove_edge(positive_protein, positive_go)
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -76,14 +74,9 @@ def predict(
                     G, positive_pro_pro_neighbor, positive_go
                 )
             )
-            
-            pos_N = len(positive_pro_pro_neighbor) + len(positive_go_neighbor) -positive_go_annotated_pro_pro_neighbor_count - 1 #Sample size is only the neighbors of the protein & GO term of interest
-            pos_n = len(positive_pro_pro_neighbor) #Number of protein neighbors the protein of interest has
-            K = len(positive_go_neighbor) - 1 #Number of protein neighbors the GO term of interest has, same for pos & neg, does not include the protein of interest
-            pos_k = positive_go_annotated_pro_pro_neighbor_count #The overlap between the GO term and the protein of interst's neighbor proteins
-            print("pos_N: ", pos_N, "pos_n: ", pos_n, "K: ", K, "pos_k: ", pos_k)
-            #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
-            positive_score = 1 - ((math.comb(K,pos_k)*math.comb(pos_N-K,pos_n-pos_k))/math.comb(pos_N,pos_n))
+            positive_score = (1 + positive_go_annotated_pro_pro_neighbor_count) / (
+                len(positive_pro_pro_neighbor) + len(positive_go_neighbor)
+            )
 
             # calculate the score for the negative set
             negative_pro_pro_neighbor = get_neighbors(
@@ -95,12 +88,9 @@ def predict(
                     G, negative_pro_pro_neighbor, negative_go
                 )
             )
-
-            neg_N = len(negative_pro_pro_neighbor) + len(negative_go_neighbor) - negative_go_annotated_protein_neighbor_count 
-            neg_n = len(negative_pro_pro_neighbor) 
-            neg_k = negative_go_annotated_protein_neighbor_count
-
-            negative_score = 1 - ((math.comb(K,neg_k)*math.comb(neg_N-K,neg_n-neg_k))/math.comb(neg_N,neg_n))
+            negative_score = (1 + negative_go_annotated_protein_neighbor_count) / (
+                len(negative_pro_pro_neighbor) + len(negative_go_neighbor)
+            )
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)
@@ -124,6 +114,7 @@ def predict(
             data["true_label"].append(0)
 
             print_progress(i, len(positive_dataset["protein"]))
+            G.add_edge(positive_protein, positive_go, type="protein_go_term")
             i += 1
 
         normalized_data = normalize(data["score"])
@@ -134,7 +125,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometric_distribution_v3.csv"),
+            Path(output_path, "overlapping_neighbor_data.csv"),
             index=False,
             sep="\t",
         )
 
@@ -1,25 +1,22 @@
 from classes.base_algorithm_class import BaseAlgorithm
 import networkx as nx
 import pandas as pd
-from colorama import init as colorama_init
-from colorama import Fore, Back, Style
+from tools.helper import normalize, print_progress, import_graph_from_pickle
 from pathlib import Path
-import math
-from tools.helper import print_progress, normalize, import_graph_from_pickle
 from tools.workflow import get_datasets
 
 
-class HypergeometricDistributionV4(BaseAlgorithm):
+class OverlappingNeighborsV2NoPI(BaseAlgorithm):
     def __init__(self):
         self.y_score = []
         self.y_true = []
 
     def get_y_score(self):
         return self.y_score
-    
+
     def get_y_true(self):
         return self.y_true
-    
+
     def set_y_score(self, y_score):
         self.y_score = y_score
 
@@ -33,16 +30,14 @@ def predict(
         output_path,
     ):
         """
-        Uses a Hypergeometric distribution to calculate a confidence value for the relationship between a protein of 
-        interest and a GO term. Only uses proteins inside the sub-network (comprised of proteins linked with the protein 
-        of interest and/or the GO term). Accounts for protein of interest. 
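+        Evaluate the overlapping neighbors (v2) method on a protein-protein interaction network with GO term annotations.
+        The edge between each positive protein and its GO term is removed before scoring and added back afterwards.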
         """
-        colorama_init()
 
         # have two sets of positive and negative protein-go_term pairs
         # for each pair, calculate the score of how well they predict whether a protein should be annotated to a GO term.
         # 50% of the data are proteins that are annotated to a GO term
         # 50% of the data are proteins that are not annotated to a GO term
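+        # score equation: (number of ProProNeighbor annotated to the go term)
+        #   + (1 + number of ProProNeighbor * number of ProProNeighbor annotated to the go term) / (number of GoNeighbor / 2);
+        #   the score is 0 when the go term has no GoNeighbor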
 
         data = {
             "protein": [],
@@ -56,16 +51,16 @@ def predict(
         }
 
         positive_dataset, negative_dataset = get_datasets(input_directory_path)
-        G = import_graph_from_pickle(graph_file_path)
 
+        G = import_graph_from_pickle(graph_file_path)
         i = 1
         for positive_protein, positive_go, negative_protein, negative_go in zip(
             positive_dataset["protein"],
             positive_dataset["go"],
             negative_dataset["protein"],
             negative_dataset["go"],
         ):
-
+            G.remove_edge(positive_protein, positive_go)
             # calculate the score for the positive set
             positive_pro_pro_neighbor = get_neighbors(
                 G, positive_protein, "protein_protein"
@@ -76,32 +71,35 @@ def predict(
                     G, positive_pro_pro_neighbor, positive_go
                 )
             )
-            
-            #Protein of interest neighbors + go term of protein neighbors - overlap
-            pos_N = len(positive_pro_pro_neighbor) + len(positive_go_neighbor) - positive_go_annotated_pro_pro_neighbor_count #Sample size is only the neighbors of the protein & GO term of interest
-            pos_n = len(positive_pro_pro_neighbor)+1 #Number of protein neighbors the protein of interest has (includes self)
-            K = len(positive_go_neighbor) #Number of protein neighbors the GO term of interest has, same for pos & neg
-            pos_k = positive_go_annotated_pro_pro_neighbor_count + 1 #The overlap between the GO and protein neighbor proteins (includes self)
-            
-            #The hypergeometric function using variables above, math.comb(n,k) is an n choose k function
-            positive_score = 1 - ((math.comb(K,pos_k)*math.comb(pos_N-K,pos_n-pos_k))/math.comb(pos_N,pos_n))
+
+            #If the GO term has no protein neighbors, the denominator below would be zero, so the score is set to 0.
+            #This assumes such a protein has no chance of being annotated to the term (maybe not the best assumption), but it does not seem to change the ROC or PR AUC much.
+            if len(positive_go_neighbor) == 0:
+                positive_score = 0
+            else:
+                positive_score = positive_go_annotated_pro_pro_neighbor_count + (
+                    1
+                    + len(positive_pro_pro_neighbor)
+                    * positive_go_annotated_pro_pro_neighbor_count
+                ) / (len(positive_go_neighbor) / 2)
 
             # calculate the score for the negative set
             negative_pro_pro_neighbor = get_neighbors(
                 G, negative_protein, "protein_protein"
             )
             negative_go_neighbor = get_neighbors(G, negative_go, "protein_go_term")
-            negative_go_annotated_protein_neighbor_count = (
+            negative_go_annotated_pro_pro_neighbor_count = (
                 get_go_annotated_pro_pro_neighbor_count(
                     G, negative_pro_pro_neighbor, negative_go
                 )
             )
-
-            neg_N = len(negative_pro_pro_neighbor) + len(negative_go_neighbor) - negative_go_annotated_protein_neighbor_count + 1 #Self is not accounted for by GO term since there is no connection
-            neg_n = len(negative_pro_pro_neighbor) + 1 #Include self
-            neg_k = negative_go_annotated_protein_neighbor_count
-
-            negative_score = 1 - ((math.comb(K,neg_k)*math.comb(neg_N-K,neg_n-neg_k))/math.comb(neg_N,neg_n))
+            if len(negative_go_neighbor) == 0:
+                negative_score = 0
+            else:
+                negative_score = negative_go_annotated_pro_pro_neighbor_count + (
+                    1
+                    + len(negative_pro_pro_neighbor)
+                    * negative_go_annotated_pro_pro_neighbor_count
+                ) / (len(negative_go_neighbor) / 2)
 
             # input positive and negative score to data
             data["protein"].append(positive_protein)
@@ -119,12 +117,13 @@ def predict(
             data["pro_pro_neighbor"].append(len(negative_pro_pro_neighbor))
             data["go_neighbor"].append(len(negative_go_neighbor))
             data["go_annotated_pro_pro_neighbors"].append(
-                negative_go_annotated_protein_neighbor_count
+                negative_go_annotated_pro_pro_neighbor_count
             )
             data["score"].append(negative_score)
             data["true_label"].append(0)
 
             print_progress(i, len(positive_dataset["protein"]))
+            G.add_edge(positive_protein, positive_go, type="protein_go_term")
             i += 1
 
         normalized_data = normalize(data["score"])
@@ -135,7 +134,7 @@ def predict(
         df = df.sort_values(by="norm_score", ascending=False)
 
         df.to_csv(
-            Path(output_path, "hypergeometric_distribution_v4_data.csv"),
+            Path(output_path, "overlapping_neighbor_v2_data.csv"),
             index=False,
             sep="\t",
         )