Commit cfdee11

new version with groupby_cols
1 parent b8608ca

File tree

5 files changed: +61 -55 lines changed


cytominer_eval/evaluate.py

+8 -6

@@ -25,6 +25,7 @@ def evaluate(
     meta_features: List[str],
     replicate_groups: Union[List[str], dict],
     operation: str = "replicate_reproducibility",
+    groupby_columns: List[str] = ["Metadata_broad_sample"],
     similarity_metric: str = "pearson",
     replicate_reproducibility_quantile: float = 0.95,
     replicate_reproducibility_return_median_cor: bool = False,
@@ -34,7 +35,6 @@ def evaluate(
     mp_value_params: dict = {},
     enrichment_percentile: Union[float, List[float]] = 0.99,
     hitk_percent_list=[2, 5, 10],
-    hitk_group_col="pair_a_index",
 ):
     r"""Evaluate profile quality and strength.

@@ -69,6 +69,12 @@ def evaluate(
     operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional
         The specific evaluation metric to calculate. The default is
         "replicate_reproducibility".
+    groupby_columns : list of str, optional
+        Only used when operation = 'precision_recall' or 'hitk'.
+        The columns by which the similarity matrix is grouped and per which the operation is calculated.
+        For example, if groupby_columns = ["Metadata_broad_sample"], then precision/recall is calculated for each sample.
+        Note that these columns should be unique, or together span a unique space,
+        since precision and hit@k may otherwise stop making sense.
     similarity_metric : {'pearson', 'spearman', 'kendall'}, optional
         How to calculate pairwise similarity. Defaults to "pearson". We use the input
         in pandas.DataFrame.corr(). The default is "pearson".
@@ -112,10 +118,6 @@ def evaluate(
         A list of percentages at which to calculate the percent scores, i.e. the number of indexes below this percentage.
         If percent_list == "all", a full dict with the length of classes will be created.
         Percentages are given as integers, i.e. 50 means 50%.
-    hitk_group_col : str, optional
-        Only used when operation='hitk'.
-        The column over which the hits are indexed.
-        Only deviate from "pair_a_index" if you know what you are doing!
     """
     # Check replicate groups input
     check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups)
@@ -170,7 +172,7 @@ def evaluate(
         metric_result = hitk(
             similarity_melted_df=similarity_melted_df,
             replicate_groups=replicate_groups,
+            groupby_columns=groupby_columns,
             percent_list=hitk_percent_list,
-            group_col=hitk_group_col,
         )
     return metric_result
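For orientation, here is a minimal usage sketch of the new parameter (not part of the commit; the tiny synthetic profile table and the feature names f1-f3 are invented for illustration, and the import path assumes the package exports evaluate at the top level):

import numpy as np
import pandas as pd

from cytominer_eval import evaluate

# toy profile table: two compounds, two replicate wells each (hypothetical data)
rng = np.random.default_rng(0)
profiles = pd.DataFrame(
    {
        "Metadata_broad_sample": ["cmpdA", "cmpdA", "cmpdB", "cmpdB"],
        "Metadata_Plate": ["p1", "p1", "p1", "p1"],
        "Metadata_Well": ["A01", "A02", "B01", "B02"],
        "Metadata_moa": ["moa1", "moa1", "moa2", "moa2"],
        "f1": rng.normal(size=4),
        "f2": rng.normal(size=4),
        "f3": rng.normal(size=4),
    }
)

hits_list, percent_scores = evaluate(
    profiles=profiles,
    features=["f1", "f2", "f3"],
    meta_features=[c for c in profiles.columns if c.startswith("Metadata_")],
    replicate_groups=["Metadata_moa"],  # defines which neighbors count as correct hits
    operation="hitk",
    # one row per (sample, plate, well), so the groupby columns span a unique space
    groupby_columns=["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"],
    hitk_percent_list=[2, 5, 10],
)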

cytominer_eval/operations/hitk.py

+21 -15

@@ -4,16 +4,16 @@
 from typing import List, Union


-from cytominer_eval.utils.hitk_utils import add_hits_hits, percentage_scores
+from cytominer_eval.utils.hitk_utils import add_hit_rank, percentage_scores
 from cytominer_eval.utils.operation_utils import assign_replicates
 from cytominer_eval.utils.transform_utils import set_pair_ids, assert_melt


 def hitk(
     similarity_melted_df: pd.DataFrame,
     replicate_groups: List[str],
+    groupby_columns: List[str],
     percent_list: Union[int, List[int]],
-    group_col="pair_a_index",
 ) -> pd.DataFrame:
     """Calculate the hit@k hits list and percent scores.
     This function groups the similarity matrix by the groupby_columns and by similarity score, then determines the rank of each correct hit.
@@ -34,23 +34,24 @@ def hitk(
     replicate_groups : list or int
         a list of metadata column names in the original profile dataframe to use as replicate columns.

+    groupby_columns : list of str
+        The columns over which the similarity_melted_df is grouped.
+        Usually groupby_columns will span the full space of the input data,
+        such that drop_duplicates over the groupby_columns would not change the data.
+        If you group over Metadata_Plate, for example, you will get meaningless results.
+        This can easily be seen from the fact that the percent score at 100 will be nonzero.
+
     percent_list : list or "all"
         A list of percentages at which to calculate the percent scores, i.e. the number of hits below this percentage.
         If percent_list == "all", a full dict with the length of classes will be created.
         Percentages are given as integers, i.e. 50 means 50%.
-    group_col : str
-        group columns determine the column over which the similarity_melted_df is grouped.
-        Usually group_col will be "pair_a_index" since this follows metric_melt in its decision on using each row of the original matrix as a unique sample.
-        If you wish to group by Metadata_broad_sample or by Metadata_moa, you can do this.
-        However, this makes your results less intuitive and maybe meaningless.
-
     Returns
     -------
     hits_list : list
         full list of all hits. Can be used for histogram plotting.
     percent_scores : dict
-        dictionary of the percentage list and their corresponding percent scores (see above).
+        dictionary of the percentage list and their corresponding percent scores (see the percentage_scores function).
     """
     # make sure percent_list is a list
     if type(percent_list) == int:
@@ -66,20 +67,25 @@ def hitk(
     # Check to make sure that the melted dataframe is full
     assert_melt(similarity_melted_df, eval_metric="hitk")

-    # see documentation above, this should be "pair_index_a"
-    grouped = similarity_melted_df.groupby(group_col)
+    # extract the names of the groupby columns in the sim_df
+    pair_ids = set_pair_ids()
+    groupby_cols_suffix = [
+        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
+        for x in groupby_columns
+    ]
+
+    # group the sim_df by the suffixed groupby_columns
+    grouped = similarity_melted_df.groupby(groupby_cols_suffix)
     nr_of_groups = grouped.ngroups
     # within each group, add the rank of each connection to a new column
-    similarity_melted_with_rank = grouped.apply(add_hits_hits)
+    similarity_melted_with_rank = grouped.apply(lambda x: add_hit_rank(x))

     # make a list of the ranks of correct connections (hits), i.e. where group_replicate is True
     hits_list = similarity_melted_with_rank[
         similarity_melted_with_rank["group_replicate"] == True
     ]["rank"].tolist()

     # calculate the scores at each percentage
-    percent_scores = percentage_scores(
-        similarity_melted_df, hits_list, percent_list, nr_of_groups
-    )
+    percent_scores = percentage_scores(hits_list, percent_list, nr_of_groups)

     return hits_list, percent_scores
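To make the suffixing step concrete, a standalone sketch follows (not from the commit). The exact suffix strings are assumptions; the diff only shows that set_pair_ids() returns a dict whose entries carry a "suffix" field and that the old default group column was "pair_a_index":

# hypothetical stand-in for cytominer_eval.utils.transform_utils.set_pair_ids;
# the "_pair_a"/"_pair_b" suffix values are assumed for illustration
def set_pair_ids_stub():
    return {
        "pair_a": {"index": "pair_a_index", "suffix": "_pair_a"},
        "pair_b": {"index": "pair_b_index", "suffix": "_pair_b"},
    }

groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]
pair_ids = set_pair_ids_stub()
suffix = pair_ids[list(pair_ids)[0]]["suffix"]  # suffix of the first pair id
groupby_cols_suffix = ["{x}{suf}".format(x=x, suf=suffix) for x in groupby_columns]
print(groupby_cols_suffix)
# ['Metadata_broad_sample_pair_a', 'Metadata_Plate_pair_a', 'Metadata_Well_pair_a']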

cytominer_eval/tests/test_evaluate.py

+6 -15

@@ -44,7 +44,6 @@
 compound_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"]


-
 def test_evaluate_replicate_reproducibility():
     similarity_metrics = get_available_similarity_metrics()
     replicate_reproducibility_quantiles = [0.5, 0.95]
@@ -116,11 +115,7 @@ def test_evaluate_replicate_reprod_return_cor_true():

     assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949
     assert sorted(med_cor_df.columns.tolist()) == sorted(
-        [
-            "Metadata_gene_name",
-            "Metadata_pert_name",
-            "similarity_metric",
-        ]
+        ["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",]
     )


@@ -211,9 +206,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )
     assert np.round(top_result.grit, 4) == 2.3352
     assert top_result.group == "PTK2"
@@ -239,9 +232,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )

     assert np.round(top_result.grit, 4) == 0.9990
@@ -356,20 +347,20 @@ def test_evaluate_mp_value():


 def test_evaluate_hitk():
-    hitk_replicate_groups = ['Metadata_moa']
+    hitk_replicate_groups = ["Metadata_moa"]
     hitk_percent_list = "all"
+    groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]

     hitk_hits_list, percent_scores = evaluate(
         profiles=compound_profiles,
         features=compound_features,
         meta_features=compound_meta_features,
         replicate_groups=hitk_replicate_groups,
         operation="hitk",
+        groupby_columns=groupby_columns,
         hitk_percent_list=hitk_percent_list,
-        hitk_group_col="pair_a_index",
     )
     assert isclose(percent_scores[0], 150.75, abs_tol=1e-1)

     last_score = percent_scores[len(percent_scores) - 1]
     assert isclose(last_score, 0, abs_tol=1e-1)
-
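Per the hitk docstring, the returned hits list is suitable for histogram plotting; a minimal matplotlib sketch (not part of the commit), reusing hitk_hits_list from the test above:

import matplotlib.pyplot as plt

# hitk_hits_list holds the rank of each correct hit among its neighbors
plt.hist(hitk_hits_list, bins=30)
plt.xlabel("rank of correct hit")
plt.ylabel("count")
plt.show()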

cytominer_eval/tests/test_operations/test_hitk.py

+8 -5

@@ -29,7 +29,7 @@
     "Metadata_moa",
 ] = "none"
 df = df[~df["Metadata_moa"].isna()]
-df_len = df.shape[0]
+

 meta_features = [
     x for x in df.columns if (x.startswith("Metadata_") or x.startswith("Image_"))
@@ -46,12 +46,15 @@

 # compute the normal index_list
 replicate_group = ["Metadata_moa"]
+groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]
 percent_list = [2, 5, 10, 100]
+
+
 index_list, percent_results = hitk(
     similarity_melted_df=similarity_melted_df,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_list,
-    group_col="pair_a_index",
 )


@@ -60,8 +63,8 @@
 indexes_all, percent_results_all = hitk(
     similarity_melted_df=similarity_melted_df,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_all,
-    group_col="pair_a_index",
 )

 # compute the index with a randomized input
@@ -80,8 +83,8 @@

 ran_index_list, ran_percent_results = hitk(
     similarity_melted_df=similarity_melted_ran,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_list,
-    group_col="pair_a_index",
 )

 # if we use a combination of replicate groups that is unique for each index in the original df,
@@ -92,8 +95,8 @@
 index_list_empty, percent_results_empty = hitk(
     similarity_melted_df=similarity_melted_df,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_list,
-    group_col="pair_a_index",
 )
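The randomized input used above can be sketched as shuffling the similarity scores, which pushes the correct hits to uniformly random ranks and should drive the percent scores toward zero (a sketch, not the test's exact code; similarity_melted_df is the melted dataframe built in the test setup):

import numpy as np

similarity_melted_ran = similarity_melted_df.copy()
# permute the similarity column so that hit ranks become random
similarity_melted_ran["similarity_metric"] = np.random.default_rng(42).permutation(
    similarity_melted_ran["similarity_metric"].to_numpy()
)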

cytominer_eval/utils/hitk_utils.py

+18 -14

@@ -1,43 +1,42 @@
-def add_hits_hits(df):
-    """Adds the rank or index of each connection to the dataframe.
-    This column will later be used to create a full list of hits and their index.
+def add_hit_rank(df):
+    """Adds the rank/index of each connection to the dataframe.
+    This column will later be used to create a full list of hits.

     Parameters
     ----------
     df : sub-grouped dataframe from the similarity_melted_df

     Returns
     -------
-    dataframe with rank column
+    dataframe with an added rank column

     """
-    # rank all compounds by their similarity and assign the index/order to the rank column
+    # rank all compounds by their similarity
     df = df.sort_values(["similarity_metric"], ascending=False)
+    # and assign the index/order to the rank column
     df = df.assign(rank=range(len(df)))
-
     return df


-def percentage_scores(similarity_melted_df, hits_list, p_list, nr_of_groups):
-    """Calculates the number of hits below a certain percentage.
-    The function subtracts the expected random distribution from the accumulated score
+def percentage_scores(hits_list, p_list, nr_of_groups):
+    """Calculates the percent score, i.e. the cumulative number of hits below a given percentage.
+    The function counts the number of hits in hits_list that fall below a percentage of the maximum hit score (nr_of_groups).
+    It then subtracts the expected value from that accumulated count,
     such that random input should give scores around zero.
     If p_list = "all" then, instead of percentages, all classes are enumerated and hits counted.

     Parameters
     ----------
-    similarity_melted_df : pandas.DataFrame
-        An elongated symmetrical matrix indicating pairwise correlations between
-        samples
     hits_list : list
         long list of hits that correspond to the index of the replicate in the list of neighbors
     p_list : list or 'all'
         list of percentages to score. Percentages are given as integers, i.e. 50 is 50%.
-
+    nr_of_groups : int
+        number of groups that add_hit_rank was applied to.
     Returns
     -------
     d : dict
-        dictionary with percentages and scores or a full list of hits and scores
+        dictionary with percentages and scores or a full list of indexes and scores
     """
     # get the number of compounds in this dataset
     d = {}
@@ -69,4 +68,9 @@ def percentage_scores(similarity_melted_df, hits_list, p_list, nr_of_groups):
         accumulated_hits_n = len([i for i in hits_list if i <= p_value])
         d[p] = accumulated_hits_n - expected_hits

+        if p == 100 and d[p] != 0:
+            print(
+                f"The percent score at 100% is {d[p]}, but it should be 0. Check your groupby_columns."
+            )
+
     return d
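To illustrate the percent-score arithmetic, here is a self-contained sketch of the integer-percentage branch (assumptions: the rank cutoff p_value is p% of a maximum rank, and expected_hits is the uniform-random baseline; the commit shows neither definition, and the real function takes nr_of_groups rather than a max_rank argument):

# minimal sketch; p_value and expected_hits are assumed formulas
def percentage_scores_sketch(hits_list, p_list, max_rank):
    d = {}
    for p in p_list:
        p_value = max_rank * p / 100  # rank cutoff for p%
        accumulated_hits_n = len([i for i in hits_list if i <= p_value])
        expected_hits = len(hits_list) * p / 100  # uniform-random baseline
        d[p] = accumulated_hits_n - expected_hits  # ~0 for random input
    return d

# four hits ranked among up to 100 neighbors each
print(percentage_scores_sketch([0, 1, 5, 40], [2, 5, 10, 100], max_rank=100))
# {2: 1.92, 5: 2.8, 10: 2.6, 100: 0.0}

Note how the score at p = 100 is zero by construction, which is the sanity check that the new warning above relies on.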
