44from typing import List , Union
55
66
7- from cytominer_eval .utils .hitk_utils import add_hits_hits , percentage_scores
7+ from cytominer_eval .utils .hitk_utils import add_hit_rank , percentage_scores
88from cytominer_eval .utils .operation_utils import assign_replicates
99from cytominer_eval .utils .transform_utils import set_pair_ids , assert_melt
1010
1111
1212def hitk (
1313 similarity_melted_df : pd .DataFrame ,
1414 replicate_groups : List [str ],
15+ groupby_columns : List [str ],
1516 percent_list : Union [int , List [int ]],
16- group_col = "pair_a_index" ,
1717) -> pd .DataFrame :
1818 """Calculate the hit@k hits list and percent scores.
1919 This function groups the similarity matrix by each sample (groupby_columns) and by similarity score. It then determines the rank of each correct hit.
@@ -34,23 +34,24 @@ def hitk(
3434 replicate_groups : list of str
3535 a list of metadata column names in the original profile dataframe to use as replicate columns.
3636
37+ groupby_columns : list of str
38+ Group columns determine the columns over which the similarity_melted_df is grouped.
39+ Usually groupby_columns will span the full space of the input data
40+ such that drop_duplicates by the groupby_columns would not change the data.
41+ If you group over Metadata_plate, for example, you will get meaningless results.
42+ This can easily be seen from the fact that the percent score at 100 will be nonzero.
43+
3744 percent_list : list or "all"
3845 A list of percentages at which to calculate the percent scores, ie the amount of hits below this percentage.
3946 If percent_list == "all" a full dict with the length of classes will be created.
4047 Percentages are given as integers, ie 50 means 50 %.
4148
42- group_col: str
43- group columns determine the column over which the similarity_melted_df is grouped.
44- Usually group_col will be "pair_a_index" since this follows metric_melt in its decision on using each row of the original matrix as a unique sample.
45- If you wish to group by Metadata_broad_sample or by Metadata_moa, you can do this.
46- However, this makes your results less intuitive and maybe meaningless.
47-
4849 Returns
4950 -------
5051 hits_list : list
5152 full list of all hits. Can be used for histogram plotting.
5253 percent_scores: dict
53- dictionary of the percentage list and their corresponding percent scores (see above ).
54+ dictionary of the percentage list and their corresponding percent scores (see the percentage_scores function).
5455 """
5556 # make sure percent_list is a list
5657 if type (percent_list ) == int :
@@ -66,20 +67,25 @@ def hitk(
6667 # Check to make sure that the melted dataframe is full
6768 assert_melt (similarity_melted_df , eval_metric = "hitk" )
6869
69- # see documentation above, this should be "pair_index_a"
70- grouped = similarity_melted_df .groupby (group_col )
70+ # Extract the name of the columns in the sim_df
71+ pair_ids = set_pair_ids ()
72+ groupby_cols_suffix = [
73+ "{x}{suf}" .format (x = x , suf = pair_ids [list (pair_ids )[0 ]]["suffix" ])
74+ for x in groupby_columns
75+ ]
76+
77+ # group the sim_df by the groupby_columns
78+ grouped = similarity_melted_df .groupby (groupby_cols_suffix )
7179 nr_of_groups = grouped .ngroups
7280 # Within each group, add the ranks of each connection to a new column
73- similarity_melted_with_rank = grouped .apply (add_hits_hits )
81+ similarity_melted_with_rank = grouped .apply (lambda x : add_hit_rank ( x ) )
7482
7583 # make a list of the ranks of the correct connections (hits), i.e. the rows where group_replicate is True
7684 hits_list = similarity_melted_with_rank [
7785 similarity_melted_with_rank ["group_replicate" ] == True
7886 ]["rank" ].tolist ()
7987
8088 # calculate the scores at each percentage
81- percent_scores = percentage_scores (
82- similarity_melted_df , hits_list , percent_list , nr_of_groups
83- )
89+ percent_scores = percentage_scores (hits_list , percent_list , nr_of_groups )
8490
8591 return hits_list , percent_scores
0 commit comments