4
4
from typing import List , Union
5
5
6
6
7
- from cytominer_eval .utils .hitk_utils import add_hits_hits , percentage_scores
7
+ from cytominer_eval .utils .hitk_utils import add_hit_rank , percentage_scores
8
8
from cytominer_eval .utils .operation_utils import assign_replicates
9
9
from cytominer_eval .utils .transform_utils import set_pair_ids , assert_melt
10
10
11
11
12
12
def hitk (
13
13
similarity_melted_df : pd .DataFrame ,
14
14
replicate_groups : List [str ],
15
+ groupby_columns : List [str ],
15
16
percent_list : Union [int , List [int ]],
16
- group_col = "pair_a_index" ,
17
17
) -> pd .DataFrame :
18
18
"""Calculate the hit@k hits list and percent scores.
19
19
This function groups the similarity matrix by each sample (group_col) and by similarity score. It then determines the rank of each correct hit.
@@ -34,23 +34,24 @@ def hitk(
34
34
replicate_groups : list or int
35
35
a list of metadata column names in the original profile dataframe to use as replicate columns.
36
36
37
+ groupby_columns : list of str
38
+ group columns determine the columns over which the similarity_melted_df is grouped.
39
+ Usually groupby_columns will span the full space of the input data
40
+ such that drop_duplicates by the groupby_cols would not change the data.
41
+ If you group over Metadata_plate, for example, you will get meaningless results.
42
+ This can easily be seen from the fact that the percent score at 100 will be nonzero.
43
+
37
44
percent_list : list or "all"
38
45
A list of percentages at which to calculate the percent scores, ie the amount of hits below this percentage.
39
46
If percent_list == "all" a full dict with the length of classes will be created.
40
47
Percentages are given as integers, ie 50 means 50 %.
41
48
42
- group_col: str
43
- group columns determine the column over which the similarity_melted_df is grouped.
44
- Usually group_col will be "pair_a_index" since this follows metric_melt in its decision on using each row of the original matrix as a unique sample.
45
- If you wish to group by Metadata_broad_sample or by Metadata_moa, you can do this.
46
- However, this makes your results less intuitive and maybe meaningless.
47
-
48
49
Returns
49
50
-------
50
51
hits_list : list
51
52
full list of all hits. Can be used for histogram plotting.
52
53
percent_scores: dict
53
- dictionary of the percentage list and their corresponding percent scores (see above ).
54
+ dictionary of the percentage list and their corresponding percent scores (see the percentage_scores function).
54
55
"""
55
56
# make sure percent_list is a list
56
57
if type (percent_list ) == int :
@@ -66,20 +67,25 @@ def hitk(
66
67
# Check to make sure that the melted dataframe is full
67
68
assert_melt (similarity_melted_df , eval_metric = "hitk" )
68
69
69
- # see documentation above, this should be "pair_index_a"
70
- grouped = similarity_melted_df .groupby (group_col )
70
+ # Extract the name of the columns in the sim_df
71
+ pair_ids = set_pair_ids ()
72
+ groupby_cols_suffix = [
73
+ "{x}{suf}" .format (x = x , suf = pair_ids [list (pair_ids )[0 ]]["suffix" ])
74
+ for x in groupby_columns
75
+ ]
76
+
77
+ # group the sim_df by the groupby_columns
78
+ grouped = similarity_melted_df .groupby (groupby_cols_suffix )
71
79
nr_of_groups = grouped .ngroups
72
80
# Within each group, add the ranks of each connection to a new column
73
- similarity_melted_with_rank = grouped .apply (add_hits_hits )
81
+ similarity_melted_with_rank = grouped .apply (lambda x : add_hit_rank ( x ) )
74
82
75
83
# make a list of the ranks of correct connection (hits), ie where the group_replicate is true
76
84
hits_list = similarity_melted_with_rank [
77
85
similarity_melted_with_rank ["group_replicate" ] == True
78
86
]["rank" ].tolist ()
79
87
80
88
# calculate the scores at each percentage
81
- percent_scores = percentage_scores (
82
- similarity_melted_df , hits_list , percent_list , nr_of_groups
83
- )
89
+ percent_scores = percentage_scores (hits_list , percent_list , nr_of_groups )
84
90
85
91
return hits_list , percent_scores
0 commit comments