Commit cfdee11

new version with groupby_cols
1 parent b8608ca

File tree

5 files changed: +61 -55 lines changed


cytominer_eval/evaluate.py

+8 -6

@@ -25,6 +25,7 @@ def evaluate(
     meta_features: List[str],
     replicate_groups: Union[List[str], dict],
     operation: str = "replicate_reproducibility",
+    groupby_columns: List[str] = ["Metadata_broad_sample"],
     similarity_metric: str = "pearson",
     replicate_reproducibility_quantile: float = 0.95,
     replicate_reproducibility_return_median_cor: bool = False,
@@ -34,7 +35,6 @@ def evaluate(
     mp_value_params: dict = {},
     enrichment_percentile: Union[float, List[float]] = 0.99,
     hitk_percent_list=[2, 5, 10],
-    hitk_group_col="pair_a_index",
 ):
     r"""Evaluate profile quality and strength.

@@ -69,6 +69,12 @@ def evaluate(
     operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional
         The specific evaluation metric to calculate. The default is
         "replicate_reproducibility".
+    groupby_columns : list of str, optional
+        Only used when operation = 'precision_recall' or 'hitk'.
+        The columns by which the similarity matrix is grouped and per which the operation is calculated.
+        For example, if groupby_columns = ["Metadata_broad_sample"], then precision/recall is calculated for each sample.
+        Note that these columns should be unique, or together span a unique space,
+        since precision and hit@k may otherwise stop making sense.
     similarity_metric : {'pearson', 'spearman', 'kendall'}, optional
         How to calculate pairwise similarity. Defaults to "pearson". We use the input
         in pandas.DataFrame.corr(). The default is "pearson".
@@ -112,10 +118,6 @@ def evaluate(
         A list of percentages at which to calculate the percent scores, i.e. the number of indexes below this percentage.
         If percent_list == "all", a full dict with the length of classes will be created.
         Percentages are given as integers, i.e. 50 means 50%.
-    hitk_group_col : str, optional
-        Only used when operation='hitk'.
-        The column over which the hits are indexed.
-        Only deviate from "pair_a_index" if you know what you are doing!
     """
     # Check replicate groups input
     check_replicate_groups(eval_metric=operation, replicate_groups=replicate_groups)
@@ -170,7 +172,7 @@ def evaluate(
         metric_result = hitk(
             similarity_melted_df=similarity_melted_df,
             replicate_groups=replicate_groups,
+            groupby_columns=groupby_columns,
             percent_list=hitk_percent_list,
-            group_col=hitk_group_col,
         )
     return metric_result
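For orientation, here is a minimal usage sketch of the new parameter (not part of the commit; the tiny synthetic profile table and the feature names f1-f3 are invented for illustration, and the import path assumes the package exports evaluate at the top level):

import numpy as np
import pandas as pd

from cytominer_eval import evaluate

# toy profile table: two compounds, two replicate wells each (hypothetical data)
rng = np.random.default_rng(0)
profiles = pd.DataFrame(
    {
        "Metadata_broad_sample": ["cmpdA", "cmpdA", "cmpdB", "cmpdB"],
        "Metadata_Plate": ["p1", "p1", "p1", "p1"],
        "Metadata_Well": ["A01", "A02", "B01", "B02"],
        "Metadata_moa": ["moa1", "moa1", "moa2", "moa2"],
        "f1": rng.normal(size=4),
        "f2": rng.normal(size=4),
        "f3": rng.normal(size=4),
    }
)

hits_list, percent_scores = evaluate(
    profiles=profiles,
    features=["f1", "f2", "f3"],
    meta_features=[c for c in profiles.columns if c.startswith("Metadata_")],
    replicate_groups=["Metadata_moa"],  # defines which neighbors count as correct hits
    operation="hitk",
    # one row per (sample, plate, well), so the groupby columns span a unique space
    groupby_columns=["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"],
    hitk_percent_list=[2, 5, 10],
)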

cytominer_eval/operations/hitk.py

+21 -15

@@ -4,16 +4,16 @@
 from typing import List, Union


-from cytominer_eval.utils.hitk_utils import add_hits_hits, percentage_scores
+from cytominer_eval.utils.hitk_utils import add_hit_rank, percentage_scores
 from cytominer_eval.utils.operation_utils import assign_replicates
 from cytominer_eval.utils.transform_utils import set_pair_ids, assert_melt


 def hitk(
     similarity_melted_df: pd.DataFrame,
     replicate_groups: List[str],
+    groupby_columns: List[str],
     percent_list: Union[int, List[int]],
-    group_col="pair_a_index",
 ) -> pd.DataFrame:
     """Calculate the hit@k hits list and percent scores.
     This function groups the similarity matrix by the groupby_columns and by similarity score, then determines the rank of each correct hit.
@@ -34,23 +34,24 @@ def hitk(
     replicate_groups : list or int
         a list of metadata column names in the original profile dataframe to use as replicate columns.

+    groupby_columns : list of str
+        The columns over which the similarity_melted_df is grouped.
+        Usually groupby_columns will span the full space of the input data,
+        such that drop_duplicates over the groupby_columns would not change the data.
+        If you group over Metadata_Plate, for example, you will get meaningless results.
+        This can easily be seen from the fact that the percent score at 100 will be nonzero.
+
     percent_list : list or "all"
         A list of percentages at which to calculate the percent scores, i.e. the number of hits below this percentage.
         If percent_list == "all", a full dict with the length of classes will be created.
         Percentages are given as integers, i.e. 50 means 50%.
-    group_col : str
-        group columns determine the column over which the similarity_melted_df is grouped.
-        Usually group_col will be "pair_a_index" since this follows metric_melt in its decision on using each row of the original matrix as a unique sample.
-        If you wish to group by Metadata_broad_sample or by Metadata_moa, you can do this.
-        However, this makes your results less intuitive and maybe meaningless.
-
     Returns
     -------
     hits_list : list
         full list of all hits. Can be used for histogram plotting.
     percent_scores : dict
-        dictionary of the percentage list and their corresponding percent scores (see above).
+        dictionary of the percentage list and their corresponding percent scores (see the percentage_scores function).
     """
     # make sure percent_list is a list
     if type(percent_list) == int:
@@ -66,20 +67,25 @@ def hitk(
     # Check to make sure that the melted dataframe is full
     assert_melt(similarity_melted_df, eval_metric="hitk")

-    # see documentation above, this should be "pair_index_a"
-    grouped = similarity_melted_df.groupby(group_col)
+    # extract the names of the groupby columns in the sim_df
+    pair_ids = set_pair_ids()
+    groupby_cols_suffix = [
+        "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
+        for x in groupby_columns
+    ]
+
+    # group the sim_df by the suffixed groupby_columns
+    grouped = similarity_melted_df.groupby(groupby_cols_suffix)
     nr_of_groups = grouped.ngroups
     # within each group, add the rank of each connection to a new column
-    similarity_melted_with_rank = grouped.apply(add_hits_hits)
+    similarity_melted_with_rank = grouped.apply(lambda x: add_hit_rank(x))

     # make a list of the ranks of correct connections (hits), i.e. where group_replicate is True
     hits_list = similarity_melted_with_rank[
         similarity_melted_with_rank["group_replicate"] == True
     ]["rank"].tolist()

     # calculate the scores at each percentage
-    percent_scores = percentage_scores(
-        similarity_melted_df, hits_list, percent_list, nr_of_groups
-    )
+    percent_scores = percentage_scores(hits_list, percent_list, nr_of_groups)

     return hits_list, percent_scores
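To make the suffixing step concrete, a standalone sketch follows (not from the commit). The exact suffix strings are assumptions; the diff only shows that set_pair_ids() returns a dict whose entries carry a "suffix" field and that the old default group column was "pair_a_index":

# hypothetical stand-in for cytominer_eval.utils.transform_utils.set_pair_ids;
# the "_pair_a"/"_pair_b" suffix values are assumed for illustration
def set_pair_ids_stub():
    return {
        "pair_a": {"index": "pair_a_index", "suffix": "_pair_a"},
        "pair_b": {"index": "pair_b_index", "suffix": "_pair_b"},
    }

groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]
pair_ids = set_pair_ids_stub()
suffix = pair_ids[list(pair_ids)[0]]["suffix"]  # suffix of the first pair id
groupby_cols_suffix = ["{x}{suf}".format(x=x, suf=suffix) for x in groupby_columns]
print(groupby_cols_suffix)
# ['Metadata_broad_sample_pair_a', 'Metadata_Plate_pair_a', 'Metadata_Well_pair_a']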

cytominer_eval/tests/test_evaluate.py

+6 -15

@@ -44,7 +44,6 @@
 compound_groups = ["Metadata_broad_sample", "Metadata_mg_per_ml"]


-
 def test_evaluate_replicate_reproducibility():
     similarity_metrics = get_available_similarity_metrics()
     replicate_reproducibility_quantiles = [0.5, 0.95]
@@ -116,11 +115,7 @@ def test_evaluate_replicate_reprod_return_cor_true():

     assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949
     assert sorted(med_cor_df.columns.tolist()) == sorted(
-        [
-            "Metadata_gene_name",
-            "Metadata_pert_name",
-            "similarity_metric",
-        ]
+        ["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",]
     )


@@ -211,9 +206,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )
     assert np.round(top_result.grit, 4) == 2.3352
     assert top_result.group == "PTK2"
@@ -239,9 +232,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )

     assert np.round(top_result.grit, 4) == 0.9990
@@ -356,20 +347,20 @@ def test_evaluate_mp_value():


 def test_evaluate_hitk():
-    hitk_replicate_groups = ['Metadata_moa']
+    hitk_replicate_groups = ["Metadata_moa"]
     hitk_percent_list = "all"
+    groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]

     hitk_hits_list, percent_scores = evaluate(
         profiles=compound_profiles,
         features=compound_features,
         meta_features=compound_meta_features,
         replicate_groups=hitk_replicate_groups,
         operation="hitk",
+        groupby_columns=groupby_columns,
         hitk_percent_list=hitk_percent_list,
-        hitk_group_col="pair_a_index",
     )
     assert isclose(percent_scores[0], 150.75, abs_tol=1e-1)

     last_score = percent_scores[len(percent_scores) - 1]
     assert isclose(last_score, 0, abs_tol=1e-1)
-
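Per the hitk docstring, the returned hits list is suitable for histogram plotting; a minimal matplotlib sketch (not part of the commit), reusing hitk_hits_list from the test above:

import matplotlib.pyplot as plt

# hitk_hits_list holds the rank of each correct hit among its neighbors
plt.hist(hitk_hits_list, bins=30)
plt.xlabel("rank of correct hit")
plt.ylabel("count")
plt.show()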

cytominer_eval/tests/test_operations/test_hitk.py

+8 -5

@@ -29,7 +29,7 @@
     "Metadata_moa",
 ] = "none"
 df = df[~df["Metadata_moa"].isna()]
-df_len = df.shape[0]
+

 meta_features = [
     x for x in df.columns if (x.startswith("Metadata_") or x.startswith("Image_"))
@@ -46,12 +46,15 @@

 # compute the normal index_list
 replicate_group = ["Metadata_moa"]
+groupby_columns = ["Metadata_broad_sample", "Metadata_Plate", "Metadata_Well"]
 percent_list = [2, 5, 10, 100]
+
+
 index_list, percent_results = hitk(
     similarity_melted_df=similarity_melted_df,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_list,
-    group_col="pair_a_index",
 )


@@ -60,8 +63,8 @@
 indexes_all, percent_results_all = hitk(
     similarity_melted_df=similarity_melted_df,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_all,
-    group_col="pair_a_index",
 )

 # compute the index with a randomized input
@@ -80,8 +83,8 @@

 ran_index_list, ran_percent_results = hitk(
     similarity_melted_df=similarity_melted_ran,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_list,
-    group_col="pair_a_index",
 )

 # if we use a combination of replicate groups that is unique for each index in the original df,
@@ -92,8 +95,8 @@
 index_list_empty, percent_results_empty = hitk(
     similarity_melted_df=similarity_melted_df,
     replicate_groups=replicate_group,
+    groupby_columns=groupby_columns,
     percent_list=percent_list,
-    group_col="pair_a_index",
 )
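The randomized input used above can be sketched as shuffling the similarity scores, which pushes the correct hits to uniformly random ranks and should drive the percent scores toward zero (a sketch, not the test's exact code; similarity_melted_df is the melted dataframe built in the test setup):

import numpy as np

similarity_melted_ran = similarity_melted_df.copy()
# permute the similarity column so that hit ranks become random
similarity_melted_ran["similarity_metric"] = np.random.default_rng(42).permutation(
    similarity_melted_ran["similarity_metric"].to_numpy()
)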

cytominer_eval/utils/hitk_utils.py

+18 -14

@@ -1,43 +1,42 @@
-def add_hits_hits(df):
-    """Adds the rank or index of each connection to the dataframe.
-    This column will later be used to create a full list of hits and their index.
+def add_hit_rank(df):
+    """Adds the rank/index of each connection to the dataframe.
+    This column will later be used to create a full list of hits.

     Parameters
     ----------
     df : sub-grouped dataframe from the similarity_melted_df

     Returns
     -------
-    dataframe with rank column
+    dataframe with an added rank column

     """
-    # rank all compounds by their similarity and assign the index/order to the rank column
+    # rank all compounds by their similarity
     df = df.sort_values(["similarity_metric"], ascending=False)
+    # and assign the index/order to the rank column
     df = df.assign(rank=range(len(df)))
-
     return df


-def percentage_scores(similarity_melted_df, hits_list, p_list, nr_of_groups):
-    """Calculates the number of hits below a certain percentage.
-    The function subtracts the expected random distribution from the accumulated score
+def percentage_scores(hits_list, p_list, nr_of_groups):
+    """Calculates the percent score, i.e. the cumulative number of hits below a given percentage.
+    The function counts the number of hits in hits_list that fall below a percentage of the maximum hit score (nr_of_groups).
+    It then subtracts the expected value from that accumulated count,
     such that random input should give scores around zero.
     If p_list = "all" then, instead of percentages, all classes are enumerated and hits counted.

     Parameters
     ----------
-    similarity_melted_df : pandas.DataFrame
-        An elongated symmetrical matrix indicating pairwise correlations between
-        samples
     hits_list : list
         long list of hits that correspond to the index of the replicate in the list of neighbors
     p_list : list or 'all'
         list of percentages to score. Percentages are given as integers, i.e. 50 is 50%.
-
+    nr_of_groups : int
+        number of groups that add_hit_rank was applied to.
     Returns
     -------
     d : dict
-        dictionary with percentages and scores or a full list of hits and scores
+        dictionary with percentages and scores or a full list of indexes and scores
     """
     # get the number of compounds in this dataset
     d = {}
@@ -69,4 +68,9 @@ def percentage_scores(similarity_melted_df, hits_list, p_list, nr_of_groups):
         accumulated_hits_n = len([i for i in hits_list if i <= p_value])
         d[p] = accumulated_hits_n - expected_hits

+        if p == 100 and d[p] != 0:
+            print(
+                f"The percent score at 100% is {d[p]}, but it should be 0. Check your groupby_columns."
+            )
+
     return d
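To illustrate the percent-score arithmetic, here is a self-contained sketch of the integer-percentage branch (assumptions: the rank cutoff p_value is p% of a maximum rank, and expected_hits is the uniform-random baseline; the commit shows neither definition, and the real function takes nr_of_groups rather than a max_rank argument):

# minimal sketch; p_value and expected_hits are assumed formulas
def percentage_scores_sketch(hits_list, p_list, max_rank):
    d = {}
    for p in p_list:
        p_value = max_rank * p / 100  # rank cutoff for p%
        accumulated_hits_n = len([i for i in hits_list if i <= p_value])
        expected_hits = len(hits_list) * p / 100  # uniform-random baseline
        d[p] = accumulated_hits_n - expected_hits  # ~0 for random input
    return d

# four hits ranked among up to 100 neighbors each
print(percentage_scores_sketch([0, 1, 5, 40], [2, 5, 10, 100], max_rank=100))
# {2: 1.92, 5: 2.8, 10: 2.6, 100: 0.0}

Note how the score at p = 100 is zero by construction, which is the sanity check that the new warning above relies on.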
