
Commit 4bfcf32

Modified evaluate.py to address the output format changes requested in the PR; outputs are now cleaner and written to their own sub-directory.
1 parent 5d87995 commit 4bfcf32

File tree

1 file changed: +31 -11 lines changed


sr-eval/evaluate.py

+31 -11
@@ -1,11 +1,9 @@
 import argparse
-import collections
 from collections import defaultdict, Counter
 import pathlib
-import goldretriever
 import pandas as pd
 import json
-import mmif
+from clams_utils.aapb import goldretriever
 
 # constant:
 GOLD_URL = "https://github.com/clamsproject/aapb-annotations/tree/bebd93af0882b8cf942ba827917938b49570d6d9/scene-recognition/golds"
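The only behavioral change in this hunk is that goldretriever now comes from the clams_utils package instead of a local module; the unused collections and mmif imports are dropped. A minimal sketch of how the retained helper is used, assuming the clams_utils package is installed (the gold_dir variable mirrors its use in the __main__ block further down):

# minimal sketch, assuming clams_utils provides clams_utils.aapb.goldretriever
from clams_utils.aapb import goldretriever

GOLD_URL = "https://github.com/clamsproject/aapb-annotations/tree/bebd93af0882b8cf942ba827917938b49570d6d9/scene-recognition/golds"

# download_golds fetches the gold files and returns the local directory they were
# written to, matching how this script uses it when no --gold-dir is given
gold_dir = goldretriever.download_golds(GOLD_URL)
print(gold_dir)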
@@ -110,8 +108,13 @@ def document_evaluation(combined_dict):
     average_p = 0
     average_r = 0
     average_f1 = 0
+    # counter to account for unseen labels
+    unseen = 0
     for label in total_counts:
         tp, fp, fn = total_counts[label]["tp"], total_counts[label]["fp"], total_counts[label]["fn"]
+        # if no instances are present/predicted, account for this when taking average of scores
+        if tp + fp + fn == 0:
+            unseen += 1
         precision = float(tp/(tp + fp)) if (tp + fp) > 0 else 0
         recall = float(tp/(tp + fn)) if (tp + fn) > 0 else 0
         f1 = float(2*(precision*recall)/(precision + recall)) if (precision + recall) > 0 else 0
@@ -123,9 +126,11 @@ def document_evaluation(combined_dict):
         average_r += recall
         average_f1 += f1
     # calculate macro averages for document and add to scores_by_label
-    scores_by_label["average"]["precision"] = float(average_p / len(scores_by_label))
-    scores_by_label["average"]["recall"] = float(average_r / len(scores_by_label))
-    scores_by_label["average"]["f1"] = float(average_f1 / len(scores_by_label))
+    # make sure to account for unseen unpredicted labels
+    denominator = len(scores_by_label) - unseen
+    scores_by_label["average"]["precision"] = float(average_p / denominator)
+    scores_by_label["average"]["recall"] = float(average_r / denominator)
+    scores_by_label["average"]["f1"] = float(average_f1 / denominator)
     # return both scores_by_label and total_counts (to calculate micro avg later)
     return scores_by_label, total_counts
 
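To illustrate why the denominator changes: a label that never appears in either the gold data or the predictions contributes 0 to precision, recall, and f1, so dividing by the full label count would drag the macro average down. Below is a minimal, self-contained sketch of the adjusted averaging; the label names and counts are invented for illustration and simplified relative to the scores_by_label bookkeeping in the real function:

# minimal sketch of the adjusted macro average, with made-up per-label counts
total_counts = {
    "bars":   {"tp": 8, "fp": 2, "fn": 1},
    "slate":  {"tp": 5, "fp": 0, "fn": 3},
    "chyron": {"tp": 0, "fp": 0, "fn": 0},  # never present or predicted in this document
}

unseen = 0
sum_p = sum_r = sum_f1 = 0.0
for counts in total_counts.values():
    tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
    if tp + fp + fn == 0:
        unseen += 1
    p = tp / (tp + fp) if (tp + fp) > 0 else 0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0
    sum_p, sum_r, sum_f1 = sum_p + p, sum_r + r, sum_f1 + f1

# divide by the number of labels that actually occurred, not by all labels
denominator = len(total_counts) - unseen  # 3 - 1 = 2
print(sum_p / denominator, sum_r / denominator, sum_f1 / denominator)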
@@ -190,6 +195,24 @@ def run_dataset_eval(mmif_dir, gold_dir, count_subtypes):
     data_scores = total_evaluation(document_counts)
     return doc_scores, data_scores
 
+def separate_score_outputs(doc_scores, dataset_scores, mmif_dir):
+    # get name for new directory
+    # with our standard, this results in "scores@" appended to the batch name
+    batch_score_name = "scores@" + mmif_dir.split('@')[-1].strip('/')
+    # create new dir for scores based on batch name
+    new_dir = pathlib.Path.cwd() / batch_score_name
+    new_dir.mkdir(parents = True, exist_ok = True)
+    # iterate through nested dict, output separate scores for each guid
+    for guid in doc_scores:
+        doc_df = pd.DataFrame(doc_scores[guid])
+        doc_df = doc_df.transpose()
+        out_path = new_dir / f"{guid}.csv"
+        doc_df.to_csv(out_path)
+    # output total dataset scores
+    dataset_df = pd.DataFrame(dataset_scores)
+    dataset_df = dataset_df.transpose()
+    dataset_df.to_csv(new_dir/"dataset_scores.csv")
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
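The new output directory name is derived from the MMIF batch directory by taking everything after the last "@". A quick sketch of that naming rule with a hypothetical batch path (the path below is invented for illustration, not taken from the repository):

# minimal sketch of the naming rule, with a hypothetical batch directory name
mmif_dir = "preds@swt-detection@batch-42/"
batch_score_name = "scores@" + mmif_dir.split('@')[-1].strip('/')
print(batch_score_name)  # -> "scores@batch-42"
# per-document CSVs are then written as <cwd>/scores@batch-42/<guid>.csv,
# plus one dataset_scores.csv holding the aggregate (micro-average) scores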
@@ -201,13 +224,10 @@ def run_dataset_eval(mmif_dir, gold_dir, count_subtypes):
                         help='bool flag whether to consider subtypes for evaluation')
     args = parser.parse_args()
     mmif_dir = args.mmif_dir
-    GOLD_URL = "https://github.com/clamsproject/aapb-annotations/tree/bebd93af0882b8cf942ba827917938b49570d6d9/scene-recognition/golds"
     gold_dir = goldretriever.download_golds(GOLD_URL) if args.gold_dir is None else args.gold_dir
     count_subtypes = args.count_subtypes
     document_scores, dataset_scores = run_dataset_eval(mmif_dir, gold_dir, count_subtypes)
     # document scores are for each doc, dataset scores are for overall (micro avg)
-    doc_df = pd.DataFrame(document_scores)
-    dataset_df = pd.DataFrame(dataset_scores)
-    doc_df.to_csv('document_scores.csv')
-    dataset_df.to_csv('dataset_scores.csv')
+    # call method to output scores for each doc and then for total scores
+    separate_score_outputs(document_scores, dataset_scores, mmif_dir)
 
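Since each per-document DataFrame is transposed before writing, the CSVs come back with one row per label and precision/recall/f1 as columns, including the "average" row. A minimal read-back sketch, using the hypothetical directory name from above and an invented GUID (not a real asset identifier):

import pandas as pd

# hypothetical output file; real files are named after each document's GUID
scores = pd.read_csv("scores@batch-42/cpb-aacip-000000.csv", index_col=0)
print(scores.loc["average", "f1"])  # macro-averaged f1 for that document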
0 commit comments