scene-recognition point-wise evaluation #55

Merged · 5 commits · Jul 8, 2024
1 change: 1 addition & 0 deletions README.md
@@ -9,6 +9,7 @@ Each subdirectory of the repository is an evaluation task within the project.
* golds - The gold standard: human-annotated files against which apps are evaluated for predictive ability.
  * often `.tsv`, `.csv`, or `.txt` files.
* preds/predictions - The app-generated files containing the predicted annotations to be evaluated (e.g. time durations for slate detection).
  * each preds directory represents a batch, with naming conventions as follows: `preds@<APP_NAME><APP_VER>@<BATCH_NAME>`
  * always `.mmif` files with app views.
#### Outputs to Evaluations
* results - The system output of a finished evaluation, i.e. the resulting evaluation numbers.
42 changes: 42 additions & 0 deletions sr-eval/README.md
@@ -0,0 +1,42 @@
# Scene Recognition Evaluation
This involves evaluating the results of the scenes-with-text (SWT) classification task. While SWT returns both timepoint and
timeframe annotations, this subdirectory focuses on timepoints.
The goal is to provide a simple way of comparing different sets of SWT results.

# Required Input
To run this evaluation script, you need the following:

* Set of predictions in MMIF format (either from the preds folder in this repo,
or generated by the [SWT app](https://github.com/clamsproject/app-swt-detection/tree/6b12498fc596327ec47933b7f785044da2f8cf2f))
* Set of golds in CSV format (either downloaded from the annotations repository
using goldretriever.py, or your own set that exactly matches the format used in [aapb-annotations](https://github.com/clamsproject/aapb-annotations/tree/9cbe41aa124da73a0158bfc0b4dbf8bafe6d460d/scene-recognition/golds))

There are three arguments when running the script: `--mmif_dir`, `--gold_dir`, and `--count_subtypes`.
The first two are directories that contain the predictions and golds, respectively. The third is a flag that
determines whether the evaluation takes subtype labels into account.
* Our standard for naming prediction (MMIF) directories is as follows:
  * `preds@app-swt-detection<VERSION-NUMBER>@<BATCH-NAME>`.

Note that only `--mmif_dir` is required, as `--gold_dir` defaults to the set of golds downloaded (using `goldretriever`)
from the [aapb-annotations](https://github.com/clamsproject/aapb-annotations/tree/9cbe41aa124da73a0158bfc0b4dbf8bafe6d460d/scene-recognition/golds) repo,
and `--count_subtypes` defaults to false (subtypes are ignored unless the flag is passed).
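
For orientation, a working layout might look roughly like the following (the app version, batch name, and GUIDs are
hypothetical placeholders):

```
sr-eval/
├── preds@app-swt-detection4.0@example-batch/
│   ├── cpb-aacip-00000000000.mmif
│   └── ...
└── golds/
    ├── cpb-aacip-00000000000.csv
    └── ...
```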

# Usage
To run the evaluation, run the following from the `sr-eval` directory:
```
python evaluate.py --mmif_dir <pred_directory> --gold_dir <gold_directory> --count_subtypes
```
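
Since `--gold_dir` and `--count_subtypes` both have defaults, the minimal invocation only needs a prediction directory:
```
python evaluate.py --mmif_dir <pred_directory>
```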

# Output Format
Currently, the evaluation script produces a `{guid}.csv` file for each document in the set of predictions, plus a single
`dataset_scores.csv`.
* `{guid}.csv` has the label scores for a given document, including a macro-average of the label scores.
* `dataset_scores.csv` has the total label scores across the dataset, including a final micro-average over all labels.

These files contain the precision, recall, and F1 scores by label. In each per-document file, the first row is the
negative label `-`; in `dataset_scores.csv`, the second row is the `all` label, which holds the final micro-average over
all labels.

The output files are placed in a directory whose name is derived from the final portion (split on `@`) of the given
prediction directory's name. Using the format described in [Required Input](#required-input), this results in a directory
named `scores@<BATCH-NAME>`.
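
The CSVs are written with `pandas`, using labels as the row index and `precision`, `recall`, and `f1` as columns, so they
can be read back for further analysis. A minimal sketch (the batch name and GUID below are hypothetical):
```python
import pandas as pd

# per-document scores: rows are labels (plus "average"), columns are precision/recall/f1
doc = pd.read_csv("scores@example-batch/cpb-aacip-00000000000.csv", index_col=0)
print(doc.loc["average", "f1"])

# dataset-level scores: the "all" row holds the micro-average over all labels
dataset = pd.read_csv("scores@example-batch/dataset_scores.csv", index_col=0)
print(dataset.loc["all"])
```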
233 changes: 233 additions & 0 deletions sr-eval/evaluate.py
@@ -0,0 +1,233 @@
import argparse
from collections import defaultdict, Counter
import pathlib
import pandas as pd
import json
from clams_utils.aapb import goldretriever

# constant:
GOLD_URL = "https://github.com/clamsproject/aapb-annotations/tree/bebd93af0882b8cf942ba827917938b49570d6d9/scene-recognition/golds"
# note that you must first have output mmif files to compare against

# parse SWT output into dictionary to extract label-timepoint pairs

# convert ISO timestamp strings (hours:minutes:seconds.ms) back to milliseconds


def convert_iso_milliseconds(timestamp):
ms = 0
# add hours
ms += int(timestamp.split(":")[0]) * 3600000
# add minutes
ms += int(timestamp.split(":")[1]) * 60000
# add seconds and milliseconds
ms += float(timestamp.split(":")[2]) * 1000
ms = int(ms)
return ms
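# e.g. (hypothetical input) convert_iso_milliseconds("0:01:23.500") returns 83500:
# 0 hours + 1 minute (60000 ms) + 23.5 seconds (23500 ms) = 83500 ms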

# extract gold pairs from each csv. note goldpath is fed in as a path object
def extract_gold_labels(goldpath, count_subtypes=False):
df = pd.read_csv(goldpath)
# convert timestamps (iso) back to ms
df['timestamp'] = df['timestamp'].apply(convert_iso_milliseconds)
if count_subtypes:
# fill empty subtype rows with '' then concatenate with type label
df['subtype label'] = df['subtype label'].fillna("")
df['combined'] = df['type label'] + ":" + df['subtype label']
# trim extra ":"
df['combined'] = df['combined'].apply(lambda row: row[:-1] if row[-1] == ':' else row)
# create dictionary of 'timestamp':'combined' from dataframe
gold_dict = df.set_index('timestamp')['combined'].to_dict()
else:
# ignore subtype label column
gold_dict = df.set_index('timestamp')['type label'].to_dict()
# return dictionary that maps timestamps to label
return gold_dict
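# e.g. the returned mapping (hypothetical values) might look like {83500: 'S', 84500: 'B'},
# i.e. each gold timestamp in milliseconds mapped to its annotated type label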

# method to match a given predicted timestamp (key) with the closest gold timestamp:
# acceptable range is default +/- 5 ms. if nothing matches, return None

def closest_gold_timestamp(pred_stamp, gold_dict, good_range = 5):
# first check if pred in gold_dict. if yes, return pred
if pred_stamp in gold_dict:
return pred_stamp
# for i = 5 to 1 check if pred - i in gold_dict, if yes return pred - i
for i in range(good_range, 0, -1):
if pred_stamp - i in gold_dict:
return pred_stamp - i
# for i = 1 to i = 5 check if pred + i in gold dict, if yes return pred + i
for i in range(1, good_range + 1):
if pred_stamp + i in gold_dict:
return pred_stamp + i
return None
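# e.g. (hypothetical values) if gold_dict contains 10000, a predicted timestamp of 10003 falls
# inside the +/- 5 ms window and is matched to 10000, while 10007 would return None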

# extract predicted label pairs from output mmif and match with gold pairs
# note that pred_path is already a filepath, not a string
# returns a dictionary with timestamps as keys and tuples of labels as values.


def extract_predicted_consolidate(pred_path, gold_dict, count_subtypes = False):
# create a dictionary to fill in with timestamps -> label tuples (predicted, gold)
combined_dict = {}
with open(pred_path, "r") as file:
pred_json = json.load(file)
for view in pred_json["views"]:
if "annotations" in view:
for annotation in view["annotations"]:
if "timePoint" in annotation['properties']:
# match pred timestamp to closest gold timestamp
# using default range (+/- 5ms)
curr_timestamp = closest_gold_timestamp(annotation['properties']['timePoint'], gold_dict)
# check if closest_gold_timestamp returned None (not within acceptable range)
if not curr_timestamp:
continue
# truncate label if count_subtypes is false
pred_label = annotation['properties']['label'] if count_subtypes else annotation['properties']['label'][0]
# if NEG set to '-'
if annotation['properties']['label'] == 'NEG':
pred_label = '-'
# put gold and pred labels into combined dictionary
combined_dict[curr_timestamp] = (pred_label, gold_dict[curr_timestamp])
return combined_dict
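# e.g. (hypothetical values) combined_dict might look like {10000: ('S', 'S'), 12000: ('-', 'B')},
# pairing each matched timestamp with a (predicted, gold) label tuple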

# calculate document-level p, r, f1 for each label and macro avg. also returns total counts
# of tp, fp, fn for each label to calculate micro avg later.
def document_evaluation(combined_dict):
# count up tp, fp, fn for each label
total_counts = defaultdict(Counter)
for timestamp in combined_dict:
pred, gold = combined_dict[timestamp][0], combined_dict[timestamp][1]
if pred == gold:
total_counts[pred]["tp"] += 1
else:
total_counts[pred]["fp"] += 1
total_counts[gold]["fn"] += 1
# calculate P, R, F1 for each label, store in nested dictionary
scores_by_label = defaultdict(lambda: defaultdict(float))
# running total for (macro) averaged scores per document
average_p = 0
average_r = 0
average_f1 = 0
# counter to account for unseen labels
unseen = 0
for label in total_counts:
tp, fp, fn = total_counts[label]["tp"], total_counts[label]["fp"], total_counts[label]["fn"]
# if no instances are present/predicted, account for this when taking average of scores
if tp + fp + fn == 0:
unseen += 1
precision = float(tp/(tp + fp)) if (tp + fp) > 0 else 0
recall = float(tp/(tp + fn)) if (tp + fn) > 0 else 0
f1 = float(2*(precision*recall)/(precision + recall)) if (precision + recall) > 0 else 0
# add individual scores to dict and then add to running sum
scores_by_label[label]["precision"] = precision
scores_by_label[label]["recall"] = recall
scores_by_label[label]["f1"] = f1
average_p += precision
average_r += recall
average_f1 += f1
# calculate macro averages for document and add to scores_by_label
# make sure to account for unseen unpredicted labels
denominator = len(scores_by_label) - unseen
scores_by_label["average"]["precision"] = float(average_p / denominator)
scores_by_label["average"]["recall"] = float(average_r / denominator)
scores_by_label["average"]["f1"] = float(average_f1 / denominator)
# return both scores_by_label and total_counts (to calculate micro avg later)
return scores_by_label, total_counts
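# e.g. (hypothetical counts) a label with tp=8, fp=2, fn=4 scores
# precision = 8/10 = 0.8, recall = 8/12 ~= 0.667, f1 ~= 0.727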

# once you have processed every document, this method runs to calculate the micro-averaged
# scores. the input is a list of total_counts dictionaries, each obtained from running
# document_evaluation.
def total_evaluation(total_counts_list):
# create dict to hold total tp, fp, fn for all labels
total_instances_by_label = defaultdict(Counter)
# iterate through total_counts_list to get complete count of tp, fp, fn by label
for doc_dict in total_counts_list:
for label in doc_dict:
total_instances_by_label[label]["tp"] += doc_dict[label]["tp"]
total_instances_by_label[label]["fp"] += doc_dict[label]["fp"]
total_instances_by_label[label]["fn"] += doc_dict[label]["fn"]
# include a section for total tp/fp/fn for all labels
total_instances_by_label["all"]["tp"] += doc_dict[label]["tp"]
total_instances_by_label["all"]["fp"] += doc_dict[label]["fp"]
total_instances_by_label["all"]["fn"] += doc_dict[label]["fn"]
# create complete_micro_scores to store micro avg scores for entire dataset
complete_micro_scores = defaultdict(lambda: defaultdict(float))
# fill in micro scores
for label in total_instances_by_label:
tp, fp, fn = (total_instances_by_label[label]["tp"], total_instances_by_label[label]["fp"],
total_instances_by_label[label]["fn"])
precision = float(tp/(tp + fp)) if (tp + fp) > 0 else 0
recall = float(tp/ (tp + fn)) if (tp + fn) > 0 else 0
f1 = float(2*precision*recall/(precision + recall)) if (precision + recall) > 0 else 0
complete_micro_scores[label]["precision"] = precision
complete_micro_scores[label]["recall"] = recall
complete_micro_scores[label]["f1"] = f1
return complete_micro_scores
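# note: unlike the per-document macro average above, the micro average pools tp/fp/fn across
# all documents and labels before computing scores, so frequent labels carry more weight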

# run the evaluation on each predicted-gold pair of files, and then the entire dataset for
# micro average
def run_dataset_eval(mmif_dir, gold_dir, count_subtypes):
# create dict of guid -> scores to store each dict of document-level scores
doc_scores = {}
# create list to store each dict of document-level counts
document_counts = []
mmif_files = pathlib.Path(mmif_dir).glob("*.mmif")
# get each mmif file
for mmif_file in mmif_files:
guid = ""
with open(mmif_file, "r") as f:
curr_mmif = json.load(f)
# get guid
location = curr_mmif["documents"][0]["properties"]["location"]
guid = location.split("/")[-1].split(".")[0]
# match guid with gold file
gold_file = next(pathlib.Path(gold_dir).glob(f"*{guid}*"))
# process gold
gold_dict = extract_gold_labels(gold_file, count_subtypes)
# process predicted and consolidate
combined_dict = extract_predicted_consolidate(mmif_file, gold_dict, count_subtypes)
# evaluate on document level, storing scores in document_scores and counts in document_counts
eval_result = document_evaluation(combined_dict)
doc_scores[guid] = eval_result[0]
document_counts.append(eval_result[1])
# now after processing each document and storing the relevant scores, we can evaluate the
# dataset performance as a whole
data_scores = total_evaluation(document_counts)
return doc_scores, data_scores

def separate_score_outputs(doc_scores, dataset_scores, mmif_dir):
# get name for new directory
# with our standard, this results in "scores@" prepended to the batch name
batch_score_name = "scores@" + mmif_dir.split('@')[-1].strip('/')
# create new dir for scores based on batch name
new_dir = pathlib.Path.cwd() / batch_score_name
new_dir.mkdir(parents = True, exist_ok = True)
# iterate through nested dict, output separate scores for each guid
for guid in doc_scores:
doc_df = pd.DataFrame(doc_scores[guid])
doc_df = doc_df.transpose()
out_path = new_dir / f"{guid}.csv"
doc_df.to_csv(out_path)
# output total dataset scores
dataset_df = pd.DataFrame(dataset_scores)
dataset_df = dataset_df.transpose()
dataset_df.to_csv(new_dir/"dataset_scores.csv")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--mmif_dir', type=str, required=True,
help='directory containing machine-annotated files in MMIF format')
parser.add_argument('-g', '--gold_dir', type=str, default=None,
help='directory containing gold labels in csv format')
parser.add_argument('-s', '--count_subtypes', action='store_true',
help='flag; when set, subtype labels are considered in the evaluation')
args = parser.parse_args()
mmif_dir = args.mmif_dir
gold_dir = goldretriever.download_golds(GOLD_URL) if args.gold_dir is None else args.gold_dir
count_subtypes = args.count_subtypes
document_scores, dataset_scores = run_dataset_eval(mmif_dir, gold_dir, count_subtypes)
# document scores are for each doc, dataset scores are for overall (micro avg)
# call method to output scores for each doc and then for total scores
separate_score_outputs(document_scores, dataset_scores, mmif_dir)
