scene-recognition point-wise evaluation #55

Merged · 5 commits · Jul 8, 2024
Changes from 2 commits
1 change: 1 addition & 0 deletions README.md
@@ -9,6 +9,7 @@ Each subdirectory of the repository is an evaluation task within the project. Ea
* golds - The gold-standard, human-annotated files against which apps are evaluated for predictive ability.
* often are `.tsv` or `.csv` or `.txt`.
* preds/predictions - The app-predicted files with predicted-annotations of what phenomena are to be evaluated. (e.g. time durations for slate detection.)
* each preds directory represents a batch, with naming conventions as follows: `preds@<APP_NAME><APP_VER>@<BATCH_NAME>` (see the example below)
* are always `.mmif` files with app views.
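For example, a batch of SWT predictions might be laid out as follows (the app name, version, batch name, and file names here are hypothetical):

```
preds@app-swt-detection4.0@batch-2/
├── <guid-1>.mmif
└── <guid-2>.mmif
```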
#### Outputs to Evaluations
* results - The system output of evaluation numbers from a finished evaluation.
36 changes: 36 additions & 0 deletions sr-eval/README.md
@@ -0,0 +1,36 @@
# Scene Recognition Evaluation
This evaluation scores the results of the scenes-with-text (SWT) classification task.
The goal is to have a simple way of comparing different sets of results from SWT.

# Required Input
To run this evaluation script, you need the following:

* A set of predictions in MMIF format, either from the preds folder in this repo
or generated by the [SWT app](https://github.com/clamsproject/app-swt-detection)
* A set of golds in CSV format, either downloaded from the annotations repository
using `goldretriever.py` (a minimal sketch follows below) or your own set that exactly matches the format present in [aapb-annotations](https://github.com/clamsproject/aapb-annotations/tree/main/scene-recognition/golds)
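
For example, the gold files can be fetched programmatically with the bundled `goldretriever.py` (a minimal sketch; the URL is the pinned commit that `evaluate.py` also uses as its default):

```python
from goldretriever import download_golds

# pinned gold directory for scene recognition (same URL as evaluate.py's default)
GOLD_URL = ("https://github.com/clamsproject/aapb-annotations/tree/"
            "bebd93af0882b8cf942ba827917938b49570d6d9/scene-recognition/golds")

gold_dir = download_golds(GOLD_URL)  # downloads the CSVs into a temp directory and returns its path
print(gold_dir)
```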

There are three arguments when running the script: `--mmif_dir`, `--gold_dir`, and `--count_subtypes`
(short forms `-m`, `-g`, and `-s`).
The first two are directories that contain the predictions and golds, respectively. The third is a boolean flag that
determines whether the evaluation takes subtype labels into account.
Note that only the first one is required: `--gold_dir` defaults to the set of golds downloaded (using `goldretriever`)
from the [aapb-annotations](https://github.com/clamsproject/aapb-annotations/tree/main/scene-recognition/golds) repo,
and `--count_subtypes` defaults to `False`.

# Usage
To run the evaluation, run the following in the `sr-eval` directory:
```
python evaluate.py --mmif_dir <pred_directory> --gold_dir <gold_directory> --count_subtypes
```
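
Because `--gold_dir` has a default (the pinned gold set fetched via `goldretriever`) and subtype counting is off by default, the shortest invocation only needs the predictions directory:
```
python evaluate.py --mmif_dir <pred_directory>
```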

# Output Format
Currently, the evaluation script produces two output files: `document_scores.csv` and `dataset_scores.csv`.
* `document_scores.csv` has the per-label scores for each document, including a macro-average (the unweighted mean of the per-label scores).
* `dataset_scores.csv` has the label scores totaled across the dataset, including micro-averaged results (computed from true-positive, false-positive, and false-negative counts pooled over all labels and documents, reported under the `all` column).

Both files contain the precision, recall, and F1 scores by label. At the moment, the per-document scores are written as
dictionary-valued cells, but this is subject to change.
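
If you prefer to work with the scores programmatically instead of via the CSV files, the same numbers are available from `run_dataset_eval` in `evaluate.py` (a minimal sketch; the directory paths below are placeholders):

```python
from evaluate import run_dataset_eval

# placeholder directories; see "Required Input" above for how to obtain them
doc_scores, dataset_scores = run_dataset_eval("path/to/preds", "path/to/golds", count_subtypes=False)

# micro-averaged precision/recall/F1 pooled over every label and document
print(dataset_scores["all"])

# macro-averaged scores for a single document (keys of doc_scores are GUIDs)
some_guid = next(iter(doc_scores))
print(doc_scores[some_guid]["average"])
```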

# Notes
As noted above, this is the first version of this evaluation script and some details are subject to change,
including the output format and file locations.
213 changes: 213 additions & 0 deletions sr-eval/evaluate.py
@@ -0,0 +1,213 @@
import argparse
import json
import pathlib
from collections import defaultdict, Counter

import pandas as pd

import goldretriever

# constant:
GOLD_URL = "https://github.com/clamsproject/aapb-annotations/tree/bebd93af0882b8cf942ba827917938b49570d6d9/scene-recognition/golds"
# note that you must first have output mmif files to compare against

# parse SWT output into dictionary to extract label-timepoint pairs

# convert ISO timestamp strings (hours:minutes:seconds.ms) back to milliseconds


def convert_iso_milliseconds(timestamp):
ms = 0
# add hours
ms += int(timestamp.split(":")[0]) * 3600000
# add minutes
ms += int(timestamp.split(":")[1]) * 60000
# add seconds and milliseconds
ms += float(timestamp.split(":")[2]) * 1000
ms = int(ms)
return ms

# extract gold pairs from each csv. note goldpath is fed in as a path object
def extract_gold_labels(goldpath, count_subtypes=False):
df = pd.read_csv(goldpath)
# convert timestamps (iso) back to ms
df['timestamp'] = df['timestamp'].apply(convert_iso_milliseconds)
if count_subtypes:
# fill empty subtype rows with '' then concatenate with type label
df['subtype label'] = df['subtype label'].fillna("")
df['combined'] = df['type label'] + ":" + df['subtype label']
# trim extra ":"
df['combined'] = df['combined'].apply(lambda row: row[:-1] if row[-1] == ':' else row)
# create dictionary of 'timestamp':'combined' from dataframe
gold_dict = df.set_index('timestamp')['combined'].to_dict()
else:
# ignore subtype label column
gold_dict = df.set_index('timestamp')['type label'].to_dict()
# return dictionary that maps timestamps to label
return gold_dict

# method to match a given predicted timestamp (key) with the closest gold timestamp:
# acceptable range is default +/- 5 ms. if nothing matches, return None

def closest_gold_timestamp(pred_stamp, gold_dict, good_range = 5):
# first check if pred in gold_dict. if yes, return pred
if pred_stamp in gold_dict:
return pred_stamp
# for i = 5 to 1 check if pred - i in gold_dict, if yes return pred - i
for i in range(good_range, 0, -1):
if pred_stamp - i in gold_dict:
return pred_stamp - i
# for i = 1 to i = 5 check if pred + i in gold dict, if yes return pred + i
for i in range(1, good_range + 1):
if pred_stamp + i in gold_dict:
return pred_stamp + i
return None

# extract predicted label pairs from output mmif and match with gold pairs
# note that pred_path is already a filepath, not a string
# returns a dictionary with timestamps as keys and tuples of labels as values.


def extract_predicted_consolidate(pred_path, gold_dict, count_subtypes = False):
# create a dictionary to fill in with timestamps -> label tuples (predicted, gold)
combined_dict = {}
with open(pred_path, "r") as file:
pred_json = json.load(file)
for view in pred_json["views"]:
if "annotations" in view:
for annotation in view["annotations"]:
if "timePoint" in annotation['properties']:
# match pred timestamp to closest gold timestamp
# using default range (+/- 5ms)
curr_timestamp = closest_gold_timestamp(annotation['properties']['timePoint'], gold_dict)
# check if closest_gold_timestamp returned None (not within acceptable range)
if not curr_timestamp:
continue
# keep the full label when counting subtypes, otherwise keep only the primary type (first character)
pred_label = annotation['properties']['label'] if count_subtypes else annotation['properties']['label'][0]
# map SWT's 'NEG' label to '-' to match the gold convention
if annotation['properties']['label'] == 'NEG':
pred_label = '-'
# put gold and pred labels into combined dictionary
combined_dict[curr_timestamp] = (pred_label, gold_dict[curr_timestamp])
return combined_dict

# calculate document-level p, r, f1 for each label and macro avg. also returns total counts
# of tp, fp, fn for each label to calculate micro avg later.
def document_evaluation(combined_dict):
# count up tp, fp, fn for each label
total_counts = defaultdict(Counter)
for timestamp in combined_dict:
pred, gold = combined_dict[timestamp][0], combined_dict[timestamp][1]
if pred == gold:
total_counts[pred]["tp"] += 1
else:
total_counts[pred]["fp"] += 1
total_counts[gold]["fn"] += 1
# calculate P, R, F1 for each label, store in nested dictionary
scores_by_label = defaultdict(lambda: defaultdict(float))
# running total for (macro) averaged scores per document
average_p = 0
average_r = 0
average_f1 = 0
for label in total_counts:
tp, fp, fn = total_counts[label]["tp"], total_counts[label]["fp"], total_counts[label]["fn"]
precision = float(tp/(tp + fp)) if (tp + fp) > 0 else 0
recall = float(tp/(tp + fn)) if (tp + fn) > 0 else 0
f1 = float(2*(precision*recall)/(precision + recall)) if (precision + recall) > 0 else 0
# add individual scores to dict and then add to running sum
scores_by_label[label]["precision"] = precision
scores_by_label[label]["recall"] = recall
scores_by_label[label]["f1"] = f1
average_p += precision
average_r += recall
average_f1 += f1
# calculate macro averages for document and add to scores_by_label
# (capture the label count first: assigning to the "average" key below grows the defaultdict,
# which would otherwise change len(scores_by_label) between these lines)
num_labels = len(scores_by_label)
scores_by_label["average"]["precision"] = float(average_p / num_labels)
scores_by_label["average"]["recall"] = float(average_r / num_labels)
scores_by_label["average"]["f1"] = float(average_f1 / num_labels)
# return both scores_by_label and total_counts (to calculate micro avg later)
return scores_by_label, total_counts

# once you have processed every document, this method runs to calculate the micro-averaged
# scores. the input is a list of total_counts dictionaries, each obtained from running
# document_evaluation.
def total_evaluation(total_counts_list):
# create dict to hold total tp, fp, fn for all labels
total_instances_by_label = defaultdict(Counter)
# iterate through total_counts_list to get complete count of tp, fp, fn by label
for doc_dict in total_counts_list:
for label in doc_dict:
total_instances_by_label[label]["tp"] += doc_dict[label]["tp"]
total_instances_by_label[label]["fp"] += doc_dict[label]["fp"]
total_instances_by_label[label]["fn"] += doc_dict[label]["fn"]
# include a section for total tp/fp/fn for all labels
total_instances_by_label["all"]["tp"] += doc_dict[label]["tp"]
total_instances_by_label["all"]["fp"] += doc_dict[label]["fp"]
total_instances_by_label["all"]["fn"] += doc_dict[label]["fn"]
# create complete_micro_scores to store micro avg scores for entire dataset
complete_micro_scores = defaultdict(lambda: defaultdict(float))
# fill in micro scores
for label in total_instances_by_label:
tp, fp, fn = (total_instances_by_label[label]["tp"], total_instances_by_label[label]["fp"],
total_instances_by_label[label]["fn"])
precision = float(tp/(tp + fp)) if (tp + fp) > 0 else 0
recall = float(tp/ (tp + fn)) if (tp + fn) > 0 else 0
f1 = float(2*precision*recall/(precision + recall)) if (precision + recall) > 0 else 0
complete_micro_scores[label]["precision"] = precision
complete_micro_scores[label]["recall"] = recall
complete_micro_scores[label]["f1"] = f1
return complete_micro_scores

# run the evaluation on each predicted-gold pair of files, and then the entire dataset for
# micro average
def run_dataset_eval(mmif_dir, gold_dir, count_subtypes):
# create dict of guid -> scores to store each dict of document-level scores
doc_scores = {}
# create list to store each dict of document-level counts
document_counts = []
mmif_files = pathlib.Path(mmif_dir).glob("*.mmif")
# get each mmif file
for mmif_file in mmif_files:
guid = ""
with open(mmif_file, "r") as f:
curr_mmif = json.load(f)
# get guid
location = curr_mmif["documents"][0]["properties"]["location"]
guid = location.split("/")[-1].split(".")[0]
# match guid with gold file
gold_file = next(pathlib.Path(gold_dir).glob(f"*{guid}*"))
# process gold
gold_dict = extract_gold_labels(gold_file, count_subtypes)
# process predicted and consolidate
combined_dict = extract_predicted_consolidate(mmif_file, gold_dict, count_subtypes)
# evaluate on document level, storing scores in document_scores and counts in document_counts
eval_result = document_evaluation(combined_dict)
doc_scores[guid] = eval_result[0]
document_counts.append(eval_result[1])
# now after processing each document and storing the relevant scores, we can evaluate the
# dataset performance as a whole
data_scores = total_evaluation(document_counts)
return doc_scores, data_scores


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--mmif_dir', type=str, required=True,
help='directory containing machine-annotated files in MMIF format')
parser.add_argument('-g', '--gold_dir', type=str, default=None,
help='directory containing gold labels in csv format')
parser.add_argument('-s', '--count_subtypes', action='store_true',
help='if set, consider subtype labels during evaluation')
args = parser.parse_args()
mmif_dir = args.mmif_dir
gold_dir = goldretriever.download_golds(GOLD_URL) if args.gold_dir is None else args.gold_dir
count_subtypes = args.count_subtypes
document_scores, dataset_scores = run_dataset_eval(mmif_dir, gold_dir, count_subtypes)
# document scores are for each doc, dataset scores are for overall (micro avg)
doc_df = pd.DataFrame(document_scores)
dataset_df = pd.DataFrame(dataset_scores)
doc_df.to_csv('document_scores.csv')
dataset_df.to_csv('dataset_scores.csv')

55 changes: 55 additions & 0 deletions sr-eval/goldretriever.py
@@ -0,0 +1,55 @@
import json
from pathlib import Path
from urllib.parse import urljoin

import requests


def download_golds(gold_dir_url, folder_name=None):
import tempfile
# code adapted from Angela Lam's

if folder_name is None:
folder_name = tempfile.TemporaryDirectory().name
# Create a new directory to store the downloaded files on local computer
target_dir = Path(folder_name)
if not target_dir.exists():
target_dir.mkdir()

# Check if the directory is empty
try:
next(target_dir.glob('*'))
raise Exception("The folder '" + folder_name + "' already exists and is not empty")
except StopIteration:
pass

# Send a GET request to the repository URL and extract the HTML content
response = requests.get(gold_dir_url, headers={"Accept": "application/json"})

# GitHub responds to this request with JSON, conveniently
payload = json.loads(response.text)['payload']
links = [i['path'] for i in payload['tree']['items']]

# Download each file in the links list into the created folder
for link in links:
raw_url = urljoin('https://raw.githubusercontent.com/',
'/'.join((payload['repo']['ownerLogin'],
payload['repo']['name'],
payload['refInfo']['name'],
link)))
file_path = target_dir / link.split('/')[-1]
with open(file_path, 'wb') as file:
response = requests.get(raw_url)
file.write(response.content)
return folder_name

if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description='Download gold files from a github repository')
parser.add_argument('-d', '--download_dir', default=None,
help='The name of the folder to store the downloaded files. '
'If not provided, a system temporary directory will be created')
parser.add_argument('gold_url', help='The URL of the gold directory')
args = parser.parse_args()
download_golds(args.gold_url, args.download_dir)
