Merge branch 'main' into code-refactor

haydenmccormick · web-flow · commit 3fa2b5a8c8a0 · 2024-06-25T14:17:13.000-04:00
diff --git a/app.py b/app.py
@@ -148,7 +148,13 @@ def upload_file(in_mmif):
         with open(path / 'file.mmif', 'w') as in_mmif_file:
             app.logger.debug(f"Writing original MMIF to {path / 'file.mmif'}")
             in_mmif_file.write(in_mmif_str)
-        html_page = render_mmif(in_mmif_str, viz_id)
+        mmif = Mmif(in_mmif_str)
+        htmlized_docs = utils.documents_to_htmls(mmif, viz_id)
+        app.logger.debug(f"Prepared document: {[d[0] for d in htmlized_docs]}")
+        annotations = utils.prep_annotations(mmif, viz_id)
+        app.logger.debug(f"Prepared Annotations: {[annotation[0] for annotation in annotations]}")
+        html_page = render_template('player.html',
+                               docs=htmlized_docs, viz_id=viz_id, annotations=annotations)
         with open(os.path.join(path, "index.html"), "w") as f:
             f.write(html_page)
     except FileExistsError:
diff --git a/cache.py b/cache.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import pathlib
 import shutil
@@ -16,7 +17,7 @@ def get_cache_root():
     return pathlib.Path(_CACHE_DIR_ROOT.name)
 
 
-def invalidate_cache(viz_ids):
+def invalidate_cache(viz_ids=[]):
     if not viz_ids:
         shutil.rmtree(get_cache_root())
         os.makedirs(get_cache_root())
@@ -55,11 +56,11 @@ def scan_tmp_directory():
 
 def cleanup():
     with lock:
-        print("Checking visualization cache...")
+        logging.info("Checking visualization cache...")
         # Max tmp size is 500MB
         max_size = 500000000
         folder_size, oldest_dir = scan_tmp_directory()
         while folder_size > max_size:
-            print(f"Maximum cache size reached. Deleting {os.path.basename(oldest_dir)}.")
+            logging.info(f"Maximum cache size reached. Deleting {os.path.basename(oldest_dir)}.")
             shutil.rmtree(oldest_dir)
             folder_size, oldest_dir = scan_tmp_directory()
diff --git a/displacy/__init__.py b/displacy/__init__.py
@@ -83,15 +83,9 @@ def mmif_to_dict(mmif: Mmif):
 
 
 def entity(view: View, annotation: Annotation):
-    if "targets" in annotation.properties:
-        start = min([view.annotations[target].properties["start"] for target in annotation.properties["targets"]])
-        end = max([view.annotations[target].properties["end"] for target in annotation.properties["targets"]])
-    else:
-        start = annotation.properties['start']
-        end = annotation.properties['end']
-    return {'start': start,
-            'end': end,
-            'label': annotation.properties['category']}
+    return {'start': annotation.get('start'),
+            'end': annotation.get('end'),
+            'label': annotation.get('category')}
 
 
 def dict_to_html(d):
diff --git a/iiif_utils.py b/iiif_utils.py
@@ -1,14 +1,15 @@
 import datetime
 import json
-import os
 import tempfile
 from typing import Dict
 
 import mmif
 from flask import url_for
 from mmif import AnnotationTypes, DocumentTypes, Mmif
+from mmif.utils import video_document_helper as vdh
 
 import cache
+import utils
 
 
 def generate_iiif_manifest(in_mmif: mmif.Mmif, viz_id):
@@ -27,18 +28,20 @@ def generate_iiif_manifest(in_mmif: mmif.Mmif, viz_id):
         ],
         "structures": []
     }
-    add_canvas_from_documents(in_mmif, iiif_json)
+    add_canvas_from_documents(viz_id, in_mmif, iiif_json)
     add_structure_from_timeframe(in_mmif, iiif_json)
     return save_manifest(iiif_json, viz_id)
 
 
-def add_canvas_from_documents(in_mmif, iiif_json):
+def add_canvas_from_documents(viz_id, in_mmif, iiif_json):
     video_documents = in_mmif.get_documents_by_type(DocumentTypes.VideoDocument)
     audio_documents = in_mmif.get_documents_by_type(DocumentTypes.AudioDocument)
     image_documents = in_mmif.get_documents_by_type(DocumentTypes.ImageDocument)
     all_documents = video_documents + audio_documents + image_documents
     document_canvas_dict = {}
     for _id, document in enumerate(all_documents, start=1):
+        canvas_media_path = url_for(
+            'static', filename=f"{cache._CACHE_DIR_SUFFIX}/{viz_id}/{utils.get_src_media_symlink_basename(document)}")
         document_canvas_dict[document.id] = _id
         canvas = {
             "id": f"http://0.0.0.0:5000/mmif_example_manifest.json/canvas/{_id}",
@@ -62,7 +65,7 @@ def add_canvas_from_documents(in_mmif, iiif_json):
                                     "choiceHint": "user",
                                     "items": [
                                         {
-                                            "id": build_document_url(document),
+                                            "id": canvas_media_path,
                                             "type": get_iiif_type(document),
                                             "label": "",
                                             "format": get_iiif_format(document)
@@ -76,34 +79,37 @@ def add_canvas_from_documents(in_mmif, iiif_json):
                 }
             ],
         }
-        # if not os.path.isfile(f"static{document.location_path()}"):
-        #     shutil.copyfile(
-        #         f"{document.location_path()}",
-        #         f"static{os.path.basename(document.location_path())}"
-        #     )
         iiif_json["sequences"][0]["canvases"].append(canvas)
         break # todo currently only supports single document, needs more work to align canvas values
 
 
-def build_document_url(document):
-    """
-    This trims off all of the path to the document except the filename then prepends data/video/. This is so
-    mmif's from running locally can still be found if the viewe
-r is run in docker, assuming the volume mount or
-    symlink is correctly set.
-    """
-    location = document.location
-    if location.startswith("file://"):
-        location = document.location[7:]
-    file_path = os.path.join("data", "video", os.path.basename(location))
-    return url_for('static', filename=file_path)
-
-
 def add_structure_from_timeframe(in_mmif: Mmif, iiif_json: Dict):
     # # get all views with timeframe annotations from mmif obj
     tf_views = in_mmif.get_views_contain(AnnotationTypes.TimeFrame)
     for range_id, view in enumerate(tf_views, start=1):
-        view_range = tf_view_to_iiif_range(range_id, view)
+        view_range = {
+            "id": f"http://0.0.0.0:5000/mmif_example_manifest.json/range/{range_id}",
+            "type": "Range",
+            "label": f"View: {view.id}",
+            "members": []
+        }
+        for ann in view.get_annotations(AnnotationTypes.TimeFrame):
+            label = ann.get_property('label')
+            s, e = vdh.convert_timeframe(in_mmif, ann, "seconds")
+
+            structure = {
+                "id": f"http://0.0.0.0:5000/mmif_example_manifest.json/range/{range_id}",
+                "type": "Range",
+                "label": f"{label.capitalize()}",
+                "members": [
+                    {
+                        "id": f"http://0.0.0.0:5000/mmif_example_manifest.json/canvas/{1}#t={s},{e}",
+                        # need to align id here to support more than one document
+                        "type": "Canvas"
+                    }
+                ]
+            }
+            view_range["members"].append(structure)
         iiif_json["structures"].append(view_range)
 
 
@@ -115,55 +121,6 @@ def save_manifest(iiif_json: Dict, viz_id) -> str:
     return manifest.name
 
 
-def tf_view_to_iiif_range(range_id, view):
-    view_range = {
-        "id": f"http://0.0.0.0:5000/mmif_example_manifest.json/range/{range_id}",
-        "type": "Range",
-        "label": f"View: {view.id}",
-        "members": []
-    }
-    # for annotation in view.annotations:
-        # # TODO: TimeUnits generated by Kaldi have no "timeUnit" or "unit" property.
-        # The mmif documentation does specify a "unit" property, but the Kaldi
-        # ASR doesn't seem to include that in annotations.
-
-        # if annotation.at_type == AnnotationTypes.TimeFrame:
-        #     if 'unit' in annotation.properties:
-        #         annotation_unit = annotation.properties['unit']
-        #     elif 'unit' in view.metadata.parameters:
-        #         annotation_unit = view.metadata.parameters['unit']
-        #     else:
-        #         raise Exception("Error finding timeframe unit.")
-        #     frame_type = annotation.properties["frameType"]
-        #     if annotation_unit == "frame":
-        #         start_fn = int(annotation.properties["start"])
-        #         end_fn = int(annotation.properties["end"])
-        #         frame_rate = 29.97
-        #         start_sec = int(start_fn // frame_rate)
-        #         end_sec = int(end_fn // frame_rate)
-        #     elif annotation_unit == "milliseconds":
-        #         start_milli = int(annotation.properties["start"])
-        #         end_milli = int(annotation.properties["end"])
-        #         start_sec = int(start_milli // 1000)
-        #         end_sec = int(end_milli // 1000)
-        #     else:
-        #         continue
-        #     structure = {
-        #         "id": f"http://0.0.0.0:5000/mmif_example_manifest.json/range/{range_id}",
-        #         "type": "Range",
-        #         "label": f"{frame_type.capitalize()}",
-        #         "members": [
-        #             {
-        #                 "id": f"http://0.0.0.0:5000/mmif_example_manifest.json/canvas/{1}#t={start_sec},{end_sec}",
-        #                 # need to align id here to support more than one document
-        #                 "type": "Canvas"
-        #             }
-        #         ]
-        #     }
-        #     view_range["members"].append(structure)
-    return view_range
-
-
 def get_iiif_format(document):
     if document.is_type(DocumentTypes.VideoDocument):
         return 'video/mp4'
diff --git a/ocr.py b/ocr.py
@@ -34,16 +34,16 @@ def __init__(self, anno, mmif):
 
     def update(self, anno, mmif):
 
-        if anno.at_type.shortname == "BoundingBox":
+        if anno.at_type == AnnotationTypes.BoundingBox:
             self.add_bounding_box(anno, mmif)
 
-        elif anno.at_type.shortname == "TimeFrame":
+        elif anno.at_type == AnnotationTypes.TimeFrame:
             self.add_timeframe(anno, mmif)
 
-        elif anno.at_type.shortname == "TimePoint":
+        elif anno.at_type == AnnotationTypes.TimePoint:
             self.add_timepoint(anno, mmif)
 
-        elif anno.at_type.shortname == "TextDocument":
+        elif anno.at_type == DocumentTypes.TextDocument:
             self.add_text_document(anno)
 
         elif anno.at_type.shortname == "Paragraph":
@@ -61,19 +61,26 @@ def add_bounding_box(self, anno, mmif):
                 self.add_timepoint(timepoint_anno, mmif,
                                    skip_if_view_has_frames=False)
         else:
-            self.frame_num = convert_timepoint(mmif, anno, "frames")
-            self.secs = convert_timepoint(mmif, anno, "seconds")
-        box_id = anno.properties["id"]
-        boxType = anno.properties["boxType"]
-        coordinates = anno.properties["coordinates"]
+            for alignment_anns in mmif.get_alignments(AnnotationTypes.BoundingBox, AnnotationTypes.TimePoint).values():
+                for alignment_ann in alignment_anns:
+                    if alignment_ann.get('source') == anno.id:
+                        timepoint_anno = mmif[alignment_ann.get('target')]
+                        break
+                    elif alignment_ann.get('target') == anno.id:
+                        timepoint_anno = mmif[alignment_ann.get('source')]
+                        break
+        if timepoint_anno:
+            self.add_timepoint(timepoint_anno, mmif, skip_if_view_has_frames=False)
+
+        box_id = anno.get("id")
+        boxType = anno.get("boxType")
+        coordinates = anno.get("coordinates")
         x = coordinates[0][0]
         y = coordinates[0][1]
         w = coordinates[1][0] - x
         h = coordinates[1][1] - y
         box = [box_id, boxType, [x, y, w, h]]
-        # TODO: This is a hack to ignore percentage-based Doctr bounding boxes
-        if "doctr" not in mmif.get_view_by_id(anno.parent).metadata["app"]:
-            self.boxes.append(box)
+        self.boxes.append(box)
         self.anno_ids.append(box_id)
         self.timestamp = str(datetime.timedelta(seconds=self.secs))
         if anno.properties.get("boxType") and anno.properties.get("boxType") not in self.boxtypes:
@@ -142,26 +149,16 @@ def prepare_ocr(mmif, view, viz_id):
     save_json(frames_pages, view.id, viz_id)
 
 
-def find_annotation(anno_id, mmif):
-    if mmif.id_delimiter in anno_id:
-        view_id, anno_id = anno_id.split(mmif.id_delimiter)
-        view = mmif.get_view_by_id(view_id)
-    for view in mmif.views:
-        try:
-            return view.get_annotation_by_id(anno_id)
-        except KeyError:
-            continue
-
 
 def get_ocr_frames(view, mmif):
     frames = {}
     full_alignment_type = [
-        at_type for at_type in view.metadata.contains if at_type.shortname == "Alignment"]
+        at_type for at_type in view.metadata.contains if at_type == AnnotationTypes.Alignment]
     # If view contains alignments
     if full_alignment_type:
         for alignment in view.get_annotations(full_alignment_type[0]):
-            source = find_annotation(alignment.properties["source"], mmif)
-            target = find_annotation(alignment.properties["target"], mmif)
+            source = mmif[alignment.get("source")]
+            target = mmif[alignment.get("target")]
 
             # Account for alignment in either direction
             frame = OCRFrame(source, mmif)
diff --git a/templates/uv_player.html b/templates/uv_player.html
@@ -20,7 +20,7 @@
     $(".nav-item.UV").click(function() {
         if (!uvLoaded) {
             const data = {
-                manifest: "/tmp/{{mmif_id}}/{{manifest}}",
+                manifest: "/mmif-viz-cache/{{mmif_id}}/{{manifest}}",
                 embedded: true
              };