improved alignment caching mechanism

keighrim · keighrim · commit 43bb3f14c91f · 2024-06-25T20:48:47.000-04:00
diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py
@@ -60,6 +60,9 @@ def __init__(self, anno_obj: Optional[Union[bytes, str, dict]] = None, *_) -> No
         self._required_attributes = ["_type", "properties"]
         super().__init__(anno_obj)
     
+    def __hash__(self):
+        return hash(self.serialize())
+    
     def _deserialize(self, input_dict: dict) -> None:
         self.at_type = input_dict.pop('_type', '')
         # TODO (krim @ 6/1/21): If annotation IDs must follow a certain string format,
@@ -70,21 +73,31 @@ def _deserialize(self, input_dict: dict) -> None:
         for k, v in self.properties.items():
             self._add_prop_aliases(k, v)
                             
-    def _cache_alignment(self, alignment_id: str, alignedto_id: str) -> None:
+    def _cache_alignment(self, alignment_ann: 'Annotation', alignedto_ann: 'Annotation') -> None:
         """
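-        Cache alignment information. This cache will not be serialized. Both ID arguments must be in their long_id 
-        format.
-        :param alignment_id: long_id of the Alignment annotation that has this annotation on one side
-        :param alignedto_id: long_id of the annotation that this annotation is aligned to (other side of Alignment)
+        Cache alignment information. This cache will not be serialized. Both arguments must be Annotation objects,
+        not ID strings.
+        :param alignment_ann: the Alignment annotation that has this annotation on one side
+        :param alignedto_ann: the annotation that this annotation is aligned to (other side of Alignment)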
         """
-        self._alignments[alignment_id] = alignedto_id
+        self._alignments[alignment_ann] = alignedto_ann
     
-    def aligned_to_by(self, alignment_id: str) -> Optional[str]:
+    def aligned_to_by(self, alignment: 'Annotation') -> Optional['Annotation']:
         """
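-        Retrieve the long_id of the annotation that this annotation is aligned to. 
+        Retrieve the annotation that this annotation is aligned to by the given alignment, or None if not cached.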
-        :param alignment_id: ID if the Alignment annotation
+        :param alignment: Alignment annotation that has this annotation on one side
+        """
+        return self._alignments.get(alignment)
+    
+    def get_all_aligned(self) -> Iterator['Annotation']:
         """
-        return self._alignments.get(alignment_id)
+        Generator to iterate through all alignments and aligned annotations.
+        :return: yields, for each cached alignment, the Alignment annotation first and then the annotation on its other side
+        """
+        for alignment, aligned in self._alignments.items():
+            yield alignment
+            yield aligned
+        
         
     def _add_prop_aliases(self, key_to_add, val_to_add):
         """
diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py
@@ -10,7 +10,7 @@
 import warnings
 from collections import defaultdict
 from datetime import datetime
-from typing import List, Union, Optional, Dict, cast, Iterator, Tuple
+from typing import List, Union, Optional, Dict, cast, Iterator
 
 import jsonschema.validators
 
@@ -235,14 +235,39 @@ def _deserialize(self, input_dict: dict) -> None:
                 # add quick access to `start` and `end` values if the annotation is using `targets` property
                 if 'targets' in ann.properties:
                     if 'start' in ann.properties or 'end' in ann.properties:
-                        raise ValueError(f"Annotation {ann.id} (in view {view.id}) has `targes` and `start`/`end/` "
+                        raise ValueError(f"Annotation {ann.id} (in view {view.id}) has `targets` and `start`/`end/` "
                                          f"properties at the same time. Annotation anchors are ambiguous.")
                     ann._props_ephemeral['start'] = self._get_linear_anchor_point(ann, start=True)
                     ann._props_ephemeral['end'] = self._get_linear_anchor_point(ann, start=False)
                 
                 ## caching alignments
                 if ann.at_type == AnnotationTypes.Alignment:
-                    view._cache_alignment(ann)
+                    self._cache_alignment(ann)
+    
+    def _cache_alignment(self, alignment_ann: Annotation):
+        view = self.views.get(alignment_ann.parent)
+        if view is None:
+            warnings.warn(f"Alignment {alignment_ann.long_id} doesn't have a parent view, but it should.", RuntimeWarning)
+            return
+
+        ## caching alignments
+        def _desprately_search_annotation_object(ann_short_id):
+            ann_long_id = f"{view.id}{self.id_delimiter}{ann_short_id}"
+            try:
+                return self.__getitem__(ann_long_id)
+            except KeyError:
+                return self.__getitem__(ann_short_id)
+
+        if all(map(lambda x: x in alignment_ann.properties, ('source', 'target'))):
+            source_ann = _desprately_search_annotation_object(alignment_ann.get('source'))
+            target_ann = _desprately_search_annotation_object(alignment_ann.get('target'))
+            if isinstance(source_ann, Annotation) and isinstance(target_ann, Annotation):
+                source_ann._cache_alignment(alignment_ann, target_ann)
+                target_ann._cache_alignment(alignment_ann, source_ann)
+            else:
+                warnings.warn(
+                    f"Alignment {alignment_ann.long_id} has `source` and `target` properties that do not point to Annotation objects.",
+                    RuntimeWarning)
 
     def generate_capital_annotations(self):
         """
@@ -566,19 +591,16 @@ def get_all_views_with_error(self) -> List[View]:
     
     get_views_with_error = get_all_views_with_error
             
-    def get_all_views_contain(self, at_types: Union[ThingTypesBase, str, List[Union[str, ThingTypesBase]]]) -> List[View]:
+    def get_all_views_contain(self, *at_types: Union[ThingTypesBase, str]) -> List[View]:
         """
         Returns the list of all views in the MMIF if given types
         are present in that view's 'contains' metadata.
 
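-        :param at_types: a list of types or just a type to check for. When given more than one types, all types must be found.
+        :param at_types: one or more types to check for, passed as separate positional arguments. When more than one type is given, all types must be found.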
         :return: the list of views that contain the type
         """
-        if isinstance(at_types, list):
-            return [view for view in self.views
-                    if all(map(lambda x: x in view.metadata.contains, at_types))]
-        else:
-            return [view for view in self.views if at_types in view.metadata.contains]
+        return [view for view in self.views
+                if all(map(lambda x: x in view.metadata.contains, at_types))]
 
     get_views_contain = get_all_views_contain
     
@@ -621,35 +643,20 @@ def get_view_contains(self, at_types: Union[ThingTypesBase, str, List[Union[str,
                     return view
         return None
 
-    def _is_in_time_range(self, ann: Annotation, start: Union[int, float], end: Union[int, float]) -> bool:
+    def _is_in_time_range(self, ann: Annotation, range_s: Union[int, float], range_e: Union[int, float]) -> bool:
         """
-        Checks if the annotation is anchored within the given time range.
+        Checks if the annotation is anchored within the given time range. Any overlap is considered included. 
 
-        :param ann: the Annotation object to check
-        :param start: the start time point in milliseconds
-        :param end: the end time point in milliseconds
+        :param ann: the Annotation object to check, must be time-based itself or anchored to time-based annotations
+        :param range_s: the start time point of the range (in milliseconds)
+        :param range_e: the end time point of the range (in milliseconds)
 
         :return: True if the annotation is anchored within the time range, False otherwise
         """
-        s, e = self.get_start(ann), self.get_end(ann)
-        return (s < start < e) or (s < end < e) or (s > start and e < end)
-
-    def _handle_time_unit(self, input_unit: str, ann_unit: str,
-                          start: int, end: int) -> Tuple[Union[int, float, str], Union[int, float, str]]:
-        """
-        Helper method to convert time unit defined by user to the unit in mmif object.
+        ann_s, ann_e = self.get_start(ann), self.get_end(ann)
+        return (ann_s < range_s < ann_e) or (ann_s < range_e < ann_e) or (ann_s > range_s and ann_e < range_e)
 
-        :param input_unit: the time unit defined by user
-        :param ann_unit: the time unit in mmif object
-        :param start: the start time point in the unit of `input_unit`
-        :param end: the end time point in the unit of `input_unit`
-
-        :return: the start and end time points in the unit of `ann_unit`
-        """
-        from mmif.utils.timeunit_helper import convert
-        return convert(start, input_unit, ann_unit, 1), convert(end, input_unit, ann_unit, 1)
-
-    def get_annotations_between_time(self, start: Union[int, float], end: Union[int, float],
+    def get_annotations_between_time(self, start: Union[int, float], end: Union[int, float], 
                                      time_unit: str = "ms") -> Iterator[Annotation]:
         """
         Finds annotations that are anchored between the given time points.
@@ -662,34 +669,24 @@ def get_annotations_between_time(self, start: Union[int, float], end: Union[int,
         assert start < end, f"Start time point must be smaller than the end time point, given {start} and {end}"
         assert start >= 0, f"Start time point must be non-negative, given {start}"
         assert end >= 0, f"End time point must be non-negative, given {end}"
+        
+        from mmif.utils.timeunit_helper import convert
 
-        tf_in_range = []
-        tf_to_anns = defaultdict(list)
+        time_anchors_in_range = []
 
-        # Runtime: O(V * (TF * AL))
-        for view in self.get_all_views_contain([AnnotationTypes.TimeFrame, AnnotationTypes.Alignment]):
+        for view in self.get_all_views_contain(AnnotationTypes.TimeFrame) + self.get_all_views_contain(AnnotationTypes.TimePoint):
             time_unit_in_view = view.metadata.contains.get(AnnotationTypes.TimeFrame)["timeUnit"]
-            start_time, end_time = self._handle_time_unit(time_unit, time_unit_in_view, start, end)
-
-            tf_anns = view.get_annotations(AnnotationTypes.TimeFrame)
-            al_anns = view.get_annotations(AnnotationTypes.Alignment)
-
-            for tf_ann in tf_anns:
-                if self._is_in_time_range(tf_ann, start_time, end_time):
-                    tf_in_range.append(tf_ann)
-                    tf_to_anns[self.get_start(tf_ann)] = []
-
-            for al_ann in al_anns:
-                for tf in tf_in_range:
-                    target_ann_long_id = tf.aligned_to_by(al_ann.long_id)
-                    if target_ann_long_id:
-                        tf_to_anns[self.get_start(tf)].append(view.get_annotation_by_id(target_ann_long_id))
-                        break
-
-        # Runtime: O(TF + AL)
-        for start_point, anns in dict(sorted(tf_to_anns.items())).items():
-            for ann in anns:
-                yield ann
+            
+            start_time = convert(start, time_unit, time_unit_in_view, 1)
+            end_time = convert(end, time_unit, time_unit_in_view, 1)
+            for ann in view.get_annotations():
+                if ann.at_type in (AnnotationTypes.TimeFrame, AnnotationTypes.TimePoint) and self._is_in_time_range(ann, start_time, end_time):
+                    time_anchors_in_range.append(ann)
+        time_anchors_in_range.sort(key=lambda x: self.get_start(x))
+        for time_anchor in time_anchors_in_range:
+            yield time_anchor
+            for aligned in time_anchor.get_all_aligned():
+                yield aligned
 
     def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: bool = True) -> Union[int, float]:
         # TODO (krim @ 2/5/24): Update the return type once timeunits are unified to `ms` as integers (https://github.com/clamsproject/mmif/issues/192)
diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py
@@ -122,16 +122,9 @@ def add_annotation(self, annotation: 'Annotation', overwrite=False) -> 'Annotati
         self.annotations.append(annotation, overwrite)
         self.new_contain(annotation.at_type)
         if annotation.at_type == AnnotationTypes.Alignment:
-            self._cache_alignment(annotation)
+            self._parent_mmif._cache_alignment(annotation)
         return annotation
     
-    def _cache_alignment(self, alignent_ann: 'Annotation'):
-        if all(map(lambda x: x in alignent_ann.properties, ('source', 'target'))):
-            source_ann = self.get_annotation_by_id(alignent_ann.get('source'))
-            target_ann = self.get_annotation_by_id(alignent_ann.get('target'))
-            source_ann._cache_alignment(alignent_ann.long_id, target_ann.long_id)
-            target_ann._cache_alignment(alignent_ann.long_id, source_ann.long_id)
-
     def new_textdocument(self, text: str, lang: str = "en", did: Optional[str] = None, 
                          overwrite=False, **properties) -> 'Document':
         """
diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py
@@ -1,12 +1,12 @@
 import importlib
+import math
 import warnings
 from typing import List, Union, Tuple
-import math
 
 import mmif
 from mmif import Annotation, Document, Mmif
 from mmif.utils.timeunit_helper import convert
-from mmif.vocabulary import DocumentTypes, AnnotationTypes
+from mmif.vocabulary import DocumentTypes
 
 for cv_dep in ('cv2', 'ffmpeg', 'PIL'):
     try:
@@ -212,19 +212,19 @@ def convert_timepoint(mmif: Mmif, timepoint: Annotation, out_unit: str) -> Union
     return convert(timepoint.get_property('timePoint'), in_unit, out_unit, get_framerate(vd))
 
 
-def convert_timeframe(mmif: Mmif, time_frame: Annotation, out_unit: str) -> Union[Tuple[Union[int, float, str], Union[int, float, str]]]:
+def convert_timeframe(mmif: Mmif, time_frame: Annotation, out_unit: str) -> Tuple[Union[int, float, str], Union[int, float, str]]:
     """
-    Converts start and end points in a ``TimeFrame`` annotation a different time unit.
+    Converts start and end points in a ``TimeFrame`` annotation to a different time unit.
 
     :param mmif: :py:class:`~mmif.serialize.mmif.Mmif` instance
     :param time_frame: :py:class:`~mmif.serialize.annotation.Annotation` instance that holds a time interval annotation (``"@type": ".../TimeFrame/..."``)
     :param out_unit: time unit to which the point is converted
-    :return: tuple of frame numbers (integer) or seconds/milliseconds (float) of input start and end
+    :return: tuple of frame numbers, seconds/milliseconds, or ISO notation of TimeFrame's start and end
     """
     in_unit = time_frame.get_property('timeUnit')
     vd = mmif[time_frame.get_property('document')]
-    return convert(mmif.get_start(time_frame), in_unit, out_unit, get_framerate(vd)), \
-        convert(mmif.get_end(time_frame), in_unit, out_unit, get_framerate(vd))
+    fps = get_framerate(vd)
+    return convert(time_frame.get_property('start'), in_unit, out_unit, fps), convert(time_frame.get_property('end'), in_unit, out_unit, fps)
 
 
 def framenum_to_second(video_doc: Document, frame: int):
diff --git a/tests/test_serialize.py b/tests/test_serialize.py
@@ -222,7 +222,7 @@ def test_document_location_helpers(self):
     def test_document_location_helpers_http(self):
         new_doc = Document()
         new_doc.id = "d1"
-        new_doc.location = f"https://www.gnu.org/licenses/gpl-3.0.txt"
+        new_doc.location = f"https://example.com/"
         self.assertEqual(new_doc.location_scheme(), 'https')
         try:
             path = new_doc.location_path()
@@ -277,11 +277,11 @@ def test_get_all_views_contain(self):
         self.assertEqual(2, len(views))
         views = mmif_obj.get_all_views_contain('http://vocab.lappsgrid.org/SemanticTag')
         self.assertEqual(1, len(views))
-        views = mmif_obj.get_views_contain([
+        views = mmif_obj.get_views_contain(
             AnnotationTypes.TimeFrame,
             DocumentTypes.TextDocument,
             AnnotationTypes.Alignment,
-        ])
+        )
         self.assertEqual(1, len(views))
         views = mmif_obj.get_all_views_contain(not_existing_attype)
         self.assertEqual(0, len(views))
@@ -324,16 +324,16 @@ def test_get_alignments(self):
         self.assertEqual(1, len(views_and_alignments))
         self.assertTrue('v6' in views_and_alignments)
     
-    def test_alignment_caching(self):
+    def test_cache_alignment(self):
         mmif_obj = Mmif(MMIF_EXAMPLES['everything'])
         views_and_alignments = mmif_obj.get_alignments(DocumentTypes.TextDocument, AnnotationTypes.TimeFrame)
         for vid, alignments in views_and_alignments.items():
             v = mmif_obj.get_view_by_id(vid)
             for alignment in alignments:
                 s = v.get_annotation_by_id(alignment.get('source'))
                 t = v.get_annotation_by_id(alignment.get('target'))
-                self.assertTrue(s.aligned_to_by(alignment.long_id).endswith(t.long_id))
-                self.assertTrue(t.aligned_to_by(alignment.long_id).endswith(s.long_id))
+                self.assertTrue(s.aligned_to_by(alignment).long_id.endswith(t.long_id))
+                self.assertTrue(t.aligned_to_by(alignment).long_id.endswith(s.long_id))
 
     def test_new_view_id(self):
         p = Mmif.view_prefix
@@ -393,23 +393,24 @@ def test_get_annotations_between_time(self):
         mmif_obj = Mmif(MMIF_EXAMPLES['everything'])
 
         # Test case 1: All token annotations are selected
-        selected_token_anns = mmif_obj.get_annotations_between_time(0, 22000)
-        self.assertEqual(28, len(list(selected_token_anns)))
+        selected_token_anns = [ann for ann in mmif_obj.get_annotations_between_time(0, 22000) if ann.is_type(token_type)]
+        self.assertEqual(28, len(selected_token_anns))
         for i, ann in enumerate(selected_token_anns):
-            self.assertTrue(ann.is_type(token_type))
-            self.assertEqual(tokens_in_order[i], ann.get_property("text"))
+            self.assertEqual(tokens_in_order[i], ann.get_property("word"))
 
         # Test case 2: No token annotation are selected
-        selected_token_anns = mmif_obj.get_annotations_between_time(0, 5, time_unit="seconds")
-        self.assertEqual(0, len(list(selected_token_anns)))
+        selected_token_anns = list(mmif_obj.get_annotations_between_time(0, 5, time_unit="seconds"))
+        self.assertEqual(4, len(list(selected_token_anns))) 
+        for ann in selected_token_anns:
+            self.assertFalse(ann.is_type(token_type))
 
         # Test case 3(a): Partial tokens are selected (involve partial overlap)
         selected_token_anns = mmif_obj.get_annotations_between_time(7, 10, time_unit="seconds")
-        self.assertEqual(tokens_in_order[3:9], [ann.get_property("text") for ann in selected_token_anns])
+        self.assertEqual(tokens_in_order[3:9], [ann.get_property("word") for ann in selected_token_anns])
 
         # Test case 3(b): Partial tokens are selected (only full overlap)
         selected_token_anns = mmif_obj.get_annotations_between_time(11500, 14600)
-        self.assertEqual(tokens_in_order[12:17], [ann.get_property("text") for ann in selected_token_anns])
+        self.assertEqual(tokens_in_order[12:17], [ann.get_property("word") for ann in selected_token_anns])
 
     def test_add_document(self):
         mmif_obj = Mmif(MMIF_EXAMPLES['everything'])
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -4,10 +4,9 @@
 
 from mmif import Mmif, Document, AnnotationTypes
 from mmif.utils import sequence_helper as sqh
+from mmif.utils import text_document_helper as tdh
 from mmif.utils import timeunit_helper as tuh
 from mmif.utils import video_document_helper as vdh
-from mmif.utils import text_document_helper as tdh
-from mmif.serialize import mmif
 from tests.mmif_examples import *
 
 
@@ -178,7 +177,7 @@ def test_width_based_smoothing(self):
 class TestTextDocHelper(unittest.TestCase):
     mmif_obj = Mmif(MMIF_EXAMPLES['everything'])
 
-    @pytest.mark.skip("The only valid test cases come from kalbi app which annotates wrong property")
+    @pytest.mark.skip("The only valid test cases come from kaldi app which annotates wrong property")
     def test_slice_text(self):
         sliced_text_full_overlap = tdh.slice_text(self.mmif_obj, 11500, 14600)
         sliced_text_partial_overlap = tdh.slice_text(self.mmif_obj, 7, 10, unit="seconds")