Changed the data structure that maps each TimeFrame to its aligned annotations, and changed which annotations are returned

bohJiang12 · bohJiang12 · commit f1cb2e6d2df8 · 2024-06-13T16:24:07.000-04:00
diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py
@@ -532,7 +532,7 @@ def get_annotations_between_time(self, start: int, end: int, time_unit: str = "m
         assert end >= 0, "End time must be greater than or equal to zero"
         # 0. Initialize container and helper method
         valid_tf_anns = []
-        idtf_to_token = {}
+        tf_to_anns = defaultdict(list)
 
         # 1. find all views that contain the type of TF
         views = self.get_all_views_contain([AnnotationTypes.TimeFrame, AnnotationTypes.Alignment])
@@ -556,18 +556,26 @@ def get_annotations_between_time(self, start: int, end: int, time_unit: str = "m
                 if self._is_in_time_between(start, end, tf):
                     valid_tf_anns.append(tf)
 
-            # Map 'TimeFrame' id to 'Token' annotation
-            idtf_to_idtk = {align.get_property('source'): align.get_property('target') for align in al_anns}
-            for id_tf in idtf_to_idtk:
-                token_id = idtf_to_idtk[id_tf]
-                idtf_to_token[id_tf] = view.get_annotation_by_id(token_id)
+            # Map 'TimeFrame' annotation to its aligned annotation
+            for align in al_anns:
+                source_id, target_id = align.get_property('source'), align.get_property('target')
+                try:
+                    source, target = view.get_annotation_by_id(source_id), view.get_annotation_by_id(target_id)
+                    if source in valid_tf_anns:
+                        tf_to_anns[source_id].append(target)
+                    elif target in valid_tf_anns:
+                        tf_to_anns[target_id].append(source)
+                except KeyError:
+                    pass
 
         # 3. For those extracted 'TimeFrame' annotations, sort them by their start time
         sort_tf_anns = sorted(valid_tf_anns, key=lambda x: self.get_start(x))
 
-        # 4. Find all 'Token' annotations aligned with sorted 'TimeFrame' annotations
-        for ann in sort_tf_anns:
-            yield idtf_to_token[ann.get_property('id')]
+        # 4. Yield all annotations aligned with sorted 'TimeFrame' annotations
+        for tf_ann in sort_tf_anns:
+            anns = tf_to_anns[tf_ann.get_property("id")]
+            for ann in anns:
+                yield ann
 
     def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: bool = True) -> Union[int, float]:
         # TODO (krim @ 2/5/24): Update the return type once timeunits are unified to `ms` as integers (https://github.com/clamsproject/mmif/issues/192)
diff --git a/mmif/utils/textdoc_helper.py b/mmif/utils/textdoc_helper.py
@@ -1,9 +1,12 @@
 import mmif
+from mmif import Annotation
 
 
-def slice_text(mmif_obj, start: int, end: int) -> str:
-    sort_token_anns = mmif_obj.get_annotations_between_time(start, end)
+def slice_text(mmif_obj, start: int, end: int, unit: str = "milliseconds") -> str:
+    token_type = "http://vocab.lappsgrid.org/Token"
+    anns_found = mmif_obj.get_annotations_between_time(start, end, unit)
     tokens_sliced = []
-    for ann in sort_token_anns:
-        tokens_sliced.append(ann.get_property('text'))  # FIXME: Sometimes the string attribute "word" is used for getting property value
+    for ann in anns_found:
+        if ann.is_type(token_type):
+            tokens_sliced.append(ann.get_property('text'))  # FIXME: Sometimes the string attribute "word" is used for getting property value
     return ' '.join(tokens_sliced)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -163,9 +163,9 @@ class TestTextDocHelper(unittest.TestCase):
 
     def test_slice_text(self):
         sliced_text_full_overlap = tdh.slice_text(self.mmif_obj, 11500, 14600)
-        sliced_text_partial_overlap = tdh.slice_text(self.mmif_obj, 7000, 10000)
+        sliced_text_partial_overlap = tdh.slice_text(self.mmif_obj, 7, 10, unit="seconds")
         no_sliced_text = tdh.slice_text(self.mmif_obj, 0, 5000)
-        full_sliced_text = tdh.slice_text(self.mmif_obj, 0, 22000)
+        full_sliced_text = tdh.slice_text(self.mmif_obj, 0, 22, unit="seconds")
         self.assertEqual("In the nineteen eighties ,", sliced_text_full_overlap)
         self.assertEqual("is Jim Lehrer with the NewsHour", sliced_text_partial_overlap)
         self.assertEqual("", no_sliced_text)