Merge pull request #284 from clamsproject/280-fix-textslicer

keighrim · web-flow · commit 416c71754f3b · 2024-06-13T22:09:46.000-04:00
280-fix-textslicer
diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py
@@ -10,7 +10,7 @@
 import warnings
 from collections import defaultdict
 from datetime import datetime
-from typing import List, Union, Optional, Dict, cast
+from typing import List, Union, Optional, Dict, cast, Iterator, Tuple
 
 import jsonschema.validators
 
@@ -609,7 +609,94 @@ def get_view_contains(self, at_types: Union[ThingTypesBase, str, List[Union[str,
                 if at_types in view.metadata.contains:
                     return view
         return None
-    
+
+    def _is_in_time_between(self, start: Union[int, float], end: Union[int, float], annotation: Annotation) -> bool:
+        """
+        Tell whether the time span of an annotation overlaps a query interval.
+
+        :param start: start of the query interval, in the same unit as the annotation's span
+        :param end: end of the query interval, in the same unit as the annotation's span
+        :param annotation: annotation whose span is read via ``get_start``/``get_end``
+        :return: True if the spans overlap; spans that only touch at an end point do not count
+        """
+        s, e = self.get_start(annotation), self.get_end(annotation)
+        # Plain interval-overlap test. The former three-way disjunction missed overlaps whose
+        # boundary coincided with the query (e.g. s == start and e == end gave False).
+        return s < end and start < e
+
+    def _handle_time_unit(self, input_unit: str, ann_unit: str,
+                          start: int, end: int) -> Tuple[Union[int, float, str], Union[int, float, str]]:
+        """
+        Convert a query interval from the caller's time unit into another time unit.
+
+        :param input_unit: unit in which ``start`` and ``end`` are given
+        :param ann_unit: unit to convert into
+        :param start: start of the interval, in ``input_unit``
+        :param end: end of the interval, in ``input_unit``
+        :return: ``(start, end)`` as returned by ``timeunit_helper.convert`` for ``ann_unit``
+        """
+        from mmif.utils.timeunit_helper import convert
+        # NOTE(review): the last argument (frame rate) is pinned to 1 -- presumably harmless for
+        # second/millisecond conversions, but frame-based units would need the real fps; confirm.
+        return convert(start, input_unit, ann_unit, 1), convert(end, input_unit, ann_unit, 1)
+
+    def get_annotations_between_time(self, start: int, end: int, time_unit: str = "milliseconds") -> Iterator[Annotation]:
+        """
+        Version: 1.0
+        Yields the annotations that are aligned (through 'Alignment' annotations) with the 'TimeFrame'
+        annotations overlapping the given time interval, ordered by the start time of those 'TimeFrame's.
+        Note: this function only works for mmif object obtained from Whisper-wrapper
+
+        This is a generator function, so the argument checks below run on first iteration,
+        not when the method is called.
+
+        :param start: the start time
+        :param end: the end time
+        :param time_unit: the time unit, either string "milliseconds" or "seconds", defaults to "milliseconds"
+        :return: a generator of the annotations aligned with the selected 'TimeFrame' annotations
+        :raises AssertionError: if ``start`` is greater than ``end``, or either of them is negative
+        """
+        assert start <= end, "Start time must be less than or equal to end time"
+        assert start >= 0, "Start time must be greater than or equal to zero"
+        assert end >= 0, "End time must be greater than or equal to zero"
+        # 0. Initialize containers (ids are kept in a set so each alignment is matched by lookup, not by list scan)
+        valid_tf_anns = []
+        valid_tf_ids = set()
+        tf_to_anns = defaultdict(list)
+
+        # 1. find all views that contain the type of TF
+        views = self.get_all_views_contain([AnnotationTypes.TimeFrame, AnnotationTypes.Alignment])
+
+        # 2. For each view, extract annotations that satisfy conditions that are TF/TP and fall into time interval
+        for view in views:
+            # Make sure time unit stay at the same level
+            view_unit = view.metadata.contains.get(AnnotationTypes.TimeFrame)["timeUnit"]
+            start_time, end_time = self._handle_time_unit(time_unit, view_unit, start, end)
+
+            # Select 'TimeFrame' annotations within given time interval
+            for tf in view.get_annotations(at_type=AnnotationTypes.TimeFrame):
+                if self._is_in_time_between(start_time, end_time, tf):
+                    valid_tf_anns.append(tf)
+                    valid_tf_ids.add(tf.long_id)
+
+            # Map 'TimeFrame' annotation to its aligned annotation; ends that cannot be resolved are skipped
+            for align in view.get_annotations(at_type=AnnotationTypes.Alignment):
+                source_id, target_id = align.get_property('source'), align.get_property('target')
+                try:
+                    source = view.get_annotation_by_id(source_id)
+                    target = view.get_annotation_by_id(target_id)
+                except KeyError:
+                    continue
+                if source.long_id in valid_tf_ids:
+                    tf_to_anns[source.long_id].append(target)
+                elif target.long_id in valid_tf_ids:
+                    tf_to_anns[target.long_id].append(source)
+
+        # 3. For those extracted 'TimeFrame' annotations, sort them by their start time
+        # 4. Yield all annotations aligned with sorted 'TimeFrame' annotations
+        for tf_ann in sorted(valid_tf_anns, key=self.get_start):
+            yield from tf_to_anns[tf_ann.long_id]
+
     def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: bool = True) -> Union[int, float]:
         # TODO (krim @ 2/5/24): Update the return type once timeunits are unified to `ms` as integers (https://github.com/clamsproject/mmif/issues/192)
         """
diff --git a/mmif/utils/text_document_helper.py b/mmif/utils/text_document_helper.py
@@ -0,0 +1,21 @@
+import mmif
+from mmif import Annotation
+
+
+def slice_text(mmif_obj, start: int, end: int, unit: str = "milliseconds") -> str:
+    """
+    Join the words of the 'Token' annotations found in a time interval into one string.
+
+    :param mmif_obj: object providing ``get_annotations_between_time(start, end, unit)``
+    :param start: start of the interval
+    :param end: end of the interval
+    :param unit: time unit of ``start`` and ``end``, defaults to "milliseconds"
+    :return: the ``word`` properties of the matching tokens, separated by single spaces
+    """
+    token_type = "http://vocab.lappsgrid.org/Token"
+    words = (
+        ann.get_property('word')
+        for ann in mmif_obj.get_annotations_between_time(start, end, unit)
+        if ann.is_type(token_type)
+    )
+    return ' '.join(words)
diff --git a/tests/test_serialize.py b/tests/test_serialize.py
@@ -342,6 +342,67 @@ def test_new_view_id(self):
         self.assertEqual(e_view.id, f'{p}4')
         self.assertEqual(len(mmif_obj.views), 5)
 
+    def test_get_annotations_between_time(self):
+        """Check which tokens, and in which order, are returned for several time windows."""
+        token_type = "http://vocab.lappsgrid.org/Token"
+        # Below tokens are obtained by 'jq' in CLI using command:
+        # jq '[
+        # .views[3].annotations |
+        # .[] |
+        # select(."@type"=="http://vocab.lappsgrid.org/Token")] |
+        # sort_by(.properties.id | ltrimstr("t") | tonumber) |
+        # map(.properties.text)' <examples>.json
+        tokens_in_order = ["Hello",
+                           ",",
+                           "this",
+                           "is",
+                           "Jim",
+                           "Lehrer",
+                           "with",
+                           "the",
+                           "NewsHour",
+                           "on",
+                           "PBS",
+                           ".",
+                           "In",
+                           "the",
+                           "nineteen",
+                           "eighties",
+                           ",",
+                           "barking",
+                           "dogs",
+                           "have",
+                           "increasingly",
+                           "become",
+                           "a",
+                           "problem",
+                           "in",
+                           "urban",
+                           "areas",
+                           "."]
+        mmif_obj = Mmif(MMIF_EXAMPLES['everything'])
+
+        # Test case 1: All token annotations are selected
+        # (materialized once: the method returns a one-shot generator, so counting it through
+        # list() and then iterating it again would silently skip the per-token checks below)
+        selected_token_anns = list(mmif_obj.get_annotations_between_time(0, 22000))
+        self.assertEqual(28, len(selected_token_anns))
+        for i, ann in enumerate(selected_token_anns):
+            self.assertTrue(ann.is_type(token_type))
+            self.assertEqual(tokens_in_order[i], ann.get_property("text"))
+
+        # Test case 2: No token annotations are selected
+        selected_token_anns = mmif_obj.get_annotations_between_time(0, 5, time_unit="seconds")
+        self.assertEqual(0, len(list(selected_token_anns)))
+
+        # Test case 3(a): Partial tokens are selected (involve partial overlap)
+        selected_token_anns = mmif_obj.get_annotations_between_time(7, 10, time_unit="seconds")
+        self.assertEqual(tokens_in_order[3:9], [ann.get_property("text") for ann in selected_token_anns])
+
+        # Test case 3(b): Partial tokens are selected (only full overlap)
+        selected_token_anns = mmif_obj.get_annotations_between_time(11500, 14600)
+        self.assertEqual(tokens_in_order[12:17], [ann.get_property("text") for ann in selected_token_anns])
+
     def test_add_document(self):
         mmif_obj = Mmif(MMIF_EXAMPLES['everything'])
         med_obj = Document(FRACTIONAL_EXAMPLES['doc_only'])
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -6,12 +6,14 @@
 from mmif.utils import sequence_helper as sqh
 from mmif.utils import timeunit_helper as tuh
 from mmif.utils import video_document_helper as vdh
+from mmif.utils import text_document_helper as tdh
+from mmif.serialize import mmif
+from tests.mmif_examples import *
 
 
 class TestTimeunitHelper(unittest.TestCase):
-    
     FPS = 30
-    
+
     def test_convert(self):
         self.assertEqual(1000, tuh.convert(1, 's', 'ms', self.FPS))
         self.assertEqual(1.1, tuh.convert(1100, 'ms', 's', self.FPS))
@@ -35,7 +37,7 @@ def setUp(self):
         })
         self.video_doc.add_property('fps', self.fps)
         self.mmif_obj.add_document(self.video_doc)
-    
+
     def test_extract_mid_frame(self):
         tf = self.a_view.new_annotation(AnnotationTypes.TimeFrame, start=100, end=200, timeUnit='frame', document='d1')
         self.assertEqual(150, vdh.get_mid_framenum(self.mmif_obj, tf))
@@ -92,11 +94,12 @@ def test_sample_frames(self):
         s_frame = vdh.second_to_framenum(self.video_doc, 3)
         e_frame = vdh.second_to_framenum(self.video_doc, 5)
         self.assertEqual(1, len(vdh.sample_frames(s_frame, e_frame, 60)))
-        
+
     def test_convert_timepoint(self):
-        timepoint_ann = self.a_view.new_annotation(AnnotationTypes.BoundingBox, timePoint=3, timeUnit='second', document='d1')
+        timepoint_ann = self.a_view.new_annotation(AnnotationTypes.BoundingBox, timePoint=3, timeUnit='second',
+                                                   document='d1')
         self.assertEqual(vdh.convert(3, 's', 'f', self.fps), vdh.convert_timepoint(self.mmif_obj, timepoint_ann, 'f'))
-    
+
     def test_convert_timeframe(self):
         self.a_view.metadata.new_contain(AnnotationTypes.TimeFrame, timeUnit='frame', document='d1')
         timeframe_ann = self.a_view.new_annotation(AnnotationTypes.TimeFrame, start=100, end=200)
@@ -105,7 +108,7 @@ def test_convert_timeframe(self):
 
 
 class TestSequenceHelper(unittest.TestCase):
-    
+
     def test_validate_labelset(self):
         mmif_obj = Mmif(validate=False)
         view = mmif_obj.new_view()
@@ -172,5 +175,31 @@ def test_width_based_smoothing(self):
                          sqh.smooth_outlying_short_intervals(scores, 1, 1))
 
 
+class TestTextDocHelper(unittest.TestCase):
+    """Tests for ``mmif.utils.text_document_helper``."""
+
+    def setUp(self):
+        # Build the fixture per test instead of as a class attribute, so the example MMIF is
+        # not parsed at import time and is not shared between test methods.
+        self.mmif_obj = Mmif(MMIF_EXAMPLES['everything'])
+
+    # ``unittest.skip`` is honored by ``unittest.main()`` (used at the bottom of this file)
+    # as well as by pytest, and does not rely on ``pytest`` being imported in this module.
+    @unittest.skip("The only valid test cases come from kalbi app which annotates wrong property")
+    def test_slice_text(self):
+        """Slice with a fully overlapping, a partially overlapping, an empty and a whole-range window."""
+        sliced_text_full_overlap = tdh.slice_text(self.mmif_obj, 11500, 14600)
+        sliced_text_partial_overlap = tdh.slice_text(self.mmif_obj, 7, 10, unit="seconds")
+        no_sliced_text = tdh.slice_text(self.mmif_obj, 0, 5000)
+        full_sliced_text = tdh.slice_text(self.mmif_obj, 0, 22, unit="seconds")
+        self.assertEqual("In the nineteen eighties ,", sliced_text_full_overlap)
+        self.assertEqual("is Jim Lehrer with the NewsHour", sliced_text_partial_overlap)
+        self.assertEqual("", no_sliced_text)
+        self.assertEqual(
+            "Hello , this is Jim Lehrer with the NewsHour on PBS . "
+            "In the nineteen eighties , barking dogs have increasingly become a problem in urban areas .",
+            full_sliced_text)
+
+
 if __name__ == '__main__':
     unittest.main()