Skip to content

Commit f1cb2e6

Browse files
committed
Changed the data structure that maps each TimeFrame to its aligned annotations, and changed which annotations are returned
1 parent ec4ee38 commit f1cb2e6

File tree

3 files changed

+26
-15
lines changed

3 files changed

+26
-15
lines changed

mmif/serialize/mmif.py

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -532,7 +532,7 @@ def get_annotations_between_time(self, start: int, end: int, time_unit: str = "m
532532
assert end >= 0, "End time must be greater than or equal to zero"
533533
# 0. Initialize container and helper method
534534
valid_tf_anns = []
535-
idtf_to_token = {}
535+
tf_to_anns = defaultdict(list)
536536

537537
# 1. find all views that contain the type of TF
538538
views = self.get_all_views_contain([AnnotationTypes.TimeFrame, AnnotationTypes.Alignment])
@@ -556,18 +556,26 @@ def get_annotations_between_time(self, start: int, end: int, time_unit: str = "m
556556
if self._is_in_time_between(start, end, tf):
557557
valid_tf_anns.append(tf)
558558

559-
# Map 'TimeFrame' id to 'Token' annotation
560-
idtf_to_idtk = {align.get_property('source'): align.get_property('target') for align in al_anns}
561-
for id_tf in idtf_to_idtk:
562-
token_id = idtf_to_idtk[id_tf]
563-
idtf_to_token[id_tf] = view.get_annotation_by_id(token_id)
559+
# Map 'TimeFrame' annotation to its aligned annotation
560+
for align in al_anns:
561+
source_id, target_id = align.get_property('source'), align.get_property('target')
562+
try:
563+
source, target = view.get_annotation_by_id(source_id), view.get_annotation_by_id(target_id)
564+
if source in valid_tf_anns:
565+
tf_to_anns[source_id].append(target)
566+
elif target in valid_tf_anns:
567+
tf_to_anns[target_id].append(source)
568+
except KeyError:
569+
pass
564570

565571
# 3. For those extracted 'TimeFrame' annotations, sort them by their start time
566572
sort_tf_anns = sorted(valid_tf_anns, key=lambda x: self.get_start(x))
567573

568-
# 4. Find all 'Token' annotations aligned with sorted 'TimeFrame' annotations
569-
for ann in sort_tf_anns:
570-
yield idtf_to_token[ann.get_property('id')]
574+
# 4. Yield all annotations aligned with sorted 'TimeFrame' annotations
575+
for tf_ann in sort_tf_anns:
576+
anns = tf_to_anns[tf_ann.get_property("id")]
577+
for ann in anns:
578+
yield ann
571579

572580
def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: bool = True) -> Union[int, float]:
573581
# TODO (krim @ 2/5/24): Update the return type once timeunits are unified to `ms` as integers (https://github.com/clamsproject/mmif/issues/192)

mmif/utils/textdoc_helper.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import mmif
2+
from mmif import Annotation
23

34

4-
def slice_text(mmif_obj, start: int, end: int) -> str:
5-
sort_token_anns = mmif_obj.get_annotations_between_time(start, end)
5+
def slice_text(mmif_obj, start: int, end: int, unit: str = "milliseconds") -> str:
6+
token_type = "http://vocab.lappsgrid.org/Token"
7+
anns_found = mmif_obj.get_annotations_between_time(start, end, unit)
68
tokens_sliced = []
7-
for ann in sort_token_anns:
8-
tokens_sliced.append(ann.get_property('text')) # FIXME: Sometimes the string attribute "word" is used for getting property value
9+
for ann in anns_found:
10+
if ann.is_type(token_type):
11+
tokens_sliced.append(ann.get_property('text')) # FIXME: Sometimes the string attribute "word" is used for getting property value
912
return ' '.join(tokens_sliced)

tests/test_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,9 @@ class TestTextDocHelper(unittest.TestCase):
163163

164164
def test_slice_text(self):
165165
sliced_text_full_overlap = tdh.slice_text(self.mmif_obj, 11500, 14600)
166-
sliced_text_partial_overlap = tdh.slice_text(self.mmif_obj, 7000, 10000)
166+
sliced_text_partial_overlap = tdh.slice_text(self.mmif_obj, 7, 10, unit="seconds")
167167
no_sliced_text = tdh.slice_text(self.mmif_obj, 0, 5000)
168-
full_sliced_text = tdh.slice_text(self.mmif_obj, 0, 22000)
168+
full_sliced_text = tdh.slice_text(self.mmif_obj, 0, 22, unit="seconds")
169169
self.assertEqual("In the nineteen eighties ,", sliced_text_full_overlap)
170170
self.assertEqual("is Jim Lehrer with the NewsHour", sliced_text_partial_overlap)
171171
self.assertEqual("", no_sliced_text)

0 commit comments

Comments
 (0)