Skip to content

Commit dcf7324

Browse files
authored
Merge pull request #298 from clamsproject/develop
releasing 1.0.18
2 parents 0962ff3 + 030fd08 commit dcf7324

File tree

6 files changed

+107
-25
lines changed

6 files changed

+107
-25
lines changed

documentation/plugins.rst

+6
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ To add a document location handler plugin, you need to implement a Python `"pack
2828

2929
#. the package must be named ``mmif_docloc_<SCHEME>``. For example, to implement a handler for ``s3`` scheme, the package name must be ``mmif_docloc_s3``. The prefix is important as it's used in the plugin discovery process from the core ``mmif-python`` modules.
3030
#. the top module of the package must have a function named ``resolve``. The function must take a single argument, which is a :class:`str` of the document location URI. The function must return a :class:`str` of the local file path. For example, if the document location is ``s3://mybucket/myfile.mp4``, a Python user should be able to to something like this;
31+
#. Optionally (but highly recommended), the top module also can provide another function named ``help``. The function must take no arguments and return a :class:`str` that explains how the input string to the ``resolve`` function should be formatted.
3132

3233
.. code-block:: python
3334
@@ -76,6 +77,11 @@ And the plugin code.
7677
else:
7778
raise ValueError(f'cannot handle document location scheme: {docloc}')
7879
80+
def help():
81+
return "location format: `<DOCUMENT_ID>.video`"
82+
83+
84+
7985
Bulit-in Document Location Scheme Plugins
8086
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
8187

mmif/serialize/annotation.py

+4
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,10 @@ def add_property(self, name: str,
323323
only keep the latest value (in order of appearances in views list) of
324324
the property, effectively overwriting the previous values.
325325
"""
326+
# we don't checking if this k-v already exists in _original (new props) or _ephemeral (read from existing MMIF)
327+
# because it is impossible to keep the _original updated when a new annotation is added (via `new_annotation`)
328+
# without look across other views and top-level documents list. Also see
329+
# ``mmif.serialize.mmif.Mmif.generate_capital_annotations`` for where the "de-duplication" happens.
326330
if name == "text":
327331
self.properties.text = Text(value)
328332
elif name == "mime":

mmif/serialize/mmif.py

+49-14
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,24 @@ def append(self, value: View, overwrite=False) -> None:
115115
"""
116116
super()._append_with_key(value.id, value, overwrite)
117117

118-
def get_last(self) -> Optional[View]:
118+
def get_last_contentful_view(self) -> Optional[View]:
119119
"""
120-
Returns the last view appended to the list.
120+
Returns the last view that is contentful, i.e., has no error or warning .
121121
"""
122122
for view in reversed(self._items.values()):
123123
if 'error' not in view.metadata and 'warning' not in view.metadata:
124124
return view
125+
126+
def get_last_view(self) -> Optional[View]:
127+
"""
128+
Returns the last view appended.
129+
"""
130+
if self._items:
131+
return self._items[list(self._items.keys())[-1]]
132+
133+
def get_last(self) -> Optional[View]:
134+
warnings.warn('get_last() is deprecated, use get_last_contentful_view() instead.', DeprecationWarning)
135+
return self.get_last_contentful_view()
125136

126137

127138
class Mmif(MmifObject):
@@ -278,13 +289,23 @@ def generate_capital_annotations(self):
278289
See https://github.com/clamsproject/mmif-python/issues/226 for rationale
279290
behind this behavior and discussion.
280291
"""
281-
# this view will be the default kitchen sink for all generated annotations
282-
last_view = self.views.get_last()
292+
# this view will be the default kitchen sink for all generated annotations.
293+
last_view = self.views.get_last_contentful_view()
294+
283295
# proceed only when there's at least one view
284296
if last_view:
297+
298+
# this app name is used to check a view is generated by the "currently running" app.
299+
# knowing the currently running app is important so that properties of `Document` objects generated by the
300+
# current app can be properly recorded inside the `Document` objects (since they are "writable" to the
301+
# current app), instead of being recorded in a separate `Annotation` object.
302+
current_app = last_view.metadata.app
303+
285304
# to avoid duplicate property recording, this will be populated with
286305
# existing Annotation objects from all existing views
287306
existing_anns = defaultdict(lambda: defaultdict(dict))
307+
# ideally, if we can "de-duplicate" props at `add_property()` time, that'd be more efficient,
308+
# but that is impossible without looking for the target `document` across other views and top documents list
288309

289310
# new properties to record in the current serialization call
290311
anns_to_write = defaultdict(dict)
@@ -296,20 +317,30 @@ def generate_capital_annotations(self):
296317
for ann in view.get_annotations(AnnotationTypes.Annotation):
297318
if doc_id is None:
298319
doc_id = ann.get_property('document')
320+
# only if we are sure that the document ID is unique across all views... (with v_id prefix)
321+
# TODO (krim @ 7/15/24): update id checking once https://github.com/clamsproject/mmif/issues/228 is resolved
322+
if not any([doc_id == doc.id for doc in self.documents]) and Mmif.id_delimiter not in doc_id:
323+
doc_id = f"{view.id}{Mmif.id_delimiter}{doc_id}"
299324
existing_anns[doc_id].update(ann.properties)
300325
for doc in view.get_documents():
301-
anns_to_write[doc.id].update(doc._props_pending)
326+
anns_to_write[doc.long_id].update(doc._props_pending)
302327
for doc in self.documents:
303-
anns_to_write[doc.id].update(doc._props_pending)
328+
anns_to_write[doc.long_id].update(doc._props_pending)
304329
# additional iteration of views, to find a proper view to add the
305330
# generated annotations. If none found, use the last view as the kitchen sink
306331
last_view_for_docs = defaultdict(lambda: last_view)
307332
doc_ids = set(anns_to_write.keys())
308333
for doc_id in doc_ids:
334+
if len(last_view.annotations) == 0:
335+
# meaning, this new app didn't generate any annotation except for these document properties
336+
# thus, we should add capital annotations to the last (empty) view
337+
last_view_for_docs[doc_id] = last_view
338+
break
309339
for view in reversed(self.views):
310340
# first try to find out if this view "contains" any annotation to the doc
311341
# then, check for individual annotations
312-
if [cont for cont in view.metadata.contains.values() if cont.get('document', None) == doc_id] \
342+
# TODO (krim @ 7/15/24): update id checking once https://github.com/clamsproject/mmif/issues/228 is resolved
343+
if [cont for cont in view.metadata.contains.values() if doc_id.endswith(cont.get('document', 'TODO:this endswith test is a temporal solution we use until long_id is forced everywhere'))] \
313344
or list(view.get_annotations(document=doc_id)):
314345
last_view_for_docs[doc_id] = view
315346
break
@@ -323,14 +354,18 @@ def generate_capital_annotations(self):
323354
if k != 'id' and existing_anns[doc_id][k] != v:
324355
props[k] = v
325356
if props:
326-
if len(anns_to_write) == 1:
327-
# if there's only one document, we can record the doc_id in the contains metadata
328-
last_view_for_docs[doc_id].metadata.new_contain(AnnotationTypes.Annotation, document=doc_id)
329-
props.pop('document', None)
357+
view_to_write = last_view_for_docs[doc_id]
358+
if view_to_write.metadata.app == current_app and view_to_write.annotations.get(doc_id) is not None:
359+
view_to_write.get_document_by_id(doc_id).properties.update(props)
330360
else:
331-
# otherwise, doc_id needs to be recorded in the annotation property
332-
props['document'] = doc_id
333-
last_view_for_docs[doc_id].new_annotation(AnnotationTypes.Annotation, **props)
361+
if len(anns_to_write) == 1:
362+
# if there's only one document, we can record the doc_id in the contains metadata
363+
view_to_write.metadata.new_contain(AnnotationTypes.Annotation, document=doc_id)
364+
props.pop('document', None)
365+
else:
366+
# otherwise, doc_id needs to be recorded in the annotation property
367+
props['document'] = doc_id
368+
view_to_write.new_annotation(AnnotationTypes.Annotation, **props)
334369

335370
def sanitize(self):
336371
"""

mmif/serialize/view.py

+11
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from typing import Dict, Union, Optional, Generator, List, cast
1111

1212
from mmif import DocumentTypes, AnnotationTypes, ThingTypesBase, ClamsTypesBase
13+
from mmif.serialize import model
1314
from mmif.serialize.annotation import Annotation, Document
1415
from mmif.serialize.model import PRMTV_TYPES, MmifObject, DataList, DataDict
1516

@@ -439,6 +440,16 @@ def append(self, value: Union[Annotation, Document], overwrite=False) -> None:
439440
:return: None
440441
"""
441442
super()._append_with_key(value.id, value, overwrite)
443+
444+
def __getitem__(self, key: str):
445+
"""
446+
specialized getter implementation to workaround https://github.com/clamsproject/mmif/issues/228
447+
# TODO (krim @ 7/12/24): annotation ids must be in the long form in the future, so this check will be unnecessary once https://github.com/clamsproject/mmif/issues/228 is resolved.
448+
"""
449+
if ":" in key:
450+
_, aid = key.split(":")
451+
return self._items.__getitem__(aid)
452+
return self._items.get(key, None)
442453

443454

444455
class ContainsDict(DataDict[ThingTypesBase, Contain]):

mmif_docloc_http/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,7 @@ def resolve(docloc):
1010
raise ValueError(f'cannot handle document location scheme: {docloc}')
1111
except urllib.error.URLError as e:
1212
raise e
13+
14+
15+
def help():
16+
return "location must be a URL string."

tests/test_serialize.py

+33-11
Original file line numberDiff line numberDiff line change
@@ -1118,11 +1118,11 @@ def test_document_added_properties(self):
11181118
# then converted to an `Annotation` annotation during serialization at `Mmif`-level
11191119
mmif.add_document(doc1)
11201120
## no Annotation before serialization
1121-
self.assertEqual(0, len(list(mmif.views.get_last().get_annotations(AnnotationTypes.Annotation, author='me'))))
1121+
self.assertEqual(0, len(list(mmif.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation, author='me'))))
11221122
## after serialization, the `Annotation` annotation should be added to the last view
11231123
## note that we didn't add any views, so the last view before and after the serialization are the same
11241124
mmif_roundtrip = Mmif(mmif.serialize())
1125-
self.assertTrue(next(mmif_roundtrip.views.get_last().get_annotations(AnnotationTypes.Annotation, author='me')))
1125+
self.assertTrue(next(mmif_roundtrip.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation, author='me')))
11261126
# finally, when deserialized back to a Mmif instance, the `Annotation` props should be added
11271127
# as a property of the document
11281128
doc1_mmif_roundtrip = mmif_roundtrip.get_document_by_id('doc1')
@@ -1135,7 +1135,7 @@ def test_document_adding_duplicate_properties(self):
11351135
mmif.add_document(doc1)
11361136
did = 'doc1'
11371137
# last view before serialization rounds
1138-
r0_vid = mmif.views.get_last().id
1138+
r0_vid = mmif.views.get_last_contentful_view().id
11391139
doc1.at_type = DocumentTypes.TextDocument
11401140
doc1.id = did
11411141
doc1.location = 'aScheme:///data/doc1.txt'
@@ -1165,7 +1165,7 @@ def test_document_adding_duplicate_properties(self):
11651165
self.assertEqual(2, len(doc1._props_pending))
11661166
mmif_roundtrip2 = Mmif(mmif_roundtrip1.serialize())
11671167
## but not serialized
1168-
self.assertEqual(0, len(list(mmif_roundtrip2.views.get_last().get_annotations(AnnotationTypes.Annotation))))
1168+
self.assertEqual(0, len(list(mmif_roundtrip2.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))))
11691169

11701170
# adding non-duplicate value should be serialized into a new Annotation object
11711171
# even when there is a duplicate key in a previous view
@@ -1211,7 +1211,7 @@ def test_document_added_properties_with_manual_capital_annotation(self):
12111211
doc1.add_property('author', 'me')
12121212
mmif_roundtrip = Mmif(mmif.serialize())
12131213
## should be only one Annotation object, from manual call of `new_annotation`
1214-
self.assertEqual(1, len(list(mmif_roundtrip.views.get_last().get_annotations(AnnotationTypes.Annotation))))
1214+
self.assertEqual(1, len(list(mmif_roundtrip.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))))
12151215

12161216
# test with duplicate property added by a downstream app
12171217
mmif_roundtrip = Mmif(mmif.serialize())
@@ -1223,9 +1223,9 @@ def test_document_added_properties_with_manual_capital_annotation(self):
12231223
# author=me is already in the input MMIF
12241224
doc1_prime.add_property('author', 'me')
12251225
mmif_roundtrip2 = Mmif(mmif_roundtrip.serialize())
1226-
self.assertEqual(1, len(list(mmif.views.get_last().get_annotations(AnnotationTypes.Annotation))))
1226+
self.assertEqual(1, len(list(mmif.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))))
12271227
# none should be added
1228-
self.assertEqual(0, len(list(mmif_roundtrip2.views.get_last().get_annotations(AnnotationTypes.Annotation))))
1228+
self.assertEqual(0, len(list(mmif_roundtrip2.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))))
12291229

12301230
# test with same key, different value
12311231
mmif_roundtrip = Mmif(mmif.serialize())
@@ -1238,13 +1238,13 @@ def test_document_added_properties_with_manual_capital_annotation(self):
12381238
doc1_prime.add_property('author', 'you')
12391239
mmif_roundtrip2 = Mmif(mmif_roundtrip.serialize())
12401240
# the Annotation in the previous view should be preserved
1241-
self.assertEqual(1, len(list(mmif.views.get_last().get_annotations(AnnotationTypes.Annotation))))
1241+
self.assertEqual(1, len(list(mmif.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))))
12421242
# but a new one should be added
1243-
self.assertEqual(1, len(list(mmif_roundtrip2.views.get_last().get_annotations(AnnotationTypes.Annotation))))
1243+
self.assertEqual(1, len(list(mmif_roundtrip2.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))))
12441244

1245-
self.assertEqual('me', list(mmif.views.get_last().get_annotations(AnnotationTypes.Annotation))[0].get_property(
1245+
self.assertEqual('me', list(mmif.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))[0].get_property(
12461246
'author'))
1247-
self.assertEqual('you', list(mmif_roundtrip2.views.get_last().get_annotations(AnnotationTypes.Annotation))[
1247+
self.assertEqual('you', list(mmif_roundtrip2.views.get_last_contentful_view().get_annotations(AnnotationTypes.Annotation))[
12481248
0].get_property('author'))
12491249

12501250
def test_capital_annotation_generation_viewfinder(self):
@@ -1269,6 +1269,28 @@ def test_capital_annotation_generation_viewfinder(self):
12691269
cap_anns = list(mmif_roundtrip.views[f'v{i}'].get_annotations(AnnotationTypes.Annotation))
12701270
self.assertEqual(1, len(cap_anns))
12711271
self.assertEqual(authors[i-1], cap_anns[0].get_property('author'))
1272+
1273+
def test_capital_annotation_nongeneration_for_writable_documents(self):
1274+
mmif = Mmif(validate=False)
1275+
doc = Document()
1276+
doc.at_type = DocumentTypes.TextDocument
1277+
doc.id = f'doc0'
1278+
doc.location = f'aScheme:///data/doc0.txt'
1279+
mmif.add_document(doc)
1280+
1281+
v = mmif.new_view()
1282+
v.metadata.app = tester_appname
1283+
vid = v.id
1284+
new_td_id = mmif[vid].new_textdocument(text='new text', document='doc0', origin='transformation').id
1285+
doc.add_property('author', 'me')
1286+
1287+
mmif_roundtrip = Mmif(mmif.serialize())
1288+
1289+
self.assertTrue(AnnotationTypes.Annotation in mmif_roundtrip[vid].metadata.contains)
1290+
self.assertTrue(mmif_roundtrip.get_document_by_id('doc0').get_property('author'), 'me')
1291+
self.assertTrue(next(mmif_roundtrip[vid].get_annotations(AnnotationTypes.Annotation)).get_property('author'), 'me')
1292+
self.assertTrue(next(mmif_roundtrip[vid].get_annotations(AnnotationTypes.Annotation)).get_property('document'), doc.id)
1293+
self.assertTrue(mmif_roundtrip[new_td_id].get_property('origin'), 'transformation')
12721294

12731295
def test_deserialize_with_whole_mmif(self):
12741296
for i, datum in self.data.items():

0 commit comments

Comments
 (0)