diff --git a/.gitignore b/.gitignore index 84afd451..221d585d 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,9 @@ $RECYCLE.BIN/ *.msp *.lnk # Windows shortcuts +# vscode +.vscode + # idea .idea *.iml @@ -51,7 +54,7 @@ target/ # Package Files # *.ear hs_err_pid* # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml -# python +# python **/*.pyc **/__pycache__ **/dist @@ -62,7 +65,7 @@ build/** coverage.xml htmlcov -# ruby and jekyll files +# ruby and jekyll files Gemfile.lock docs/**/_site/** diff --git a/VERSION.dev b/VERSION.dev new file mode 100644 index 00000000..da1193db --- /dev/null +++ b/VERSION.dev @@ -0,0 +1 @@ +1.0.20.dev1 diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index d1b7a245..cb40afbb 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -50,6 +50,15 @@ def __init__(self, anno_obj: Optional[Union[bytes, str, dict]] = None, *_) -> No self._type: ThingTypesBase = ThingTypesBase('') # to store the parent view ID self._parent_view_id = '' + + # FIXME: + # ============================ + # in solving issue #296, + # To level up the annotation id from `AnnotationProperties`, + # simply eliminated the attribute `self.id` from `AnnotationProperties` class + # and added private attribute `self._id` here because we want "id" to be a property + self._id: str = '' + self._props_ephemeral: AnnotationProperties = AnnotationProperties() self._alignments = {} # to hold alignment information (Alignment anno long_id -> aligned anno long_id) self.reserved_names.update(('_parent_view_id', '_props_ephemeral', '_alignments')) @@ -57,61 +66,61 @@ def __init__(self, anno_obj: Optional[Union[bytes, str, dict]] = None, *_) -> No self.properties: AnnotationProperties = AnnotationProperties() self._attribute_classes = {'properties': AnnotationProperties} self.disallow_additional_properties() - self._required_attributes = ["_type", "properties"] + self._required_attributes = ["id", "_type", "properties"] super().__init__(anno_obj) - + def __hash__(self): return hash(self.serialize()) - + def _deserialize(self, input_dict: dict) -> None: self.at_type = input_dict.pop('_type', '') # TODO (krim @ 6/1/21): If annotation IDs must follow a certain string format, # (e.g. currently auto-generated IDs will always have "prefix"_"number" format) - # here is the place to parse formatted IDs and store prefixes in the parent mmif object. + # here is the place to parse formatted IDs and store prefixes in the parent mmif object. # (see https://github.com/clamsproject/mmif/issues/64#issuecomment-849241309 for discussion) super()._deserialize(input_dict) for k, v in self.properties.items(): self._add_prop_aliases(k, v) - + def _cache_alignment(self, alignment_ann: 'Annotation', alignedto_ann: 'Annotation') -> None: """ - Cache alignment information. This cache will not be serialized. - + Cache alignment information. This cache will not be serialized. + :param alignment_ann: the Alignment annotation that has this annotation on one side :param alignedto_ann: the annotation that this annotation is aligned to (other side of Alignment) """ self._alignments[alignment_ann] = alignedto_ann - + def aligned_to_by(self, alignment: 'Annotation') -> Optional['Annotation']: """ - Retrieves the other side of ``Alignment`` annotation that has this annotation on one side. - + Retrieves the other side of ``Alignment`` annotation that has this annotation on one side. + :param alignment: ``Alignment`` annotation that has this annotation on one side - :return: the annotation that this annotation is aligned to (other side of ``Alignment``), + :return: the annotation that this annotation is aligned to (other side of ``Alignment``), or None if this annotation is not used in the ``Alignment``. """ return self._alignments.get(alignment) - + def get_all_aligned(self) -> Iterator['Annotation']: """ Generator to iterate through all alignments and aligned annotations. Note that this generator will yield the `Alignment` annotations as well. Every odd-numbered yield will be an `Alignment` annotation, and every even-numbered yield will be the aligned annotation. If there's a specific annotation type that you're looking - for, you need to filter the generated results outside. - + for, you need to filter the generated results outside. + :return: yields the alignment annotation and the aligned annotation. The order is decided by the order of appearance of Alignment annotations in the MMIF """ for alignment, aligned in self._alignments.items(): yield alignment yield aligned - - + + def _add_prop_aliases(self, key_to_add, val_to_add): """ Method to handle aliases of the same property. - Annotation property aliases were first introduced in MMIF 1.0.2, - with addition of general `label` property to all `Annotation` + Annotation property aliases were first introduced in MMIF 1.0.2, + with addition of general `label` property to all `Annotation` subtypes, and effectively deprecated `frameType` and `boxType` in `TimeFrame` and `BoundingBox` respectively. """ @@ -157,38 +166,62 @@ def parent(self, parent_view_id: str) -> None: # but import `View` will break the code due to circular imports self._parent_view_id = parent_view_id + # @property + # def id(self) -> str: + # return self.properties.id + # + # @id.setter + # def id(self, aid: str) -> None: + # self.properties.id = aid + @property def id(self) -> str: - return self.properties.id + return self._get_long_id() @id.setter - def id(self, aid: str) -> None: - self.properties.id = aid - - @property - def long_id(self) -> str: + def id(self, new_id: str) -> None: + self._set_long_id(new_id) + + # FIXME: + # ============== + # To cause less confusion to users/developers, I suggest we use + # `id` as a universal property and `long_id` works as its internal process + # , which means the property `long_id` is deprecated + def _get_long_id(self) -> str: + """ + The internal function of id.getter + @return: long form of id or deprecated short form of id while raising RuntimeWarning + """ if self.parent is not None and len(self.parent) > 0: - return f"{self.parent}{self.id_delimiter}{self.id}" + return f"{self.parent}{self.id_delimiter}{self._id}" else: - return self.id - - @long_id.setter - def long_id(self, long_id: str) -> None: + warnings.warn("Parent view id is empty, return a deprecated annotation id") + return self._id + + def _set_long_id(self, long_id: str) -> None: + """ + The internal function of id.setter + @param long_id: new long form of id or deprecated short form of id + @return: None + """ if self.id_delimiter in long_id: - self.parent, self.id = long_id.split(self.id_delimiter) + self.parent, self._id = long_id.split(self.id_delimiter) else: - self.id = long_id - + # This case only adapts to when set the id of existing `Annotation` object + # However, it doesn't fit to the case when a new `Annotation` is added to a `View` + # , which needs another revise at `View` level. + self._id = long_id + @staticmethod def check_prop_value_is_simple_enough( value: Union[PRMTV_TYPES, LIST_PRMTV, LIST_LIST_PRMTV, DICT_PRMTV, DICT_LIST_PRMTV]) -> bool: - - def json_primitives(x): + + def json_primitives(x): return isinstance(x, typing.get_args(PRMTV_TYPES)) - + def json_primitives_list(x): return isinstance(x, list) and all(map(json_primitives, x)) - + def json_primitives_list_of_list(x): return all(map(lambda elem: isinstance(elem, list), x) and map(json_primitives, [subelem for elem in x for subelem in elem])) @@ -201,7 +234,7 @@ def add_property(self, name: str, value: Union[PRMTV_TYPES, LIST_PRMTV, LIST_LIST_PRMTV, DICT_PRMTV, DICT_LIST_PRMTV]) -> None: """ Adds a property to the annotation's properties. - + :param name: the name of the property :param value: the property's desired value :return: None @@ -219,8 +252,8 @@ def get(self, prop_name: str) -> Union['AnnotationProperties', PRMTV_TYPES, LIST """ A special getter for Annotation properties. This is to allow for directly accessing properties without having to go through the - properties object, or view-level annotation properties encoded in the - ``view.metadata.contains`` dict. Note that the regular props will take + properties object, or view-level annotation properties encoded in the + ``view.metadata.contains`` dict. Note that the regular props will take the priority over the ephemeral props when there are conflicts. """ if prop_name in {'at_type', '@type'}: @@ -238,21 +271,21 @@ def get(self, prop_name: str) -> Union['AnnotationProperties', PRMTV_TYPES, LIST def __getitem__(self, prop_name: str): return self.get(prop_name) - + def __contains__(self, item): try: self.get(item) return True except KeyError: return False - + def _get_label(self) -> str: """ Another prototypical method to handle property aliases. - See :meth:`.Annotation._add_prop_aliases` for more details on + See :meth:`.Annotation._add_prop_aliases` for more details on what property aliases are. - Not recommended to use this method as `_add_prop_aliases` method - is preferred. + Not recommended to use this method as `_add_prop_aliases` method + is preferred. """ if 'label' in self: return str(self.get('label')) @@ -262,7 +295,7 @@ def _get_label(self) -> str: return str(self.get('boxType')) else: raise KeyError("No label found in this annotation.") - + def is_document(self): return isinstance(self._type, DocumentTypesBase) @@ -287,45 +320,45 @@ def __init__(self, doc_obj: Optional[Union[bytes, str, dict]] = None, *_) -> Non self._props_original: DocumentProperties = DocumentProperties() self._props_pending: AnnotationProperties = AnnotationProperties() self.reserved_names.update(('_props_original', '_props_pending')) - + self._type: Union[ThingTypesBase, DocumentTypesBase] = ThingTypesBase('') self.properties = self._props_original self.disallow_additional_properties() self._attribute_classes = {'properties': DocumentProperties} super().__init__(doc_obj) - + def add_property(self, name: str, value: Union[PRMTV_TYPES, LIST_PRMTV] ) -> None: """ Adds a property to the document's properties. - - Unlike the parent :class:`Annotation` class, added properties of a - ``Document`` object can be lost during serialization unless it belongs - to somewhere in a ``Mmif`` object. This is because we want to keep - ``Document`` object as "read-only" as possible. Thus, if you want to add - a property to a ``Document`` object, - - * add the document to a ``Mmif`` object (either in the documents list or + + Unlike the parent :class:`Annotation` class, added properties of a + ``Document`` object can be lost during serialization unless it belongs + to somewhere in a ``Mmif`` object. This is because we want to keep + ``Document`` object as "read-only" as possible. Thus, if you want to add + a property to a ``Document`` object, + + * add the document to a ``Mmif`` object (either in the documents list or in a view from the views list), or * directly write to ``Document.properties`` instead of using this method - (which is not recommended). - - With the former method, the SDK will record the added property as a - `Annotation` annotation object, separate from the original `Document` + (which is not recommended). + + With the former method, the SDK will record the added property as a + `Annotation` annotation object, separate from the original `Document` object. See :meth:`.Mmif.generate_capital_annotations()` for more. - + A few notes to keep in mind: - - #. You can't overwrite an existing property of a ``Document`` object. - #. A MMIF can have multiple ``Annotation`` objects with the same + + #. You can't overwrite an existing property of a ``Document`` object. + #. A MMIF can have multiple ``Annotation`` objects with the same property name but different values. When this happens, the SDK will - only keep the latest value (in order of appearances in views list) of + only keep the latest value (in order of appearances in views list) of the property, effectively overwriting the previous values. """ # we don't checking if this k-v already exists in _original (new props) or _ephemeral (read from existing MMIF) # because it is impossible to keep the _original updated when a new annotation is added (via `new_annotation`) - # without look across other views and top-level documents list. Also see + # without look across other views and top-level documents list. Also see # ``mmif.serialize.mmif.Mmif.generate_capital_annotations`` for where the "de-duplication" happens. if name == "text": self.properties.text = Text(value) @@ -342,11 +375,11 @@ def add_property(self, name: str, def get(self, prop_name): """ A special getter for Document properties. The major difference from - the super class's :py:meth:`Annotation.get` method is that Document - class has one more set of *"pending"* properties, that are added after - the Document object is created and will be serialized as a separate - :py:class:`Annotation` object of which ``@type = Annotation``. The - pending properties will take the priority over the regular properties + the super class's :py:meth:`Annotation.get` method is that Document + class has one more set of *"pending"* properties, that are added after + the Document object is created and will be serialized as a separate + :py:class:`Annotation` object of which ``@type = Annotation``. The + pending properties will take the priority over the regular properties when there are conflicts. """ if prop_name == 'id': @@ -365,7 +398,7 @@ class has one more set of *"pending"* properties, that are added after return super().get(prop_name) get_property = get - + @property def text_language(self) -> str: if self._type == DocumentTypes.TextDocument: @@ -433,7 +466,7 @@ def location_path(self, nonexist_ok=True) -> Optional[str]: To obtain the original value of the "path" part in the location string (before resolving), use ``properties.location_path_literal`` method. Returns None when no location is set. - + :param nonexist_ok: if False, raise FileNotFoundError when the resolved path doesn't exist """ return self.properties.location_path_resolved(nonexist_ok=nonexist_ok) @@ -455,22 +488,22 @@ def __delitem__(self, key: str) -> None: else: raise AttributeError(f'Cannot delete a required attribute "{key}"!') raise KeyError(f'Key "{key}" not found.') - + def __iter__(self) -> Iterator[str]: """ - ``__iter__`` on Mapping should basically work as ``keys()`` method + ``__iter__`` on Mapping should basically work as ``keys()`` method of vanilla dict. """ for key in itertools.chain(self._named_attributes(), self._unnamed_attributes): yield key - + def __getitem__(self, key): """ - Parent MmifObject class has a __getitem__ method that checks if - the value is empty when asked for an unnamed attribute. But for - AnnotationProperties, any arbitrary property that's added - explicitly by the user (developer) should not be ignored and - returned even the value is empty. + Parent MmifObject class has a __getitem__ method that checks if + the value is empty when asked for an unnamed attribute. But for + AnnotationProperties, any arbitrary property that's added + explicitly by the user (developer) should not be ignored and + returned even the value is empty. """ if key in self._named_attributes(): return self.__dict__[key] @@ -478,9 +511,9 @@ def __getitem__(self, key): return self._unnamed_attributes[key] def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *_) -> None: - self.id: str = '' - # any individual at_type (subclassing this class) can have its own set of required attributes - self._required_attributes = ["id"] + # self.id: str = '' + # # any individual at_type (subclassing this class) can have its own set of required attributes + # self._required_attributes = ["id"] # allowing additional attributes for arbitrary annotation properties self._unnamed_attributes = {} super().__init__(mmif_obj) @@ -504,9 +537,9 @@ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *_) -> No self.text: Text = Text() self._attribute_classes = {'text': Text} # in theory, either `location` or `text` should appear in a `document` - # but with current implementation, there's no easy way to set a condition - # for `oneOf` requirement - # see MmifObject::_required_attributes in model.py + # but with current implementation, there's no easy way to set a condition + # for `oneOf` requirement + # see MmifObject::_required_attributes in model.py super().__init__(mmif_obj) def _deserialize(self, input_dict: dict) -> None: @@ -519,7 +552,7 @@ def _serialize(self, alt_container: Optional[Dict] = None) -> dict: if "location_" in serialized: serialized["location"] = serialized.pop("location_") return serialized - + @property def text_language(self) -> str: return self.text.lang @@ -539,8 +572,8 @@ def text_value(self, s: str) -> None: @property def location(self) -> Optional[str]: """ - ``location`` property must be a legitimate URI. That is, should the document be a local file - then the file:// scheme must be used. + ``location`` property must be a legitimate URI. That is, should the document be a local file + then the file:// scheme must be used. Returns None when no location is set. """ return self.location_ if len(self.location_) > 0 else None @@ -578,14 +611,14 @@ def location_address(self) -> Optional[str]: def location_path(self) -> Optional[str]: warnings.warn('location_path() is deprecated. Use location_path_resolved() instead.', DeprecationWarning) return self.location_path_resolved() - + def location_path_resolved(self, nonexist_ok=True) -> Optional[str]: """ - Retrieves only path name of the document location (hostname is ignored), + Retrieves only path name of the document location (hostname is ignored), and then try to resolve the path name in the local file system. This method should be used when the document scheme is ``file`` or empty. For other schemes, users should install ``mmif-locdoc-`` plugin. - + Returns None when no location is set. Raise ValueError when no code found to resolve the given location scheme. """ @@ -605,7 +638,7 @@ def location_path_resolved(self, nonexist_ok=True) -> Optional[str]: def location_path_literal(self) -> Optional[str]: """ - Retrieves only path name of the document location (hostname is ignored). + Retrieves only path name of the document location (hostname is ignored). Returns None when no location is set. """ if self.location is None: diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 1900a0d4..ba8c527a 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -54,6 +54,9 @@ def _deserialize(self, input_list: list) -> None: # pytype: disable=signature-m :return: None """ self._items = {item['properties']['id']: Document(item) for item in input_list} + # Note: + # by doing above, the results would map short ids to ``Document`` objects + # if we assume the raw MMIF JSON has ``Document``/``Annotation`` id as short form def append(self, value: Document, overwrite=False) -> None: """ @@ -122,14 +125,14 @@ def get_last_contentful_view(self) -> Optional[View]: for view in reversed(self._items.values()): if 'error' not in view.metadata and 'warnings' not in view.metadata: return view - + def get_last_view(self) -> Optional[View]: """ Returns the last view appended. """ if self._items: return self._items[list(self._items.keys())[-1]] - + def get_last(self) -> Optional[View]: warnings.warn('get_last() is deprecated, use get_last_contentful_view() instead.', DeprecationWarning) return self.get_last_contentful_view() @@ -183,11 +186,11 @@ def serialize(self, pretty: bool = False, sanitize: bool = False, autogenerate_c """ Serializes the MMIF object to a JSON string. - :param sanitize: If True, performs some sanitization of before returning + :param sanitize: If True, performs some sanitization of before returning the JSON string. See :meth:`sanitize` for details. - :param autogenerate_capital_annotations: If True, automatically convert - any "pending" temporary properties from `Document` objects to - `Annotation` objects. See :meth:`generate_capital_annotations` for + :param autogenerate_capital_annotations: If True, automatically convert + any "pending" temporary properties from `Document` objects to + `Annotation` objects. See :meth:`generate_capital_annotations` for details. :param pretty: If True, returns string representation with indentation. :return: JSON string of the MMIF object. @@ -202,24 +205,24 @@ def serialize(self, pretty: bool = False, sanitize: bool = False, autogenerate_c def _deserialize(self, input_dict: dict) -> None: """ Deserializes the MMIF JSON string into a Mmif object. - After *regular* deserialization, this method will perform the following - *special* handling of Annotation.properties that allows apps to access - Annotation/Document properties that are not encoded in the objects - themselves. This is to allow apps to access in a more intuitive way, + After *regular* deserialization, this method will perform the following + *special* handling of Annotation.properties that allows apps to access + Annotation/Document properties that are not encoded in the objects + themselves. This is to allow apps to access in a more intuitive way, without having too much hassle to iterate views and manually collect the properties. - + 1. This will read in existing *view*-scoped properties from *contains* metadata and attach them to the corresponding ``Annotation`` objects. - 1. This will read in existing ``Annotation`` typed annotations and - attach the document-level properties to the ``Document`` objects, - using an ephemeral property dict. - + 1. This will read in existing ``Annotation`` typed annotations and + attach the document-level properties to the ``Document`` objects, + using an ephemeral property dict. + """ super()._deserialize(input_dict) for view in self.views: view._parent_mmif = self - # this dict will be populated with properties + # this dict will be populated with properties # that are not encoded in individual annotations objects themselves extrinsic_props = defaultdict(dict) for at_type, type_lv_props in view.metadata.contains.items(): @@ -231,7 +234,7 @@ def _deserialize(self, input_dict: dict) -> None: # as "ephemeral" properties for prop_key, prop_value in extrinsic_props[ann.at_type].items(): ann._props_ephemeral[prop_key] = prop_value - # then, do the same to associated Document objects. Note that, + # then, do the same to associated Document objects. Note that, # in a view, it is guaranteed that all Annotation objects are not duplicates if ann.at_type == AnnotationTypes.Annotation: doc_id = ann.get_property('document') @@ -241,7 +244,7 @@ def _deserialize(self, input_dict: dict) -> None: except KeyError: warnings.warn(f"Annotation {ann.id} (in view {view.id}) has a document ID {doc_id} that " f"does not exist in the MMIF object. Skipping.", RuntimeWarning) - + ## caching start and end points for time-based annotations # add quick access to `start` and `end` values if the annotation is using `targets` property if 'targets' in ann.properties: @@ -250,24 +253,20 @@ def _deserialize(self, input_dict: dict) -> None: f"properties at the same time. Annotation anchors are ambiguous.") ann._props_ephemeral['start'] = self._get_linear_anchor_point(ann, start=True) ann._props_ephemeral['end'] = self._get_linear_anchor_point(ann, start=False) - + ## caching alignments if ann.at_type == AnnotationTypes.Alignment: self._cache_alignment(ann) - + def _cache_alignment(self, alignment_ann: Annotation): view = self.views.get(alignment_ann.parent) if view is None: - warnings.warn(f"Alignment {alignment_ann.long_id} doesn't have a parent view, but it should.", RuntimeWarning) + warnings.warn(f"Alignment {alignment_ann.id} doesn't have a parent view, but it should.", RuntimeWarning) return ## caching alignments - def _desprately_search_annotation_object(ann_short_id): - ann_long_id = f"{view.id}{self.id_delimiter}{ann_short_id}" - try: - return self.__getitem__(ann_long_id) - except KeyError: - return self.__getitem__(ann_short_id) + def _desprately_search_annotation_object(ann_id): + return self.__getitem__(ann_id) if all(map(lambda x: x in alignment_ann.properties, ('source', 'target'))): source_ann = _desprately_search_annotation_object(alignment_ann.get('source')) @@ -277,36 +276,36 @@ def _desprately_search_annotation_object(ann_short_id): target_ann._cache_alignment(alignment_ann, source_ann) else: warnings.warn( - f"Alignment {alignment_ann.long_id} has `source` and `target` properties that do not point to Annotation objects.", + f"Alignment {alignment_ann.id} has `source` and `target` properties that do not point to Annotation objects.", RuntimeWarning) def generate_capital_annotations(self): """ - Automatically convert any "pending" temporary properties from - `Document` objects to `Annotation` objects . The generated `Annotation` - objects are then added to the last `View` in the views lists. - + Automatically convert any "pending" temporary properties from + `Document` objects to `Annotation` objects . The generated `Annotation` + objects are then added to the last `View` in the views lists. + See https://github.com/clamsproject/mmif-python/issues/226 for rationale behind this behavior and discussion. """ # this view will be the default kitchen sink for all generated annotations. last_view = self.views.get_last_contentful_view() - + # proceed only when there's at least one view if last_view: - + # this app name is used to check a view is generated by the "currently running" app. - # knowing the currently running app is important so that properties of `Document` objects generated by the - # current app can be properly recorded inside the `Document` objects (since they are "writable" to the + # knowing the currently running app is important so that properties of `Document` objects generated by the + # current app can be properly recorded inside the `Document` objects (since they are "writable" to the # current app), instead of being recorded in a separate `Annotation` object. current_app = last_view.metadata.app # to avoid duplicate property recording, this will be populated with # existing Annotation objects from all existing views existing_anns = defaultdict(lambda: defaultdict(dict)) - # ideally, if we can "de-duplicate" props at `add_property()` time, that'd be more efficient, + # ideally, if we can "de-duplicate" props at `add_property()` time, that'd be more efficient, # but that is impossible without looking for the target `document` across other views and top documents list - + # new properties to record in the current serialization call anns_to_write = defaultdict(dict) for view in self.views: @@ -323,10 +322,10 @@ def generate_capital_annotations(self): doc_id = f"{view.id}{Mmif.id_delimiter}{doc_id}" existing_anns[doc_id].update(ann.properties) for doc in view.get_documents(): - anns_to_write[doc.long_id].update(doc._props_pending) + anns_to_write[doc.id].update(doc._props_pending) for doc in self.documents: - anns_to_write[doc.long_id].update(doc._props_pending) - # additional iteration of views, to find a proper view to add the + anns_to_write[doc.id].update(doc._props_pending) + # additional iteration of views, to find a proper view to add the # generated annotations. If none found, use the last view as the kitchen sink last_view_for_docs = defaultdict(lambda: last_view) doc_ids = set(anns_to_write.keys()) @@ -345,8 +344,8 @@ def generate_capital_annotations(self): last_view_for_docs[doc_id] = view break for doc_id, found_props in anns_to_write.items(): - # ignore the "empty" id property from temporary dict - # `id` is "required" attribute for `AnnotationProperty` class + # ignore the "empty" id property from temporary dict + # `id` is "required" attribute for `AnnotationProperty` class # thus will always be present in `props` dict as a key with emtpy value # also ignore duplicate k-v pairs props = {} @@ -371,10 +370,10 @@ def sanitize(self): """ Sanitizes a Mmif object by running some safeguards. Concretely, it performs the following before returning the JSON string. - + #. validating output using built-in MMIF jsonschema #. remove non-existing annotation types from ``contains`` metadata - + """ for view in self.views: existing_at_types = set(annotation.at_type for annotation in view.annotations) @@ -532,7 +531,7 @@ def get_document_by_id(self, doc_id: str) -> Document: vid, did = doc_id.split(self.id_delimiter) view = self[vid] if isinstance(view, View): - return view.get_document_by_id(did) + return view.get_document_by_id(did) else: raise KeyError("{} view not found".format(vid)) else: @@ -567,7 +566,7 @@ def get_alignments(self, at_type1: Union[str, ThingTypesBase], at_type2: Union[s alignments = [] contains_meta = alignment_view.metadata.contains[AnnotationTypes.Alignment] if 'sourceType' in contains_meta and 'targetType' in contains_meta: - aligned_types = [ThingTypesBase.from_str(x) + aligned_types = [ThingTypesBase.from_str(x) for x in {contains_meta['sourceType'], contains_meta['targetType']}] if len(aligned_types) == 2 and at_type1 in aligned_types and at_type2 in aligned_types: alignments.extend(alignment_view.annotations) @@ -602,7 +601,7 @@ def get_views_for_document(self, doc_id: str) -> List[View]: next(annotations) views.append(view) except StopIteration: - # means search failed by the full doc_id string, + # means search failed by the full doc_id string, # now try trimming the view_id from the string and re-do the search if self.id_delimiter in doc_id: vid, did = doc_id.split(self.id_delimiter) @@ -618,14 +617,14 @@ def get_views_for_document(self, doc_id: str) -> List[View]: def get_all_views_with_error(self) -> List[View]: """ - Returns the list of all views in the MMIF that have errors. - + Returns the list of all views in the MMIF that have errors. + :return: the list of views that contain errors but no annotations """ return [v for v in self.views if v.has_error()] - + get_views_with_error = get_all_views_with_error - + def get_all_views_contain(self, *at_types: Union[ThingTypesBase, str]) -> List[View]: """ Returns the list of all views in the MMIF if given types @@ -638,22 +637,22 @@ def get_all_views_contain(self, *at_types: Union[ThingTypesBase, str]) -> List[V if all(map(lambda x: x in view.metadata.contains, at_types))] get_views_contain = get_all_views_contain - + def get_view_with_error(self) -> Optional[View]: """ Returns the last view appended that contains an error. - + :return: the view, or None if no error is found """ for view in reversed(self.views): if view.has_error(): return view return None - + def get_last_error(self) -> Optional[str]: """ Returns the last error message found in the views. - + :return: the error message in human-readable format, or None if no error is found """ v = self.get_view_with_error() @@ -680,7 +679,7 @@ def get_view_contains(self, at_types: Union[ThingTypesBase, str, List[Union[str, def _is_in_time_range(self, ann: Annotation, range_s: Union[int, float], range_e: Union[int, float]) -> bool: """ - Checks if the annotation is anchored within the given time range. Any overlap is considered included. + Checks if the annotation is anchored within the given time range. Any overlap is considered included. :param ann: the Annotation object to check, must be time-based itself or anchored to time-based annotations :param range_s: the start time point of the range (in milliseconds) @@ -705,7 +704,7 @@ def get_annotations_between_time(self, start: Union[int, float], end: Union[int, assert start < end, f"Start time point must be smaller than the end time point, given {start} and {end}" assert start >= 0, f"Start time point must be non-negative, given {start}" assert end >= 0, f"End time point must be non-negative, given {end}" - + from mmif.utils.timeunit_helper import convert time_anchors_in_range = [] @@ -713,7 +712,7 @@ def get_annotations_between_time(self, start: Union[int, float], end: Union[int, for view in self.get_all_views_contain(AnnotationTypes.TimeFrame) + self.get_all_views_contain(AnnotationTypes.TimePoint): time_unit_in_view = view.metadata.contains.get(AnnotationTypes.TimeFrame)["timeUnit"] - + start_time = convert(start, time_unit, time_unit_in_view, 1) end_time = convert(end, time_unit, time_unit_in_view, 1) for ann in view.get_annotations(): @@ -730,9 +729,9 @@ def get_annotations_between_time(self, start: Union[int, float], end: Union[int, def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: bool = True) -> Union[int, float]: # TODO (krim @ 2/5/24): Update the return type once timeunits are unified to `ms` as integers (https://github.com/clamsproject/mmif/issues/192) """ - Retrieves the anchor point of the annotation. Currently, this method only supports linear anchors, + Retrieves the anchor point of the annotation. Currently, this method only supports linear anchors, namely time and text, hence does not work with spatial anchors (polygons or video-object). - + :param ann: An Annotation object that has a linear anchor point. Namely, some subtypes of `Region` vocabulary type. :param start: If True, returns the start anchor point. Otherwise, returns the end anchor point. N/A for `timePoint` anchors. :param targets_sorted: If True, the method will assume that the targets are sorted in the order of the anchor points. @@ -742,12 +741,12 @@ def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: if 'timePoint' in props: return ann.get_property('timePoint') elif 'targets' in props: - + def get_target_ann(cur_ann, target_id): if self.id_delimiter not in target_id: target_id = self.id_delimiter.join((cur_ann.parent, target_id)) return self.__getitem__(target_id) - + if not targets_sorted: point = math.inf if start else -1 comp = min if start else max @@ -762,13 +761,13 @@ def get_target_ann(cur_ann, target_id): return ann.get_property('start' if start else 'end') else: raise ValueError(f"{ann.id} ({ann.at_type}) does not have a valid anchor point. Is it a valid 'Region' type?") - + def get_start(self, annotation: Annotation) -> Union[int, float]: """ An alias to `get_anchor_point` method with `start=True`. """ return self._get_linear_anchor_point(annotation, start=True) - + def get_end(self, annotation: Annotation) -> Union[int, float]: """ An alias to `get_anchor_point` method with `start=False`. @@ -778,12 +777,12 @@ def get_end(self, annotation: Annotation) -> Union[int, float]: def __getitem__(self, item: str) \ -> Union[Document, View, Annotation, MmifMetadata, DocumentsList, ViewsList]: """ - getitem implementation for Mmif. This will try to find any object, given an identifier or an immediate - attribute name. When nothing is found, this will raise an error rather than returning a None + getitem implementation for Mmif. This will try to find any object, given an identifier or an immediate + attribute name. When nothing is found, this will raise an error rather than returning a None :raises KeyError: if the item is not found or if the search results are ambiguous - :param item: an attribute name or an object identifier (a document ID, a view ID, or an annotation ID). When - annotation ID is given as a "short" ID (without view ID prefix), the method will try to find a + :param item: an attribute name or an object identifier (a document ID, a view ID, or an annotation ID). When + annotation ID is given as a "short" ID (without view ID prefix), the method will try to find a match from the first view, and return immediately if found. :return: the object searched for :raise KeyError: if the item is not found or multiple objects are found with the same ID diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index 58413f41..233d1b65 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -37,7 +37,7 @@ def __init__(self, view_obj: Optional[Union[bytes, str, dict]] = None, parent_mm self._parent_mmif = parent_mmif self.reserved_names.update(("_parent_mmif", "_id_counts")) self._exclude_from_diff = {"_id_counts"} - + self.id: str = '' self.metadata: ViewMetadata = ViewMetadata() self.annotations: AnnotationsList = AnnotationsList() @@ -63,18 +63,21 @@ def new_contain(self, at_type: Union[str, ThingTypesBase], **contains_metadata) raise ValueError("@type must not be empty.") else: return self.metadata.new_contain(at_type, **contains_metadata) - + def _set_ann_id(self, annotation: Annotation, identifier): if identifier is not None: - annotation.id = identifier + if self.id_delimiter in identifier: + annotation.id = identifier + else: + annotation.id = f"{self.id}{self.id_delimiter}{identifier}" else: prefix = annotation.at_type.get_prefix() new_num = self._id_counts.get(prefix, 0) + 1 new_id = f'{prefix}_{new_num}' self._id_counts[prefix] = new_num - annotation.id = new_id - - def new_annotation(self, at_type: Union[str, ThingTypesBase], aid: Optional[str] = None, + annotation.id = f"{self.id}{self.id_delimiter}{new_id}" + + def new_annotation(self, at_type: Union[str, ThingTypesBase], aid: Optional[str] = None, overwrite=False, **properties) -> 'Annotation': """ Generates a new :class:`mmif.serialize.annotation.Annotation` @@ -84,9 +87,9 @@ def new_annotation(self, at_type: Union[str, ThingTypesBase], aid: Optional[str] in the view, unless ``overwrite`` is set to True. :param at_type: the desired ``@type`` of the annotation. - :param aid: the desired ID of the annotation, when not given, - the mmif SDK tries to automatically generate an ID based on - Annotation type and existing annotations in the view. + :param aid: the desired ID of the annotation, when not given, + the mmif SDK tries to automatically generate an ID based on + Annotation type and existing annotations in the view. :param overwrite: if set to True, will overwrite an existing annotation with the same ID. :raises KeyError: if ``overwrite`` is set to False and @@ -125,8 +128,8 @@ def add_annotation(self, annotation: 'Annotation', overwrite=False) -> 'Annotati if annotation.at_type == AnnotationTypes.Alignment: self._parent_mmif._cache_alignment(annotation) return annotation - - def new_textdocument(self, text: str, lang: str = "en", did: Optional[str] = None, + + def new_textdocument(self, text: str, lang: str = "en", did: Optional[str] = None, overwrite=False, **properties) -> 'Document': """ Generates a new :class:`mmif.serialize.annotation.Document` @@ -137,9 +140,9 @@ def new_textdocument(self, text: str, lang: str = "en", did: Optional[str] = Non :param text: text content of the new document :param lang: ISO 639-1 code of the language used in the new document - :param did: the desired ID of the document, when not given, - the mmif SDK tries to automatically generate an ID based on - Annotation type and existing documents in the view. + :param did: the desired ID of the document, when not given, + the mmif SDK tries to automatically generate an ID based on + Annotation type and existing documents in the view. :param overwrite: if set to True, will overwrite an existing document with the same ID :raises KeyError: if ``overwrite`` is set to False and @@ -170,7 +173,7 @@ def add_document(self, document: Document, overwrite=False) -> Annotation: """ return self.add_annotation(document, overwrite) - def get_annotations(self, at_type: Optional[Union[str, ThingTypesBase]] = None, + def get_annotations(self, at_type: Optional[Union[str, ThingTypesBase]] = None, **properties) -> Generator[Annotation, None, None]: """ Look for certain annotations in this view, specified by parameters @@ -186,26 +189,31 @@ def prop_check(k, v, *props): for annotation in self.annotations: at_type_metadata = self.metadata.contains.get(annotation.at_type, {}) if not at_type or (at_type and annotation.at_type == at_type): - if all(map(lambda kv: prop_check(kv[0], kv[1], annotation.properties, at_type_metadata), + if all(map(lambda kv: prop_check(kv[0], kv[1], annotation.properties, at_type_metadata), properties.items())): yield annotation - + def get_annotation_by_id(self, ann_id) -> Annotation: - if self.id_delimiter in ann_id and not ann_id.startswith(self.id): - try: - ann_found = self._parent_mmif[ann_id] - except KeyError: - ann_found = None - else: - ann_found = self.annotations.get(ann_id.split(self.id_delimiter)[-1]) - if ann_found is None or not isinstance(ann_found, Annotation): - if self.id_delimiter in ann_id: - raise KeyError(f"Annotation \"{ann_id}\" is not found in the MMIF.") - else: - raise KeyError(f"Annotation \"{ann_id}\" is not found in view {self.id}.") + # if self.id_delimiter in ann_id and not ann_id.startswith(self.id): + # try: + # ann_found = self._parent_mmif[ann_id] + # except KeyError: + # ann_found = None + # else: + # ann_found = self.annotations.get(ann_id.split(self.id_delimiter)[-1]) + # if ann_found is None or not isinstance(ann_found, Annotation): + # if self.id_delimiter in ann_id: + # raise KeyError(f"Annotation \"{ann_id}\" is not found in the MMIF.") + # else: + # raise KeyError(f"Annotation \"{ann_id}\" is not found in view {self.id}.") + # else: + # return ann_found + ann_found = self.annotations.get(ann_id) + if not ann_found or not isinstance(ann_found, Annotation): + raise KeyError(f"Annotation \"{ann_id}\" is not found in the MMIF.") else: return ann_found - + def get_documents(self) -> List[Document]: return [cast(Document, annotation) for annotation in self.annotations if annotation.is_document()] @@ -238,15 +246,15 @@ def __getitem__(self, key: str) -> 'Annotation': if not anno_result: raise KeyError("Annotation ID not found: %s" % key) return anno_result - + def set_error(self, err_message: str, err_trace: str) -> None: self.metadata.set_error(err_message, err_trace) self.annotations.empty() - + def get_error(self) -> Optional[str]: """ - Get the "text" representation of the error occurred during - processing. Text representation is supposed to be human-readable. + Get the "text" representation of the error occurred during + processing. Text representation is supposed to be human-readable. When ths view does not have any error, returns None. """ if self.has_error(): @@ -283,26 +291,26 @@ def __init__(self, viewmetadata_obj: Optional[Union[bytes, str, dict]] = None, * 'contains': ContainsDict } # in theory, *oneOf* `contains`, `error`, or `warnings` should appear in a `view` - # but with current implementation, there's no easy way to set a condition - # for `oneOf` requirement - # see MmifObject::_required_attributes in model.py + # but with current implementation, there's no easy way to set a condition + # for `oneOf` requirement + # see MmifObject::_required_attributes in model.py # also see this class' `_serialize()` override implementation super().__init__(viewmetadata_obj) def _serialize(self, alt_container: Optional[Dict] = None) -> dict: serialized = super()._serialize() - # `_serialize()` eliminates any *empty* attributes, so + # `_serialize()` eliminates any *empty* attributes, so # when no "contains", "errors", nor "warnings", at least add an empty contains back if not (self.contains.items() or self.error or self.warnings): serialized['contains'] = {} return serialized - + def has_error(self) -> bool: return len(self.error) > 0 - + def has_warnings(self): return len(self.warnings) > 0 - + def get_error_as_text(self) -> str: if self.has_error(): if isinstance(self.error, ErrorDict): @@ -313,7 +321,7 @@ def get_error_as_text(self) -> str: return f"Error (unknown error format): {self.error}" else: raise KeyError(f"No error found") - + def new_contain(self, at_type: Union[str, ThingTypesBase], **contains_metadata) -> Optional['Contain']: """ Adds a new element to the ``contains`` dictionary. @@ -324,12 +332,12 @@ def new_contain(self, at_type: Union[str, ThingTypesBase], **contains_metadata) """ if isinstance(at_type, str): at_type = ThingTypesBase.from_str(at_type) - + if at_type not in self.contains: new_contain = Contain(contains_metadata) self.add_contain(new_contain, at_type) return new_contain - + def add_contain(self, contain: 'Contain', at_type: Union[str, ThingTypesBase]) -> None: self.contains[at_type] = contain @@ -371,11 +379,11 @@ def get_parameter(self, param_key: str) -> str: return self.parameters[param_key] except KeyError: raise KeyError(f"parameter \"{param_key}\" is not set in the view: {self.serialize()}") - + def set_error(self, message: str, stack_trace: str): self.error = ErrorDict({"message": message, "stackTrace": stack_trace}) self.contains.empty() - + def add_warnings(self, *warnings: Warning): for warning in warnings: self.warnings.append(f'{warning.__class__.__name__}: {" - ".join(warning.args)}') @@ -386,16 +394,16 @@ def emtpy_warnings(self): class ErrorDict(MmifObject): """ - Error object that stores information about error occurred during processing. + Error object that stores information about error occurred during processing. """ def __init__(self, error_obj: Optional[Union[bytes, str, dict]] = None, *_) -> None: self.message: str = '' self.stackTrace: str = '' super().__init__(error_obj) - + def __str__(self): return f"({self.message})\n\n{self.stackTrace}" - + class Contain(DataDict[str, str]): """ @@ -419,6 +427,9 @@ def _deserialize(self, input_list: list) -> None: # pytype: disable=signature-m :param input_list: the JSON data that defines the list of annotations :return: None """ + # Note: + # In MMIF 1.x, we still assume that annotation id lives in properties of MMIF JSON + # TODO: However, in MMIF 2.x, annotation id would be assumed live outside of MMIF JSON self._items = {item['properties']['id']: Document(item) if ClamsTypesBase.attype_iri_isdocument(item['_type']) else Annotation(item) for item in input_list} @@ -440,15 +451,15 @@ def append(self, value: Union[Annotation, Document], overwrite=False) -> None: :return: None """ super()._append_with_key(value.id, value, overwrite) - + def __getitem__(self, key: str): """ specialized getter implementation to workaround https://github.com/clamsproject/mmif/issues/228 - # TODO (krim @ 7/12/24): annotation ids must be in the long form in the future, so this check will be unnecessary once https://github.com/clamsproject/mmif/issues/228 is resolved. + # TODO (krim @ 7/12/24): annotation ids must be in the long form in the future, so this check will be unnecessary once https://github.com/clamsproject/mmif/issues/228 is resolved. """ - if ":" in key: - _, aid = key.split(":") - return self._items.__getitem__(aid) + # if ":" in key: + # _, aid = key.split(":") + # return self._items.__getitem__(aid) return self._items.get(key, None) @@ -465,12 +476,12 @@ def update(self, other: Union[dict, 'ContainsDict'], overwrite=False): if isinstance(k, str): k = ThingTypesBase.from_str(k) self._append_with_key(k, v, overwrite=overwrite) - + def get(self, key: Union[str, ThingTypesBase], default=None): if isinstance(key, str): key = ThingTypesBase.from_str(key) return self._items.get(key, default) - + def __contains__(self, item: Union[str, ThingTypesBase]): if isinstance(item, str): string_keys = [str(k) for k in self._items.keys()] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b