From 1ac7b566d49cd31512b259048ed6cd6aa8ec98ed Mon Sep 17 00:00:00 2001 From: Andrew Davison Date: Fri, 18 Oct 2024 17:18:16 +0200 Subject: [PATCH] Some optimization of the `exists()` method: use a simpler query, and allow restricting the query to specific spaces. --- fairgraph/client.py | 3 ++ fairgraph/kgobject.py | 56 +++++++++++++++++++++++---- fairgraph/openminds/core/data/file.py | 2 +- fairgraph/openminds/core/data/hash.py | 1 + test/utils.py | 2 +- 5 files changed, 54 insertions(+), 10 deletions(-) diff --git a/fairgraph/client.py b/fairgraph/client.py index fbd6884c..5017d112 100644 --- a/fairgraph/client.py +++ b/fairgraph/client.py @@ -174,6 +174,7 @@ def query( scope: str = "released", id_key: str = "@id", use_stored_query: bool = False, + restrict_to_spaces: Optional[List[str]] = None ) -> ResultPage[JsonLdDocument]: """ Execute a Knowledge Graph (KG) query with the given filters and query definition. @@ -204,6 +205,7 @@ def _query(scope, from_index, size): stage=STAGE_MAP[scope], pagination=Pagination(start=from_index, size=size), instance_id=instance_id, + restrict_to_spaces=restrict_to_spaces ) error_context = f"_query(scope={scope} query_id={query_id} filter={filter} instance_id={instance_id} size={size} from_index={from_index})" return self._check_response(response, error_context=error_context) @@ -217,6 +219,7 @@ def _query(scope, from_index, size): stage=STAGE_MAP[scope], pagination=Pagination(start=from_index, size=size), instance_id=instance_id, + restrict_to_spaces=restrict_to_spaces ) error_context = f"_query(scope={scope} query_id={query_id} filter={filter} instance_id={instance_id} size={size} from_index={from_index})" return self._check_response(response, error_context=error_context) diff --git a/fairgraph/kgobject.py b/fairgraph/kgobject.py index ca1c856b..0a9e9076 100644 --- a/fairgraph/kgobject.py +++ b/fairgraph/kgobject.py @@ -35,7 +35,7 @@ have_tabulate = False from .utility import expand_uri, as_list, expand_filter, ActivityLog from 
.registry import lookup_type -from .queries import Query +from .queries import Query, QueryProperty from .errors import AuthorizationError, ResourceExistsError, CannotBuildExistenceQuery from .caching import object_cache, save_cache, generate_cache_key from .base import RepresentsSingleObject, ContainsMetadata, SupportsQuerying, IRI, JSONdict @@ -470,7 +470,12 @@ def diff(self, other): differences["properties"][prop.name] = (val_self, val_other) return differences - def exists(self, client: KGClient, ignore_duplicates: bool = False) -> bool: + def exists( + self, + client: KGClient, + ignore_duplicates: bool = False, + in_spaces: Optional[List[str]] = None + ) -> bool: """Check if this object already exists in the KnowledgeGraph""" if self.id: @@ -509,12 +514,12 @@ def exists(self, client: KGClient, ignore_duplicates: bool = False) -> bool: self.remote_data = cached_obj.remote_data # copy or update needed? return True - query = self.__class__.generate_query( - space=None, + query = self.__class__.generate_minimal_query( client=client, filters=query_filter, ) - instances = client.query(query=query, size=2, scope="any").data + + instances = client.query(query=query, size=2, scope="any", restrict_to_spaces=in_spaces).data if instances: if len(instances) > 1 and not ignore_duplicates: @@ -527,10 +532,10 @@ def exists(self, client: KGClient, ignore_duplicates: bool = False) -> bool: if instance is None: return False - self.id = instances[0]["@id"] + self.id = instance["@id"] assert isinstance(self.id, str) save_cache[self.__class__][query_cache_key] = self.id - self._update_empty_properties(instances[0], client) # also updates `remote_data` + self._update_empty_properties(instance, client) # also updates `remote_data` return bool(instances) def modified_data(self) -> JSONdict: @@ -619,7 +624,7 @@ def save( else: space = self.space logger.info(f"Saving a {self.__class__.__name__} in space {space}") - if self.exists(client, ignore_duplicates=ignore_duplicates): + if 
self.exists(client, ignore_duplicates=ignore_duplicates, in_spaces=[space]): if not self.allow_update: logger.info(f" - not updating {self.__class__.__name__}(id={self.id}), update not allowed by user") if activity_log: @@ -840,6 +845,41 @@ def generate_query( # than necessary, but it makes the logic easier to understand. return query.serialize() + @classmethod + def generate_minimal_query( + cls, + client: KGClient, + filters: Optional[Dict[str, Any]] = None, + label: Optional[str] = None, + ) -> Union[Dict[str, Any], None]: + """ + Generate a minimal KG query definition as a JSON-LD document. + Such a query returns only the @id of any instances that are found. + + Args: + client: KGClient object that handles the communication with the KG. + filters (dict): A dictionary defining search parameters for the query. + label (str, optional): a label for the query + + Returns: + A JSON-LD document containing the KG query definition. + + """ + if filters: + normalized_filters = cls.normalize_filter(expand_filter(filters)) + else: + normalized_filters = None + # first pass, we build the basic structure + query = Query( + node_type=cls.type_, + label=label, + space=None, + properties=[QueryProperty("@type")], + ) + # second pass, we add filters + query.properties.extend(cls.generate_query_filter_properties(normalized_filters)) + return query.serialize() + def children( self, client: KGClient, follow_links: Optional[Dict[str, Any]] = None ) -> List[RepresentsSingleObject]: diff --git a/fairgraph/openminds/core/data/file.py b/fairgraph/openminds/core/data/file.py index d058c005..c05b2bfa 100644 --- a/fairgraph/openminds/core/data/file.py +++ b/fairgraph/openminds/core/data/file.py @@ -240,7 +240,7 @@ class File(KGObject): ), ] aliases = {"hash": "hashes"} - existence_query_properties = ("iri", "hash") + existence_query_properties = ("iri", "hashes") def __init__( self, diff --git a/fairgraph/openminds/core/data/hash.py b/fairgraph/openminds/core/data/hash.py index 
2d5ce4a6..c4acb775 100644 --- a/fairgraph/openminds/core/data/hash.py +++ b/fairgraph/openminds/core/data/hash.py @@ -34,6 +34,7 @@ class Hash(EmbeddedMetadata): ), ] reverse_properties = [] + existence_query_properties = ("algorithm", "digest") def __init__(self, algorithm=None, digest=None, id=None, data=None, space=None, scope=None): return super().__init__(data=data, algorithm=algorithm, digest=digest) diff --git a/test/utils.py b/test/utils.py index 8f14a208..8f523fe2 100644 --- a/test/utils.py +++ b/test/utils.py @@ -62,7 +62,7 @@ def instance_from_full_uri( else: raise NotImplementedError - def query(self, query, filter=None, space=None, size=100, from_index=0, scope="released"): + def query(self, query, filter=None, space=None, size=100, from_index=0, scope="released", restrict_to_spaces=None): for prop in query["structure"]: if prop.get("propertyName", "") in ("Qname", "Qfull_name"): filter_value = prop["filter"]["value"]