Skip to content

Commit be402cc

Browse files
authored
Add parents/ancestry methods (#231)
Fetch entity parents: a new `NodeEndpoint` method, `fetch_entity_parents`, which uses `containedInPlace` to get the immediate parents of a given entity or list of entities. Fetch entity ancestry: a new `NodeEndpoint` method, `fetch_entity_ancestry`, which performs a parallel breadth-first traversal to construct the full ancestry chain for an entity — all the way up to Earth — and returns it in either flat or nested form.
1 parent 78010ac commit be402cc

File tree

7 files changed

+759
-92
lines changed

7 files changed

+759
-92
lines changed

Diff for: datacommons_client/endpoints/node.py

+124-13
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
1+
from concurrent.futures import ThreadPoolExecutor
12
from typing import Optional
23

34
from datacommons_client.endpoints.base import API
45
from datacommons_client.endpoints.base import Endpoint
56
from datacommons_client.endpoints.payloads import NodeRequestPayload
67
from datacommons_client.endpoints.payloads import normalize_properties_to_string
78
from datacommons_client.endpoints.response import NodeResponse
9+
from datacommons_client.models.node import Node
10+
from datacommons_client.utils.graph import build_ancestry_map
11+
from datacommons_client.utils.graph import build_ancestry_tree
12+
from datacommons_client.utils.graph import fetch_parents_lru
13+
from datacommons_client.utils.graph import flatten_ancestry
14+
15+
ANCESTRY_MAX_WORKERS = 10
816

917

1018
class NodeEndpoint(Endpoint):
@@ -91,10 +99,12 @@ def fetch_property_labels(
9199
expression = "->" if out else "<-"
92100

93101
# Make the request and return the response.
94-
return self.fetch(node_dcids=node_dcids,
95-
expression=expression,
96-
all_pages=all_pages,
97-
next_token=next_token)
102+
return self.fetch(
103+
node_dcids=node_dcids,
104+
expression=expression,
105+
all_pages=all_pages,
106+
next_token=next_token,
107+
)
98108

99109
def fetch_property_values(
100110
self,
@@ -143,10 +153,12 @@ def fetch_property_values(
143153
if constraints:
144154
expression += f"{{{constraints}}}"
145155

146-
return self.fetch(node_dcids=node_dcids,
147-
expression=expression,
148-
all_pages=all_pages,
149-
next_token=next_token)
156+
return self.fetch(
157+
node_dcids=node_dcids,
158+
expression=expression,
159+
all_pages=all_pages,
160+
next_token=next_token,
161+
)
150162

151163
def fetch_all_classes(
152164
self,
@@ -174,8 +186,107 @@ def fetch_all_classes(
174186
```
175187
"""
176188

177-
return self.fetch_property_values(node_dcids="Class",
178-
properties="typeOf",
179-
out=False,
180-
all_pages=all_pages,
181-
next_token=next_token)
189+
return self.fetch_property_values(
190+
node_dcids="Class",
191+
properties="typeOf",
192+
out=False,
193+
all_pages=all_pages,
194+
next_token=next_token,
195+
)
196+
197+
def fetch_entity_parents(
    self,
    entity_dcids: str | list[str],
    *,
    as_dict: bool = True) -> dict[str, list[Node | dict]]:
    """Fetches the direct parents of one or more entities using the 'containedInPlace' property.

    Args:
        entity_dcids (str | list[str]): A single DCID or a list of DCIDs to query.
        as_dict (bool): If True (the default), every parent is converted to a plain
            dictionary via its `to_dict()` method. If False, the values produced by
            `get_properties()` are returned untouched (Node objects).

    Returns:
        dict[str, list[Node | dict]]: A dictionary mapping each input DCID to its
            immediate parent entities, either as Node objects or as their
            dictionary representation.
    """
    # Fetch property values from the API
    data = self.fetch_property_values(
        node_dcids=entity_dcids,
        properties="containedInPlace",
    ).get_properties()

    if not as_dict:
        return data

    def _serialize(parents):
        # The annotated value type is a list of nodes, and a list has no
        # `to_dict` method, so sequences must be converted element by element.
        if isinstance(parents, (list, tuple)):
            return [parent.to_dict() for parent in parents]
        # NOTE(review): a bare (non-list) value is still serialized directly, as
        # before; confirm whether `get_properties()` can return a single Node.
        return parents.to_dict()

    return {dcid: _serialize(parents) for dcid, parents in data.items()}
226+
227+
def _fetch_parents_cached(self, dcid: str) -> tuple[Node, ...]:
    """Looks up the immediate parents of an entity through `fetch_parents_lru`.

    The work is delegated to the helper `fetch_parents_lru` imported from
    `datacommons_client.utils.graph`, with this endpoint instance passed as its
    first argument. Routing the call through a module-level helper (rather than
    decorating this method) is what lets the lookup be cached, since
    `@lru_cache` cannot be applied directly to instance methods.

    Args:
        dcid (str): The DCID of the entity whose parents should be fetched.

    Returns:
        tuple[Node, ...]: The entity's immediate parents, as returned by
            `fetch_parents_lru`.
    """
    parents = fetch_parents_lru(self, dcid)
    return parents
241+
242+
def fetch_entity_ancestry(
    self,
    entity_dcids: str | list[str],
    as_tree: bool = False,
    *,
    max_concurrent_requests: Optional[int] = ANCESTRY_MAX_WORKERS
) -> dict[str, list[dict[str, str]] | dict]:
    """Fetches the full ancestry (flat or nested) for one or more entities.

    One `build_ancestry_map` task is submitted to a thread pool per input DCID,
    each using `self._fetch_parents_cached` to look up parents. Every resulting
    ancestry map is then post-processed with `build_ancestry_tree` (nested form)
    or `flatten_ancestry` (flat form), depending on `as_tree`.

    Args:
        entity_dcids (str | list[str]): One or more DCIDs of the entities whose
            ancestry will be fetched. A single string is treated as a one-item list.
        as_tree (bool): If True, returns a nested tree structure; otherwise,
            returns a flat list. Defaults to False.
        max_concurrent_requests (Optional[int]): Passed to the thread pool as
            `max_workers`. Defaults to ANCESTRY_MAX_WORKERS.

    Returns:
        dict[str, list[dict[str, str]] | dict]: A dictionary keyed by the DCID
            returned from each `build_ancestry_map` call, whose value is either
            the flattened ancestry (if `as_tree` is False) or the nested
            ancestry tree (if `as_tree` is True).
    """
    dcids = [entity_dcids] if isinstance(entity_dcids, str) else entity_dcids

    ancestries = {}

    # Fan out: one ancestry-map task per requested entity.
    with ThreadPoolExecutor(max_workers=max_concurrent_requests) as pool:
        pending = [
            pool.submit(
                build_ancestry_map,
                root=dcid,
                fetch_fn=self._fetch_parents_cached,
            ) for dcid in dcids
        ]

        # Collect in submission order and shape each map as requested.
        for job in pending:
            root, parent_map = job.result()
            ancestries[root] = (build_ancestry_tree(root, parent_map)
                                if as_tree else flatten_ancestry(parent_map))

    return ancestries

Diff for: datacommons_client/endpoints/response.py

+1-38
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
from dataclasses import asdict
21
from dataclasses import dataclass
32
from dataclasses import field
4-
import json
53
from typing import Any, Dict, List
64

75
from datacommons_client.models.node import Arcs
@@ -15,42 +13,7 @@
1513
from datacommons_client.models.resolve import Entity
1614
from datacommons_client.utils.data_processing import flatten_properties
1715
from datacommons_client.utils.data_processing import observations_as_records
18-
19-
20-
class SerializableMixin:
    """Mixin that adds dict and JSON serialization to dataclass instances."""

    def to_dict(self, exclude_none: bool = True) -> Dict[str, Any]:
        """Converts the instance to a dictionary.

        Args:
            exclude_none: If True, dictionary entries whose value is None are
                dropped at every nesting level. None items inside lists are kept.

        Returns:
            Dict[str, Any]: The dictionary representation of the instance.
        """

        def _prune(value: Any) -> Any:
            """Recursively drops None-valued entries from nested dictionaries."""
            if isinstance(value, list):
                return [_prune(element) for element in value]
            if isinstance(value, dict):
                return {
                    key: _prune(item)
                    for key, item in value.items()
                    if item is not None
                }
            return value

        as_mapping = asdict(self)
        if not exclude_none:
            return as_mapping
        return _prune(as_mapping)

    def to_json(self, exclude_none: bool = True) -> str:
        """Converts the instance to a JSON string.

        Args:
            exclude_none: Forwarded to `to_dict`; if True, None-valued entries
                are left out of the output.

        Returns:
            str: The JSON string representation of the instance, indented by
                two spaces.
        """
        payload = self.to_dict(exclude_none=exclude_none)
        return json.dumps(payload, indent=2)
16+
from datacommons_client.utils.data_processing import SerializableMixin
5417

5518

5619
@dataclass

Diff for: datacommons_client/models/node.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from dataclasses import field
33
from typing import Any, Dict, List, Optional, TypeAlias
44

5+
from datacommons_client.utils.data_processing import SerializableMixin
6+
57
NextToken: TypeAlias = Optional[str]
68
NodeDCID: TypeAlias = str
79
ArcLabel: TypeAlias = str
@@ -10,7 +12,7 @@
1012

1113

1214
@dataclass
13-
class Node:
15+
class Node(SerializableMixin):
1416
"""Represents an individual node in the Data Commons knowledge graph.
1517
1618
Attributes:

0 commit comments

Comments
 (0)