Skip to content

Commit daefc5f

Browse files
Jason Munro authored
Enable downloads from AWS Open Data for all collections (#884)
* Ensure json schema fields are retained
* Add direct s3 querying for regular docs
* Fix timeout passing
* Fix s3 threadsafety
* Fix for latest data re-org
* Use smart open
* Fix deserialization performance
* Switch to s3 for all doc queries
* Fix remaining type issues
* More type fixes
* Remove print
* Fix deprecated field in materials and s3
* Fix type issues
* Fix structure import
* Fix open data json sanitization
* Add projection to multithreaded s3 func
* Remove default s3 query
* Add comments for clarity in s3 func
* Update pbar for s3 download
* Switch grain boundary prefix
* Fix s3 suffix delim
* Fix task retrieval support
* Linting
* Docstring noqa
* Add smart_open to deps
* Linting
* More linting
* Add missing docstring
* Docstring arg rename
* Fix tests
* Fix bs s3 query
* Fix chgcar query
* Fix dos query
* More linting
* Fix s3 keys
* More s3 fixes
* Last s3 query fix
* Linting
* Skip alloys generic test
* Fix alloys skip
* Remove task chemsys query
* Linting
* Fix similarity search name
* More similarity fixes
* Remove fermi rester
* Remove fermi ref
* More fermi fixes
* Linting
* Fix s3 decoding
* Add deprecated filter for s3
* Fix object key pagination

---------

Co-authored-by: Jason Munro <[email protected]>
1 parent 8724ebe commit daefc5f

File tree

13 files changed

+433
-216
lines changed

13 files changed

+433
-216
lines changed

mp_api/client/core/client.py

Lines changed: 281 additions & 101 deletions
Large diffs are not rendered by default.

mp_api/client/core/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def api_sanitize(
7373
model_fields_to_leave = {f[1] for f in fields_tuples if model.__name__ == f[0]}
7474
for name in model.model_fields:
7575
field = model.model_fields[name]
76+
field_json_extra = field.json_schema_extra
7677
field_type = field.annotation
7778

7879
if field_type is not None and allow_dict_msonable:
@@ -87,6 +88,7 @@ def api_sanitize(
8788
new_field = FieldInfo.from_annotated_attribute(
8889
Optional[field_type], None
8990
)
91+
new_field.json_schema_extra = field_json_extra or {}
9092
model.model_fields[name] = new_field
9193

9294
model.model_rebuild(force=True)

mp_api/client/mprester.py

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
from __future__ import annotations
22

33
import itertools
4+
import json
5+
import os
46
import warnings
57
from functools import cache, lru_cache
68
from json import loads
7-
from os import environ
89
from typing import Literal
910

1011
from emmet.core.electronic_structure import BSPathType
1112
from emmet.core.mpid import MPID
1213
from emmet.core.settings import EmmetSettings
1314
from emmet.core.tasks import TaskDoc
1415
from emmet.core.vasp.calc_types import CalcType
16+
from monty.json import MontyDecoder
1517
from packaging import version
1618
from pymatgen.analysis.phase_diagram import PhaseDiagram
1719
from pymatgen.analysis.pourbaix_diagram import IonEntry
@@ -39,7 +41,6 @@
3941
ElectrodeRester,
4042
ElectronicStructureRester,
4143
EOSRester,
42-
FermiRester,
4344
GrainBoundaryRester,
4445
MagnetismRester,
4546
OxidationStatesRester,
@@ -66,10 +67,12 @@
6667
)
6768

6869
_EMMET_SETTINGS = EmmetSettings() # type: ignore
69-
_MAPI_SETTINGS = MAPIClientSettings() # type: ignore
70+
_MAPI_SETTINGS = MAPIClientSettings()  # type: ignore
7071

71-
DEFAULT_API_KEY = environ.get("MP_API_KEY", None)
72-
DEFAULT_ENDPOINT = environ.get("MP_API_ENDPOINT", "https://api.materialsproject.org/")
72+
DEFAULT_API_KEY = os.environ.get("MP_API_KEY", None)
73+
DEFAULT_ENDPOINT = os.environ.get(
74+
"MP_API_ENDPOINT", "https://api.materialsproject.org/"
75+
)
7376

7477

7578
class MPRester:
@@ -86,8 +89,7 @@ class MPRester:
8689
similarity: SimilarityRester
8790
tasks: TaskRester
8891
xas: XASRester
89-
fermi: FermiRester
90-
grain_boundary: GrainBoundaryRester
92+
grain_boundaries: GrainBoundaryRester
9193
substrates: SubstratesRester
9294
surface_properties: SurfacePropertiesRester
9395
phonon: PhononRester
@@ -195,7 +197,7 @@ def __init__(
195197
"tasks",
196198
"xas",
197199
"fermi",
198-
"grain_boundary",
200+
"grain_boundaries",
199201
"substrates",
200202
"surface_properties",
201203
"phonon",
@@ -340,7 +342,7 @@ def __molecules_getattr__(_self, attr):
340342
return rester
341343

342344
MaterialsRester.__getattr__ = __materials_getattr__ # type: ignore
343-
MoleculeRester.__getattr__ = __molecules_getattr__ # type: ignore
345+
MoleculeRester.__getattr__ = __molecules_getattr__ # type: ignore
344346

345347
for attr, rester in core_resters.items():
346348
setattr(
@@ -598,14 +600,15 @@ def get_structures(
598600
input_params = {"formula": chemsys_formula}
599601

600602
if final:
601-
return [
602-
doc.structure if self.use_document_model else doc["structure"] # type: ignore
603-
for doc in self.materials.search(
604-
**input_params, # type: ignore
605-
all_fields=False,
606-
fields=["structure"],
607-
)
608-
]
603+
docs = self.materials.search(
604+
**input_params, # type: ignore
605+
all_fields=False,
606+
fields=["structure"],
607+
)
608+
if not self.use_document_model:
609+
return [doc["structure"] for doc in docs] # type: ignore
610+
611+
return [doc.structure for doc in docs] # type: ignore
609612
else:
610613
structures = []
611614

@@ -614,11 +617,12 @@ def get_structures(
614617
all_fields=False,
615618
fields=["initial_structures"],
616619
):
617-
structures.extend(
620+
initial_structures = (
618621
doc.initial_structures # type: ignore
619622
if self.use_document_model
620623
else doc["initial_structures"] # type: ignore
621624
)
625+
structures.extend(initial_structures)
622626

623627
return structures
624628

@@ -1301,7 +1305,7 @@ def get_wulff_shape(self, material_id: str):
13011305
)
13021306
miller_energy_map = {}
13031307
for surf in surfaces:
1304-
miller = tuple(surf.miller_index)
1308+
miller = tuple(surf.miller_index) if surf.miller_index else ()
13051309
# Prefer reconstructed surfaces, which have lower surface energies.
13061310
if (miller not in miller_energy_map) or surf.is_reconstructed:
13071311
miller_energy_map[miller] = surf.surface_energy
@@ -1339,20 +1343,22 @@ def get_charge_density_from_material_id(
13391343
else x["last_updated"], # type: ignore
13401344
)
13411345

1342-
result = (
1346+
decoder = MontyDecoder().decode if self.monty_decode else json.loads
1347+
chgcar = (
13431348
self.tasks._query_open_data(
13441349
bucket="materialsproject-parsed",
1345-
prefix="chgcars",
1346-
key=str(latest_doc.task_id),
1347-
)
1350+
key=f"chgcars/{str(latest_doc.task_id)}.json.gz",
1351+
decoder=decoder,
1352+
fields=["data"],
1353+
)[0]
13481354
or {}
13491355
)
13501356

1351-
chgcar = result.get("data", None)
1352-
1353-
if chgcar is None:
1357+
if not chgcar:
13541358
raise MPRestError(f"No charge density fetched for {material_id}.")
13551359

1360+
chgcar = chgcar[0]["data"] # type: ignore
1361+
13561362
if inc_task_doc:
13571363
task_doc = self.tasks.search(
13581364
task_ids=latest_doc.task_id
@@ -1384,7 +1390,7 @@ def get_download_info(self, material_ids, calc_types=None, file_patterns=None):
13841390
)
13851391

13861392
meta = {}
1387-
for doc in self.materials.search(
1393+
for doc in self.materials.search( # type: ignore
13881394
task_ids=material_ids,
13891395
fields=["calc_types", "deprecated_tasks", "material_id"],
13901396
):

mp_api/client/routes/materials/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
ElectronicStructureRester,
1414
)
1515
from .eos import EOSRester
16-
from .fermi import FermiRester
17-
from .grain_boundary import GrainBoundaryRester
16+
from .grain_boundaries import GrainBoundaryRester
1817
from .magnetism import MagnetismRester
1918
from .oxidation_states import OxidationStatesRester
2019
from .phonon import PhononRester

mp_api/client/routes/materials/electronic_structure.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import json
34
import warnings
45
from collections import defaultdict
56

@@ -8,6 +9,7 @@
89
DOSProjectionType,
910
ElectronicStructureDoc,
1011
)
12+
from monty.json import MontyDecoder
1113
from pymatgen.analysis.magnetism.analyzer import Ordering
1214
from pymatgen.core.periodic_table import Element
1315
from pymatgen.electronic_structure.core import OrbitalType, Spin
@@ -232,12 +234,16 @@ def get_bandstructure_from_task_id(self, task_id: str):
232234
Returns:
233235
bandstructure (BandStructure): BandStructure or BandStructureSymmLine object
234236
"""
237+
decoder = MontyDecoder().decode if self.monty_decode else json.loads
235238
result = self._query_open_data(
236-
bucket="materialsproject-parsed", prefix="bandstructures", key=task_id
237-
)
238-
239-
if result.get("data", None) is not None:
240-
return result["data"]
239+
bucket="materialsproject-parsed",
240+
key=f"bandstructures/{task_id}.json.gz",
241+
decoder=decoder,
242+
fields=["data"],
243+
)[0]
244+
245+
if result:
246+
return result[0]["data"]
241247
else:
242248
raise MPRestError("No object found")
243249

@@ -418,12 +424,16 @@ def get_dos_from_task_id(self, task_id: str):
418424
Returns:
419425
bandstructure (CompleteDos): CompleteDos object
420426
"""
427+
decoder = MontyDecoder().decode if self.monty_decode else json.loads
421428
result = self._query_open_data(
422-
bucket="materialsproject-parsed", prefix="dos", key=task_id
423-
)
424-
425-
if result.get("data", None) is not None:
426-
return result["data"]
429+
bucket="materialsproject-parsed",
430+
key=f"dos/{task_id}.json.gz",
431+
decoder=decoder,
432+
fields=["data"],
433+
)[0]
434+
435+
if result:
436+
return result[0]["data"] # type: ignore
427437
else:
428438
raise MPRestError("No object found")
429439

mp_api/client/routes/materials/fermi.py

Lines changed: 0 additions & 58 deletions
This file was deleted.

mp_api/client/routes/materials/materials.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
ElectrodeRester,
2020
ElectronicStructureRester,
2121
EOSRester,
22-
FermiRester,
2322
GrainBoundaryRester,
2423
MagnetismRester,
2524
OxidationStatesRester,
@@ -50,8 +49,7 @@ class MaterialsRester(BaseRester[MaterialsDoc]):
5049
"similarity",
5150
"tasks",
5251
"xas",
53-
"fermi",
54-
"grain_boundary",
52+
"grain_boundaries",
5553
"substrates",
5654
"surface_properties",
5755
"phonon",
@@ -81,7 +79,6 @@ class MaterialsRester(BaseRester[MaterialsDoc]):
8179
similarity: SimilarityRester
8280
tasks: TaskRester
8381
xas: XASRester
84-
fermi: FermiRester
8582
grain_boundary: GrainBoundaryRester
8683
substrates: SubstratesRester
8784
surface_properties: SurfacePropertiesRester

mp_api/client/routes/materials/similarity.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,9 @@ class SimilarityRester(BaseRester[SimilarityDoc]):
1111
document_model = SimilarityDoc # type: ignore
1212
primary_key = "material_id"
1313

14-
def search_docs(
14+
def search(
1515
self,
1616
material_ids: str | list[str] | None = None,
17-
deprecated: bool | None = False,
1817
num_chunks: int | None = None,
1918
chunk_size: int = 1000,
2019
all_fields: bool = True,
@@ -25,7 +24,6 @@ def search_docs(
2524
Arguments:
2625
material_ids (str, List[str]): A single Material ID string or list of strings
2726
(e.g., mp-149, [mp-149, mp-13]).
28-
deprecated (bool): Whether the material is tagged as deprecated.
2927
num_chunks (int): Maximum number of chunks of data to yield. None will yield all possible.
3028
chunk_size (int): Number of data entries per chunk.
3129
all_fields (bool): Whether to return all fields in the document. Defaults to True.
@@ -35,7 +33,7 @@ def search_docs(
3533
Returns:
3634
([SimilarityDoc], [dict]) List of similarity documents or dictionaries.
3735
"""
38-
query_params = {"deprecated": deprecated} # type: dict
36+
query_params = {} # type: dict
3937

4038
if material_ids:
4139
if isinstance(material_ids, str):

mp_api/client/routes/materials/tasks.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ def get_trajectory(self, task_id):
3636
def search(
3737
self,
3838
task_ids: str | list[str] | None = None,
39-
chemsys: str | list[str] | None = None,
4039
elements: list[str] | None = None,
4140
exclude_elements: list[str] | None = None,
4241
formula: str | list[str] | None = None,
@@ -50,8 +49,6 @@ def search(
5049
5150
Arguments:
5251
task_ids (str, List[str]): List of Materials Project IDs to return data for.
53-
chemsys (str, List[str]): A chemical system or list of chemical systems
54-
(e.g., Li-Fe-O, Si-*, [Si-O, Li-Fe-P]).
5552
elements (List[str]): A list of elements.
5653
exclude_elements (List[str]): A list of elements to exclude.
5754
formula (str, List[str]): A formula including anonymized formula
@@ -84,12 +81,6 @@ def search(
8481
if exclude_elements:
8582
query_params.update({"exclude_elements": ",".join(exclude_elements)})
8683

87-
if chemsys:
88-
if isinstance(chemsys, str):
89-
chemsys = [chemsys]
90-
91-
query_params.update({"chemsys": ",".join(chemsys)})
92-
9384
if last_updated:
9485
query_params.update(
9586
{

0 commit comments

Comments
 (0)