diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml
index 52c7f9b18e3..fc55280a17b 100644
--- a/ci/requirements/min-all-deps.yml
+++ b/ci/requirements/min-all-deps.yml
@@ -42,7 +42,7 @@ dependencies:
   - pandas=2.1
   - pint=0.22
   - pip
-  - pydap=3.4
+  - pydap=3.5.0
   - pytest
   - pytest-cov
   - pytest-env
diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst
index 1cee4597836..679aa4d8c32 100644
--- a/doc/user-guide/io.rst
+++ b/doc/user-guide/io.rst
@@ -1245,37 +1245,44 @@ over the network until we look at particular values:

 .. image:: ../_static/opendap-prism-tmax.png

-Some servers require authentication before we can access the data. For this
-purpose we can explicitly create a :py:class:`backends.PydapDataStore`
-and pass in a `Requests`__ session object. For example for
-HTTP Basic authentication::
+Some servers require authentication before we can access the data. Pydap uses
+a `Requests`__ session object (which the user can pre-define), and this
+session object can recover `authentication`__ credentials from a locally stored
+``.netrc`` file. For example, to connect to a server that requires NASA's
+URS authentication, with the username/password credentials stored in a locally
+accessible ``.netrc`` file, accessing OPeNDAP data should be as simple as this::

     import xarray as xr
     import requests

-    session = requests.Session()
-    session.auth = ('username', 'password')
+    my_session = requests.Session()

-    store = xr.backends.PydapDataStore.open('http://example.com/data',
-                                            session=session)
-    ds = xr.open_dataset(store)
+    ds_url = 'https://gpm1.gesdisc.eosdis.nasa.gov/opendap/hyrax/example.nc'

-`Pydap's cas module`__ has functions that generate custom sessions for
-servers that use CAS single sign-on. For example, to connect to servers
-that require NASA's URS authentication::
+    ds = xr.open_dataset(ds_url, session=my_session, engine="pydap")

-    import xarray as xr
-    from pydata.cas.urs import setup_session
+Moreover, a bearer token header can be included in a ``requests.Session``
+object, allowing for token-based authentication, which OPeNDAP servers can use
+to avoid some redirects.

-    ds_url = 'https://gpm1.gesdisc.eosdis.nasa.gov/opendap/hyrax/example.nc'
-    session = setup_session('username', 'password', check_url=ds_url)
-    store = xr.backends.PydapDataStore.open(ds_url, session=session)
+Lastly, OPeNDAP servers may provide endpoint URLs for different OPeNDAP protocols,
+DAP2 and DAP4. To specify which of the two protocols to use, replace the scheme
+of the URL with the name of the protocol. For example::

-    ds = xr.open_dataset(store)
+    # dap2 url
+    ds_url = 'dap2://gpm1.gesdisc.eosdis.nasa.gov/opendap/hyrax/example.nc'
+
+    # dap4 url
+    ds_url = 'dap4://gpm1.gesdisc.eosdis.nasa.gov/opendap/hyrax/example.nc'
+
+While most OPeNDAP servers implement DAP2, not all servers implement DAP4. It
+is recommended to check whether the server `supports DAP4`__ by opening the
+URL in a browser.

 __ https://docs.python-requests.org
-__ https://www.pydap.org/en/latest/client.html#authentication
+__ https://pydap.github.io/pydap/en/notebooks/Authentication.html
+__ https://pydap.github.io/pydap/en/faqs/dap2_or_dap4_url.html

 .. _io.pickle:

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 591d30cfadf..48cd69ad82d 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -24,10 +24,20 @@ New Features

 - Added `scipy-stubs `_ to the ``xarray[types]`` dependencies.
   By `Joren Hammudoglu `_.
+- Improved compatibility with the OPeNDAP DAP4 data model for the backend engine
+  ``pydap``. This includes ``datatree`` support and removal of slashes from
+  dimension names. By `Miguel Jimenez-Urias `_.

 Breaking changes
 ~~~~~~~~~~~~~~~~

+- The minimum versions of some dependencies were changed
+
+  ===================== ========= =======
+  Package                     Old     New
+  ===================== ========= =======
+  pydap                       3.4   3.5.0
+  ===================== ========= =======

 Deprecations
 ~~~~~~~~~~~~
@@ -47,6 +57,8 @@ Documentation

 - Fix references to core classes in docs (:issue:`10195`, :pull:`10207`).
   By `Mattia Almansi `_.
+- Fix references to point to updated pydap documentation (:pull:`10182`).
+  By `Miguel Jimenez-Urias `_.

 Internal Changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py
index 74ddbc8443b..301ea430c4c 100644
--- a/xarray/backends/pydap_.py
+++ b/xarray/backends/pydap_.py
@@ -10,6 +10,8 @@
     AbstractDataStore,
     BackendArray,
     BackendEntrypoint,
+    _normalize_path,
+    datatree_from_dict_with_io_cleanup,
     robust_getitem,
 )
 from xarray.backends.store import StoreBackendEntrypoint
@@ -18,7 +20,6 @@
     Frozen,
     FrozenDict,
     close_on_error,
-    is_dict_like,
     is_remote_uri,
 )
 from xarray.core.variable import Variable
@@ -28,6 +29,7 @@
     import os

     from xarray.core.dataset import Dataset
+    from xarray.core.datatree import DataTree
     from xarray.core.types import ReadBuffer


@@ -49,36 +51,26 @@ def __getitem__(self, key):
         )

     def _getitem(self, key):
-        # pull the data from the array attribute if possible, to avoid
-        # downloading coordinate data twice
-        array = getattr(self.array, "array", self.array)
-        result = robust_getitem(array, key, catch=ValueError)
-        result = np.asarray(result)
+        result = robust_getitem(self.array, key, catch=ValueError)
         # in some cases, pydap doesn't squeeze axes automatically like numpy
+        result = np.asarray(result)
         axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types))
-        if result.ndim + len(axis) != array.ndim and axis:
+        if result.ndim + len(axis) != self.array.ndim and axis:
             result = np.squeeze(result, axis)

         return result


-def _fix_attributes(attributes):
-    attributes = dict(attributes)
-    for k in list(attributes):
-        if k.lower() == "global" or k.lower().endswith("_global"):
-            # move global attributes to the top level, like the netcdf-C
-            # DAP client
-            attributes.update(attributes.pop(k))
-        elif is_dict_like(attributes[k]):
-            # Make Hierarchical attributes to a single level with a
-            # dot-separated key
-            attributes.update(
-                {
-                    f"{k}.{k_child}": v_child
-                    for k_child, v_child in attributes.pop(k).items()
-                }
-            )
-    return attributes
+def get_group(ds, group):
+    if group in {None, "", "/"}:
+        # use the root group
+        return ds
+    else:
+        try:
+            return ds[group]
+        except KeyError as e:
+            # wrap error to provide slightly more helpful message
+            raise KeyError(f"group not found: {group}", e) from e


 class PydapDataStore(AbstractDataStore):
@@ -88,18 +80,22 @@ class PydapDataStore(AbstractDataStore):
     be useful if the netCDF4 library is not available.
     """

-    def __init__(self, ds):
+    def __init__(self, dataset, group=None):
         """
         Parameters
         ----------
         ds : pydap DatasetType
+        group : str or None (default None)
+            The group to open. If None, the root group is opened.
""" - self.ds = ds + self.dataset = dataset + self.group = group @classmethod def open( cls, url, + group=None, application=None, session=None, output_grid=None, @@ -107,43 +103,89 @@ def open( verify=None, user_charset=None, ): - import pydap.client - import pydap.lib - - if timeout is None: - from pydap.lib import DEFAULT_TIMEOUT + from pydap.client import open_url + from pydap.net import DEFAULT_TIMEOUT - timeout = DEFAULT_TIMEOUT + if output_grid is not None: + # output_grid is no longer passed to pydap.client.open_url + from xarray.core.utils import emit_user_level_warning + emit_user_level_warning( + "`output_grid` is deprecated and will be removed in a future version" + " of xarray. Will be set to `None`, the new default. ", + DeprecationWarning, + ) + output_grid = False # new default behavior kwargs = { "url": url, "application": application, "session": session, - "output_grid": output_grid or True, - "timeout": timeout, + "output_grid": output_grid or False, + "timeout": timeout or DEFAULT_TIMEOUT, + "verify": verify or True, + "user_charset": user_charset, } - if verify is not None: - kwargs.update({"verify": verify}) - if user_charset is not None: - kwargs.update({"user_charset": user_charset}) - ds = pydap.client.open_url(**kwargs) - return cls(ds) + if isinstance(url, str): + # check uit begins with an acceptable scheme + dataset = open_url(**kwargs) + elif hasattr(url, "ds"): + # pydap dataset + dataset = url.ds + args = {"dataset": dataset} + if group: + # only then, change the default + args["group"] = group + return cls(**args) def open_store_variable(self, var): data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) - return Variable(var.dimensions, data, _fix_attributes(var.attributes)) + try: + dimensions = [ + dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims + ] + except AttributeError: + # GridType does not have a dims attribute - instead get `dimensions` + # see https://github.com/pydap/pydap/issues/485 + dimensions = var.dimensions + return Variable(dimensions, data, var.attributes) def get_variables(self): - return FrozenDict( - (k, self.open_store_variable(self.ds[k])) for k in self.ds.keys() - ) + # get first all variables arrays, excluding any container type like, + # `Groups`, `Sequence` or `Structure` types + try: + _vars = list(self.ds.variables()) + _vars += list(self.ds.grids()) # dap2 objects + except AttributeError: + from pydap.model import GroupType + + _vars = list(self.ds.keys()) + # check the key is a BaseType or GridType + for var in _vars: + if isinstance(self.ds[var], GroupType): + _vars.remove(var) + return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars) def get_attrs(self): - return Frozen(_fix_attributes(self.ds.attributes)) + """Remove any opendap specific attributes""" + opendap_attrs = ( + "configuration", + "build_dmrpp", + "bes", + "libdap", + "invocation", + "dimensions", + ) + attrs = self.ds.attributes + list(map(attrs.pop, opendap_attrs, [None] * 6)) + return Frozen(attrs) def get_dimensions(self): return Frozen(self.ds.dimensions) + @property + def ds(self): + return get_group(self.dataset, self.group) + class PydapBackendEntrypoint(BackendEntrypoint): """ @@ -154,7 +196,7 @@ class PydapBackendEntrypoint(BackendEntrypoint): This backend is selected by default for urls. 
     For more information about the underlying library, visit:
-    https://www.pydap.org
+    https://pydap.github.io/pydap/en/intro.html

     See Also
     --------
@@ -181,6 +223,7 @@ def open_dataset(
         drop_variables: str | Iterable[str] | None = None,
         use_cftime=None,
         decode_timedelta=None,
+        group=None,
         application=None,
         session=None,
         output_grid=None,
@@ -190,6 +233,7 @@ def open_dataset(
     ) -> Dataset:
         store = PydapDataStore.open(
             url=filename_or_obj,
+            group=group,
             application=application,
             session=session,
             output_grid=output_grid,
@@ -197,7 +241,6 @@ def open_dataset(
             verify=verify,
             user_charset=user_charset,
         )
-
         store_entrypoint = StoreBackendEntrypoint()
         with close_on_error(store):
             ds = store_entrypoint.open_dataset(
@@ -212,5 +255,140 @@ def open_dataset(
             )
             return ds

+    def open_datatree(
+        self,
+        filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+        *,
+        mask_and_scale=True,
+        decode_times=True,
+        concat_characters=True,
+        decode_coords=True,
+        drop_variables: str | Iterable[str] | None = None,
+        use_cftime=None,
+        decode_timedelta=None,
+        group: str | None = None,
+        application=None,
+        session=None,
+        timeout=None,
+        verify=None,
+        user_charset=None,
+    ) -> DataTree:
+        groups_dict = self.open_groups_as_dict(
+            filename_or_obj,
+            mask_and_scale=mask_and_scale,
+            decode_times=decode_times,
+            concat_characters=concat_characters,
+            decode_coords=decode_coords,
+            drop_variables=drop_variables,
+            use_cftime=use_cftime,
+            decode_timedelta=decode_timedelta,
+            group=group,
+            application=application,
+            session=session,
+            timeout=timeout,
+            verify=verify,
+            user_charset=user_charset,
+        )
+
+        return datatree_from_dict_with_io_cleanup(groups_dict)
+
+    def open_groups_as_dict(
+        self,
+        filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
+        *,
+        mask_and_scale=True,
+        decode_times=True,
+        concat_characters=True,
+        decode_coords=True,
+        drop_variables: str | Iterable[str] | None = None,
+        use_cftime=None,
+        decode_timedelta=None,
+        group: str | None = None,
+        application=None,
+        session=None,
+        timeout=None,
+        verify=None,
+        user_charset=None,
+    ) -> dict[str, Dataset]:
+        from xarray.core.treenode import NodePath
+
+        filename_or_obj = _normalize_path(filename_or_obj)
+        store = PydapDataStore.open(
+            url=filename_or_obj,
+            application=application,
+            session=session,
+            timeout=timeout,
+            verify=verify,
+            user_charset=user_charset,
+        )
+
+        # Check for a group and make it a parent if it exists
+        if group:
+            parent = str(NodePath("/") / NodePath(group))
+        else:
+            parent = str(NodePath("/"))
+
+        groups_dict = {}
+        group_names = [parent]
+        # construct fully qualified path to group
+        try:
+            # this works for pydap >= 3.5.1
+            Groups = store.ds[parent].groups()
+        except AttributeError:
+            # THIS IS ONLY NEEDED FOR `pydap == 3.5.0`
+            # `pydap >= 3.5.1` has a new method `groups()`
+            # that returns a dict of group names and their paths
+            def group_fqn(store, path=None, g_fqn=None) -> dict[str, str]:
+                """To be removed for pydap > 3.5.0.
+                Derives the fully qualified name of a Group."""
+                from pydap.model import GroupType
+
+                if not path:
+                    path = "/"  # parent
+                if not g_fqn:
+                    g_fqn = {}
+                groups = [
+                    store[key].id
+                    for key in store.keys()
+                    if isinstance(store[key], GroupType)
+                ]
+                for g in groups:
+                    g_fqn.update({g: path})
+                    subgroups = [
+                        var for var in store[g] if isinstance(store[g][var], GroupType)
+                    ]
+                    if len(subgroups) > 0:
+                        npath = path + g
+                        g_fqn = group_fqn(store[g], npath, g_fqn)
+                return g_fqn
+
+            Groups = group_fqn(store.ds)
+        group_names += [
+            str(NodePath(path_to_group) / NodePath(group))
+            for group, path_to_group in Groups.items()
+        ]
+        for path_group in group_names:
+            # get a group from the store
+            store.group = path_group
+            store_entrypoint = StoreBackendEntrypoint()
+            with close_on_error(store):
+                group_ds = store_entrypoint.open_dataset(
+                    store,
+                    mask_and_scale=mask_and_scale,
+                    decode_times=decode_times,
+                    concat_characters=concat_characters,
+                    decode_coords=decode_coords,
+                    drop_variables=drop_variables,
+                    use_cftime=use_cftime,
+                    decode_timedelta=decode_timedelta,
+                )
+            if group:
+                group_name = str(NodePath(path_group).relative_to(parent))
+            else:
+                group_name = str(NodePath(path_group))
+            groups_dict[group_name] = group_ds
+
+        return groups_dict
+

 BACKEND_ENTRYPOINTS["pydap"] = ("pydap", PydapBackendEntrypoint)
diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py
index 2a7dd4010f1..cf3626dbb12 100644
--- a/xarray/core/datatree_io.py
+++ b/xarray/core/datatree_io.py
@@ -7,7 +7,7 @@
 from xarray.core.datatree import DataTree
 from xarray.core.types import NetcdfWriteModes, ZarrWriteModes

-T_DataTreeNetcdfEngine = Literal["netcdf4", "h5netcdf"]
+T_DataTreeNetcdfEngine = Literal["netcdf4", "h5netcdf", "pydap"]
 T_DataTreeNetcdfTypes = Literal["NETCDF4"]

 if TYPE_CHECKING:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index ec9f2fe8aef..e37f73c8004 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -5335,20 +5335,14 @@ def num_graph_nodes(obj):
 @pytest.mark.filterwarnings("ignore:The binary mode of fromstring is deprecated")
 class TestPydap:
     def convert_to_pydap_dataset(self, original):
-        from pydap.model import BaseType, DatasetType, GridType
+        from pydap.model import BaseType, DatasetType

         ds = DatasetType("bears", **original.attrs)
         for key, var in original.data_vars.items():
-            v = GridType(key)
-            v[key] = BaseType(key, var.values, dimensions=var.dims, **var.attrs)
-            for d in var.dims:
-                v[d] = BaseType(d, var[d].values)
-            ds[key] = v
+            ds[key] = BaseType(key, var.values, dims=var.dims, **var.attrs)
         # check all dims are stored in ds
         for d in original.coords:
-            ds[d] = BaseType(
-                d, original[d].values, dimensions=(d,), **original[d].attrs
-            )
+            ds[d] = BaseType(d, original[d].values, dims=(d,), **original[d].attrs)
         return ds

     @contextlib.contextmanager
@@ -5372,9 +5366,7 @@ def test_cmp_local_file(self) -> None:
             # we don't check attributes exactly with assertDatasetIdentical()
             # because the test DAP server seems to insert some extra
             # attributes not found in the netCDF file.
-            # 2025/03/18 : The DAP server now modifies the keys too
-            # assert actual.attrs.keys() == expected.attrs.keys()
-            assert len(actual.attrs.keys()) == len(expected.attrs.keys())
+            assert actual.attrs.keys() == expected.attrs.keys()

         with self.create_datasets() as (actual, expected):
             assert_equal(actual[{"l": 2}], expected[{"l": 2}])
@@ -5416,7 +5408,8 @@ def test_dask(self) -> None:
 @requires_pydap
 class TestPydapOnline(TestPydap):
     @contextlib.contextmanager
-    def create_datasets(self, **kwargs):
+    def create_dap2_datasets(self, **kwargs):
+        # in pydap 3.5.0, URLs default to DAP2.
         url = "http://test.opendap.org/opendap/data/nc/bears.nc"
         actual = open_dataset(url, engine="pydap", **kwargs)
         with open_example_dataset("bears.nc") as expected:
@@ -5424,18 +5417,34 @@ def create_datasets(self, **kwargs):
             expected["bears"] = expected["bears"].astype(str)
             yield actual, expected

+    def test_output_grid_deprecation_warning_dap2dataset(self) -> None:
+        with pytest.warns(DeprecationWarning, match="`output_grid` is deprecated"):
+            with self.create_dap2_datasets(output_grid=True) as (actual, expected):
+                assert_equal(actual, expected)
+
+    @contextlib.contextmanager
+    def create_dap4_dataset(self, **kwargs):
+        url = "dap4://test.opendap.org/opendap/data/nc/bears.nc"
+        actual = open_dataset(url, engine="pydap", **kwargs)
+        with open_example_dataset("bears.nc") as expected:
+            # workaround to restore string which is converted to byte
+            expected["bears"] = expected["bears"].astype(str)
+            yield actual, expected
+
     def test_session(self) -> None:
-        from pydap.cas.urs import setup_session
+        from requests import Session

-        session = setup_session("XarrayTestUser", "Xarray2017")
+        session = Session()  # blank requests.Session object

         with mock.patch("pydap.client.open_url") as mock_func:
             xr.backends.PydapDataStore.open("http://test.url", session=session)
             mock_func.assert_called_with(
                 url="http://test.url",
                 application=None,
                 session=session,
-                output_grid=True,
+                output_grid=False,
                 timeout=120,
+                verify=True,
+                user_charset=None,
             )
diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 2ff41adde0c..2d189299b2f 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -14,10 +14,12 @@
 from xarray.testing import assert_equal, assert_identical
 from xarray.tests import (
     has_zarr_v3,
+    network,
     parametrize_zarr_format,
     requires_dask,
     requires_h5netcdf,
     requires_netCDF4,
+    requires_pydap,
     requires_zarr,
 )

@@ -418,6 +420,99 @@ def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None:
         assert_equal(subgroup_tree, expected_subtree)


+@network
+@requires_pydap
+class TestPyDAPDatatreeIO:
+    """Test PyDAP backend for DataTree."""
+
+    engine: T_DataTreeNetcdfEngine | None = "pydap"
+    # you can inspect these by appending .dmr to the URL and replacing dap4 with http
+    unaligned_datatree_url = (
+        "dap4://test.opendap.org/opendap/dap4/unaligned_simple_datatree.nc.h5"
+    )
+    all_aligned_child_nodes_url = (
+        "dap4://test.opendap.org/opendap/dap4/all_aligned_child_nodes.nc.h5"
+    )
+    simplegroup_datatree_url = "dap4://test.opendap.org/opendap/dap4/SimpleGroup.nc4.h5"
+
+    def test_open_datatree(self, url=unaligned_datatree_url) -> None:
+        """Test that `open_datatree` fails to open a netCDF4 file with an unaligned group hierarchy."""
+
+        with pytest.raises(
+            ValueError,
+            match=(
+                re.escape(
+                    "group '/Group1/subgroup1' is not aligned with its parents:\nGroup:\n"
+                )
+                + ".*"
+            ),
+        ):
+            open_datatree(url, engine=self.engine)
+
+    def test_open_groups(self, url=unaligned_datatree_url) -> None:
+        """Test `open_groups` with a netCDF4/HDF5 file with an unaligned group hierarchy."""
+        unaligned_dict_of_datasets = open_groups(url, engine=self.engine)
+
+        # Check that group names are keys in the dictionary of `xr.Datasets`
+        assert "/" in unaligned_dict_of_datasets.keys()
+        assert "/Group1" in unaligned_dict_of_datasets.keys()
+        assert "/Group1/subgroup1" in unaligned_dict_of_datasets.keys()
+        # Check that the group names return the correct datasets
+        with xr.open_dataset(url, engine=self.engine, group="/") as expected:
+            assert_identical(unaligned_dict_of_datasets["/"], expected)
+        with xr.open_dataset(url, group="Group1", engine=self.engine) as expected:
+            assert_identical(unaligned_dict_of_datasets["/Group1"], expected)
+        with xr.open_dataset(
+            url,
+            group="/Group1/subgroup1",
+            engine=self.engine,
+        ) as expected:
+            assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)
+
+    def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
+        """Test that `open_datatree` inherits coordinates from the root group.
+
+        In this test file, the child group inherits the time and Z coordinates
+        from the root group.
+
+        Group: /
+        │   Dimensions:  (time: 1, Z: 1000, nv: 2)
+        │   Coordinates:
+        |       time: (time)  float32  0.5
+        |       Z:    (Z)     float32  -0.0 -1.0 -2.0 ...
+        │   Data variables:
+        │       Pressure   (Z)         float32 ...
+        |       time_bnds  (time, nv)  float32 ...
+        └── Group: /SimpleGroup
+            │   Dimensions:  (time: 1, Z: 1000, nv: 2, Y: 40, X: 40)
+            │   Coordinates:
+            |       Y:    (Y)  int16  1 2 3 4 ...
+            |       X:    (X)  int16  1 2 3 4 ...
+            |   Inherited coordinates:
+            |       time: (time)  float32  0.5
+            |       Z:    (Z)     float32  -0.0 -1.0 -2.0 ...
+            │   Data variables:
+            │       Temperature  (time, Z, Y, X)  float32 ...
+            |       Salinity     (time, Z, Y, X)  float32 ...
+        """
+        tree = open_datatree(url, engine=self.engine)
+        assert list(tree.dims) == ["time", "Z", "nv"]
+        assert tree["/SimpleGroup"].coords["time"].dims == ("time",)
+        assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",)
+        assert tree["/SimpleGroup"].coords["Y"].dims == ("Y",)
+        assert tree["/SimpleGroup"].coords["X"].dims == ("X",)
+        with xr.open_dataset(url, engine=self.engine, group="/SimpleGroup") as expected:
+            assert set(tree["/SimpleGroup"].dims) == set(
+                list(expected.dims) + ["Z", "nv"]
+            )
+
+    def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None:
+        aligned_dict_of_datasets = open_groups(url, engine=self.engine)
+        aligned_dt = DataTree.from_dict(aligned_dict_of_datasets)
+        with open_datatree(url, engine=self.engine) as opened_tree:
+            assert opened_tree.identical(aligned_dt)
+
+
 @requires_h5netcdf
 class TestH5NetCDFDatatreeIO(DatatreeIOBase):
     engine: T_DataTreeNetcdfEngine | None = "h5netcdf"
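
A minimal end-to-end sketch of the usage this patch enables, combining the DAP4
URL scheme and ``group=`` parameter from the documentation changes with the
``open_datatree`` support added to the ``pydap`` backend. This assumes the
``test.opendap.org`` file referenced in the tests above remains available; the
``my_session`` name is illustrative, and a session is optional for this public
server::

    import requests
    import xarray as xr

    # optional: a plain requests.Session; per the updated io.rst, pydap can
    # recover .netrc credentials or use bearer-token headers attached to it
    my_session = requests.Session()

    # DAP4 is selected via the URL scheme (test file taken from the tests above)
    url = "dap4://test.opendap.org/opendap/dap4/SimpleGroup.nc4.h5"

    # open a single group as a Dataset ...
    ds = xr.open_dataset(url, group="/SimpleGroup", engine="pydap", session=my_session)

    # ... or the whole hierarchy as a DataTree
    tree = xr.open_datatree(url, engine="pydap", session=my_session)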