forked from pydata/xarray

Commit a064430

Merge branch 'main' into depr-groupby-squeeze-2
* main:
  - Fix mypy type ignore (pydata#8564)
  - Support for the new compression arguments. (pydata#7551)
  - FIX: reverse index output of bottleneck move_argmax/move_argmin functions (pydata#8552)
  - Adapt map_blocks to use new Coordinates API (pydata#8560)
  - add xeofs to ecosystem.rst (pydata#8561)
  - Offer a fixture for unifying DataArray & Dataset tests (pydata#8533)
  - Generalize cumulative reduction (scan) to non-dask types (pydata#8019)

2 parents d6a3f2d + 03ec3cb

15 files changed: +348 -98 lines

doc/ecosystem.rst (+1)

@@ -78,6 +78,7 @@ Extend xarray capabilities
 - `xarray-dataclasses <https://github.com/astropenguin/xarray-dataclasses>`_: xarray extension for typed DataArray and Dataset creation.
 - `xarray_einstats <https://xarray-einstats.readthedocs.io>`_: Statistics, linear algebra and einops for xarray
 - `xarray_extras <https://github.com/crusaderky/xarray_extras>`_: Advanced algorithms for xarray objects (e.g. integrations/interpolations).
+- `xeofs <https://github.com/nicrie/xeofs>`_: PCA/EOF analysis and related techniques, integrated with xarray and Dask for efficient handling of large-scale data.
 - `xpublish <https://xpublish.readthedocs.io/>`_: Publish Xarray Datasets via a Zarr compatible REST API.
 - `xrft <https://github.com/rabernat/xrft>`_: Fourier transforms for xarray data.
 - `xr-scipy <https://xr-scipy.readthedocs.io>`_: A lightweight scipy wrapper for xarray.

doc/whats-new.rst (+11)

@@ -26,6 +26,10 @@ New Features

 - :py:meth:`xr.cov` and :py:meth:`xr.corr` now support using weights (:issue:`8527`, :pull:`7392`).
   By `Llorenç Lledó <https://github.com/lluritu>`_.
+- Accept the compression arguments new in netCDF 1.6.0 in the netCDF4 backend.
+  See `netCDF4 documentation <https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables>`_ for details.
+  By `Markel García-Díez <https://github.com/markelg>`_. (:issue:`6929`, :pull:`7551`) Note that some
+  new compression filters need plugins to be installed which may not be available in all netCDF distributions.

 Breaking changes
 ~~~~~~~~~~~~~~~~

@@ -39,6 +43,9 @@ Deprecations

 Bug fixes
 ~~~~~~~~~

+- Reverse index output of bottleneck's rolling move_argmax/move_argmin functions (:issue:`8541`, :pull:`8552`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+

 Documentation
 ~~~~~~~~~~~~~

@@ -589,6 +596,10 @@ Internal Changes

 - :py:func:`as_variable` now consistently includes the variable name in any exceptions
   raised. (:pull:`7995`). By `Peter Hill <https://github.com/ZedThree>`_
+- Redirect cumulative reduction functions internally through the :py:class:`ChunkManagerEntrypoint`,
+  potentially allowing :py:meth:`~xarray.DataArray.ffill` and :py:meth:`~xarray.DataArray.bfill` to
+  use non-dask chunked array types.
+  (:pull:`8019`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
 - :py:func:`encode_dataset_coordinates` now sorts coordinates automatically assigned to
   `coordinates` attributes during serialization (:issue:`8026`, :pull:`8034`).
   `By Ian Carroll <https://github.com/itcarroll>`_.
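As a quick illustration of the weighted xr.cov/xr.corr entry above, a minimal sketch with made-up values (the weights= argument is the one added by :pull:`7392`):

    import numpy as np
    import xarray as xr

    da_a = xr.DataArray(np.array([1.0, 2.0, 3.0, 4.0]), dims="time")
    da_b = xr.DataArray(np.array([2.0, 1.0, 4.0, 3.0]), dims="time")
    weights = xr.DataArray(np.array([0.5, 1.0, 1.0, 0.5]), dims="time")

    # omitting weights= recovers the previous unweighted behavior
    print(xr.cov(da_a, da_b, dim="time", weights=weights))
    print(xr.corr(da_a, da_b, dim="time", weights=weights))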

xarray/backends/netCDF4_.py (+17, -8)

@@ -257,6 +257,12 @@ def _extract_nc4_variable_encoding(
         "_FillValue",
         "dtype",
         "compression",
+        "significant_digits",
+        "quantize_mode",
+        "blosc_shuffle",
+        "szip_coding",
+        "szip_pixels_per_block",
+        "endian",
     }
     if lsd_okay:
         valid_encodings.add("least_significant_digit")

@@ -497,20 +503,23 @@ def prepare_variable(
         if name in self.ds.variables:
             nc4_var = self.ds.variables[name]
         else:
-            nc4_var = self.ds.createVariable(
+            default_args = dict(
                 varname=name,
                 datatype=datatype,
                 dimensions=variable.dims,
-                zlib=encoding.get("zlib", False),
-                complevel=encoding.get("complevel", 4),
-                shuffle=encoding.get("shuffle", True),
-                fletcher32=encoding.get("fletcher32", False),
-                contiguous=encoding.get("contiguous", False),
-                chunksizes=encoding.get("chunksizes"),
+                zlib=False,
+                complevel=4,
+                shuffle=True,
+                fletcher32=False,
+                contiguous=False,
+                chunksizes=None,
                 endian="native",
-                least_significant_digit=encoding.get("least_significant_digit"),
+                least_significant_digit=None,
                 fill_value=fill_value,
             )
+            default_args.update(encoding)
+            default_args.pop("_FillValue", None)
+            nc4_var = self.ds.createVariable(**default_args)

         nc4_var.setncatts(attrs)
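At the user level, the pass-through of encoding keys shown above means the new netCDF4-python compression arguments can be supplied via encoding. A minimal sketch (file name and variable are invented; non-zlib filters such as blosc, zstd, or szip additionally require a netCDF build with the matching plugins):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"t2m": (("x",), np.random.rand(1000))})
    # "compression" supersedes the older zlib=True/complevel pair in
    # netCDF4-python >= 1.6.0; unknown keys now flow through to createVariable.
    ds.to_netcdf(
        "compressed.nc",
        engine="netcdf4",
        encoding={"t2m": {"compression": "zlib", "complevel": 4, "shuffle": True}},
    )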
xarray/core/coordinates.py (+1, -1)

@@ -213,7 +213,7 @@ class Coordinates(AbstractCoordinates):
         :py:class:`~xarray.Coordinates` object is passed, its indexes
         will be added to the new created object.
     indexes: dict-like, optional
-        Mapping of where keys are coordinate names and values are
+        Mapping where keys are coordinate names and values are
         :py:class:`~xarray.indexes.Index` objects. If None (default),
         pandas indexes will be created for each dimension coordinate.
         Passing an empty dictionary will skip this default behavior.

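For context, a small sketch of the indexes parameter this docstring describes (coordinate names invented):

    import xarray as xr

    # Default: a pandas index is built for the dimension coordinate "x".
    indexed = xr.Coordinates({"x": [10, 20, 30]})

    # An empty dict skips that default, leaving "x" without an index.
    unindexed = xr.Coordinates({"x": ("x", [10, 20, 30])}, indexes={})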
xarray/core/daskmanager.py (+22)

@@ -97,6 +97,28 @@ def reduction(
             keepdims=keepdims,
         )

+    def scan(
+        self,
+        func: Callable,
+        binop: Callable,
+        ident: float,
+        arr: T_ChunkedArray,
+        axis: int | None = None,
+        dtype: np.dtype | None = None,
+        **kwargs,
+    ) -> DaskArray:
+        from dask.array.reductions import cumreduction
+
+        return cumreduction(
+            func,
+            binop,
+            ident,
+            arr,
+            axis=axis,
+            dtype=dtype,
+            **kwargs,
+        )
+
     def apply_gufunc(
         self,
         func: Callable,

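To make the (func, binop, ident) triple concrete, a small sketch of the dask function the new scan method delegates to (illustrative values): roughly, func runs within each chunk and binop stitches the chunk boundaries together, starting from the identity element.

    import dask.array as da
    import numpy as np
    from dask.array.reductions import cumreduction

    x = da.from_array(np.arange(1, 7), chunks=2)

    # cumulative product expressed through the generic scan machinery
    result = cumreduction(np.cumprod, np.multiply, 1, x, axis=0, dtype=x.dtype)
    print(result.compute())  # [  1   2   6  24 120 720]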
xarray/core/dataarray.py (+2, -2)

@@ -80,11 +80,11 @@
 try:
     from dask.dataframe import DataFrame as DaskDataFrame
 except ImportError:
-    DaskDataFrame = None  # type: ignore
+    DaskDataFrame = None
 try:
     from dask.delayed import Delayed
 except ImportError:
-    Delayed = None  # type: ignore
+    Delayed = None  # type: ignore[misc,assignment]
 try:
     from iris.cube import Cube as iris_Cube
 except ImportError:

xarray/core/dataset.py (+2, -2)

@@ -167,11 +167,11 @@
 try:
     from dask.delayed import Delayed
 except ImportError:
-    Delayed = None  # type: ignore
+    Delayed = None  # type: ignore[misc,assignment]
 try:
     from dask.dataframe import DataFrame as DaskDataFrame
 except ImportError:
-    DaskDataFrame = None  # type: ignore
+    DaskDataFrame = None


 # list of attributes of pd.DatetimeIndex that are ndarrays of time info

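For background on the narrowed ignores in the two files above: assigning None to a name that mypy knows as a class triggers two distinct error codes, so listing them is more precise than a bare "# type: ignore". A minimal sketch, assuming dask is importable in the type-checking environment; the DaskDataFrame ignore is presumably dropped because it was unused there (mypy's warn-unused-ignores would flag it):

    try:
        from dask.delayed import Delayed
    except ImportError:
        # mypy reports "Cannot assign to a type" [misc] and an incompatible
        # assignment of None [assignment]; the ignore names exactly those codes.
        Delayed = None  # type: ignore[misc,assignment]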
xarray/core/parallel.py (+57, -32)

@@ -4,19 +4,29 @@
 import itertools
 import operator
 from collections.abc import Hashable, Iterable, Mapping, Sequence
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict

 import numpy as np

 from xarray.core.alignment import align
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
+from xarray.core.indexes import Index
+from xarray.core.merge import merge
 from xarray.core.pycompat import is_dask_collection

 if TYPE_CHECKING:
     from xarray.core.types import T_Xarray


+class ExpectedDict(TypedDict):
+    shapes: dict[Hashable, int]
+    coords: set[Hashable]
+    data_vars: set[Hashable]
+    indexes: dict[Hashable, Index]
+
+
 def unzip(iterable):
     return zip(*iterable)

@@ -31,7 +41,9 @@ def assert_chunks_compatible(a: Dataset, b: Dataset):


 def check_result_variables(
-    result: DataArray | Dataset, expected: Mapping[str, Any], kind: str
+    result: DataArray | Dataset,
+    expected: ExpectedDict,
+    kind: Literal["coords", "data_vars"],
 ):
     if kind == "coords":
         nice_str = "coordinate"

@@ -254,7 +266,7 @@ def _wrapper(
         args: list,
         kwargs: dict,
         arg_is_array: Iterable[bool],
-        expected: dict,
+        expected: ExpectedDict,
     ):
         """
         Wrapper function that receives datasets in args; converts to dataarrays when necessary;

@@ -345,33 +357,45 @@ def _wrapper(
         for arg in aligned
     )

+    merged_coordinates = merge([arg.coords for arg in aligned]).coords
+
     _, npargs = unzip(
         sorted(list(zip(xarray_indices, xarray_objs)) + others, key=lambda x: x[0])
     )

     # check that chunk sizes are compatible
     input_chunks = dict(npargs[0].chunks)
-    input_indexes = dict(npargs[0]._indexes)
     for arg in xarray_objs[1:]:
         assert_chunks_compatible(npargs[0], arg)
         input_chunks.update(arg.chunks)
-        input_indexes.update(arg._indexes)

+    coordinates: Coordinates
     if template is None:
         # infer template by providing zero-shaped arrays
         template = infer_template(func, aligned[0], *args, **kwargs)
-        template_indexes = set(template._indexes)
-        preserved_indexes = template_indexes & set(input_indexes)
-        new_indexes = template_indexes - set(input_indexes)
-        indexes = {dim: input_indexes[dim] for dim in preserved_indexes}
-        indexes.update({k: template._indexes[k] for k in new_indexes})
+        template_coords = set(template.coords)
+        preserved_coord_vars = template_coords & set(merged_coordinates)
+        new_coord_vars = template_coords - set(merged_coordinates)
+
+        preserved_coords = merged_coordinates.to_dataset()[preserved_coord_vars]
+        # preserved_coords contains all coordinate variables that share a dimension
+        # with any index variable in preserved_indexes
+        # Drop any unneeded vars in a second pass, this is required for e.g.
+        # if the mapped function were to drop a non-dimension coordinate variable.
+        preserved_coords = preserved_coords.drop_vars(
+            tuple(k for k in preserved_coords.variables if k not in template_coords)
+        )
+
+        coordinates = merge(
+            (preserved_coords, template.coords.to_dataset()[new_coord_vars])
+        ).coords
         output_chunks: Mapping[Hashable, tuple[int, ...]] = {
             dim: input_chunks[dim] for dim in template.dims if dim in input_chunks
         }

     else:
         # template xarray object has been provided with proper sizes and chunk shapes
-        indexes = dict(template._indexes)
+        coordinates = template.coords
         output_chunks = template.chunksizes
         if not output_chunks:
             raise ValueError(

@@ -473,6 +497,9 @@ def subset_dataset_to_block(

         return (Dataset, (dict, data_vars), (dict, coords), dataset.attrs)

+    # variable names that depend on the computation. Currently, indexes
+    # cannot be modified in the mapped function, so we exclude those
+    computed_variables = set(template.variables) - set(coordinates.xindexes)
     # iterate over all possible chunk combinations
     for chunk_tuple in itertools.product(*ichunk.values()):
         # mapping from dimension name to chunk index

@@ -485,29 +512,32 @@ def subset_dataset_to_block(
             for isxr, arg in zip(is_xarray, npargs)
         ]

-        # expected["shapes", "coords", "data_vars", "indexes"] are used to
         # raise nice error messages in _wrapper
-        expected = {}
-        # input chunk 0 along a dimension maps to output chunk 0 along the same dimension
-        # even if length of dimension is changed by the applied function
-        expected["shapes"] = {
-            k: output_chunks[k][v] for k, v in chunk_index.items() if k in output_chunks
-        }
-        expected["data_vars"] = set(template.data_vars.keys())  # type: ignore[assignment]
-        expected["coords"] = set(template.coords.keys())  # type: ignore[assignment]
-        expected["indexes"] = {
-            dim: indexes[dim][_get_chunk_slicer(dim, chunk_index, output_chunk_bounds)]
-            for dim in indexes
+        expected: ExpectedDict = {
+            # input chunk 0 along a dimension maps to output chunk 0 along the same dimension
+            # even if length of dimension is changed by the applied function
+            "shapes": {
+                k: output_chunks[k][v]
+                for k, v in chunk_index.items()
+                if k in output_chunks
+            },
+            "data_vars": set(template.data_vars.keys()),
+            "coords": set(template.coords.keys()),
+            "indexes": {
+                dim: coordinates.xindexes[dim][
+                    _get_chunk_slicer(dim, chunk_index, output_chunk_bounds)
+                ]
+                for dim in coordinates.xindexes
+            },
         }

         from_wrapper = (gname,) + chunk_tuple
         graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array, expected)

         # mapping from variable name to dask graph key
         var_key_map: dict[Hashable, str] = {}
-        for name, variable in template.variables.items():
-            if name in indexes:
-                continue
+        for name in computed_variables:
+            variable = template.variables[name]
             gname_l = f"{name}-{gname}"
             var_key_map[name] = gname_l

@@ -543,12 +573,7 @@ def subset_dataset_to_block(
         },
     )

-    # TODO: benbovy - flexible indexes: make it work with custom indexes
-    # this will need to pass both indexes and coords to the Dataset constructor
-    result = Dataset(
-        coords={k: idx.to_pandas_index() for k, idx in indexes.items()},
-        attrs=template.attrs,
-    )
+    result = Dataset(coords=coordinates, attrs=template.attrs)

     for index in result._indexes:
         result[index].attrs = template[index].attrs

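At the user level, moving map_blocks onto the Coordinates API means coordinate variables from the template, including non-index ones, are propagated into the result rather than only pandas indexes. A minimal sketch with invented data (requires dask):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"a": ("x", np.arange(10.0))},
        coords={"x": np.arange(10), "label": ("x", list("aabbccddee"))},
    ).chunk({"x": 5})

    # The non-index coordinate "label" is carried through the block-wise call.
    result = xr.map_blocks(lambda block: block + 1, ds, template=ds)
    print(result.compute())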
xarray/core/parallelcompat.py (+37)

@@ -403,6 +403,43 @@ def reduction(
         """
         raise NotImplementedError()

+    def scan(
+        self,
+        func: Callable,
+        binop: Callable,
+        ident: float,
+        arr: T_ChunkedArray,
+        axis: int | None = None,
+        dtype: np.dtype | None = None,
+        **kwargs,
+    ) -> T_ChunkedArray:
+        """
+        General version of a 1D scan, also known as a cumulative array reduction.
+
+        Used in ``ffill`` and ``bfill`` in xarray.
+
+        Parameters
+        ----------
+        func: callable
+            Cumulative function like np.cumsum or np.cumprod
+        binop: callable
+            Associated binary operator like ``np.cumsum->add`` or ``np.cumprod->mul``
+        ident: Number
+            Associated identity like ``np.cumsum->0`` or ``np.cumprod->1``
+        arr: dask Array
+        axis: int, optional
+        dtype: dtype
+
+        Returns
+        -------
+        Chunked array
+
+        See also
+        --------
+        dask.array.cumreduction
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def apply_gufunc(
         self,

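Since scan is the hook that ffill/bfill route through, a short sketch of the observable effect (requires dask, plus bottleneck for the underlying push operation):

    import numpy as np
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, np.nan, 4.0, np.nan], dims="x").chunk(2)
    # On a chunked array, ffill dispatches through the chunk manager's scan();
    # the dask manager above forwards it to dask's cumreduction.
    print(da.ffill("x").compute())  # [1. 1. 1. 4. 4.]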
xarray/core/rolling.py (+5)

@@ -596,6 +596,11 @@ def _bottleneck_reduce(self, func, keep_attrs, **kwargs):
         values = func(
             padded.data, window=self.window[0], min_count=min_count, axis=axis
         )
+        # index 0 is at the rightmost edge of the window
+        # need to reverse index here
+        # see GH #8541
+        if func in [bottleneck.move_argmax, bottleneck.move_argmin]:
+            values = self.window[0] - 1 - values

         if self.center[0]:
             values = values[valid]
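The raw bottleneck convention that this reversal corrects can be seen directly; a small sketch with made-up values:

    import bottleneck
    import numpy as np

    x = np.array([1.0, 5.0, 2.0, 7.0, 3.0])
    raw = bottleneck.move_argmax(x, window=3)
    # bottleneck counts offsets from the right edge of each window
    # (0 = newest element), so xarray maps them back: window - 1 - raw
    fixed = 3 - 1 - raw
    print(raw)    # [nan nan  1.  0.  1.]
    print(fixed)  # [nan nan  1.  2.  1.]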
