Skip to content

Commit 1a91802

Merge branch 'main' into groupby-aggs-using-numpy-groupies
* main: (23 commits)
  Vectorize groupby binary ops (pydata#6160)
  Speed-up multi-index html repr + add display_values_threshold option (pydata#6400)
  [pre-commit.ci] pre-commit autoupdate (pydata#6422)
  Fix concat scalar coord dtype (pydata#6418)
  use the `DaskIndexingAdapter` for `duck dask` arrays (pydata#6414)
  Weighted quantile (pydata#6059)
  upgrade `sphinx` (pydata#6415)
  Add kwarg-only breaking change to whats-new (pydata#6409)
  [pre-commit.ci] pre-commit autoupdate (pydata#6396)
  fix DataArray groupby returning a Dataset (pydata#6394)
  reindex: fix missing variable metadata (pydata#6389)
  [skip-ci] Add benchmarks for groupby math (pydata#6390)
  Fix concat with scalar coordinate (pydata#6385)
  isel: convert IndexVariable to Variable if index is dropped (pydata#6388)
  fix dataset groupby combine dataarray func (pydata#6386)
  fix concat with variable or dataarray as dim (pydata#6387)
  pydata#6367 Fix for time units checking could produce "unhashable type" error (pydata#6368)
  Explicit indexes (pydata#5692)
  Remove test_rasterio_vrt_network (pydata#6371)
  Allow write_empty_chunks to be set in Zarr encoding (pydata#6348)
  ...
2 parents 2694dbe + 2e93d54 commit 1a91802


55 files changed (+5289, -2290 lines)

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ repos:
     hooks:
       - id: isort
   - repo: https://github.com/asottile/pyupgrade
-    rev: v2.31.0
+    rev: v2.31.1
     hooks:
       - id: pyupgrade
         args:
@@ -45,7 +45,7 @@ repos:
   #   - id: velin
   #     args: ["--write", "--compact"]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.931
+    rev: v0.942
     hooks:
       - id: mypy
         # Copied from setup.cfg

.readthedocs.yaml

Lines changed: 6 additions & 2 deletions
@@ -1,10 +1,14 @@
 version: 2
+
 build:
   os: ubuntu-20.04
   tools:
     python: mambaforge-4.10
-sphinx:
-  fail_on_warning: true
+
 conda:
   environment: ci/requirements/doc.yml
+
+sphinx:
+  fail_on_warning: true
+
 formats: []

asv_bench/benchmarks/groupby.py

Lines changed: 45 additions & 0 deletions
@@ -17,6 +17,8 @@ def setup(self, *args, **kwargs):
             }
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
+        self.ds1d_mean = self.ds1d.groupby("b").mean()
+        self.ds2d_mean = self.ds2d.groupby("b").mean()

     @parameterized(["ndim"], [(1, 2)])
     def time_init(self, ndim):
@@ -32,15 +34,30 @@ def time_agg_large_num_groups(self, method, ndim):
         ds = getattr(self, f"ds{ndim}d")
         getattr(ds.groupby("b"), method)()

+    def time_groupby_binary_op_1d(self):
+        self.ds1d - self.ds1d_mean
+
+    def time_groupby_binary_op_2d(self):
+        self.ds2d - self.ds2d_mean
+
+    def peakmem_groupby_binary_op_1d(self):
+        self.ds1d - self.ds1d_mean
+
+    def peakmem_groupby_binary_op_2d(self):
+        self.ds2d - self.ds2d_mean
+

 class GroupByDask(GroupBy):
     def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
+
         self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2))
         self.ds1d["c"] = self.ds1d["c"].chunk({"dim_0": 50})
         self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2))
         self.ds2d["c"] = self.ds2d["c"].chunk({"dim_0": 50, "z": 5})
+        self.ds1d_mean = self.ds1d.groupby("b").mean()
+        self.ds2d_mean = self.ds2d.groupby("b").mean()


 class GroupByPandasDataFrame(GroupBy):
@@ -52,6 +69,13 @@ def setup(self, *args, **kwargs):

         super().setup(**kwargs)
         self.ds1d = self.ds1d.to_dataframe()
+        self.ds1d_mean = self.ds1d.groupby("b").mean()
+
+    def time_groupby_binary_op_2d(self):
+        raise NotImplementedError
+
+    def peakmem_groupby_binary_op_2d(self):
+        raise NotImplementedError


 class GroupByDaskDataFrame(GroupBy):
@@ -64,6 +88,13 @@ def setup(self, *args, **kwargs):
         requires_dask()
         super().setup(**kwargs)
         self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
+        self.ds1d_mean = self.ds1d.groupby("b").mean()
+
+    def time_groupby_binary_op_2d(self):
+        raise NotImplementedError
+
+    def peakmem_groupby_binary_op_2d(self):
+        raise NotImplementedError


 class Resample:
@@ -75,6 +106,8 @@ def setup(self, *args, **kwargs):
             coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)},
         )
         self.ds2d = self.ds1d.expand_dims(z=10)
+        self.ds1d_mean = self.ds1d.resample(time="48H").mean()
+        self.ds2d_mean = self.ds2d.resample(time="48H").mean()

     @parameterized(["ndim"], [(1, 2)])
     def time_init(self, ndim):
@@ -90,6 +123,18 @@ def time_agg_large_num_groups(self, method, ndim):
         ds = getattr(self, f"ds{ndim}d")
         getattr(ds.resample(time="48H"), method)()

+    def time_groupby_binary_op_1d(self):
+        self.ds1d - self.ds1d_mean
+
+    def time_groupby_binary_op_2d(self):
+        self.ds2d - self.ds2d_mean
+
+    def peakmem_groupby_binary_op_1d(self):
+        self.ds1d - self.ds1d_mean
+
+    def peakmem_groupby_binary_op_2d(self):
+        self.ds2d - self.ds2d_mean
+

 class ResampleDask(Resample):
     def setup(self, *args, **kwargs):
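
For context, these new timings target the vectorized groupby arithmetic from "Vectorize groupby binary ops (pydata#6160)". A minimal sketch of the kind of operation being benchmarked (this toy dataset is illustrative, not the benchmark's actual setup):

import numpy as np
import xarray as xr

# Illustrative dataset: one data variable plus an integer grouping coordinate.
ds = xr.Dataset(
    {"c": ("dim_0", np.random.randn(1_000))},
    coords={"b": ("dim_0", np.random.randint(0, 10, size=1_000))},
)

# Group-wise anomaly: subtract each group's mean from its members.
# With pydata#6160 this broadcast is vectorized instead of looping per group.
anomaly = ds.groupby("b") - ds.groupby("b").mean()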

ci/requirements/doc.yml

Lines changed: 3 additions & 3 deletions
@@ -4,16 +4,16 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
-  - python=3.8
+  - python=3.9
   - bottleneck
   - cartopy
   - cfgrib>=0.9
   - dask-core>=2.30
   - h5netcdf>=0.7.4
   - ipykernel
   - ipython
-  - ipython_genutils # remove once `nbconvert` fixed its dependencies
   - iris>=2.3
+  - jinja2<3.1 # remove once nbconvert fixed the use of removed functions
   - jupyter_client
   - matplotlib-base
   - nbsphinx
@@ -34,7 +34,7 @@ dependencies:
   - sphinx-book-theme >= 0.0.38
   - sphinx-copybutton
   - sphinx-panels
-  - sphinx<4
+  - sphinx!=4.4.0
   - zarr>=2.4
   - pip:
     - sphinxext-rediraffe

doc/api.rst

Lines changed: 2 additions & 0 deletions
@@ -944,6 +944,7 @@ Dataset

    DatasetWeighted
    DatasetWeighted.mean
+   DatasetWeighted.quantile
    DatasetWeighted.sum
    DatasetWeighted.std
    DatasetWeighted.var
@@ -958,6 +959,7 @@ DataArray

    DataArrayWeighted
    DataArrayWeighted.mean
+   DataArrayWeighted.quantile
    DataArrayWeighted.sum
    DataArrayWeighted.std
    DataArrayWeighted.var

doc/developers-meeting.rst

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ The meeting occurs on `Zoom <https://us02web.zoom.us/j/88251613296?pwd=azZsSkU1U

 Notes for the meeting are kept `here <https://hackmd.io/@U4W-olO3TX-hc-cvbjNe4A/xarray-dev-meeting/edit>`__.

-There is a `GitHub issue <https://github.com/pydata/xarray/issues/4001>`__ for changes to the meeting.
+There is a :issue:`GitHub issue <4001>` for changes to the meeting.

 You can subscribe to this calendar to be notified of changes:

doc/internals/extending-xarray.rst

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,10 @@ easy to inadvertently use internal APIs when subclassing, which means that your
1818
code may break when xarray upgrades. Furthermore, many builtin methods will
1919
only return native xarray objects.
2020

21-
The standard advice is to use `composition over inheritance`__, but
21+
The standard advice is to use :issue:`composition over inheritance <706>`, but
2222
reimplementing an API as large as xarray's on your own objects can be an onerous
2323
task, even if most methods are only forwarding to xarray implementations.
2424

25-
__ https://github.com/pydata/xarray/issues/706
26-
2725
If you simply want the ability to call a function with the syntax of a
2826
method call, then the builtin :py:meth:`~xarray.DataArray.pipe` method (copied
2927
from pandas) may suffice.
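
As a side note, a minimal sketch of the ``pipe`` pattern mentioned in the surrounding context (the ``scale`` function is purely illustrative and not part of xarray):

import numpy as np
import xarray as xr

def scale(da, factor):
    # Any plain function taking the object as its first argument works with pipe.
    return da * factor

da = xr.DataArray(np.arange(4.0), dims="x")
da.pipe(scale, factor=10)  # equivalent to scale(da, factor=10)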

doc/roadmap.rst

Lines changed: 5 additions & 8 deletions
@@ -114,8 +114,7 @@ xarray's data model, e.g., as attributes on the ``Dataset`` and
 coordinates in xarray operations, but will no longer would need to have
 a one-to-one correspondence with coordinate variables. Instead, an index
 should be able to refer to multiple (possibly multidimensional)
-coordinates that define it. See `GH
-1603 <https://github.com/pydata/xarray/issues/1603>`__ for full details
+coordinates that define it. See :issue:`1603` for full details.

 Specific tasks:

@@ -182,11 +181,9 @@ backends means that users can not easily build backend interface for
 xarray in third-party libraries.

 The idea of refactoring the backends API and exposing it to users was
-originally proposed in `GH
-1970 <https://github.com/pydata/xarray/issues/1970>`__. The idea would
-be to develop a well tested and generic backend base class and
-associated utilities for external use. Specific tasks for this
-development would include:
+originally proposed in :issue:`1970`. The idea would be to develop a
+well tested and generic backend base class and associated utilities
+for external use. Specific tasks for this development would include:

 - Exposing an abstract backend for writing new storage systems.
 - Exposing utilities for features like automatic closing of files,
@@ -225,7 +222,7 @@ examples include:

 A new tree-like data structure which is essentially a structured hierarchical
 collection of Datasets could represent these cases, and would instead map to
-multiple netCDF groups (see `GH4118 <https://github.com/pydata/xarray/issues/4118>`__.).
+multiple netCDF groups (see :issue:`4118`).

 Currently there are several libraries which have wrapped xarray in order to build
 domain-specific data structures (e.g. `xarray-multiscale <https://github.com/JaneliaSciComp/xarray-multiscale>`__.),

doc/tutorials-and-videos.rst

Lines changed: 14 additions & 0 deletions
@@ -18,6 +18,20 @@ Videos
 .. panels::
    :card: text-center

+   ---
+   Xdev Python Tutorial Seminar Series 2022 Thinking with Xarray : High-level computation patterns | Deepak Cherian
+   ^^^
+   .. raw:: html
+
+      <iframe width="100%" src="https://www.youtube.com/embed/TSw3GF_d2y8" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
+
+   ---
+   Xdev Python Tutorial Seminar Series 2021 seminar introducing xarray (2 of 2) | Anderson Banihirwe
+   ^^^
+   .. raw:: html
+
+      <iframe width="100%" src="https://www.youtube.com/embed/2H_4drBwORY" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
+
    ---
    Xdev Python Tutorial Seminar Series 2021 seminar introducing xarray (1 of 2) | Anderson Banihirwe
    ^^^

doc/user-guide/computation.rst

Lines changed: 7 additions & 1 deletion
@@ -265,7 +265,7 @@ Weighted array reductions

 :py:class:`DataArray` and :py:class:`Dataset` objects include :py:meth:`DataArray.weighted`
 and :py:meth:`Dataset.weighted` array reduction methods. They currently
-support weighted ``sum``, ``mean``, ``std`` and ``var``.
+support weighted ``sum``, ``mean``, ``std``, ``var`` and ``quantile``.

 .. ipython:: python

@@ -293,6 +293,12 @@ Calculate the weighted mean:

     weighted_prec.mean(dim="month")

+Calculate the weighted quantile:
+
+.. ipython:: python
+
+    weighted_prec.quantile(q=0.5, dim="month")
+
 The weighted sum corresponds to:

 .. ipython:: python
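
For context, a self-contained sketch of the new weighted ``quantile`` call documented above (the toy data here is illustrative, not taken from the docs page):

import numpy as np
import xarray as xr

prec = xr.DataArray(np.random.rand(12), dims="month")            # toy precipitation
weights = xr.DataArray(np.linspace(0.1, 1.0, 12), dims="month")  # toy weights

# Weighted median along "month", using the weighted quantile added in pydata#6059.
prec.weighted(weights).quantile(q=0.5, dim="month")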

doc/user-guide/plotting.rst

Lines changed: 1 addition & 1 deletion
@@ -251,7 +251,7 @@ Finally, if a dataset does not have any coordinates it enumerates all data point
 .. ipython:: python
     :okwarning:

-    air1d_multi = air1d_multi.drop("date")
+    air1d_multi = air1d_multi.drop(["date", "time", "decimal_day"])
     air1d_multi.plot()

 The same applies to 2D plots below.

doc/whats-new.rst

Lines changed: 43 additions & 4 deletions
@@ -22,10 +22,30 @@ v2022.03.1 (unreleased)
 New Features
 ~~~~~~~~~~~~

+- Add a weighted ``quantile`` method to :py:class:`~core.weighted.DatasetWeighted` and
+  :py:class:`~core.weighted.DataArrayWeighted` (:pull:`6059`). By
+  `Christian Jauvin <https://github.com/cjauvin>`_ and `David Huard <https://github.com/huard>`_.
+- Add a ``create_index=True`` parameter to :py:meth:`Dataset.stack` and
+  :py:meth:`DataArray.stack` so that the creation of multi-indexes is optional
+  (:pull:`5692`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.
+- Multi-index levels are now accessible through their own, regular coordinates
+  instead of virtual coordinates (:pull:`5692`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.
+- Add a ``display_values_threshold`` option to control the total number of array
+  elements which trigger summarization rather than full repr in (numpy) array
+  detailed views of the html repr (:pull:`6400`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.

 Breaking changes
 ~~~~~~~~~~~~~~~~

+- The Dataset and DataArray ``rename*`` methods do not implicitly add or drop
+  indexes. (:pull:`5692`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.
+- Many arguments like ``keep_attrs``, ``axis``, and ``skipna`` are now keyword
+  only for all reduction operations like ``.mean``.
+  By `Deepak Cherian <https://github.com/dcherian>`_, `Jimmy Westling <https://github.com/illviljan>`_.

 Deprecations
 ~~~~~~~~~~~~
@@ -36,17 +56,36 @@ Bug fixes

 - Set ``skipna=None`` for all ``quantile`` methods (e.g. :py:meth:`Dataset.quantile`) and
   ensure it skips missing values for float dtypes (consistent with other methods). This should
-  not change the behavior (:pull:`6303`). By `Mathias Hauser <https://github.com/mathause>`_.
+  not change the behavior (:pull:`6303`).
+  By `Mathias Hauser <https://github.com/mathause>`_.
+- Many bugs fixed by the explicit indexes refactor, mainly related to multi-index (virtual)
+  coordinates. See the corresponding pull-request on GitHub for more details. (:pull:`5692`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.
+- Fixed "unhashable type" error trying to read NetCDF file with variable having its 'units'
+  attribute not ``str`` (e.g. ``numpy.ndarray``) (:issue:`6368`).
+  By `Oleh Khoma <https://github.com/okhoma>`_.
+- Fixed the poor html repr performance on large multi-indexes (:pull:`6400`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.
+- Allow fancy indexing of duck dask arrays along multiple dimensions. (:pull:`6414`)
+  By `Justus Magin <https://github.com/keewis>`_.

 Documentation
 ~~~~~~~~~~~~~

+Performance
+~~~~~~~~~~~
+
+- GroupBy binary operations are now vectorized.
+  Previously this involved looping over all groups. (:issue:`5804`, :pull:`6160`)
+  By `Deepak Cherian <https://github.com/dcherian>`_.

 Internal Changes
 ~~~~~~~~~~~~~~~~

+- Many internal changes due to the explicit indexes refactor. See the
+  corresponding pull-request on GitHub for more details. (:pull:`5692`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.

-.. _whats-new.2022.02.0:
 .. _whats-new.2022.03.0:

 v2022.03.0 (2 March 2022)
@@ -2129,7 +2168,7 @@ Documentation
 - Created a "How do I..." section (:ref:`howdoi`) for solutions to common questions. (:pull:`3357`).
   By `Deepak Cherian <https://github.com/dcherian>`_.
 - Add examples for :py:meth:`Dataset.swap_dims` and :py:meth:`DataArray.swap_dims`
-  (pull:`3331`, pull:`3331`). By `Justus Magin <https://github.com/keewis>`_.
+  (:pull:`3331`, :pull:`3331`). By `Justus Magin <https://github.com/keewis>`_.
 - Add examples for :py:meth:`align`, :py:meth:`merge`, :py:meth:`combine_by_coords`,
   :py:meth:`full_like`, :py:meth:`zeros_like`, :py:meth:`ones_like`, :py:meth:`Dataset.pipe`,
   :py:meth:`Dataset.assign`, :py:meth:`Dataset.reindex`, :py:meth:`Dataset.fillna` (:pull:`3328`).
@@ -2713,7 +2752,7 @@ Removes inadvertently introduced setup dependency on pytest-runner
 will be Python 3 only, but older versions of xarray will always be available
 for Python 2.7 users. For the more details, see:

-- `Xarray Github issue discussing dropping Python 2 <https://github.com/pydata/xarray/issues/1829>`__
+- :issue:`Xarray Github issue discussing dropping Python 2 <1829>`
 - `Python 3 Statement <http://www.python3statement.org/>`__
 - `Tips on porting to Python 3 <https://docs.python.org/3/howto/pyporting.html>`__

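
To make the two API-facing entries above concrete, a hedged sketch of what the keyword-only reductions and the new ``create_index`` flag look like in user code (the array here is illustrative):

import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(6.0).reshape(2, 3), dims=("x", "y"))

# Reduction arguments such as skipna/keep_attrs are now keyword-only:
da.mean(dim="x", skipna=True)   # ok
# da.mean("x", True)            # passing them positionally now raises TypeError

# stack() can skip building a pandas MultiIndex for the new dimension:
stacked = da.stack(z=("x", "y"), create_index=False)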

xarray/backends/zarr.py

Lines changed: 7 additions & 1 deletion
@@ -212,7 +212,13 @@ def extract_zarr_variable_encoding(
     """
     encoding = variable.encoding.copy()

-    valid_encodings = {"chunks", "compressor", "filters", "cache_metadata"}
+    valid_encodings = {
+        "chunks",
+        "compressor",
+        "filters",
+        "cache_metadata",
+        "write_empty_chunks",
+    }

     if raise_on_invalid:
         invalid = [k for k in encoding if k not in valid_encodings]
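
For context, a hedged sketch of how the newly accepted encoding key could be used when writing to Zarr (the store path and variable name are illustrative, and a zarr version that understands ``write_empty_chunks`` is assumed):

import numpy as np
import xarray as xr

ds = xr.Dataset({"var": (("x",), np.zeros(100))})

# write_empty_chunks is now passed through to zarr rather than rejected
# as an invalid encoding key by extract_zarr_variable_encoding.
ds.to_zarr(
    "example.zarr",
    encoding={"var": {"chunks": (25,), "write_empty_chunks": False}},
)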

xarray/coding/times.py

Lines changed: 2 additions & 1 deletion
@@ -695,7 +695,8 @@ def encode(self, variable, name=None):
     def decode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_decoding(variable)

-        if "units" in attrs and attrs["units"] in TIME_UNITS:
+        units = attrs.get("units")
+        if isinstance(units, str) and units in TIME_UNITS:
             units = pop_to(attrs, encoding, "units")
             transform = partial(decode_cf_timedelta, units=units)
             dtype = np.dtype("timedelta64[ns]")
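
For context, a minimal sketch of why the old membership test failed when ``units`` was not a string (``TIME_UNITS`` here is a simplified stand-in for the real constant):

import numpy as np

TIME_UNITS = {"days", "hours", "minutes", "seconds"}  # simplified stand-in

units = np.array(["days"])  # a non-str 'units' attribute, as reported in pydata#6368

# The old check did `units in TIME_UNITS`, which hashes `units` and raises
# TypeError: unhashable type: 'numpy.ndarray'. The new guard short-circuits:
ok = isinstance(units, str) and units in TIME_UNITS  # False, no error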
