Skip to content

Commit 7ffdcc7

Browse files
authored
Allow setting fill_value on Zarr format 3 arrays (#10161)
* Allow setting `fill_value` on Zarr format 3 arrays Closes #10064 * fix * fix format detection * fix * Set use_zarr_fill_value_as_mask=False
1 parent ec88c28 commit 7ffdcc7

File tree

4 files changed

+96
-6
lines changed

4 files changed

+96
-6
lines changed

doc/user-guide/io.rst

+16
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,22 @@ reads. Because this fall-back option is so much slower, xarray issues a
10211021
instead of falling back to try reading non-consolidated metadata.
10221022

10231023

1024+
Fill Values
1025+
~~~~~~~~~~~
1026+
1027+
Zarr arrays have a ``fill_value`` that is used for chunks that were never written to disk.
1028+
For the Zarr version 2 format, Xarray will set ``fill_value`` to be equal to the CF/NetCDF ``"_FillValue"``.
1029+
This is ``np.nan`` by default for floats, and unset otherwise. Note that the Zarr library will set a
1030+
default ``fill_value`` if not specified (usually ``0``).
1031+
1032+
For the Zarr version 3 format, ``_FillValue`` and ``fill_value`` are decoupled.
1033+
So you can set ``fill_value`` in ``encoding`` as usual.
1034+
1035+
Note that at read-time, you can control whether ``_FillValue`` is masked using the
1036+
``mask_and_scale`` kwarg; and whether Zarr's ``fill_value`` is treated as synonymous
1037+
with ``_FillValue`` using the ``use_zarr_fill_value_as_mask`` kwarg to :py:func:`xarray.open_zarr`.
1038+
1039+
10241040
.. _io.kerchunk:
10251041

10261042
Kerchunk

doc/whats-new.rst

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ v2025.03.1 (unreleased)
2222
New Features
2323
~~~~~~~~~~~~
2424

25+
- Allow setting a ``fill_value`` for Zarr format 3 arrays. Specify ``fill_value`` in ``encoding`` as usual.
26+
(:issue:`10064`). By `Deepak Cherian <https://github.com/dcherian>`_.
2527

2628
Breaking changes
2729
~~~~~~~~~~~~~~~~

xarray/backends/zarr.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,7 @@ def extract_zarr_variable_encoding(
426426
raise_on_invalid=False,
427427
name=None,
428428
*,
429+
zarr_format: ZarrFormat,
429430
safe_chunks=True,
430431
region=None,
431432
mode=None,
@@ -443,6 +444,7 @@ def extract_zarr_variable_encoding(
443444
region: tuple[slice, ...], optional
444445
mode: str, optional
445446
shape: tuple[int, ...], optional
447+
zarr_format: Literal[2,3]
446448
Returns
447449
-------
448450
encoding : dict
@@ -463,16 +465,23 @@ def extract_zarr_variable_encoding(
463465
"cache_metadata",
464466
"write_empty_chunks",
465467
}
468+
if zarr_format == 3:
469+
valid_encodings.add("fill_value")
466470

467471
for k in safe_to_drop:
468472
if k in encoding:
469473
del encoding[k]
470474

471475
if raise_on_invalid:
472476
invalid = [k for k in encoding if k not in valid_encodings]
477+
if "fill_value" in invalid and zarr_format == 2:
478+
msg = " Use `_FillValue` to set the Zarr array `fill_value`"
479+
else:
480+
msg = ""
481+
473482
if invalid:
474483
raise ValueError(
475-
f"unexpected encoding parameters for zarr backend: {invalid!r}"
484+
f"unexpected encoding parameters for zarr backend: {invalid!r}." + msg
476485
)
477486
else:
478487
for k in list(encoding):
@@ -1147,7 +1156,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
11471156
if self._use_zarr_fill_value_as_mask:
11481157
fill_value = attrs.pop("_FillValue", None)
11491158
else:
1150-
fill_value = None
1159+
fill_value = v.encoding.pop("fill_value", None)
11511160
if "_FillValue" in attrs:
11521161
# replace with encoded fill value
11531162
fv = attrs.pop("_FillValue")
@@ -1198,6 +1207,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
11981207
region=region,
11991208
mode=self._mode,
12001209
shape=zarr_shape,
1210+
zarr_format=3 if is_zarr_v3_format else 2,
12011211
)
12021212

12031213
if self._mode == "w" or name not in existing_keys:

xarray/tests/test_backends.py

+66-4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import tempfile
1414
import uuid
1515
import warnings
16+
from collections import ChainMap
1617
from collections.abc import Generator, Iterator, Mapping
1718
from contextlib import ExitStack
1819
from io import BytesIO
@@ -3343,6 +3344,67 @@ def test_cache_members(self) -> None:
33433344
observed_keys_2 = sorted(zstore_mut.array_keys())
33443345
assert observed_keys_2 == sorted(array_keys + [new_key])
33453346

3347+
@requires_dask
3348+
@pytest.mark.parametrize("dtype", [int, float])
3349+
def test_zarr_fill_value_setting(self, dtype):
3350+
# When zarr_format=2, _FillValue sets fill_value
3351+
# When zarr_format=3, fill_value is set independently
3352+
# We test this by writing a dask array with compute=False,
3353+
# on read we should receive chunks filled with `fill_value`
3354+
fv = -1
3355+
ds = xr.Dataset(
3356+
{"foo": ("x", dask.array.from_array(np.array([0, 0, 0], dtype=dtype)))}
3357+
)
3358+
expected = xr.Dataset({"foo": ("x", [fv] * 3)})
3359+
3360+
zarr_format_2 = (
3361+
has_zarr_v3 and zarr.config.get("default_zarr_format") == 2
3362+
) or not has_zarr_v3
3363+
if zarr_format_2:
3364+
attr = "_FillValue"
3365+
expected.foo.attrs[attr] = fv
3366+
else:
3367+
attr = "fill_value"
3368+
if dtype is float:
3369+
# for floats, Xarray inserts a default `np.nan`
3370+
expected.foo.attrs["_FillValue"] = np.nan
3371+
3372+
# turn off all decoding so we see what Zarr returns to us.
3373+
# Since chunks are not written, we should receive only `fill_value`
3374+
open_kwargs = {
3375+
"mask_and_scale": False,
3376+
"consolidated": False,
3377+
"use_zarr_fill_value_as_mask": False,
3378+
}
3379+
save_kwargs = dict(compute=False, consolidated=False)
3380+
with self.roundtrip(
3381+
ds,
3382+
save_kwargs=ChainMap(save_kwargs, dict(encoding={"foo": {attr: fv}})),
3383+
open_kwargs=open_kwargs,
3384+
) as actual:
3385+
assert_identical(actual, expected)
3386+
3387+
ds.foo.encoding[attr] = fv
3388+
with self.roundtrip(
3389+
ds, save_kwargs=save_kwargs, open_kwargs=open_kwargs
3390+
) as actual:
3391+
assert_identical(actual, expected)
3392+
3393+
if zarr_format_2:
3394+
ds = ds.drop_encoding()
3395+
with pytest.raises(ValueError, match="_FillValue"):
3396+
with self.roundtrip(
3397+
ds,
3398+
save_kwargs=ChainMap(
3399+
save_kwargs, dict(encoding={"foo": {"fill_value": fv}})
3400+
),
3401+
open_kwargs=open_kwargs,
3402+
):
3403+
pass
3404+
# TODO: this doesn't fail because of the
3405+
# ``raise_on_invalid=vn in check_encoding_set`` line in zarr.py
3406+
# ds.foo.encoding["fill_value"] = fv
3407+
33463408

33473409
@requires_zarr
33483410
@pytest.mark.skipif(
@@ -5827,23 +5889,23 @@ def test_encode_zarr_attr_value() -> None:
58275889
@requires_zarr
58285890
def test_extract_zarr_variable_encoding() -> None:
58295891
var = xr.Variable("x", [1, 2])
5830-
actual = backends.zarr.extract_zarr_variable_encoding(var)
5892+
actual = backends.zarr.extract_zarr_variable_encoding(var, zarr_format=3)
58315893
assert "chunks" in actual
58325894
assert actual["chunks"] == ("auto" if has_zarr_v3 else None)
58335895

58345896
var = xr.Variable("x", [1, 2], encoding={"chunks": (1,)})
5835-
actual = backends.zarr.extract_zarr_variable_encoding(var)
5897+
actual = backends.zarr.extract_zarr_variable_encoding(var, zarr_format=3)
58365898
assert actual["chunks"] == (1,)
58375899

58385900
# does not raise on invalid
58395901
var = xr.Variable("x", [1, 2], encoding={"foo": (1,)})
5840-
actual = backends.zarr.extract_zarr_variable_encoding(var)
5902+
actual = backends.zarr.extract_zarr_variable_encoding(var, zarr_format=3)
58415903

58425904
# raises on invalid
58435905
var = xr.Variable("x", [1, 2], encoding={"foo": (1,)})
58445906
with pytest.raises(ValueError, match=r"unexpected encoding parameters"):
58455907
actual = backends.zarr.extract_zarr_variable_encoding(
5846-
var, raise_on_invalid=True
5908+
var, raise_on_invalid=True, zarr_format=3
58475909
)
58485910

58495911

0 commit comments

Comments
 (0)