From c5e706b863324bfe17a5378c1a9b2d669d59cf2d Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Wed, 5 Feb 2025 21:38:29 +1100 Subject: [PATCH 1/4] fix: implicit fill value initialisation - initialise empty chunks to the default fill value during writing - add default fill value for datetime, timedelta, structured data types --- src/zarr/core/codec_pipeline.py | 34 ++++++++++++++++----------------- src/zarr/core/metadata/v2.py | 8 ++++++++ 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 583ca01c5e..a35c5ca210 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -56,6 +56,19 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[ return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] +def fill_value_or_default(chunk_spec: ArraySpec) -> Any: + fill_value = chunk_spec.fill_value + if fill_value is None: + # Zarr V2 allowed `fill_value` to be null in the metadata. + # Zarr V3 requires it to be set. This has already been + # validated when decoding the metadata, but we support reading + # Zarr V2 data and need to support the case where fill_value + # is None. + return _default_fill_value(dtype=chunk_spec.dtype) + else: + return fill_value + + @dataclass(frozen=True) class BatchedCodecPipeline(CodecPipeline): """Default codec pipeline. @@ -247,17 +260,7 @@ async def read_batch( if chunk_array is not None: out[out_selection] = chunk_array else: - fill_value = chunk_spec.fill_value - - if fill_value is None: - # Zarr V2 allowed `fill_value` to be null in the metadata. - # Zarr V3 requires it to be set. This has already been - # validated when decoding the metadata, but we support reading - # Zarr V2 data and need to support the case where fill_value - # is None. - fill_value = _default_fill_value(dtype=chunk_spec.dtype) - - out[out_selection] = fill_value + out[out_selection] = fill_value_or_default(chunk_spec) else: chunk_bytes_batch = await concurrent_map( [ @@ -284,10 +287,7 @@ async def read_batch( tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: - fill_value = chunk_spec.fill_value - if fill_value is None: - fill_value = _default_fill_value(dtype=chunk_spec.dtype) - out[out_selection] = fill_value + out[out_selection] = fill_value_or_default(chunk_spec) def _merge_chunk_array( self, @@ -305,7 +305,7 @@ def _merge_chunk_array( shape=chunk_spec.shape, dtype=chunk_spec.dtype, order=chunk_spec.order, - fill_value=chunk_spec.fill_value, + fill_value=fill_value_or_default(chunk_spec), ) else: chunk_array = existing_chunk_array.copy() # make a writable copy @@ -394,7 +394,7 @@ async def _read_key( chunk_array_batch.append(None) # type: ignore[unreachable] else: if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( - chunk_spec.fill_value + fill_value_or_default(chunk_spec) ): chunk_array_batch.append(None) else: diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 192db5b203..7d6741500e 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -349,6 +349,14 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return b"" elif dtype.kind in "UO": return "" + elif dtype.kind in "Mm": + return dtype.type('nat') + elif dtype.kind == "V": + if dtype.fields is not None: + default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()]) + return np.array([default], dtype=dtype) + else: + return np.zeros(1, dtype=dtype) else: return dtype.type(0) From 19ec879fe1dbdc67ceca401968369c39b95f9dba Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Wed, 5 Feb 2025 23:13:25 +1100 Subject: [PATCH 2/4] fmt --- src/zarr/core/metadata/v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 7d6741500e..25697c4545 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -350,7 +350,7 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: elif dtype.kind in "UO": return "" elif dtype.kind in "Mm": - return dtype.type('nat') + return dtype.type("nat") elif dtype.kind == "V": if dtype.fields is not None: default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()]) From ea99951c66b1098ca43b594eb6f5308ce4f6b8d5 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Wed, 5 Feb 2025 23:34:12 +1100 Subject: [PATCH 3/4] add "other" dtype test --- tests/test_v2.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_v2.py b/tests/test_v2.py index 4c689c8e64..0a4487cfcc 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -313,3 +313,22 @@ def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: za[...] = a za = zarr.open_array(store=array_path) assert (a == za[:]).all() + + +@pytest.mark.parametrize("fill_value", [None, b"x"], ids=["no_fill", "fill"]) +def test_other_dtype_roundtrip(fill_value, tmp_path) -> None: + a = np.array([b"a\0\0", b"bb", b"ccc"], dtype="V7") + array_path = tmp_path / "data.zarr" + za = zarr.create( + shape=(3,), + store=array_path, + chunks=(2,), + fill_value=fill_value, + zarr_format=2, + dtype=a.dtype, + ) + if fill_value is not None: + assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all() + za[...] = a + za = zarr.open_array(store=array_path) + assert (a == za[:]).all() From 077a1ba5211c76ec562ddff736cbc940b5782cf4 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 11 Feb 2025 07:53:07 +1100 Subject: [PATCH 4/4] changelog --- changes/2799.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/2799.bugfix.rst diff --git a/changes/2799.bugfix.rst b/changes/2799.bugfix.rst new file mode 100644 index 0000000000..f22b7074bb --- /dev/null +++ b/changes/2799.bugfix.rst @@ -0,0 +1 @@ +Enitialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types \ No newline at end of file