Skip to content

Commit 96c9677

Browse files
d-v-bdcherian
andauthored
don't serialize empty tuples for v2 filters, and warn when reading such metadata (#2847)
* don't serialize empty tuples for v2 filters, and warn when reading such metadata * release notes * alter text in comment * add skeleton of metadata spec compliance property test * do something for V3 --------- Co-authored-by: Deepak Cherian <[email protected]>
1 parent e8bfb64 commit 96c9677

File tree

5 files changed

+71
-28
lines changed

5 files changed

+71
-28
lines changed

changes/2847.fix.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fixed a bug where ``ArrayV2Metadata`` could save ``filters`` as an empty array.

src/zarr/core/metadata/v2.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import base64
4+
import warnings
45
from collections.abc import Iterable
56
from enum import Enum
67
from functools import cached_property
@@ -178,6 +179,16 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
178179
# handle the renames
179180
expected |= {"dtype", "chunks"}
180181

182+
# check if `filters` is an empty sequence; if so use None instead and raise a warning
183+
if _data["filters"] is not None and len(_data["filters"]) == 0:
184+
msg = (
185+
"Found an empty list of filters in the array metadata document. "
186+
"This is contrary to the Zarr V2 specification, and will cause an error in the future. "
187+
"Use None (or Null in a JSON document) instead of an empty list of filters."
188+
)
189+
warnings.warn(msg, UserWarning, stacklevel=1)
190+
_data["filters"] = None
191+
181192
_data = {k: v for k, v in _data.items() if k in expected}
182193

183194
return cls(**_data)
@@ -255,7 +266,11 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
255266
else:
256267
msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
257268
raise TypeError(msg)
258-
return tuple(out)
269+
if len(out) == 0:
270+
# Per the v2 spec, an empty tuple is not allowed -- use None to express "no filters"
271+
return None
272+
else:
273+
return tuple(out)
259274
# take a single codec instance and wrap it in a tuple
260275
if isinstance(data, numcodecs.abc.Codec):
261276
return (data,)

tests/test_array.py

+21-26
Original file line numberDiff line numberDiff line change
@@ -972,45 +972,40 @@ def test_default_fill_value(dtype: str, fill_value_expected: object, store: Stor
972972
@staticmethod
973973
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
974974
@pytest.mark.parametrize("empty_value", [None, ()])
975-
async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_value: Any) -> None:
975+
async def test_no_filters_compressors(
976+
store: MemoryStore, dtype: str, empty_value: object, zarr_format: ZarrFormat
977+
) -> None:
976978
"""
977979
Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked.
978980
"""
979981

980-
# v2
981982
arr = await create_array(
982983
store=store,
983984
dtype=dtype,
984985
shape=(10,),
985-
zarr_format=2,
986+
zarr_format=zarr_format,
986987
compressors=empty_value,
987988
filters=empty_value,
988989
)
989990
# Test metadata explicitly
990-
assert arr.metadata.zarr_format == 2 # guard for mypy
991-
# The v2 metadata stores None and () separately
992-
assert arr.metadata.filters == empty_value
993-
# The v2 metadata does not allow tuple for compressor, therefore it is turned into None
994-
assert arr.metadata.compressor is None
995-
996-
assert arr.filters == ()
997-
assert arr.compressors == ()
998-
999-
# v3
1000-
arr = await create_array(
1001-
store=store,
1002-
dtype=dtype,
1003-
shape=(10,),
1004-
compressors=empty_value,
1005-
filters=empty_value,
1006-
)
1007-
assert arr.metadata.zarr_format == 3 # guard for mypy
1008-
if dtype == "str":
1009-
assert arr.metadata.codecs == (VLenUTF8Codec(),)
1010-
assert arr.serializer == VLenUTF8Codec()
991+
if zarr_format == 2:
992+
assert arr.metadata.zarr_format == 2 # guard for mypy
993+
# v2 spec requires that filters be either a collection with at least one filter, or None
994+
assert arr.metadata.filters is None
995+
# Compressor is a single element in v2 metadata; the absence of a compressor is encoded
996+
# as None
997+
assert arr.metadata.compressor is None
998+
999+
assert arr.filters == ()
1000+
assert arr.compressors == ()
10111001
else:
1012-
assert arr.metadata.codecs == (BytesCodec(),)
1013-
assert arr.serializer == BytesCodec()
1002+
assert arr.metadata.zarr_format == 3 # guard for mypy
1003+
if dtype == "str":
1004+
assert arr.metadata.codecs == (VLenUTF8Codec(),)
1005+
assert arr.serializer == VLenUTF8Codec()
1006+
else:
1007+
assert arr.metadata.codecs == (BytesCodec(),)
1008+
assert arr.serializer == BytesCodec()
10141009

10151010
@staticmethod
10161011
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])

tests/test_metadata/test_v2.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def test_parse_zarr_format_invalid(data: Any) -> None:
3333

3434

3535
@pytest.mark.parametrize("attributes", [None, {"foo": "bar"}])
36-
@pytest.mark.parametrize("filters", [None, (), (numcodecs.GZip(),)])
36+
@pytest.mark.parametrize("filters", [None, (numcodecs.GZip(),)])
3737
@pytest.mark.parametrize("compressor", [None, numcodecs.GZip()])
3838
@pytest.mark.parametrize("fill_value", [None, 0, 1])
3939
@pytest.mark.parametrize("order", ["C", "F"])
@@ -81,6 +81,24 @@ def test_metadata_to_dict(
8181
assert observed == expected
8282

8383

84+
def test_filters_empty_tuple_warns() -> None:
85+
metadata_dict = {
86+
"zarr_format": 2,
87+
"shape": (1,),
88+
"chunks": (1,),
89+
"dtype": "uint8",
90+
"order": "C",
91+
"compressor": None,
92+
"filters": (),
93+
"fill_value": 0,
94+
}
95+
with pytest.warns(
96+
UserWarning, match="Found an empty list of filters in the array metadata document."
97+
):
98+
meta = ArrayV2Metadata.from_dict(metadata_dict)
99+
assert meta.filters is None
100+
101+
84102
class TestConsolidated:
85103
@pytest.fixture
86104
async def v2_consolidated_metadata(

tests/test_properties.py

+14
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,20 @@ async def test_roundtrip_array_metadata(
113113
assert actual == expected
114114

115115

116+
@given(store=stores, meta=array_metadata()) # type: ignore[misc]
117+
def test_array_metadata_meets_spec(store: Store, meta: ArrayV2Metadata | ArrayV3Metadata) -> None:
118+
# TODO: fill this out
119+
asdict = meta.to_dict()
120+
if isinstance(meta, ArrayV2Metadata):
121+
assert asdict["filters"] != ()
122+
assert asdict["filters"] is None or isinstance(asdict["filters"], tuple)
123+
assert asdict["zarr_format"] == 2
124+
elif isinstance(meta, ArrayV3Metadata):
125+
assert asdict["zarr_format"] == 3
126+
else:
127+
raise NotImplementedError
128+
129+
116130
# @st.composite
117131
# def advanced_indices(draw, *, shape):
118132
# basic_idxr = draw(

0 commit comments

Comments
 (0)