Skip to content

Parse 0 fill value as "" for str dtype #2798

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2798.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Ensure that a fill value of ``0`` for the ``str`` dtype is parsed to ``""`` when creating Zarr format 2 arrays.
6 changes: 4 additions & 2 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
_warn_order_kwarg,
concurrent_map,
parse_dtype,
parse_fill_value,
parse_order,
parse_shapelike,
product,
Expand Down Expand Up @@ -3901,6 +3902,7 @@ async def init_array(

from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation

fill_value_parsed = parse_fill_value(fill_value, dtype, zarr_format)
dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format)
shape_parsed = parse_shapelike(shape)
chunk_key_encoding_parsed = _parse_chunk_key_encoding(
Expand Down Expand Up @@ -3947,7 +3949,7 @@ async def init_array(
dtype=dtype_parsed,
chunks=chunk_shape_parsed,
dimension_separator=chunk_key_encoding_parsed.separator,
fill_value=fill_value,
fill_value=fill_value_parsed,
order=order_parsed,
filters=filters_parsed,
compressor=compressor_parsed,
Expand Down Expand Up @@ -3985,7 +3987,7 @@ async def init_array(
meta = AsyncArray._create_metadata_v3(
shape=shape_parsed,
dtype=dtype_parsed,
fill_value=fill_value,
fill_value=fill_value_parsed,
chunk_shape=chunks_out,
chunk_key_encoding=chunk_key_encoding_parsed,
codecs=codecs_out,
Expand Down
4 changes: 1 addition & 3 deletions src/zarr/core/array_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from zarr.core.common import (
MemoryOrder,
parse_bool,
parse_fill_value,
parse_order,
parse_shapelike,
)
Expand Down Expand Up @@ -102,11 +101,10 @@ def __init__(
) -> None:
shape_parsed = parse_shapelike(shape)
dtype_parsed = np.dtype(dtype)
fill_value_parsed = parse_fill_value(fill_value)

object.__setattr__(self, "shape", shape_parsed)
object.__setattr__(self, "dtype", dtype_parsed)
object.__setattr__(self, "fill_value", fill_value_parsed)
object.__setattr__(self, "fill_value", fill_value)
object.__setattr__(self, "config", config)
object.__setattr__(self, "prototype", prototype)

Expand Down
6 changes: 4 additions & 2 deletions src/zarr/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,11 @@ def parse_shapelike(data: int | Iterable[int]) -> tuple[int, ...]:
return data_tuple


def parse_fill_value(data: Any) -> Any:
def parse_fill_value(fill_value: Any, dtype: Any, zarr_format: ZarrFormat) -> Any:
if zarr_format == 2 and (dtype is str or dtype == "str") and fill_value == 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this needs at least a comment explaining why we are doing this. What was zarr-python 2's behavior in this situation?

fill_value = ""
# todo: real validation
return data
return fill_value


def parse_order(data: Any) -> Literal["C", "F"]:
Expand Down
15 changes: 15 additions & 0 deletions tests/test_metadata/test_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,18 @@ def test_zstd_checksum() -> None:
arr.metadata.to_buffer_dict(default_buffer_prototype())[".zarray"].to_bytes()
)
assert "checksum" not in metadata["compressor"]


def test_0_fill_str_type() -> None:
    """A fill value of ``0`` on a Zarr format 2 ``str`` array reads back as empty strings."""
    array = zarr.create_array(
        store=zarr.storage.MemoryStore(),
        dtype=str,
        shape=(5,),
        chunks=(2,),
        fill_value=0,
        zarr_format=2,
        overwrite=True,
    )

    # Nothing is written to the array before it is read, so the values
    # compared here are whatever the array yields for its fill value.
    np.testing.assert_array_equal(array[:], ["", "", "", "", ""])