diff --git a/changes/2622.feature.rst b/changes/2622.feature.rst new file mode 100644 index 0000000000..f5c7cbe192 --- /dev/null +++ b/changes/2622.feature.rst @@ -0,0 +1 @@ +Add ``zarr.from_array`` using concurrent streaming of source data \ No newline at end of file diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 9a6e680e4d..c585e4f0d3 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -145,6 +145,8 @@ Other 3.0.1 (Jan. 17, 2025) --------------------- +* Implement ``zarr.from_array`` using concurrent streaming (:issue:`2622`). + Bug fixes ~~~~~~~~~ * Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`). diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 4ffa4c9bbc..31796601b3 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -11,6 +11,7 @@ create_hierarchy, empty, empty_like, + from_array, full, full_like, group, @@ -54,6 +55,7 @@ "create_hierarchy", "empty", "empty_like", + "from_array", "full", "full_like", "group", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 6059893920..2e9319eaba 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,7 +9,7 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata +from zarr.core.array import Array, AsyncArray, create_array, from_array, get_array_metadata from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( @@ -56,6 +56,7 @@ "create_hierarchy", "empty", "empty_like", + "from_array", "full", "full_like", "group", @@ -533,7 +534,7 @@ async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None = async def array( - data: npt.ArrayLike, **kwargs: Any + data: npt.ArrayLike | Array, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array filled with `data`. @@ -550,13 +551,16 @@ async def array( The new array. """ + if isinstance(data, Array): + return await from_array(data=data, **kwargs) + # ensure data is array-like if not hasattr(data, "shape") or not hasattr(data, "dtype"): data = np.asanyarray(data) # setup dtype kw_dtype = kwargs.get("dtype") - if kw_dtype is None: + if kw_dtype is None and hasattr(data, "dtype"): kwargs["dtype"] = data.dtype else: kwargs["dtype"] = kw_dtype diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 9424ae1fde..b19aced2be 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -50,6 +50,7 @@ "create_hierarchy", "empty", "empty_like", + "from_array", "full", "full_like", "group", @@ -359,7 +360,7 @@ def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> An # TODO: add type annotations for kwargs -def array(data: npt.ArrayLike, **kwargs: Any) -> Array: +def array(data: npt.ArrayLike | Array, **kwargs: Any) -> Array: """Create an array filled with `data`. Parameters @@ -759,11 +760,12 @@ def create_array( order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, + write_data: bool = True, ) -> Array: """Create an array. 
@@ -857,6 +859,11 @@ def create_array( Whether to overwrite an array with the same name in the store, if one exists. config : ArrayConfigLike, optional Runtime configuration for the array. + write_data : bool + If a pre-existing array-like object was provided to this function via the ``data`` parameter + then ``write_data`` determines whether the values in that array-like object should be + written to the Zarr array created by this function. If ``write_data`` is ``False``, then the + array will be left empty. Returns ------- @@ -866,7 +873,7 @@ def create_array( Examples -------- >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') + >>> store = zarr.storage.MemoryStore() >>> arr = await zarr.create_array( >>> store=store, >>> shape=(100,100), @@ -897,6 +904,220 @@ def create_array( storage_options=storage_options, overwrite=overwrite, config=config, + write_data=write_data, + ) + ) + ) + + +def from_array( + store: str | StoreLike, + *, + data: Array | npt.ArrayLike, + write_data: bool = True, + name: str | None = None, + chunks: Literal["auto", "keep"] | ChunkCoords = "keep", + shards: ShardsLike | None | Literal["keep"] = "keep", + filters: FiltersLike | Literal["keep"] = "keep", + compressors: CompressorsLike | Literal["keep"] = "keep", + serializer: SerializerLike | Literal["keep"] = "keep", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = None, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfigLike | None = None, +) -> Array: + """Create an array from an existing array or array-like. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file for the new array. + data : Array | array-like + The array to copy. + write_data : bool, default True + Whether to copy the data from the input array to the new array. + If ``write_data`` is ``False``, the new array will be created with the same metadata as the + input array, but without any data. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + chunks : ChunkCoords or "auto" or "keep", optional + Chunk shape of the array. + Following values are supported: + + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the chunk shape. + + If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". + shards : ChunkCoords, optional + Shard shape of the array. + Following values are supported: + + - "auto": Automatically determine the shard shape based on the array's shape and chunk shape. + - "keep": Retain the shard shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the shard shape. + - None: No sharding. + + If not specified, defaults to "keep" if data is a zarr Array, otherwise None. + filters : Iterable[Codec] or "auto" or "keep", optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. 
+ + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that + the order of your filters is consistent with the behavior of each filter. + + Following values are supported: + + - Iterable[Codec]: List of filters to apply to the array. + - "auto": Automatically determine the filters based on the array's dtype. + - "keep": Retain the filters of the data array if it is a zarr Array. + + If no ``filters`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". + compressors : Iterable[Codec] or "auto" or "keep", optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. + + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. + + Following values are supported: + + - Iterable[Codec]: List of compressors to apply to the array. + - "auto": Automatically determine the compressors based on the array's dtype. + - "keep": Retain the compressors of the input array if it is a zarr Array. + + If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". + serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional + Array-to-bytes codec to use for encoding the array data. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. + + Following values are supported: + + - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. + - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. + - "auto": a default serializer will be used. These defaults can be changed by modifying the value of + ``array.v3_default_serializer`` in :mod:`zarr.core.config`. + - "keep": Retain the serializer of the input array if it is a zarr Array. + + fill_value : Any, optional + Fill value for the array. + If not specified, defaults to the fill value of the data array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If not specified, defaults to the memory order of the data array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + If not specified, defaults to the zarr format of the data array. + attributes : dict, optional + Attributes for the array. + If not specified, defaults to the attributes of the data array. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. + If not specified and the data array has the same zarr format as the target array, + the chunk key encoding of the data array is used. 
+ dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. + If not specified, defaults to the dimension names of the data array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. + + Returns + ------- + Array + The array. + + Examples + -------- + Create an array from an existing Array:: + + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> store2 = zarr.storage.LocalStore('example.zarr') + >>> arr = zarr.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='int32', + >>> fill_value=0) + >>> arr2 = zarr.from_array(store2, data=arr) + + + Create an array from an existing NumPy array:: + + >>> import numpy as np + >>> arr3 = zarr.from_array( + >>> zarr.storage.MemoryStore(), + >>> data=np.arange(10000, dtype='i4').reshape(100, 100), + >>> ) + + + Create an array from any array-like object:: + + >>> arr4 = zarr.from_array( + >>> zarr.storage.MemoryStore(), + >>> data=[[1, 2], [3, 4]], + >>> ) + + >>> arr4[...] + array([[1, 2],[3, 4]]) + + Create an array from an existing Array without copying the data:: + + >>> arr5 = zarr.from_array( + >>> zarr.storage.MemoryStore(), + >>> data=arr4, + >>> write_data=False, + >>> ) + + >>> arr5[...] + array([[0, 0],[0, 0]]) + """ + return Array( + sync( + zarr.core.array.from_array( + store, + data=data, + write_data=write_data, + name=name, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + storage_options=storage_options, + overwrite=overwrite, + config=config, ) ) ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 0e03cbcabb..2f85ae296c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -25,6 +25,7 @@ import numpy.typing as npt from typing_extensions import deprecated +import zarr from zarr._compat import _deprecate_positional_args from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete @@ -888,7 +889,7 @@ async def open( Examples -------- >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') + >>> store = zarr.storage.MemoryStore() >>> async_arr = await AsyncArray.open(store) # doctest: +ELLIPSIS """ @@ -1324,7 +1325,7 @@ async def getitem( Examples -------- >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') + >>> store = zarr.storage.MemoryStore() >>> async_arr = await zarr.api.asynchronous.create_array( ... store=store, ... 
shape=(100,100), @@ -3794,6 +3795,269 @@ class ShardsConfigParam(TypedDict): ShardsLike: TypeAlias = ChunkCoords | ShardsConfigParam | Literal["auto"] +async def from_array( + store: str | StoreLike, + *, + data: Array | npt.ArrayLike, + write_data: bool = True, + name: str | None = None, + chunks: Literal["auto", "keep"] | ChunkCoords = "keep", + shards: ShardsLike | None | Literal["keep"] = "keep", + filters: FiltersLike | Literal["keep"] = "keep", + compressors: CompressorsLike | Literal["keep"] = "keep", + serializer: SerializerLike | Literal["keep"] = "keep", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = None, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Create an array from an existing array or array-like. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file for the new array. + data : Array | array-like + The array to copy. + write_data : bool, default True + Whether to copy the data from the input array to the new array. + If ``write_data`` is ``False``, the new array will be created with the same metadata as the + input array, but without any data. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + chunks : ChunkCoords or "auto" or "keep", optional + Chunk shape of the array. + Following values are supported: + + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the chunk shape. + + If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". + shards : ChunkCoords, optional + Shard shape of the array. + Following values are supported: + + - "auto": Automatically determine the shard shape based on the array's shape and chunk shape. + - "keep": Retain the shard shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the shard shape. + - None: No sharding. + + If not specified, defaults to "keep" if data is a zarr Array, otherwise None. + filters : Iterable[Codec] or "auto" or "keep", optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that + the order of your filters is consistent with the behavior of each filter. + + Following values are supported: + + - Iterable[Codec]: List of filters to apply to the array. + - "auto": Automatically determine the filters based on the array's dtype. + - "keep": Retain the filters of the data array if it is a zarr Array. + + If no ``filters`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". + compressors : Iterable[Codec] or "auto" or "keep", optional + List of compressors to apply to the array. 
Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. + + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. + + Following values are supported: + + - Iterable[Codec]: List of compressors to apply to the array. + - "auto": Automatically determine the compressors based on the array's dtype. + - "keep": Retain the compressors of the input array if it is a zarr Array. + + If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". + serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional + Array-to-bytes codec to use for encoding the array data. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. + + Following values are supported: + + - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. + - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. + - "auto": a default serializer will be used. These defaults can be changed by modifying the value of + ``array.v3_default_serializer`` in :mod:`zarr.core.config`. + - "keep": Retain the serializer of the input array if it is a zarr Array. + + fill_value : Any, optional + Fill value for the array. + If not specified, defaults to the fill value of the data array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If not specified, defaults to the memory order of the data array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + If not specified, defaults to the zarr format of the data array. + attributes : dict, optional + Attributes for the array. + If not specified, defaults to the attributes of the data array. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. + If not specified and the data array has the same zarr format as the target array, + the chunk key encoding of the data array is used. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. + If not specified, defaults to the dimension names of the data array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. + + Returns + ------- + AsyncArray + The array. 
+ + Examples + -------- + Create an array from an existing Array:: + + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> store2 = zarr.storage.LocalStore('example.zarr') + >>> arr = zarr.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='int32', + >>> fill_value=0) + >>> arr2 = await zarr.api.asynchronous.from_array(store2, data=arr) + + + Create an array from an existing NumPy array:: + + >>> arr3 = await zarr.api.asynchronous.from_array( + >>> zarr.storage.MemoryStore(), + >>> data=np.arange(10000, dtype='i4').reshape(100, 100), + >>> ) + + + Create an array from any array-like object:: + + >>> arr4 = await zarr.api.asynchronous.from_array( + >>> zarr.storage.MemoryStore(), + >>> data=[[1, 2], [3, 4]], + >>> ) + + >>> await arr4.getitem(...) + array([[1, 2],[3, 4]]) + + Create an array from an existing Array without copying the data:: + + >>> arr5 = await zarr.api.asynchronous.from_array( + >>> zarr.storage.MemoryStore(), + >>> data=Array(arr4), + >>> write_data=False, + >>> ) + + >>> await arr5.getitem(...) + array([[0, 0],[0, 0]]) + """ + mode: Literal["a"] = "a" + config_parsed = parse_array_config(config) + store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) + + ( + chunks, + shards, + filters, + compressors, + serializer, + fill_value, + order, + zarr_format, + chunk_key_encoding, + dimension_names, + ) = _parse_keep_array_attr( + data=data, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + ) + if not hasattr(data, "dtype") or not hasattr(data, "shape"): + data = np.array(data) + + result = await init_array( + store_path=store_path, + shape=data.shape, + dtype=data.dtype, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + overwrite=overwrite, + config=config_parsed, + ) + + if write_data: + if isinstance(data, Array): + + async def _copy_array_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: + arr = await _data._async_array.getitem(chunk_coords) + await result.setitem(chunk_coords, arr) + + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in result._iter_chunk_regions()], + _copy_array_region, + zarr.core.config.config.get("async.concurrency"), + ) + else: + + async def _copy_arraylike_region(chunk_coords: slice, _data: NDArrayLike) -> None: + await result.setitem(chunk_coords, _data[chunk_coords]) + + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in result._iter_chunk_regions()], + _copy_arraylike_region, + zarr.core.config.config.get("async.concurrency"), + ) + return result + + async def init_array( *, store_path: StorePath, @@ -4137,38 +4401,138 @@ async def create_array( >>> fill_value=0) """ - mode: Literal["a"] = "a" - store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) - data_parsed, shape_parsed, dtype_parsed = _parse_data_params( data=data, shape=shape, dtype=dtype ) - result = await init_array( - store_path=store_path, - shape=shape_parsed, - dtype=dtype_parsed, - chunks=chunks, - 
shards=shards, - filters=filters, - compressors=compressors, - serializer=serializer, - fill_value=fill_value, - order=order, - zarr_format=zarr_format, - attributes=attributes, - chunk_key_encoding=chunk_key_encoding, - dimension_names=dimension_names, - overwrite=overwrite, - config=config, - ) + if data_parsed is not None: + return await from_array( + store, + data=data_parsed, + write_data=write_data, + name=name, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + storage_options=storage_options, + overwrite=overwrite, + config=config, + ) + else: + mode: Literal["a"] = "a" - if write_data is True and data_parsed is not None: - await result._set_selection( - BasicIndexer(..., shape=result.shape, chunk_grid=result.metadata.chunk_grid), - data_parsed, - prototype=default_buffer_prototype(), + store_path = await make_store_path( + store, path=name, mode=mode, storage_options=storage_options ) - return result + return await init_array( + store_path=store_path, + shape=shape_parsed, + dtype=dtype_parsed, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + overwrite=overwrite, + config=config, + ) + + +def _parse_keep_array_attr( + data: Array | npt.ArrayLike, + chunks: Literal["auto", "keep"] | ChunkCoords, + shards: ShardsLike | None | Literal["keep"], + filters: FiltersLike | Literal["keep"], + compressors: CompressorsLike | Literal["keep"], + serializer: SerializerLike | Literal["keep"], + fill_value: Any | None, + order: MemoryOrder | None, + zarr_format: ZarrFormat | None, + chunk_key_encoding: ChunkKeyEncodingLike | None, + dimension_names: Iterable[str] | None, +) -> tuple[ + ChunkCoords | Literal["auto"], + ShardsLike | None, + FiltersLike, + CompressorsLike, + SerializerLike, + Any | None, + MemoryOrder | None, + ZarrFormat, + ChunkKeyEncodingLike | None, + Iterable[str] | None, +]: + if isinstance(data, Array): + if chunks == "keep": + chunks = data.chunks + if shards == "keep": + shards = data.shards + if zarr_format is None: + zarr_format = data.metadata.zarr_format + if filters == "keep": + if zarr_format == data.metadata.zarr_format: + filters = data.filters or None + else: + filters = "auto" + if compressors == "keep": + if zarr_format == data.metadata.zarr_format: + compressors = data.compressors or None + else: + compressors = "auto" + if serializer == "keep": + if zarr_format == 3 and data.metadata.zarr_format == 3: + serializer = cast(SerializerLike, data.serializer) + else: + serializer = "auto" + if fill_value is None: + fill_value = data.fill_value + if order is None: + order = data.order + if chunk_key_encoding is None and zarr_format == data.metadata.zarr_format: + if isinstance(data.metadata, ArrayV2Metadata): + chunk_key_encoding = {"name": "v2", "separator": data.metadata.dimension_separator} + elif isinstance(data.metadata, ArrayV3Metadata): + chunk_key_encoding = data.metadata.chunk_key_encoding + if dimension_names is None and data.metadata.zarr_format == 3: + dimension_names = data.metadata.dimension_names + else: + if chunks == "keep": + chunks = "auto" + if shards == "keep": + shards = None + if zarr_format is None: + 
zarr_format = 3 + if filters == "keep": + filters = "auto" + if compressors == "keep": + compressors = "auto" + if serializer == "keep": + serializer = "auto" + return ( + chunks, + shards, + filters, + compressors, + serializer, + fill_value, + order, + zarr_format, + chunk_key_encoding, + dimension_names, + ) def _parse_chunk_key_encoding( diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 8e7f7f3474..c75227650c 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -68,7 +68,7 @@ from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike + from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike from zarr.core.common import MemoryOrder logger = logging.getLogger("zarr.group") @@ -998,7 +998,7 @@ async def create_array( fill_value: Any | None = 0, order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, @@ -2369,7 +2369,7 @@ def create_array( fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, @@ -2763,7 +2763,7 @@ def array( fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + chunk_key_encoding: ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, diff --git a/tests/conftest.py b/tests/conftest.py index 04034cb5b8..74a140c5c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,6 +89,14 @@ async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: return await parse_store(param, str(tmpdir)) +@pytest.fixture +async def store2(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: + """Fixture to create a second store for testing copy operations between stores""" + param = request.param + store2_path = tmpdir.mkdir("store2") + return await parse_store(param, str(store2_path)) + + @pytest.fixture(params=["local", "memory", "zip"]) def sync_store(request: pytest.FixtureRequest, tmp_path: LEGACY_PATH) -> Store: result = sync(parse_store(request.param, str(tmp_path))) diff --git a/tests/test_array.py b/tests/test_array.py index efcf8a6bf9..f71d3e0845 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -11,6 +11,7 @@ import numcodecs import numpy as np +import numpy.typing as npt import pytest from packaging.version import Version @@ -1326,6 +1327,125 @@ async def test_scalar_array() -> None: assert arr.shape == () +@pytest.mark.parametrize("store", ["local"], indirect=True) +@pytest.mark.parametrize("store2", ["local"], indirect=["store2"]) +@pytest.mark.parametrize("src_format", [2, 3]) +@pytest.mark.parametrize("new_format", [2, 3, None]) +async def test_creation_from_other_zarr_format( + store: Store, + store2: Store, + src_format: ZarrFormat, + new_format: ZarrFormat | None, 
+) -> None: + if src_format == 2: + src = zarr.create( + (50, 50), chunks=(10, 10), store=store, zarr_format=src_format, dimension_separator="/" + ) + else: + src = zarr.create( + (50, 50), + chunks=(10, 10), + store=store, + zarr_format=src_format, + chunk_key_encoding=("default", "."), + ) + + src[:] = np.arange(50 * 50).reshape((50, 50)) + result = zarr.from_array( + store=store2, + data=src, + zarr_format=new_format, + ) + np.testing.assert_array_equal(result[:], src[:]) + assert result.fill_value == src.fill_value + assert result.dtype == src.dtype + assert result.chunks == src.chunks + expected_format = src_format if new_format is None else new_format + assert result.metadata.zarr_format == expected_format + if src_format == new_format: + assert result.metadata == src.metadata + + result2 = zarr.array( + data=src, + store=store2, + overwrite=True, + zarr_format=new_format, + ) + np.testing.assert_array_equal(result2[:], src[:]) + + +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) +@pytest.mark.parametrize("store2", ["local", "memory", "zip"], indirect=["store2"]) +@pytest.mark.parametrize("src_chunks", [(40, 10), (11, 50)]) +@pytest.mark.parametrize("new_chunks", [(40, 10), (11, 50)]) +async def test_from_array( + store: Store, + store2: Store, + src_chunks: tuple[int, int], + new_chunks: tuple[int, int], + zarr_format: ZarrFormat, +) -> None: + src_fill_value = 2 + src_dtype = np.dtype("uint8") + src_attributes = None + + src = zarr.create( + (100, 10), + chunks=src_chunks, + dtype=src_dtype, + store=store, + fill_value=src_fill_value, + attributes=src_attributes, + ) + src[:] = np.arange(1000).reshape((100, 10)) + + new_fill_value = 3 + new_attributes: dict[str, JSON] = {"foo": "bar"} + + result = zarr.from_array( + data=src, + store=store2, + chunks=new_chunks, + fill_value=new_fill_value, + attributes=new_attributes, + ) + + np.testing.assert_array_equal(result[:], src[:]) + assert result.fill_value == new_fill_value + assert result.dtype == src_dtype + assert result.attrs == new_attributes + assert result.chunks == new_chunks + + +@pytest.mark.parametrize("store", ["local"], indirect=True) +@pytest.mark.parametrize("chunks", ["keep", "auto"]) +@pytest.mark.parametrize("write_data", [True, False]) +@pytest.mark.parametrize( + "src", + [ + np.arange(1000).reshape(10, 10, 10), + zarr.ones((10, 10, 10)), + 5, + [1, 2, 3], + [[1, 2, 3], [4, 5, 6]], + ], +) # add other npt.ArrayLike? +async def test_from_array_arraylike( + store: Store, + chunks: Literal["auto", "keep"] | tuple[int, int], + write_data: bool, + src: Array | npt.ArrayLike, +) -> None: + fill_value = 42 + result = zarr.from_array( + store, data=src, chunks=chunks, write_data=write_data, fill_value=fill_value + ) + if write_data: + np.testing.assert_array_equal(result[...], np.array(src)) + else: + np.testing.assert_array_equal(result[...], np.full_like(src, fill_value)) + + async def test_orthogonal_set_total_slice() -> None: """Ensure that a whole chunk overwrite does not read chunks""" store = MemoryStore() diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 77363acff3..b1707c88a3 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1902,7 +1902,7 @@ def test_iter_grid( """ Test that iter_grid works as expected for 1, 2, and 3 dimensions. 
""" - grid_shape = (5,) * ndim + grid_shape = (10, 5, 7)[:ndim] if origin_0d is not None: origin_kwarg = origin_0d * ndim @@ -1984,3 +1984,13 @@ def test_vectorized_indexing_incompatible_shape(store) -> None: ) with pytest.raises(ValueError, match="Attempting to set"): arr[np.array([1, 2]), np.array([1, 2])] = np.array([[-1, -2], [-3, -4]]) + + +def test_iter_chunk_regions(): + chunks = (2, 3) + a = zarr.create((10, 10), chunks=chunks) + a[:] = 1 + for region in a._iter_chunk_regions(): + assert_array_equal(a[region], np.ones_like(a[region])) + a[region] = 0 + assert_array_equal(a[region], np.zeros_like(a[region]))