Skip to content

Commit 018f61d

Browse files
brokkoli71normanrz
andauthored
zarr.array from an existing zarr.Array (#2622)
* add creation from other zarr * remove duplicated tests * improve test * test_iter_grid for non-squares * concurrent streaming for equal chunk sizes * fix merge * fix mypy * fix mypy * fix test_iter_grid * extract to zarr.from_array * fix mypy * fix mypy * format * fix test_creation_from_other_zarr_format * distinguish between keep and auto for from_array arguments * partition concurrency along new_array chunks * fix mypy * improve test_creation_from_other_zarr_format * add typing in test * Update src/zarr/core/array.py Co-authored-by: Norman Rzepka <[email protected]> * add from_array with npt.ArrayLike * add write_data argument * improve tests * improve docstrings and add examples * fix mypy and readthedocs * fix mypy and readthedocs * fix mypy and readthedocs * fix mypy and readthedocs * fix readthedocs ERROR: Unexpected indentation * add release notes * format docstring examples * add write_data attr to synchronous.create_array * `create_array` calls `from_array` calls `init_array` * document changes * fix serializer from_array v2 to v3 * fix mypy * improve codecov * fix mypy * from_array: copy zarr format on default * in ``from_array`` make all arguments except ``store`` keyword-only, to match ``create_array`` * in ``from_array`` default shards="keep" * redundant ``ChunkKeyEncoding | ChunkKeyEncodingLike`` * fix argument order in calls of `from_array` * fix numpydoc-validation * add docstring to store2 pytest fixture * extract `_parse_keep_array_attr` from `from_array` * extract `_parse_keep_array_attr` from `from_array` * correct _parse_keep_array_attr * fix merge * fix merge --------- Co-authored-by: Norman Rzepka <[email protected]>
1 parent 06f7796 commit 018f61d

File tree

10 files changed

+772
-40
lines changed

10 files changed

+772
-40
lines changed

changes/2622.feature.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add ``zarr.from_array`` using concurrent streaming of source data

docs/release-notes.rst

+2
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ Other
145145
3.0.1 (Jan. 17, 2025)
146146
---------------------
147147

148+
* Implement ``zarr.from_array`` using concurrent streaming (:issue:`2622`).
149+
148150
Bug fixes
149151
~~~~~~~~~
150152
* Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`).

src/zarr/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
create_hierarchy,
1212
empty,
1313
empty_like,
14+
from_array,
1415
full,
1516
full_like,
1617
group,
@@ -54,6 +55,7 @@
5455
"create_hierarchy",
5556
"empty",
5657
"empty_like",
58+
"from_array",
5759
"full",
5860
"full_like",
5961
"group",

src/zarr/api/asynchronous.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import numpy.typing as npt
1010
from typing_extensions import deprecated
1111

12-
from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata
12+
from zarr.core.array import Array, AsyncArray, create_array, from_array, get_array_metadata
1313
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArrayConfigParams
1414
from zarr.core.buffer import NDArrayLike
1515
from zarr.core.common import (
@@ -57,6 +57,7 @@
5757
"create_hierarchy",
5858
"empty",
5959
"empty_like",
60+
"from_array",
6061
"full",
6162
"full_like",
6263
"group",
@@ -534,7 +535,7 @@ async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None =
534535

535536

536537
async def array(
537-
data: npt.ArrayLike, **kwargs: Any
538+
data: npt.ArrayLike | Array, **kwargs: Any
538539
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
539540
"""Create an array filled with `data`.
540541
@@ -551,13 +552,16 @@ async def array(
551552
The new array.
552553
"""
553554

555+
if isinstance(data, Array):
556+
return await from_array(data=data, **kwargs)
557+
554558
# ensure data is array-like
555559
if not hasattr(data, "shape") or not hasattr(data, "dtype"):
556560
data = np.asanyarray(data)
557561

558562
# setup dtype
559563
kw_dtype = kwargs.get("dtype")
560-
if kw_dtype is None:
564+
if kw_dtype is None and hasattr(data, "dtype"):
561565
kwargs["dtype"] = data.dtype
562566
else:
563567
kwargs["dtype"] = kw_dtype

src/zarr/api/synchronous.py

+224-3
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
"create_hierarchy",
5151
"empty",
5252
"empty_like",
53+
"from_array",
5354
"full",
5455
"full_like",
5556
"group",
@@ -359,7 +360,7 @@ def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> An
359360

360361

361362
# TODO: add type annotations for kwargs
362-
def array(data: npt.ArrayLike, **kwargs: Any) -> Array:
363+
def array(data: npt.ArrayLike | Array, **kwargs: Any) -> Array:
363364
"""Create an array filled with `data`.
364365
365366
Parameters
@@ -759,11 +760,12 @@ def create_array(
759760
order: MemoryOrder | None = None,
760761
zarr_format: ZarrFormat | None = 3,
761762
attributes: dict[str, JSON] | None = None,
762-
chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None,
763+
chunk_key_encoding: ChunkKeyEncodingLike | None = None,
763764
dimension_names: Iterable[str] | None = None,
764765
storage_options: dict[str, Any] | None = None,
765766
overwrite: bool = False,
766767
config: ArrayConfigLike | None = None,
768+
write_data: bool = True,
767769
) -> Array:
768770
"""Create an array.
769771
@@ -857,6 +859,11 @@ def create_array(
857859
Whether to overwrite an array with the same name in the store, if one exists.
858860
config : ArrayConfigLike, optional
859861
Runtime configuration for the array.
862+
write_data : bool
863+
If a pre-existing array-like object was provided to this function via the ``data`` parameter
864+
then ``write_data`` determines whether the values in that array-like object should be
865+
written to the Zarr array created by this function. If ``write_data`` is ``False``, then the
866+
array will be left empty.
860867
861868
Returns
862869
-------
@@ -866,7 +873,7 @@ def create_array(
866873
Examples
867874
--------
868875
>>> import zarr
869-
>>> store = zarr.storage.MemoryStore(mode='w')
876+
>>> store = zarr.storage.MemoryStore()
870877
>>> arr = zarr.create_array(
871878
>>> store=store,
872879
>>> shape=(100,100),
@@ -897,6 +904,220 @@ def create_array(
897904
storage_options=storage_options,
898905
overwrite=overwrite,
899906
config=config,
907+
write_data=write_data,
908+
)
909+
)
910+
)
911+
912+
913+
def from_array(
914+
store: str | StoreLike,
915+
*,
916+
data: Array | npt.ArrayLike,
917+
write_data: bool = True,
918+
name: str | None = None,
919+
chunks: Literal["auto", "keep"] | ChunkCoords = "keep",
920+
shards: ShardsLike | None | Literal["keep"] = "keep",
921+
filters: FiltersLike | Literal["keep"] = "keep",
922+
compressors: CompressorsLike | Literal["keep"] = "keep",
923+
serializer: SerializerLike | Literal["keep"] = "keep",
924+
fill_value: Any | None = None,
925+
order: MemoryOrder | None = None,
926+
zarr_format: ZarrFormat | None = None,
927+
attributes: dict[str, JSON] | None = None,
928+
chunk_key_encoding: ChunkKeyEncodingLike | None = None,
929+
dimension_names: Iterable[str] | None = None,
930+
storage_options: dict[str, Any] | None = None,
931+
overwrite: bool = False,
932+
config: ArrayConfigLike | None = None,
933+
) -> Array:
934+
"""Create an array from an existing array or array-like.
935+
936+
Parameters
937+
----------
938+
store : str or Store
939+
Store or path to directory in file system or name of zip file for the new array.
940+
data : Array | array-like
941+
The array to copy.
942+
write_data : bool, default True
943+
Whether to copy the data from the input array to the new array.
944+
If ``write_data`` is ``False``, the new array will be created with the same metadata as the
945+
input array, but without any data.
946+
name : str or None, optional
947+
The name of the array within the store. If ``name`` is ``None``, the array will be located
948+
at the root of the store.
949+
chunks : ChunkCoords or "auto" or "keep", optional
950+
Chunk shape of the array.
951+
Following values are supported:
952+
953+
- "auto": Automatically determine the chunk shape based on the array's shape and dtype.
954+
- "keep": Retain the chunk shape of the data array if it is a zarr Array.
955+
- ChunkCoords: A tuple of integers representing the chunk shape.
956+
957+
If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto".
958+
shards : ChunkCoords or "auto" or "keep" or None, optional
959+
Shard shape of the array.
960+
Following values are supported:
961+
962+
- "auto": Automatically determine the shard shape based on the array's shape and chunk shape.
963+
- "keep": Retain the shard shape of the data array if it is a zarr Array.
964+
- ChunkCoords: A tuple of integers representing the shard shape.
965+
- None: No sharding.
966+
967+
If not specified, defaults to "keep" if data is a zarr Array, otherwise None.
968+
filters : Iterable[Codec] or "auto" or "keep", optional
969+
Iterable of filters to apply to each chunk of the array, in order, before serializing that
970+
chunk to bytes.
971+
972+
For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
973+
and these values must be instances of ``ArrayArrayCodec``, or dict representations
974+
of ``ArrayArrayCodec``.
975+
976+
For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the
977+
order of your filters is consistent with the behavior of each filter.
978+
979+
Following values are supported:
980+
981+
- Iterable[Codec]: List of filters to apply to the array.
982+
- "auto": Automatically determine the filters based on the array's dtype.
983+
- "keep": Retain the filters of the data array if it is a zarr Array.
984+
985+
If no ``filters`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto".
986+
compressors : Iterable[Codec] or "auto" or "keep", optional
987+
List of compressors to apply to the array. Compressors are applied in order, and after any
988+
filters are applied (if any are specified) and the data is serialized into bytes.
989+
990+
For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
991+
returns another bytestream. Multiple compressors may be provided for Zarr format 3.
992+
993+
For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may
994+
be provided for Zarr format 2.
995+
996+
Following values are supported:
997+
998+
- Iterable[Codec]: List of compressors to apply to the array.
999+
- "auto": Automatically determine the compressors based on the array's dtype.
1000+
- "keep": Retain the compressors of the input array if it is a zarr Array.
1001+
1002+
If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto".
1003+
serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional
1004+
Array-to-bytes codec to use for encoding the array data.
1005+
Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
1006+
1007+
Following values are supported:
1008+
1009+
- dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``.
1010+
- ArrayBytesCodec: An instance of ``ArrayBytesCodec``.
1011+
- "auto": a default serializer will be used. These defaults can be changed by modifying the value of
1012+
``array.v3_default_serializer`` in :mod:`zarr.core.config`.
1013+
- "keep": Retain the serializer of the input array if it is a zarr Array.
1014+
1015+
fill_value : Any, optional
1016+
Fill value for the array.
1017+
If not specified, defaults to the fill value of the data array.
1018+
order : {"C", "F"}, optional
1019+
The memory order of the array (default is "C").
1020+
For Zarr format 2, this parameter sets the memory order of the array.
1021+
For Zarr format 3, this parameter is deprecated, because memory order
1022+
is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory
1023+
order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
1024+
If not specified, defaults to the memory order of the data array.
1025+
zarr_format : {2, 3}, optional
1026+
The zarr format to use when saving.
1027+
If not specified, defaults to the zarr format of the data array.
1028+
attributes : dict, optional
1029+
Attributes for the array.
1030+
If not specified, defaults to the attributes of the data array.
1031+
chunk_key_encoding : ChunkKeyEncoding, optional
1032+
A specification of how the chunk keys are represented in storage.
1033+
For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``.
1034+
For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``.
1035+
If not specified and the data array has the same zarr format as the target array,
1036+
the chunk key encoding of the data array is used.
1037+
dimension_names : Iterable[str], optional
1038+
The names of the dimensions (default is None).
1039+
Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
1040+
If not specified, defaults to the dimension names of the data array.
1041+
storage_options : dict, optional
1042+
If using an fsspec URL to create the store, these will be passed to the backend implementation.
1043+
Ignored otherwise.
1044+
overwrite : bool, default False
1045+
Whether to overwrite an array with the same name in the store, if one exists.
1046+
config : ArrayConfig or ArrayConfigLike, optional
1047+
Runtime configuration for the array.
1048+
1049+
Returns
1050+
-------
1051+
Array
1052+
The array.
1053+
1054+
Examples
1055+
--------
1056+
Create an array from an existing Array::
1057+
1058+
>>> import zarr
1059+
>>> store = zarr.storage.MemoryStore()
1060+
>>> store2 = zarr.storage.LocalStore('example.zarr')
1061+
>>> arr = zarr.create_array(
1062+
>>> store=store,
1063+
>>> shape=(100,100),
1064+
>>> chunks=(10,10),
1065+
>>> dtype='int32',
1066+
>>> fill_value=0)
1067+
>>> arr2 = zarr.from_array(store2, data=arr)
1068+
<Array file://example.zarr shape=(100, 100) dtype=int32>
1069+
1070+
Create an array from an existing NumPy array::
1071+
1072+
>>> import numpy as np
1073+
>>> arr3 = zarr.from_array(
1074+
>>> zarr.storage.MemoryStore(),
1075+
>>> data=np.arange(10000, dtype='i4').reshape(100, 100),
1076+
>>> )
1077+
<Array memory://125477403529984 shape=(100, 100) dtype=int32>
1078+
1079+
Create an array from any array-like object::
1080+
1081+
>>> arr4 = zarr.from_array(
1082+
>>> zarr.storage.MemoryStore(),
1083+
>>> data=[[1, 2], [3, 4]],
1084+
>>> )
1085+
<Array memory://125477392154368 shape=(2, 2) dtype=int64>
1086+
>>> arr4[...]
1087+
array([[1, 2],[3, 4]])
1088+
1089+
Create an array from an existing Array without copying the data::
1090+
1091+
>>> arr5 = zarr.from_array(
1092+
>>> zarr.storage.MemoryStore(),
1093+
>>> data=arr4,
1094+
>>> write_data=False,
1095+
>>> )
1096+
<Array memory://140678602965568 shape=(2, 2) dtype=int64>
1097+
>>> arr5[...]
1098+
array([[0, 0],[0, 0]])
1099+
"""
1100+
return Array(
1101+
sync(
1102+
zarr.core.array.from_array(
1103+
store,
1104+
data=data,
1105+
write_data=write_data,
1106+
name=name,
1107+
chunks=chunks,
1108+
shards=shards,
1109+
filters=filters,
1110+
compressors=compressors,
1111+
serializer=serializer,
1112+
fill_value=fill_value,
1113+
order=order,
1114+
zarr_format=zarr_format,
1115+
attributes=attributes,
1116+
chunk_key_encoding=chunk_key_encoding,
1117+
dimension_names=dimension_names,
1118+
storage_options=storage_options,
1119+
overwrite=overwrite,
1120+
config=config,
9001121
)
9011122
)
9021123
)

0 commit comments

Comments
 (0)