Skip to content

api: make buffer public api #2664

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/user-guide/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ This is the current default configuration::
'string': {'name': 'vlen-utf8'}},
'write_empty_chunks': False},
'async': {'concurrency': 10, 'timeout': None},
'buffer': 'zarr.core.buffer.cpu.Buffer',
'buffer': 'zarr.buffer.cpu.Buffer',
'codec_pipeline': {'batch_size': 1,
'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'},
'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec',
Expand All @@ -85,5 +85,5 @@ This is the current default configuration::
'zstd': 'zarr.codecs.zstd.ZstdCodec'},
'default_zarr_format': 3,
'json_indent': 2,
'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer',
'ndbuffer': 'zarr.buffer.cpu.NDBuffer',
'threading': {'max_workers': None}}
43 changes: 19 additions & 24 deletions src/zarr/core/buffer/core.py → src/zarr/abc/buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,23 @@
from zarr.codecs.bytes import Endian
from zarr.core.common import BytesLike, ChunkCoords

# Everything here is imported into ``zarr.core.buffer`` namespace.
__all__: list[str] = []
__all__ = [
"ArrayLike",
"Buffer",
"BufferPrototype",
"NDArrayLike",
"NDBuffer",
]


def _check_item_key_is_1d_contiguous(key: Any) -> None:
    """Validate that ``key`` is a slice describing a contiguous 1-d selection.

    Parameters
    ----------
    key
        The indexing key to validate.

    Raises
    ------
    TypeError
        If ``key`` is not a ``slice`` instance.
    ValueError
        If the slice step is neither ``None`` nor equal to ``1``.
    """
    if not isinstance(key, slice):
        key_type_name = key.__class__.__name__
        raise TypeError(
            f"Item key has incorrect type (expected slice, got {key_type_name})"
        )

    step = key.step
    # An omitted step or a step of one selects adjacent elements only.
    if step is None or step == 1:
        return
    raise ValueError("slice must be contiguous")


@runtime_checkable
Expand Down Expand Up @@ -105,16 +120,6 @@ def __eq__(self, other: object) -> Self: # type: ignore[explicit-override, over
"""


def check_item_key_is_1d_contiguous(key: Any) -> None:
    """Check that ``key`` is a contiguous 1-d slice, raising otherwise.

    Parameters
    ----------
    key
        The indexing key to check.

    Raises
    ------
    TypeError
        If ``key`` is not a ``slice``.
    ValueError
        If ``key.step`` is set to something other than ``1``.
    """
    if isinstance(key, slice):
        # Contiguity only requires the step to be omitted or exactly one.
        if not (key.step is None or key.step == 1):
            raise ValueError("slice must be contiguous")
        return
    raise TypeError(
        f"Item key has incorrect type (expected slice, got {key.__class__.__name__})"
    )


class Buffer(ABC):
"""A flat contiguous memory block

Expand Down Expand Up @@ -266,11 +271,11 @@ def to_bytes(self) -> bytes:
return bytes(self.as_numpy_array())

def __getitem__(self, key: slice) -> Self:
check_item_key_is_1d_contiguous(key)
_check_item_key_is_1d_contiguous(key)
return self.__class__(self._data.__getitem__(key))

def __setitem__(self, key: slice, value: Any) -> None:
check_item_key_is_1d_contiguous(key)
_check_item_key_is_1d_contiguous(key)
self._data.__setitem__(key, value)

def __len__(self) -> int:
Expand Down Expand Up @@ -498,13 +503,3 @@ class BufferPrototype(NamedTuple):

buffer: type[Buffer]
nd_buffer: type[NDBuffer]


# The default buffer prototype used throughout the Zarr codebase.
def default_buffer_prototype() -> BufferPrototype:
from zarr.registry import (
get_buffer_class,
get_ndbuffer_class,
)

return BufferPrototype(buffer=get_buffer_class(), nd_buffer=get_ndbuffer_class())
2 changes: 1 addition & 1 deletion src/zarr/abc/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, Generic, TypeVar

from zarr.abc.buffer import Buffer, NDBuffer
from zarr.abc.metadata import Metadata
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import ChunkCoords, concurrent_map
from zarr.core.config import config

Expand Down
4 changes: 2 additions & 2 deletions src/zarr/abc/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from itertools import starmap
from typing import TYPE_CHECKING, Protocol, runtime_checkable

from zarr.core.buffer.core import default_buffer_prototype
from zarr.buffer import default_buffer_prototype
from zarr.core.common import concurrent_map
from zarr.core.config import config

Expand All @@ -14,7 +14,7 @@
from types import TracebackType
from typing import Any, Self, TypeAlias

from zarr.core.buffer import Buffer, BufferPrototype
from zarr.abc.buffer import Buffer, BufferPrototype
from zarr.core.common import BytesLike

__all__ = ["ByteGetter", "ByteSetter", "Store", "set_or_delete"]
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
import numpy.typing as npt
from typing_extensions import deprecated

from zarr.abc.buffer import NDArrayLike
from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike
from zarr.core.buffer import NDArrayLike
from zarr.core.common import (
JSON,
AccessModeLiteral,
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import numpy.typing as npt

from zarr.abc.buffer import NDArrayLike
from zarr.abc.codec import Codec
from zarr.api.asynchronous import ArrayLike, PathLike
from zarr.core.array import (
Expand All @@ -25,7 +26,6 @@
ShardsLike,
)
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike
from zarr.core.buffer import NDArrayLike
from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike
from zarr.core.common import (
JSON,
Expand Down
17 changes: 17 additions & 0 deletions src/zarr/buffer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from zarr.abc.buffer import BufferPrototype
from zarr.buffer.cpu import numpy_buffer_prototype

__all__ = [
"default_buffer_prototype",
"numpy_buffer_prototype",
]


def default_buffer_prototype() -> BufferPrototype:
    """Return the default buffer prototype used throughout the Zarr codebase.

    The ``Buffer`` and ``NDBuffer`` classes are looked up through
    ``zarr.registry`` each time this function is called.

    Returns
    -------
    A ``BufferPrototype`` pairing the looked-up buffer and nd-buffer classes.
    """
    # Imported at call time rather than at module import time.
    # NOTE(review): presumably this avoids a circular import with
    # ``zarr.registry`` — confirm.
    from zarr.registry import get_buffer_class, get_ndbuffer_class

    buffer_cls = get_buffer_class()
    nd_buffer_cls = get_ndbuffer_class()
    return BufferPrototype(buffer=buffer_cls, nd_buffer=nd_buffer_cls)
229 changes: 229 additions & 0 deletions src/zarr/buffer/cpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
from __future__ import annotations

from typing import (
TYPE_CHECKING,
Any,
Literal,
)

import numpy as np
import numpy.typing as npt

import zarr.abc.buffer
from zarr.registry import (
register_buffer,
register_ndbuffer,
)

if TYPE_CHECKING:
from collections.abc import Callable, Iterable
from typing import Self

from zarr.abc.buffer import ArrayLike, NDArrayLike
from zarr.core.common import BytesLike


class Buffer(zarr.abc.buffer.Buffer):
    """A flat contiguous memory block

    We use Buffer throughout Zarr to represent a contiguous block of memory.

    A Buffer is backed by an underlying array-like instance that represents
    the memory. The memory type is unspecified; it can be regular host memory,
    CUDA device memory, or something else. The only requirement is that the
    array-like instance can be copied/converted to a regular NumPy array
    (host memory).

    Notes
    -----
    This buffer is untyped, so all indexing and sizes are in bytes.

    Parameters
    ----------
    array_like
        array-like object that must be 1-dim, contiguous, and byte dtype.
    """

    def __init__(self, array_like: ArrayLike) -> None:
        super().__init__(array_like)

    @classmethod
    def create_zero_length(cls) -> Self:
        """Create a buffer wrapping an empty NumPy array of dtype ``"b"``."""
        empty_bytes = np.array([], dtype="b")
        return cls(empty_bytes)

    @classmethod
    def from_buffer(cls, buffer: zarr.abc.buffer.Buffer) -> Self:
        """Create a new buffer from an existing Buffer

        This is useful if you want to ensure that an existing buffer is
        of the correct subclass of Buffer. E.g., MemoryStore uses this
        to return a buffer instance of the subclass specified by its
        BufferPrototype argument.

        Typically, this only copies data if the data has to be moved between
        memory types, such as from host to device memory.

        Parameters
        ----------
        buffer
            buffer object.

        Returns
        -------
        A new buffer representing the content of the input buffer

        Notes
        -----
        Subclasses of `Buffer` must override this method to implement
        more optimal conversions that avoid copies where possible
        """
        # Go through the NumPy view of the source buffer.
        source_array = buffer.as_numpy_array()
        return cls.from_array_like(source_array)

    @classmethod
    def from_bytes(cls, bytes_like: BytesLike) -> Self:
        """Create a new buffer from a bytes-like object (host memory)

        Parameters
        ----------
        bytes_like
            bytes-like object

        Returns
        -------
        New buffer representing `bytes_like`
        """
        byte_view = np.frombuffer(bytes_like, dtype="b")
        return cls.from_array_like(byte_view)

    def as_numpy_array(self) -> npt.NDArray[Any]:
        """Return the buffer as a NumPy array (host memory).

        Notes
        -----
        Might have to copy data, consider using `.as_array_like()` instead.

        Returns
        -------
        NumPy array of this buffer (might be a data copy)
        """
        host_array = np.asanyarray(self._data)
        return host_array

    def __add__(self, other: zarr.abc.buffer.Buffer) -> Self:
        """Concatenate two buffers"""
        rhs = other.as_array_like()
        # Both operands are expected to hold raw bytes (dtype "b").
        assert rhs.dtype == np.dtype("b")
        lhs_array = np.asanyarray(self._data)
        rhs_array = np.asanyarray(rhs)
        joined = np.concatenate((lhs_array, rhs_array))
        return self.__class__(joined)


class NDBuffer(zarr.abc.buffer.NDBuffer):
    """An n-dimensional memory block

    We use NDBuffer throughout Zarr to represent an n-dimensional memory block.

    An NDBuffer is backed by an underlying ndarray-like instance that represents
    the memory. The memory type is unspecified; it can be regular host memory,
    CUDA device memory, or something else. The only requirement is that the
    ndarray-like instance can be copied/converted to a regular NumPy array
    (host memory).

    Notes
    -----
    The two buffer classes Buffer and NDBuffer are very similar. In fact, Buffer
    is a special case of NDBuffer where dim=1, stride=1, and dtype="b". However,
    in order to use Python's type system to differentiate between the contiguous
    Buffer and the n-dim (non-contiguous) NDBuffer, we keep the definition of the
    two classes separate.

    Parameters
    ----------
    array
        ndarray-like object that is convertible to a regular NumPy array.
    """

    def __init__(self, array: NDArrayLike) -> None:
        super().__init__(array)

    @classmethod
    def create(
        cls,
        *,
        shape: Iterable[int],
        dtype: npt.DTypeLike,
        order: Literal["C", "F"] = "C",
        fill_value: Any | None = None,
    ) -> Self:
        """Allocate a new NumPy-backed buffer.

        Parameters
        ----------
        shape
            Shape of the new buffer.
        dtype
            Data type of the new buffer.
        order
            Memory layout passed to ``np.empty``.
        fill_value
            If not ``None``, the buffer is filled with this value; otherwise
            the contents are left as allocated by ``np.empty``.

        Returns
        -------
        The newly created buffer.
        """
        backing = np.empty(shape=tuple(shape), dtype=dtype, order=order)
        new_buffer = cls(backing)
        if fill_value is None:
            return new_buffer
        new_buffer.fill(fill_value)
        return new_buffer

    @classmethod
    def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
        """Create a buffer from anything ``np.asanyarray`` can convert."""
        as_array = np.asanyarray(array_like)
        return cls.from_ndarray_like(as_array)

    def as_numpy_array(self) -> npt.NDArray[Any]:
        """Return the buffer as a NumPy array (host memory).

        Warnings
        --------
        Might have to copy data, consider using `.as_ndarray_like()` instead.

        Returns
        -------
        NumPy array of this buffer (might be a data copy)
        """
        host_array = np.asanyarray(self._data)
        return host_array

    def __getitem__(self, key: Any) -> Self:
        """Index the underlying data and wrap the result in the same class."""
        selection = self._data.__getitem__(key)
        return self.__class__(np.asanyarray(selection))

    def __setitem__(self, key: Any, value: Any) -> None:
        """Assign ``value`` at ``key``, unwrapping ``NDBuffer`` values first."""
        payload = value._data if isinstance(value, NDBuffer) else value
        self._data.__setitem__(key, payload)


def as_numpy_array_wrapper(
    func: Callable[[npt.NDArray[Any]], bytes],
    buf: zarr.abc.buffer.Buffer,
    prototype: zarr.abc.buffer.BufferPrototype,
) -> zarr.abc.buffer.Buffer:
    """Call `func` on a NumPy view of `buf` and wrap the result as a `Buffer`.

    This function is useful when calling a `func` that only supports host
    memory, such as `GZip.decode` and `Blosc.decode`. In this case, use this
    wrapper to convert the input `buf` to a NumPy array and convert the result
    back into a `Buffer`.

    Parameters
    ----------
    func
        The callable that will be called with the converted `buf` as input.
        `func` must return bytes, which will be converted into a `Buffer`
        before being returned.
    buf
        The buffer that will be converted to a NumPy array before being given
        as input to `func`.
    prototype
        The prototype of the output buffer.

    Returns
    -------
    The result of `func` converted to a `Buffer`
    """
    host_array = buf.as_numpy_array()
    raw_result = func(host_array)
    output_cls = prototype.buffer
    return output_cls.from_bytes(raw_result)


# CPU buffer prototype: pairs the NumPy-backed ``Buffer`` and ``NDBuffer``
# classes defined in this module.
# NOTE: the registry-driven default lives in ``zarr.buffer.default_buffer_prototype``.
buffer_prototype = zarr.abc.buffer.BufferPrototype(
    buffer=Buffer,
    nd_buffer=NDBuffer,
)


def numpy_buffer_prototype() -> zarr.abc.buffer.BufferPrototype:
    """Return a prototype built from this module's NumPy-backed classes.

    Used, for example, when reading the shard index.
    """
    prototype_cls = zarr.abc.buffer.BufferPrototype
    return prototype_cls(buffer=Buffer, nd_buffer=NDBuffer)


# Register this module's CPU classes with ``zarr.registry`` at import time.
# NOTE(review): presumably this is what makes them available through
# ``get_buffer_class`` / ``get_ndbuffer_class`` — confirm against zarr.registry.
register_buffer(Buffer)
register_ndbuffer(NDBuffer)
Loading
Loading