Skip to content

add numcodec protocol #3318

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 42 commits into from
Aug 13, 2025
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
a367268
add numcodec protocol
d-v-b Jul 31, 2025
1d424c0
add tests for numcodecs compatibility
d-v-b Jul 31, 2025
41dd6ff
changelog
d-v-b Jul 31, 2025
c435a59
ignore unknown key
d-v-b Jul 31, 2025
8e50ef8
remove re-implementation of get_codec
d-v-b Aug 1, 2025
ef31c5b
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 1, 2025
4ba7914
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
ab52539
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
95c9c8b
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
fcf84b3
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 4, 2025
5b0c3ac
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 5, 2025
84c9780
avoid circular imports by importing lower-level routines exactly wher…
d-v-b Aug 5, 2025
9a2f35b
push numcodec protocol into abcs; remove all numcodecs.abc.Codec type…
d-v-b Aug 5, 2025
0d0712f
add tests for codecjson typeguard
d-v-b Aug 5, 2025
931bf2f
avoid using zarr's buffer / ndbuffer for numcodec encode / decode
d-v-b Aug 5, 2025
01bd4b7
use Any to model input / output types of numcodec protocol
d-v-b Aug 5, 2025
f06c6aa
add numcodec protocol
d-v-b Jul 31, 2025
b71e8ac
add tests for numcodecs compatibility
d-v-b Jul 31, 2025
bcaa9ee
changelog
d-v-b Jul 31, 2025
7e49f39
ignore unknown key
d-v-b Jul 31, 2025
4b53f5d
remove re-implementation of get_codec
d-v-b Aug 1, 2025
b35e6c9
avoid circular imports by importing lower-level routines exactly wher…
d-v-b Aug 5, 2025
deef94a
push numcodec protocol into abcs; remove all numcodecs.abc.Codec type…
d-v-b Aug 5, 2025
f057525
add tests for codecjson typeguard
d-v-b Aug 5, 2025
190e1b2
avoid using zarr's buffer / ndbuffer for numcodec encode / decode
d-v-b Aug 5, 2025
82992c5
use Any to model input / output types of numcodec protocol
d-v-b Aug 5, 2025
7ea7e91
Merge branch 'feat/numcodecs-protocol' of github.com:d-v-b/zarr-pytho…
d-v-b Aug 5, 2025
413573a
Merge branch 'main' of github.com:zarr-developers/zarr-python into fe…
d-v-b Aug 6, 2025
cee4389
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 6, 2025
76f666c
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 6, 2025
c86be01
Update src/zarr/abc/numcodec.py
d-v-b Aug 10, 2025
dba39f5
Update src/zarr/abc/numcodec.py
d-v-b Aug 10, 2025
a857fc2
Update src/zarr/abc/numcodec.py
d-v-b Aug 10, 2025
a082222
Update src/zarr/abc/numcodec.py
d-v-b Aug 10, 2025
ccaaa65
Update src/zarr/abc/numcodec.py
d-v-b Aug 10, 2025
c1991e4
Merge branch 'feat/numcodecs-protocol' of github.com:d-v-b/zarr-pytho…
d-v-b Aug 13, 2025
bb28d1d
fix docstrings
d-v-b Aug 13, 2025
eedea84
revert changes to store imports
d-v-b Aug 13, 2025
fcc010b
remove whitespace
d-v-b Aug 13, 2025
0166d44
fix docstring
d-v-b Aug 13, 2025
ab19c46
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 13, 2025
194e70d
Merge branch 'main' into feat/numcodecs-protocol
d-v-b Aug 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changes/3318.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Define a ``Protocol`` to model the ``numcodecs.abc.Codec`` interface. This is groundwork toward
making ``numcodecs`` an optional dependency for ``zarr-python``.
28 changes: 26 additions & 2 deletions src/zarr/abc/codec.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
from __future__ import annotations

from abc import abstractmethod
from typing import TYPE_CHECKING, Generic, TypeVar
from collections.abc import Mapping
from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar

from typing_extensions import ReadOnly, TypedDict

from zarr.abc.metadata import Metadata
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import ChunkCoords, concurrent_map
from zarr.core.common import ChunkCoords, NamedConfig, concurrent_map
from zarr.core.config import config

if TYPE_CHECKING:
Expand Down Expand Up @@ -34,6 +37,27 @@
CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer)
CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer)

TName = TypeVar("TName", bound=str, covariant=True)


class CodecJSON_V2(TypedDict, Generic[TName]):
    """
    The JSON representation of a codec for Zarr V2.

    The only key declared here is ``id``. The class is generic over ``TName``, which is the
    type of the value stored under that key.
    """

    # Read-only ``id`` key; its value type is the ``TName`` type parameter.
    id: ReadOnly[TName]


def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]:
return isinstance(data, Mapping) and "id" in data and isinstance(data["id"], str)


CodecJSON_V3 = str | NamedConfig[str, Mapping[str, object]]
"""The JSON representation of a codec for Zarr V3."""

# The widest type we will *accept* for a codec JSON
# This covers v2 and v3
CodecJSON = str | Mapping[str, object]
"""The widest type of JSON-like input that could specify a codec."""


class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]):
"""Generic base class for codecs.
Expand Down
54 changes: 54 additions & 0 deletions src/zarr/abc/numcodec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from typing import Any, Self, TypeGuard

from typing_extensions import Protocol


class Numcodec(Protocol):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried :( but I learned that @runtime_checkable doesn't work for protocols with non-methods (i.e., attributes), and the numcodecs.abc.Codec ABC uses codec_id as an attribute. I'm not 100% sure we need the codec_id here, but if we ever wanted to register these codecs, then it would be important.

"""
A protocol that models the ``numcodecs.abc.Codec`` interface.
"""

codec_id: str

def encode(self, buf: Any) -> Any: ...

def decode(self, buf: Any, out: Any | None = None) -> Any: ...

def get_config(self) -> Any: ...

@classmethod
def from_config(cls, config: Any) -> Self: ...


def _is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]:
"""
Check if the given object is a class implements the Numcodec protocol.

The @runtime_checkable decorator does not allow issubclass checks for protocols with non-method
members (i.e., attributes), so we use this function to manually check for the presence of the
required attributes and methods on a given object.
"""
return (
isinstance(obj, type)
and hasattr(obj, "codec_id")
and isinstance(obj.codec_id, str)
and hasattr(obj, "encode")
and callable(obj.encode)
and hasattr(obj, "decode")
and callable(obj.decode)
and hasattr(obj, "get_config")
and callable(obj.get_config)
and hasattr(obj, "from_config")
and callable(obj.from_config)
)


def _is_numcodec(obj: object) -> TypeGuard[Numcodec]:
    """
    Report whether ``obj`` is an instance of a class that satisfies the Numcodec protocol.

    The decision is delegated to ``_is_numcodec_cls``, which is applied to ``type(obj)``
    rather than to ``obj`` itself, so the result depends on what is reachable from the
    object's class.
    """
    obj_cls = type(obj)
    return _is_numcodec_cls(obj_cls)
9 changes: 5 additions & 4 deletions src/zarr/abc/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
from itertools import starmap
from typing import TYPE_CHECKING, Protocol, runtime_checkable

from zarr.core.buffer.core import default_buffer_prototype
from zarr.core.common import concurrent_map
from zarr.core.config import config

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, AsyncIterator, Iterable
from types import TracebackType
Expand Down Expand Up @@ -438,6 +434,8 @@ async def getsize(self, key: str) -> int:
# Note to implementers: this default implementation is very inefficient since
# it requires reading the entire object. Many systems will have ways to get the
# size of an object without reading it.
from zarr.core.buffer.core import default_buffer_prototype

value = await self.get(key, prototype=default_buffer_prototype())
if value is None:
raise FileNotFoundError(key)
Expand Down Expand Up @@ -476,6 +474,9 @@ async def getsize_prefix(self, prefix: str) -> int:
# on to getting sizes. Ideally we would overlap those two, which should
# improve tail latency and might reduce memory pressure (since not all keys
# would be in memory at once).
from zarr.core.common import concurrent_map
from zarr.core.config import config

keys = [(x,) async for x in self.list_prefix(prefix)]
limit = config.get("async.concurrency")
sizes = await concurrent_map(keys, self.getsize, limit=limit)
Expand Down
5 changes: 2 additions & 3 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,8 @@
if TYPE_CHECKING:
from collections.abc import Iterable

import numcodecs.abc

from zarr.abc.codec import Codec
from zarr.abc.numcodec import Numcodec
from zarr.core.buffer import NDArrayLikeOrScalar
from zarr.core.chunk_key_encodings import ChunkKeyEncoding
from zarr.storage import StoreLike
Expand Down Expand Up @@ -877,7 +876,7 @@ async def create(
overwrite: bool = False,
path: PathLike | None = None,
chunk_store: StoreLike | None = None,
filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None,
filters: Iterable[dict[str, JSON] | Numcodec] | None = None,
cache_metadata: bool | None = None,
cache_attrs: bool | None = None,
read_only: bool | None = None,
Expand Down
4 changes: 2 additions & 2 deletions src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
if TYPE_CHECKING:
from collections.abc import Iterable

import numcodecs.abc
import numpy as np
import numpy.typing as npt

from zarr.abc.codec import Codec
from zarr.abc.numcodec import Numcodec
from zarr.api.asynchronous import ArrayLike, PathLike
from zarr.core.array import (
CompressorsLike,
Expand Down Expand Up @@ -610,7 +610,7 @@ def create(
overwrite: bool = False,
path: PathLike | None = None,
chunk_store: StoreLike | None = None,
filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None,
filters: Iterable[dict[str, JSON] | Numcodec] | None = None,
cache_metadata: bool | None = None,
cache_attrs: bool | None = None,
read_only: bool | None = None,
Expand Down
10 changes: 3 additions & 7 deletions src/zarr/codecs/_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,22 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

import numcodecs
import numpy as np
from numcodecs.compat import ensure_bytes, ensure_ndarray_like

from zarr.abc.codec import ArrayBytesCodec
from zarr.registry import get_ndbuffer_class

if TYPE_CHECKING:
import numcodecs.abc

from zarr.abc.numcodec import Numcodec
from zarr.core.array_spec import ArraySpec
from zarr.core.buffer import Buffer, NDBuffer


@dataclass(frozen=True)
class V2Codec(ArrayBytesCodec):
filters: tuple[numcodecs.abc.Codec, ...] | None
compressor: numcodecs.abc.Codec | None
filters: tuple[Numcodec, ...] | None
compressor: Numcodec | None

is_fixed_size = False

Expand Down Expand Up @@ -86,7 +84,6 @@ async def _encode_single(
if self.filters:
for f in self.filters:
chunk = await asyncio.to_thread(f.encode, chunk)

# check object encoding
if ensure_ndarray_like(chunk).dtype == object:
raise RuntimeError("cannot write object array without object codec")
Expand All @@ -96,7 +93,6 @@ async def _encode_single(
cdata = await asyncio.to_thread(self.compressor.encode, chunk)
else:
cdata = chunk

cdata = ensure_bytes(cdata)
return chunk_spec.prototype.buffer.from_bytes(cdata)

Expand Down
7 changes: 3 additions & 4 deletions src/zarr/core/_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
import numcodecs.abc

from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
from zarr.abc.numcodec import Numcodec
from zarr.core.common import ZarrFormat
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType

Expand Down Expand Up @@ -88,9 +87,9 @@ class ArrayInfo:
_order: Literal["C", "F"]
_read_only: bool
_store_type: str
_filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = ()
_filters: tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] = ()
_serializer: ArrayBytesCodec | None = None
_compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = ()
_compressors: tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] = ()
_count_bytes: int | None = None
_count_bytes_stored: int | None = None
_count_chunks_initialized: int | None = None
Expand Down
Loading
Loading