From 94e460eeac2e779c0ba0d92e669650125cfe0059 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Wed, 22 Jan 2025 13:37:25 -0800
Subject: [PATCH 1/8] Update GPU handling

This updates how we handle GPU buffers. See the new docs page for a
simple example.

The basic idea, as discussed in ..., is to use host buffers for all
metadata objects and device buffers for data.

Zarr has two types of buffers: plain buffers (used for a stream of
bytes) and ndbuffers (used for bytes that represent ndarrays). To make
it easier for users, I've added a new config option
`zarr.config.enable_gpu()` that can be used to update both at once. If
we need additional customizations in the future, we can add them here.
---
 docs/user-guide/gpu.rst     | 25 +++++++++++++++++++++++++
 docs/user-guide/index.rst   |  1 +
 src/zarr/core/array.py      | 16 +++++++++-------
 src/zarr/core/buffer/gpu.py |  7 +++++++
 src/zarr/core/config.py     | 13 ++++++++++++-
 src/zarr/testing/utils.py   |  2 +-
 tests/test_api.py           | 36 ++++++++++++++++++++++++++++++++++++
 7 files changed, 91 insertions(+), 9 deletions(-)
 create mode 100644 docs/user-guide/gpu.rst

diff --git a/docs/user-guide/gpu.rst b/docs/user-guide/gpu.rst
new file mode 100644
index 0000000000..7cff5082eb
--- /dev/null
+++ b/docs/user-guide/gpu.rst
@@ -0,0 +1,25 @@
+.. _user-guide-gpu:
+
+Using GPUs with Zarr
+====================
+
+Zarr can be used along with GPUs to accelerate your workload. Currently,
+Zarr supports reading data into GPU memory. In the future, Zarr will
+support GPU-accelerated codecs and file IO.
+
+Reading data into device memory
+-------------------------------
+
+.. code-block:: python
+
+   >>> import zarr
+   >>> import cupy as cp
+   >>> zarr.config.enable_cuda()
+   >>> store = zarr.storage.MemoryStore()
+   >>> type(z[:10, :10])
+   cupy.ndarray
+
+:meth:`zarr.config.enable_cuda` updates the Zarr configuration to use device
+memory for all data buffers used by Zarr. This means that any reads from a Zarr
+store will return a CuPy ndarray rather than a NumPy ndarray. Any buffers used
+for metadata will be on the host.
\ No newline at end of file
diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index a7bbd12453..c50713332b 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -23,6 +23,7 @@ Advanced Topics
     performance
     consolidated_metadata
     extending
+    gpu
 
 
 .. Coming soon
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 632e8221b4..0f71a59c73 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -38,6 +38,7 @@
     NDBuffer,
     default_buffer_prototype,
 )
+from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype
 from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks
 from zarr.core.chunk_key_encodings import (
     ChunkKeyEncoding,
@@ -163,19 +164,20 @@ async def get_array_metadata(
 ) -> dict[str, JSON]:
     if zarr_format == 2:
         zarray_bytes, zattrs_bytes = await gather(
-            (store_path / ZARRAY_JSON).get(), (store_path / ZATTRS_JSON).get()
+            (store_path / ZARRAY_JSON).get(prototype=cpu_buffer_prototype),
+            (store_path / ZATTRS_JSON).get(prototype=cpu_buffer_prototype),
         )
         if zarray_bytes is None:
             raise FileNotFoundError(store_path)
     elif zarr_format == 3:
-        zarr_json_bytes = await (store_path / ZARR_JSON).get()
+        zarr_json_bytes = await (store_path / ZARR_JSON).get(prototype=cpu_buffer_prototype)
         if zarr_json_bytes is None:
             raise FileNotFoundError(store_path)
     elif zarr_format is None:
         zarr_json_bytes, zarray_bytes, zattrs_bytes = await gather(
-            (store_path / ZARR_JSON).get(),
-            (store_path / ZARRAY_JSON).get(),
-            (store_path / ZATTRS_JSON).get(),
+            (store_path / ZARR_JSON).get(prototype=cpu_buffer_prototype),
+            (store_path / ZARRAY_JSON).get(prototype=cpu_buffer_prototype),
+            (store_path / ZATTRS_JSON).get(prototype=cpu_buffer_prototype),
         )
         if zarr_json_bytes is not None and zarray_bytes is not None:
             # warn and favor v3
@@ -1295,7 +1297,7 @@ async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = F
         """
         Asynchronously save the array metadata.
         """
-        to_save = metadata.to_buffer_dict(default_buffer_prototype())
+        to_save = metadata.to_buffer_dict(cpu_buffer_prototype)
         awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()]
 
         if ensure_parents:
@@ -1307,7 +1309,7 @@ async def _save_metadata(self, metadata: ArrayMetadata, ensure_parents: bool = F
                     [
                         (parent.store_path / key).set_if_not_exists(value)
                         for key, value in parent.metadata.to_buffer_dict(
-                            default_buffer_prototype()
+                            cpu_buffer_prototype
                         ).items()
                     ]
                 )
diff --git a/src/zarr/core/buffer/gpu.py b/src/zarr/core/buffer/gpu.py
index 6941c8897e..aac6792cff 100644
--- a/src/zarr/core/buffer/gpu.py
+++ b/src/zarr/core/buffer/gpu.py
@@ -13,6 +13,10 @@
 
 from zarr.core.buffer import core
 from zarr.core.buffer.core import ArrayLike, BufferPrototype, NDArrayLike
+from zarr.registry import (
+    register_buffer,
+    register_ndbuffer,
+)
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -215,3 +219,6 @@ def __setitem__(self, key: Any, value: Any) -> None:
 
 
 buffer_prototype = BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
+
+register_buffer(Buffer)
+register_ndbuffer(NDBuffer)
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
index 051e8c68e1..c565cb0708 100644
--- a/src/zarr/core/config.py
+++ b/src/zarr/core/config.py
@@ -29,10 +29,13 @@
 
 from __future__ import annotations
 
-from typing import Any, Literal, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from donfig import Config as DConfig
 
+if TYPE_CHECKING:
+    from donfig.config_obj import ConfigSet
+
 
 class BadConfigError(ValueError):
     _msg = "bad Config: %r"
@@ -56,6 +59,14 @@ def reset(self) -> None:
         self.clear()
         self.refresh()
 
+    def enable_gpu(self) -> ConfigSet:
+        """
+        Configure Zarr to use GPUs where possible.
+        """
+        return self.set(
+            {"buffer": "zarr.core.buffer.gpu.Buffer", "ndbuffer": "zarr.core.buffer.gpu.NDBuffer"}
+        )
+
 
 # The default configuration for zarr
 config = Config(
diff --git a/src/zarr/testing/utils.py b/src/zarr/testing/utils.py
index c7b6e7939c..0a93b93fdb 100644
--- a/src/zarr/testing/utils.py
+++ b/src/zarr/testing/utils.py
@@ -38,7 +38,7 @@ def has_cupy() -> bool:
         return False
 
 
-T_Callable = TypeVar("T_Callable", bound=Callable[[], Coroutine[Any, Any, None]])
+T_Callable = TypeVar("T_Callable", bound=Callable[..., Coroutine[Any, Any, None] | None])
 
 
 # Decorator for GPU tests
diff --git a/tests/test_api.py b/tests/test_api.py
index aacd558f2a..c52c2ed68d 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -27,6 +27,7 @@
 from zarr.errors import MetadataValidationError
 from zarr.storage import MemoryStore
 from zarr.storage._utils import normalize_path
+from zarr.testing.utils import gpu_test
 
 
 def test_create(memory_store: Store) -> None:
@@ -1121,3 +1122,38 @@ def test_open_array_with_mode_r_plus(store: Store) -> None:
     assert isinstance(z2, Array)
     assert (z2[:] == 1).all()
     z2[:] = 3
+
+
+@gpu_test
+@pytest.mark.parametrize(
+    "store",
+    ["local", "memory", "zip"],
+    indirect=True,
+)
+@pytest.mark.parametrize("zarr_format", [None, 2, 3])
+def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None:
+    import cupy as cp
+
+    if zarr_format == 2:
+        # Without this, the zstd codec attempts to convert the cupy
+        # array to bytes.
+        compressors = None
+    else:
+        compressors = "auto"
+
+    with zarr.config.enable_gpu():
+        src = cp.random.uniform(size=(100, 100))  # allocate on the device
+        z = zarr.create_array(
+            store,
+            name="a",
+            shape=src.shape,
+            chunks=(10, 10),
+            dtype=src.dtype,
+            overwrite=True,
+            zarr_format=zarr_format,
+            compressors=compressors,
+        )
+        z[:10, :10] = src[:10, :10]
+
+        result = z[:10, :10]
+        cp.testing.assert_array_equal(result, src[:10, :10])

From 9884f365b58e5a539f4edd527884df7a5c3ee05d Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Thu, 23 Jan 2025 04:38:24 -0800
Subject: [PATCH 2/8] fixed doc

---
 docs/user-guide/gpu.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/user-guide/gpu.rst b/docs/user-guide/gpu.rst
index 7cff5082eb..4698ab5e6b 100644
--- a/docs/user-guide/gpu.rst
+++ b/docs/user-guide/gpu.rst
@@ -16,6 +16,7 @@ Reading data into device memory
    >>> import cupy as cp
    >>> zarr.config.enable_cuda()
    >>> store = zarr.storage.MemoryStore()
+   >>> z = zarr.create_array(store=store, shape=(100, 100), chunks=(10, 10), dtype="float32")
    >>> type(z[:10, :10])
    cupy.ndarray
 

From 8371513b9329c67f9262e97623a9f80367bc4195 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Thu, 30 Jan 2025 13:35:06 -0800
Subject: [PATCH 3/8] Fixup

---
 docs/user-guide/config.rst |  1 +
 docs/user-guide/gpu.rst    | 29 ++++++++++++++++++++---------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst
index 3662f75dff..3d9fdafcfb 100644
--- a/docs/user-guide/config.rst
+++ b/docs/user-guide/config.rst
@@ -32,6 +32,7 @@ Configuration options include the following:
 - Whether empty chunks are written to storage ``array.write_empty_chunks``
 - Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
 - Selections of implementations of codecs, codec pipelines and buffers
+- Enabling GPU support with ``enable_cuda()``. See :ref:`user-guide-gpu` for more.
 
 For selecting custom implementations of codecs, pipelines, buffers and ndbuffers,
 first register the implementations in the registry and then select them in the config.
diff --git a/docs/user-guide/gpu.rst b/docs/user-guide/gpu.rst
index 4698ab5e6b..6cd6db2a1b 100644
--- a/docs/user-guide/gpu.rst
+++ b/docs/user-guide/gpu.rst
@@ -3,24 +3,35 @@
 Using GPUs with Zarr
 ====================
 
-Zarr can be used along with GPUs to accelerate your workload. Currently,
-Zarr supports reading data into GPU memory. In the future, Zarr will
-support GPU-accelerated codecs and file IO.
+Zarr can use GPUs to accelerate your workload. To enable GPU support, call
+:meth:`zarr.config.enable_gpu`.
+
+.. note::
+
+   `zarr-python` currently supports reading the ndarray data into device (GPU)
+   memory as the final stage of the codec pipeline. Data will still be read into
+   or copied to host (CPU) memory for encoding and decoding.
+
+   In the future, codecs will be available for compressing and decompressing data on
+   the GPU, avoiding the need to move data between the host and device for
+   compression and decompression.
 
 Reading data into device memory
 -------------------------------
 
+:meth:`zarr.config.enable_gpu` configures Zarr to use GPU memory for the data
+buffers used internally by Zarr.
+
 .. code-block:: python
 
    >>> import zarr
    >>> import cupy as cp
-   >>> zarr.config.enable_cuda()
+   >>> zarr.config.enable_gpu()
    >>> store = zarr.storage.MemoryStore()
-   >>> z = zarr.create_array(store=store, shape=(100, 100), chunks=(10, 10), dtype="float32")
+   >>> z = zarr.create_array(
+   ...     store=store, shape=(100, 100), chunks=(10, 10), dtype="float32",
+   ... )
    >>> type(z[:10, :10])
    cupy.ndarray
 
-:meth:`zarr.config.enable_cuda` updates the Zarr configuration to use device
-memory for all data buffers used by Zarr. This means that any reads from a Zarr
-store will return a CuPy ndarray rather than a NumPy ndarray. Any buffers used
-for metadata will be on the host.
\ No newline at end of file
+Note that the output type is a ``cupy.ndarray`` rather than a NumPy array.

From 56cc00c8b597b87d43f2882fba094dd52d801622 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Thu, 30 Jan 2025 13:42:33 -0800
Subject: [PATCH 4/8] changelog

---
 changes/2751.bugfix.rst    | 1 +
 changes/2751.doc.rst       | 1 +
 changes/2751.feature.rst   | 1 +
 docs/user-guide/config.rst | 2 +-
 4 files changed, 4 insertions(+), 1 deletion(-)
 create mode 100644 changes/2751.bugfix.rst
 create mode 100644 changes/2751.doc.rst
 create mode 100644 changes/2751.feature.rst

diff --git a/changes/2751.bugfix.rst b/changes/2751.bugfix.rst
new file mode 100644
index 0000000000..6f737999cf
--- /dev/null
+++ b/changes/2751.bugfix.rst
@@ -0,0 +1 @@
+Fixed bug with Zarr using device memory, instead of host memory, for storing metadata when using GPUs.
\ No newline at end of file
diff --git a/changes/2751.doc.rst b/changes/2751.doc.rst
new file mode 100644
index 0000000000..19fbcbeea6
--- /dev/null
+++ b/changes/2751.doc.rst
@@ -0,0 +1 @@
+Added new user guide on :ref:`user-guide-gpu`.
\ No newline at end of file
diff --git a/changes/2751.feature.rst b/changes/2751.feature.rst
new file mode 100644
index 0000000000..61d97479c6
--- /dev/null
+++ b/changes/2751.feature.rst
@@ -0,0 +1 @@
+Added :meth:`zarr.config.enable_gpu` to update Zarr's configuration to use GPUs.
\ No newline at end of file
diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst
index 3d9fdafcfb..91ffe50b91 100644
--- a/docs/user-guide/config.rst
+++ b/docs/user-guide/config.rst
@@ -32,7 +32,7 @@ Configuration options include the following:
 - Whether empty chunks are written to storage ``array.write_empty_chunks``
 - Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
 - Selections of implementations of codecs, codec pipelines and buffers
-- Enabling GPU support with ``enable_cuda()``. See :ref:`user-guide-gpu` for more.
+- Enabling GPU support with ``zarr.config.enable_gpu()``. See :ref:`user-guide-gpu` for more.
 
 For selecting custom implementations of codecs, pipelines, buffers and ndbuffers,
 first register the implementations in the registry and then select them in the config.

From 60ba16a1ac7fcaf230aba010b37385bfdd80e23f Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Thu, 30 Jan 2025 14:07:11 -0800
Subject: [PATCH 5/8] doctest, skip

---
 docs/user-guide/gpu.rst | 10 +++++-----
 pyproject.toml          |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/user-guide/gpu.rst b/docs/user-guide/gpu.rst
index 6cd6db2a1b..4d3492f8bd 100644
--- a/docs/user-guide/gpu.rst
+++ b/docs/user-guide/gpu.rst
@@ -25,13 +25,13 @@ buffers used internally by Zarr.
 .. code-block:: python
 
    >>> import zarr
-   >>> import cupy as cp
-   >>> zarr.config.enable_gpu()
-   >>> store = zarr.storage.MemoryStore()
-   >>> z = zarr.create_array(
+   >>> import cupy as cp  # doctest: +SKIP
+   >>> zarr.config.enable_gpu()  # doctest: +SKIP
+   >>> store = zarr.storage.MemoryStore()  # doctest: +SKIP
+   >>> z = zarr.create_array(  # doctest: +SKIP
    ...     store=store, shape=(100, 100), chunks=(10, 10), dtype="float32",
    ... )
-   >>> type(z[:10, :10])
+   >>> type(z[:10, :10])  # doctest: +SKIP
    cupy.ndarray
 
 Note that the output type is a ``cupy.ndarray`` rather than a NumPy array.
diff --git a/pyproject.toml b/pyproject.toml
index 8d73485dac..36c0a8475b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -168,7 +168,7 @@ features = ["test", "optional"]
 description = "Test environment for doctests"
 
 [tool.hatch.envs.doctest.scripts]
-run = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst'"
+run = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst' -k not gpu"
 fix = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst' --accept"
 list-env = "pip list"
 

From cb67094d001822959d8f96754b70895779ee3f67 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Thu, 30 Jan 2025 14:20:04 -0800
Subject: [PATCH 6/8] removed not gpu

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 36c0a8475b..8d73485dac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -168,7 +168,7 @@ features = ["test", "optional"]
 description = "Test environment for doctests"
 
 [tool.hatch.envs.doctest.scripts]
-run = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst' -k not gpu"
+run = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst'"
 fix = "rm -r data/; pytest docs/user-guide --doctest-glob='*.rst' --accept"
 list-env = "pip list"
 

From 2b70d0d0b44a4f917472e34350f11362e099b768 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Fri, 31 Jan 2025 04:09:52 -0800
Subject: [PATCH 7/8] assert that the type matches

---
 tests/test_api.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_api.py b/tests/test_api.py
index c52c2ed68d..e9db33f6c5 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1156,4 +1156,6 @@ def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None:
         z[:10, :10] = src[:10, :10]
 
         result = z[:10, :10]
+        # assert_array_equal doesn't check the type
+        assert isinstance(result, type(src))
         cp.testing.assert_array_equal(result, src[:10, :10])

From 7c31bc2b8343267e039dc40474aae5506d5ddd6b Mon Sep 17 00:00:00 2001
From: Tom Augspurger <toaugspurger@nvidia.com>
Date: Wed, 5 Feb 2025 05:59:03 -0800
Subject: [PATCH 8/8] Added changelog notes

---
 docs/developers/contributing.rst | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/developers/contributing.rst b/docs/developers/contributing.rst
index 220e24eced..de10fab2c6 100644
--- a/docs/developers/contributing.rst
+++ b/docs/developers/contributing.rst
@@ -230,6 +230,27 @@ during development at `http://0.0.0.0:8000/ <http://0.0.0.0:8000/>`_. This can b
 
     $ hatch --env docs run serve
 
+.. _changelog:
+
+Changelog
+~~~~~~~~~
+
+zarr-python uses `towncrier`_ to manage release notes. Most pull requests should
+include at least one news fragment describing the changes. To add a release
+note, you'll need the GitHub issue or pull request number and the type of your
+change (``feature``, ``bugfix``, ``doc``, ``removal``, ``misc``). With that, run
+``towncrier create`` with your development environment, which will prompt you
+for the issue number, change type, and the news text::
+
+   towncrier create
+
+Alternatively, you can manually create the files in the ``changes`` directory
+using the naming convention ``{issue-number}.{change-type}.rst``.
+
+See the `towncrier`_ docs for more.
+
+.. _towncrier: https://towncrier.readthedocs.io/en/stable/tutorial.html
+
 Development best practices, policies and procedures
 ---------------------------------------------------