Skip to content

Commit 35cdfa6

Browse files
committed
Write sampling state periodically
1 parent 147b92e commit 35cdfa6

File tree

4 files changed

+186
-16
lines changed

4 files changed

+186
-16
lines changed

pymc/sampling/mcmc.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
Any,
2727
Literal,
2828
TypeAlias,
29+
cast,
2930
overload,
3031
)
3132

@@ -40,6 +41,7 @@
4041
from rich.theme import Theme
4142
from threadpoolctl import threadpool_limits
4243
from typing_extensions import Protocol
44+
from zarr.storage import MemoryStore
4345

4446
import pymc as pm
4547

@@ -50,7 +52,7 @@
5052
find_observations,
5153
)
5254
from pymc.backends.base import IBaseTrace, MultiTrace, _choose_chains
53-
from pymc.backends.zarr import ZarrTrace
55+
from pymc.backends.zarr import ZarrChain, ZarrTrace
5456
from pymc.blocking import DictToArrayBijection
5557
from pymc.exceptions import SamplingError
5658
from pymc.initial_point import PointType, StartDict, make_initial_point_fns_per_chain
@@ -1275,6 +1277,8 @@ def _iter_sample(
12751277
step.set_rng(rng)
12761278

12771279
point = start
1280+
if isinstance(trace, ZarrChain):
1281+
trace.link_stepper(step)
12781282

12791283
try:
12801284
step.tune = bool(tune)
@@ -1297,12 +1301,18 @@ def _iter_sample(
12971301

12981302
yield diverging
12991303
except KeyboardInterrupt:
1304+
if isinstance(trace, ZarrChain):
1305+
trace.record_sampling_state(step=step)
13001306
trace.close()
13011307
raise
13021308
except BaseException:
1309+
if isinstance(trace, ZarrChain):
1310+
trace.record_sampling_state(step=step)
13031311
trace.close()
13041312
raise
13051313
else:
1314+
if isinstance(trace, ZarrChain):
1315+
trace.record_sampling_state(step=step)
13061316
trace.close()
13071317

13081318

@@ -1361,6 +1371,19 @@ def _mp_sample(
13611371

13621372
# We did draws += tune in pm.sample
13631373
draws -= tune
1374+
zarr_chains: list[ZarrChain] | None = None
1375+
zarr_recording = False
1376+
if all(isinstance(trace, ZarrChain) for trace in traces):
1377+
if isinstance(cast(ZarrChain, traces[0])._posterior.store, MemoryStore):
1378+
warnings.warn(
1379+
"Parallel sampling with MemoryStore zarr store wont write the processes "
1380+
"step method sampling state. If you wish to be able to access the step "
1381+
"method sampling state, please use a different storage backend, e.g. "
1382+
"DirectoryStore or ZipStore"
1383+
)
1384+
else:
1385+
zarr_chains = cast(list[ZarrChain], traces)
1386+
zarr_recording = True
13641387

13651388
sampler = ps.ParallelSampler(
13661389
draws=draws,
@@ -1374,13 +1397,16 @@ def _mp_sample(
13741397
progressbar_theme=progressbar_theme,
13751398
blas_cores=blas_cores,
13761399
mp_ctx=mp_ctx,
1400+
zarr_chains=zarr_chains,
13771401
)
13781402
try:
13791403
try:
13801404
with sampler:
13811405
for draw in sampler:
13821406
strace = traces[draw.chain]
1383-
strace.record(draw.point, draw.stats)
1407+
if not zarr_recording:
1408+
# Zarr recording happens in each process
1409+
strace.record(draw.point, draw.stats)
13841410
log_warning_stats(draw.stats)
13851411

13861412
if callback is not None:

pymc/sampling/parallel.py

+47
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
from collections import namedtuple
2424
from collections.abc import Sequence
25+
from typing import cast
2526

2627
import cloudpickle
2728
import numpy as np
@@ -31,6 +32,7 @@
3132
from rich.theme import Theme
3233
from threadpoolctl import threadpool_limits
3334

35+
from pymc.backends.zarr import ZarrChain
3436
from pymc.blocking import DictToArrayBijection
3537
from pymc.exceptions import SamplingError
3638
from pymc.util import (
@@ -104,13 +106,25 @@ def __init__(
104106
tune: int,
105107
rng_state: RandomGeneratorState,
106108
blas_cores,
109+
chain: int,
110+
zarr_chains: list[ZarrChain] | bytes | None = None,
111+
zarr_chains_is_pickled: bool = False,
107112
):
108113
# For some strange reason, spawn multiprocessing doesn't copy the rng
109114
# seed sequence, so we have to rebuild it from scratch
110115
rng = random_generator_from_state(rng_state)
111116
self._msg_pipe = msg_pipe
112117
self._step_method = step_method
113118
self._step_method_is_pickled = step_method_is_pickled
119+
self.chain = chain
120+
self._zarr_recording = False
121+
self._zarr_chain: ZarrChain | None = None
122+
if zarr_chains_is_pickled:
123+
self._zarr_chain = cloudpickle.loads(zarr_chains)[self.chain]
124+
elif zarr_chains is not None:
125+
self._zarr_chain = cast(list[ZarrChain], zarr_chains)[self.chain]
126+
self._zarr_recording = self._zarr_chain is not None
127+
114128
self._shared_point = shared_point
115129
self._rng = rng
116130
self._draws = draws
@@ -135,6 +149,7 @@ def run(self):
135149
# We do not create this in __init__, as pickling this
136150
# would destroy the shared memory.
137151
self._unpickle_step_method()
152+
self._link_step_to_zarrchain()
138153
self._point = self._make_numpy_refs()
139154
self._start_loop()
140155
except KeyboardInterrupt:
@@ -148,6 +163,10 @@ def run(self):
148163
finally:
149164
self._msg_pipe.close()
150165

166+
def _link_step_to_zarrchain(self):
167+
if self._zarr_recording:
168+
self._zarr_chain.link_stepper(self._step_method)
169+
151170
def _wait_for_abortion(self):
152171
while True:
153172
msg = self._recv_msg()
@@ -170,6 +189,7 @@ def _recv_msg(self):
170189
return self._msg_pipe.recv()
171190

172191
def _start_loop(self):
192+
zarr_recording = self._zarr_recording
173193
self._step_method.set_rng(self._rng)
174194

175195
draw = 0
@@ -199,6 +219,8 @@ def _start_loop(self):
199219
if msg[0] == "abort":
200220
raise KeyboardInterrupt()
201221
elif msg[0] == "write_next":
222+
if zarr_recording:
223+
self._zarr_chain.record(point, stats)
202224
self._write_point(point)
203225
is_last = draw + 1 == self._draws + self._tune
204226
self._msg_pipe.send(("writing_done", is_last, draw, tuning, stats))
@@ -225,6 +247,8 @@ def __init__(
225247
start: dict[str, np.ndarray],
226248
blas_cores,
227249
mp_ctx,
250+
zarr_chains: list[ZarrChain] | None = None,
251+
zarr_chains_pickled: bytes | None = None,
228252
):
229253
self.chain = chain
230254
process_name = f"worker_chain_{chain}"
@@ -247,6 +271,16 @@ def __init__(
247271
self._readable = True
248272
self._num_samples = 0
249273

274+
zarr_chains_send: list[ZarrChain] | bytes | None = None
275+
if zarr_chains_pickled is not None:
276+
zarr_chains_send = zarr_chains_pickled
277+
elif zarr_chains is not None:
278+
if mp_ctx.get_start_method() == "spawn":
279+
raise ValueError(
280+
"please provide a pre-pickled zarr_chains when multiprocessing start method is 'spawn'"
281+
)
282+
zarr_chains_send = zarr_chains
283+
250284
if step_method_pickled is not None:
251285
step_method_send = step_method_pickled
252286
else:
@@ -270,6 +304,9 @@ def __init__(
270304
tune,
271305
get_state_from_generator(rng),
272306
blas_cores,
307+
self.chain,
308+
zarr_chains_send,
309+
zarr_chains_pickled is not None,
273310
),
274311
)
275312
self._process.start()
@@ -392,6 +429,7 @@ def __init__(
392429
progressbar_theme: Theme | None = default_progress_theme,
393430
blas_cores: int | None = None,
394431
mp_ctx=None,
432+
zarr_chains: list[ZarrChain] | None = None,
395433
):
396434
if any(len(arg) != chains for arg in [rngs, start_points]):
397435
raise ValueError(f"Number of rngs and start_points must be {chains}.")
@@ -412,8 +450,15 @@ def __init__(
412450
mp_ctx = multiprocessing.get_context(mp_ctx)
413451

414452
step_method_pickled = None
453+
zarr_chains_pickled = None
454+
self.zarr_recording = False
455+
if zarr_chains is not None:
456+
assert all(isinstance(zarr_chain, ZarrChain) for zarr_chain in zarr_chains)
457+
self.zarr_recording = True
415458
if mp_ctx.get_start_method() != "fork":
416459
step_method_pickled = cloudpickle.dumps(step_method, protocol=-1)
460+
if zarr_chains is not None:
461+
zarr_chains_pickled = cloudpickle.dumps(zarr_chains, protocol=-1)
417462

418463
self._samplers = [
419464
ProcessAdapter(
@@ -426,6 +471,8 @@ def __init__(
426471
start,
427472
blas_cores,
428473
mp_ctx,
474+
zarr_chains=zarr_chains,
475+
zarr_chains_pickled=zarr_chains_pickled,
429476
)
430477
for chain, rng, start in zip(range(chains), rngs, start_points)
431478
]

pymc/sampling/population.py

+32-3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from rich.progress import BarColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn
2828

2929
from pymc.backends.base import BaseTrace
30+
from pymc.backends.zarr import ZarrChain
3031
from pymc.initial_point import PointType
3132
from pymc.model import Model, modelcontext
3233
from pymc.stats.convergence import log_warning_stats
@@ -36,6 +37,7 @@
3637
PopulationArrayStepShared,
3738
StatsType,
3839
)
40+
from pymc.step_methods.compound import StepMethodState
3941
from pymc.step_methods.metropolis import DEMetropolis
4042
from pymc.util import CustomProgress
4143

@@ -81,6 +83,11 @@ def _sample_population(
8183
Show progress bars? (defaults to True)
8284
parallelize : bool
8385
Setting for multiprocess parallelization
86+
traces : Sequence[BaseTrace]
87+
A sequence of chain traces where the sampling results will be stored. Can be
88+
a sequence of :py:class:`~pymc.backends.ndarray.NDArray`,
89+
:py:class:`~pymc.backends.mcbackend.ChainRecordAdapter`, or
90+
:py:class:`~pymc.backends.zarr.ZarrChain`.
8491
"""
8592
warn_population_size(
8693
step=step,
@@ -263,6 +270,9 @@ def _run_secondary(c, stepper_dumps, secondary_end, task, progress):
263270
# receiving a None is the signal to exit
264271
if incoming is None:
265272
break
273+
elif incoming == "sampling_state":
274+
secondary_end.send((c, stepper.sampling_state))
275+
continue
266276
tune_stop, population = incoming
267277
if tune_stop:
268278
stepper.stop_tuning()
@@ -307,6 +317,14 @@ def step(self, tune_stop: bool, population) -> list[tuple[PointType, StatsType]]
307317
updates.append(self._steppers[c].step(population[c]))
308318
return updates
309319

320+
def request_sampling_state(self, chain) -> StepMethodState:
321+
if self.is_parallelized:
322+
self._primary_ends[chain].send(("sampling_state",))
323+
_, sampling_state = self._primary_ends[chain].recv()
324+
else:
325+
sampling_state = self._steppers[chain].sampling_state
326+
return sampling_state
327+
310328

311329
def _prepare_iter_population(
312330
*,
@@ -332,6 +350,11 @@ def _prepare_iter_population(
332350
Start points for each chain
333351
parallelize : bool
334352
Setting for multiprocess parallelization
353+
traces : Sequence[BaseTrace]
354+
A sequence of chain traces where the sampling results will be stored. Can be
355+
a sequence of :py:class:`~pymc.backends.ndarray.NDArray`,
356+
:py:class:`~pymc.backends.mcbackend.ChainRecordAdapter`, or
357+
:py:class:`~pymc.backends.zarr.ZarrChain`.
335358
tune : int
336359
Number of iterations to tune.
337360
rngs: sequence of random Generators
@@ -411,8 +434,11 @@ def _iter_population(
411434
the helper object for (parallelized) stepping of chains
412435
steppers : list
413436
The step methods for each chain
414-
traces : list
415-
Traces for each chain
437+
traces : Sequence[BaseTrace]
438+
A sequence of chain traces where the sampling results will be stored. Can be
439+
a sequence of :py:class:`~pymc.backends.ndarray.NDArray`,
440+
:py:class:`~pymc.backends.mcbackend.ChainRecordAdapter`, or
441+
:py:class:`~pymc.backends.zarr.ZarrChain`.
416442
points : list
417443
population of chain states
418444
@@ -432,8 +458,11 @@ def _iter_population(
432458
# apply the update to the points and record to the traces
433459
for c, strace in enumerate(traces):
434460
points[c], stats = updates[c]
435-
strace.record(points[c], stats)
461+
flushed = strace.record(points[c], stats)
436462
log_warning_stats(stats)
463+
if flushed and isinstance(strace, ZarrChain):
464+
sampling_state = popstep.request_sampling_state(c)
465+
strace.store_sampling_state(sampling_state)
437466
# yield the state of all chains in parallel
438467
yield i
439468
except KeyboardInterrupt:

0 commit comments

Comments
 (0)