
Commit 147b92e

Integrate ZarrTrace into pymc.sample
1 parent a773405

File tree: 3 files changed (+197 -10 lines)

  pymc/backends/__init__.py
  pymc/sampling/mcmc.py
  tests/backends/test_zarr.py

pymc/backends/__init__.py (+14 -1)
@@ -72,6 +72,7 @@
 from pymc.backends.arviz import predictions_to_inference_data, to_inference_data
 from pymc.backends.base import BaseTrace, IBaseTrace
 from pymc.backends.ndarray import NDArray
+from pymc.backends.zarr import ZarrTrace
 from pymc.blocking import PointType
 from pymc.model import Model
 from pymc.step_methods.compound import BlockedStep, CompoundStep
@@ -120,15 +121,27 @@ def _init_trace(

 def init_traces(
     *,
-    backend: TraceOrBackend | None,
+    backend: TraceOrBackend | ZarrTrace | None,
     chains: int,
     expected_length: int,
     step: BlockedStep | CompoundStep,
     initial_point: PointType,
     model: Model,
     trace_vars: list[TensorVariable] | None = None,
+    tune: int = 0,
 ) -> tuple[RunType | None, Sequence[IBaseTrace]]:
     """Initialize a trace recorder for each chain."""
+    if isinstance(backend, ZarrTrace):
+        backend.init_trace(
+            chains=chains,
+            draws=expected_length - tune,
+            tune=tune,
+            step=step,
+            model=model,
+            vars=trace_vars,
+            test_point=initial_point,
+        )
+        return None, backend.straces
     if HAS_MCB and isinstance(backend, Backend):
         return init_chain_adapters(
             backend=backend,
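A note on the new branch above: when the backend is a ZarrTrace, init_traces lets the backend initialize its own storage and hands back its per-chain straces with no Run object. Below is a minimal sketch of that code path, calling init_traces directly with a made-up model, step method, and draw counts; pm.sample normally does this internally.

# Sketch only: pm.sample builds the step method and initial point itself before
# calling init_traces. All concrete values below are made up for illustration.
import pymc as pm
import zarr

from pymc.backends import init_traces
from pymc.backends.zarr import ZarrTrace

with pm.Model() as model:
    pm.Normal("x")
    step = pm.NUTS()
    initial_point = model.initial_point()

    run, straces = init_traces(
        backend=ZarrTrace(store=zarr.MemoryStore()),
        chains=2,
        expected_length=100 + 50,  # draws + tune
        step=step,
        initial_point=initial_point,
        model=model,
        tune=50,
    )

# With a ZarrTrace backend there is no Run object, only the per-chain straces.
assert run is None and len(straces) == 2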

pymc/sampling/mcmc.py (+72 -9)
@@ -50,6 +50,7 @@
     find_observations,
 )
 from pymc.backends.base import IBaseTrace, MultiTrace, _choose_chains
+from pymc.backends.zarr import ZarrTrace
 from pymc.blocking import DictToArrayBijection
 from pymc.exceptions import SamplingError
 from pymc.initial_point import PointType, StartDict, make_initial_point_fns_per_chain
@@ -503,7 +504,7 @@ def sample(
     model: Model | None = None,
     compile_kwargs: dict | None = None,
     **kwargs,
-) -> InferenceData | MultiTrace:
+) -> InferenceData | MultiTrace | ZarrTrace:
     r"""Draw samples from the posterior using the given step methods.

     Multiple step methods are supported via compound step methods.
@@ -570,7 +571,13 @@ def sample(
         Number of iterations of initializer. Only works for 'ADVI' init methods.
     trace : backend, optional
         A backend instance or None.
-        If None, the NDArray backend is used.
+        If ``None``, a ``MultiTrace`` object backed by ``NDArray`` trace objects is
+        used. If ``trace`` is a :class:`~pymc.backends.zarr.ZarrTrace` instance, the
+        drawn samples are written to the chosen storage while sampling is ongoing.
+        This means that sampling runs which, for whatever reason, die mid-execution
+        will still have written their partial results to the storage. If the storage
+        persists on disk, these results remain available even after a server crash.
+        See :class:`~pymc.backends.zarr.ZarrTrace` for more information.
     discard_tuned_samples : bool
         Whether to discard posterior samples of the tune interval.
     compute_convergence_checks : bool, default=True
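A rough usage sketch of the behavior documented above (not part of the commit; the toy model is made up). A zarr.MemoryStore keeps the example self-contained; a persistent zarr store, e.g. one backed by a directory on disk, is what gives the crash-resilience described in the docstring.

# Rough usage sketch only; the toy model and draw counts are made up.
import pymc as pm
import zarr

from pymc.backends.zarr import ZarrTrace

with pm.Model():
    mu = pm.Normal("mu", 0.0, 1.0)
    pm.Normal("obs", mu, 1.0, observed=[-0.3, 0.1, 0.4])

    # Draws are written to the store as sampling progresses, so a run that
    # dies partway through still leaves its partial results in the store.
    trace = ZarrTrace(store=zarr.MemoryStore())
    idata = pm.sample(draws=500, tune=500, trace=trace)  # InferenceData by default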
@@ -607,8 +614,12 @@ def sample(

     Returns
     -------
-    trace : pymc.backends.base.MultiTrace or arviz.InferenceData
-        A ``MultiTrace`` or ArviZ ``InferenceData`` object that contains the samples.
+    trace : pymc.backends.base.MultiTrace | pymc.backends.zarr.ZarrTrace | arviz.InferenceData
+        A ``MultiTrace``, :class:`~arviz.InferenceData` or
+        :class:`~pymc.backends.zarr.ZarrTrace` object that contains the samples. A
+        ``ZarrTrace`` is only returned if the supplied ``trace`` argument is a
+        ``ZarrTrace`` instance. Refer to :class:`~pymc.backends.zarr.ZarrTrace` for
+        the benefits this backend provides.

     Notes
     -----
@@ -741,7 +752,7 @@ def joined_blas_limiter():
     rngs = get_random_generator(random_seed).spawn(chains)
     random_seed_list = [rng.integers(2**30) for rng in rngs]

-    if not discard_tuned_samples and not return_inferencedata:
+    if not discard_tuned_samples and not return_inferencedata and not isinstance(trace, ZarrTrace):
         warnings.warn(
             "Tuning samples will be included in the returned `MultiTrace` object, which can lead to"
             " complications in your downstream analysis. Please consider to switch to `InferenceData`:\n"
@@ -852,6 +863,7 @@ def joined_blas_limiter():
         trace_vars=trace_vars,
         initial_point=initial_points[0],
         model=model,
+        tune=tune,
     )

     sample_args = {
@@ -934,7 +946,7 @@ def joined_blas_limiter():
     # into a function to make it easier to test and refactor.
     return _sample_return(
         run=run,
-        traces=traces,
+        traces=trace if isinstance(trace, ZarrTrace) else traces,
         tune=tune,
         t_sampling=t_sampling,
         discard_tuned_samples=discard_tuned_samples,
@@ -949,7 +961,7 @@
 def _sample_return(
     *,
     run: RunType | None,
-    traces: Sequence[IBaseTrace],
+    traces: Sequence[IBaseTrace] | ZarrTrace,
     tune: int,
     t_sampling: float,
     discard_tuned_samples: bool,
@@ -958,18 +970,69 @@ def _sample_return(
     keep_warning_stat: bool,
     idata_kwargs: dict[str, Any],
     model: Model,
-) -> InferenceData | MultiTrace:
+) -> InferenceData | MultiTrace | ZarrTrace:
     """Pick/slice chains, run diagnostics and convert to the desired return type.

     Final step of `pm.sampler`.
     """
+    if isinstance(traces, ZarrTrace):
+        # Split warmup from posterior samples
+        traces.split_warmup_groups()
+
+        # Set sampling time
+        traces.sampling_time = t_sampling
+
+        # Compute number of actual draws per chain
+        total_draws_per_chain = traces._sampling_state.draw_idx[:]
+        n_chains = len(traces.straces)
+        desired_tune = traces.tuning_steps
+        desired_draw = len(traces.posterior.draw)
+        tuning_steps_per_chain = np.clip(total_draws_per_chain, 0, desired_tune)
+        draws_per_chain = total_draws_per_chain - tuning_steps_per_chain
+
+        total_n_tune = tuning_steps_per_chain.sum()
+        total_draws = draws_per_chain.sum()
+
+        _log.info(
+            f'Sampling {n_chains} chain{"s" if n_chains > 1 else ""} for {desired_tune:_d} desired tune and {desired_draw:_d} desired draw iterations '
+            f"(Actually sampled {total_n_tune:_d} tune and {total_draws:_d} draws total) "
+            f"took {t_sampling:.0f} seconds."
+        )
+
+        if compute_convergence_checks or return_inferencedata:
+            idata = traces.to_inferencedata(save_warmup=not discard_tuned_samples)
+            log_likelihood = idata_kwargs.pop("log_likelihood", False)
+            if log_likelihood:
+                from pymc.stats.log_density import compute_log_likelihood
+
+                idata = compute_log_likelihood(
+                    idata,
+                    var_names=None if log_likelihood is True else log_likelihood,
+                    extend_inferencedata=True,
+                    model=model,
+                    sample_dims=["chain", "draw"],
+                    progressbar=False,
+                )
+            if compute_convergence_checks:
+                warns = run_convergence_checks(idata, model)
+                for warn in warns:
+                    traces._sampling_state.global_warnings.append(np.array([warn]))
+                log_warnings(warns)
+
+        if return_inferencedata:
+            # By default we drop the "warning" stat which contains `SamplerWarning`
+            # objects that can not be stored with `.to_netcdf()`.
+            if not keep_warning_stat:
+                return drop_warning_stat(idata)
+            return idata
+        return traces
+
     # Pick and slice chains to keep the maximum number of samples
     if discard_tuned_samples:
         traces, length = _choose_chains(traces, tune)
     else:
         traces, length = _choose_chains(traces, 0)
     mtrace = MultiTrace(traces)[:length]
-
     # count the number of tune/draw iterations that happened
     # ideally via the "tune" statistic, but not all samplers record it!
     if "tune" in mtrace.stat_names:

tests/backends/test_zarr.py (+111)
@@ -19,6 +19,8 @@
 import pytest
 import zarr

+from arviz import InferenceData
+
 import pymc as pm

 from pymc.backends.zarr import ZarrTrace
@@ -357,3 +359,112 @@ def test_split_warmup(tune, model, model_step, include_transformed):
         if len(dims) >= 2 and dims[1] == "draw":
             assert sample_stats_array.shape[1] == draws
             assert trace.root["warmup_sample_stats"][var_name].shape[1] == tune
+
+
+@pytest.fixture(scope="function", params=[True, False])
+def discard_tuned_samples(request):
+    return request.param
+
+
+@pytest.fixture(scope="function", params=[True, False])
+def return_inferencedata(request):
+    return request.param
+
+
+@pytest.fixture(scope="function", params=[True, False])
+def keep_warning_stat(request):
+    return request.param
+
+
+@pytest.fixture(scope="function", params=[True, False])
+def parallel(request):
+    return request.param
+
+
+@pytest.fixture(scope="function", params=[True, False])
+def log_likelihood(request):
+    return request.param
+
+
+def test_sample(
+    model,
+    model_step,
+    include_transformed,
+    discard_tuned_samples,
+    return_inferencedata,
+    keep_warning_stat,
+    parallel,
+    log_likelihood,
+    draws_per_chunk,
+):
+    if not return_inferencedata and not log_likelihood:
+        pytest.skip(
+            reason="log_likelihood is only computed if an inference data object is returned"
+        )
+    store = zarr.MemoryStore()
+    trace = ZarrTrace(
+        store=store, include_transformed=include_transformed, draws_per_chunk=draws_per_chunk
+    )
+    tune = 2
+    draws = 3
+    if parallel:
+        chains = 2
+        cores = 2
+    else:
+        chains = 1
+        cores = 1
+    with model:
+        out_trace = pm.sample(
+            draws=draws,
+            tune=tune,
+            chains=chains,
+            cores=cores,
+            trace=trace,
+            step=model_step,
+            discard_tuned_samples=discard_tuned_samples,
+            return_inferencedata=return_inferencedata,
+            keep_warning_stat=keep_warning_stat,
+            idata_kwargs={"log_likelihood": log_likelihood},
+        )
+
+    if not return_inferencedata:
+        assert isinstance(out_trace, ZarrTrace)
+        assert out_trace.root.store is trace.root.store
+    else:
+        assert isinstance(out_trace, InferenceData)
+
+    expected_groups = {"posterior", "constant_data", "observed_data", "sample_stats"}
+    if include_transformed:
+        expected_groups |= {"unconstrained_posterior"}
+    if not return_inferencedata or not discard_tuned_samples:
+        expected_groups |= {"warmup_posterior", "warmup_sample_stats"}
+        if include_transformed:
+            expected_groups |= {"warmup_unconstrained_posterior"}
+    if not return_inferencedata:
+        expected_groups |= {"_sampling_state"}
+    elif log_likelihood:
+        expected_groups |= {"log_likelihood"}
+    assert set(out_trace.groups()) == expected_groups
+
+    if return_inferencedata:
+        warning_stat = (
+            "sampler_1__warning" if isinstance(model_step, CompoundStep) else "sampler_0__warning"
+        )
+        if keep_warning_stat:
+            assert warning_stat in out_trace.sample_stats
+        else:
+            assert warning_stat not in out_trace.sample_stats
+
+    # Assert that all variables have non empty samples (not NaNs)
+    if return_inferencedata:
+        assert all(
+            (not np.any(np.isnan(v))) and v.shape[:2] == (chains, draws)
+            for v in out_trace.posterior.data_vars.values()
+        )
+    else:
+        dimensions = {*model.coords, "a_dim_0", "a_dim_1", "chain", "draw"}
+        assert all(
+            (not np.any(np.isnan(v[:]))) and v.shape[:2] == (chains, draws)
+            for name, v in out_trace.posterior.arrays()
+            if name not in dimensions
+        )
