add weighted ppc #2273

Merged · 9 commits · Jun 27, 2017
2 changes: 2 additions & 0 deletions docs/source/examples.rst
@@ -12,6 +12,8 @@ Howto
notebooks/sampler-stats.ipynb
notebooks/Diagnosing_biased_Inference_with_Divergences.ipynb
notebooks/posterior_predictive.ipynb
notebooks/model_comparison.ipynb
notebooks/model_averaging.ipynb
notebooks/howto_debugging.ipynb
notebooks/PyMC3_tips_and_heuristic.ipynb
notebooks/LKJ.ipynb
495 changes: 495 additions & 0 deletions docs/source/notebooks/model_averaging.ipynb

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions pymc3/examples/data/milk.csv
@@ -0,0 +1,18 @@
kcal.per.g,neocortex,log_mass
0.490,0.552,0.668
0.470,0.645,1.658
0.560,0.645,1.681
0.890,0.676,0.920
0.920,0.688,-0.386
0.800,0.589,-2.120
0.460,0.617,-0.755
0.710,0.603,-1.139
0.680,0.700,0.438
0.970,0.704,1.176
0.840,0.734,2.510
0.620,0.675,1.681
0.540,0.713,3.569
0.490,0.726,4.375
0.480,0.702,3.707
0.550,0.763,3.500
0.710,0.755,4.006
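
For reference (not part of the diff), a minimal sketch of loading this dataset; the path assumes the repository root as the working directory and pandas as an available dependency:

import pandas as pd

d = pd.read_csv('pymc3/examples/data/milk.csv')
print(d.head())  # columns: kcal.per.g, neocortex, log_mass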
132 changes: 123 additions & 9 deletions pymc3/sampling.py
@@ -18,7 +18,7 @@
import sys
sys.setrecursionlimit(10000)

__all__ = ['sample', 'iter_sample', 'sample_ppc', 'sample_ppc_w', 'init_nuts']

STEP_METHODS = (NUTS, HamiltonianMC, Metropolis, BinaryMetropolis,
BinaryGibbsMetropolis, Slice, CategoricalGibbsMetropolis)
@@ -489,14 +489,15 @@ def _update_start_vals(a, b, model):

a.update({k: v for k, v in b.items() if k not in a})


def sample_ppc(trace, samples=None, model=None, vars=None, size=None,
random_seed=None, progressbar=True):
"""Generate posterior predictive samples from a model given a trace.

Parameters
----------
trace : backend, list, or MultiTrace
Trace generated from MCMC sampling.
samples : int
Number of posterior predictive samples to generate. Defaults to the
length of `trace`
@@ -508,12 +509,19 @@ def sample_ppc(trace, samples=None, model=None, vars=None, size=None,
size : int
The number of random draws from the distribution specified by the
parameters in each sample of the trace.
random_seed : int
Seed for the random number generator.
progressbar : bool
Whether or not to display a progress bar in the command line. The
bar shows the percentage of completion, the sampling speed in
samples per second (SPS), and the estimated remaining time until
completion ("expected time of arrival"; ETA).

Returns
-------
samples : dict
Dictionary with the variables as keys. The values correspond to the
posterior predictive samples.
"""
if samples is None:
samples = len(trace)
@@ -526,18 +534,124 @@ def sample_ppc(trace, samples=None, model=None, vars=None, size=None,

seed(random_seed)

indices = randint(0, len(trace), samples)
if progressbar:
    indices = tqdm(indices, total=samples)

try:
ppc = defaultdict(list)
for idx in indices:
param = trace[idx]
for var in vars:
ppc[var.name].append(var.distribution.random(point=param,
                                              size=size))

except KeyboardInterrupt:
pass

finally:
if progressbar:
indices.close()

return {k: np.asarray(v) for k, v in ppc.items()}
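
For context (not part of the diff), a minimal usage sketch of `sample_ppc`; the model, data, and sample counts here are illustrative assumptions:

import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    y = pm.Normal('y', mu=mu, sd=1., observed=[0.1, -0.3, 0.2])
    trace = pm.sample(1000)
    # draw 500 posterior predictive samples, 10 random draws each
    ppc = pm.sample_ppc(trace, samples=500, size=10)

# ppc['y'] has shape (500, 10, 3): samples x size x observed data shape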


def sample_ppc_w(traces, samples=None, models=None, size=None, weights=None,
random_seed=None, progressbar=True):
"""Generate weighted posterior predictive samples from a list of models and
a list of traces according to a set of weights.

Parameters
----------
traces : list
List of traces generated from MCMC sampling. The number of traces should
be equal to the number of weights.
samples : int
Number of posterior predictive samples to generate. Defaults to the
length of the shortest trace in `traces`.
models : list
List of models used to generate the list of traces. The number of models
should be equal to the number of weights and the number of observed RVs
should be the same for all models.
By default, a single model will be inferred from the `with` context; in
this case, results will only be meaningful if all models share the same
distributions for the observed RVs.
size : int
The number of random draws from the distributions specified by the
parameters in each sample of the trace.
weights : array-like
Individual weights for each trace. Defaults to equal weights for each model.
random_seed : int
Seed for the random number generator.
progressbar : bool
Whether or not to display a progress bar in the command line. The
bar shows the percentage of completion, the sampling speed in
samples per second (SPS), and the estimated remaining time until
completion ("expected time of arrival"; ETA).

Returns
-------
samples : dict
Dictionary with the variables as keys. The values correspond to the
posterior predictive samples from the weighted models.
"""
seed(random_seed)

if models is None:
models = [modelcontext(None)] * len(traces)

if weights is None:
weights = [1] * len(traces)

if len(traces) != len(weights):
raise ValueError('The number of traces and weights should be the same')

if len(models) != len(weights):
raise ValueError('The number of models and weights should be the same')

length_morv = len(models[0].observed_RVs)
if not all(len(i.observed_RVs) == length_morv for i in models):
raise ValueError(
'The number of observed RVs should be the same for all models')

weights = np.asarray(weights)
p = weights / np.sum(weights)

min_tr = min([len(i) for i in traces])

n = (min_tr * p).astype('int')
# ensure n sums to min_tr
idx = np.argmax(n)
n[idx] = n[idx] + min_tr - np.sum(n)
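# e.g. three traces of minimum length 1000 with equal weights give
# p = [1/3, 1/3, 1/3] and n = [333, 333, 333]; the int cast leaves a
# shortfall of 1, which the argmax line assigns to the largest component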

trace = np.concatenate([np.random.choice(traces[i], j)
for i, j in enumerate(n)])

variables = []
for i, m in enumerate(models):
variables.extend(m.observed_RVs * n[i])

len_trace = len(trace)

if samples is None:
samples = len_trace

indices = randint(0, len_trace, samples)

if progressbar:
indices = tqdm(indices, total=samples)

try:
ppc = defaultdict(list)
for idx in indices:
param = trace[idx]
var = variables[idx]
ppc[var.name].append(var.distribution.random(point=param,
size=size))

except KeyboardInterrupt:
pass

finally:
if progressbar:
indices.close()

return {k: np.asarray(v) for k, v in ppc.items()}
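
For context (not part of the diff), a minimal usage sketch of the new `sample_ppc_w`; the two models and the hand-picked weights are illustrative assumptions, and in practice the weights would typically come from a model-comparison step such as WAIC or LOO:

import numpy as np
import pymc3 as pm

y_obs = np.random.normal(0., 1., size=100)

with pm.Model() as model_0:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=y_obs)
    trace_0 = pm.sample(1000)

with pm.Model() as model_1:
    mu = pm.Normal('mu', mu=0., sd=10.)
    sd = pm.HalfNormal('sd', sd=10.)
    pm.Normal('y', mu=mu, sd=sd, observed=y_obs)
    trace_1 = pm.sample(1000)

# both models expose a single observed RV of the same shape, so their
# posterior predictive samples can be mixed according to the weights
ppc_w = pm.sample_ppc_w(traces=[trace_0, trace_1],
                        models=[model_0, model_1],
                        weights=[0.7, 0.3])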