
Commit 8d9d3d8

lucianopaz authored and twiecki committed
Changed Categorical to work with multidim p at the logp level (#3383)
* Changed Categorical to work with multidim p at the logp level.
* Fixed problems with OrderedLogistic.
* Use np.moveaxis instead of transposing. Also added some more tests.
1 parent f541c5a · commit 8d9d3d8

File tree

5 files changed: +69 -12 lines changed


RELEASE-NOTES.md (+3)

@@ -23,6 +23,9 @@
 - Fixed incorrect usage of `broadcast_distribution_samples` in `DiscreteWeibull`.
 - `Mixture`'s default dtype is now determined by `theano.config.floatX`.
 - `dist_math.random_choice` now handles nd-arrays of category probabilities, and also handles sizes that are not `None`. Also removed the unused `k` kwarg from `dist_math.random_choice`.
+- Changed `Categorical.mode` to preserve all the dimensions of `p` except the last one, which encodes each category's probability.
+- Changed initialization of `Categorical.p`. `p` is now normalized to sum to `1` inside `logp` and `random`, but not during initialization. Normalizing during initialization could hide negative values supplied to `p`, as mentioned in #2082.
+- To be able to test for negative `p` values supplied to `Categorical`, `Categorical.logp` was changed to check that `sum(self.p, axis=-1) == 1` only if `self.p` is not a `Number`, `np.ndarray`, `TensorConstant` or `SharedVariable`. Those cases are normalized automatically to sum to `1`; any other `p` may originate from a `step_method` proposal, where the value of the `self.p` tensor may be set but must nevertheless sum to 1. This may break old code that initialized `p` with a theano expression and relied on the default normalization to get it to sum to 1. `Categorical.logp` now also checks that the used `p` has values no greater than 1.

 ### Deprecations
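As a quick illustration of the headline change (a sketch only, not part of the commit; the shapes mirror the tests added in pymc3/tests/test_distributions_random.py below), `p` may now carry arbitrary leading batch dimensions, with the last axis indexing the categories:

import numpy as np
import pymc3 as pm

# A (3, 7) batch of 5-category probability vectors.
d = pm.Categorical.dist(p=np.ones((3, 7, 5)))
print(d.mode.eval().shape)  # (3, 7): one mode per batch entry
print(d.random().shape)     # (3, 7): one draw per batch entry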

pymc3/distributions/discrete.py (+27 -8)

@@ -1,3 +1,4 @@
+import numbers
 import numpy as np
 import theano
 import theano.tensor as tt
@@ -710,11 +711,17 @@ def __init__(self, p, *args, **kwargs):
         except AttributeError:
             self.k = tt.shape(p)[-1]
         p = tt.as_tensor_variable(floatX(p))
-        self.p = (p.T / tt.sum(p, -1)).T
-        self.mode = tt.argmax(p)
+
+        # From #2082, it may be dangerous to automatically rescale p at this
+        # point without checking for positiveness
+        self.p = p
+        self.mode = tt.argmax(p, axis=-1)
+        if self.mode.ndim == 1:
+            self.mode = tt.squeeze(self.mode)

     def random(self, point=None, size=None):
         p, k = draw_values([self.p, self.k], point=point, size=size)
+        p = p / np.sum(p, axis=-1, keepdims=True)

         return generate_samples(random_choice,
                                 p=p,
@@ -723,21 +730,33 @@ def random(self, point=None, size=None):
                                 size=size)

     def logp(self, value):
-        p = self.p
+        p_ = self.p
         k = self.k

         # Clip values before using them for indexing
         value_clip = tt.clip(value, 0, k - 1)

-        sumto1 = theano.gradient.zero_grad(
-            tt.le(abs(tt.sum(p, axis=-1) - 1), 1e-5))
+        # We must only check that the values sum to 1 if p comes from a
+        # tensor variable, i.e. when p is a step_method proposal. In the
+        # other cases we normalize ourselves
+        if not isinstance(p_, (numbers.Number,
+                               np.ndarray,
+                               tt.TensorConstant,
+                               tt.sharedvar.SharedVariable)):
+            sumto1 = theano.gradient.zero_grad(
+                tt.le(abs(tt.sum(p_, axis=-1) - 1), 1e-5))
+            p = p_
+        else:
+            p = p_ / tt.sum(p_, axis=-1, keepdims=True)
+            sumto1 = True

         if p.ndim > 1:
-            a = tt.log(p[tt.arange(p.shape[0]), value_clip])
+            a = tt.log(np.moveaxis(p, -1, 0)[value_clip])
         else:
             a = tt.log(p[value_clip])

-        return bound(a, value >= 0, value <= (k - 1), sumto1)
+        return bound(a, value >= 0, value <= (k - 1), sumto1,
+                     tt.all(p_ > 0, axis=-1), tt.all(p <= 1, axis=-1))

     def _repr_latex_(self, name=None, dist=None):
         if dist is None:
@@ -1177,7 +1196,7 @@ def __init__(self, eta, cutpoints, *args, **kwargs):
             tt.zeros_like(tt.shape_padright(pa[:, 0])),
             pa,
             tt.ones_like(tt.shape_padright(pa[:, 0]))
-        ], axis=1)
+        ], axis=-1)
         p = p_cum[:, 1:] - p_cum[:, :-1]

         super().__init__(p=p, *args, **kwargs)
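The np.moveaxis trick in logp is easier to see in plain NumPy (an illustrative sketch, not code from the commit): moving the category axis to the front makes "the probability of category k for every batch entry" a single leading-axis lookup, regardless of how many batch dimensions p has, which the old `p[tt.arange(p.shape[0]), value_clip]` indexing could only do for 2d p:

import numpy as np

rng = np.random.RandomState(42)
p = rng.dirichlet(np.ones(5), size=(3, 7))  # batch shape (3, 7), 5 categories

k = 2
# np.moveaxis(p, -1, 0) has shape (5, 3, 7); indexing its first axis picks
# category k's probability across the whole (3, 7) batch at once.
assert np.allclose(np.moveaxis(p, -1, 0)[k], p[..., k])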

pymc3/tests/test_distribution_defaults.py (+8 -1)

@@ -1,5 +1,5 @@
 from ..model import Model
-from ..distributions import DiscreteUniform, Continuous
+from ..distributions import DiscreteUniform, Continuous, Categorical

 import numpy as np
 import pytest
@@ -67,3 +67,10 @@ def test_discrete_uniform_negative():
     with model:
         x = DiscreteUniform('x', lower=-10, upper=0)
     assert model.test_point['x'] == -5
+
+
+def test_categorical_mode():
+    model = Model()
+    with model:
+        x = Categorical('x', p=np.eye(4), shape=4)
+    assert np.allclose(model.test_point['x'], np.arange(4))
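The expected test point follows from the new batched mode. A NumPy sketch of the difference (assuming the same argmax semantics as theano's tt.argmax):

import numpy as np

p = np.eye(4)
print(np.argmax(p))           # 0: old behavior, argmax over the flattened array
print(np.argmax(p, axis=-1))  # [0 1 2 3]: new behavior, one mode per row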

pymc3/tests/test_distributions.py (+28 -3)

@@ -321,7 +321,7 @@ def dirichlet_logpdf(value, a):

 def categorical_logpdf(value, p):
     if value >= 0 and value <= len(p):
-        return floatX(np.log(p[value]))
+        return floatX(np.log(np.moveaxis(p, -1, 0)[value]))
     else:
         return -inf

@@ -346,8 +346,10 @@ def invlogit(x, eps=sys.float_info.epsilon):

 def orderedlogistic_logpdf(value, eta, cutpoints):
     c = np.concatenate(([-np.inf], cutpoints, [np.inf]))
-    p = invlogit(eta - c[value]) - invlogit(eta - c[value + 1])
-    return np.log(p)
+    ps = np.array([invlogit(eta - cc) - invlogit(eta - cc1)
+                   for cc, cc1 in zip(c[:-1], c[1:])])
+    p = ps[value]
+    return np.where(np.all(ps > 0), np.log(p), -np.inf)

 class Simplex:
     def __init__(self, n):
@@ -1079,6 +1081,29 @@ def test_categorical_bounds(self):
         assert np.isinf(x.logp({'x': -1}))
         assert np.isinf(x.logp({'x': 3}))

+    def test_categorical_valid_p(self):
+        with Model():
+            x = Categorical('x', p=np.array([-0.2, 0.3, 0.5]))
+            assert np.isinf(x.logp({'x': 0}))
+            assert np.isinf(x.logp({'x': 1}))
+            assert np.isinf(x.logp({'x': 2}))
+        with Model():
+            # A model where p sums to 1 but contains negative values
+            x = Categorical('x', p=np.array([-0.2, 0.7, 0.5]))
+            assert np.isinf(x.logp({'x': 0}))
+            assert np.isinf(x.logp({'x': 1}))
+            assert np.isinf(x.logp({'x': 2}))
+        with Model():
+            # Hard edge case from #2082: early automatic normalization of p
+            # would hide the negative entries, because when every nonzero
+            # value is negative, dividing by the (negative) sum flips the
+            # signs into a valid-looking simplex
+            x = Categorical('x', p=np.array([-1, -1, 0, 0]))
+            assert np.isinf(x.logp({'x': 0}))
+            assert np.isinf(x.logp({'x': 1}))
+            assert np.isinf(x.logp({'x': 2}))
+            assert np.isinf(x.logp({'x': 3}))
+
     @pytest.mark.parametrize('n', [2, 3, 4])
     def test_categorical(self, n):
         self.pymc3_matches_scipy(Categorical, Domain(range(n), 'int64'), {'p': Simplex(n)},
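The arithmetic behind the "hard edge case" comment above (a sketch, not commit code): when all the nonzero entries of p are negative, normalizing at initialization divides by a negative sum and produces a valid-looking simplex, so the bad input can no longer be detected:

import numpy as np

p = np.array([-1., -1., 0., 0.])
print(p / p.sum())  # [ 0.5  0.5 -0. -0. ]: the negative entries vanish
# With normalization deferred to logp, bound(..., tt.all(p_ > 0, axis=-1), ...)
# still sees the raw p and returns -inf, as the test asserts.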

pymc3/tests/test_distributions_random.py (+3)

@@ -444,6 +444,9 @@ def test_probability_vector_shape(self):
         p = np.ones((10, 5))
         assert pm.Categorical.dist(p=p).random().shape == (10,)
         assert pm.Categorical.dist(p=p).random(size=4).shape == (4, 10)
+        p = np.ones((3, 7, 5))
+        assert pm.Categorical.dist(p=p).random().shape == (3, 7)
+        assert pm.Categorical.dist(p=p).random(size=4).shape == (4, 3, 7)


 class TestScalarParameterSamples(SeededTest):
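These shape expectations rely on the keepdims=True normalization added in random() above: keeping the summed axis as length 1 lets broadcasting rescale every batch entry's probability vector independently, however many leading dimensions p has. A minimal NumPy sketch:

import numpy as np

rng = np.random.RandomState(0)
p = rng.rand(3, 7, 5)
p_norm = p / np.sum(p, axis=-1, keepdims=True)  # still shape (3, 7, 5)
assert np.allclose(p_norm.sum(axis=-1), 1.0)    # each 5-vector sums to 1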
