Add HyperGeometric Distribution to pymc3.distributions.discrete #4108 (#4249)

Harivallabha · tirthasheshpatel · web-flow · commit 198d13e2ed6f · 2020-12-03T18:30:37.000+01:00
* Add HyperGeometric distribution to discrete.py; Add tests

* Add HyperGeo to distirbutions/__init__.py

* Fix minor linting issue

* Add ref_rand helper function. Clip lower in logp

* Fix bug. Now pymc3_matches_scipy runs without error but pymc3_random_discrete diverges from expected value

* passes match with scipy test in test_distributions.py but fails in test_distributions_random.py

* Modify HyperGeom.random; Random test still failing. match_with_scipy test passing

* rm stray print

* Fix failing random test by specifying domain

* Update pymc3/distributions/discrete.py

Remove stray newline

Co-authored-by: Tirth Patel &lt;tirthasheshpatel@gmail.com&gt;

* Add note in RELEASE-NOTES.md

Co-authored-by: Tirth Patel &lt;tirthasheshpatel@gmail.com&gt;
diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
@@ -35,6 +35,7 @@ This new version of `Theano-PyMC` comes with an experimental JAX backend which,
 - Change SMC metropolis kernel to independent metropolis kernel [#4115](https://github.com/pymc-devs/pymc3/pull/4115))
 - Add alternative parametrization to NegativeBinomial distribution in terms of n and p (see [#4126](https://github.com/pymc-devs/pymc3/issues/4126))
 - Added semantically meaningful `str` representations to PyMC3 objects for console, notebook, and GraphViz use (see [#4076](https://github.com/pymc-devs/pymc3/pull/4076), [#4065](https://github.com/pymc-devs/pymc3/pull/4065), [#4159](https://github.com/pymc-devs/pymc3/pull/4159), [#4217](https://github.com/pymc-devs/pymc3/pull/4217), and [#4243](https://github.com/pymc-devs/pymc3/pull/4243)).
+- Add Discrete HyperGeometric Distribution (see [#4249](https://github.com/pymc-devs/pymc3/pull/#4249))
 
 ### Maintenance
 - Switch the dependency of Theano to our own fork, [Theano-PyMC](https://github.com/pymc-devs/Theano-PyMC).
diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py
@@ -63,6 +63,7 @@
 from .discrete import ZeroInflatedBinomial
 from .discrete import DiscreteUniform
 from .discrete import Geometric
+from .discrete import HyperGeometric
 from .discrete import Categorical
 from .discrete import OrderedLogistic
 
@@ -141,6 +142,7 @@
     "ZeroInflatedBinomial",
     "DiscreteUniform",
     "Geometric",
+    "HyperGeometric",
     "Categorical",
     "OrderedLogistic",
     "DensityDist",
diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
@@ -38,6 +38,7 @@
     "ZeroInflatedNegativeBinomial",
     "DiscreteUniform",
     "Geometric",
+    "HyperGeometric",
     "Categorical",
     "OrderedLogistic",
 ]
@@ -809,6 +810,118 @@ def logp(self, value):
         return bound(tt.log(p) + logpow(1 - p, value - 1), 0 <= p, p <= 1, value >= 1)
 
 
+class HyperGeometric(Discrete):
+    R"""
+    Discrete hypergeometric distribution.
+
+    The probability of :math:`x` successes in a sequence of :math:`n` bernoulli
+    trials taken without replacement from a population of :math:`N` objects,
+    containing :math:`k` good (or successful or Type I) objects.
+    The pmf of this distribution is
+
+    .. math:: f(x \mid N, n, k) = \frac{\binom{k}{x}\binom{N-k}{n-x}}{\binom{N}{n}}
+
+    .. plot::
+
+        import matplotlib.pyplot as plt
+        import numpy as np
+        import scipy.stats as st
+        plt.style.use('seaborn-darkgrid')
+        x = np.arange(1, 15)
+        N = 50
+        k = 10
+        for n in [20, 25]:
+            pmf = st.hypergeom.pmf(x, N, k, n)
+            plt.plot(x, pmf, '-o', label='n = {}'.format(n))
+        plt.plot(x, pmf, '-o', label='N = {}'.format(N))
+        plt.plot(x, pmf, '-o', label='k = {}'.format(k))
+        plt.xlabel('x', fontsize=12)
+        plt.ylabel('f(x)', fontsize=12)
+        plt.legend(loc=1)
+        plt.show()
+
+    ========  =============================
+    Support   :math:`x \in \left[\max(0, n - N + k), \min(k, n)\right]`
+    Mean      :math:`\dfrac{nk}{N}`
+    Variance  :math:`\dfrac{(N-n)nk(N-k)}{(N-1)N^2}`
+    ========  =============================
+
+    Parameters
+    ----------
+    N : integer
+        Total size of the population
+    k : integer
+        Number of successful individuals in the population
+    n : integer
+        Number of samples drawn from the population
+    """
+
+    def __init__(self, N, k, n, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.N = intX(N)
+        self.k = intX(k)
+        self.n = intX(n)
+        self.mode = intX(tt.floor((n + 1) * (k + 1) / (N + 2)))
+
+    def random(self, point=None, size=None):
+        r"""
+        Draw random values from HyperGeometric distribution.
+
+        Parameters
+        ----------
+        point : dict, optional
+            Dict of variable values on which random values are to be
+            conditioned (uses default point if not specified).
+        size : int, optional
+            Desired size of random sample (returns one sample if not
+            specified).
+
+        Returns
+        -------
+        array
+        """
+
+        N, k, n = draw_values([self.N, self.k, self.n], point=point, size=size)
+        return generate_samples(self._random, N, k, n, dist_shape=self.shape, size=size)
+
+    def _random(self, M, n, N, size=None):
+        r"""Wrapper around scipy stat's hypergeom.rvs"""
+        try:
+            samples = stats.hypergeom.rvs(M=M, n=n, N=N, size=size)
+            return samples
+        except ValueError:
+            raise ValueError("Domain error in arguments")
+
+    def logp(self, value):
+        r"""
+        Calculate log-probability of HyperGeometric distribution at specified value.
+
+        Parameters
+        ----------
+        value : numeric
+            Value(s) for which log-probability is calculated. If the log probabilities for multiple
+            values are desired the values must be provided in a numpy array or theano tensor
+
+        Returns
+        -------
+        TensorVariable
+        """
+        N = self.N
+        k = self.k
+        n = self.n
+        tot, good = N, k
+        bad = tot - good
+        result = (
+            betaln(good + 1, 1)
+            + betaln(bad + 1, 1)
+            + betaln(tot - n + 1, n + 1)
+            - betaln(value + 1, good - value + 1)
+            - betaln(n - value + 1, bad - n + value + 1)
+            - betaln(tot + 1, 1)
+        )
+        return result
+
+
 class DiscreteUniform(Discrete):
     R"""
     Discrete uniform distribution.
diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
@@ -75,6 +75,7 @@
     Rice,
     Kumaraswamy,
     Moyal,
+    HyperGeometric,
 )
 
 from ..distributions import continuous
@@ -790,6 +791,14 @@ def test_geometric(self):
             Geometric, Nat, {"p": Unit}, lambda value, p: np.log(sp.geom.pmf(value, p))
         )
 
+    def test_hypergeometric(self):
+        self.pymc3_matches_scipy(
+            HyperGeometric,
+            Nat,
+            {"N": NatSmall, "k": NatSmall, "n": NatSmall},
+            lambda value, N, k, n: sp.hypergeom.logpmf(value, N, k, n),
+        )
+
     def test_negative_binomial(self):
         def test_fun(value, mu, alpha):
             return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha))
diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py
@@ -507,6 +507,11 @@ class TestGeometric(BaseTestCases.BaseTestCase):
     params = {"p": 0.5}
 
 
+class TestHyperGeometric(BaseTestCases.BaseTestCase):
+    distribution = pm.HyperGeometric
+    params = {"N": 50, "k": 25, "n": 10}
+
+
 class TestMoyal(BaseTestCases.BaseTestCase):
     distribution = pm.Moyal
     params = {"mu": 0.0, "sigma": 1.0}
@@ -739,6 +744,22 @@ def ref_rand(size, alpha, mu):
     def test_geometric(self):
         pymc3_random_discrete(pm.Geometric, {"p": Unit}, size=500, fails=50, ref_rand=nr.geometric)
 
+    def test_hypergeometric(self):
+        def ref_rand(size, N, k, n):
+            return st.hypergeom.rvs(M=N, n=k, N=n, size=size)
+
+        pymc3_random_discrete(
+            pm.HyperGeometric,
+            {
+                "N": Domain([10, 11, 12, 13], "int64"),
+                "k": Domain([4, 5, 6, 7], "int64"),
+                "n": Domain([6, 7, 8, 9], "int64"),
+            },
+            size=500,
+            fails=50,
+            ref_rand=ref_rand,
+        )
+
     def test_discrete_uniform(self):
         def ref_rand(size, lower, upper):
             return st.randint.rvs(lower, upper + 1, size=size)