From ddf561753cfda4794b2130b1420ffb149e33d567 Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Tue, 24 Nov 2020 02:24:37 +0530 Subject: [PATCH 01/11] Add HyperGeometric distribution to discrete.py; Add tests --- pymc3/distributions/__init__.py | 1 + pymc3/distributions/discrete.py | 110 +++++++++++++++++++++++ pymc3/tests/test_distributions.py | 9 ++ pymc3/tests/test_distributions_random.py | 7 ++ 4 files changed, 127 insertions(+) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index 2a671d3293..f84deef3fe 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -63,6 +63,7 @@ from .discrete import ZeroInflatedBinomial from .discrete import DiscreteUniform from .discrete import Geometric +from .discrete import HyperGeometric from .discrete import Categorical from .discrete import OrderedLogistic diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index fd2d399618..3cd31157cf 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -38,6 +38,7 @@ "ZeroInflatedNegativeBinomial", "DiscreteUniform", "Geometric", + "HyperGeometric", "Categorical", "OrderedLogistic", ] @@ -809,6 +810,115 @@ def logp(self, value): return bound(tt.log(p) + logpow(1 - p, value - 1), 0 <= p, p <= 1, value >= 1) +class HyperGeometric(Discrete): + R""" + Discrete hypergeometric distribution. + + The probability of :math:`x` successes in a sequence of :math:`n` bernoulli + trials taken without replacement from a population of :math:`N` objects, + containing :math:`k` good (or successful or Type I) objects. + The pmf of this distribution is + + .. math:: f(x \mid N, n, k) = \frac{\binom{k}{x}\binom{N-k}{n-x}}{\binom{N}{n}} + + .. plot:: + + import matplotlib.pyplot as plt + import numpy as np + import scipy.stats as st + plt.style.use('seaborn-darkgrid') + x = np.arange(1, 15) + N = 50 + k = 10 + for n in [20, 25]: + pmf = st.hypergeom.pmf(x, N, k, n) + plt.plot(x, pmf, '-o', label='n = {}'.format(n)) + plt.plot(x, pmf, '-o', label='N = {}'.format(N)) + plt.plot(x, pmf, '-o', label='k = {}'.format(k)) + plt.xlabel('x', fontsize=12) + plt.ylabel('f(x)', fontsize=12) + plt.legend(loc=1) + plt.show() + + ======== ============================= + + Support :math:`x in [max(0, n - \mathbb{N} + k), min(k, n)]` + Mean :math:`\dfrac{nk}{N}` + Variance :math:`\dfrac{(N-n)nk(N-k)}{(N-1)N^2}` + ======== ============================= + + Parameters + ---------- + N : integer + Total size of the population + n : integer + Number of samples drawn from the population + k : integer + Number of successful individuals in the population + """ + + def __init__(self, N, k, n, *args, **kwargs): + super().__init__(*args, **kwargs) + self.N = intX(N) + self.k = intX(k) + self.n = intX(n) + self.mode = intX(tt.floor((n + 1) * (k + 1) / (N + 2))) + + def random(self, point=None, size=None): + r""" + Draw random values from HyperGeometric distribution. + + Parameters + ---------- + point : dict, optional + Dict of variable values on which random values are to be + conditioned (uses default point if not specified). + size : int, optional + Desired size of random sample (returns one sample if not + specified). + + Returns + ------- + array + """ + N, n, k = draw_values([self.N, self.n, self.k], point=point, size=size) + return generate_samples( + np.random.hypergeometric, N, n, k, dist_shape=self.shape, size=size + ) + + def logp(self, value): + r""" + Calculate log-probability of HyperGeometric distribution at specified value. + + Parameters + ---------- + value : numeric + Value(s) for which log-probability is calculated. If the log probabilities for multiple + values are desired the values must be provided in a numpy array or theano tensor + + Returns + ------- + TensorVariable + """ + N = self.N + k = self.k + n = self.n + tot, good = N, k + bad = tot - good + result = ( + betaln(good + 1, 1) + + betaln(bad + 1, 1) + + betaln(tot - n + 1, n + 1) + - betaln(value + 1, good - value + 1) + - betaln(n - value + 1, bad - n + value + 1) + - betaln(tot + 1, 1) + ) + lower = tt.switch(tt.gt(n - N + k, 0), n - N + k, 0) + upper = tt.switch(tt.lt(k, n), k, n) + nonint_value = (value != intX(tt.floor(value))) + return bound(result, lower <= value, value <= upper, nonint_value) + + class DiscreteUniform(Discrete): R""" Discrete uniform distribution. diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 04d9b0e3c0..12a3fbc037 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -75,6 +75,7 @@ Rice, Kumaraswamy, Moyal, + HyperGeometric, ) from ..distributions import continuous @@ -790,6 +791,14 @@ def test_geometric(self): Geometric, Nat, {"p": Unit}, lambda value, p: np.log(sp.geom.pmf(value, p)) ) + def test_hypergeometric(self): + self.pymc3_matches_scipy( + HyperGeometric, + Nat, + {"N": NatSmall, "n": NatSmall, "k": NatSmall}, + lambda value, N, n, k: sp.hypergeom.logpmf(value, N, k, n), + ) + def test_negative_binomial(self): def test_fun(value, mu, alpha): return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha)) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 4daa1857b8..14e9e72596 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -506,6 +506,10 @@ class TestGeometric(BaseTestCases.BaseTestCase): distribution = pm.Geometric params = {"p": 0.5} +class TestHyperGeometric(BaseTestCases.BaseTestCase): + distribution = pm.HyperGeometric + params = {"N": 50, "n": 25, "k": 10} + class TestMoyal(BaseTestCases.BaseTestCase): distribution = pm.Moyal @@ -739,6 +743,9 @@ def ref_rand(size, alpha, mu): def test_geometric(self): pymc3_random_discrete(pm.Geometric, {"p": Unit}, size=500, fails=50, ref_rand=nr.geometric) + def test_hypergeometric(self): + pymc3_random_discrete(pm.HyperGeometric, {"N": Nat, "n": Nat, "k": Nat}, size=500, fails=50, ref_rand=nr.hypergeometric) + def test_discrete_uniform(self): def ref_rand(size, lower, upper): return st.randint.rvs(lower, upper + 1, size=size) From dfb0440c1a2b26426a6decb2b366fcd11001b5c0 Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Tue, 24 Nov 2020 02:56:03 +0530 Subject: [PATCH 02/11] Add HyperGeo to distirbutions/__init__.py --- pymc3/distributions/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index f84deef3fe..b4b56d749a 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -142,6 +142,7 @@ "ZeroInflatedBinomial", "DiscreteUniform", "Geometric", + "HyperGeometric", "Categorical", "OrderedLogistic", "DensityDist", From 3426e36bc6a1cec4876653a357694a3437e7ba31 Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Tue, 24 Nov 2020 03:05:54 +0530 Subject: [PATCH 03/11] Fix minor linting issue --- pymc3/tests/test_distributions_random.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 14e9e72596..9030a3a878 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -506,6 +506,7 @@ class TestGeometric(BaseTestCases.BaseTestCase): distribution = pm.Geometric params = {"p": 0.5} + class TestHyperGeometric(BaseTestCases.BaseTestCase): distribution = pm.HyperGeometric params = {"N": 50, "n": 25, "k": 10} From ae9fb7c95ce01327f795a38f2402bfd6b2c1f9b3 Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Thu, 26 Nov 2020 00:43:50 +0530 Subject: [PATCH 04/11] Add ref_rand helper function. Clip lower in logp --- pymc3/distributions/discrete.py | 2 +- pymc3/tests/test_distributions_random.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 3cd31157cf..f3438387d9 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -913,7 +913,7 @@ def logp(self, value): - betaln(n - value + 1, bad - n + value + 1) - betaln(tot + 1, 1) ) - lower = tt.switch(tt.gt(n - N + k, 0), n - N + k, 0) + lower = tt.clip(n - N + k, 0, n - N + k) upper = tt.switch(tt.lt(k, n), k, n) nonint_value = (value != intX(tt.floor(value))) return bound(result, lower <= value, value <= upper, nonint_value) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 9030a3a878..b4cf5d57e5 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -745,7 +745,16 @@ def test_geometric(self): pymc3_random_discrete(pm.Geometric, {"p": Unit}, size=500, fails=50, ref_rand=nr.geometric) def test_hypergeometric(self): - pymc3_random_discrete(pm.HyperGeometric, {"N": Nat, "n": Nat, "k": Nat}, size=500, fails=50, ref_rand=nr.hypergeometric) + def ref_rand(size, N, n, k): + return nr.hypergeometric(ngood=k, nbad=N-k, nsample=n, size=size) + + pymc3_random_discrete( + pm.HyperGeometric, + {"N": Nat, "n": Nat, "k": Nat}, + size=500, + fails=50, + ref_rand=ref_rand, + ) def test_discrete_uniform(self): def ref_rand(size, lower, upper): From 164a8f249be716dde6a2c51bda812bfd29aadff5 Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Thu, 26 Nov 2020 02:03:27 +0530 Subject: [PATCH 05/11] Fix bug. Now pymc3_matches_scipy runs without error but pymc3_random_discrete diverges from expected value --- pymc3/distributions/discrete.py | 15 +++++---------- pymc3/tests/test_distributions.py | 4 ++-- pymc3/tests/test_distributions_random.py | 12 ++++++------ 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index f3438387d9..15e8ab4935 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -851,10 +851,10 @@ class HyperGeometric(Discrete): ---------- N : integer Total size of the population - n : integer - Number of samples drawn from the population k : integer Number of successful individuals in the population + n : integer + Number of samples drawn from the population """ def __init__(self, N, k, n, *args, **kwargs): @@ -881,10 +881,8 @@ def random(self, point=None, size=None): ------- array """ - N, n, k = draw_values([self.N, self.n, self.k], point=point, size=size) - return generate_samples( - np.random.hypergeometric, N, n, k, dist_shape=self.shape, size=size - ) + N, k, n = draw_values([self.N, self.k, self.n], point=point, size=size) + return generate_samples(np.random.hypergeometric, N, k, n, dist_shape=self.shape, size=size) def logp(self, value): r""" @@ -913,10 +911,7 @@ def logp(self, value): - betaln(n - value + 1, bad - n + value + 1) - betaln(tot + 1, 1) ) - lower = tt.clip(n - N + k, 0, n - N + k) - upper = tt.switch(tt.lt(k, n), k, n) - nonint_value = (value != intX(tt.floor(value))) - return bound(result, lower <= value, value <= upper, nonint_value) + return result class DiscreteUniform(Discrete): diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 12a3fbc037..f7210264f1 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -795,8 +795,8 @@ def test_hypergeometric(self): self.pymc3_matches_scipy( HyperGeometric, Nat, - {"N": NatSmall, "n": NatSmall, "k": NatSmall}, - lambda value, N, n, k: sp.hypergeom.logpmf(value, N, k, n), + {"N": NatSmall, "k": NatSmall, "n": NatSmall}, + lambda value, N, k, n: sp.hypergeom.logpmf(value, N, k, n), ) def test_negative_binomial(self): diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index b4cf5d57e5..5f3cb13683 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -745,14 +745,14 @@ def test_geometric(self): pymc3_random_discrete(pm.Geometric, {"p": Unit}, size=500, fails=50, ref_rand=nr.geometric) def test_hypergeometric(self): - def ref_rand(size, N, n, k): - return nr.hypergeometric(ngood=k, nbad=N-k, nsample=n, size=size) + def ref_rand(size, N, k, n): + return st.hypergeom.rvs(M=N, n=k, N=n, size=size) pymc3_random_discrete( - pm.HyperGeometric, - {"N": Nat, "n": Nat, "k": Nat}, - size=500, - fails=50, + pm.HyperGeometric, + {"N": Nat, "k": Nat, "n": Nat}, + size=100, + fails=50, ref_rand=ref_rand, ) From f4edd8d8c291bb615b1c6b9725ab7b6495b2c2ec Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Thu, 26 Nov 2020 02:18:13 +0530 Subject: [PATCH 06/11] passes match with scipy test in test_distributions.py but fails in test_distributions_random.py --- pymc3/tests/test_distributions_random.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 5f3cb13683..ac9cdd080f 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -509,7 +509,7 @@ class TestGeometric(BaseTestCases.BaseTestCase): class TestHyperGeometric(BaseTestCases.BaseTestCase): distribution = pm.HyperGeometric - params = {"N": 50, "n": 25, "k": 10} + params = {"N": 50, "k": 25, "n": 10} class TestMoyal(BaseTestCases.BaseTestCase): @@ -751,8 +751,8 @@ def ref_rand(size, N, k, n): pymc3_random_discrete( pm.HyperGeometric, {"N": Nat, "k": Nat, "n": Nat}, - size=100, - fails=50, + size=500, + fails=100, ref_rand=ref_rand, ) From 6cd35fd3c36c737bd9e2b9811d88ffb22c05499a Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Thu, 3 Dec 2020 18:19:18 +0530 Subject: [PATCH 07/11] Modify HyperGeom.random; Random test still failing. match_with_scipy test passing --- pymc3/distributions/discrete.py | 11 ++++++++++- pymc3/tests/test_distributions_random.py | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 15e8ab4935..802add9a23 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -881,8 +881,17 @@ def random(self, point=None, size=None): ------- array """ + N, k, n = draw_values([self.N, self.k, self.n], point=point, size=size) - return generate_samples(np.random.hypergeometric, N, k, n, dist_shape=self.shape, size=size) + return generate_samples(self._random, N, k, n, dist_shape=self.shape, size=size) + + def _random(self, M, n, N, size=None): + r"""Wrapper around scipy stat's hypergeom.rvs""" + try: + samples = stats.hypergeom.rvs(M=M, n=n, N=N, size=size) + return samples + except ValueError: + raise ValueError("Domain error in arguments") def logp(self, value): r""" diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index ac9cdd080f..74c0301c4f 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -109,6 +109,7 @@ def pymc3_random_discrete( p = 1.0 else: _, p = st.chisquare(k[:, 0], k[:, 1]) + print(p) f -= 1 assert p > alpha, str(pt) @@ -752,7 +753,7 @@ def ref_rand(size, N, k, n): pm.HyperGeometric, {"N": Nat, "k": Nat, "n": Nat}, size=500, - fails=100, + fails=50, ref_rand=ref_rand, ) From 71508a0ca976621b059d56619ee836242523d5a7 Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Thu, 3 Dec 2020 18:39:39 +0530 Subject: [PATCH 08/11] rm stray print --- pymc3/tests/test_distributions_random.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index 74c0301c4f..c1c72284d3 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -109,7 +109,6 @@ def pymc3_random_discrete( p = 1.0 else: _, p = st.chisquare(k[:, 0], k[:, 1]) - print(p) f -= 1 assert p > alpha, str(pt) From b0da512ce7d87b30fac0721ffe80a8544df84e3f Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Thu, 3 Dec 2020 19:48:46 +0530 Subject: [PATCH 09/11] Fix failing random test by specifying domain --- pymc3/distributions/discrete.py | 2 +- pymc3/tests/test_distributions_random.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 802add9a23..154319026d 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -842,7 +842,7 @@ class HyperGeometric(Discrete): ======== ============================= - Support :math:`x in [max(0, n - \mathbb{N} + k), min(k, n)]` + Support :math:`x \in \left[\max(0, n - N + k), \min(k, n)\right]` Mean :math:`\dfrac{nk}{N}` Variance :math:`\dfrac{(N-n)nk(N-k)}{(N-1)N^2}` ======== ============================= diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index c1c72284d3..84bd8bc117 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -750,7 +750,11 @@ def ref_rand(size, N, k, n): pymc3_random_discrete( pm.HyperGeometric, - {"N": Nat, "k": Nat, "n": Nat}, + { + "N": Domain([10, 11, 12, 13], "int64"), + "k": Domain([4, 5, 6, 7], "int64"), + "n": Domain([6, 7, 8, 9], "int64"), + }, size=500, fails=50, ref_rand=ref_rand, From a9f13bc1791dafc71df2aa0ccd4cbd909ea556fd Mon Sep 17 00:00:00 2001 From: Hari Date: Thu, 3 Dec 2020 21:24:29 +0530 Subject: [PATCH 10/11] Update pymc3/distributions/discrete.py Remove stray newline Co-authored-by: Tirth Patel --- pymc3/distributions/discrete.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 154319026d..d88c9bcd21 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -841,7 +841,6 @@ class HyperGeometric(Discrete): plt.show() ======== ============================= - Support :math:`x \in \left[\max(0, n - N + k), \min(k, n)\right]` Mean :math:`\dfrac{nk}{N}` Variance :math:`\dfrac{(N-n)nk(N-k)}{(N-1)N^2}` From 3e9c589de5642a6904be61034a9365f0293d6702 Mon Sep 17 00:00:00 2001 From: Hariv Rang Date: Thu, 3 Dec 2020 21:36:19 +0530 Subject: [PATCH 11/11] Add note in RELEASE-NOTES.md --- RELEASE-NOTES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 91227e409d..3f73b652c4 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -35,6 +35,7 @@ This new version of `Theano-PyMC` comes with an experimental JAX backend which, - Change SMC metropolis kernel to independent metropolis kernel [#4115](https://github.com/pymc-devs/pymc3/pull/4115)) - Add alternative parametrization to NegativeBinomial distribution in terms of n and p (see [#4126](https://github.com/pymc-devs/pymc3/issues/4126)) - Added semantically meaningful `str` representations to PyMC3 objects for console, notebook, and GraphViz use (see [#4076](https://github.com/pymc-devs/pymc3/pull/4076), [#4065](https://github.com/pymc-devs/pymc3/pull/4065), [#4159](https://github.com/pymc-devs/pymc3/pull/4159), [#4217](https://github.com/pymc-devs/pymc3/pull/4217), and [#4243](https://github.com/pymc-devs/pymc3/pull/4243)). +- Add Discrete HyperGeometric Distribution (see [#4249](https://github.com/pymc-devs/pymc3/pull/#4249)) ### Maintenance - Switch the dependency of Theano to our own fork, [Theano-PyMC](https://github.com/pymc-devs/Theano-PyMC).