Commit 4afdb22

Merge pull request statsmodels#8780 from josef-pkt/ref_getdistribution
REF: get_distribution, return 1-d instead of column frozen distribution
2 parents dda8e3d + 992d72c commit 4afdb22

File tree

10 files changed: +128 additions, -18 deletions

statsmodels/discrete/count_model.py

Lines changed: 6 additions & 3 deletions
@@ -727,7 +727,8 @@ def get_distribution(self, params, exog=None, exog_infl=None,
         w = self.predict(params, exog=exog, exog_infl=exog_infl,
                          exposure=exposure, offset=offset, which="prob-main")

-        distr = self.distribution(mu[:, None], 1 - w[:, None])
+        # distr = self.distribution(mu[:, None], 1 - w[:, None])
+        distr = self.distribution(mu, 1 - w)
         return distr

@@ -842,7 +843,8 @@ def get_distribution(self, params, exog=None, exog_infl=None,
                           exposure=exposure, offset=offset, which="mean-main")
         w = self.predict(params, exog=exog, exog_infl=exog_infl,
                          exposure=exposure, offset=offset, which="prob-main")
-        distr = self.distribution(mu[:, None], params[-1], p, 1 - w[:, None])
+        # distr = self.distribution(mu[:, None], params[-1], p, 1 - w[:, None])
+        distr = self.distribution(mu, params[-1], p, 1 - w)
         return distr

@@ -958,7 +960,8 @@ def get_distribution(self, params, exog=None, exog_infl=None,
         w = self.predict(params, exog=exog, exog_infl=exog_infl,
                          exposure=exposure, offset=offset, which="prob-main")

-        distr = self.distribution(mu[:, None], params[-1], p, 1 - w[:, None])
+        # distr = self.distribution(mu[:, None], params[-1], p, 1 - w[:, None])
+        distr = self.distribution(mu, params[-1], p, 1 - w)
         return distr

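Note: the shape change above is easiest to see with a plain frozen scipy distribution. A minimal sketch follows, using scipy.stats.poisson as a stand-in for the model's zero-inflated distribution (the stand-in is an assumption for illustration): freezing on a 1-d parameter array yields 1-d pmf and rvs output, where the old column shape produced 2-d arrays.

    import numpy as np
    from scipy import stats

    mu = np.array([1.0, 2.0, 3.0])           # one mean per observation

    distr_new = stats.poisson(mu)            # new: frozen on 1-d params
    distr_old = stats.poisson(mu[:, None])   # old: frozen on a column

    print(distr_new.pmf(1).shape)            # (3,)   one value per observation
    print(distr_old.pmf(1).shape)            # (3, 1) trailing axis leaks out
    print(distr_new.rvs(size=3).shape)       # (3,)
    print(distr_old.rvs(size=(3, 1)).shape)  # (3, 1)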
statsmodels/discrete/discrete_model.py

Lines changed: 18 additions & 8 deletions
@@ -677,7 +677,8 @@ def get_distribution(self, params, exog=None, offset=None):
             Instance of frozen scipy distribution.
         """
         mu = self.predict(params, exog=exog, offset=offset)
-        distr = stats.bernoulli(mu[:, None])
+        # distr = stats.bernoulli(mu[:, None])
+        distr = stats.bernoulli(mu)
         return distr

@@ -1659,7 +1660,7 @@ def predict(self, params, exog=None, exposure=None, offset=None,
                               exposure=exposure, offset=offset,
                               )[:, None]
             # uses broadcasting
-            return stats.poisson.pmf(y_values, mu)
+            return stats.poisson._pmf(y_values, mu)
         else:
             raise ValueError('Value of the `which` option is not recognized')

@@ -2281,7 +2282,8 @@ def get_distribution(self, params, exog=None, exposure=None, offset=None):
         """
         mu = self.predict(params, exog=exog, exposure=exposure, offset=offset)
         p = self.parameterization + 1
-        distr = genpoisson_p(mu[:, None], params[-1], p)
+        # distr = genpoisson_p(mu[:, None], params[-1], p)
+        distr = genpoisson_p(mu, params[-1], p)
         return distr

@@ -3587,8 +3589,13 @@ def predict(self, params, exog=None, exposure=None, offset=None,
                 offset=offset
             )
             if y_values is None:
-                y_values = np.atleast_2d(np.arange(0, np.max(self.endog)+1))
-                return distr.pmf(y_values)
+                y_values = np.arange(0, np.max(self.endog) + 1)
+            else:
+                y_values = np.asarray(y_values)
+
+            assert y_values.ndim == 1
+            y_values = y_values[..., None]
+            return distr.pmf(y_values).T

         exog, offset, exposure = self._get_predict_arrays(
             exog=exog,

@@ -3757,7 +3764,8 @@ def get_distribution(self, params, exog=None, exposure=None, offset=None):
         """
         mu = self.predict(params, exog=exog, exposure=exposure, offset=offset)
         if self.loglike_method == 'geometric':
-            distr = stats.geom(1 / (1 + mu[:, None]), loc=-1)
+            # distr = stats.geom(1 / (1 + mu[:, None]), loc=-1)
+            distr = stats.geom(1 / (1 + mu), loc=-1)
         else:
             if self.loglike_method == 'nb2':
                 p = 2

@@ -3768,7 +3776,8 @@ def get_distribution(self, params, exog=None, exposure=None, offset=None):
             q = 2 - p
             size = 1. / alpha * mu**q
             prob = size / (size + mu)
-            distr = nbinom(size[:, None], prob[:, None])
+            # distr = nbinom(size[:, None], prob[:, None])
+            distr = nbinom(size, prob)

         return distr

@@ -4343,7 +4352,8 @@ def get_distribution(self, params, exog=None, exposure=None, offset=None):
         """
         mu = self.predict(params, exog=exog, exposure=exposure, offset=offset)
         size, prob = self.convert_params(params, mu)
-        distr = nbinom(size[:, None], prob[:, None])
+        # distr = nbinom(size[:, None], prob[:, None])
+        distr = nbinom(size, prob)
         return distr

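Note: the rewritten predict code evaluates a 1-d frozen distribution on a column of counts and then transposes, so probabilities come back as (nobs, n_values). A sketch of that broadcasting pattern, again with scipy.stats.poisson as an assumed stand-in:

    import numpy as np
    from scipy import stats

    mu = np.array([1.0, 2.0, 3.0])             # nobs = 3
    distr = stats.poisson(mu)                  # frozen on 1-d parameters

    y_values = np.arange(5)                    # counts 0..4
    probs = distr.pmf(y_values[..., None]).T   # (5, 1) against (3,) -> (5, 3), then T

    print(probs.shape)                                          # (3, 5)
    print(np.allclose(probs.sum(1), stats.poisson.cdf(4, mu)))  # True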
statsmodels/discrete/tests/test_conditional.py

Lines changed: 3 additions & 3 deletions
@@ -20,12 +20,12 @@ def test_logit_1d():
     for x in -1, 0, 1, 2:
         params = np.r_[x, ]
         _, grad = model._denom_grad(0, params)
-        ngrad = approx_fprime(params, lambda x: model._denom(0, x))
+        ngrad = approx_fprime(params, lambda x: model._denom(0, x)).squeeze()
         assert_allclose(grad, ngrad)

     # Check the gradient for the loglikelihood
     for x in -1, 0, 1, 2:
-        grad = approx_fprime(np.r_[x, ], model.loglike)
+        grad = approx_fprime(np.r_[x, ], model.loglike).squeeze()
         score = model.score(np.r_[x, ])
         assert_allclose(grad, score, rtol=1e-4)

@@ -117,7 +117,7 @@ def test_poisson_1d():

     # Check the gradient for the loglikelihood
     for x in -1, 0, 1, 2:
-        grad = approx_fprime(np.r_[x, ], model.loglike)
+        grad = approx_fprime(np.r_[x, ], model.loglike).squeeze()
         score = model.score(np.r_[x, ])
         assert_allclose(grad, score, rtol=1e-4)

statsmodels/discrete/tests/test_predict.py

Lines changed: 8 additions & 0 deletions
@@ -359,6 +359,7 @@ def test_distr(case):
     # res = mod.fit()
     params_dgp = params
     distr = mod.get_distribution(params_dgp)
+    assert distr.pmf(1).ndim == 1
     try:
         y2 = distr.rvs(size=(nobs, 1)).squeeze()
     except ValueError:

@@ -378,7 +379,14 @@ def test_distr(case):
     assert_allclose(res.resid_pearson, (y2 - mean) / np.sqrt(var_), rtol=1e-13)

     if not issubclass(cls_model, BinaryModel):
+        # smoke, shape, consistency test
+        probs = res.predict(which="prob", y_values=np.arange(5))
+        assert probs.shape == (len(mod.endog), 5)
+        probs2 = res.get_prediction(
+            which="prob", y_values=np.arange(5), average=True)
+        assert_allclose(probs2.predicted, probs.mean(0), rtol=1e-10)
         dia = res.get_diagnostic()
+        dia.probs_predicted
         # fig = dia.plot_probs();
         # fig.suptitle(cls_model.__name__ + repr(kwds), fontsize=16)

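Note: the added assertions pin down the public contract that which="prob" returns an (nobs, n_values) array and that get_prediction(..., average=True) averages it over observations. A usage sketch on simulated data, assuming a statsmodels version that includes this commit:

    import numpy as np
    import statsmodels.api as sm

    rng = np.random.default_rng(12345)
    x = sm.add_constant(rng.normal(size=(200, 1)))
    y = rng.poisson(np.exp(0.5 + 0.5 * x[:, 1]))

    res = sm.Poisson(y, x).fit(disp=0)
    probs = res.predict(which="prob", y_values=np.arange(5))
    print(probs.shape)                                   # (200, 5)

    pred = res.get_prediction(which="prob", y_values=np.arange(5), average=True)
    print(np.allclose(pred.predicted, probs.mean(0)))    # True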
statsmodels/discrete/truncated_model.py

Lines changed: 4 additions & 3 deletions
@@ -383,12 +383,13 @@ def predict(self, params, exog=None, exposure=None, offset=None,
             return probs
         elif which == 'prob-base':
             if y_values is not None:
-                counts = np.atleast_2d(y_values)
+                counts = np.asarray(y_values)
             else:
-                counts = np.atleast_2d(np.arange(0, np.max(self.endog)+1))
+                counts = np.arange(0, np.max(self.endog)+1)
+
             probs = self.model_main.predict(
                 params, exog=exog, exposure=np.exp(exposure),
-                offset=offset, which="prob", y_values=counts)[:, None]
+                offset=offset, which="prob", y_values=counts)
             return probs
         elif which == 'var':
             mu = np.exp(linpred)

statsmodels/genmod/generalized_linear_model.py

Lines changed: 62 additions & 0 deletions
@@ -2149,6 +2149,68 @@ def get_influence(self, observed=True):
             hat_matrix_diag=hat_matrix_diag)
         return infl

+    def get_distribution(self, exog=None, exposure=None,
+                         offset=None, var_weights=1., n_trials=1.):
+        """
+        Return an instance of the predictive distribution.
+
+        Parameters
+        ----------
+        scale : scalar
+            The scale parameter.
+        exog : array_like
+            The predictor variable matrix.
+        offset : array_like or None
+            Offset variable for predicted mean.
+        exposure : array_like or None
+            Log(exposure) will be added to the linear prediction.
+        var_weights : array_like
+            1d array of variance (analytic) weights. The default is None.
+        n_trials : int
+            Number of trials for the binomial distribution. The default is 1
+            which corresponds to a Bernoulli random variable.
+
+        Returns
+        -------
+        gen
+            Instance of a scipy frozen distribution based on estimated
+            parameters.
+            Use the ``rvs`` method to generate random values.
+
+        Notes
+        -----
+        Due to the behavior of ``scipy.stats.distributions`` objects, the
+        returned random number generator must be called with ``gen.rvs(n)``
+        where ``n`` is the number of observations in the data set used
+        to fit the model. If any other value is used for ``n``, misleading
+        results will be produced.
+        """
+        # Note this is mostly a copy of GLM.get_prediction
+        # calling results.predict here avoids the exog check and transform
+
+        if isinstance(self.model.family, (families.Binomial, families.Poisson,
+                                          families.NegativeBinomial)):
+            # use scale=1, independent of QMLE scale for discrete
+            scale = 1.
+            if self.scale != 1.:
+                msg = "using scale=1, no excess dispersion in distribution"
+                warnings.warn(msg, UserWarning)
+        else:
+            scale = self.scale
+
+        mu = self.predict(exog, exposure, offset, which="mean")
+
+        kwds = {}
+        if (np.any(n_trials != 1) and
+                isinstance(self.model.family, families.Binomial)):
+
+            kwds["n_trials"] = n_trials
+
+        distr = self.model.family.get_distribution(
+            mu, scale, var_weights=var_weights, **kwds)
+        return distr
+
+
     @Appender(base.LikelihoodModelResults.remove_data.__doc__)
     def remove_data(self):
         # GLM has alias/reference in result instance

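Note: a usage sketch for the new GLMResults.get_distribution on simulated data (variable names are illustrative). For a Poisson family the returned object is a frozen scipy distribution over all observations, so its mean matches res.predict() and, per the docstring, rvs should be called with n equal to nobs:

    import numpy as np
    import statsmodels.api as sm

    rng = np.random.default_rng(987)
    nobs = 500
    x = sm.add_constant(rng.normal(size=(nobs, 1)))
    y = rng.poisson(np.exp(0.25 + 0.5 * x[:, 1]))

    res = sm.GLM(y, x, family=sm.families.Poisson()).fit()
    distr = res.get_distribution()                     # frozen scipy distribution
    y_sim = distr.rvs(nobs)                            # one draw per observation

    print(np.allclose(distr.mean(), res.predict()))    # True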
statsmodels/genmod/tests/test_glm.py

Lines changed: 10 additions & 0 deletions
@@ -255,6 +255,15 @@ def test_get_distribution(self):
         var_ = res1.predict(which="var_unscaled")
         assert_allclose(var_ * res_scale, var_endog, rtol=1e-13)

+        # check get_distribution of results instance
+        if getattr(self, "has_edispersion", False):
+            with pytest.warns(UserWarning, match="using scale=1"):
+                distr3 = res1.get_distribution()
+        else:
+            distr3 = res1.get_distribution()
+        for k in distr2.kwds:
+            assert_allclose(distr3.kwds[k], distr2.kwds[k], rtol=1e-13)
+

 class CheckComparisonMixin:

@@ -865,6 +874,7 @@ def setup_class(cls):
         res2 = Committee()
         res2.aic_R += 2  # They do not count a degree of freedom for the scale
         cls.res2 = res2
+        cls.has_edispersion = True

     # FIXME: enable or delete
     # def setup_method(self):

statsmodels/othermod/betareg.py

Lines changed: 2 additions & 0 deletions
@@ -134,6 +134,8 @@ def __init__(self, endog, exog, exog_precision=None,
         self._init_keys.extend(['exog_precision'])
         self._init_keys.extend(['link', 'link_precision'])
         self._null_drop_keys = ['exog_precision']
+        del kwds['extra_params_names']
+        self._check_kwargs(kwds)
         self.results_class = BetaResults
         self.results_class_wrapper = BetaResultsWrapper

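Note: the two added lines make BetaModel report unknown keyword arguments instead of silently swallowing them; extra_params_names is consumed internally, so it is removed before the remaining kwds are checked. A sketch of the resulting behavior on simulated data (the junk keyword is deliberately bogus):

    import numpy as np
    import pytest
    from statsmodels.othermod.betareg import BetaModel
    from statsmodels.tools.sm_exceptions import ValueWarning

    rng = np.random.default_rng(0)
    exog = np.column_stack([np.ones(100), rng.normal(size=100)])
    endog = rng.beta(2.0, 2.0, size=100)   # responses in (0, 1)

    with pytest.warns(ValueWarning, match="unknown kwargs"):
        BetaModel(endog, exog, junk=True)  # mistyped keyword now warns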
statsmodels/othermod/tests/test_beta.py

Lines changed: 11 additions & 0 deletions
@@ -2,11 +2,16 @@
 import io
 import os

+import pytest
+
 import numpy as np
 from numpy.testing import assert_allclose, assert_equal
 import pandas as pd
 import patsy
 from statsmodels.api import families
+from statsmodels.tools.sm_exceptions import (
+    ValueWarning,
+)
 from statsmodels.othermod.betareg import BetaModel
 from .results import results_betareg as resultsb

@@ -126,6 +131,12 @@ def test_precision_formula(self):
         assert_close(rslt.params, self.meth_fit.params, 1e-10)
         assert isinstance(rslt.params, pd.Series)

+        with pytest.warns(ValueWarning, match="unknown kwargs"):
+            BetaModel.from_formula(self.model, methylation,
+                                   exog_precision_formula='~ age',
+                                   link_precision=links.Identity(),
+                                   junk=False)
+
     def test_scores(self):
         model, Z = self.model, self.Z
         for link in (links.Identity(), links.Log()):

statsmodels/tools/numdiff.py

Lines changed: 4 additions & 1 deletion
@@ -158,7 +158,10 @@ def approx_fprime(x, f, epsilon=None, args=(), kwargs={}, centered=False):
                         f(*((x-ei,)+args), **kwargs))/(2 * epsilon[k])
             ei[k] = 0.0

-    return grad.squeeze().T
+    if n == 1:
+        return grad.T
+    else:
+        return grad.squeeze().T


 def _approx_fprime_scalar(x, f, epsilon=None, args=(), kwargs={},

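Note: approx_fprime now keeps a 2-d (k, 1) gradient when the parameter vector has length one instead of squeezing that axis away, which is why the conditional-model tests above add .squeeze(). A shape sketch, assuming this version of numdiff:

    import numpy as np
    from statsmodels.tools.numdiff import approx_fprime

    def f(params):
        # vector-valued function of a single parameter
        return np.exp(params[0] * np.arange(3.0))

    grad = approx_fprime(np.array([0.5]), f)
    print(grad.shape)             # (3, 1): no squeeze when n == 1
    print(grad.squeeze().shape)   # (3,): what the updated tests use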
0 commit comments

Comments
 (0)