
Commit 2733504

Merge pull request statsmodels#8502 from josef-pkt/ref_doc_diagnostic

REF/DOC Poisson diagnostic

2 parents bcaba34 + cbff096, commit 2733504

File tree

8 files changed, +367 -225 lines changed


examples/notebooks/postestimation_poisson.ipynb

+66 -54 (large diff not rendered by default)

statsmodels/discrete/_diagnostics_count.py

+194
@@ -8,6 +8,8 @@
 import numpy as np
 from scipy import stats
 
+import pandas as pd
+
 from statsmodels.stats.base import HolderTuple
 from statsmodels.discrete.discrete_model import Poisson
 from statsmodels.regression.linear_model import OLS
@@ -227,6 +229,198 @@ def test_chisquare_prob(results, probs, bin_edges=None, method=None):
     return res
 
 
+class DispersionResults(HolderTuple):
+
+    def summary_frame(self):
+        frame = pd.DataFrame({
+            "statistic": self.statistic,
+            "pvalue": self.pvalue,
+            "method": self.method,
+            "alternative": self.alternative
+        })
+
+        return frame
+
+
+def test_poisson_dispersion(results, method="all", _old=False):
+    """Score/LM type tests for Poisson variance assumptions
+
+    Null Hypothesis is
+
+    H0: var(y) = E(y) and assuming E(y) is correctly specified
+    H1: var(y) ~= E(y)
+
+    The tests are based on the constrained model, i.e. the Poisson model.
+    The tests differ in their assumed alternatives, and in their maintained
+    assumptions.
+
+    Parameters
+    ----------
+    results : Poisson results instance
+        This can be a results instance for either a discrete Poisson or a GLM
+        with family Poisson.
+    method : str
+        Not used yet. Currently results for all methods are returned.
+    _old : bool
+        Temporary keyword for backwards compatibility, will be removed
+        in future version of statsmodels.
+
+    Returns
+    -------
+    res : instance
+        The instance of DispersionResults has the hypothesis test results,
+        statistic, pvalue, method, alternative, as main attributes and a
+        summary_frame method that returns the results as pandas DataFrame.
+
+    """
+
+    if method not in ["all"]:
+        raise ValueError(f'unknown method "{method}"')
+
+    if hasattr(results, '_results'):
+        results = results._results
+
+    endog = results.model.endog
+    nobs = endog.shape[0]  # TODO: use attribute, may need to be added
+    fitted = results.predict()
+    # fitted = results.fittedvalues  # discrete has linear prediction
+    # this assumes Poisson
+    resid2 = results.resid_response**2
+    var_resid_endog = (resid2 - endog)
+    var_resid_fitted = (resid2 - fitted)
+    std1 = np.sqrt(2 * (fitted**2).sum())
+
+    var_resid_endog_sum = var_resid_endog.sum()
+    dean_a = var_resid_fitted.sum() / std1
+    dean_b = var_resid_endog_sum / std1
+    dean_c = (var_resid_endog / fitted).sum() / np.sqrt(2 * nobs)
+
+    pval_dean_a = 2 * stats.norm.sf(np.abs(dean_a))
+    pval_dean_b = 2 * stats.norm.sf(np.abs(dean_b))
+    pval_dean_c = 2 * stats.norm.sf(np.abs(dean_c))
+
+    results_all = [[dean_a, pval_dean_a],
+                   [dean_b, pval_dean_b],
+                   [dean_c, pval_dean_c]]
+    description = [['Dean A', 'mu (1 + a mu)'],
+                   ['Dean B', 'mu (1 + a mu)'],
+                   ['Dean C', 'mu (1 + a)']]
+
+    # Cameron Trived auxiliary regression page 78 count book 1989
+    endog_v = var_resid_endog / fitted
+    res_ols_nb2 = OLS(endog_v, fitted).fit(use_t=False)
+    stat_ols_nb2 = res_ols_nb2.tvalues[0]
+    pval_ols_nb2 = res_ols_nb2.pvalues[0]
+    results_all.append([stat_ols_nb2, pval_ols_nb2])
+    description.append(['CT nb2', 'mu (1 + a mu)'])
+
+    res_ols_nb1 = OLS(endog_v, fitted).fit(use_t=False)
+    stat_ols_nb1 = res_ols_nb1.tvalues[0]
+    pval_ols_nb1 = res_ols_nb1.pvalues[0]
+    results_all.append([stat_ols_nb1, pval_ols_nb1])
+    description.append(['CT nb1', 'mu (1 + a)'])
+
+    endog_v = var_resid_endog / fitted
+    res_ols_nb2 = OLS(endog_v, fitted).fit(cov_type='HC3', use_t=False)
+    stat_ols_hc1_nb2 = res_ols_nb2.tvalues[0]
+    pval_ols_hc1_nb2 = res_ols_nb2.pvalues[0]
+    results_all.append([stat_ols_hc1_nb2, pval_ols_hc1_nb2])
+    description.append(['CT nb2 HC3', 'mu (1 + a mu)'])
+
+    res_ols_nb1 = OLS(endog_v, np.ones(len(endog_v))).fit(cov_type='HC3',
+                                                          use_t=False)
+    stat_ols_hc1_nb1 = res_ols_nb1.tvalues[0]
+    pval_ols_hc1_nb1 = res_ols_nb1.pvalues[0]
+    results_all.append([stat_ols_hc1_nb1, pval_ols_hc1_nb1])
+    description.append(['CT nb1 HC3', 'mu (1 + a)'])
+
+    results_all = np.array(results_all)
+    if _old:
+        # for backwards compatibility in 0.14, remove in later versions
+        return results_all, description
+    else:
+        res = DispersionResults(
+            statistic=results_all[:, 0],
+            pvalue=results_all[:, 1],
+            method=[i[0] for i in description],
+            alternative=[i[1] for i in description],
+            name="Poisson Dispersion Test"
+        )
+        return res
+
+
+def _test_poisson_dispersion_generic(
+        results,
+        exog_new_test,
+        exog_new_control=None,
+        include_score=False,
+        use_endog=True,
+        cov_type='HC3',
+        cov_kwds=None,
+        use_t=False
+        ):
+    """A variable addition test for the variance function
+
+    This uses an artificial regression to calculate a variant of an LM or
+    generalized score test for the specification of the variance assumption
+    in a Poisson model. The performed test is a Wald test on the coefficients
+    of the `exog_new_test`.
+
+    Warning: insufficiently tested, especially for options
+    """
+
+    if hasattr(results, '_results'):
+        results = results._results
+
+    endog = results.model.endog
+    nobs = endog.shape[0]  # TODO: use attribute, may need to be added
+    # fitted = results.fittedvalues  # generic has linpred as fittedvalues
+    fitted = results.predict()
+    resid2 = results.resid_response**2
+    # the following assumes Poisson
+    if use_endog:
+        var_resid = (resid2 - endog)
+    else:
+        var_resid = (resid2 - fitted)
+
+    endog_v = var_resid / fitted
+
+    k_constraints = exog_new_test.shape[1]
+    ex_list = [exog_new_test]
+    if include_score:
+        score_obs = results.model.score_obs(results.params)
+        ex_list.append(score_obs)
+
+    if exog_new_control is not None:
+        ex_list.append(score_obs)
+
+    if len(ex_list) > 1:
+        ex = np.column_stack(ex_list)
+        use_wald = True
+    else:
+        ex = ex_list[0]  # no control variables in exog
+        use_wald = False
+
+    res_ols = OLS(endog_v, ex).fit(cov_type=cov_type, cov_kwds=cov_kwds,
+                                   use_t=use_t)
+
+    if use_wald:
+        # we have controls and need to test coefficients
+        k_vars = ex.shape[1]
+        constraints = np.eye(k_constraints, k_vars)
+        ht = res_ols.wald_test(constraints)
+        stat_ols = ht.statistic
+        pval_ols = ht.pvalue
+    else:
+        # we do not have controls and can use overall fit
+        nobs = endog_v.shape[0]
+        rsquared_noncentered = 1 - res_ols.ssr/res_ols.uncentered_tss
+        stat_ols = nobs * rsquared_noncentered
+        pval_ols = stats.chi2.sf(stat_ols, k_constraints)
+
+    return stat_ols, pval_ols
+
+
 def test_poisson_zeroinflation_jh(results_poisson, exog_infl=None):
     """score test for zero inflation or deflation in Poisson
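
For orientation, here is a minimal usage sketch of the new `test_poisson_dispersion` function and the `DispersionResults.summary_frame` method added above. The simulated data, variable names, and parameter values are illustrative assumptions, not part of the commit.

```python
# Minimal sketch (not from the commit): exercising test_poisson_dispersion
# on simulated Poisson data.  The data generation is purely illustrative.
import numpy as np
from statsmodels.discrete.discrete_model import Poisson
from statsmodels.discrete._diagnostics_count import test_poisson_dispersion

rng = np.random.default_rng(12345)
nobs = 500
exog = np.column_stack((np.ones(nobs), rng.normal(size=nobs)))
endog = rng.poisson(np.exp(exog @ np.array([0.5, 0.2])))

res_poisson = Poisson(endog, exog).fit(disp=0)
disp_test = test_poisson_dispersion(res_poisson)

# statistic, pvalue, method and alternative for all reported tests
print(disp_test.summary_frame())
```

The resulting DataFrame has one row per reported test: Dean A, B, C plus the four Cameron-Trivedi style auxiliary regressions (nb1/nb2, with and without HC3).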

statsmodels/discrete/diagnostic.py

+8 -8
@@ -13,15 +13,12 @@
 
 from statsmodels.tools.decorators import cache_readonly
 
-from statsmodels.stats._diagnostic_other import (
-    dispersion_poisson,
-    # dispersion_poisson_generic,
-    )
-
 from statsmodels.stats.diagnostic_gen import (
     test_chisquare_binning
     )
 from statsmodels.discrete._diagnostics_count import (
+    test_poisson_dispersion,
+    # _test_poisson_dispersion_generic,
     test_poisson_zeroinflation_jh,
     test_poisson_zeroinflation_broek,
     test_poisson_zeros,
@@ -37,7 +34,10 @@ class CountDiagnostic:
 
     Parameters
     ----------
-    results : PoissonResults instance
+    results : Results instance of a count model.
+    y_max : int
+        Largest count to include when computing predicted probabilities for
+        counts. Default is the largest observed count.
 
     """
 
@@ -48,7 +48,7 @@ def __init__(self, results, y_max=None):
     @cache_readonly
     def probs_predicted(self):
         if self.y_max is not None:
-            kwds = {"y_values": np.arange(self.y_max)}
+            kwds = {"y_values": np.arange(self.y_max + 1)}
         else:
             kwds = {}
         return self.results.predict(which="prob", **kwds)
@@ -147,7 +147,7 @@ def test_dispersion(self):
         -------
         dispersion results
         """
-        res = dispersion_poisson(self.results)
+        res = test_poisson_dispersion(self.results)
         return res
 
     def test_poisson_zeroinflation(self, method="prob", exog_infl=None):
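
The wrapper changes above can be exercised with a short self-contained sketch. The simulated data are an illustrative assumption; the `get_diagnostic` accessor is the one used in the test files changed by this commit.

```python
# Minimal sketch (not from the commit): dispersion test and truncated
# predicted probabilities through the Poisson diagnostic wrapper.
import numpy as np
from statsmodels.discrete.discrete_model import Poisson

rng = np.random.default_rng(0)
exog = np.column_stack((np.ones(300), rng.normal(size=300)))
endog = rng.poisson(np.exp(exog @ np.array([0.5, 0.2])))
res = Poisson(endog, exog).fit(disp=0)

dia = res.get_diagnostic(y_max=10)

# with the arange(y_max + 1) fix, counts 0..y_max are covered: 11 columns
print(dia.probs_predicted.shape)

# test_dispersion now returns a DispersionResults instance
print(dia.test_dispersion().summary_frame())
```

Passing `_old=True` to the underlying `test_poisson_dispersion` still returns the previous `(array, description)` tuple, but only as a temporary backwards-compatibility path.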

statsmodels/discrete/tests/test_diagnostic.py

+3 -2
@@ -116,8 +116,9 @@ def test_spec_tests(self):
 
         respoi = Poisson(self.endog, self.exog).fit(disp=0)
         dia = PoissonDiagnostic(respoi)
-        t_disp = dia.test_dispersion()[0]
-        assert_allclose(t_disp, res_dispersion, rtol=1e-8)
+        t_disp = dia.test_dispersion()
+        res_disp = np.column_stack(((t_disp.statistic, t_disp.pvalue)))
+        assert_allclose(res_disp, res_dispersion, rtol=1e-8)
 
         nobs = self.endog.shape[0]
         t_zi_jh = dia.test_poisson_zeroinflation(method="broek",

statsmodels/discrete/tests/test_predict.py

+1 -1
@@ -153,7 +153,7 @@ def test_diagnostic(self):
         dia = res1.get_diagnostic(y_max=21)
         res_chi2 = dia.test_chisquare_prob(bin_edges=np.arange(4))
         assert_equal(res_chi2.diff1.shape[1], 3)
-        assert_equal(dia.probs_predicted.shape[1], 21)
+        assert_equal(dia.probs_predicted.shape[1], 22)
 
         try:
             dia.plot_probs(upp_xlim=20)

statsmodels/genmod/tests/test_score_test.py

+5 -4
@@ -13,6 +13,7 @@
 from statsmodels.genmod import families
 from statsmodels.discrete.discrete_model import Poisson
 import statsmodels.stats._diagnostic_other as diao
+import statsmodels.discrete._diagnostics_count as diac
 from statsmodels.base._parameter_inference import score_test
 
 
@@ -110,19 +111,19 @@ def setup_class(cls):
         cls.model_full = GLM(y, xx, family=families.Poisson())
         cls.model_drop = GLM(y, x, family=families.Poisson())
 
-
     def test_dispersion(self):
         res_drop = self.model_drop.fit()
-        res_test = diao.dispersion_poisson(res_drop)
-        assert_allclose(res_test[0], self.res_disptest, rtol=1e-6, atol=1e-14)
+        res_test = diac.test_poisson_dispersion(res_drop)
+        res_test_ = np.column_stack((res_test.statistic, res_test.pvalue))
+        assert_allclose(res_test_, self.res_disptest, rtol=1e-6, atol=1e-14)
         # constant only dispersion
         ex = np.ones((res_drop.model.endog.shape[0], 1))
         # ex = np.column_stack((np.ones(res_drop.model.endog.shape[0]),
         #                       res_drop.predict()))  # or **2
         # dispersion_poisson_generic might not be correct
         # or not clear what the alternative hypothesis is
         # choosing different `ex` implies different alternative hypotheses
-        res_test = diao.dispersion_poisson_generic(res_drop, ex)
+        res_test = diac._test_poisson_dispersion_generic(res_drop, ex)
         assert_allclose(res_test, self.res_disptest_g, rtol=1e-6, atol=1e-14)
 
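
As a companion to the updated test, a small self-contained sketch of calling the renamed private helper `_test_poisson_dispersion_generic`. The GLM setup and the constant-only `ex` mirror the test above; the simulated data are an illustrative assumption.

```python
# Minimal sketch (not from the commit): variable-addition dispersion test via
# the private helper, with a constant-only alternative as in the test above.
import numpy as np
import statsmodels.discrete._diagnostics_count as diac
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

rng = np.random.default_rng(0)
exog = np.column_stack((np.ones(200), rng.normal(size=200)))
endog = rng.poisson(np.exp(exog @ np.array([0.1, 0.3])))

res_drop = GLM(endog, exog, family=families.Poisson()).fit()

# constant-only `ex`; different choices of `ex` imply different alternatives
ex = np.ones((res_drop.model.endog.shape[0], 1))
stat, pval = diac._test_poisson_dispersion_generic(res_drop, ex)
print(stat, pval)
```

With no control variables and `include_score=False`, the helper takes the `use_wald=False` branch and reports the nobs times uncentered R-squared statistic against a chi-square reference rather than a Wald test.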
