Merge pull request statsmodels#8782 from aglebov/plot_ccf

josef-pkt · web-flow · commit b1de8b79606f · 2023-06-10T20:38:44.000-04:00
ENH/TST: ccf to optionally return confidence intervals
diff --git a/statsmodels/tsa/stattools.py b/statsmodels/tsa/stattools.py
@@ -1045,14 +1045,14 @@ def pacf(x, nlags=None, method="ywadjusted", alpha=None):
 @deprecate_kwarg("unbiased", "adjusted")
 def ccovf(x, y, adjusted=True, demean=True, fft=True):
     """
-    Calculate the crosscovariance between two series.
+    Calculate the cross-covariance between two series.
 
     Parameters
     ----------
     x, y : array_like
        The time series data to use in the calculation.
     adjusted : bool, optional
-       If True, then denominators for crosscovariance is n-k, otherwise n.
+       If True, then denominators for cross-covariance are n-k, otherwise n.
     demean : bool, optional
         Flag indicating whether to demean x and y.
     fft : bool, default True
@@ -1062,7 +1062,9 @@ def ccovf(x, y, adjusted=True, demean=True, fft=True):
     Returns
     -------
     ndarray
-        The estimated crosscovariance function.
+        The estimated cross-covariance function: the element at index k
+        is the covariance between {x[k], x[k+1], ..., x[n-1]} and {y[0], y[1], ..., y[m-k-1]},
+        where n and m are the lengths of x and y, respectively.
     """
     x = array_like(x, "x")
     y = array_like(y, "y")
@@ -1083,11 +1085,11 @@ def ccovf(x, y, adjusted=True, demean=True, fft=True):
         d = n
 
     method = "fft" if fft else "direct"
-    return correlate(xo, yo, "full", method=method)[n - 1 :] / d
+    return correlate(xo, yo, "full", method=method)[n - 1:] / d
 
 
 @deprecate_kwarg("unbiased", "adjusted")
-def ccf(x, y, adjusted=True, fft=True):
+def ccf(x, y, adjusted=True, fft=True, *, nlags=None, alpha=None):
     """
     The cross-correlation function.
 
@@ -1096,27 +1098,54 @@ def ccf(x, y, adjusted=True, fft=True):
     x, y : array_like
         The time series data to use in the calculation.
     adjusted : bool
-        If True, then denominators for cross-correlation is n-k, otherwise n.
+        If True, then denominators for cross-correlation are n-k, otherwise n.
     fft : bool, default True
         If True, use FFT convolution.  This method should be preferred
         for long time series.
+    nlags : int, optional
+        Number of lags to return cross-correlations for. If not provided,
+        the number of lags equals len(x).
+    alpha : float, optional
+        If a number is given, the confidence intervals for the given level are
+        returned. For instance, if alpha=.05, 95% confidence intervals are
+        returned, where the standard deviation is computed as
+        1/sqrt(len(x)).
 
     Returns
     -------
     ndarray
-        The cross-correlation function of x and y.
+        The cross-correlation function of x and y: the element at index k
+        is the correlation between {x[k], x[k+1], ..., x[n-1]} and {y[0], y[1], ..., y[m-k-1]},
+        where n and m are the lengths of x and y, respectively.
+    confint : ndarray, optional
+        Confidence intervals for the CCF at lags 0, 1, ..., nlags-1 using the level given by
+        alpha and the standard deviation calculated as 1/sqrt(len(x)) [1]_. Shape (nlags, 2).
+        Returned if alpha is not None.
 
     Notes
     -----
-    If adjusted is true, the denominator for the autocovariance is adjusted.
+    If adjusted is True, the denominator for the cross-correlation is adjusted.
+
+    References
+    ----------
+    .. [1] Brockwell and Davis, 2016. Introduction to Time Series and
+       Forecasting, 3rd edition, p. 242.
     """
     x = array_like(x, "x")
     y = array_like(y, "y")
     adjusted = bool_like(adjusted, "adjusted")
     fft = bool_like(fft, "fft", optional=False)
 
     cvf = ccovf(x, y, adjusted=adjusted, demean=True, fft=fft)
-    return cvf / (np.std(x) * np.std(y))
+    ret = cvf / (np.std(x) * np.std(y))
+    ret = ret[:nlags]
+
+    if alpha is not None:
+        interval = stats.norm.ppf(1.0 - alpha / 2.0) / np.sqrt(len(x))
+        confint = ret.reshape(-1, 1) + interval * np.array([-1, 1])
+        return ret, confint
+    else:
+        return ret
 
 
 # moved from sandbox.tsa.examples.try_ld_nitime, via nitime
diff --git a/statsmodels/tsa/tests/results/results_ccf.csv b/statsmodels/tsa/tests/results/results_ccf.csv
@@ -0,0 +1,21 @@
+ccf
+-0.143209851132143
+-0.137214062489963
+0.0112277082183379
+-0.112707803670746
+0.0257215540372298
+0.0887760977452586
+0.0599704040558258
+0.0125061572089355
+0.105920661816752
+0.0986774966768683
+-0.0255492787688061
+0.0565448657444209
+-0.00333420978987776
+0.0292053767750115
+0.0245959478684983
+0.0264416873143308
+0.0366034369179944
+-0.0118566255778657
+-0.0528090477545778
+-0.0238963002464966
diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py
@@ -18,6 +18,7 @@
 from pandas import DataFrame, Series, date_range
 import pytest
 from scipy.interpolate import interp1d
+from scipy import stats
 
 from statsmodels.datasets import macrodata, modechoice, nile, randhie, sunspots
 from statsmodels.tools.sm_exceptions import (
@@ -38,6 +39,7 @@
     arma_order_select_ic,
     breakvar_heteroskedasticity_test,
     ccovf,
+    ccf,
     coint,
     grangercausalitytests,
     innovations_algo,
@@ -365,6 +367,36 @@ def test_burg(self):
         pacfburg = pacf(self.x, nlags=40, method="burg")
         assert_almost_equal(pacfburg_, pacfburg, DECIMAL_8)
 
+
+class TestCCF:
+    """
+    Test cross-correlation function
+    """
+
+    data = macrodata.load_pandas()
+    x = data.data["unemp"].diff().dropna()
+    y = data.data["infl"].diff().dropna()
+    filename = os.path.join(CURR_DIR, "results", "results_ccf.csv")
+    results = pd.read_csv(filename, delimiter=",")
+    nlags = 20
+
+    @classmethod
+    def setup_class(cls):
+        cls.ccf = cls.results['ccf']
+        cls.res1 = ccf(cls.x, cls.y, nlags=cls.nlags, adjusted=False, fft=False)
+
+    def test_ccf(self):
+        assert_almost_equal(self.res1, self.ccf, DECIMAL_8)
+
+    def test_confint(self):
+        alpha = 0.05
+        res2, confint = ccf(self.x, self.y, nlags=self.nlags, adjusted=False, fft=False, alpha=alpha)
+        assert_equal(res2, self.res1)
+        assert_almost_equal(res2 - confint[:, 0], confint[:, 1] - res2, DECIMAL_8)
+        alpha1 = stats.norm.cdf(confint[:, 1] - res2, scale=1.0 / np.sqrt(len(self.x)))
+        assert_almost_equal(alpha1, np.repeat(1 - alpha / 2.0, self.nlags), DECIMAL_8)
+
+
 class TestBreakvarHeteroskedasticityTest:
     from scipy.stats import chi2, f