From f12c078a10f46bc51c227ac8941a5f56f1bb97dc Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 00:46:21 +0200 Subject: [PATCH 1/9] Implement Kahan summation for rolling mean --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/window/aggregations.pyx | 40 +++++++++++++++++----------- pandas/core/arrays/datetimelike.py | 4 +-- pandas/tests/window/test_rolling.py | 18 +++++++++++++ 4 files changed, 46 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2aac2596c18cb..7cebb5d3161d7 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -104,7 +104,7 @@ Other enhancements - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) -- +- :meth:`Series.rolling().mean()` and :meth:`DataFrame.rolling().mean()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`36031`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3ec4547d223ce..ca70fc81d1df5 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -277,24 +277,34 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct) nogil: - """ add a value from the mean calc """ + Py_ssize_t *neg_ct, float64_t *c) nogil: + """ add a value from the mean calc using Kahan summation """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - sum_x[0] = sum_x[0] + val + y = val - c[0] + t = sum_x[0] + y + c[0] = t - sum_x[0] - y + sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] + 1 cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct) nogil: - """ remove a value from the mean calc """ + Py_ssize_t *neg_ct, float64_t *c) nogil: + """ remove a value from the mean calc using Kahan summation """ + cdef: + float64_t y, t if notnan(val): nobs[0] = nobs[0] - 1 - sum_x[0] = sum_x[0] - val + y = - val - c[0] + t = sum_x[0] + y + c[0] = t - sum_x[0] - y + sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] - 1 @@ -302,7 +312,7 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, sum_x = 0 + float64_t val, prev_x, sum_x = 0, c = 0 Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) ndarray[float64_t] output @@ -311,16 +321,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, with nogil: for i in range(minp - 1): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &c) output[i] = NaN for i in range(minp - 1, N): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &c) if i > win - 1: prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &c) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) @@ -330,7 +340,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, sum_x = 0 + float64_t val, c_add = 0, c_remove = 0, sum_x = 0 int64_t s, e Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) ndarray[float64_t] output @@ -350,26 +360,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, # setup for j in range(s, e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) else: # calculate deletes for j in range(start[i - 1], s): val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) if not is_monotonic_bounds: for j in range(s, e): val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove) return output # ---------------------------------------------------------------------- diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index bb40cf78ea006..59b9fa8d66ef7 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -474,9 +474,9 @@ def _ndarray(self) -> np.ndarray: def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` - return type(self)._simple_new( # type: ignore[attr-defined] + return type(self)._simple_new( arr, dtype=self.dtype - ) + ) # type: ignore[attr-defined] # ------------------------------------------------------------------ diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 67b20fd2d6daa..031cee8b78c8d 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -696,3 +696,21 @@ def scaled_sum(*args): expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index) result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,)) tm.assert_frame_equal(result, expected) + + +def test_rolling_numerical_accuracy_kahan(): + # GH: 36031 implementing kahan summation + df = pd.DataFrame( + { + "A": [3002399751580331.0, -0.0, -0.0] + }, # First value is a single digit longer. + index=[ + pd.Timestamp("19700101 09:00:00"), + pd.Timestamp("19700101 09:00:03"), + pd.Timestamp("19700101 09:00:06"), + ], + ) + result = ( + df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() + ) + assert result.values[-1] == 0.0 From 6d67d194e8449787465b73d56192aa0bc9cef49d Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 01:52:35 +0200 Subject: [PATCH 2/9] Change variable names --- pandas/_libs/window/aggregations.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ca70fc81d1df5..86246714858e2 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -312,7 +312,7 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, sum_x = 0, c = 0 + float64_t val, prev_x, sum_x = 0, c_add = 0, c_remove = 0 Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) ndarray[float64_t] output @@ -321,16 +321,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, with nogil: for i in range(minp - 1): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct, &c) + add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) output[i] = NaN for i in range(minp - 1, N): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct, &c) + add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) if i > win - 1: prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &c) + remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &c_remove) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) From 971fb26bec98b31cac1aa5edd28373fb4d0e7728 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 04:16:14 +0200 Subject: [PATCH 3/9] Add test for resample mean --- pandas/tests/resample/test_datetime_index.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 9475dcc6981ff..f99da0cd52b03 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1787,3 +1787,13 @@ def test_resample_calendar_day_with_dst( 1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam") ) tm.assert_series_equal(result, expected) + + +def test_resample_numerical_accuracy_kahan(): + # GH: 36031 implementing kahan summation + df = pd.DataFrame( + {"A": [3002399751580331.0, -0.0]}, + index=[pd.Timestamp("19700101 09:00:00"), pd.Timestamp("19700101 09:00:02")], + ) + result = df.resample("2s").mean() + assert result.values[-1] == 0.0 From 5c8d7ccd4a177b1190e70ca6bba9a6f20d2cd0eb Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 10:20:23 +0200 Subject: [PATCH 4/9] Remove unnecessary test --- pandas/tests/resample/test_datetime_index.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index f99da0cd52b03..9475dcc6981ff 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1787,13 +1787,3 @@ def test_resample_calendar_day_with_dst( 1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam") ) tm.assert_series_equal(result, expected) - - -def test_resample_numerical_accuracy_kahan(): - # GH: 36031 implementing kahan summation - df = pd.DataFrame( - {"A": [3002399751580331.0, -0.0]}, - index=[pd.Timestamp("19700101 09:00:00"), pd.Timestamp("19700101 09:00:02")], - ) - result = df.resample("2s").mean() - assert result.values[-1] == 0.0 From 58cc03577aea0075b1296675e80936c6494cc779 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 15:01:43 +0200 Subject: [PATCH 5/9] Rename variable and parametrize test --- pandas/_libs/window/aggregations.pyx | 30 ++++++++++++++-------------- pandas/tests/window/test_rolling.py | 7 +++---- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 86246714858e2..0d0760d16c26c 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -277,7 +277,7 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct, float64_t *c) nogil: + Py_ssize_t *neg_ct, float64_t *compensation) nogil: """ add a value from the mean calc using Kahan summation """ cdef: float64_t y, t @@ -285,25 +285,25 @@ cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - y = val - c[0] + y = val - compensation[0] t = sum_x[0] + y - c[0] = t - sum_x[0] - y + compensation[0] = t - sum_x[0] - y sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] + 1 cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct, float64_t *c) nogil: + Py_ssize_t *neg_ct, float64_t *compensation) nogil: """ remove a value from the mean calc using Kahan summation """ cdef: float64_t y, t if notnan(val): nobs[0] = nobs[0] - 1 - y = - val - c[0] + y = - val - compensation[0] t = sum_x[0] + y - c[0] = t - sum_x[0] - y + compensation[0] = t - sum_x[0] - y sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] - 1 @@ -312,7 +312,7 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, sum_x = 0, c_add = 0, c_remove = 0 + float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0 Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) ndarray[float64_t] output @@ -321,16 +321,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, with nogil: for i in range(minp - 1): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) output[i] = NaN for i in range(minp - 1, N): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) if i > win - 1: prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &c_remove) + remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &compensation_remove) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) @@ -340,7 +340,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, c_add = 0, c_remove = 0, sum_x = 0 + float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0 int64_t s, e Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) ndarray[float64_t] output @@ -360,26 +360,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, # setup for j in range(s, e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) else: # calculate deletes for j in range(start[i - 1], s): val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove) + remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) if not is_monotonic_bounds: for j in range(s, e): val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove) + remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove) return output # ---------------------------------------------------------------------- diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 031cee8b78c8d..5666031ab1539 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -698,12 +698,11 @@ def scaled_sum(*args): tm.assert_frame_equal(result, expected) -def test_rolling_numerical_accuracy_kahan(): +@pytest.mark.parametrize("add", [0.0, 2.0]) +def test_rolling_numerical_accuracy_kahan(add): # GH: 36031 implementing kahan summation df = pd.DataFrame( - { - "A": [3002399751580331.0, -0.0, -0.0] - }, # First value is a single digit longer. + {"A": [3002399751580331.0 + add, -0.0, -0.0]}, index=[ pd.Timestamp("19700101 09:00:00"), pd.Timestamp("19700101 09:00:03"), From a4e3884f8bbfd0ad93017665ac851d9ea256318f Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 19:51:23 +0200 Subject: [PATCH 6/9] Implement Kahan summation for rolling sum and add tests for issues --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/window/aggregations.pyx | 47 ++++++++++++++++++---------- pandas/tests/window/test_rolling.py | 44 +++++++++++++++++++++++++- 3 files changed, 75 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 656f0cbdb292a..3005c923be4af 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -105,7 +105,7 @@ Other enhancements - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) -- :meth:`Series.rolling().mean()` and :meth:`DataFrame.rolling().mean()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`36031`) +- :meth:`Series.rolling().mean()`, :meth:`DataFrame.rolling().mean()` :meth:`Series.rolling().sum()` and :meth:`DataFrame.rolling().sum()`use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 0d0760d16c26c..ca53485b5a6e8 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -161,27 +161,42 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi return result -cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: - """ add a value from the sum calc """ +cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x, + float64_t *compensation) nogil: + """ add a value from the sum calc using Kahan summation """ + + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - sum_x[0] = sum_x[0] + val + y = val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t -cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: - """ remove a value from the sum calc """ +cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, + float64_t *compensation) nogil: + """ remove a value from the sum calc using Kahan summation """ + + cdef: + float64_t y, t + # Not NaN if notnan(val): - nobs[0] = nobs[0] - 1 - sum_x[0] = sum_x[0] - val + nobs[0] = nobs[0] + 1 + y = - val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t sum_x = 0 + float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0 int64_t s, e int64_t nobs = 0, i, j, N = len(values) ndarray[float64_t] output @@ -201,23 +216,23 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, # setup for j in range(s, e): - add_sum(values[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x, &compensation_add) else: # calculate deletes for j in range(start[i - 1], s): - remove_sum(values[j], &nobs, &sum_x) + remove_sum(values[j], &nobs, &sum_x, &compensation_remove) # calculate adds for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x, &compensation_add) output[i] = calc_sum(minp, nobs, sum_x) if not is_monotonic_bounds: for j in range(s, e): - remove_sum(values[j], &nobs, &sum_x) + remove_sum(values[j], &nobs, &sum_x, &compensation_remove) return output @@ -225,7 +240,7 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, sum_x = 0 + float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0 int64_t range_endpoint int64_t nobs = 0, i, N = len(values) ndarray[float64_t] output @@ -237,16 +252,16 @@ def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, with nogil: for i in range(0, range_endpoint): - add_sum(values[i], &nobs, &sum_x) + add_sum(values[i], &nobs, &sum_x, &compensation_add) output[i] = NaN for i in range(range_endpoint, N): val = values[i] - add_sum(val, &nobs, &sum_x) + add_sum(val, &nobs, &sum_x, &compensation_add) if i > win - 1: prev_x = values[i - win] - remove_sum(prev_x, &nobs, &sum_x) + remove_sum(prev_x, &nobs, &sum_x, &compensation_remove) output[i] = calc_sum(minp, nobs, sum_x) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 5666031ab1539..e031f4274a503 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -699,7 +699,7 @@ def scaled_sum(*args): @pytest.mark.parametrize("add", [0.0, 2.0]) -def test_rolling_numerical_accuracy_kahan(add): +def test_rolling_numerical_accuracy_kahan_mean(add): # GH: 36031 implementing kahan summation df = pd.DataFrame( {"A": [3002399751580331.0 + add, -0.0, -0.0]}, @@ -713,3 +713,45 @@ def test_rolling_numerical_accuracy_kahan(add): df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() ) assert result.values[-1] == 0.0 + + +def test_rolling_numerical_accuracy_kahan_sum(): + # GH: 13254 + df = pd.DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"]) + result = df["x"].rolling(3).sum() + expected = pd.Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x") + tm.assert_series_equal(result, expected) + + +def test_rolling_numerical_accuracy_jump(): + # GH: 32761 + index = pd.date_range(start="2020-01-01", end="2020-01-02", freq="1s").append( + pd.DatetimeIndex(["2020-01-03"]) + ) + data = np.random.rand(len(index)) + + df = pd.DataFrame({"data": data}, index=index) + df["mean"] = df.rolling("60s").mean() + assert df["mean"][-1] == df["data"][-1] + + +def test_rolling_numerical_accuracy_small_values(): + # GH: 10319 + s = Series( + data=[0.00012456, 0.0003, -0.0, -0.0], + index=date_range("1999-02-03", "1999-02-06"), + ) + assert s.rolling(1).mean()["1999-02-06"] == 0.0 + + +def test_rolling_numerical_too_large_numbers(): + # GH: 11645 + dates = pd.date_range("2015-01-01", periods=10, freq="D") + ds = pd.Series(data=range(10), index=dates, dtype=np.float64) + ds[2] = -9e33 + result = ds.rolling(5).mean() + expected = pd.Series( + [np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 0.0, 6.0, 7.0], + index=dates, + ) + tm.assert_series_equal(result, expected) From 9cd1ebe7a220ffc61cebfc6f115e302186f53c3e Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 22:12:57 +0200 Subject: [PATCH 7/9] Change tests and fix typo --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/window/aggregations.pyx | 2 +- pandas/tests/window/test_rolling.py | 9 +++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3005c923be4af..bbeec2edf09e5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -105,7 +105,7 @@ Other enhancements - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) -- :meth:`Series.rolling().mean()`, :meth:`DataFrame.rolling().mean()` :meth:`Series.rolling().sum()` and :meth:`DataFrame.rolling().sum()`use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) +- :meth:`Rolling.mean()`, :meth:`DataFrame.rolling().mean()` :meth:`Series.rolling().sum()` and :meth:`DataFrame.rolling().sum()`use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ca53485b5a6e8..5f60b884c6ada 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -186,7 +186,7 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, # Not NaN if notnan(val): - nobs[0] = nobs[0] + 1 + nobs[0] = nobs[0] - 1 y = - val - compensation[0] t = sum_x[0] + y compensation[0] = t - sum_x[0] - y diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index e031f4274a503..b7e2b2dc77b65 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -725,14 +725,14 @@ def test_rolling_numerical_accuracy_kahan_sum(): def test_rolling_numerical_accuracy_jump(): # GH: 32761 - index = pd.date_range(start="2020-01-01", end="2020-01-02", freq="1s").append( + index = pd.date_range(start="2020-01-01", end="2020-01-02", freq="60s").append( pd.DatetimeIndex(["2020-01-03"]) ) data = np.random.rand(len(index)) df = pd.DataFrame({"data": data}, index=index) - df["mean"] = df.rolling("60s").mean() - assert df["mean"][-1] == df["data"][-1] + result = df.rolling("60s").mean() + tm.assert_frame_equal(result, df[["data"]]) def test_rolling_numerical_accuracy_small_values(): @@ -741,7 +741,8 @@ def test_rolling_numerical_accuracy_small_values(): data=[0.00012456, 0.0003, -0.0, -0.0], index=date_range("1999-02-03", "1999-02-06"), ) - assert s.rolling(1).mean()["1999-02-06"] == 0.0 + result = s.rolling(1).mean() + tm.assert_series_equal(result, s) def test_rolling_numerical_too_large_numbers(): From 4c5b351f6c580f7afae0c83cdfb1de43abaab91b Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 14 Sep 2020 22:14:31 +0200 Subject: [PATCH 8/9] Fix whats new --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bbeec2edf09e5..d4464b5edd850 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -105,7 +105,7 @@ Other enhancements - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) -- :meth:`Rolling.mean()`, :meth:`DataFrame.rolling().mean()` :meth:`Series.rolling().sum()` and :meth:`DataFrame.rolling().sum()`use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) +- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) .. _whatsnew_120.api_breaking.python: From c70c91cd5bcfbf4a5af527e91dda8bcc6b480623 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 15 Sep 2020 10:26:03 +0200 Subject: [PATCH 9/9] Add df to assert equal --- pandas/tests/window/test_rolling.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index b7e2b2dc77b65..88afcec0f7bf4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -712,7 +712,22 @@ def test_rolling_numerical_accuracy_kahan_mean(add): result = ( df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() ) - assert result.values[-1] == 0.0 + dates = pd.date_range("19700101 09:00:00", periods=7, freq="S") + expected = pd.DataFrame( + { + "A": [ + np.nan, + np.nan, + np.nan, + 3002399751580330.5, + 2001599834386887.25, + 1000799917193443.625, + 0.0, + ] + }, + index=dates, + ) + tm.assert_frame_equal(result, expected) def test_rolling_numerical_accuracy_kahan_sum():