diff --git a/src/pypromice/qc/persistence.py b/src/pypromice/qc/persistence.py
index 0c04798d..82fe6df8 100644
--- a/src/pypromice/qc/persistence.py
+++ b/src/pypromice/qc/persistence.py
@@ -9,7 +9,7 @@
     "persistence_qc",
     "find_persistent_regions",
     "count_consecutive_persistent_values",
-    "duration_consecutive_true",
+    "get_duration_consecutive_true",
 ]
 
 logger = logging.getLogger(__name__)
@@ -152,19 +152,21 @@ def count_consecutive_persistent_values(
 ) -> pd.Series:
     diff = data.ffill().diff().abs()  # forward filling all NaNs!
     mask: pd.Series = diff < max_diff
-    return duration_consecutive_true(mask)
+    return get_duration_consecutive_true(mask)
 
 
-def duration_consecutive_true(
+def get_duration_consecutive_true(
     series: pd.Series,
 ) -> pd.Series:
     """
     From a boolean series, calculates the duration, in hours, of the periods with concecutive true values.
 
+    The first value is always NaN, as there is no earlier timestamp to measure a time step from.
+
     Examples
     --------
-    >>> duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
-    pd.Series([0, 1, 0, 0, 1, 2, 3, 0, 1])
+    >>> get_duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
+    pd.Series([np.nan, 1, 0, 0, 1, 2, 3, 0, 1])
 
     Parameters
     ----------
@@ -177,9 +179,12 @@ def duration_consecutive_true(
         Integer pandas Series or DataFrame with values representing the number of connective true values.
 
     """
-    # assert series.dtype == bool
-    cumsum = ((series.index - series.index[0]).total_seconds()/3600).to_series(index=series.index)
     is_first = series.astype("int").diff() == 1
-    offset = (is_first * cumsum).replace(0, np.nan).ffill().fillna(0)
+    # Hours since the previous sample (NaN for the first). Series.diff() is used
+    # instead of Index.diff(), which only exists in pandas >= 2.1.
+    delta_time = series.index.to_series().diff().dt.total_seconds() / 3600
+    cumsum = delta_time.cumsum()
+    # Elapsed hours just before each run of True values starts, carried forward.
+    offset = (is_first * (cumsum - delta_time)).replace(0, np.nan).ffill().fillna(0)
 
     return (cumsum - offset) * series
diff --git a/tests/unit/qc/__init__.py b/tests/unit/qc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/pypromice/qc/persistence_test.py b/tests/unit/qc/test_persistence.py
similarity index 73%
rename from src/pypromice/qc/persistence_test.py
rename to tests/unit/qc/test_persistence.py
index 5cd3d928..d343b0bc 100644
--- a/src/pypromice/qc/persistence_test.py
+++ b/tests/unit/qc/test_persistence.py
@@ -1,9 +1,9 @@
 import unittest
 
 import numpy as np
-import numpy.testing
 import pandas as pd
 
+from pypromice.qc import persistence
 from pypromice.qc.persistence import find_persistent_regions
 
 
@@ -32,7 +32,9 @@ def _test_1_hour_repeat(self, index: int):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_no_persistent_period(self):
         time_range = pd.date_range(
@@ -46,7 +48,9 @@ def test_no_persistent_period(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_persistent_period_longer_than_period_threshold(self):
         time_range = pd.date_range(
@@ -66,7 +70,9 @@ def test_persistent_period_longer_than_period_threshold(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_period_threshold_longer_than_persistent_period(self):
         time_range = pd.date_range(
@@ -83,7 +89,9 @@ def test_period_threshold_longer_than_persistent_period(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_persistent_period_at_the_end(self):
         time_range = pd.date_range(
@@ -101,7 +109,9 @@ def test_persistent_period_at_the_end(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_dont_filter_nan_values(self):
         time_range = pd.date_range(
@@ -123,7 +133,9 @@ def test_dont_filter_nan_values(self):
             input_series, min_repeats=min_repeats, max_diff=0.001
         )
 
-        pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
+        pd.testing.assert_series_equal(
+            expected_output, persistent_mask, check_names=False
+        )
 
     def test_series_with_nan_values_between_persistent_values(self):
         time_range = pd.date_range(
@@ -145,6 +157,44 @@ def test_series_with_nan_values_between_persistent_values(self):
 
         np.testing.assert_equal(expected_mask, output_mask)
 
+    def test_get_duration_consecutive_true(self):
+        # Seeded generator keeps the irregular time steps reproducible between runs
+        rng = np.random.default_rng(seed=0)
+        delta_time_hours = rng.random(24) * 2
+        time_range = pd.to_datetime("2023-01-25") + pd.to_timedelta(
+            delta_time_hours.cumsum(), unit="h"
+        )
+        values = np.zeros(len(time_range), dtype=bool)
+        values[0:2] = True
+        values[6] = True
+        values[10:14] = True
+        values[-3:] = True
+        series = pd.Series(index=time_range, data=values)
+
+        duration_consecutive_true = persistence.get_duration_consecutive_true(series)
+
+        # Positional access via .iloc: the series is indexed by timestamps
+        self.assertTrue(
+            np.isnan(duration_consecutive_true.iloc[0]),
+            "The first index should be ignored",
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true.iloc[1],
+            delta_time_hours[1],
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true.iloc[6],
+            delta_time_hours[6],
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true.iloc[10:14],
+            delta_time_hours[10:14].cumsum(),
+        )
+        np.testing.assert_almost_equal(
+            duration_consecutive_true.iloc[-3:],
+            delta_time_hours[-3:].cumsum(),
+        )
+
 
 if __name__ == "__main__":
     unittest.main()