
Commit b2485d2

Merge branch 'main' of github.com:cmu-delphi/covidcast-indicators into indicator_runner
2 parents 0847754 + f06c3c8

File tree: 9 files changed, +140 −55 lines changed

ansible/templates/sir_complainsalot-params-prod.json.j2

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@
     "fb-survey": {
       "max_age": 3,
       "maintainers": ["U01069KCRS7"],
-      "retired-signals": ["smoothed_wearing_mask"]
+      "retired-signals": ["smoothed_wearing_mask", "smoothed_wwearing_mask"]
     },
     "indicator-combination": {
       "max_age": 4,

changehc/delphi_changehc/sensor.py

Lines changed: 5 additions & 35 deletions

@@ -26,19 +26,6 @@ class CHCSensor:
         poly_fit_degree=1,
         gaussian_bandwidth=Config.SMOOTHER_BANDWIDTH)
 
-    @staticmethod
-    def gauss_smooth(count,total):
-        """Smooth using the left_gauss_linear.
-
-        Args:
-            count, total: array
-        """
-        count_smooth = CHCSensor.smoother.smooth(count)
-        total_smooth = CHCSensor.smoother.smooth(total)
-        total_clip = np.clip(total_smooth, 0, None)
-        count_clip = np.clip(count_smooth, 0, total_clip)
-        return count_clip, total_clip
-
     @staticmethod
     def backfill(
         num,
@@ -120,30 +107,13 @@ def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"):
 
         # calculate smoothed counts and jeffreys rate
        # the left_gauss_linear smoother is not guaranteed to return values greater than 0
-
-        smoothed_total_counts, smoothed_total_visits = CHCSensor.gauss_smooth(
-            total_counts.flatten(), total_visits
-        )
-
-        # in smoothing, the numerator may have become more than the denominator
-        # simple fix is to clip the max values elementwise to the denominator (note that
-        # this has only been observed in synthetic data)
-        # smoothed_total_counts = np.clip(smoothed_total_counts, 0, smoothed_total_visits)
-
-        smoothed_total_rates = (
-            (smoothed_total_counts + 0.5) / (smoothed_total_visits + 1)
-        )
-
-        # checks - due to the smoother, the first value will be NA
-        assert (
-            np.sum(np.isnan(smoothed_total_rates[1:])) == 0
-        ), "NAs in rate calculation"
-        assert (
-            np.sum(smoothed_total_rates[1:] <= 0) == 0
-        ), f"0 or negative value, {geo_id}"
+        rates = total_counts.flatten() / total_visits
+        smoothed_rate = CHCSensor.smoother.smooth(rates)
+        clipped_smoothed_rate = np.clip(smoothed_rate, 0, 1)
+        jeffreys_rate = (clipped_smoothed_rate * total_visits + 0.5) / (total_visits + 1)
 
         # cut off at sensor indexes
-        rate_data = pd.DataFrame({'rate':smoothed_total_rates, 'den': smoothed_total_visits},
+        rate_data = pd.DataFrame({'rate': jeffreys_rate, 'den': total_visits},
                                  index=y_data.index)
         rate_data = rate_data[first_sensor_date:]
         include = rate_data['den'] >= Config.MIN_DEN
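
The refactor changes the order of operations: instead of smoothing counts and visits separately and then forming a rate, the new code smooths the raw rate, clips it into [0, 1], and applies a Jeffreys-style correction. A minimal sketch of that pipeline, using a stand-in moving-average smoother in place of CHCSensor.smoother (the function and variable names below are illustrative, not the indicator's API):

import numpy as np

def jeffreys_smoothed_rate(counts, visits, smooth):
    # counts and visits are equal-length 1-D arrays with visits > 0;
    # `smooth` stands in for CHCSensor.smoother.smooth.
    rates = counts / visits                 # raw daily rates
    clipped = np.clip(smooth(rates), 0, 1)  # smoother may leave [0, 1]
    # Jeffreys correction: add half a pseudo-count, so the result stays
    # strictly positive even where the clipped rate is 0.
    return (clipped * visits + 0.5) / (visits + 1)

# Example with a trivial 3-day moving average as the smoother:
counts = np.array([0., 2., 3., 1., 4.])
visits = np.full(5, 10.)
print(jeffreys_smoothed_rate(counts, visits,
                             lambda x: np.convolve(x, np.ones(3) / 3, "same")))

Because the Jeffreys correction bounds the rate away from 0 and 1, the old NA/non-positive assertions become unnecessary, which is why the diff drops them.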

changehc/tests/test_update_sensor.py

Lines changed: 29 additions & 6 deletions

@@ -81,29 +81,52 @@ def test_geo_reindex(self):
            ""
        )
        su_inst.shift_dates()
-       data_frame = su_inst.geo_reindex(self.small_test_data.reset_index())
+       test_data = pd.DataFrame({
+           "num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600],
+           "fips": ['01001'] * 7 + ['04007'] * 6,
+           "den": [1000] * 7 + [2000] * 6,
+           "date": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
+       data_frame = su_inst.geo_reindex(test_data)
        assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
        assert (data_frame.sum() == (4200,19000)).all()
 
    def test_update_sensor(self):
        """Tests that the sensors are properly updated."""
-       for geo in ["state","hrr"]:
+       outputs = {}
+       for geo in ["county", "state", "hhs", "nation"]:
            td = TemporaryDirectory()
            su_inst = CHCSensorUpdator(
-               "02-01-2020",
-               "06-01-2020",
-               "06-12-2020",
+               "03-01-2020",
+               "03-22-2020",
+               "03-27-2020",
                geo,
                self.parallel,
                self.weekday,
                self.numtype,
                self.se,
                ""
            )
-           su_inst.update_sensor(self.small_test_data, td.name)
+           # As of 3/3/21 (40c258a), this set of data has county outputting data, state and hhs not
+           # outputting data, and nation outputting data, which is undesirable. Ideal behaviour
+           # should be all output or a subregion only outputting if its parent has output,
+           # which is what is being tested here.
+           small_test_data = pd.DataFrame({
+               "num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600] * 2,
+               "fips": ["01001"] * 13 + ["42003"] * 13,
+               "den": [30, 50, 50, 10, 1, 5, 5, 50, 50, 50, 0, 0, 0] * 2,
+               "date": list(pd.date_range("20200301", "20200313")) * 2}).set_index(
+               ["fips", "date"])
+           su_inst.update_sensor(small_test_data, td.name)
+           for f in os.listdir(td.name):
+               outputs[f] = pd.read_csv(os.path.join(td.name, f))
            assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\
                f"failed {geo} update sensor test"
            td.cleanup()
+       assert outputs["20200319_county_smoothed_outpatient_covid.csv"].empty
+       assert outputs["20200319_state_smoothed_outpatient_covid.csv"].empty
+       assert outputs["20200319_hhs_smoothed_outpatient_covid.csv"].empty
+       assert outputs["20200319_nation_smoothed_outpatient_covid.csv"].empty
+
 
 class TestWriteToCsv:
    """Tests for writing output files to CSV."""

facebook/delphiFacebook/R/binary.R

Lines changed: 5 additions & 2 deletions

@@ -51,12 +51,15 @@ get_binary_indicators <- function() {
    "smoothed_wtravel_outside_state_7d", "weight", "c_travel_state_7d", 6, compute_binary_response, jeffreys_binary,
 
    # work outside home
-   # pre-wave-4
+   # pre-wave 4
    "wip_smoothed_work_outside_home_5d", "weight_unif", "c_work_outside_5d", 6, compute_binary_response, jeffreys_binary,
    "wip_smoothed_wwork_outside_home_5d", "weight", "c_work_outside_5d", 6, compute_binary_response, jeffreys_binary,
-   # wave 4+
+   # wave 4+, pre-wave 10
    "smoothed_work_outside_home_1d", "weight_unif", "a_work_outside_home_1d", 6, compute_binary_response, jeffreys_binary,
    "smoothed_wwork_outside_home_1d", "weight", "a_work_outside_home_1d", 6, compute_binary_response, jeffreys_binary,
+   # wave 10+
+   "smoothed_work_outside_home_indoors_1d", "weight_unif", "a_work_outside_home_indoors_1d", 6, compute_binary_response, jeffreys_binary,
+   "smoothed_wwork_outside_home_indoors_1d", "weight", "a_work_outside_home_indoors_1d", 6, compute_binary_response, jeffreys_binary,
 
    # activities
    # pre-Wave 10

facebook/delphiFacebook/R/responses.R

Lines changed: 17 additions & 6 deletions

@@ -332,19 +332,30 @@ bodge_v4_translation <- function(input_data) {
   affected <- c("V4_1", "V4_2", "V4_3", "V4_4", "V4_5")
   corrected <- c("V4a_1", "V4a_2", "V4a_3", "V4a_4", "V4a_5")
 
-  # Step 1: For any non-English results, null out V4 responses. There are NAs
-  # because of filtering earlier in the pipeline that incorrectly handles NA, so
-  # also remove these.
-  non_english <- is.na(input_data$UserLanguage) | input_data$UserLanguage != "EN"
-  for (col in affected) {
-    input_data[non_english, col] <- NA
+  if (any(affected %in% names(input_data))) {
+    # This wave is affected by the problem. Step 1: For any non-English results,
+    # null out V4 responses. There are NAs because of filtering earlier in the
+    # pipeline that incorrectly handles NA, so also remove these.
+    non_english <- is.na(input_data$UserLanguage) | input_data$UserLanguage != "EN"
+    for (col in affected) {
+      input_data[non_english, col] <- NA
+    }
+  } else {
+    # This wave does not have V4, only V4a. We will move V4a's responses into V4
+    # below, so users do not need to know about our goof. Ensure the columns
+    # exist so the later code can move data into them.
+    for (col in affected) {
+      input_data[[col]] <- NA
+    }
   }
 
   # Step 2: If this data does not have V4a, stop.
   if (!("V4a_1" %in% names(input_data))) {
     return(input_data)
   }
 
+  # Step 3: Wherever there are values in the new columns, move them to the old
+  # columns.
   for (ii in seq_along(affected)) {
     bad <- affected[ii]
     good <- corrected[ii]
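
In words: waves that still carry the mistranslated V4 columns get their non-English V4 answers censored, waves that only have V4a get empty V4 columns created, and then any V4a values are copied into the V4 columns. A rough pandas paraphrase of that flow (illustrative only; the real implementation is the R code above):

import numpy as np
import pandas as pd

AFFECTED = [f"V4_{i}" for i in range(1, 6)]    # mistranslated columns
CORRECTED = [f"V4a_{i}" for i in range(1, 6)]  # corrected replacements

def bodge_v4(df: pd.DataFrame) -> pd.DataFrame:
    if any(col in df.columns for col in AFFECTED):
        # Affected wave: null out V4 answers from non-English responses.
        non_english = df["UserLanguage"].isna() | (df["UserLanguage"] != "EN")
        df.loc[non_english, [c for c in AFFECTED if c in df.columns]] = np.nan
    else:
        # V4a-only wave: create empty V4 columns to receive the values.
        for col in AFFECTED:
            df[col] = np.nan
    if "V4a_1" not in df.columns:
        return df  # no corrected data to move
    for bad, good in zip(AFFECTED, CORRECTED):
        # Wherever V4a has a value, move it into the V4 column.
        df[bad] = df[bad].where(df[good].isna(), df[good])
    return df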

facebook/delphiFacebook/python/delphi_facebook/qualtrics.py

Lines changed: 1 addition & 1 deletion

@@ -94,7 +94,7 @@ def get(fetch,post,params):
            print()
            wait,waitt = progress(t)
        if progressStatus=="failed":
-           return r
+           raise Exception(f"ERROR: could not download \"{surv['name']}\"\n{json.dumps(r.json(),sort_keys=True,indent=2)}")
        fileId = r.json()['result']['fileId']
        r = fetch(f"{base}{fileId}/file")
        if not r.ok: return r
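
The fix swaps a silent return of the failed response for an exception that embeds the export-status payload, so a failed Qualtrics export stops the pipeline with diagnostics instead of handing a bad response downstream. A generic sketch of the pattern (the helper and its signature are ours, not the module's):

import json

def require_export_success(response, status, survey_name):
    # Fail fast, carrying the API's own error payload in the message.
    if status == "failed":
        raise Exception(
            f'ERROR: could not download "{survey_name}"\n'
            f"{json.dumps(response.json(), sort_keys=True, indent=2)}")
    return response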

facebook/delphiFacebook/tests/testthat/test-responses.R

Lines changed: 70 additions & 3 deletions

@@ -116,7 +116,7 @@ test_that("filter_responses works correctly", {
               expected)
 })
 
-test_that("filter_data_for_aggregatation works correctly", {
+test_that("filter_data_for_aggregation works correctly", {
  params <- list(start_date=as.Date("2021-01-05"), static_dir=test_path("static"))
 
  input <- tibble(
@@ -135,7 +135,7 @@ test_that("filter_data_for_aggregatation works correctly", {
    date = c("2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01")
  )
 
-  expect_equal(filter_data_for_aggregatation(input, params),
+  expect_equal(filter_data_for_aggregation(input, params),
               expected)
 })
 
@@ -170,6 +170,73 @@ test_that("V4 bodge works correctly", {
 
  expect_equal(bodge_v4_translation(foo),
               expected)
+
+  # Ensure the bodge works when all responses are in English to V4a; see PR #888
+  foo <- tibble(
+    UserLanguage = "EN",
+    V4a_1 = c(2, NA, NA, NA, 3),
+    V4a_2 = c(2, NA, NA, NA, 3),
+    V4a_3 = c(2, NA, NA, NA, 3),
+    V4a_4 = c(1, NA, NA, NA, 3),
+    V4a_5 = c(2, NA, NA, NA, 3)
+  )
+
+  expected <- tibble(
+    UserLanguage = foo$UserLanguage,
+    V4a_1 = foo$V4a_1,
+    V4a_2 = foo$V4a_2,
+    V4a_3 = foo$V4a_3,
+    V4a_4 = foo$V4a_4,
+    V4a_5 = foo$V4a_5,
+    V4_1 = c(2, NA, NA, NA, 3),
+    V4_2 = c(2, NA, NA, NA, 3),
+    V4_3 = c(2, NA, NA, NA, 3),
+    V4_4 = c(1, NA, NA, NA, 3),
+    V4_5 = c(2, NA, NA, NA, 3)
+  )
+
+  expect_equal(bodge_v4_translation(foo),
+               expected)
+
+  # Ensure the bodge works when *no* responses are in English, but they're to
+  # V4a, which needs no censoring
+  foo <- tibble(
+    UserLanguage = "ES",
+    V4a_1 = c(2, NA, NA, NA, 3),
+    V4a_2 = c(2, NA, NA, NA, 3),
+    V4a_3 = c(2, NA, NA, NA, 3),
+    V4a_4 = c(1, NA, NA, NA, 3),
+    V4a_5 = c(2, NA, NA, NA, 3)
+  )
+
+  expected <- tibble(
+    UserLanguage = foo$UserLanguage,
+    V4a_1 = foo$V4a_1,
+    V4a_2 = foo$V4a_2,
+    V4a_3 = foo$V4a_3,
+    V4a_4 = foo$V4a_4,
+    V4a_5 = foo$V4a_5,
+    V4_1 = c(2, NA, NA, NA, 3),
+    V4_2 = c(2, NA, NA, NA, 3),
+    V4_3 = c(2, NA, NA, NA, 3),
+    V4_4 = c(1, NA, NA, NA, 3),
+    V4_5 = c(2, NA, NA, NA, 3)
+  )
+
+  expect_equal(bodge_v4_translation(foo),
+               expected)
+
+  # Ensure functioning on earlier waves before V4a happened
+  foo <- tibble(
+    UserLanguage = "EN",
+    V4_1 = c(2, NA, NA, NA, 3),
+    V4_2 = c(2, NA, 4, NA, 3),
+    V4_3 = c(2, NA, NA, NA, 3),
+    V4_4 = c(1, NA, NA, NA, 3),
+    V4_5 = c(2, NA, NA, NA, 3)
+  )
+
+  expect_equal(bodge_v4_translation(foo), foo)
 })
 
 
@@ -182,7 +249,7 @@ test_that("C6/8 bodge works correctly", {
    C8_2 = c(1, 2, 3, 4),
    C8_3 = c(1, 2, 3, 4)
  )
-
+
  expect_equal(bodge_C6_C8(input),
              input)
 

sir_complainsalot/delphi_sir_complainsalot/check_source.py

Lines changed: 11 additions & 0 deletions

@@ -119,6 +119,17 @@ def check_source(data_source, meta, params, grace, logger):
 
        gap_days = [(day - prev_day).days
                    for day, prev_day in zip(unique_dates[1:], unique_dates[:-1])]
+
+       # If we only have a single day of data available then gap days will be
+       # empty.
+       if not gap_days:
+           logger.info(
+               "Not enough data to calculate gap days.",
+               data_source=data_source,
+               signal=row["signal"],
+               geo_type=row["geo_type"])
+           continue
+
        gap = max(gap_days) - 1
        logger.info("Detecting days with data present",
                    data_source = data_source,
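
The guard is needed because the gap computation zips the date list against itself shifted by one; with a single unique date, both slices are empty and max(gap_days) would raise a ValueError. A small illustration (the helper name is ours, not the module's):

from datetime import date

def gaps(unique_dates):
    # Day differences between consecutive dates; empty if fewer than 2 dates.
    return [(day - prev).days
            for day, prev in zip(unique_dates[1:], unique_dates[:-1])]

print(gaps([date(2021, 3, 1)]))                    # [] -> log and continue
print(gaps([date(2021, 3, 1), date(2021, 3, 5)]))  # [4] -> gap of 3 days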

sir_complainsalot/params.json.template

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@
     "fb-survey": {
       "max_age": 3,
       "maintainers": ["U01069KCRS7"],
-      "retired-signals": ["smoothed_wearing_mask"]
+      "retired-signals": ["smoothed_wearing_mask", "smoothed_wwearing_mask"]
     },
     "indicator-combination": {
       "max_age": 3,
