Skip to content

Commit 6a2c215

Browse files
committed
NANs USAFacts:
* keep nan values in the "val" column
* mark early smoothing data as "data insufficient"
* add missing column outputs
1 parent 08c9bd2 commit 6a2c215

File tree

2 files changed

+45
-23
lines changed

2 files changed

+45
-23
lines changed

usafacts/delphi_usafacts/run.py

+33-20
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from itertools import product
1010
from typing import Dict, Any
1111

12+
import pandas as pd
1213
import numpy as np
1314
from delphi_utils import (
1415
create_export_csv,
@@ -64,6 +65,27 @@
6465
]
6566

6667

68+
def add_nancodes(df, smoother):
    """Add missing-value codes ("nancodes") to the dataframe.

    Parameters
    ----------
    df: pd.DataFrame
        Frame indexed by (timestamp, geo_id) — the code reads
        ``df.index.min()[0]`` as a timestamp — with a "val" column.
    smoother: str
        Smoother key; "seven_day_average" triggers the early-window marking.

    Returns
    -------
    pd.DataFrame
        Same frame with "missing_val", "missing_se", and
        "missing_sample_size" columns populated.
    """
    idx = pd.IndexSlice

    # Default nancodes: values are present; se and sample_size are never
    # reported by this indicator, so they are "not applicable".
    df["missing_val"] = Nans.NOT_MISSING
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    # The first 6 days of a seven-day average have no complete window, so
    # mark them as DATA_INSUFFICIENT (the intent per the replaced inline
    # version of this logic), not PRIVACY, which denotes censored values.
    if smoother == "seven_day_average":
        df.sort_index(inplace=True)
        min_time_value = df.index.min()[0] + 6 * pd.Timedelta(days=1)
        df.loc[idx[:min_time_value, :], "missing_val"] = Nans.DATA_INSUFFICIENT

    # Any nan left in "val" at this point has no explained cause.
    remaining_nans_mask = df["val"].isnull() & (df["missing_val"] == Nans.NOT_MISSING)
    df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
    return df
88+
6789
def run_module(params: Dict[str, Dict[str, Any]]):
6890
"""Run the usafacts indicator.
6991
@@ -112,37 +134,28 @@ def run_module(params: Dict[str, Dict[str, Any]]):
112134
df = dfs[metric]
113135
# Aggregate to appropriate geographic resolution
114136
df = geo_map(df, geo_res, sensor)
115-
df["val"] = df[["geo_id", sensor]].groupby("geo_id")[sensor].transform(
116-
SMOOTHERS_MAP[smoother][0].smooth
117-
)
118-
df["se"] = np.nan
119-
df["sample_size"] = np.nan
137+
df.set_index(["timestamp", "geo_id"], inplace=True)
120138

121-
# Default missing code
122-
df["missing_val"] = Nans.NOT_MISSING
123-
df["missing_se"] = Nans.NOT_APPLICABLE
124-
df["missing_sample_size"] = Nans.NOT_APPLICABLE
139+
# Smooth
140+
smooth_obj, smoother_prefix, _, smoother_lag = SMOOTHERS_MAP[smoother]
141+
df["val"] = df[sensor].groupby(level=1).transform(smooth_obj.smooth)
125142

126-
# Mark early smoothing entries as data insufficient
127-
if smoother == "seven_day_average":
128-
df.sort_index(inplace=True)
129-
min_time_value = df.index.min()[0] + 6 * pd.Timedelta(days=1)
130-
df.loc[idx[:min_time_value, :], "missing_val"] = Nans.DATA_INSUFFICIENT
143+
# USAFacts is not a survey indicator
144+
df["se"] = np.nan
145+
df["sample_size"] = np.nan
131146

132-
# Mark any remaining nans with unknown
133-
remaining_nans_mask = df["val"].isnull() & (df["missing_val"] == Nans.NOT_MISSING)
134-
df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
147+
df = add_nancodes(df, smoother)
135148

136149
df.reset_index(inplace=True)
137150
sensor_name = SENSOR_NAME_MAP[sensor][0]
138-
# if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
151+
# if (SENSOR_NAME_MAP[sensor][1] or is_smooth_wip):
139152
# metric = f"wip_{metric}"
140153
# sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
141-
sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
154+
sensor_name = smoother_prefix + sensor_name
142155
exported_csv_dates = create_export_csv(
143156
df,
144157
export_dir=export_dir,
145-
start_date=SMOOTHERS_MAP[smoother][3](export_start_date),
158+
start_date=smoother_lag(export_start_date),
146159
metric=metric,
147160
geo_res=geo_res,
148161
sensor=sensor_name,

usafacts/tests/test_run.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,6 @@ def test_output_files_exist(self):
5252
for date in dates:
5353
for geo in geos:
5454
for metric in metrics:
55-
if "7dav" in metric and date in dates[:6]:
56-
continue # there are no 7dav signals for first 6 days
5755
expected_files += [date + "_" + geo + "_" + metric + ".csv"]
5856

5957
assert set(csv_files) == set(expected_files)
@@ -65,4 +63,15 @@ def test_output_file_format(self):
6563
df = pd.read_csv(
6664
join("receiving", "20200310_state_confirmed_cumulative_num.csv")
6765
)
68-
assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
66+
assert (
67+
df.columns.values
68+
== [
69+
"geo_id",
70+
"val",
71+
"se",
72+
"sample_size",
73+
"missing_val",
74+
"missing_se",
75+
"missing_sample_size",
76+
]
77+
).all()

0 commit comments

Comments
 (0)