Skip to content

Commit 6a2c215

Browse files
committed
NANs USAFacts:
* keep nan values in the "val" column
* mark early smoothing data as "data insufficient"
* add missing column outputs
1 parent 08c9bd2 commit 6a2c215

File tree

2 files changed

+45
-23
lines changed

2 files changed

+45
-23
lines changed

usafacts/delphi_usafacts/run.py

+33-20
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from itertools import product
1010
from typing import Dict, Any
1111

12+
import pandas as pd
1213
import numpy as np
1314
from delphi_utils import (
1415
create_export_csv,
@@ -64,6 +65,27 @@
6465
]
6566

6667

68+
def add_nancodes(df, smoother):
    """Add missing-value codes ("nancodes") to the dataframe.

    Parameters
    ----------
    df: pd.DataFrame
        Frame indexed by (timestamp, geo_id) — the code reads
        ``df.index.min()[0]`` as a timestamp — with a "val" column.
    smoother: str
        Smoother key; "seven_day_average" triggers the early-window marking.

    Returns
    -------
    pd.DataFrame
        Same frame with "missing_val", "missing_se", and
        "missing_sample_size" columns populated.
    """
    idx = pd.IndexSlice

    # Default nancodes: values are present; se and sample_size are never
    # reported by this indicator, so they are "not applicable".
    df["missing_val"] = Nans.NOT_MISSING
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    # The first 6 days of a seven-day average have no complete window, so
    # mark them as DATA_INSUFFICIENT (the intent per the replaced inline
    # version of this logic), not PRIVACY, which denotes censored values.
    if smoother == "seven_day_average":
        df.sort_index(inplace=True)
        min_time_value = df.index.min()[0] + 6 * pd.Timedelta(days=1)
        df.loc[idx[:min_time_value, :], "missing_val"] = Nans.DATA_INSUFFICIENT

    # Any nan left in "val" at this point has no explained cause.
    remaining_nans_mask = df["val"].isnull() & (df["missing_val"] == Nans.NOT_MISSING)
    df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
    return df
88+
6789
def run_module(params: Dict[str, Dict[str, Any]]):
6890
"""Run the usafacts indicator.
6991
@@ -112,37 +134,28 @@ def run_module(params: Dict[str, Dict[str, Any]]):
112134
df = dfs[metric]
113135
# Aggregate to appropriate geographic resolution
114136
df = geo_map(df, geo_res, sensor)
115-
df["val"] = df[["geo_id", sensor]].groupby("geo_id")[sensor].transform(
116-
SMOOTHERS_MAP[smoother][0].smooth
117-
)
118-
df["se"] = np.nan
119-
df["sample_size"] = np.nan
137+
df.set_index(["timestamp", "geo_id"], inplace=True)
120138

121-
# Default missing code
122-
df["missing_val"] = Nans.NOT_MISSING
123-
df["missing_se"] = Nans.NOT_APPLICABLE
124-
df["missing_sample_size"] = Nans.NOT_APPLICABLE
139+
# Smooth
140+
smooth_obj, smoother_prefix, _, smoother_lag = SMOOTHERS_MAP[smoother]
141+
df["val"] = df[sensor].groupby(level=1).transform(smooth_obj.smooth)
125142

126-
# Mark early smoothing entries as data insufficient
127-
if smoother == "seven_day_average":
128-
df.sort_index(inplace=True)
129-
min_time_value = df.index.min()[0] + 6 * pd.Timedelta(days=1)
130-
df.loc[idx[:min_time_value, :], "missing_val"] = Nans.DATA_INSUFFICIENT
143+
# USAFacts is not a survey indicator
144+
df["se"] = np.nan
145+
df["sample_size"] = np.nan
131146

132-
# Mark any remaining nans with unknown
133-
remaining_nans_mask = df["val"].isnull() & (df["missing_val"] == Nans.NOT_MISSING)
134-
df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN
147+
df = add_nancodes(df, smoother)
135148

136149
df.reset_index(inplace=True)
137150
sensor_name = SENSOR_NAME_MAP[sensor][0]
138-
# if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
151+
# if (SENSOR_NAME_MAP[sensor][1] or is_smooth_wip):
139152
# metric = f"wip_{metric}"
140153
# sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
141-
sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
154+
sensor_name = smoother_prefix + sensor_name
142155
exported_csv_dates = create_export_csv(
143156
df,
144157
export_dir=export_dir,
145-
start_date=SMOOTHERS_MAP[smoother][3](export_start_date),
158+
start_date=smoother_lag(export_start_date),
146159
metric=metric,
147160
geo_res=geo_res,
148161
sensor=sensor_name,

usafacts/tests/test_run.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,6 @@ def test_output_files_exist(self):
5252
for date in dates:
5353
for geo in geos:
5454
for metric in metrics:
55-
if "7dav" in metric and date in dates[:6]:
56-
continue # there are no 7dav signals for first 6 days
5755
expected_files += [date + "_" + geo + "_" + metric + ".csv"]
5856

5957
assert set(csv_files) == set(expected_files)
@@ -65,4 +63,15 @@ def test_output_file_format(self):
6563
df = pd.read_csv(
6664
join("receiving", "20200310_state_confirmed_cumulative_num.csv")
6765
)
68-
assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
66+
assert (
67+
df.columns.values
68+
== [
69+
"geo_id",
70+
"val",
71+
"se",
72+
"sample_size",
73+
"missing_val",
74+
"missing_se",
75+
"missing_sample_size",
76+
]
77+
).all()

0 commit comments

Comments
 (0)