Skip to content

Commit aefab74

Browse files
committed
Nans cdc_covidnet:
* keep nan values, add missing columns, add tests
1 parent 9b75e07 commit aefab74

File tree

3 files changed

+29
-3
lines changed

3 files changed

+29
-3
lines changed

cdc_covidnet/delphi_cdc_covidnet/update_sensor.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import numpy as np
1313
import pandas as pd
14-
from delphi_utils import GeoMapper, add_prefix
14+
from delphi_utils import GeoMapper, add_prefix, Nans
1515

1616
from .api_config import APIConfig
1717
from .constants import SIGNALS
@@ -47,6 +47,20 @@ def write_to_csv(data: pd.DataFrame, out_name: str, output_path: str):
4747
sub_df.drop("epiweek", axis=1).to_csv(filename, na_rep="NA")
4848

4949

50+
def add_nancodes(df: pd.DataFrame) -> pd.DataFrame:
    """Add the missingness-code columns to a signal dataframe.

    Three columns are written: ``missing_val``, ``missing_se`` and
    ``missing_sample_size``. The dataframe is modified in place and also
    returned so the call can be used in an assignment.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe with a ``val`` column.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, with the three ``missing_*`` columns set.
    """
    # Default missing code: every row starts out as "not missing".
    df["missing_val"] = Nans.NOT_MISSING

    # Only rows whose value is actually absent get flagged. The mask must be
    # the null rows themselves; negating it would tag every present value as
    # UNKNOWN and leave the real gaps marked NOT_MISSING.
    missing_mask = df["val"].isnull()
    df.loc[missing_mask, "missing_val"] = Nans.UNKNOWN

    # Fill in remaining expected columns
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    return df
62+
63+
5064
def update_sensor(
5165
state_files: List[str],
5266
mmwr_info: pd.DataFrame,
@@ -99,10 +113,11 @@ def update_sensor(
99113
assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
100114
hosp_df.set_index(["date", "geo_id"], inplace=True)
101115

102-
# Fill in remaining expected columns
103116
hosp_df["se"] = np.nan
104117
hosp_df["sample_size"] = np.nan
105118

119+
hosp_df = add_nancodes(hosp_df)
120+
106121
# Write results
107122
signals = add_prefix(SIGNALS,
108123
wip_signal=wip_signal,

cdc_covidnet/tests/test_run.py

+2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas.testing import assert_frame_equal
66

77
from delphi_cdc_covidnet.run import run_module
8+
from delphi_cdc_covidnet.update_sensor import add_nancodes
89

910

1011
class TestRun:
@@ -55,5 +56,6 @@ def test_match_old_to_new_output(self):
5556

5657
# Contents match
5758
expected_df = pd.read_csv(join("receiving_test", fname))
59+
expected_df = add_nancodes(expected_df)
5860
actual_df = pd.read_csv(join("receiving", fname))
5961
assert_frame_equal(expected_df, actual_df, check_less_precise=5)

cdc_covidnet/tests/test_update_sensor.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,16 @@ def test_syn_update_sensor(self):
9595

9696
for i, exp_file in enumerate(expected_files):
9797
data = pd.read_csv(exp_file)
98-
assert (data.columns == ["geo_id", "val", "se", "sample_size"]).all()
98+
expected_columns = [
99+
"geo_id",
100+
"val",
101+
"se",
102+
"sample_size",
103+
"missing_val",
104+
"missing_se",
105+
"missing_sample_size"
106+
]
107+
assert (data.columns == expected_columns).all()
99108

100109
# Check data for NA
101110
assert (~pd.isna(data["geo_id"])).all()

0 commit comments

Comments
 (0)