Skip to content

Commit 44cd861

Browse files
committed
Nans combo:
* add missing columns and tests
1 parent 0b7103a commit 44cd861

File tree

2 files changed

+45
-3
lines changed

2 files changed

+45
-3
lines changed

combo_cases_and_deaths/delphi_combo_cases_and_deaths/run.py

+27-3
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import covidcast
1616
import pandas as pd
1717

18-
from delphi_utils import add_prefix, get_structured_logger
18+
from delphi_utils import add_prefix, get_structured_logger, Nans
1919
from delphi_utils.geomap import GeoMapper
2020
from .constants import METRICS, SMOOTH_TYPES, SENSORS, GEO_RESOLUTIONS
2121

@@ -292,6 +292,25 @@ def configure_range(params, range_param, yesterday, next_day):
292292
date1 = params['indicator']['export_start_date']
293293
params['indicator'][range_param] = [date1, date2]
294294

295+
def add_nancodes(df):
    """Attach the three missingness-code columns to `df`.

    The `se` and `sample_size` values are expected to already be nan
    (inherited from USAFacts and JHU), so their codes are always
    NOT_APPLICABLE. Geo aggregation mixes the upstream missingness codes
    among rows, so for the time being a single code (OTHER) marks every
    nan in the `val` column.

    Parameters
    ----------
    df:
        DataFrame with a `val` column. It is modified in place.

    Returns
    -------
    The same DataFrame, with `missing_val`, `missing_se` and
    `missing_sample_size` columns added (in that order).
    """
    # Capture which rows lack a value before any columns are added.
    val_is_missing = df["val"].isnull()

    # `val` is NOT_MISSING unless it is nan, in which case it is OTHER.
    df["missing_val"] = Nans.NOT_MISSING
    df.loc[val_is_missing, "missing_val"] = Nans.OTHER

    # This indicator never reports a standard error or a sample size.
    for code_column in ("missing_se", "missing_sample_size"):
        df[code_column] = Nans.NOT_APPLICABLE

    return df
313+
295314
def run_module(params):
296315
"""
297316
Produce a combined cases and deaths signal using data from JHU and USA Facts.
@@ -332,7 +351,7 @@ def run_module(params):
332351
extend_raw_date_range(params, sensor_name),
333352
logger,
334353
params['indicator']['issue_range'])
335-
df["timestamp"] = pd.to_datetime(df["timestamp"])
354+
df = add_nancodes(df)
336355
start_date = pd.to_datetime(params['indicator']['export_start_date'])
337356
export_dir = params["common"]["export_dir"]
338357
dates = pd.Series(
@@ -344,7 +363,12 @@ def run_module(params):
344363
prefix="wip_")
345364
for date_ in dates:
346365
export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
347-
df[df["timestamp"] == date_][["geo_id", "val", "se", "sample_size", ]].to_csv(
366+
date_mask = (df["timestamp"] == date_)
367+
columns_to_write = [
368+
"geo_id", "val", "se", "sample_size",
369+
"missing_val", "missing_se", "missing_sample_size"
370+
]
371+
df.loc[date_mask, columns_to_write].to_csv(
348372
f"{export_dir}/{export_fn}", index=False, na_rep="NA"
349373
)
350374

combo_cases_and_deaths/tests/test_run.py

+18
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@
1111
from delphi_combo_cases_and_deaths.run import (
1212
run_module,
1313
extend_raw_date_range,
14+
add_nancodes,
15+
extend_raw_date_range,
1416
get_updated_dates,
1517
sensor_signal,
1618
combine_usafacts_and_jhu,
1719
compute_special_geo_dfs,
1820
COLUMN_MAPPING)
1921
from delphi_combo_cases_and_deaths.constants import METRICS, SMOOTH_TYPES, SENSORS
2022
from delphi_utils.geomap import GeoMapper
23+
from delphi_utils import Nans
2124

2225
TEST_LOGGER = logging.getLogger()
2326

@@ -301,5 +304,20 @@ def test_output_files(mock_combine):
301304
expected_files += [date + "_" + geo + "_" + metric + ".csv"]
302305
assert set(csv_files) == set(expected_files)
303306

307+
def test_add_nancodes():
    """Check that add_nancodes appends the expected missingness columns.

    Rows with a value get NOT_MISSING in `missing_val`, the row whose
    `val` is None gets OTHER, and `missing_se` / `missing_sample_size`
    are NOT_APPLICABLE on every row.
    """
    input_columns = {
        "geo_id": ["01000", "01001", "01001"],
        "val": [50, 100, None],
        "timestamp": [20200101, 20200101, 20200101],
    }
    nancode_columns = {
        "missing_val": [Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER],
        "missing_se": [Nans.NOT_APPLICABLE] * 3,
        "missing_sample_size": [Nans.NOT_APPLICABLE] * 3,
    }
    # The expected frame is the input columns followed by the code columns.
    expected_df = pd.DataFrame({**input_columns, **nancode_columns})

    actual_df = add_nancodes(pd.DataFrame(input_columns))

    pd.testing.assert_frame_equal(actual_df, expected_df)
320+
321+
304322
if __name__ == '__main__':
305323
unittest.main()

0 commit comments

Comments
 (0)