Skip to content

Commit 40e7e0f

Browse files
committed
Nans nchs:
* add missing column handling
* update export function and tests
* gitignore cache csvs
1 parent d3e44ce commit 40e7e0f

File tree

5 files changed

+41
-8
lines changed

5 files changed

+41
-8
lines changed

nchs_mortality/.gitignore

+3
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,9 @@ params.json
55

66
# Do not commit output files
77
receiving/*.csv
8+
daily_receiving/*.csv
9+
cache/*.csv
10+
daily_cache/*.csv
811

912
# Do not commit test files
1013
tests/receiving/*.csv

nchs_mortality/delphi_nchs_mortality/export.py

+5-1
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,11 @@ def export_csv(df, geo_name, sensor, export_dir, start_date):
2828
t = Week.fromdate(pd.to_datetime(str(date)))
2929
date_short = "weekly_" + str(t.year) + str(t.week).zfill(2)
3030
export_fn = f"{date_short}_{geo_name}_{sensor}.csv"
31-
result_df = df[df["timestamp"] == date][["geo_id", "val", "se", "sample_size"]]
31+
expected_columns = [
32+
"geo_id", "val", "se", "sample_size",
33+
"missing_val", "missing_se", "missing_sample_size"
34+
]
35+
result_df = df[df["timestamp"] == date][expected_columns]
3236
result_df.to_csv(f"{export_dir}/{export_fn}",
3337
index=False,
3438
float_format="%.8f")

nchs_mortality/delphi_nchs_mortality/run.py

+17-3
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from typing import Dict, Any
1010

1111
import numpy as np
12-
from delphi_utils import S3ArchiveDiffer, get_structured_logger
12+
from delphi_utils import S3ArchiveDiffer, get_structured_logger, Nans
1313

1414
from .archive_diffs import arch_diffs
1515
from .constants import (METRICS, SENSOR_NAME_MAP,
@@ -18,6 +18,18 @@
1818
from .pull import pull_nchs_mortality_data
1919

2020

21+
def add_nancodes(df):
    """Attach the missingness-code columns to ``df`` and return it.

    ``missing_val`` starts as ``Nans.NOT_MISSING``, while ``missing_se`` and
    ``missing_sample_size`` are set to ``Nans.NOT_APPLICABLE``. Every row whose
    ``val`` is null then has ``missing_val`` overwritten with ``Nans.UNKNOWN``.
    The columns are assigned on the frame that was passed in.
    """
    # Default code for each missingness column, in output column order.
    default_codes = {
        "missing_val": Nans.NOT_MISSING,
        "missing_se": Nans.NOT_APPLICABLE,
        "missing_sample_size": Nans.NOT_APPLICABLE,
    }
    for column, code in default_codes.items():
        df[column] = code

    # Values that are still null get the "unknown" code.
    df.loc[df["val"].isnull(), "missing_val"] = Nans.UNKNOWN
    return df
32+
2133
def run_module(params: Dict[str, Any]):
2234
"""Run module for processing NCHS mortality data.
2335
@@ -67,7 +79,8 @@ def run_module(params: Dict[str, Any]):
6779
df["val"] = df[metric]
6880
df["se"] = np.nan
6981
df["sample_size"] = np.nan
70-
df = df[~df["val"].isnull()]
82+
df = add_nancodes(df)
83+
# df = df[~df["val"].isnull()]
7184
sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
7285
export_csv(
7386
df,
@@ -86,7 +99,8 @@ def run_module(params: Dict[str, Any]):
8699
df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
87100
df["se"] = np.nan
88101
df["sample_size"] = np.nan
89-
df = df[~df["val"].isnull()]
102+
df = add_nancodes(df)
103+
# df = df[~df["val"].isnull()]
90104
sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
91105
export_csv(
92106
df,

nchs_mortality/tests/test_export.py

+11-3
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import pandas as pd
55

66
from delphi_nchs_mortality.export import export_csv
7+
from delphi_utils import Nans
78

89

910
class TestExport:
@@ -16,7 +17,10 @@ def test_export(self):
1617
"val": [0, 2, 3, 5, 10, 12],
1718
"timestamp": [datetime(2020, 6, 2), datetime(2020, 6, 9)] * 3,
1819
"se": [0.01, 0.02, 0.01, 0.01, 0.005, 0.01],
19-
"sample_size": [100, 200, 500, 50, 80, 10]
20+
"sample_size": [100, 200, 500, 50, 80, 10],
21+
"missing_val": [Nans.NOT_MISSING] * 6,
22+
"missing_se": [Nans.NOT_MISSING] * 6,
23+
"missing_sample_size": [Nans.NOT_MISSING] * 6,
2024
}
2125
)
2226

@@ -34,7 +38,11 @@ def test_export(self):
3438

3539
output_data = pd.read_csv(join("./receiving", expected_name))
3640

37-
assert (output_data.columns == ["geo_id", "val", "se", "sample_size"]).all()
41+
expected_columns = [
42+
"geo_id", "val", "se", "sample_size",
43+
"missing_val", "missing_se", "missing_sample_size"
44+
]
45+
assert (output_data.columns == expected_columns).all()
3846
assert (output_data.geo_id == ["a", "b", "c"]).all()
3947
assert (output_data.se.values == [0.01, 0.01, 0.005]).all()
4048
assert (output_data.sample_size.values == [100, 500, 80]).all()
@@ -45,7 +53,7 @@ def test_export(self):
4553

4654
output_data = pd.read_csv(join("./receiving", expected_name))
4755

48-
assert (output_data.columns == ["geo_id", "val", "se", "sample_size"]).all()
56+
assert (output_data.columns == expected_columns).all()
4957
assert (output_data.geo_id == ["a", "b", "c"]).all()
5058
assert (output_data.se.values == [0.02, 0.01, 0.01]).all()
5159
assert (output_data.sample_size.values == [200, 50, 10]).all()

nchs_mortality/tests/test_run.py

+5-1
Original file line number | Diff line number | Diff line change
@@ -60,4 +60,8 @@ def test_output_file_format(self, run_as_module, date):
6060
df = pd.read_csv(
6161
join(output_folder, "weekly_202026_state_deaths_covid_incidence_prop.csv")
6262
)
63-
assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()
63+
expected_columns = [
64+
"geo_id", "val", "se", "sample_size",
65+
"missing_val", "missing_se", "missing_sample_size"
66+
]
67+
assert (df.columns.values == expected_columns).all()

0 commit comments

Comments
 (0)