
Commit 6715ba3

Nans changehc:
* allow nan values, add missing columns, and test
1 parent 6113dfa commit 6715ba3
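
The new missing_val, missing_se, and missing_sample_size columns record why a field is NaN instead of leaving the reason implicit. The integer codes come from the Nans enum imported from delphi_utils below; their exact values are not shown in this diff, so the stand-in enum in the following sketch is purely illustrative (PRIVACY = 3 is assumed only so it lines up with the missing_val.eq(3) assertion in the updated test):

from enum import IntEnum

# Illustrative stand-in for delphi_utils.Nans -- names taken from this diff,
# integer values assumed for the sketch (PRIVACY = 3 matches the test's eq(3) check).
class Nans(IntEnum):
    NOT_MISSING = 0
    NOT_APPLICABLE = 1
    PRIVACY = 3

NA = "NA"

# A reported value whose standard error is suppressed for privacy:
row = "%s,%f,%s,%s,%s,%d,%d,%d\n" % (
    "pa", 0.5, NA, NA, NA,
    Nans.NOT_MISSING.value, Nans.PRIVACY.value, Nans.PRIVACY.value,
)
print(row, end="")  # pa,0.500000,NA,NA,NA,0,3,3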

File tree

2 files changed: +40 −18 lines changed


changehc/delphi_changehc/update_sensor.py

+24 −5
@@ -11,7 +11,7 @@
 # third party
 import numpy as np
 import pandas as pd
-from delphi_utils import GeoMapper, read_params, add_prefix
+from delphi_utils import GeoMapper, add_prefix, Nans

 # first party
 from .config import Config
@@ -46,7 +46,7 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."):
             out_name,
         )
         with open(filename, "w") as outfile:
-            outfile.write("geo_id,val,se,direction,sample_size\n")
+            outfile.write("geo_id,val,se,direction,sample_size,missing_val,missing_se,missing_sample_size\n")
             for geo_id in geo_ids:
                 sensor = all_rates[geo_id][i]
                 se = all_se[geo_id][i]
@@ -57,17 +57,36 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."):
                         logging.warning("value suspiciously high, {0}: {1}".format(
                             geo_id, sensor
                         ))
-                    assert se < 5, f"se suspiciously high, {geo_id}: {se}"
+                    assert se < 5, "se is suspiciously high, {0}: {1}".format(
+                        geo_id, sensor
+                    )
                     if write_se:
                         assert sensor > 0 and se > 0, "p=0, std_err=0 invalid"
                         outfile.write(
-                            "%s,%f,%s,%s,%s\n" % (geo_id, sensor, se, NA, NA))
+                            "%s,%f,%s,%s,%s,%d,%d,%d\n" % (
+                                geo_id, sensor, se, NA, NA,
+                                Nans.NOT_MISSING.value, Nans.NOT_MISSING.value, Nans.PRIVACY.value
+                            )
+                        )
                     else:
                         # for privacy reasons we will not report the standard error
                         outfile.write(
-                            "%s,%f,%s,%s,%s\n" % (geo_id, sensor, NA, NA, NA)
+                            "%s,%f,%s,%s,%s,%d,%d,%d\n" % (
+                                geo_id, sensor, NA, NA, NA,
+                                Nans.NOT_MISSING.value, Nans.PRIVACY.value, Nans.PRIVACY.value
+                            )
                         )
                     out_n += 1
+                else:
+                    logging.warning("writing insufficient data for geo_id {0}, {1}".format(
+                        geo_id, i
+                    ))
+                    outfile.write(
+                        "%s,%s,%s,%s,%s,%d,%d,%d\n" % (
+                            geo_id, NA, NA, NA, NA,
+                            Nans.PRIVACY.value, Nans.PRIVACY.value, Nans.NOT_APPLICABLE.value
+                        )
+                    )
     logging.debug("wrote {0} rows for {1} {2}".format(
         out_n, len(geo_ids), geo_level
     ))
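
With the extra columns in place, downstream consumers can tell a privacy-suppressed field apart from a reported one. A minimal sketch of how such a file might be read, assuming output from the updated write_to_csv and the illustrative codes above (the filename is hypothetical):

import pandas as pd

# Hypothetical output file; any CSV written by the updated function has these eight columns.
df = pd.read_csv("20200502_geography_name_of_signal.csv")

# missing_val == 0 (NOT_MISSING) means val holds real data; anything else is a reason code.
reported = df[df["missing_val"] == 0]
suppressed_se = df[df["missing_se"] != 0][["geo_id", "missing_se"]]
print(len(reported), "reported rows")
print(suppressed_se)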

changehc/tests/test_update_sensor.py

+16 −13
@@ -93,7 +93,8 @@ def test_geo_reindex(self):
     def test_update_sensor(self):
         """Tests that the sensors are properly updated."""
         outputs = {}
-        for geo in ["county", "state", "hhs", "nation"]:
+        geos = ["county", "state", "hhs", "nation"]
+        for geo in geos:
             td = TemporaryDirectory()
             su_inst = CHCSensorUpdator(
                 "03-01-2020",
@@ -116,17 +117,17 @@ def test_update_sensor(self):
                 "den": [30, 50, 50, 10, 1, 5, 5, 50, 50, 50, 0, 0, 0] * 2,
                 "date": list(pd.date_range("20200301", "20200313")) * 2}).set_index(
                 ["fips", "date"])
+            # breakpoint()
             su_inst.update_sensor(small_test_data, td.name)
             for f in os.listdir(td.name):
                 outputs[f] = pd.read_csv(os.path.join(td.name, f))
             assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\
                 f"failed {geo} update sensor test"
             td.cleanup()
-        assert outputs["20200319_county_smoothed_outpatient_covid.csv"].empty
-        assert outputs["20200319_state_smoothed_outpatient_covid.csv"].empty
-        assert outputs["20200319_hhs_smoothed_outpatient_covid.csv"].empty
-        assert outputs["20200319_nation_smoothed_outpatient_covid.csv"].empty
-
+        value_columns = ["val", "se", "direction", "sample_size"]
+        for geo in geos:
+            assert np.isnan(outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"][value_columns]).all().all()
+            assert outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"]["missing_val"].eq(3).all()

 class TestWriteToCsv:
     """Tests for writing output files to CSV."""
@@ -161,8 +162,9 @@ def test_write_to_csv_results(self):
         expected_name = "20200502_geography_name_of_signal.csv"
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
+        expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
         assert (output_data.geo_id == ["a", "b"]).all()
         assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
@@ -175,11 +177,12 @@ def test_write_to_csv_results(self):
         expected_name = "20200503_geography_name_of_signal.csv"
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
+
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
-        assert (output_data.geo_id == ["a"]).all()
-        assert np.array_equal(output_data.val.values, np.array([0.5]))
+        assert (output_data.geo_id == ["a", "b"]).all()
+        assert np.array_equal(output_data.val.values, np.array([0.5, np.nan]), equal_nan=True)
         assert np.isnan(output_data.se.values).all()
         assert np.isnan(output_data.direction.values).all()
         assert np.isnan(output_data.sample_size.values).all()
@@ -188,7 +191,7 @@ def test_write_to_csv_results(self):
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
         assert (output_data.geo_id == ["a", "b"]).all()
         assert np.array_equal(output_data.val.values, np.array([1.5, 3]))
@@ -224,13 +227,13 @@ def test_write_to_csv_with_se_results(self):

         td = TemporaryDirectory()
         write_to_csv(res0, True, "name_of_signal", td.name)
-
         # check outputs
         expected_name = "20200502_geography_name_of_signal.csv"
+        expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
         assert (output_data.geo_id == ["a", "b"]).all()
         assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
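
The rewritten assertions in test_update_sensor rely on two pandas idioms: np.isnan over a sliced DataFrame followed by a double .all() (first per column, then across the resulting Series), and Series.eq to compare every missing_val against a single code. A self-contained toy example of the same pattern, unrelated to the real test fixtures:

import numpy as np
import pandas as pd

# Toy frame standing in for an "insufficient data" output file.
df = pd.DataFrame({
    "val": [np.nan, np.nan],
    "se": [np.nan, np.nan],
    "sample_size": [np.nan, np.nan],
    "missing_val": [3, 3],
})

value_columns = ["val", "se", "sample_size"]
# First .all() reduces each column to one bool, the second reduces that Series to a scalar.
assert np.isnan(df[value_columns]).all().all()
# eq(3) builds a boolean Series; .all() confirms every row carries the same missing code.
assert df["missing_val"].eq(3).all()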
