
Commit 6715ba3

Nans changehc:
* allow nan values, add missing columns, and test
1 parent 6113dfa commit 6715ba3
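
The new missing_val, missing_se, and missing_sample_size columns record why a field is NaN instead of leaving the reason implicit. The integer codes come from the Nans enum imported from delphi_utils below; their exact values are not shown in this diff, so the stand-in enum in the following sketch is purely illustrative (PRIVACY = 3 is assumed only so it lines up with the missing_val.eq(3) assertion in the updated test):

from enum import IntEnum

# Illustrative stand-in for delphi_utils.Nans -- names taken from this diff,
# integer values assumed for the sketch (PRIVACY = 3 matches the test's eq(3) check).
class Nans(IntEnum):
    NOT_MISSING = 0
    NOT_APPLICABLE = 1
    PRIVACY = 3

NA = "NA"

# A reported value whose standard error is suppressed for privacy:
row = "%s,%f,%s,%s,%s,%d,%d,%d\n" % (
    "pa", 0.5, NA, NA, NA,
    Nans.NOT_MISSING.value, Nans.PRIVACY.value, Nans.PRIVACY.value,
)
print(row, end="")  # pa,0.500000,NA,NA,NA,0,3,3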

File tree

2 files changed: +40 −18 lines changed


changehc/delphi_changehc/update_sensor.py

+24 −5
@@ -11,7 +11,7 @@
 # third party
 import numpy as np
 import pandas as pd
-from delphi_utils import GeoMapper, read_params, add_prefix
+from delphi_utils import GeoMapper, add_prefix, Nans

 # first party
 from .config import Config
@@ -46,7 +46,7 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."):
             out_name,
         )
         with open(filename, "w") as outfile:
-            outfile.write("geo_id,val,se,direction,sample_size\n")
+            outfile.write("geo_id,val,se,direction,sample_size,missing_val,missing_se,missing_sample_size\n")
             for geo_id in geo_ids:
                 sensor = all_rates[geo_id][i]
                 se = all_se[geo_id][i]
@@ -57,17 +57,36 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."):
                         logging.warning("value suspiciously high, {0}: {1}".format(
                             geo_id, sensor
                         ))
-                    assert se < 5, f"se suspiciously high, {geo_id}: {se}"
+                    assert se < 5, "se is suspiciously high, {0}: {1}".format(
+                        geo_id, sensor
+                    )
                     if write_se:
                         assert sensor > 0 and se > 0, "p=0, std_err=0 invalid"
                         outfile.write(
-                            "%s,%f,%s,%s,%s\n" % (geo_id, sensor, se, NA, NA))
+                            "%s,%f,%s,%s,%s,%d,%d,%d\n" % (
+                                geo_id, sensor, se, NA, NA,
+                                Nans.NOT_MISSING.value, Nans.NOT_MISSING.value, Nans.PRIVACY.value
+                            )
+                        )
                     else:
                         # for privacy reasons we will not report the standard error
                         outfile.write(
-                            "%s,%f,%s,%s,%s\n" % (geo_id, sensor, NA, NA, NA)
+                            "%s,%f,%s,%s,%s,%d,%d,%d\n" % (
+                                geo_id, sensor, NA, NA, NA,
+                                Nans.NOT_MISSING.value, Nans.PRIVACY.value, Nans.PRIVACY.value
+                            )
                         )
                     out_n += 1
+                else:
+                    logging.warning("writing insufficient data for geo_id {0}, {1}".format(
+                        geo_id, i
+                    ))
+                    outfile.write(
+                        "%s,%s,%s,%s,%s,%d,%d,%d\n" % (
+                            geo_id, NA, NA, NA, NA,
+                            Nans.PRIVACY.value, Nans.PRIVACY.value, Nans.NOT_APPLICABLE.value
+                        )
+                    )
     logging.debug("wrote {0} rows for {1} {2}".format(
         out_n, len(geo_ids), geo_level
     ))
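
With the extra columns in place, downstream consumers can tell a privacy-suppressed field apart from a reported one. A minimal sketch of how such a file might be read, assuming output from the updated write_to_csv and the illustrative codes above (the filename is hypothetical):

import pandas as pd

# Hypothetical output file; any CSV written by the updated function has these eight columns.
df = pd.read_csv("20200502_geography_name_of_signal.csv")

# missing_val == 0 (NOT_MISSING) means val holds real data; anything else is a reason code.
reported = df[df["missing_val"] == 0]
suppressed_se = df[df["missing_se"] != 0][["geo_id", "missing_se"]]
print(len(reported), "reported rows")
print(suppressed_se)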

changehc/tests/test_update_sensor.py

+16 −13
@@ -93,7 +93,8 @@ def test_geo_reindex(self):
     def test_update_sensor(self):
         """Tests that the sensors are properly updated."""
         outputs = {}
-        for geo in ["county", "state", "hhs", "nation"]:
+        geos = ["county", "state", "hhs", "nation"]
+        for geo in geos:
             td = TemporaryDirectory()
             su_inst = CHCSensorUpdator(
                 "03-01-2020",
@@ -116,17 +117,17 @@ def test_update_sensor(self):
                 "den": [30, 50, 50, 10, 1, 5, 5, 50, 50, 50, 0, 0, 0] * 2,
                 "date": list(pd.date_range("20200301", "20200313")) * 2}).set_index(
                 ["fips", "date"])
+            # breakpoint()
             su_inst.update_sensor(small_test_data, td.name)
             for f in os.listdir(td.name):
                 outputs[f] = pd.read_csv(os.path.join(td.name, f))
             assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\
                 f"failed {geo} update sensor test"
             td.cleanup()
-        assert outputs["20200319_county_smoothed_outpatient_covid.csv"].empty
-        assert outputs["20200319_state_smoothed_outpatient_covid.csv"].empty
-        assert outputs["20200319_hhs_smoothed_outpatient_covid.csv"].empty
-        assert outputs["20200319_nation_smoothed_outpatient_covid.csv"].empty
-
+        value_columns = ["val", "se", "direction", "sample_size"]
+        for geo in geos:
+            assert np.isnan(outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"][value_columns]).all().all()
+            assert outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"]["missing_val"].eq(3).all()

 class TestWriteToCsv:
     """Tests for writing output files to CSV."""
@@ -161,8 +162,9 @@ def test_write_to_csv_results(self):
         expected_name = "20200502_geography_name_of_signal.csv"
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
+        expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
         assert (output_data.geo_id == ["a", "b"]).all()
         assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
@@ -175,11 +177,12 @@ def test_write_to_csv_results(self):
         expected_name = "20200503_geography_name_of_signal.csv"
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
+
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
-        assert (output_data.geo_id == ["a"]).all()
-        assert np.array_equal(output_data.val.values, np.array([0.5]))
+        assert (output_data.geo_id == ["a", "b"]).all()
+        assert np.array_equal(output_data.val.values, np.array([0.5, np.nan]), equal_nan=True)
         assert np.isnan(output_data.se.values).all()
         assert np.isnan(output_data.direction.values).all()
         assert np.isnan(output_data.sample_size.values).all()
@@ -188,7 +191,7 @@ def test_write_to_csv_results(self):
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
         assert (output_data.geo_id == ["a", "b"]).all()
         assert np.array_equal(output_data.val.values, np.array([1.5, 3]))
@@ -224,13 +227,13 @@ def test_write_to_csv_with_se_results(self):

         td = TemporaryDirectory()
         write_to_csv(res0, True, "name_of_signal", td.name)
-
         # check outputs
         expected_name = "20200502_geography_name_of_signal.csv"
+        expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
         assert exists(join(td.name, expected_name))
         output_data = pd.read_csv(join(td.name, expected_name))
         assert (
-            output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
+            output_data.columns == expected_columns
         ).all()
         assert (output_data.geo_id == ["a", "b"]).all()
         assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
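
The rewritten assertions in test_update_sensor rely on two pandas idioms: np.isnan over a sliced DataFrame followed by a double .all() (first per column, then across the resulting Series), and Series.eq to compare every missing_val against a single code. A self-contained toy example of the same pattern, unrelated to the real test fixtures:

import numpy as np
import pandas as pd

# Toy frame standing in for an "insufficient data" output file.
df = pd.DataFrame({
    "val": [np.nan, np.nan],
    "se": [np.nan, np.nan],
    "sample_size": [np.nan, np.nan],
    "missing_val": [3, 3],
})

value_columns = ["val", "se", "sample_size"]
# First .all() reduces each column to one bool, the second reduces that Series to a scalar.
assert np.isnan(df[value_columns]).all().all()
# eq(3) builds a boolean Series; .all() confirms every row carries the same missing code.
assert df["missing_val"].eq(3).all()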
