15
15
import covidcast
16
16
import pandas as pd
17
17
18
- from delphi_utils import add_prefix , get_structured_logger
18
+ from delphi_utils import add_prefix , get_structured_logger , Nans
19
19
from delphi_utils .geomap import GeoMapper
20
20
from .constants import METRICS , SMOOTH_TYPES , SENSORS , GEO_RESOLUTIONS
21
21
@@ -292,6 +292,25 @@ def configure_range(params, range_param, yesterday, next_day):
292
292
date1 = params ['indicator' ]['export_start_date' ]
293
293
params ['indicator' ][range_param ] = [date1 , date2 ]
294
294
295
def add_nancodes(df):
    """Add missingness (nancode) columns to the dataframe.

    ``se`` and ``sample_size`` should already be nan with NOT_APPLICABLE,
    inheriting from USAFacts and JHU. Due to the geo aggregation, the
    missingness codes can get mixed up among rows, so for the time being we
    use only one missing code (OTHER) for nan values in the ``val`` column.

    Parameters
    ----------
    df: pd.DataFrame
        Data frame expected to contain at least a "val" column.

    Returns
    -------
    pd.DataFrame
        The same data frame (modified in place) with "missing_val",
        "missing_se", and "missing_sample_size" columns added.
    """
    # Default missingness codes for every row.
    df["missing_val"] = Nans.NOT_MISSING
    df["missing_se"] = Nans.NOT_APPLICABLE
    df["missing_sample_size"] = Nans.NOT_APPLICABLE

    # Rows with a nan `val`: the aggregation makes the true cause
    # unrecoverable here, so use the catch-all OTHER code.
    missing_mask = df["val"].isnull()
    df.loc[missing_mask, "missing_val"] = Nans.OTHER

    return df
295
314
def run_module (params ):
296
315
"""
297
316
Produce a combined cases and deaths signal using data from JHU and USA Facts.
@@ -332,7 +351,7 @@ def run_module(params):
332
351
extend_raw_date_range (params , sensor_name ),
333
352
logger ,
334
353
params ['indicator' ]['issue_range' ])
335
- df [ "timestamp" ] = pd . to_datetime (df [ "timestamp" ] )
354
+ df = add_nancodes (df )
336
355
start_date = pd .to_datetime (params ['indicator' ]['export_start_date' ])
337
356
export_dir = params ["common" ]["export_dir" ]
338
357
dates = pd .Series (
@@ -344,7 +363,12 @@ def run_module(params):
344
363
prefix = "wip_" )
345
364
for date_ in dates :
346
365
export_fn = f'{ date_ .strftime ("%Y%m%d" )} _{ geo_res } _{ signal_name [0 ]} .csv'
347
- df [df ["timestamp" ] == date_ ][["geo_id" , "val" , "se" , "sample_size" , ]].to_csv (
366
+ date_mask = (df ["timestamp" ] == date_ )
367
+ columns_to_write = [
368
+ "geo_id" , "val" , "se" , "sample_size" ,
369
+ "missing_val" , "missing_se" , "missing_sample_size"
370
+ ]
371
+ df .loc [date_mask , columns_to_write ].to_csv (
348
372
f"{ export_dir } /{ export_fn } " , index = False , na_rep = "NA"
349
373
)
350
374
0 commit comments