Skip to content

Commit 7c2be71

Browse files
committed
NaNs Safegraph:
* Add missingness columns to the Safegraph export * Mark the standard error as "Data Insufficient" when it is missing * Add tests
1 parent f45cb9c commit 7c2be71

File tree

3 files changed

+74
-17
lines changed

3 files changed

+74
-17
lines changed

safegraph/delphi_safegraph/process.py

+28-6
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
from typing import List
55
import numpy as np
66
import pandas as pd
7-
from delphi_utils.signal import add_prefix
8-
from delphi_utils.export import create_export_csv
9-
from delphi_utils.geomap import GeoMapper
7+
from delphi_utils import (
8+
add_prefix,
9+
create_export_csv,
10+
GeoMapper,
11+
NAN_CODES,
12+
)
1013

1114
from .constants import HOME_DWELL, COMPLETELY_HOME, FULL_TIME_WORK, PART_TIME_WORK, GEO_RESOLUTIONS
1215

@@ -172,7 +175,7 @@ def process_window(df_list: List[pd.DataFrame],
172175
173176
Parameters
174177
----------
175-
cbg_df: pd.DataFrame
178+
df_list: List[pd.DataFrame]
176179
list of census block group-level frames.
177180
signal_names: List[str]
178181
signal names to be processed
@@ -192,15 +195,34 @@ def process_window(df_list: List[pd.DataFrame],
192195
for geo_res in geo_resolutions:
193196
aggregated_df = aggregate(cbg_df, signal_names, geo_res)
194197
for signal in signal_names:
198+
columns_to_export = (
199+
['geo_id'] +
200+
[f'{signal}_{x}' for x in ('mean', 'se', 'n')]
201+
)
195202
df_export = aggregated_df[
196-
['geo_id']
197-
+ [f'{signal}_{x}' for x in ('mean', 'se', 'n')]
203+
columns_to_export
198204
].rename({
199205
f'{signal}_mean': 'val',
200206
f'{signal}_se': 'se',
201207
f'{signal}_n': 'sample_size',
202208
}, axis=1)
203209
df_export["timestamp"] = date.strftime('%Y%m%d')
210+
211+
# Default missingness codes
212+
df_export["missing_val"] = NAN_CODES["Not Missing"]
213+
df_export["missing_se"] = NAN_CODES["Not Missing"]
214+
# Sample size will never be missing in this indicator
215+
# since sample_size just counts the presence of rows for a geo region
216+
df_export["missing_sample_size"] = NAN_CODES["Not Missing"]
217+
# Add missingness codes as detected
218+
# This may occur if all the values are missing for a geographic region
219+
remaining_nans_mask = df_export["val"].isnull()
220+
df_export.loc[remaining_nans_mask, "missing_val"] = NAN_CODES["Unknown"]
221+
# This may occur if all the values are missing for a geographic region
222+
# or if the sample size is 1
223+
remaining_nans_mask = df_export["se"].isnull()
224+
df_export.loc[remaining_nans_mask, "missing_se"] = NAN_CODES["Data Insufficient"]
225+
204226
create_export_csv(df_export,
205227
export_dir,
206228
geo_res,

safegraph/tests/test_process.py

+36-9
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ def test_process_window(self, tmp_path):
155155
'geo_id': [1053, 1073],
156156
'val': [0.04, 0.14],
157157
'se': [0.02, 0.10],
158-
'sample_size': [2, 2]
158+
'sample_size': [2, 2],
159+
'missing_val': [0, 0],
160+
'missing_se': [0, 0],
161+
'missing_sample_size': [0, 0],
159162
})
160163
actual = pd.read_csv(
161164
export_dir / '20200214_county_completely_home_prop.csv')
@@ -183,49 +186,73 @@ def test_process(self, tmp_path):
183186
'geo_id': ['al', 'ga'],
184187
'val': [6, 3.5],
185188
'se': [None, 0.5],
186-
'sample_size': [1, 2]
189+
'sample_size': [1, 2],
190+
'missing_val': [0, 0],
191+
'missing_se': [4, 0],
192+
'missing_sample_size': [0, 0],
187193
}),
188194
'completely_home_prop': pd.DataFrame(data={
189195
'geo_id': ['al', 'ga'],
190196
'val': [0.15, 0.055],
191197
'se': [None, 0.005],
192-
'sample_size': [1, 2]
198+
'sample_size': [1, 2],
199+
'missing_val': [0, 0],
200+
'missing_se': [4, 0],
201+
'missing_sample_size': [0, 0],
193202
}),
194203
'part_time_work_prop': pd.DataFrame(data={
195204
'geo_id': ['al', 'ga'],
196205
'val': [0.35, 0.055],
197206
'se': [None, 0.005],
198-
'sample_size': [1, 2]
207+
'sample_size': [1, 2],
208+
'missing_val': [0, 0],
209+
'missing_se': [4, 0],
210+
'missing_sample_size': [0, 0],
199211
}),
200212
'full_time_work_prop': pd.DataFrame(data={
201213
'geo_id': ['al', 'ga'],
202214
'val': [0.45, 0.055],
203215
'se': [None, 0.005],
204-
'sample_size': [1, 2]
216+
'sample_size': [1, 2],
217+
'missing_val': [0, 0],
218+
'missing_se': [4, 0],
219+
'missing_sample_size': [0, 0],
205220
}),
206221
'median_home_dwell_time_7dav': pd.DataFrame(data={
207222
'geo_id': ['al', 'ga', 'pa'],
208223
'val': [4.5, 3.5, 7.5],
209224
'se': [1.5, 0.5, 0.5],
210-
'sample_size': [2, 2, 2]
225+
'sample_size': [2, 2, 2],
226+
'missing_val': [0, 0, 0],
227+
'missing_se': [0, 0, 0],
228+
'missing_sample_size': [0, 0, 0],
211229
}),
212230
'wip_completely_home_prop_7dav': pd.DataFrame(data={
213231
'geo_id': ['al', 'ga', 'pa'],
214232
'val': [0.1, 0.055, 0.15],
215233
'se': [0.05, 0.005, 0.05],
216-
'sample_size': [2, 2, 2]
234+
'sample_size': [2, 2, 2],
235+
'missing_val': [0, 0, 0],
236+
'missing_se': [0, 0, 0],
237+
'missing_sample_size': [0, 0, 0],
217238
}),
218239
'part_time_work_prop_7dav': pd.DataFrame(data={
219240
'geo_id': ['al', 'ga', 'pa'],
220241
'val': [0.25, 0.055, 0.25],
221242
'se': [0.1, 0.005, 0.05],
222-
'sample_size': [2, 2, 2]
243+
'sample_size': [2, 2, 2],
244+
'missing_val': [0, 0, 0],
245+
'missing_se': [0, 0, 0],
246+
'missing_sample_size': [0, 0, 0],
223247
}),
224248
'full_time_work_prop_7dav': pd.DataFrame(data={
225249
'geo_id': ['al', 'ga', 'pa'],
226250
'val': [0.35, 0.055, 0.35],
227251
'se': [0.1, 0.005, 0.05],
228-
'sample_size': [2, 2, 2]
252+
'sample_size': [2, 2, 2],
253+
'missing_val': [0, 0, 0],
254+
'missing_se': [0, 0, 0],
255+
'missing_sample_size': [0, 0, 0],
229256
})
230257
}
231258
actual = {signal: pd.read_csv(

safegraph/tests/test_run.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,13 @@ def test_output_files_format(self, run_as_module):
3737
# triggered the error.
3838
print(filename)
3939
df = pd.read_csv(os.path.join("receiving", filename))
40-
assert (df.columns.values ==
41-
["geo_id", "val", "se", "sample_size"]).all()
40+
expected_columns = [
41+
"geo_id",
42+
"val",
43+
"se",
44+
"sample_size",
45+
"missing_val",
46+
"missing_se",
47+
"missing_sample_size"
48+
]
49+
assert (df.columns.values == expected_columns).all()

0 commit comments

Comments
 (0)