-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathacsData.py
97 lines (74 loc) · 4.51 KB
/
acsData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Wrapper to load in the ACS data
import json
import numpy as np
import pandas as pd
from folktables import ACSDataSource, ACSEmployment, ACSIncome, ACSPublicCoverage, ACSMobility, ACSTravelTime
from sklearn.model_selection import train_test_split
def get_columns(acs_task):
    """Return the column names stored for ``acs_task``.

    Reads ``acs_columns_names.json`` from the current working directory and
    returns the value found under the ``acs_task`` key.

    Args:
        acs_task: key to look up in the JSON mapping (e.g. ``'employment'``).

    Returns:
        Whatever the JSON file stores under ``acs_task``; ``get_data`` uses it
        as the list of DataFrame column names.

    Raises:
        FileNotFoundError: if the JSON file is not in the working directory.
        KeyError: if ``acs_task`` is not a key of the mapping.
    """
    # JSON text is UTF-8 by specification; being explicit keeps the result
    # independent of the platform's locale encoding.
    with open('acs_columns_names.json', encoding='utf-8') as acs_column_names_file:
        column_name_map = json.load(acs_column_names_file)
    return column_name_map[acs_task]
def convert_to_numpy(acs_task, acs_data):
    """Convert a raw ACS dataframe into arrays for one prediction task.

    Selects the folktables problem object registered for ``acs_task`` and
    returns the result of its ``df_to_numpy`` applied to ``acs_data``
    (``get_data`` unpacks that result as features, label, group).

    Raises:
        KeyError: if ``acs_task`` is not one of the five supported task names.
    """
    task_problems = {
        'employment': ACSEmployment,
        'income': ACSIncome,
        'public_coverage': ACSPublicCoverage,
        'mobility': ACSMobility,
        'travel_time': ACSTravelTime,
    }
    problem = task_problems[acs_task]
    return problem.df_to_numpy(acs_data)
def _build_group_functions(min_age, mid_age):
    """Build the per-row group indicator functions and their matching labels.

    Every function takes a single row ``x`` that can be indexed by column name
    (``x['RAC1P']``, ``x['SEX']``, ``x['AGEP']``) and returns 1 if the row
    belongs to the group, else 0. Entry ``i`` of the returned function list
    always corresponds to entry ``i`` of the returned label list.
    """
    # Race groups keyed by RAC1P code. Functions and labels are derived from
    # this single table so that they cannot get out of step with each other.
    # Codes 3, 4, 5 and 7 are pooled into one group.
    race_groups = [
        ('White', (1,)),
        ('Black or African American', (2,)),
        ('Asian', (6,)),
        ('Native Hawaiian, Native American, Native Alaskan, or Pacific Islander', (3, 4, 5, 7)),
        ('Some Other Race', (8,)),
        ('Two or More Races', (9,)),
    ]
    # `codes=codes` binds the current tuple at definition time (avoids the
    # late-binding closure pitfall).
    group_functions = [lambda x, codes=codes: 1 if x['RAC1P'] in codes else 0
                       for _, codes in race_groups]
    group_indicators = [name for name, _ in race_groups]

    # Sex indicators.
    group_functions += [lambda x: 1 if x['SEX'] == 1 else 0,
                        lambda x: 1 if x['SEX'] == 2 else 0]
    group_indicators += ['Male', 'Female']

    # Age indicators: [0, min_age), [min_age, mid_age), [mid_age, inf).
    group_functions += [lambda x, lo=min_age: 1 if x['AGEP'] < lo else 0,
                        lambda x, lo=min_age, hi=mid_age: 1 if lo <= x['AGEP'] < hi else 0,
                        lambda x, hi=mid_age: 1 if x['AGEP'] >= hi else 0]
    group_indicators += ['Young', 'Middle', 'Old']
    return group_functions, group_indicators


def get_data(test_size, acs_task, acs_year, acs_states, acs_horizon='1-Year', acs_survey='person', row_start=0,
             row_end=-1, col_start=0, col_end=-1):
    """Load ACS data via folktables, split it, and build group indicator functions.

    Args:
        test_size: fraction of the data used for the test set (e.g. 0.2 or 0.5).
        acs_task: one of 'employment', 'income', 'public_coverage', 'mobility',
            'travel_time'. Only 'employment' has been tested.
        acs_year: survey year (2014-2018).
        acs_states: list of state abbreviations, e.g. ['NY'].
        acs_horizon: '1-Year' or '5-Year'.
        acs_survey: 'person' or 'household'.
        row_start, row_end: slice bounds applied to the rows of the features,
            labels and groups.
        col_start, col_end: slice bounds applied to the feature columns.

    Returns:
        A list ``[X_train, y_train, X_test, y_test, group_functions,
        group_indicators, min_age, mid_age]``. ``X_train`` / ``X_test`` are
        DataFrames whose last column is the race group; ``group_functions[i]``
        is the row-level 0/1 indicator for the group named
        ``group_indicators[i]``.
    """
    # Folktables strips the column names from the data; they are restored from
    # the JSON file so the DataFrames below are easier to inspect.
    columns = get_columns(acs_task)
    data_source = ACSDataSource(survey_year=acs_year, horizon=acs_horizon, survey=acs_survey)
    # This pulls in the raw data (downloading it if necessary).
    acs_data = data_source.get_data(states=acs_states, download=True)

    # Extract the feature matrix, the label vector (the prediction target of
    # the task) and the 'group' vector. The group is always race, coded 1-9 as
    # listed in Appendix B of https://arxiv.org/pdf/2108.04884.pdf
    features, label, group = convert_to_numpy(acs_task, acs_data)

    # Keep only the requested rows/columns.
    # NOTE: these are ordinary Python slices, so the default end of -1 excludes
    # the LAST row and the LAST feature column.
    # NOTE(review): presumably the dropped last feature column is RAC1P, which
    # is re-appended from `group` below so the width matches `columns` --
    # confirm against acs_columns_names.json before changing these defaults.
    features = features[row_start:row_end, col_start:col_end]
    label = label[row_start:row_end]
    group = group[row_start:row_end]

    # Train/test split; fixed random_state keeps the split reproducible.
    X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
        features, label, group, test_size=test_size, random_state=0)

    # For our purposes the race group is included as the last feature column.
    X_train = np.hstack((X_train, group_train[:, np.newaxis]))
    X_test = np.hstack((X_test, group_test[:, np.newaxis]))

    # DataFrames for ease of inspection.
    X_train = pd.DataFrame(X_train, columns=columns)
    X_test = pd.DataFrame(X_test, columns=columns)

    # Age thresholds separating the 'Young' / 'Middle' / 'Old' groups.
    min_age = 30
    mid_age = 50
    group_functions, group_indicators = _build_group_functions(min_age, mid_age)
    return [X_train, y_train, X_test, y_test, group_functions, group_indicators, min_age, mid_age]