forked from eyra/fertility-prediction-challenge
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsubmission.R
310 lines (289 loc) · 11.4 KB
/
submission.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# This is an example script to generate the outcome variable given the input dataset.
#
# This script should be modified to prepare your own submission that predicts
# the outcome for the benchmark challenge by changing the clean_df and predict_outcomes function.
#
# The predict_outcomes function takes a data frame. The return value must
# be a data frame with two columns: nomem_encr and outcome. The nomem_encr column
# should contain the nomem_encr column from the input data frame. The outcome
# column should contain the predicted outcome for each nomem_encr. The outcome
# should be 0 (no child) or 1 (having a child).
#
# clean_df should be used to clean (preprocess) the data.
#
# run.R can be used to test your submission.
# List your packages here. Don't forget to update packages.R!
library(dplyr)
library(tidyr)
library(tidymodels)
library(xgboost)
clean_df <- function(df, background_df = NULL) {
# Preprocess the input dataframe to feed the model.
### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command
# Parameters:
# df (dataframe): The input dataframe containing the raw data (e.g., from PreFer_train_data.csv or PreFer_fake_data.csv).
# background (dataframe): Optional input dataframe containing background data (e.g., from PreFer_train_background_data.csv or PreFer_fake_background_data.csv).
# Returns:
# data frame: The cleaned dataframe with only the necessary columns and processed variables.
# Selecting variables for modelling
keepcols <- c(
"nomem_encr", # ID variable required for predictions,
"outcome_available", # Is there an outcome to predict?
# Savings
"ca20g012", "ca20g013", "ca20g078",
# Number of rooms
"cd20m034",
# Partnership status. We thank Sayash Kapoor and Benedikt Strobl's L1
# regression for directing our attention towards cf20m029
"cf20m024", "cf20m025", "cf20m026", "cf20m029", "cf20m030",
# Expected kids
"cf20m128", "cf20m129", "cf20m130",
# Feelings about being single
"cf20m166",
# Existing children
"cf20m454", "cf20m455",
# Relationship with child
"cf20m513",
"cf20m514",
"cf20m515",
"cf20m516",
"cf20m517",
"cf20m518",
"cf20m519",
"cf20m520",
"cf20m521",
# Health
"ch20m004",
# Gynaecologist. We thank Sayash Kapoor and Benedikt Strobl's L1
# regression for directing our attention towards this variable
"ch20m219",
# Gendered religiosity
"cr18k101", "cr18k102", "cr18k103", "cr18k104", "cr18k105",
# Religiosity
"cr20m162",
# Traditional fertility
"cv10c135", "cv10c136", "cv10c137", "cv10c138",
# Traditional motherhood
"cv20l109", "cv20l110", "cv20l111",
# Traditional fatherhood
"cv20l112", "cv20l113", "cv20l114", "cv20l115",
# Traditional marriage
"cv20l124",
"cv20l125",
"cv20l126",
"cv20l127",
"cv20l128",
"cv20l129",
"cv20l130",
# Against working mothers
"cv20l143", "cv20l144", "cv20l145", "cv20l146",
# Sexism
"cv20l151", "cv20l152", "cv20l153", "cv20l154",
# Birth year
"birthyear_bg",
# Primary occupation. We thank Sayash Kapoor and Benedikt Strobl's L1
# regression for directing our attention towards this variable
"belbezig_2020",
# Gender
"gender_bg",
# Origins
"migration_background_bg",
# Houshold Income
"nettohh_f_2020",
# Education
"oplmet_2020",
# Urban
"sted_2020",
# Dwelling type
"woning_2020"
)
## Keeping data with variables selected
df <- df[, keepcols]
## Keep only rows with available outcomes
df <- filter(df, outcome_available == 1) %>%
rowwise() %>%
mutate(
# Impute savings with range midpoints. Two exceptions: We impute -1200
# for those in the smallest category. -1200 is roughly the average
# savings of those who are in that category. Similarly we impute 62500
# for those in the largest category.
# Also, if one does not have accounts, then one does not have any savings
ca20g012 = case_when(ca20g078 == 0 ~ 0,
ca20g013 == 1 ~ -1200,
ca20g013 == 2 ~ 150,
ca20g013 == 3 ~ 375,
ca20g013 == 4 ~ 625,
ca20g013 == 5 ~ 875,
ca20g013 == 6 ~ 1750,
ca20g013 == 7 ~ 3750,
ca20g013 == 8 ~ 6250,
ca20g013 == 9 ~ 8750,
ca20g013 == 10 ~ 10750,
ca20g013 == 11 ~ 12750,
ca20g013 == 12 ~ 15500,
ca20g013 == 13 ~ 18500,
ca20g013 == 14 ~ 22500,
ca20g013 == 15 ~ 62500,
ca20g013 == 999 ~ NA,
ca20g012 < -9999999997 ~ NA,
TRUE ~ ca20g012
),
# If no partner, then one is not living together with partner
cf20m025 = ifelse(cf20m024 == 2, 2, cf20m025),
# If no partner, then one is not married to partner
cf20m030 = ifelse(cf20m024 == 2, 2, cf20m030),
# If no expected kids, then expected number of kids is 0
cf20m129 = ifelse(cf20m128 == 2, 0, cf20m129),
# If no expected kids, then a lower-bound estimate for the number of
# years within which to have kids is 31,
cf20m130 = case_when(cf20m128 == 2 ~ 31,
cf20m130 == 2025 ~ 5,
TRUE ~ cf20m130
),
# Feeling about being single
cf20m166 = ifelse(cf20m166 == 99, NA, cf20m166),
# If one never had children, then one does not have any living children
cf20m455 = ifelse(cf20m454 == 2, 0, cf20m455),
# Scale for feeling towards child
across(c(cf20m515, cf20m516, cf20m518, cf20m519, cf20m520, cf20m521),
~ 8 - .x
),
child_feeling = mean(c(cf20m513,
cf20m514,
cf20m515,
cf20m516,
cf20m517,
cf20m518,
cf20m519,
cf20m520,
cf20m521
),
na.rm = TRUE
),
# Scale on gendered religiosity
across(c(cr18k101, cr18k102, cr18k103, cr18k104, cr18k105),
~ case_when(.x == 1 ~ 3, .x == 2 ~ 1, .x > 2 ~ 2)
),
across(c(cr18k102, cr18k105), ~ 4 - .x),
gendered_religiosity = mean(
c(cr18k101, cr18k102, cr18k103, cr18k104, cr18k105),
na.rm = TRUE
),
# Religiosity
cr20m162 = ifelse(cr20m162 == -9, NA, cr20m162),
# Scale on traditional fertility
traditional_fertility = mean(c(cv10c135, cv10c136, cv10c137, cv10c138),
na.rm = TRUE
),
# Scale on traditional motherhood
cv20l109 = 6 - cv20l109,
traditional_motherhood = mean(c(cv20l109, cv20l110, cv20l111),
na.rm = TRUE
),
# Scale on traditional fatherhood
across(c(cv20l112, cv20l114, cv20l115), ~ 6 - .x),
traditional_fatherhood = mean(c(cv20l112, cv20l113, cv20l114, cv20l115),
na.rm = TRUE
),
# Scale on traditional marriage
across(c(cv20l126, cv20l127, cv20l128, cv20l129, cv20l130), ~ 6 - .x),
traditional_marriage = mean(
c(cv20l124, cv20l125, cv20l126, cv20l127, cv20l128, cv20l129, cv20l130
),
na.rm = TRUE
),
# Scale on being against working mothers
working_mother = mean(c(cv20l143, cv20l144, cv20l145, cv20l146),
na.rm = TRUE
),
# Scale on sexism
sexism = mean(c(cv20l151, cv20l152, cv20l153, cv20l154), na.rm = TRUE),
# Primary occupation
employee = ifelse(belbezig_2020 == 1, 1, 0),
freelance = ifelse(belbezig_2020 == 3, 1, 0),
student = ifelse(belbezig_2020 == 7, 1, 0),
homemaker = ifelse(belbezig_2020 == 8, 1, 0),
# Distinguish first- and second- non-Western migrants from others
migration_background_bg =
case_when(migration_background_bg %in% c(0, 101, 201) ~ 1,
migration_background_bg %in% c(102, 202) ~ 0),
# Combine the lowest levels of education
oplmet_2020 = case_when(oplmet_2020 %in% c(1, 2, 8, 9) ~ 2,
oplmet_2020 == 7 ~ NA,
TRUE ~ oplmet_2020),
# Distinguish between home owners and non-home owners
woning_2020 = case_when(woning_2020 == 1 ~ 1,
woning_2020 == 0 ~ NA,
TRUE ~ 0)
) %>%
select(-outcome_available,
-ca20g078, -ca20g013,
-cf20m128,
-cf20m454,
-cf20m513,
-cf20m514,
-cf20m515,
-cf20m516,
-cf20m517,
-cf20m518,
-cf20m519,
-cf20m520,
-cf20m521,
-cr18k101, -cr18k102, -cr18k103, -cr18k104, -cr18k105,
-cv10c135, -cv10c136, -cv10c137, -cv10c138,
-cv20l109, -cv20l110, -cv20l111,
-cv20l112, -cv20l113, -cv20l114, -cv20l115,
-cv20l124,
-cv20l125,
-cv20l126,
-cv20l127,
-cv20l128,
-cv20l129,
-cv20l130,
-cv20l143, -cv20l144, -cv20l145, -cv20l146,
-cv20l151, -cv20l152, -cv20l153, -cv20l154,
-belbezig_2020,
-migration_background_bg
) %>%
mutate(across(everything(), as.numeric), across(oplmet_2020, factor))
return(df)
}
predict_outcomes <- function(df, background_df = NULL, model_path = "./model.rds"){
# Generate predictions using the saved model and the input dataframe.
# The predict_outcomes function accepts a dataframe as an argument
# and returns a new dataframe with two columns: nomem_encr and
# prediction. The nomem_encr column in the new dataframe replicates the
# corresponding column from the input dataframe The prediction
# column contains predictions for each corresponding nomem_encr. Each
# prediction is represented as a binary value: '0' indicates that the
# individual did not have a child during 2021-2023, while '1' implies that
# they did.
# Parameters:
# df (dataframe): The data dataframe for which predictions are to be made.
# background_df (dataframe): The background data dataframe for which predictions are to be made.
# model_path (str): The path to the saved model file (which is the output of training.R).
# Returns:
# dataframe: A dataframe containing the identifiers and their corresponding predictions.
# Test for presence of nomem_encr
if( !("nomem_encr" %in% colnames(df)) ) {
warning("The identifier variable 'nomem_encr' should be in the dataset")
}
# Load the model
model <- readRDS(model_path)
# Preprocess the fake / holdout data
df <- clean_df(df, background_df)
# Exclude the variable nomem_encr if this variable is NOT in your model
vars_without_id <- colnames(df)[colnames(df) != "nomem_encr"]
# Generate predictions from model
predictions <- predict(model,
subset(df, select = vars_without_id)) %>%
mutate(across(.pred_class, ~ as.numeric(.x) - 1))
# Create predictions that should be 0s and 1s rather than, e.g., probabilities
predictions <- ifelse(predictions > 0.5, 1, 0)
# Output file should be data.frame with two columns, nomem_encr and predictions
df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "prediction" = predictions)
# Force columnnames (overrides names that may be given by `predict`)
names(df_predict) <- c("nomem_encr", "prediction")
# Return only dataset with predictions and identifier
return( df_predict )
}