# This is an example script to train your model given the (cleaned) input dataset.
#
# This script will not be run on the holdout data,
# but the resulting model (model.rds) will be applied to the holdout data.
#
# It is important to document your training steps here, including the seed,
# the number of folds, the model, et cetera.
library(tidyverse)
# manual implementation of k-fold cross-validation for a ranger random forest
cv_k_fold_rf = function(df, formula, k = 10){
  # identify categorical variables
  cat_col = df %>% dplyr::select_if(is.factor) %>% colnames() %>% as.vector()
  # always stratify by the dependent variable new_child as well
  cat_col = rev(c(cat_col, "new_child"))
  # create balanced folds
  set.seed(123)
  df_sub = groupdata2::fold(data = df, k = k, cat_col = as.vector(cat_col))
  # create matrix to save the performance measures
  metrics_mat = matrix(NA, ncol = 5, nrow = k)
  # perform k-fold cross-validation
  for(i in 1:k){
    # get train and test sets
    df_test = df_sub %>% filter(.folds == i) %>% ungroup() %>% select(-c(.folds))
    df_train = df_sub %>% filter(.folds != i) %>% ungroup() %>% select(-c(.folds))
    df_train$new_child = as.factor(df_train$new_child)
    # fit on the training partition, predict on the test partition
    fit = ranger::ranger(as.formula(formula), data = df_train)
    # fit = glm(as.formula(formula), data = df_train)
    y_pred = predict(fit, data = df_test)
    y_pred_dummy = y_pred$predictions
    # ensure both levels are present without relabelling existing values
    y_pred_dummy = factor(y_pred_dummy, levels = c("0", "1"))
    true_y = factor(df_test$new_child, levels = c("0", "1"))
    # create a confusion matrix
    confusion <- caret::confusionMatrix(data = y_pred_dummy, reference = true_y, positive = "1")
    # extract metrics
    accuracy <- confusion$overall["Accuracy"]
    precision <- confusion$byClass["Pos Pred Value"]
    recall <- confusion$byClass["Sensitivity"]
    f1_score <- 2 * ((precision * recall) / (precision + recall))
    if(is.na(f1_score)){
      f1_score = 0
    }
    balanced_accuracy = confusion$byClass["Balanced Accuracy"]
    # save in matrix
    metrics_mat[i, ] = c(accuracy, precision, recall, f1_score, balanced_accuracy)
  }
  # return the average over the folds (rows)
  vec_to_ret = colMeans(metrics_mat)
  names(vec_to_ret) = c("accuracy", "precision", "recall", "f1_score", "balanced_accuracy")
  return(list(vec_to_ret, metrics_mat))
}
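# ## Example (not run): a minimal sketch of how cv_k_fold_rf can be called on a
# ## toy dataset; the variables below are hypothetical, not part of the challenge data.
# toy_df <- data.frame(
#   new_child = rbinom(200, 1, 0.3),
#   x1 = rnorm(200),
#   x2 = factor(sample(c("a", "b"), 200, replace = TRUE))
# )
# res <- cv_k_fold_rf(df = toy_df, formula = "new_child ~ .", k = 10)
# res[[1]] # averaged accuracy, precision, recall, F1 score and balanced accuracy
# res[[2]] # the full k x 5 matrix of per-fold metrics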
# build all possible model combinations of the screened variables
model_combination <- function(
    id_screening,
    var_mat
){
  # generate all combinations of var_mat and id_screening
  A <- rbind(
    matrix(rep(var_mat, length(id_screening)), nrow = nrow(var_mat)),
    rep(id_screening, each = ncol(var_mat))
  )
  # remove duplicates:
  # remove identical models (same variable set in a different order)
  A <- unique(apply(A, 2, sort), MARGIN = 2)
  # remove models that contain the same variable twice
  id <- apply(A, 2, anyDuplicated) > 0
  if(sum(id) == 0){
    return(A)
  }else{
    return(subset(A, select = !id))
  }
}
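# ## Example (not run): with dimension-1 survivors 1, 3 and 5, the function
# ## returns the three unique pairs {1,3}, {1,5} and {3,5}, one model per column:
# vm <- matrix(c(1, 3, 5), nrow = 1) # varmat of the previous dimension, already screened
# model_combination(id_screening = c(1, 3, 5), var_mat = vm)
# #      [,1] [,2] [,3]
# # [1,]    1    1    3
# # [2,]    3    5    5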
#
# # # for testing
# data <- read_csv("not_commit/PreFer_train_data.csv")
# # # background data
# data_back <- read_csv("not_commit/PreFer_train_background_data.csv")
# cleaned_df = clean_df(df = data, background_df = data_back)
# outcome_df = read_csv(file = "not_commit/PreFer_train_outcome.csv")
#
train_save_model <- function(cleaned_df, outcome_df) {
  # Trains a model using the cleaned dataframe and saves the model to a file.
  # Parameters:
  # cleaned_df (dataframe): The cleaned data from the clean_df function to be used for training the model.
  # outcome_df (dataframe): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
  ## This script contains a bare minimum working example
  # set.seed(1) # not useful here because logistic regression is deterministic
  # combine cleaned_df and outcome_df and keep only observations for which the outcome is observed
  model_df <- merge(cleaned_df, outcome_df, by = "nomem_encr") %>%
    filter(!is.na(new_child))
  # train SWAG with random forests
  X = model_df %>% dplyr::select(-c(nomem_encr, nomem_encr_cube, nomem_encr_square, nomem_encr_sqrt_root, new_child))
  y = model_df %>% pull(new_child)
  # meta-parameters of the swag algorithm
  control <- list(pmax = 35,   # maximum dimension explored
                  alpha = .1,  # normally a small value; the proportion of models kept at each iteration
                  m = 80L,     # max number of models explored per dimension
                  seed = 123L, # for replicability (set.seed(123) is also called directly below)
                  verbose = T  # keeps track of completed dimensions
  )
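  # A rough upper bound on the computational budget implied by these settings
  # (an added observation, not from the original script): after the screening
  # step, at most (pmax - 1) * m = 34 * 80 = 2720 variable combinations are
  # evaluated, each with 10-fold cross-validation, plus one 10-fold CV per
  # column of X during screening.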
  dim_x <- dim(X)
  p <- dim_x[2]
  n <- dim_x[1]
  ## object storage
  CVs <- vector("list", control$pmax)
  CVs_f1_score <- vector("list", control$pmax)
  CVs_accuracy <- vector("list", control$pmax)
  CVs_precision <- vector("list", control$pmax)
  CVs_recall <- vector("list", control$pmax)
  IDs <- vector("list", control$pmax)
  VarMat <- vector("list", control$pmax)
  Sample_size <- vector("list", control$pmax)
  # create vectors to save the results of dimension 1
  cv_alpha <- rep(NA, control$pmax)
  cv_errors <- rep(NA, ncol(X))
  cv_errors_f1_score <- rep(NA, ncol(X))
  cv_errors_accuracy <- rep(NA, ncol(X))
  cv_errors_precision <- rep(NA, ncol(X))
  cv_errors_recall <- rep(NA, ncol(X))
  # set dimension to 1
  d = 1
  # create sample size vector to save in the list
  sample_size = vector(mode = "numeric", length = ncol(X))
  # first step: screen all variables one by one
  for(i in seq(ncol(X))){
    # merge x and y and remove rows with NA at this step
    df_sub = data.frame(new_child = y, X %>% dplyr::select(all_of(i))) %>% na.omit()
    res_cv = cv_k_fold_rf(df = df_sub, formula = "new_child~.", k = 10)[[1]]
    cv_errors[i] = res_cv["balanced_accuracy"]
    cv_errors_f1_score[i] <- res_cv["f1_score"]
    cv_errors_accuracy[i] <- res_cv["accuracy"]
    cv_errors_precision[i] <- res_cv["precision"]
    cv_errors_recall[i] <- res_cv["recall"]
    sample_size[i] = nrow(df_sub)
    if(control$verbose){
      cat(paste0("Variable ", i, "/", ncol(X), ": balanced accuracy of ", round(cv_errors[i], 4), "\n"))
    }
  }
  # create varmat for dimension 1
  var_mat <- seq_len(p)
  dim(var_mat) <- c(1, p)
  CVs[[d]] <- cv_errors
  CVs_f1_score[[d]] = cv_errors_f1_score
  CVs_accuracy[[d]] = cv_errors_accuracy
  CVs_precision[[d]] = cv_errors_precision
  CVs_recall[[d]] = cv_errors_recall
  cv_alpha[d] <- quantile(cv_errors, (1 - control$alpha), na.rm = T)
  IDs[[d]] <- which(cv_errors >= cv_alpha[d])
  id_screening <- IDs[[d]]
  VarMat[[d]] <- var_mat
  Sample_size[[d]] = sample_size
  for(d in 2:control$pmax){
    # build all combinations of dimension d
    var_mat <- model_combination(id_screening, subset(VarMat[[d - 1L]], select = IDs[[d - 1]]))
    # reduce the number of models if exceeding `m`
    if(ncol(var_mat) > control$m){
      set.seed(123)
      var_mat <- var_mat[, sample.int(ncol(var_mat), control$m)]
    }
    # compute CV errors
    cv_errors <- rep(NA, ncol(var_mat))
    cv_errors_f1_score <- rep(NA, ncol(var_mat))
    cv_errors_accuracy <- rep(NA, ncol(var_mat))
    cv_errors_precision <- rep(NA, ncol(var_mat))
    cv_errors_recall <- rep(NA, ncol(var_mat))
    sample_size = vector(mode = "numeric", length = ncol(var_mat))
    for(i in seq_len(ncol(var_mat))){
      # merge x and y and remove rows with NA at this step
      df_sub = suppressMessages(dplyr::bind_cols(y, X[, var_mat[, i]]))
      df_sub_2 = as.data.frame(na.omit(df_sub))
      colnames(df_sub_2)[1] = "new_child"
      res_cv = cv_k_fold_rf(df = df_sub_2, formula = "new_child~.", k = 10)[[1]]
      cv_errors[i] = res_cv["balanced_accuracy"]
      cv_errors_f1_score[i] <- res_cv["f1_score"]
      cv_errors_accuracy[i] <- res_cv["accuracy"]
      cv_errors_precision[i] <- res_cv["precision"]
      cv_errors_recall[i] <- res_cv["recall"]
      # cat(paste0("CV computed for combination of variables ", paste(var_mat[, i], collapse = "-"), "\n"))
      sample_size[i] = nrow(df_sub_2)
    }
    # store results
    CVs[[d]] <- cv_errors
    CVs_f1_score[[d]] = cv_errors_f1_score
    CVs_accuracy[[d]] = cv_errors_accuracy
    CVs_precision[[d]] = cv_errors_precision
    CVs_recall[[d]] = cv_errors_recall
    VarMat[[d]] <- var_mat
    cv_alpha[d] <- quantile(cv_errors, probs = (1 - control$alpha), na.rm = T)
    IDs[[d]] <- which(cv_errors >= cv_alpha[d])
    # screen on balanced accuracy for low dimensions; from dimension 11 onwards,
    # switch the selection criterion to the F1 score
    measure_considered = " balanced accuracy"
    if(d > 10){
      # switch to the F1 score
      measure_considered = " f1 score"
      cv_alpha[d] <- quantile(cv_errors_f1_score, probs = (1 - control$alpha), na.rm = T) # save in the vector of quantiles per dimension
      IDs[[d]] <- which(cv_errors_f1_score >= cv_alpha[d])
    }
    Sample_size[[d]] = sample_size
    if(control$verbose) print(paste0("Dimension explored: ", d, ",", measure_considered, " at quantile ", (1 - control$alpha), ": ", round(cv_alpha[d], 4)))
    if(ncol(var_mat) == 1) break
  }
  # ---------- format output from swag
  out = structure(
    list(
      x = colnames(X),
      y = y,
      control = control,
      CVs_balanced_accuracy = CVs[1:d],
      CVs_f1_score = CVs_f1_score[1:d],
      CVs_accuracy = CVs_accuracy[1:d],
      CVs_precision = CVs_precision[1:d],
      CVs_recall = CVs_recall[1:d],
      VarMat = VarMat[1:d],
      cv_alpha = cv_alpha[1:d],
      IDs = IDs[1:d]
    ),
    class = "swag"
  )
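  ## Example (not run): a quick sanity check on the swag output -- e.g. the best
  ## cross-validated balanced accuracy reached at each explored dimension:
  # sapply(out$CVs_balanced_accuracy, max, na.rm = TRUE)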
  # save for dev
  # save(out, file = "out.rda")
  # ---------- extract the set of best models
  total_dim_explored = length(out$CVs_f1_score)
  dim_up_to = min(total_dim_explored, 35)
  quantile_f1_score_to_consider = quantile(out$CVs_f1_score[[dim_up_to]], probs = .9, na.rm = T)
  find_models_above_threshold <- function(vec, threshold) {
    which(vec > threshold)
  }
  # construct the list of best model ids per dimension
  list_best_model <- lapply(out$CVs_f1_score, find_models_above_threshold, quantile_f1_score_to_consider)
  # obtain the variables associated with each best model
  extract_associated_variables = function(list_varmat, list_best_model){
    max_dimension = length(list_best_model)
    list_varmat_best_model = vector("list", max_dimension)
    for(i in seq(max_dimension)){
      if(length(list_best_model[[i]]) == 0){
        next
      }else{
        list_varmat_best_model[[i]] = list_varmat[[i]][, list_best_model[[i]]]
      }
    }
    return(list_varmat_best_model)
  }
  list_varmat_best_model = extract_associated_variables(list_varmat = out$VarMat, list_best_model = list_best_model)
  # check the number of best models
  total_nbr_of_best_models = sum(unlist(lapply(list_best_model, FUN = function(x){ length(x) })))
  # total_nbr_of_best_models
  # create one big list that contains all fitted ranger models
  create_all_models = function(list_varmat_best_model, X, y, verbose = T){
    list_all_estimated_models = list()
    max_dimension = length(list_varmat_best_model)
    counter_model = 1
    for(i in seq(max_dimension)){
      # skip the dimensions where there are no best models
      if(is.null(list_varmat_best_model[[i]])){
        next
      }
      # dimensions where there is only one "best" model (stored as a vector)
      if(is.vector(list_varmat_best_model[[i]])){
        names_var_mod = colnames(X)[list_varmat_best_model[[i]]]
        df_sub = suppressMessages(dplyr::bind_cols(y, X %>% select(all_of(names_var_mod))))
        colnames(df_sub)[1] = "new_child"
        df_sub_no_na = na.omit(df_sub)
        df_sub_no_na$new_child = as.factor(df_sub_no_na$new_child)
        fit = ranger::ranger(formula = "new_child ~.", data = df_sub_no_na)
        # save the estimated model
        list_all_estimated_models[[counter_model]] = fit
        # verbose
        if(verbose){
          cat(paste0("fitted model ", counter_model, "\n"))
        }
        # update counter
        counter_model = counter_model + 1
      }
      # dimensions with several best models (stored as a matrix, one model per column)
      if(is.matrix(list_varmat_best_model[[i]])){
        var_mat_dim_i = list_varmat_best_model[[i]]
        ncol_var_mat_dim_i = ncol(var_mat_dim_i)
        for(j in seq(ncol_var_mat_dim_i)){
          names_var_mod = colnames(X)[var_mat_dim_i[, j]]
          df_sub = suppressMessages(dplyr::bind_cols(y, X %>% select(all_of(names_var_mod))))
          colnames(df_sub)[1] = "new_child"
          df_sub_no_na = na.omit(df_sub)
          df_sub_no_na$new_child = as.factor(df_sub_no_na$new_child)
          fit = ranger::ranger(formula = "new_child ~.", data = df_sub_no_na)
          # save the estimated model
          list_all_estimated_models[[counter_model]] = fit
          # verbose
          if(verbose){
            cat(paste0("fitted model ", counter_model, "\n"))
          }
          # update counter
          counter_model = counter_model + 1
        }
      }
    }
    return(list_all_estimated_models)
  }
  # estimate all best models
  set_all_best_models = create_all_models(list_varmat_best_model = list_varmat_best_model, verbose = T, X = X, y = y)
  # save the list of estimated models
  model = list(
    "set_all_best_models" = set_all_best_models,       # contains all estimated RFs
    "list_varmat_best_model" = list_varmat_best_model, # contains the indices of the variables of each top model
    "colnames_X" = colnames(X)                         # contains the names of the variables of the training set
  )
  # save the model
  saveRDS(model, "model.rds")
  # inform the user
  cat("Model saved at ", "model.rds", "\n")
  # --------------------- previous approach (Aldo)
  # # Logistic regression model
  # # model <- gam(new_child ~ factor(employment_status) + s(birthyear_bg) + factor(partner_2020) + religious_participation + factor(type_dwelling) + factor(mig_by_origin) + factor(civil_status) + factor(cf20m128), family = "binomial", data = model_df)
  #
  # model_df$employment_status <- as.factor(model_df$employment_status)
  # model_df$partner_2020 <- as.factor(model_df$partner_2020)
  # # model_est <- model_df[, c("new_child", "employment_status", "birthyear_bg", "partner_2020")]
  # # model_est <- model_est[complete.cases(model_est), ]
  # # model <- spikeSlabGAM(new_child ~ (employment_status + birthyear_bg + partner_2020)^2, data = model_est, family = "binomial")
  #
  # model_est <- model_df[, c("new_child", "having_children_future", "years_next_children",
  #                           "desired_nr_add_children", "birthyear_bg", "partner_2020",
  #                           "log_net_personal_income")]
  # model_est <- model_est[complete.cases(model_est), ]
  # model <- spikeSlabGAM(new_child ~ (having_children_future +
  #                         years_next_children + desired_nr_add_children +
  #                         birthyear_bg + partner_2020)^2 + log_net_personal_income,
  #                       data = model_est, family = "binomial")
}
# train_save_model(cleaned_df = cleaned_df, outcome_df = outcome_df)
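# ## Example (not run): a minimal sketch of how the saved ensemble could be applied
# ## to new (cleaned) data. `cleaned_holdout_df` and the majority vote below are
# ## assumptions for illustration; the actual aggregation lives in the submission's
# ## prediction script. ranger cannot predict with missing values, so complete
# ## cases are assumed here.
# model <- readRDS("model.rds")
# new_X <- cleaned_holdout_df[, model$colnames_X] # hypothetical cleaned holdout data
# votes <- sapply(model$set_all_best_models, function(fit) {
#   as.character(predict(fit, data = new_X)$predictions)
# })
# y_hat <- apply(votes, 1, function(v) names(which.max(table(v)))) # majority vote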
#