-
Notifications
You must be signed in to change notification settings - Fork 149
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
1 changed file
with
85 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,96 @@ | ||
# edit the preprocessing function using the code you used for preprocesing the train data | ||
clean_df <- function(df){ | ||
# Process the input data to feed the model | ||
# This is an example script to generate the outcome variable given the input dataset. | ||
# | ||
# This script should be modified to prepare your own submission that predicts | ||
# the outcome for the benchmark challenge by changing the clean_df and predict_outcomes function. | ||
# | ||
# The predict_outcomes function takes a data frame. The return value must | ||
# be a data frame with two columns: nomem_encr and outcome. The nomem_encr column | ||
# should contain the nomem_encr column from the input data frame. The outcome | ||
# column should contain the predicted outcome for each nomem_encr. The outcome | ||
# should be 0 (no child) or 1 (having a child). | ||
# | ||
# clean_df should be used to clean (preprocess) the data. | ||
# | ||
# run.R can be used to test your submission. | ||
|
||
## Selecting variables | ||
keepcols = c('nomem_encr', 'birthyear_bg', 'gender_bg', 'burgstat_2020','oplmet_2020', 'cf20m454') | ||
|
||
df <- df %>% select(all_of(keepcols)) | ||
|
||
# imputing missing values with mode (for factors) or median (for interval variables) | ||
my_mode <- function(x) { | ||
x <-x[!is.na(x)] | ||
ux <- unique(x) | ||
tab <- tabulate(match(x, ux)) | ||
mode <- ux[tab == max(tab)] | ||
ifelse(length(mode) > 1, sample(mode, 1), mode) | ||
} | ||
|
||
df <- df %>% | ||
mutate(across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), ~replace_na(., my_mode(.))), | ||
across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), as.factor), | ||
across(birthyear_bg, ~replace_na(., median(., na.rm=TRUE)))) | ||
|
||
return(df) | ||
} | ||
# List your packages here. Don't forget to update packages.R! | ||
library(dplyr) # as an example, not used here | ||
|
||
clean_df <- function(df, background = NULL){ | ||
# Preprocess the input dataframe to feed the model. | ||
### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command | ||
|
||
# Parameters: | ||
# df (dataframe): The input dataframe containing the raw data (from PreFer_train_data.csv). | ||
# background (dataframe): Optional input dataframe containing background data (from PreFer_train_background_data.csv). | ||
|
||
# if necessary, edit the function so it returns predicted classes (1/0), not probabilities | ||
predict_outcomes <- function(df, model_path="./model.rds"){ | ||
# preprocess the holdout data | ||
df <- clean_df(df) | ||
ids <- select(df, nomem_encr) | ||
# Returns: | ||
# data frame: The cleaned dataframe with only the necessary columns and processed variables. | ||
|
||
## This script contains a bare minimum working example | ||
# Create new age variable | ||
df$age <- 2024 - df$birthyear_bg | ||
|
||
# Filter cases for whom outcome is not available | ||
df <- df[ !is.na(df$new_child), ] | ||
|
||
# Selecting variables for modelling | ||
keepcols = c('nomem_encr', # ID variable required for predictions, | ||
'age', # newly created variable | ||
'new_child') # outcome variable | ||
|
||
## Keeping data with variables selected | ||
df <- df[ , keepcols ] | ||
|
||
return(df) | ||
} | ||
|
||
predict_outcomes <- function(df, model_path = "./model.rds"){ | ||
# Generate predictions using the saved model and the input dataframe. | ||
|
||
# The predict_outcomes function accepts a dataframe as an argument | ||
# and returns a new dataframe with two columns: nomem_encr and | ||
# prediction. The nomem_encr column in the new dataframe replicates the | ||
# corresponding column from the input dataframe The prediction | ||
# column contains predictions for each corresponding nomem_encr. Each | ||
# prediction is represented as a binary value: '0' indicates that the | ||
# individual did not have a child during 2021-2023, while '1' implies that | ||
# they did. | ||
|
||
# Parameters: | ||
# df (dataframe): The input dataframe for which predictions are to be made. | ||
# model_path (str): The path to the saved model file (which is the output of training.R). | ||
|
||
# Returns: | ||
# dataframe: A dataframe containing the identifiers and their corresponding predictions. | ||
|
||
## This script contains a bare minimum working example | ||
if( !("nomem_encr" %in% colnames(df)) ) { | ||
warning("The identifier variable 'nomem_encr' should be in the dataset") | ||
} | ||
|
||
# Load the model | ||
model <- readRDS(model_path) | ||
|
||
# Preprocess the fake / holdout data | ||
df <- clean_df(df) | ||
|
||
# IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards | ||
# get list of variables *without* the outcome: | ||
vars_without_outcome <- colnames(df)[colnames(df) != "new_child"] | ||
|
||
# !if necessary, make edits to produce predicted classes | ||
# E.g. if you used glm() function to train a model, add 'type="response"' to get probabilities | ||
pred <- predict(model, df, type="response") | ||
#and then transform them into predicted classes | ||
pred <- ifelse(pred>0.5, 1, 0) | ||
# Generate predictions from model, should be 0 (no child) or 1 (had child) | ||
predictions <- predict(model, | ||
subset(df, select = vars_without_outcome), | ||
type = "response") | ||
# Transform probabilities into predicted classes | ||
predictions <- ifelse(predictions > 0.5, 1, 0) | ||
|
||
# adding prediction column to id column | ||
ids$prediction<- pred | ||
# Output file should be data.frame with two columns, nomem_enc and predictions | ||
df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions) | ||
# Force columnnames (overrides names that may be given by `predict`) | ||
names(df_predict) <- c("nomem_encr", "predictions") | ||
|
||
return(ids) | ||
# Return only dataset with predictions and identifier | ||
return( df_predict ) | ||
} | ||
|
||
|
||
# ######## do not edit this ############################ | ||
# df <- read.csv(args[1]) | ||
# predictions <- predict_holdout(df) | ||
# write.csv(predictions,"predictions.csv", row.names = FALSE) |