|
1 |
| -# edit the preprocessing function using the code you used for preprocesing the train data |
2 |
| -clean_df <- function(df){ |
3 |
| - # Process the input data to feed the model |
| 1 | +# This is an example script to generate the outcome variable given the input dataset. |
| 2 | +# |
| 3 | +# This script should be modified to prepare your own submission that predicts |
| 4 | +# the outcome for the benchmark challenge by changing the clean_df and predict_outcomes functions. |
| 5 | +# |
| 6 | +# The predict_outcomes function takes a data frame. The return value must |
| 7 | +# be a data frame with two columns: nomem_encr and predictions. The nomem_encr column |
| 8 | +# should contain the nomem_encr column from the input data frame. The predictions |
| 9 | +# column should contain the predicted outcome for each nomem_encr. Each prediction |
| 10 | +# should be 0 (no child) or 1 (having a child). |
| 11 | +# |
| 12 | +# clean_df should be used to clean (preprocess) the data. |
| 13 | +# |
| 14 | +# run.R can be used to test your submission. |
4 | 15 |
|
5 |
| - ## Selecting variables |
6 |
| - keepcols = c('nomem_encr', 'birthyear_bg', 'gender_bg', 'burgstat_2020','oplmet_2020', 'cf20m454') |
7 |
| - |
8 |
| - df <- df %>% select(all_of(keepcols)) |
9 |
| - |
10 |
| - # imputing missing values with mode (for factors) or median (for interval variables) |
11 |
| - my_mode <- function(x) { |
12 |
| - x <-x[!is.na(x)] |
13 |
| - ux <- unique(x) |
14 |
| - tab <- tabulate(match(x, ux)) |
15 |
| - mode <- ux[tab == max(tab)] |
16 |
| - ifelse(length(mode) > 1, sample(mode, 1), mode) |
17 |
| - } |
18 |
| - |
19 |
| - df <- df %>% |
20 |
| - mutate(across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), ~replace_na(., my_mode(.))), |
21 |
| - across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), as.factor), |
22 |
| - across(birthyear_bg, ~replace_na(., median(., na.rm=TRUE)))) |
23 |
| - |
24 |
| - return(df) |
25 |
| -} |
| 16 | +# List your packages here. Don't forget to update packages.R! |
| 17 | +library(dplyr) # as an example, not used here |
26 | 18 |
|
| 19 | +clean_df <- function(df, background = NULL){ |
| 20 | + # Preprocess the input dataframe to feed the model. |
| 21 | + ### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command |
27 | 22 |
|
| 23 | + # Parameters: |
| 24 | + # df (dataframe): The input dataframe containing the raw data (from PreFer_train_data.csv). |
| 25 | + # background (dataframe): Optional input dataframe containing background data (from PreFer_train_background_data.csv). |
28 | 26 |
|
29 |
| -# if necessary, edit the function so it returns predicted classes (1/0), not probabilities |
30 |
| -predict_outcomes <- function(df, model_path="./model.rds"){ |
31 |
| - # preprocess the holdout data |
32 |
| - df <- clean_df(df) |
33 |
| - ids <- select(df, nomem_encr) |
| 27 | + # Returns: |
| 28 | + # data frame: The cleaned dataframe with only the necessary columns and processed variables. |
| 29 | + |
| 30 | + ## This script contains a bare minimum working example |
| 31 | + # Create new age variable |
| 32 | + df$age <- 2024 - df$birthyear_bg |
| 33 | + |
| 34 | + # Filter cases for whom outcome is not available |
| 35 | + df <- df[ !is.na(df$new_child), ] |
| 36 | + |
| 37 | + # Selecting variables for modelling |
| 38 | + keepcols = c('nomem_encr', # ID variable required for predictions, |
| 39 | + 'age', # newly created variable |
| 40 | + 'new_child') # outcome variable |
| 41 | + |
| 42 | + ## Keeping data with variables selected |
| 43 | + df <- df[ , keepcols ] |
| 44 | + |
| 45 | + return(df) |
| 46 | +} |
| 47 | + |
| 48 | +predict_outcomes <- function(df, model_path = "./model.rds"){ |
| 49 | + # Generate predictions using the saved model and the input dataframe. |
| 50 | + |
| 51 | + # The predict_outcomes function accepts a dataframe as an argument |
| 52 | + # and returns a new dataframe with two columns: nomem_encr and |
| 53 | + # predictions. The nomem_encr column in the new dataframe replicates the |
| 54 | + # corresponding column from the input dataframe. The predictions |
| 55 | + # column contains predictions for each corresponding nomem_encr. Each |
| 56 | + # prediction is represented as a binary value: '0' indicates that the |
| 57 | + # individual did not have a child during 2021-2023, while '1' implies that |
| 58 | + # they did. |
34 | 59 |
|
| 60 | + # Parameters: |
| 61 | + # df (dataframe): The input dataframe for which predictions are to be made. |
| 62 | + # model_path (str): The path to the saved model file (which is the output of training.R). |
| 63 | + |
| 64 | + # Returns: |
| 65 | + # dataframe: A dataframe containing the identifiers and their corresponding predictions. |
| 66 | + |
| 67 | + ## This script contains a bare minimum working example |
| 68 | + if( !("nomem_encr" %in% colnames(df)) ) { |
| 69 | + warning("The identifier variable 'nomem_encr' should be in the dataset") |
| 70 | + } |
| 71 | + |
35 | 72 | # Load the model
|
36 | 73 | model <- readRDS(model_path)
|
| 74 | + |
| 75 | + # Preprocess the fake / holdout data |
| 76 | + df <- clean_df(df) |
| 77 | + |
| 78 | + # IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards |
| 79 | + # get list of variables *without* the outcome: |
| 80 | + vars_without_outcome <- colnames(df)[colnames(df) != "new_child"] |
37 | 81 |
|
38 |
| - # !if necessary, make edits to produce predicted classes |
39 |
| - # E.g. if you used glm() function to train a model, add 'type="response"' to get probabilities |
40 |
| - pred <- predict(model, df, type="response") |
41 |
| - #and then transform them into predicted classes |
42 |
| - pred <- ifelse(pred>0.5, 1, 0) |
| 82 | + # Generate predictions from model, should be 0 (no child) or 1 (had child) |
| 83 | + predictions <- predict(model, |
| 84 | + subset(df, select = vars_without_outcome), |
| 85 | + type = "response") |
| 86 | + # Transform probabilities into predicted classes |
| 87 | + predictions <- ifelse(predictions > 0.5, 1, 0) |
43 | 88 |
|
44 |
| - # adding prediction column to id column |
45 |
| - ids$prediction<- pred |
| 89 | + # Output should be a data.frame with two columns, nomem_encr and predictions |
| 90 | + df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions) |
| 91 | + # Force column names (overrides names that may be given by `predict`) |
| 92 | + names(df_predict) <- c("nomem_encr", "predictions") |
46 | 93 |
|
47 |
| - return(ids) |
| 94 | + # Return only dataset with predictions and identifier |
| 95 | + return( df_predict ) |
48 | 96 | }
|
49 |
| - |
50 |
| - |
51 |
| -# ######## do not edit this ############################ |
52 |
| -# df <- read.csv(args[1]) |
53 |
| -# predictions <- predict_holdout(df) |
54 |
| -# write.csv(predictions,"predictions.csv", row.names = FALSE) |
|
0 commit comments