Update submission.R

AdrienneMendrik · web-flow · commit e39916919b8d · 2024-03-23T07:06:48.000+01:00
Updates Gert and Lisa
diff --git a/submission.R b/submission.R
@@ -1,54 +1,96 @@
-# edit the preprocessing function using the code you used for preprocesing the train data 
-clean_df <- function(df){
-    # Process the input data to feed the model
+# This is an example script to generate the outcome variable given the input dataset.
+# 
+# This script should be modified to prepare your own submission that predicts 
+# the outcome for the benchmark challenge by changing the clean_df and predict_outcomes function.
+# 
+# The predict_outcomes function takes a data frame. The return value must
+# be a data frame with two columns: nomem_encr and outcome. The nomem_encr column
+# should contain the nomem_encr column from the input data frame. The outcome
+# column should contain the predicted outcome for each nomem_encr. The outcome
+# should be 0 (no child) or 1 (having a child).
+# 
+# clean_df should be used to clean (preprocess) the data.
+# 
+# run.R can be used to test your submission.
 
-    ## Selecting variables
-    keepcols = c('nomem_encr', 'birthyear_bg', 'gender_bg', 'burgstat_2020','oplmet_2020', 'cf20m454')      
-    
-    df <- df %>% select(all_of(keepcols))
-
-    # imputing missing values with mode (for factors) or median (for interval variables)
-    my_mode <- function(x) {
-    x <-x[!is.na(x)]
-    ux <- unique(x)
-    tab <- tabulate(match(x, ux))
-    mode <- ux[tab == max(tab)]
-    ifelse(length(mode) > 1, sample(mode, 1), mode)
-    }
-    
-    df <- df %>% 
-      mutate(across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), ~replace_na(., my_mode(.))), 
-             across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), as.factor),
-             across(birthyear_bg, ~replace_na(., median(., na.rm=TRUE)))) 
-   
-    return(df)
-}
+# List your packages here. Don't forget to update packages.R!
+library(dplyr) # as an example, not used here
 
+clean_df <- function(df, background = NULL){
+  # Preprocess the input dataframe to feed the model.
+  ### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command
 
+  # Parameters:
+  # df (dataframe): The input dataframe containing the raw data (from PreFer_train_data.csv).
+  # background (dataframe): Optional input dataframe containing background data (from PreFer_train_background_data.csv).
 
-# if necessary, edit the function so it returns predicted classes (1/0), not probabilities
-predict_outcomes <- function(df, model_path="./model.rds"){
-  # preprocess the holdout data
-  df <- clean_df(df)
-  ids <- select(df, nomem_encr)
+  # Returns:
+  # data frame: The cleaned dataframe with only the necessary columns and processed variables.
+
+  ## This script contains a bare minimum working example
+  # Create new age variable
+  df$age <- 2024 - df$birthyear_bg
+
+  # Filter cases for whom outcome is not available
+  df <- df[ !is.na(df$new_child), ]
+  
+  # Selecting variables for modelling
+  keepcols = c('nomem_encr', # ID variable required for predictions,
+               'age',        # newly created variable
+               'new_child')  # outcome variable 
+  
+  ## Keeping data with variables selected
+  df <- df[ , keepcols ]
+
+  return(df)
+}
+
+predict_outcomes <- function(df, model_path = "./model.rds"){
+  # Generate predictions using the saved model and the input dataframe.
+    
+  # The predict_outcomes function accepts a dataframe as an argument
+  # and returns a new dataframe with two columns: nomem_encr and
+  # prediction. The nomem_encr column in the new dataframe replicates the
+  # corresponding column from the input dataframe The prediction
+  # column contains predictions for each corresponding nomem_encr. Each
+  # prediction is represented as a binary value: '0' indicates that the
+  # individual did not have a child during 2021-2023, while '1' implies that
+  # they did.
   
+  # Parameters:
+  # df (dataframe): The input dataframe for which predictions are to be made.
+  # model_path (str): The path to the saved model file (which is the output of training.R).
+
+  # Returns:
+  # dataframe: A dataframe containing the identifiers and their corresponding predictions.
+  
+  ## This script contains a bare minimum working example
+  if( !("nomem_encr" %in% colnames(df)) ) {
+    warning("The identifier variable 'nomem_encr' should be in the dataset")
+  }
+
   # Load the model
   model <- readRDS(model_path)
+    
+  # Preprocess the fake / holdout data
+  df <- clean_df(df)
+
+  # IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards
+  # get list of variables *without* the outcome:
+  vars_without_outcome <- colnames(df)[colnames(df) != "new_child"]
   
-  # !if necessary, make edits to produce predicted classes
-  # E.g. if you used glm() function to train a model, add 'type="response"' to get probabilities 
-  pred <- predict(model, df, type="response") 
-  #and then transform them into predicted classes
-  pred <- ifelse(pred>0.5, 1, 0)  
+  # Generate predictions from model, should be 0 (no child) or 1 (had child)
+  predictions <- predict(model, 
+                         subset(df, select = vars_without_outcome), 
+                         type = "response") 
+  # Transform probabilities into predicted classes
+  predictions <- ifelse(predictions > 0.5, 1, 0)  
   
-  # adding prediction column to id column
-  ids$prediction<- pred  
+  # Output file should be data.frame with two columns, nomem_enc and predictions
+  df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions)
+  # Force columnnames (overrides names that may be given by `predict`)
+  names(df_predict) <- c("nomem_encr", "predictions") 
   
-  return(ids)
+  # Return only dataset with predictions and identifier
+  return( df_predict )
 }
-
-
-# ######## do not edit this ############################
-# df <- read.csv(args[1])
-# predictions <- predict_holdout(df)
-# write.csv(predictions,"predictions.csv", row.names = FALSE)