Skip to content

Commit e399169

Browse files
Update submission.R
Updates Gert and Lisa
1 parent 75be10b commit e399169

File tree

1 file changed

+85
-43
lines changed

1 file changed

+85
-43
lines changed

submission.R

Lines changed: 85 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,96 @@
1-
# edit the preprocessing function using the code you used for preprocessing the train data
2-
clean_df <- function(df){
3-
# Process the input data to feed the model
1+
# This is an example script to generate the outcome variable given the input dataset.
2+
#
3+
# This script should be modified to prepare your own submission that predicts
4+
# the outcome for the benchmark challenge by changing the clean_df and predict_outcomes function.
5+
#
6+
# The predict_outcomes function takes a data frame. The return value must
7+
# be a data frame with two columns: nomem_encr and outcome. The nomem_encr column
8+
# should contain the nomem_encr column from the input data frame. The outcome
9+
# column should contain the predicted outcome for each nomem_encr. The outcome
10+
# should be 0 (no child) or 1 (having a child).
11+
#
12+
# clean_df should be used to clean (preprocess) the data.
13+
#
14+
# run.R can be used to test your submission.
415

5-
## Selecting variables
6-
keepcols = c('nomem_encr', 'birthyear_bg', 'gender_bg', 'burgstat_2020','oplmet_2020', 'cf20m454')
7-
8-
df <- df %>% select(all_of(keepcols))
9-
10-
# imputing missing values with mode (for factors) or median (for interval variables)
11-
my_mode <- function(x) {
12-
x <-x[!is.na(x)]
13-
ux <- unique(x)
14-
tab <- tabulate(match(x, ux))
15-
mode <- ux[tab == max(tab)]
16-
ifelse(length(mode) > 1, sample(mode, 1), mode)
17-
}
18-
19-
df <- df %>%
20-
mutate(across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), ~replace_na(., my_mode(.))),
21-
across(c(gender_bg, burgstat_2020, oplmet_2020, cf20m454), as.factor),
22-
across(birthyear_bg, ~replace_na(., median(., na.rm=TRUE))))
23-
24-
return(df)
25-
}
16+
# List your packages here. Don't forget to update packages.R!
17+
library(dplyr) # as an example, not used here
2618

19+
clean_df <- function(df, background = NULL){
20+
# Preprocess the input dataframe to feed the model.
21+
### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return(df)" command
2722

23+
# Parameters:
24+
# df (dataframe): The input dataframe containing the raw data (from PreFer_train_data.csv).
25+
# background (dataframe): Optional input dataframe containing background data (from PreFer_train_background_data.csv).
2826

29-
# if necessary, edit the function so it returns predicted classes (1/0), not probabilities
30-
predict_outcomes <- function(df, model_path="./model.rds"){
31-
# preprocess the holdout data
32-
df <- clean_df(df)
33-
ids <- select(df, nomem_encr)
27+
# Returns:
28+
# data frame: The cleaned dataframe with only the necessary columns and processed variables.
29+
30+
## This script contains a bare minimum working example
31+
# Create new age variable
32+
df$age <- 2024 - df$birthyear_bg
33+
34+
# Filter cases for whom outcome is not available
35+
df <- df[ !is.na(df$new_child), ]
36+
37+
# Selecting variables for modelling
38+
keepcols = c('nomem_encr', # ID variable required for predictions,
39+
'age', # newly created variable
40+
'new_child') # outcome variable
41+
42+
## Keeping data with variables selected
43+
df <- df[ , keepcols ]
44+
45+
return(df)
46+
}
47+
48+
predict_outcomes <- function(df, model_path = "./model.rds"){
49+
# Generate predictions using the saved model and the input dataframe.
50+
51+
# The predict_outcomes function accepts a dataframe as an argument
52+
# and returns a new dataframe with two columns: nomem_encr and
53+
# prediction. The nomem_encr column in the new dataframe replicates the
54+
# corresponding column from the input dataframe. The prediction
55+
# column contains predictions for each corresponding nomem_encr. Each
56+
# prediction is represented as a binary value: '0' indicates that the
57+
# individual did not have a child during 2021-2023, while '1' implies that
58+
# they did.
3459

60+
# Parameters:
61+
# df (dataframe): The input dataframe for which predictions are to be made.
62+
# model_path (str): The path to the saved model file (which is the output of training.R).
63+
64+
# Returns:
65+
# dataframe: A dataframe containing the identifiers and their corresponding predictions.
66+
67+
## This script contains a bare minimum working example
68+
if( !("nomem_encr" %in% colnames(df)) ) {
69+
warning("The identifier variable 'nomem_encr' should be in the dataset")
70+
}
71+
3572
# Load the model
3673
model <- readRDS(model_path)
74+
75+
# Preprocess the fake / holdout data
76+
df <- clean_df(df)
77+
78+
# IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards
79+
# get list of variables *without* the outcome:
80+
vars_without_outcome <- colnames(df)[colnames(df) != "new_child"]
3781

38-
# !if necessary, make edits to produce predicted classes
39-
# E.g. if you used glm() function to train a model, add 'type="response"' to get probabilities
40-
pred <- predict(model, df, type="response")
41-
#and then transform them into predicted classes
42-
pred <- ifelse(pred>0.5, 1, 0)
82+
# Generate predictions from model, should be 0 (no child) or 1 (had child)
83+
predictions <- predict(model,
84+
subset(df, select = vars_without_outcome),
85+
type = "response")
86+
# Transform probabilities into predicted classes
87+
predictions <- ifelse(predictions > 0.5, 1, 0)
4388

44-
# adding prediction column to id column
45-
ids$prediction<- pred
89+
# Output should be a data.frame with two columns, nomem_encr and predictions
90+
df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions)
91+
# Force column names (overrides names that may be given by `predict`)
92+
names(df_predict) <- c("nomem_encr", "predictions")
4693

47-
return(ids)
94+
# Return only dataset with predictions and identifier
95+
return( df_predict )
4896
}
49-
50-
51-
# ######## do not edit this ############################
52-
# df <- read.csv(args[1])
53-
# predictions <- predict_holdout(df)
54-
# write.csv(predictions,"predictions.csv", row.names = FALSE)

0 commit comments

Comments
 (0)