
Commit dad4239

Applied feedback from Gert, Lisa and Adrienne
1 parent adbd497 commit dad4239

11 files changed: 186 additions, 117 deletions

.github/workflows/checks.yaml

Lines changed: 10 additions & 2 deletions

@@ -39,7 +39,15 @@ jobs:
           load: true

       - name: Run prediction
-        run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest predict /data/fake_data.csv --out=/data/predictions.csv
+        run: docker run --rm -v "$(pwd)/.:/data" eyra-rank:latest /data/PreFer_fake_data.csv /data/PreFer_fake_background_data.csv --out=/data/predictions.csv
+
+      - name: Build Docker scoring image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: python.Dockerfile
+          tags: eyra-rank:scoring
+          load: true

       - name: Run scoring
-        run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest score /data/predictions.csv /data/fake_data_ground_truth.csv
+        run: docker run --rm -v "$(pwd):/data" --entrypoint conda eyra-rank:scoring run -n eyra-rank python /app/score.py /data/predictions.csv /data/PreFer_fake_data.csv
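
The workflow now builds one image for prediction and a second one for scoring, then runs them against the fake PreFer data. For local testing, the two `docker run` steps can be reproduced roughly as follows — a sketch in Python using subprocess, assuming Docker is installed, both images were already built with the tags used above, and the fake CSV files sit in the current directory:

# Sketch: reproduce the two CI run steps locally (assumes Docker plus the
# eyra-rank:latest and eyra-rank:scoring images built as in the steps above).
import pathlib
import subprocess

data_mount = f"{pathlib.Path.cwd()}:/data"

# Mirrors "Run prediction": the image's entrypoint is /app/run.py.
subprocess.run(
    [
        "docker", "run", "--rm", "-v", data_mount, "eyra-rank:latest",
        "/data/PreFer_fake_data.csv", "/data/PreFer_fake_background_data.csv",
        "--out=/data/predictions.csv",
    ],
    check=True,
)

# Mirrors "Run scoring": override the entrypoint to call score.py instead.
subprocess.run(
    [
        "docker", "run", "--rm", "-v", data_mount, "--entrypoint", "conda",
        "eyra-rank:scoring", "run", "-n", "eyra-rank", "python", "/app/score.py",
        "/data/predictions.csv", "/data/PreFer_fake_data.csv",
    ],
    check=True,
)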

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -3,3 +3,5 @@
 .DS_Store
 .AppleDouble
 .LSOverride
+__pycache__/
+.tool-versions

environment.yml

Lines changed: 5 additions & 4 deletions

@@ -1,8 +1,9 @@
 name: eyra-rank
 channels:
   - defaults
+  - conda-forge
 dependencies:
-  - pandas=1.5
-  - scikit-learn=1.2
-  - joblib=1.1
-  - matplotlib=3.7
+  - pandas=2.2.1
+  - scikit-learn=1.4.1.post1
+  - joblib=1.3.2
+  - matplotlib=3.8.3
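
All four dependencies are now pinned to exact versions. As a quick, optional check that a local environment resolved to the same pins, something like the following standard-library sketch could be used (the version strings are simply the ones listed above):

# Sketch: compare installed package versions against the pins in environment.yml.
from importlib.metadata import version

pins = {
    "pandas": "2.2.1",
    "scikit-learn": "1.4.1.post1",
    "joblib": "1.3.2",
    "matplotlib": "3.8.3",
}

for package, pinned in pins.items():
    installed = version(package)
    marker = "OK" if installed == pinned else f"mismatch (expected {pinned})"
    print(f"{package}: {installed} -> {marker}")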

model.joblib

-2.36 KB
Binary file not shown.

model.rds

-68.7 KB
Binary file not shown.

python.Dockerfile

Lines changed: 5 additions & 4 deletions

@@ -4,10 +4,11 @@ COPY environment.yml /
 RUN conda env create -f /environment.yml

 RUN mkdir /app
+WORKDIR /app

-COPY data /data
-COPY *.py /
-COPY *.joblib /
+COPY *.csv /app
+COPY *.py /app
+COPY *.joblib /app

-ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/run.py"]
+ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/app/run.py"]
 CMD ["predict", "/data/fake_data.csv"]

run.R

Lines changed: 16 additions & 51 deletions

@@ -16,32 +16,19 @@ source("submission.R")

 print_usage <- function() {
   cat("Usage:\n")
-  cat(" Rscript script.R predict INPUT_FILE [--output OUTPUT_FILE]\n")
-  cat(" Rscript script.R score --prediction PREDICTION_FILE --ground_truth GROUND_TRUTH_FILE [--output OUTPUT_FILE]\n")
+  cat(" Rscript script.R DATA_FILE BACKGROUND_DATA_FILE [--output OUTPUT_FILE]\n")
 }

 parse_arguments <- function() {
   args <- list()
   command_args <- commandArgs(trailingOnly = TRUE)
-  if (length(command_args) > 0) {
-    args$command <- command_args[1]
+  if (length(command_args) < 2) {
+    return(args)
+  }

-    if (is.null(args$command)) {
-      stop("Error: No command provided.")
-    }
-
-    if (args$command == "predict") {
-      args$input <- commandArgs(trailingOnly = TRUE)[2]
-      args$output <- get_argument("--output")
-    } else if (args$command == "score") {
-      args$prediction <- get_argument("--prediction")
-      args$ground_truth <- get_argument("--ground_truth")
-      args$output <- get_argument("--output")
-    }
-  } else {
-    stop("Error: No command provided. Run the script with predict or score.")
-  }
-
+  args$data <- commandArgs(trailingOnly = TRUE)[1]
+  args$background_data <- commandArgs(trailingOnly = TRUE)[2]
+  args$output <- get_argument("--output")
   return(args)
 }

@@ -56,41 +43,25 @@ get_argument <- function(arg_name) {
 }

 parse_and_run_predict <- function(args) {
-  if (is.null(args$input)) {
-    stop("Error: Please provide --input argument for prediction.")
+  if (is.null(args$data)||is.null(args$background_data)) {
+    stop("Error: Please provide data and background_data argument for prediction.")
   }

-  cat("Processing input data for prediction from:", args$input, "\n")
+  cat("Processing input data for prediction from:", args$data, " ", args$background_data, "\n")
   if (!is.null(args$output)) {
     cat("Output will be saved to:", args$output, "\n")
   }
-  run_predict(args$input, args$output)
-}
-
-run_score <- function(args) {
-  if (is.null(args$prediction) || is.null(args$ground_truth)) {
-    stop("Error: Please provide --prediction and --ground_truth arguments for scoring.")
-  }
-
-  cat("Scoring predictions from:", args$prediction, "\n")
-  cat("Ground truth data from:", args$ground_truth, "\n")
-  if (!is.null(args$output)) {
-    cat("Evaluation score will be saved to:", args$output, "\n")
-  }
-  # Call your submission function for scoring here
+  run_predict(args$data, args$background_data, args$output)
 }

-run_predict <- function(input_path, output=NULL) {
+run_predict <- function(data_path, background_data_path, output=NULL) {
   if (is.null(output)) {
     output <- stdout()
   }
+  df <- read.csv(data_path, encoding="latin1")
+  background_df <- read.csv(background_data_path, encoding="latin1")

-
-  # Read data from input file
-  df <- read.csv(input_path, encoding="latin1")
-
-  # Make predictions
-  predictions <- predict_outcomes(df) # Assuming predict_outcomes is a function in the submission package
+  predictions <- predict_outcomes(df, background_df)

   # Check if predictions have the required format
   stopifnot(ncol(predictions) == 2,
@@ -105,13 +76,7 @@ run_predict <- function(input_path, output=NULL) {
 main <- function() {
   args <- parse_arguments()

-  if (args$command == "predict") {
-    parse_and_run_predict(args)
-  } else if (args$command == "score") {
-    run_score(args)
-  } else {
-    stop("Error: Invalid command. Use 'predict' or 'score'.")
-  }
+  parse_and_run_predict(args)
 }

 # Call main function

run.py

Lines changed: 18 additions & 28 deletions

@@ -20,31 +20,21 @@
 import pandas as pd
 import submission

-parser = argparse.ArgumentParser(description="Process and score data.")
-subparsers = parser.add_subparsers(dest="command")
+parser = argparse.ArgumentParser(description="Process data.")

-# Process subcommand
-process_parser = subparsers.add_parser(
-    "predict", help="Process input data for prediction."
+parser.add_argument("data_path", help="Path to data data CSV file.")
+parser.add_argument(
+    "background_data_path", help="Path to background data data CSV file."
 )
-process_parser.add_argument("input_path", help="Path to input data CSV file.")
-process_parser.add_argument("--output", help="Path to prediction output CSV file.")
-
-# Score subcommand
-score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.")
-score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
-score_parser.add_argument(
-    "ground_truth_path", help="Path to ground truth outcome CSV file."
-)
-score_parser.add_argument("--output", help="Path to evaluation score output CSV file.")
+parser.add_argument("--output", help="Path to prediction output CSV file.")

 args = parser.parse_args()


-def predict(input_path, output):
+def predict(data_path, background_data_path, output):
     """Predict Score (evaluate) the predictions and write the metrics.

-    This function takes the path to an input CSV file containing the input data.
+    This function takes the path to an data CSV file containing the data data.
     It calls submission.py clean_df and predict_outcomes writes the predictions
     to a new output CSV file.

@@ -53,10 +43,17 @@ def predict(input_path, output):

     if output is None:
         output = sys.stdout
-    df = pd.read_csv(
-        input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
+    data_df = pd.read_csv(
+        data_path, encoding="latin-1", encoding_errors="replace", low_memory=False
+    )
+    background_data_df = pd.read_csv(
+        background_data_path,
+        encoding="latin-1",
+        encoding_errors="replace",
+        low_memory=False,
     )
-    predictions = submission.predict_outcomes(df)
+
+    predictions = submission.predict_outcomes(data_df, background_data_df)
     assert (
         predictions.shape[1] == 2
     ), "Predictions must have two columns: nomem_encr and prediction"
@@ -131,11 +128,4 @@ def score(prediction_path, ground_truth_path, output):

 if __name__ == "__main__":
     args = parser.parse_args()
-    if args.command == "predict":
-        predict(args.input_path, args.output)
-    elif args.command == "score":
-        score(args.prediction_path, args.ground_truth_path, args.output)
-    else:
-        parser.print_help()
-        predict(args.input_path, args.output)
-        sys.exit(1)
+    predict(args.data_path, args.background_data_path, args.output)
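
With these changes, run.py always calls submission.predict_outcomes with two dataframes (the data file and the background-data file) and asserts that the result has exactly the columns nomem_encr and prediction. For illustration only — this is not the submission.py shipped in the repo — a minimal function compatible with that contract might look like:

# Hypothetical placeholder for submission.predict_outcomes under the new
# two-dataframe interface; a real submission would clean both frames and
# apply a trained model instead of predicting a constant.
import pandas as pd


def predict_outcomes(df: pd.DataFrame, background_df: pd.DataFrame) -> pd.DataFrame:
    # Assumes the data file carries the nomem_encr identifier column,
    # as the checks in run.py and submission.R suggest.
    return pd.DataFrame({"nomem_encr": df["nomem_encr"], "prediction": 0})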

score.py

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
+"""
+This script calls submission.py. Add your method to submission.py to run your
+prediction method.
+
+To test your submission use the following command:
+
+python run.py predict
+
+For example:
+
+python run.py predict data/PreFer_fake_data.csv
+
+Optionally, you can use the score function to calculate evaluation scores given
+your predictions and the ground truth within the training dataset.
+
+"""
+
+import sys
+import argparse
+import pandas as pd
+import submission
+
+parser = argparse.ArgumentParser(description="Score data.")
+# Score subcommand
+parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
+# Score subcommand
+parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.")
+# Score subcommand
+parser.add_argument("--output", help="Path to evaluation score output CSV file.")
+
+args = parser.parse_args()
+
+
+def score(prediction_path, ground_truth_path, output):
+    """Score (evaluate) the predictions and write the metrics.
+
+    This function takes the path to a CSV file containing predicted outcomes and the
+    path to a CSV file containing the ground truth outcomes. It calculates the overall
+    prediction accuracy, and precision, recall, and F1 score for having a child
+    and writes these scores to a new output CSV file.
+
+    This function should not be modified.
+    """
+
+    if output is None:
+        output = sys.stdout
+    # Load predictions and ground truth into dataframes
+    predictions_df = pd.read_csv(prediction_path)
+    ground_truth_df = pd.read_csv(ground_truth_path)
+
+    # Merge predictions and ground truth on the 'id' column
+    merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")
+
+    # Calculate accuracy
+    accuracy = len(merged_df[merged_df["prediction"] == merged_df["new_child"]]) / len(
+        merged_df
+    )
+
+    # Calculate true positives, false positives, and false negatives
+    true_positives = len(
+        merged_df[(merged_df["prediction"] == 1) & (merged_df["new_child"] == 1)]
+    )
+    false_positives = len(
+        merged_df[(merged_df["prediction"] == 1) & (merged_df["new_child"] == 0)]
+    )
+    false_negatives = len(
+        merged_df[(merged_df["prediction"] == 0) & (merged_df["new_child"] == 1)]
+    )
+
+    # Calculate precision, recall, and F1 score
+    try:
+        precision = true_positives / (true_positives + false_positives)
+    except ZeroDivisionError:
+        precision = 0
+    try:
+        recall = true_positives / (true_positives + false_negatives)
+    except ZeroDivisionError:
+        recall = 0
+    try:
+        f1_score = 2 * (precision * recall) / (precision + recall)
+    except ZeroDivisionError:
+        f1_score = 0
+    # Write metric output to a new CSV file
+    metrics_df = pd.DataFrame(
+        {
+            "accuracy": [accuracy],
+            "precision": [precision],
+            "recall": [recall],
+            "f1_score": [f1_score],
+        }
+    )
+    metrics_df.to_csv(output, index=False)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    score(args.prediction_path, args.ground_truth_path, args.output)
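
score.py computes accuracy, precision, recall, and F1 by hand from the merged dataframe. As an optional sanity check, the same numbers can be reproduced with scikit-learn (already pinned in environment.yml); the toy frames below are illustrative and not part of the challenge data:

# Sketch: cross-check score.py's hand-rolled metrics with scikit-learn.
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

predictions_df = pd.DataFrame({"nomem_encr": [1, 2, 3, 4], "prediction": [1, 0, 1, 0]})
ground_truth_df = pd.DataFrame({"nomem_encr": [1, 2, 3, 4], "new_child": [1, 0, 0, 0]})
merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")

y_true, y_pred = merged_df["new_child"], merged_df["prediction"]
print("accuracy:", accuracy_score(y_true, y_pred))                      # 0.75
print("precision:", precision_score(y_true, y_pred, zero_division=0))   # 0.5
print("recall:", recall_score(y_true, y_pred, zero_division=0))         # 1.0
print("f1:", f1_score(y_true, y_pred, zero_division=0))                 # ~0.667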

submission.R

Lines changed: 7 additions & 6 deletions

@@ -16,7 +16,7 @@
 # List your packages here. Don't forget to update packages.R!
 library(dplyr) # as an example, not used here

-clean_df <- function(df, background = NULL){
+clean_df <- function(df, background_df){
   # Preprocess the input dataframe to feed the model.
   ### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command

@@ -45,7 +45,7 @@ clean_df <- function(df, background = NULL){
   return(df)
 }

-predict_outcomes <- function(df, model_path = "./model.rds"){
+predict_outcomes <- function(df, background_df, model_path = "./model.rds"){
   # Generate predictions using the saved model and the input dataframe.

   # The predict_outcomes function accepts a dataframe as an argument
@@ -58,7 +58,8 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
   # they did.

   # Parameters:
-  # df (dataframe): The input dataframe for which predictions are to be made.
+  # df (dataframe): The data dataframe for which predictions are to be made.
+  # df (dataframe): The background data dataframe for which predictions are to be made.
   # model_path (str): The path to the saved model file (which is the output of training.R).

   # Returns:
@@ -73,7 +74,7 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
   model <- readRDS(model_path)

   # Preprocess the fake / holdout data
-  df <- clean_df(df)
+  df <- clean_df(df, background_df)

   # IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards
   # get list of variables *without* the outcome:
@@ -87,9 +88,9 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
   predictions <- ifelse(predictions > 0.5, 1, 0)

   # Output file should be data.frame with two columns, nomem_enc and predictions
-  df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions)
+  df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "prediction" = predictions)
   # Force columnnames (overrides names that may be given by `predict`)
-  names(df_predict) <- c("nomem_encr", "predictions")
+  names(df_predict) <- c("nomem_encr", "prediction")

   # Return only dataset with predictions and identifier
   return( df_predict )
