
Commit dad4239

Applied feedback from Gert, Lisa and Adrienne
1 parent adbd497 commit dad4239

11 files changed: 186 additions, 117 deletions

.github/workflows/checks.yaml

Lines changed: 10 additions & 2 deletions

@@ -39,7 +39,15 @@ jobs:
           load: true

       - name: Run prediction
-        run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest predict /data/fake_data.csv --out=/data/predictions.csv
+        run: docker run --rm -v "$(pwd)/.:/data" eyra-rank:latest /data/PreFer_fake_data.csv /data/PreFer_fake_background_data.csv --out=/data/predictions.csv
+
+      - name: Build Docker scoring image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: python.Dockerfile
+          tags: eyra-rank:scoring
+          load: true

       - name: Run scoring
-        run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest score /data/predictions.csv /data/fake_data_ground_truth.csv
+        run: docker run --rm -v "$(pwd):/data" --entrypoint conda eyra-rank:scoring run -n eyra-rank python /app/score.py /data/predictions.csv /data/PreFer_fake_data.csv
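
The workflow now builds one image for prediction and a second one for scoring, then runs them against the fake PreFer data. For local testing, the two `docker run` steps can be reproduced roughly as follows — a sketch in Python using subprocess, assuming Docker is installed, both images were already built with the tags used above, and the fake CSV files sit in the current directory:

# Sketch: reproduce the two CI run steps locally (assumes Docker plus the
# eyra-rank:latest and eyra-rank:scoring images built as in the steps above).
import pathlib
import subprocess

data_mount = f"{pathlib.Path.cwd()}:/data"

# Mirrors "Run prediction": the image's entrypoint is /app/run.py.
subprocess.run(
    [
        "docker", "run", "--rm", "-v", data_mount, "eyra-rank:latest",
        "/data/PreFer_fake_data.csv", "/data/PreFer_fake_background_data.csv",
        "--out=/data/predictions.csv",
    ],
    check=True,
)

# Mirrors "Run scoring": override the entrypoint to call score.py instead.
subprocess.run(
    [
        "docker", "run", "--rm", "-v", data_mount, "--entrypoint", "conda",
        "eyra-rank:scoring", "run", "-n", "eyra-rank", "python", "/app/score.py",
        "/data/predictions.csv", "/data/PreFer_fake_data.csv",
    ],
    check=True,
)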

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -3,3 +3,5 @@
 .DS_Store
 .AppleDouble
 .LSOverride
+__pycache__/
+.tool-versions

environment.yml

Lines changed: 5 additions & 4 deletions

@@ -1,8 +1,9 @@
 name: eyra-rank
 channels:
   - defaults
+  - conda-forge
 dependencies:
-  - pandas=1.5
-  - scikit-learn=1.2
-  - joblib=1.1
-  - matplotlib=3.7
+  - pandas=2.2.1
+  - scikit-learn=1.4.1.post1
+  - joblib=1.3.2
+  - matplotlib=3.8.3
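
All four dependencies are now pinned to exact versions. As a quick, optional check that a local environment resolved to the same pins, something like the following standard-library sketch could be used (the version strings are simply the ones listed above):

# Sketch: compare installed package versions against the pins in environment.yml.
from importlib.metadata import version

pins = {
    "pandas": "2.2.1",
    "scikit-learn": "1.4.1.post1",
    "joblib": "1.3.2",
    "matplotlib": "3.8.3",
}

for package, pinned in pins.items():
    installed = version(package)
    marker = "OK" if installed == pinned else f"mismatch (expected {pinned})"
    print(f"{package}: {installed} -> {marker}")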

model.joblib

-2.36 KB
Binary file not shown.

model.rds

-68.7 KB
Binary file not shown.

python.Dockerfile

Lines changed: 5 additions & 4 deletions

@@ -4,10 +4,11 @@ COPY environment.yml /
 RUN conda env create -f /environment.yml

 RUN mkdir /app
+WORKDIR /app

-COPY data /data
-COPY *.py /
-COPY *.joblib /
+COPY *.csv /app
+COPY *.py /app
+COPY *.joblib /app

-ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/run.py"]
+ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/app/run.py"]
 CMD ["predict", "/data/fake_data.csv"]

run.R

Lines changed: 16 additions & 51 deletions

@@ -16,32 +16,19 @@ source("submission.R")

 print_usage <- function() {
   cat("Usage:\n")
-  cat(" Rscript script.R predict INPUT_FILE [--output OUTPUT_FILE]\n")
-  cat(" Rscript script.R score --prediction PREDICTION_FILE --ground_truth GROUND_TRUTH_FILE [--output OUTPUT_FILE]\n")
+  cat(" Rscript script.R DATA_FILE BACKGROUND_DATA_FILE [--output OUTPUT_FILE]\n")
 }

 parse_arguments <- function() {
   args <- list()
   command_args <- commandArgs(trailingOnly = TRUE)
-  if (length(command_args) > 0) {
-    args$command <- command_args[1]
+  if (length(command_args) < 2) {
+    return(args)
+  }

-    if (is.null(args$command)) {
-      stop("Error: No command provided.")
-    }
-
-    if (args$command == "predict") {
-      args$input <- commandArgs(trailingOnly = TRUE)[2]
-      args$output <- get_argument("--output")
-    } else if (args$command == "score") {
-      args$prediction <- get_argument("--prediction")
-      args$ground_truth <- get_argument("--ground_truth")
-      args$output <- get_argument("--output")
-    }
-  } else {
-    stop("Error: No command provided. Run the script with predict or score.")
-  }
-
+  args$data <- commandArgs(trailingOnly = TRUE)[1]
+  args$background_data <- commandArgs(trailingOnly = TRUE)[2]
+  args$output <- get_argument("--output")
   return(args)
 }

@@ -56,41 +43,25 @@ get_argument <- function(arg_name) {
 }

 parse_and_run_predict <- function(args) {
-  if (is.null(args$input)) {
-    stop("Error: Please provide --input argument for prediction.")
+  if (is.null(args$data)||is.null(args$background_data)) {
+    stop("Error: Please provide data and background_data argument for prediction.")
   }

-  cat("Processing input data for prediction from:", args$input, "\n")
+  cat("Processing input data for prediction from:", args$data, " ", args$background_data, "\n")
   if (!is.null(args$output)) {
     cat("Output will be saved to:", args$output, "\n")
   }
-  run_predict(args$input, args$output)
-}
-
-run_score <- function(args) {
-  if (is.null(args$prediction) || is.null(args$ground_truth)) {
-    stop("Error: Please provide --prediction and --ground_truth arguments for scoring.")
-  }
-
-  cat("Scoring predictions from:", args$prediction, "\n")
-  cat("Ground truth data from:", args$ground_truth, "\n")
-  if (!is.null(args$output)) {
-    cat("Evaluation score will be saved to:", args$output, "\n")
-  }
-  # Call your submission function for scoring here
+  run_predict(args$data, args$background_data, args$output)
 }

-run_predict <- function(input_path, output=NULL) {
+run_predict <- function(data_path, background_data_path, output=NULL) {
   if (is.null(output)) {
     output <- stdout()
   }
+  df <- read.csv(data_path, encoding="latin1")
+  background_df <- read.csv(background_data_path, encoding="latin1")

-
-  # Read data from input file
-  df <- read.csv(input_path, encoding="latin1")
-
-  # Make predictions
-  predictions <- predict_outcomes(df) # Assuming predict_outcomes is a function in the submission package
+  predictions <- predict_outcomes(df, background_df)

   # Check if predictions have the required format
   stopifnot(ncol(predictions) == 2,
@@ -105,13 +76,7 @@ run_predict <- function(input_path, output=NULL) {
 main <- function() {
   args <- parse_arguments()

-  if (args$command == "predict") {
-    parse_and_run_predict(args)
-  } else if (args$command == "score") {
-    run_score(args)
-  } else {
-    stop("Error: Invalid command. Use 'predict' or 'score'.")
-  }
+  parse_and_run_predict(args)
 }

 # Call main function

run.py

Lines changed: 18 additions & 28 deletions

@@ -20,31 +20,21 @@
 import pandas as pd
 import submission

-parser = argparse.ArgumentParser(description="Process and score data.")
-subparsers = parser.add_subparsers(dest="command")
+parser = argparse.ArgumentParser(description="Process data.")

-# Process subcommand
-process_parser = subparsers.add_parser(
-    "predict", help="Process input data for prediction."
+parser.add_argument("data_path", help="Path to data data CSV file.")
+parser.add_argument(
+    "background_data_path", help="Path to background data data CSV file."
 )
-process_parser.add_argument("input_path", help="Path to input data CSV file.")
-process_parser.add_argument("--output", help="Path to prediction output CSV file.")
-
-# Score subcommand
-score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.")
-score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
-score_parser.add_argument(
-    "ground_truth_path", help="Path to ground truth outcome CSV file."
-)
-score_parser.add_argument("--output", help="Path to evaluation score output CSV file.")
+parser.add_argument("--output", help="Path to prediction output CSV file.")

 args = parser.parse_args()


-def predict(input_path, output):
+def predict(data_path, background_data_path, output):
     """Predict Score (evaluate) the predictions and write the metrics.

-    This function takes the path to an input CSV file containing the input data.
+    This function takes the path to an data CSV file containing the data data.
     It calls submission.py clean_df and predict_outcomes writes the predictions
     to a new output CSV file.

@@ -53,10 +43,17 @@ def predict(input_path, output):

     if output is None:
         output = sys.stdout
-    df = pd.read_csv(
-        input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
+    data_df = pd.read_csv(
+        data_path, encoding="latin-1", encoding_errors="replace", low_memory=False
+    )
+    background_data_df = pd.read_csv(
+        background_data_path,
+        encoding="latin-1",
+        encoding_errors="replace",
+        low_memory=False,
     )
-    predictions = submission.predict_outcomes(df)
+
+    predictions = submission.predict_outcomes(data_df, background_data_df)
     assert (
         predictions.shape[1] == 2
     ), "Predictions must have two columns: nomem_encr and prediction"
@@ -131,11 +128,4 @@ def score(prediction_path, ground_truth_path, output):

 if __name__ == "__main__":
     args = parser.parse_args()
-    if args.command == "predict":
-        predict(args.input_path, args.output)
-    elif args.command == "score":
-        score(args.prediction_path, args.ground_truth_path, args.output)
-    else:
-        parser.print_help()
-        predict(args.input_path, args.output)
-        sys.exit(1)
+    predict(args.data_path, args.background_data_path, args.output)
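
With these changes, run.py always calls submission.predict_outcomes with two dataframes (the data file and the background-data file) and asserts that the result has exactly the columns nomem_encr and prediction. For illustration only — this is not the submission.py shipped in the repo — a minimal function compatible with that contract might look like:

# Hypothetical placeholder for submission.predict_outcomes under the new
# two-dataframe interface; a real submission would clean both frames and
# apply a trained model instead of predicting a constant.
import pandas as pd


def predict_outcomes(df: pd.DataFrame, background_df: pd.DataFrame) -> pd.DataFrame:
    # Assumes the data file carries the nomem_encr identifier column,
    # as the checks in run.py and submission.R suggest.
    return pd.DataFrame({"nomem_encr": df["nomem_encr"], "prediction": 0})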

score.py

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
+"""
+This script calls submission.py. Add your method to submission.py to run your
+prediction method.
+
+To test your submission use the following command:
+
+python run.py predict
+
+For example:
+
+python run.py predict data/PreFer_fake_data.csv
+
+Optionally, you can use the score function to calculate evaluation scores given
+your predictions and the ground truth within the training dataset.
+
+"""
+
+import sys
+import argparse
+import pandas as pd
+import submission
+
+parser = argparse.ArgumentParser(description="Score data.")
+# Score subcommand
+parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
+# Score subcommand
+parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.")
+# Score subcommand
+parser.add_argument("--output", help="Path to evaluation score output CSV file.")
+
+args = parser.parse_args()
+
+
+def score(prediction_path, ground_truth_path, output):
+    """Score (evaluate) the predictions and write the metrics.
+
+    This function takes the path to a CSV file containing predicted outcomes and the
+    path to a CSV file containing the ground truth outcomes. It calculates the overall
+    prediction accuracy, and precision, recall, and F1 score for having a child
+    and writes these scores to a new output CSV file.
+
+    This function should not be modified.
+    """
+
+    if output is None:
+        output = sys.stdout
+    # Load predictions and ground truth into dataframes
+    predictions_df = pd.read_csv(prediction_path)
+    ground_truth_df = pd.read_csv(ground_truth_path)
+
+    # Merge predictions and ground truth on the 'id' column
+    merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")
+
+    # Calculate accuracy
+    accuracy = len(merged_df[merged_df["prediction"] == merged_df["new_child"]]) / len(
+        merged_df
+    )
+
+    # Calculate true positives, false positives, and false negatives
+    true_positives = len(
+        merged_df[(merged_df["prediction"] == 1) & (merged_df["new_child"] == 1)]
+    )
+    false_positives = len(
+        merged_df[(merged_df["prediction"] == 1) & (merged_df["new_child"] == 0)]
+    )
+    false_negatives = len(
+        merged_df[(merged_df["prediction"] == 0) & (merged_df["new_child"] == 1)]
+    )
+
+    # Calculate precision, recall, and F1 score
+    try:
+        precision = true_positives / (true_positives + false_positives)
+    except ZeroDivisionError:
+        precision = 0
+    try:
+        recall = true_positives / (true_positives + false_negatives)
+    except ZeroDivisionError:
+        recall = 0
+    try:
+        f1_score = 2 * (precision * recall) / (precision + recall)
+    except ZeroDivisionError:
+        f1_score = 0
+    # Write metric output to a new CSV file
+    metrics_df = pd.DataFrame(
+        {
+            "accuracy": [accuracy],
+            "precision": [precision],
+            "recall": [recall],
+            "f1_score": [f1_score],
+        }
+    )
+    metrics_df.to_csv(output, index=False)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    score(args.prediction_path, args.ground_truth_path, args.output)
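
score.py computes accuracy, precision, recall, and F1 by hand from the merged dataframe. As an optional sanity check, the same numbers can be reproduced with scikit-learn (already pinned in environment.yml); the toy frames below are illustrative and not part of the challenge data:

# Sketch: cross-check score.py's hand-rolled metrics with scikit-learn.
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

predictions_df = pd.DataFrame({"nomem_encr": [1, 2, 3, 4], "prediction": [1, 0, 1, 0]})
ground_truth_df = pd.DataFrame({"nomem_encr": [1, 2, 3, 4], "new_child": [1, 0, 0, 0]})
merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")

y_true, y_pred = merged_df["new_child"], merged_df["prediction"]
print("accuracy:", accuracy_score(y_true, y_pred))                      # 0.75
print("precision:", precision_score(y_true, y_pred, zero_division=0))   # 0.5
print("recall:", recall_score(y_true, y_pred, zero_division=0))         # 1.0
print("f1:", f1_score(y_true, y_pred, zero_division=0))                 # ~0.667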

submission.R

Lines changed: 7 additions & 6 deletions

@@ -16,7 +16,7 @@
 # List your packages here. Don't forget to update packages.R!
 library(dplyr) # as an example, not used here

-clean_df <- function(df, background = NULL){
+clean_df <- function(df, background_df){
   # Preprocess the input dataframe to feed the model.
   ### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command

@@ -45,7 +45,7 @@ clean_df <- function(df, background = NULL){
   return(df)
 }

-predict_outcomes <- function(df, model_path = "./model.rds"){
+predict_outcomes <- function(df, background_df, model_path = "./model.rds"){
   # Generate predictions using the saved model and the input dataframe.

   # The predict_outcomes function accepts a dataframe as an argument
@@ -58,7 +58,8 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
   # they did.

   # Parameters:
-  # df (dataframe): The input dataframe for which predictions are to be made.
+  # df (dataframe): The data dataframe for which predictions are to be made.
+  # df (dataframe): The background data dataframe for which predictions are to be made.
   # model_path (str): The path to the saved model file (which is the output of training.R).

   # Returns:
@@ -73,7 +74,7 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
   model <- readRDS(model_path)

   # Preprocess the fake / holdout data
-  df <- clean_df(df)
+  df <- clean_df(df, background_df)

   # IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards
   # get list of variables *without* the outcome:
@@ -87,9 +88,9 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
   predictions <- ifelse(predictions > 0.5, 1, 0)

   # Output file should be data.frame with two columns, nomem_enc and predictions
-  df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions)
+  df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "prediction" = predictions)
   # Force columnnames (overrides names that may be given by `predict`)
-  names(df_predict) <- c("nomem_encr", "predictions")
+  names(df_predict) <- c("nomem_encr", "prediction")

   # Return only dataset with predictions and identifier
   return( df_predict )
