Commit dec62d8

remove undersampling, transform copy of dataframe
1 parent d45de88 commit dec62d8

7 files changed: +24 -21 lines

Makefile

+4 -1

@@ -8,6 +8,7 @@
 NO_OF_TEST_FILES := $(words $(wildcard tests/test_*.py))
 NO_OF_REPORT_FILES := $(words $(wildcard reports/))
 NO_OF_REPORT_FILES := $(words $(filter-out reports/.gitkeep, $(SRC_FILES)))
+TEST_CSV := ./data/transformed/test_balanced_creditcard.csv
 
 ###############################################################
 # COMMANDS #
@@ -39,10 +40,12 @@ linting:
 
 test-package:
 	@echo ">>> running coverage pytest"
-	coverage run -m pytest ./tests/
+	coverage run -m pytest ./tests/test_data.py ./tests/test_generate_data.py ./tests/test_train.py ./tests/test_predict.py
 	coverage report -m --include=./tests/*
 
 test: generate-dataset train prediction clean test-package ## run extensive tests
 
 help: ## show help on available commands
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+

ml_skeleton_py/etl/generate_dataset.py

+6 -5

@@ -31,12 +31,13 @@ def remove_outliers(df: pd.DataFrame, params: dict) -> pd.DataFrame:
     Return:
         df (pd.DataFrame): dataframe with removed outliers
     """
+    df_dropped = df.copy(deep=True)
     for variable in ["V10", "V12", "V14"]:
-        upper_outliers = df[variable] > params[f"{variable}_upper"]
-        lower_outliers = df[variable] < params[f"{variable}_lower"]
-        df = df.drop(df[upper_outliers | lower_outliers].index)
-    logger.info(f"Number of Instances after outliers removal: {len(df)}")
-    return df
+        upper_outliers = df_dropped[variable] > params[f"{variable}_upper"]
+        lower_outliers = df_dropped[variable] < params[f"{variable}_lower"]
+        df_dropped = df_dropped.drop(df_dropped[upper_outliers | lower_outliers].index)
+    logger.info(f"Number of Instances after outliers removal: {len(df_dropped)}")
+    return df_dropped
 
 
 def generate(dataset: str) -> Optional[pd.DataFrame]:
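
The point of this hunk: remove_outliers no longer mutates the DataFrame it receives; it drops outlier rows from a deep copy and returns that copy. A minimal sketch of the difference, using a made-up single-column frame and hypothetical bounds in place of the V10/V12/V14 params:

import pandas as pd

# Toy stand-in for the credit-card data (column name and bounds are hypothetical).
df = pd.DataFrame({"V14": [0.1, 9.9, -8.7, 0.3]})

def remove_outliers_copy(df: pd.DataFrame, upper: float, lower: float) -> pd.DataFrame:
    # Work on a deep copy so the caller's frame is left untouched.
    df_dropped = df.copy(deep=True)
    outliers = (df_dropped["V14"] > upper) | (df_dropped["V14"] < lower)
    return df_dropped.drop(df_dropped[outliers].index)

cleaned = remove_outliers_copy(df, upper=5.0, lower=-5.0)
print(len(df), len(cleaned))  # 4 2 -> the original frame keeps all its rows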

ml_skeleton_py/model/predict.py

+1 -1

@@ -37,7 +37,7 @@ def load_model(model_name: str) -> BaseEstimator:
         model = pickle.load(handle)["model"]
     return model
 
-
+# @dploy endpoint predict
 def predict(observation: np.array, model_name: str = "lr.p") -> float:
     """
     Predict one single observation.
ml_skeleton_py/model/train.py

+7 -8

@@ -20,7 +20,7 @@
 logging.getLogger().setLevel(logging.INFO)
 
 
-def train(model_name: str, dataset: str) -> None:
+def train(dataset: str, model_name: str = "lr") -> None:
     """
     Train models using X_train and y_train with a specific classifier.
 
@@ -46,25 +46,24 @@ def train(model_name: str, dataset: str) -> None:
 
     # preprocessing
     scaler = RobustScaler()
-    X = scaler.fit_transform(X)
-    rus = RandomUnderSampler(replacement=False)
-    X, y = rus.fit_resample(X, y)
 
     # In this specific example logistic regression was chosen as
     # the most optimal model after running several experiments.
     classifier = LogisticRegression(max_iter=4000, penalty="l2", C=0.01)
 
+    # create pipeline
+    predict_pipeline = make_pipeline(scaler, classifier)
+
     # training
-    classifier.fit(X, y)
-    training_score = cross_val_score(classifier, X, y, cv=5, scoring="roc_auc")
-    logger.info(f"Classifier: {classifier.__class__.__name__}")
+    predict_pipeline.fit(X, y)
+    training_score = cross_val_score(predict_pipeline, X, y, cv=5, scoring="roc_auc")
+    logger.info(f"Classifier: {predict_pipeline.__class__.__name__}")
     logger.info(
         "Has a training score "
         + f"of {round(training_score.mean(), 2) * 100} % roc_auc"
     )
 
     # saving
-    predict_pipeline = make_pipeline(scaler, classifier)
     pred_result = {
         "clf": model_name,
         "training score roc_auc": training_score.mean(),

scripts/train.py

+4 -4

@@ -7,21 +7,21 @@
 @click.command()
 @click.option("--model_name", default="lr")
 @click.option("--dataset", default="creditcard.csv")
-def train(model_name: str, dataset: str) -> None:
+def train(dataset: str, model_name: str) -> None:
     """
     Train a model on a dataset and store the model and its results.
 
     Parameters:
+        dataset (str): the dataset on which you want to train
+
         model_name (str): the model_name that you want to use as a save
             default:
                 "lr": logistic regression
 
-        dataset (str): the dataset on which you want to train
-
     Returns:
         None
     """
-    model.train(model_name, dataset)
+    model.train(dataset, model_name)
 
 
 if __name__ == "__main__":
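
With the swapped signature, the CLI wrapper now hands the dataset to model.train first and the model name second. A small usage sketch of the equivalent direct call (the import path is assumed from this repo's package layout):

# Roughly equivalent to: python scripts/train.py --dataset creditcard.csv --model_name lr
from ml_skeleton_py import model  # assumed import path

# dataset first, model_name second; the library default for model_name is "lr".
model.train("creditcard.csv", model_name="lr")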

tests/test_predict.py

+1 -1

@@ -13,7 +13,7 @@ def test_predict_1() -> None:
     """
     Test whether an observation makes a prediction.
     """
-    train(MODEL_NAME, DATASET)
+    train(DATASET, MODEL_NAME)
     model_name = "lr_test.p"
     observation = [
         -0.51056756,

tests/test_train.py

+1 -1

@@ -12,7 +12,7 @@ def test_train_lr() -> None:
     """
     Test whether logistic regression is trained and can be loaded.
     """
-    train(MODEL_NAME, DATASET)
+    train(DATASET, MODEL_NAME)
     with open(os.path.join(s.MODEL_DIR, MODEL_NAME) + ".p", "rb") as handle:
         pred_result = pickle.load(handle)
     classifier = is_classifier(pred_result["model"])
