
Commit 7e38da3

author: vijayg15
committed: model trainer added
1 parent 745874d commit 7e38da3

File tree

7 files changed: +308 −6 lines changed


main.py

+13-1
@@ -3,7 +3,7 @@
 from mlProject.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
 from mlProject.pipeline.stage_02_data_validation import DataValidationTrainingPipeline
 from mlProject.pipeline.stage_03_data_transformation import DataTransformationTrainingPipeline
-
+from mlProject.pipeline.stage_04_model_trainer import ModelTrainerTrainingPipeline



@@ -41,6 +41,18 @@
     data_ingestion = DataTransformationTrainingPipeline()
     data_ingestion.main()
     logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+    logger.exception(e)
+    raise e
+
+
+
+STAGE_NAME = "Model Trainer stage"
+try:
+    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+    data_ingestion = ModelTrainerTrainingPipeline()
+    data_ingestion.main()
+    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
     logger.exception(e)
     raise e
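
A small readability observation, not a change made in this commit: the new block reuses the data_ingestion variable name from the earlier stages. A stage-appropriate name would read more naturally, e.g.:

# Hypothetical rename only; behaviour is identical to the block added above.
STAGE_NAME = "Model Trainer stage"
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    model_trainer = ModelTrainerTrainingPipeline()
    model_trainer.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    raise e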

params.yaml

+8-1
@@ -1 +1,8 @@
-key: value
+RandomForestClassifier:
+  n_estimators: 500
+  criterion: 'gini'
+  max_depth: None
+  min_samples_split: 2
+  min_samples_leaf: 1
+  bootstrap: True
+  ccp_alpha: 0.0
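
This params block feeds get_model_trainer_config further down in the commit (the project reads it through read_yaml in ConfigurationManager). A minimal sketch, not part of the commit, of how these values map onto scikit-learn's RandomForestClassifier, with one caveat: YAML resolves the literal None to the string 'None', not Python None (null or ~ would be needed), which may be why max_depth ends up commented out in the component below.

# Minimal sketch, not part of the commit: load params.yaml with plain PyYAML
# and construct the estimator. Note: YAML parses `None` as the string 'None',
# so max_depth comes through the config as a string, not NoneType.
import yaml
from sklearn.ensemble import RandomForestClassifier

with open("params.yaml") as f:
    params = yaml.safe_load(f)["RandomForestClassifier"]

print(type(params["max_depth"]))  # <class 'str'>

clf = RandomForestClassifier(
    n_estimators=params["n_estimators"],
    criterion=params["criterion"],
    min_samples_split=params["min_samples_split"],
    min_samples_leaf=params["min_samples_leaf"],
    bootstrap=params["bootstrap"],
    ccp_alpha=params["ccp_alpha"],
)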

research/04_model_trainer.ipynb

+173
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'d:\\\\Machine_Learning\\\\self_projects\\\\end_to_end_projects_with_deployment\\\\ML_projects\\\\bank_customer_churn\\\\Machine-Learning-project-with-MLflow-deployment\\\\research'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.chdir(\"../\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'d:\\\\Machine_Learning\\\\self_projects\\\\end_to_end_projects_with_deployment\\\\ML_projects\\\\bank_customer_churn\\\\Machine-Learning-project-with-MLflow-deployment'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = pd.read_csv(\"artifacts/data_transformation/train.csv\")\n",
+    "val_data = pd.read_csv(\"artifacts/data_transformation/test.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train = train_data.drop(['Exited'], axis=1)\n",
+    "y_train = train_data['Exited']\n",
+    "X_val = val_data.drop(['Exited'], axis=1)\n",
+    "y_val = val_data['Exited']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clf = RandomForestClassifier(n_estimators = 1000, \n",
+    "                             criterion = 'gini', \n",
+    "                             max_depth = None,\n",
+    "                             min_samples_split = 2,\n",
+    "                             min_samples_leaf = 1,\n",
+    "                             bootstrap = True,\n",
+    "                             #ccp_alpha = 0.0,\n",
+    "                             n_jobs = -1,\n",
+    "                             verbose = 1\n",
+    "                             )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.\n",
+      "[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s\n",
+      "[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    1.4s\n",
+      "[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    3.5s\n",
+      "[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:    8.3s\n",
+      "[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   11.9s finished\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1)"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clf.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mlops",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
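
The notebook ends right after clf.fit. A hypothetical follow-up cell, not part of this commit, that scores the fitted forest on the validation split the notebook already loaded might be:

# Hypothetical next cell, not in the commit: evaluate on the held-out split.
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))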
src/mlProject/components/model_trainer.py

+37
@@ -0,0 +1,37 @@
+import os
+import joblib
+import pandas as pd
+from mlProject import logger
+from sklearn.ensemble import RandomForestClassifier
+from mlProject.entity.config_entity import ModelTrainerConfig
+
+
+
+class ModelTrainer:
+    def __init__(self, config: ModelTrainerConfig):
+        self.config = config
+
+
+    def train(self):
+        train_data = pd.read_csv(self.config.train_data_path)
+        test_data = pd.read_csv(self.config.test_data_path)
+
+
+        X_train = train_data.drop([self.config.target_column], axis=1)
+        y_train = train_data[[self.config.target_column]]
+        #X_val = test_data.drop([self.config.target_column], axis=1)
+        #y_val = test_data[[self.config.target_column]]
+
+
+        clf = RandomForestClassifier(n_estimators = self.config.n_estimators,
+                                     criterion = self.config.criterion,
+                                     #max_depth = self.config.max_depth,
+                                     min_samples_split = self.config.min_samples_split,
+                                     min_samples_leaf = self.config.min_samples_leaf,
+                                     bootstrap = self.config.bootstrap,
+                                     #ccp_alpha = self.config.ccp_alpha,
+                                     n_jobs=-1, verbose=1, random_state=40)
+        clf.fit(X_train, y_train)
+
+        joblib.dump(clf, os.path.join(self.config.root_dir, self.config.model_name))
+
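
An observation on the new component, not a change in this commit: train() selects the target with double brackets, so y_train is an (n, 1) DataFrame and scikit-learn will emit a DataConversionWarning and ravel it during fit. Passing a 1-D target avoids the warning; inside train() that line could instead read:

# Hypothetical variant, not in the commit: give RandomForestClassifier.fit a 1-D target.
y_train = train_data[self.config.target_column]                    # pandas Series, shape (n,)
# or, equivalently:
y_train = train_data[[self.config.target_column]].values.ravel()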

src/mlProject/config/configuration.py

+29-3
@@ -2,8 +2,8 @@
 from mlProject.utils.common import read_yaml, create_directories
 from mlProject.entity.config_entity import (DataIngestionConfig,
                                             DataValidationConfig,
-                                            DataTransformationConfig,)
-
+                                            DataTransformationConfig,
+                                            ModelTrainerConfig,)


 class ConfigurationManager:
@@ -62,4 +62,30 @@ def get_data_transformation_config(self) -> DataTransformationConfig:
             data_path=config.data_path,
         )

-        return data_transformation_config
+        return data_transformation_config
+
+
+    def get_model_trainer_config(self) -> ModelTrainerConfig:
+        config = self.config.model_trainer
+        params = self.params.RandomForestClassifier
+        schema = self.schema.TARGET_COLUMN
+
+        create_directories([config.root_dir])
+
+        model_trainer_config = ModelTrainerConfig(
+            root_dir=config.root_dir,
+            train_data_path = config.train_data_path,
+            test_data_path = config.test_data_path,
+            model_name = config.model_name,
+            n_estimators = params.n_estimators,
+            criterion = params.criterion,
+            max_depth = params.max_depth,
+            min_samples_split = params.min_samples_split,
+            min_samples_leaf = params.min_samples_leaf,
+            bootstrap = params.bootstrap,
+            ccp_alpha = params.ccp_alpha,
+            target_column = schema.name
+
+        )
+
+        return model_trainer_config
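
get_model_trainer_config reads self.config.model_trainer, a block of config/config.yaml that this commit does not touch. Assuming it mirrors the earlier stages, the entry would look roughly like the sketch below; the keys come from the code above, while the paths and file name are illustrative guesses only.

# Hypothetical config.yaml block assumed by get_model_trainer_config (not in this commit).
model_trainer:
  root_dir: artifacts/model_trainer
  train_data_path: artifacts/data_transformation/train.csv
  test_data_path: artifacts/data_transformation/test.csv
  model_name: model.joblib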

src/mlProject/entity/config_entity.py

+18-1
@@ -23,4 +23,21 @@ class DataValidationConfig:
 @dataclass(frozen=True)
 class DataTransformationConfig:
     root_dir: Path
-    data_path: Path
+    data_path: Path
+
+
+
+@dataclass(frozen=True)
+class ModelTrainerConfig:
+    root_dir: Path
+    train_data_path: Path
+    test_data_path: Path
+    model_name: str
+    n_estimators: int
+    criterion: str
+    max_depth: int
+    min_samples_split: float
+    min_samples_leaf: float
+    bootstrap: bool
+    ccp_alpha: float
+    target_column: str
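
As an aside rather than a change: dataclass annotations are not enforced at runtime, so the int annotation on max_depth will accept the None (or the string 'None') coming from params.yaml without complaint. If the field is meant to allow None, Optional[int] documents that; a sketch with a hypothetical class name to avoid clashing with the real one:

# Sketch only, not part of the commit.
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

@dataclass(frozen=True)
class ModelTrainerConfigSketch:
    root_dir: Path
    max_depth: Optional[int] = None  # params.yaml sets None; the component currently comments this argument out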
Original file line numberDiff line numberDiff line change
src/mlProject/pipeline/stage_04_model_trainer.py

+30

@@ -0,0 +1,30 @@
+from mlProject.config.configuration import ConfigurationManager
+from mlProject.components.model_trainer import ModelTrainer
+from mlProject import logger
+
+
+
+STAGE_NAME = "Model Trainer stage"
+
+class ModelTrainerTrainingPipeline:
+    def __init__(self):
+        pass
+
+    def main(self):
+        config = ConfigurationManager()
+        model_trainer_config = config.get_model_trainer_config()
+        model_trainer_config = ModelTrainer(config=model_trainer_config)
+        model_trainer_config.train()
+
+
+
+
+if __name__ == '__main__':
+    try:
+        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+        obj = ModelTrainerTrainingPipeline()
+        obj.main()
+        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+    except Exception as e:
+        logger.exception(e)
+        raise e
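
For reference, the stage can be run on its own with python src/mlProject/pipeline/stage_04_model_trainer.py, and main.py runs it as the fourth stage. A hypothetical snippet, not part of the commit, that reloads the persisted model afterwards; the artifact path assumes the illustrative model_trainer block sketched earlier.

# Hypothetical usage, not in the commit: reload the dumped estimator and predict.
import joblib
import pandas as pd

clf = joblib.load("artifacts/model_trainer/model.joblib")        # assumed root_dir/model_name
val_data = pd.read_csv("artifacts/data_transformation/test.csv")
print(clf.predict(val_data.drop(["Exited"], axis=1))[:10])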
