
Commit 7e38da3

author: vijayg15
committed: model trainer added
1 parent 745874d commit 7e38da3

File tree

7 files changed: +308 −6 lines changed


main.py

+13-1
@@ -3,7 +3,7 @@
 from mlProject.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
 from mlProject.pipeline.stage_02_data_validation import DataValidationTrainingPipeline
 from mlProject.pipeline.stage_03_data_transformation import DataTransformationTrainingPipeline
-
+from mlProject.pipeline.stage_04_model_trainer import ModelTrainerTrainingPipeline



@@ -41,6 +41,18 @@
     data_ingestion = DataTransformationTrainingPipeline()
     data_ingestion.main()
     logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+    logger.exception(e)
+    raise e
+
+
+
+STAGE_NAME = "Model Trainer stage"
+try:
+    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+    data_ingestion = ModelTrainerTrainingPipeline()
+    data_ingestion.main()
+    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
     logger.exception(e)
     raise e
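
A small readability observation, not a change made in this commit: the new block reuses the data_ingestion variable name from the earlier stages. A stage-appropriate name would read more naturally, e.g.:

# Hypothetical rename only; behaviour is identical to the block added above.
STAGE_NAME = "Model Trainer stage"
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    model_trainer = ModelTrainerTrainingPipeline()
    model_trainer.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    raise e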

params.yaml

+8-1
@@ -1 +1,8 @@
-key: value
+RandomForestClassifier:
+  n_estimators: 500
+  criterion: 'gini'
+  max_depth: None
+  min_samples_split: 2
+  min_samples_leaf: 1
+  bootstrap: True
+  ccp_alpha: 0.0
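
This params block feeds get_model_trainer_config further down in the commit (the project reads it through read_yaml in ConfigurationManager). A minimal sketch, not part of the commit, of how these values map onto scikit-learn's RandomForestClassifier, with one caveat: YAML resolves the literal None to the string 'None', not Python None (null or ~ would be needed), which may be why max_depth ends up commented out in the component below.

# Minimal sketch, not part of the commit: load params.yaml with plain PyYAML
# and construct the estimator. Note: YAML parses `None` as the string 'None',
# so max_depth comes through the config as a string, not NoneType.
import yaml
from sklearn.ensemble import RandomForestClassifier

with open("params.yaml") as f:
    params = yaml.safe_load(f)["RandomForestClassifier"]

print(type(params["max_depth"]))  # <class 'str'>

clf = RandomForestClassifier(
    n_estimators=params["n_estimators"],
    criterion=params["criterion"],
    min_samples_split=params["min_samples_split"],
    min_samples_leaf=params["min_samples_leaf"],
    bootstrap=params["bootstrap"],
    ccp_alpha=params["ccp_alpha"],
)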

research/04_model_trainer.ipynb

+173
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'d:\\\\Machine_Learning\\\\self_projects\\\\end_to_end_projects_with_deployment\\\\ML_projects\\\\bank_customer_churn\\\\Machine-Learning-project-with-MLflow-deployment\\\\research'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.chdir(\"../\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'d:\\\\Machine_Learning\\\\self_projects\\\\end_to_end_projects_with_deployment\\\\ML_projects\\\\bank_customer_churn\\\\Machine-Learning-project-with-MLflow-deployment'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = pd.read_csv(\"artifacts/data_transformation/train.csv\")\n",
+    "val_data = pd.read_csv(\"artifacts/data_transformation/test.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train = train_data.drop(['Exited'], axis=1)\n",
+    "y_train = train_data['Exited']\n",
+    "X_val = val_data.drop(['Exited'], axis=1)\n",
+    "y_val = val_data['Exited']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clf = RandomForestClassifier(n_estimators = 1000, \n",
+    "                             criterion = 'gini', \n",
+    "                             max_depth = None,\n",
+    "                             min_samples_split = 2,\n",
+    "                             min_samples_leaf = 1,\n",
+    "                             bootstrap = True,\n",
+    "                             #ccp_alpha = 0.0,\n",
+    "                             n_jobs = -1,\n",
+    "                             verbose = 1\n",
+    "                             )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.\n",
+      "[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s\n",
+      "[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    1.4s\n",
+      "[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    3.5s\n",
+      "[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:    8.3s\n",
+      "[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   11.9s finished\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1)"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clf.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mlops",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
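
The notebook ends right after clf.fit. A hypothetical follow-up cell, not part of this commit, that scores the fitted forest on the validation split the notebook already loaded might be:

# Hypothetical next cell, not in the commit: evaluate on the held-out split.
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))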
src/mlProject/components/model_trainer.py

+37
@@ -0,0 +1,37 @@
+import os
+import joblib
+import pandas as pd
+from mlProject import logger
+from sklearn.ensemble import RandomForestClassifier
+from mlProject.entity.config_entity import ModelTrainerConfig
+
+
+
+class ModelTrainer:
+    def __init__(self, config: ModelTrainerConfig):
+        self.config = config
+
+
+    def train(self):
+        train_data = pd.read_csv(self.config.train_data_path)
+        test_data = pd.read_csv(self.config.test_data_path)
+
+
+        X_train = train_data.drop([self.config.target_column], axis=1)
+        y_train = train_data[[self.config.target_column]]
+        #X_val = test_data.drop([self.config.target_column], axis=1)
+        #y_val = test_data[[self.config.target_column]]
+
+
+        clf = RandomForestClassifier(n_estimators = self.config.n_estimators,
+                                     criterion = self.config.criterion,
+                                     #max_depth = self.config.max_depth,
+                                     min_samples_split = self.config.min_samples_split,
+                                     min_samples_leaf = self.config.min_samples_leaf,
+                                     bootstrap = self.config.bootstrap,
+                                     #ccp_alpha = self.config.ccp_alpha,
+                                     n_jobs=-1, verbose=1, random_state=40)
+        clf.fit(X_train, y_train)
+
+        joblib.dump(clf, os.path.join(self.config.root_dir, self.config.model_name))
+
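
An observation on the new component, not a change in this commit: train() selects the target with double brackets, so y_train is an (n, 1) DataFrame and scikit-learn will emit a DataConversionWarning and ravel it during fit. Passing a 1-D target avoids the warning; inside train() that line could instead read:

# Hypothetical variant, not in the commit: give RandomForestClassifier.fit a 1-D target.
y_train = train_data[self.config.target_column]                    # pandas Series, shape (n,)
# or, equivalently:
y_train = train_data[[self.config.target_column]].values.ravel()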

src/mlProject/config/configuration.py

+29-3
@@ -2,8 +2,8 @@
 from mlProject.utils.common import read_yaml, create_directories
 from mlProject.entity.config_entity import (DataIngestionConfig,
                                             DataValidationConfig,
-                                            DataTransformationConfig,)
-
+                                            DataTransformationConfig,
+                                            ModelTrainerConfig,)


 class ConfigurationManager:
@@ -62,4 +62,30 @@ def get_data_transformation_config(self) -> DataTransformationConfig:
             data_path=config.data_path,
         )

-        return data_transformation_config
+        return data_transformation_config
+
+
+    def get_model_trainer_config(self) -> ModelTrainerConfig:
+        config = self.config.model_trainer
+        params = self.params.RandomForestClassifier
+        schema = self.schema.TARGET_COLUMN
+
+        create_directories([config.root_dir])
+
+        model_trainer_config = ModelTrainerConfig(
+            root_dir=config.root_dir,
+            train_data_path = config.train_data_path,
+            test_data_path = config.test_data_path,
+            model_name = config.model_name,
+            n_estimators = params.n_estimators,
+            criterion = params.criterion,
+            max_depth = params.max_depth,
+            min_samples_split = params.min_samples_split,
+            min_samples_leaf = params.min_samples_leaf,
+            bootstrap = params.bootstrap,
+            ccp_alpha = params.ccp_alpha,
+            target_column = schema.name
+
+        )
+
+        return model_trainer_config
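
get_model_trainer_config reads self.config.model_trainer, a block of config/config.yaml that this commit does not touch. Assuming it mirrors the earlier stages, the entry would look roughly like the sketch below; the keys come from the code above, while the paths and file name are illustrative guesses only.

# Hypothetical config.yaml block assumed by get_model_trainer_config (not in this commit).
model_trainer:
  root_dir: artifacts/model_trainer
  train_data_path: artifacts/data_transformation/train.csv
  test_data_path: artifacts/data_transformation/test.csv
  model_name: model.joblib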

src/mlProject/entity/config_entity.py

+18-1
@@ -23,4 +23,21 @@ class DataValidationConfig:
 @dataclass(frozen=True)
 class DataTransformationConfig:
     root_dir: Path
-    data_path: Path
+    data_path: Path
+
+
+
+@dataclass(frozen=True)
+class ModelTrainerConfig:
+    root_dir: Path
+    train_data_path: Path
+    test_data_path: Path
+    model_name: str
+    n_estimators: int
+    criterion: str
+    max_depth: int
+    min_samples_split: float
+    min_samples_leaf: float
+    bootstrap: bool
+    ccp_alpha: float
+    target_column: str
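
As an aside rather than a change: dataclass annotations are not enforced at runtime, so the int annotation on max_depth will accept the None (or the string 'None') coming from params.yaml without complaint. If the field is meant to allow None, Optional[int] documents that; a sketch with a hypothetical class name to avoid clashing with the real one:

# Sketch only, not part of the commit.
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

@dataclass(frozen=True)
class ModelTrainerConfigSketch:
    root_dir: Path
    max_depth: Optional[int] = None  # params.yaml sets None; the component currently comments this argument out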
Original file line numberDiff line numberDiff line change
src/mlProject/pipeline/stage_04_model_trainer.py

+30

@@ -0,0 +1,30 @@
+from mlProject.config.configuration import ConfigurationManager
+from mlProject.components.model_trainer import ModelTrainer
+from mlProject import logger
+
+
+
+STAGE_NAME = "Model Trainer stage"
+
+class ModelTrainerTrainingPipeline:
+    def __init__(self):
+        pass
+
+    def main(self):
+        config = ConfigurationManager()
+        model_trainer_config = config.get_model_trainer_config()
+        model_trainer_config = ModelTrainer(config=model_trainer_config)
+        model_trainer_config.train()
+
+
+
+
+if __name__ == '__main__':
+    try:
+        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+        obj = ModelTrainerTrainingPipeline()
+        obj.main()
+        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+    except Exception as e:
+        logger.exception(e)
+        raise e
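
For reference, the stage can be run on its own with python src/mlProject/pipeline/stage_04_model_trainer.py, and main.py runs it as the fourth stage. A hypothetical snippet, not part of the commit, that reloads the persisted model afterwards; the artifact path assumes the illustrative model_trainer block sketched earlier.

# Hypothetical usage, not in the commit: reload the dumped estimator and predict.
import joblib
import pandas as pd

clf = joblib.load("artifacts/model_trainer/model.joblib")        # assumed root_dir/model_name
val_data = pd.read_csv("artifacts/data_transformation/test.csv")
print(clf.predict(val_data.drop(["Exited"], axis=1))[:10])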
