Skip to content

Commit 3cc7e28

Browse files
author
“vijayg15”
committed
data ingestion added
1 parent b0840a7 commit 3cc7e28

File tree

11 files changed

+317
-0
lines changed

11 files changed

+317
-0
lines changed

config/config.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
artifacts_root: artifacts
2+
3+
4+
data_ingestion:
5+
root_dir: artifacts/data_ingestion
6+
source_URL: D:\Machine_Learning/self_projects/end_to_end_projects_with_deployment/ML_projects/bank_customer_churn/data.zip
7+
local_data_file: artifacts/data_ingestion/data.zip
8+
unzip_dir: artifacts/data_ingestion
9+
10+
11+
data_validation:
12+
root_dir: artifacts/data_validation
13+
unzip_data_dir: artifacts/data_ingestion/Churn_Modelling.csv
14+
STATUS_FILE: artifacts/data_validation/status.txt
15+
16+
17+
data_transformation:
18+
root_dir: artifacts/data_transformation
19+
data_path: artifacts/data_ingestion/Churn_Modelling.csv
20+
21+
22+
model_trainer:
23+
root_dir: artifacts/model_trainer
24+
train_data_path: artifacts/data_transformation/train.csv
25+
test_data_path: artifacts/data_transformation/test.csv
26+
model_name: model.joblib
27+
28+
29+
model_evaluation:
30+
root_dir: artifacts/model_evaluation
31+
test_data_path: artifacts/data_transformation/test.csv
32+
model_path: artifacts/model_trainer/model.joblib
33+
metric_file_name: artifacts/model_evaluation/metrics.json

main.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import os
2+
from mlProject import logger
3+
from mlProject.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
4+
5+
6+
7+
STAGE_NAME = "Data Ingestion stage"

# Run the data-ingestion stage. A failure is logged together with its
# traceback and then propagated to the caller unchanged.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionTrainingPipeline()
    data_ingestion.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    # A bare `raise` re-raises the exception currently being handled.
    raise

params.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
key: value

schema.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
key: value

src/mlProject/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import os
2+
import sys
3+
import logging
4+
5+
# Record layout used for every log line: timestamp, level, module, message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"

# Log records are written to logs/running_logs.log; the directory is created
# as a side effect of importing this package.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, "running_logs.log")

# Send INFO-and-above records both to the log file and to stdout.
logging.basicConfig(
    format=logging_str,
    level=logging.INFO,
    handlers=[
        logging.FileHandler(log_filepath),
        logging.StreamHandler(sys.stdout),
    ],
)

# Shared logger that the other modules obtain via `from mlProject import logger`.
logger = logging.getLogger("mlProjectLogger")
src/mlProject/components/data_ingestion.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
import shutil
3+
import urllib.request as request
4+
import zipfile
5+
from mlProject import logger
6+
from mlProject.utils.common import get_size
7+
from pathlib import Path
8+
from mlProject.entity.config_entity import (DataIngestionConfig)
9+
10+
11+
class DataIngestion:
    """Copy the dataset archive into the artifacts area and unpack it.

    The methods read three settings from ``config``: ``source_URL`` (where
    the archive is copied from), ``local_data_file`` (where the copy is
    stored) and ``unzip_dir`` (where the archive is extracted).
    """

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for use by the other methods."""
        self.config = config

    def transfer_file(self):
        """Copy the source archive to ``config.local_data_file`` if absent.

        When the destination already exists nothing is copied and only its
        size is logged. Returns None.
        """
        if os.path.exists(self.config.local_data_file):
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")
            return

        # Copy to the exact path that the existence check above and
        # extract_zip_file() use. Copying into root_dir would keep the
        # source's own file name, which need not match local_data_file.
        destination = Path(self.config.local_data_file)
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(self.config.source_URL, destination)
        logger.info(f"{self.config.local_data_file} is copied!")

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created when it does not exist yet.
        Returns None.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, "r") as zip_ref:
            zip_ref.extractall(unzip_path)
36+

src/mlProject/config/configuration.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from mlProject.constants import *
2+
from mlProject.utils.common import read_yaml, create_directories
3+
from mlProject.entity.config_entity import (DataIngestionConfig,
4+
)
5+
6+
class ConfigurationManager:
    """Load the project's YAML files and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main configuration YAML
            params_filepath: path of the parameters YAML
            schema_filepath: path of the schema YAML
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The stage's ``root_dir`` is created before the settings object is
        returned.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

src/mlProject/constants/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from pathlib import Path
2+
3+
# Relative locations of the project's YAML files.
CONFIG_FILE_PATH = Path("config", "config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
SCHEMA_FILE_PATH = Path("schema.yaml")

src/mlProject/entity/config_entity.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from dataclasses import dataclass
2+
from pathlib import Path
3+
4+
5+
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage."""

    # Directory that holds this stage's artifacts.
    root_dir: Path
    # Location the dataset archive is copied from.
    source_URL: str
    # Path of the archive inside the artifacts area.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
src/mlProject/pipeline/stage_01_data_ingestion.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from mlProject.config.configuration import ConfigurationManager
2+
from mlProject.components.data_ingestion import DataIngestion
3+
from mlProject import logger
4+
5+
6+
7+
STAGE_NAME = "Data Ingestion stage"


class DataIngestionTrainingPipeline:
    """Run the data-ingestion stage from configuration to extracted files."""

    def __init__(self):
        """Nothing to set up; all objects are created inside ``main``."""

    def main(self):
        """Build the ingestion settings, copy the archive, then unzip it."""
        ingestion_config = ConfigurationManager().get_data_ingestion_config()
        stage = DataIngestion(config=ingestion_config)
        stage.transfer_file()
        stage.extract_zip_file()
19+
20+
21+
22+
# When this module is executed as a script, run the stage. A failure is
# logged together with its traceback and then propagated unchanged.
if __name__ == "__main__":
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        obj = DataIngestionTrainingPipeline()
        obj.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        # A bare `raise` re-raises the exception currently being handled.
        raise
31+

src/mlProject/utils/common.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import os
2+
from box.exceptions import BoxValueError
3+
import yaml
4+
from mlProject import logger
5+
import json
6+
import joblib
7+
from ensure import ensure_annotations
8+
from box import ConfigBox
9+
from pathlib import Path
10+
from typing import Any
11+
12+
13+
14+
@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Load a YAML file and wrap its content in a ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file

    Raises:
        ValueError: if ConfigBox rejects the parsed content with a
            BoxValueError (reported as an empty YAML file)

    Returns:
        ConfigBox: the parsed content with attribute-style access
    """
    with open(path_to_yaml) as stream:
        parsed = yaml.safe_load(stream)
        logger.info(f"yaml file: {path_to_yaml} loaded successfully")

    # Any other exception propagates to the caller unchanged.
    try:
        return ConfigBox(parsed)
    except BoxValueError:
        raise ValueError("yaml file is empty")
37+
38+
39+
40+
@ensure_annotations
def create_directories(path_to_directories: list, verbose=True):
    """Create every directory named in the given list.

    Directories that already exist are left as they are.

    Args:
        path_to_directories (list): directory paths to create
        verbose (bool, optional): log each path after it is handled.
            Defaults to True.
    """
    for directory in path_to_directories:
        os.makedirs(directory, exist_ok=True)
        if not verbose:
            continue
        logger.info(f"created directory at: {directory}")
52+
53+
54+
@ensure_annotations
def save_json(path: Path, data: dict):
    """Write a dictionary to a JSON file with 4-space indentation.

    Args:
        path (Path): destination of the JSON file
        data (dict): content to serialise
    """
    with open(path, "w") as json_file:
        json.dump(data, json_file, indent=4)

    logger.info(f"json file saved at: {path}")
66+
67+
68+
69+
70+
@ensure_annotations
def load_json(path: Path) -> ConfigBox:
    """Read a JSON file and wrap its content in a ConfigBox.

    Args:
        path (Path): location of the JSON file

    Returns:
        ConfigBox: the parsed content with attribute-style access
    """
    with open(path) as json_file:
        parsed = json.load(json_file)

    logger.info(f"json file loaded succesfully from: {path}")
    return ConfigBox(parsed)
85+
86+
87+
@ensure_annotations
def save_bin(data: Any, path: Path):
    """Serialise an object to a file with joblib.

    Args:
        data (Any): object to store
        path (Path): destination file
    """
    joblib.dump(data, path)
    logger.info(f"binary file saved at: {path}")
97+
98+
99+
@ensure_annotations
def load_bin(path: Path) -> Any:
    """Load an object that was stored with joblib.

    Args:
        path (Path): file to read

    Returns:
        Any: the object stored in the file
    """
    loaded = joblib.load(path)
    logger.info(f"binary file loaded from: {path}")
    return loaded
112+
113+
114+
115+
@ensure_annotations
def get_size(path: Path) -> str:
    """Report a file's size in KB as a short human-readable string.

    Args:
        path (Path): file to measure

    Returns:
        str: the size rounded to a whole number of KB, formatted as "~ N KB"
    """
    byte_count = os.path.getsize(path)
    size_in_kb = round(byte_count / 1024)
    return f"~ {size_in_kb} KB"
127+
128+
129+
130+

0 commit comments

Comments
 (0)