Skip to content

Commit 4e9eec7

Browse files
author
“vijayg15”
committed
data validation added
1 parent 3cc7e28 commit 4e9eec7

File tree

6 files changed

+136
-4
lines changed

6 files changed

+136
-4
lines changed

main.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
import os
22
from mlProject import logger
33
from mlProject.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
4+
from mlProject.pipeline.stage_02_data_validation import DataValidationTrainingPipeline
5+
6+
7+
8+
9+
10+
411

512

613

@@ -10,6 +17,18 @@
1017
data_ingestion = DataIngestionTrainingPipeline()
1118
data_ingestion.main()
1219
logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
20+
except Exception as e:
21+
logger.exception(e)
22+
raise e
23+
24+
25+
26+
# Stage: data validation. Runs after the ingestion stage above and uses the
# same started/completed log banner format.
STAGE_NAME = "Data Validation stage"
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    # Bound to its own name so the ingestion stage's `data_ingestion`
    # variable is not silently rebound to a different pipeline type.
    data_validation = DataValidationTrainingPipeline()
    data_validation.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    # Record the failure through the project logger, then propagate it so
    # the run stops at the failing stage.
    logger.exception(e)
    raise e

schema.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,19 @@
1-
key: value
1+
COLUMNS:
2+
RowNumber: int64
3+
CustomerId: int64
4+
Surname: str
5+
CreditScore: int64
6+
Geography: str
7+
Gender: str
8+
Age: float64
9+
Tenure: int64
10+
Balance: float64
11+
NumOfProducts: int64
12+
HasCrCard: float64
13+
IsActiveMember: float64
14+
EstimatedSalary: float64
15+
Exited: int64
16+
17+
18+
TARGET_COLUMN:
19+
name: Exited
src/mlProject/components/data_validation.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
from mlProject import logger
3+
from mlProject.entity.config_entity import DataValidationConfig
4+
import pandas as pd
5+
6+
7+
class DataValiadtion:
    """Check that a CSV file only contains columns declared in the schema.

    The class name keeps its original spelling because other modules import
    it under this name.
    """

    # The annotation is quoted so it is resolved lazily rather than when the
    # class body executes.
    def __init__(self, config: "DataValidationConfig"):
        """Store the configuration object the validation reads from.

        The object must expose ``unzip_data_dir`` (CSV path), ``STATUS_FILE``
        (path of the status file to write) and ``all_schema`` (mapping whose
        keys are the allowed column names).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate the CSV's column names against the schema keys.

        Every column of the CSV at ``config.unzip_data_dir`` must appear
        among the keys of ``config.all_schema``. The outcome is written once
        to ``config.STATUS_FILE`` as ``Validation status: <True|False>``.

        Previously the status was overwritten on every loop iteration, so
        only the last column decided the result; now a single undeclared
        column anywhere makes the validation fail.

        Returns:
            True when every CSV column is declared in the schema, otherwise
            False.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)

        # Build the lookup set once instead of probing the schema per column.
        declared = set(self.config.all_schema.keys())
        unexpected = [col for col in data.columns if col not in declared]

        validation_status = not unexpected

        # Write the final verdict a single time, after all columns are checked.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status
36+

src/mlProject/config/configuration.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from mlProject.constants import *
22
from mlProject.utils.common import read_yaml, create_directories
3-
from mlProject.entity.config_entity import (DataIngestionConfig,
3+
from mlProject.entity.config_entity import (DataIngestionConfig,
4+
DataValidationConfig
45
)
56

7+
8+
69
class ConfigurationManager:
710
def __init__(
811
self,
@@ -30,4 +33,20 @@ def get_data_ingestion_config(self) -> DataIngestionConfig:
3033
unzip_dir=config.unzip_dir
3134
)
3235

33-
return data_ingestion_config
36+
return data_ingestion_config
37+
38+
39+
def get_data_validation_config(self) -> DataValidationConfig:
    """Assemble the settings object for the data-validation stage.

    Reads the ``data_validation`` section of the loaded config and the
    ``COLUMNS`` section of the loaded schema, passes the section's
    ``root_dir`` to ``create_directories``, and returns the values bundled
    as a ``DataValidationConfig``.
    """
    section = self.config.data_validation
    columns = self.schema.COLUMNS

    # The stage's root directory is handed to the directory helper before
    # the config object is built.
    create_directories([section.root_dir])

    return DataValidationConfig(
        root_dir=section.root_dir,
        STATUS_FILE=section.STATUS_FILE,
        unzip_data_dir=section.unzip_data_dir,
        all_schema=columns,
    )

src/mlProject/entity/config_entity.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,13 @@ class DataIngestionConfig:
77
root_dir: Path
88
source_URL: str
99
local_data_file: Path
10-
unzip_dir: Path
10+
unzip_dir: Path
11+
12+
13+
14+
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation stage."""

    # Root directory of the stage; the configuration manager passes it to
    # create_directories.
    root_dir: Path
    # Path of the file the validation status line is written to.
    STATUS_FILE: str
    # Path of the CSV file that the validation step reads.
    unzip_data_dir: Path
    # COLUMNS section of the schema; its keys are the allowed column names.
    all_schema: dict
src/mlProject/pipeline/stage_02_data_validation.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from mlProject.config.configuration import ConfigurationManager
2+
from mlProject.components.data_validation import DataValiadtion
3+
from mlProject import logger
4+
5+
6+
# Label interpolated into the started/completed log banners of this stage.
STAGE_NAME = "Data Validation stage"
7+
8+
class DataValidationTrainingPipeline:
    """Stage wrapper that builds the validation component from configuration
    and runs its column check."""

    def __init__(self):
        """Nothing to initialise; the stage is constructed without arguments."""

    def main(self):
        """Create the validation config, build the component and run it.

        The boolean returned by ``validate_all_columns`` is not used here.
        """
        validation_config = ConfigurationManager().get_data_validation_config()
        validator = DataValiadtion(config=validation_config)
        validator.validate_all_columns()
17+
18+
19+
20+
21+
22+
if __name__ == '__main__':
    # Script entry point: run only this stage, wrapped in the same
    # started/completed log banners used elsewhere.
    try:
        logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
        pipeline = DataValidationTrainingPipeline()
        pipeline.main()
        logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
    except Exception as e:
        # Hand the error to the project logger, then re-raise it.
        logger.exception(e)
        raise e
31+

0 commit comments

Comments
 (0)