-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
79 lines (49 loc) · 2.11 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import logging
from loan_prediction.config import ProjectConfig
from loan_prediction.data_loader import DataLoader
from loan_prediction.data_processor import DataProcessor
from loan_prediction.loan_classifier_model import LoanClassifierModel
# Training pipeline entry point:
# load config -> load data -> preprocess -> split -> train a loan classifier.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Load configuration from the project YAML file.
config = ProjectConfig.from_yaml(config_path="./config/config.yml")
logger.debug(config)
logger.info("Configuration loaded")

# Load data. The sample file is the default for local runs; switch to the
# raw training file for a full run.
filepath = "./data/sample/sample.csv"
# filepath = "./data/raw/train.csv"
dataloader = DataLoader(filepath)
dataloader.load()
# NOTE: lazy %-formatting so the argument is only rendered if the level is enabled.
logger.info("Data Loaded: %d", len(dataloader.data))

# Initialize the DataProcessor and run preprocessing.
data_processor = DataProcessor(dataloader, config)
data_processor.load_data()
logger.info("DataProcessor initialized.")
logger.info("data_processor data shape: %s", data_processor.dataloader.data.shape)
data_processor.preprocess()
logger.info("DataProcessor processed.")

# Split the data into train and test sets.
train_set, test_set = data_processor.split_train_test()

# Persisting to the catalog requires a Databricks SparkSession; enable both
# lines together when running on Databricks (the success log belongs with the
# call it describes, not unconditionally).
# data_processor.save_to_catalog(train_set=train_set, test_set=test_set, spark=spark)
# logger.info("Saved to catalog")

# Initialize and train the model.
model = LoanClassifierModel(data_processor.preprocessor, config)
X_train, y_train = data_processor.xy_split(train_set)
model.train(X_train, y_train)
logger.info("Model training completed.")

# Evaluation and visualization are currently disabled (train-only run).
# # Evaluate the model
# score = model.evaluate(X_test, y_test)
# logger.info(f"Model evaluation completed: score={score}")
# ## Visualizing Results
# y_pred = model.predict(X_test)
# visualize_results(y_test, y_pred)
# logger.info("Results visualization completed.")
# ## Feature Importance
# feature_importance, feature_names = model.get_feature_importance()
# plot_feature_importance(feature_importance, feature_names)
# logger.info("Feature importance plot generated.")