Skip to content

Commit 8ecbf02

Browse files
updates
1 parent cec3c5d commit 8ecbf02

File tree

639 files changed

+328
-81
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

639 files changed

+328
-81
lines changed

DimRed/analysis.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,44 @@
11
from DimRed import *
2+
from DimRed import *
3+
import math
24

35

46
class Analysis:
57
def __init__(self, X: np.array, y: np.array):
8+
"""Initialize Analysis class.
9+
10+
Args:
11+
X (np.array): Input data.
12+
y (np.array): Target labels.
13+
"""
14+
615
self.X = X
716
self.size_per_side = int(math.sqrt(len(self.X)))
817
self.y = y
918

1019
def produce_combinations(
1120
self, name: str, standard_pipeline: Pipeline, pipeline: Pipeline
1221
) -> None:
22+
"""Produce combinations of scatter plots.
23+
24+
Args:
25+
name (str): Name of the scatter plot.
26+
standard_pipeline (Pipeline): Standard pipeline object.
27+
pipeline (Pipeline): Custom pipeline object.
28+
"""
1329
standard_pipeline.fit(self.X, self.y)
1430
pipeline.fit(self.X, self.y)
1531
X_standard_embedded = standard_pipeline.transform(self.X)
1632
X_custom_pipeline_embedded = pipeline.transform(self.X)
1733
fig = plt.figure(figsize=(12, 10))
1834
fig.suptitle(f"{name}-{standard_pipeline}")
35+
1936
ax_custom = (
2037
fig.add_subplot(111, projection="3d")
2138
if X_custom_pipeline_embedded.shape[-1] > 2
2239
else fig.add_subplot(111)
2340
)
41+
2442
if X_custom_pipeline_embedded.shape[-1] == 1:
2543
x, y = (
2644
X_custom_pipeline_embedded[:, 0],

DimRed/config.py

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,49 @@
11
from DimRed import *
22

3-
3+
# XGBoost configuration
44
xgb_config = {
5-
"objective": "multi:softmax",
6-
"n_estimators": 100,
7-
"max_depth": 3,
8-
"learning_rate": 0.1,
9-
"subsample": 0.8,
10-
"seed": 42,
11-
"tree_method": "gpu_hist",
12-
"num_class": 10,
5+
"objective": "multi:softmax", # Multi-class softmax objective
6+
"n_estimators": 100, # Number of boosted trees (boosting rounds)
7+
"max_depth": 3, # Maximum depth of each tree
8+
"learning_rate": 0.1, # Step-size shrinkage applied at each boosting round
9+
"subsample": 0.8, # Fraction of training instances sampled per tree
10+
"seed": 0, # Random seed
11+
"tree_method": "gpu_hist", # Tree construction method; presumably needs a GPU-enabled XGBoost build - TODO confirm
12+
"num_class": 10, # Number of target classes
1313
}
14+
15+
# LightGBM configuration
1416
lgb_config = {
15-
"objective": "multiclass",
16-
"num_leaves": 31,
17-
"learning_rate": 0.05,
18-
"max_depth": -1,
19-
"subsample": 0.8,
20-
"metric": "multi_logloss",
21-
"seed": 42,
22-
"device": "gpu",
23-
"num_class": 10,
24-
"verbose": -1,
17+
"objective": "multiclass", # Multi-class objective for LightGBM
18+
"num_leaves": 31, # Maximum number of leaves in one tree
19+
"learning_rate": 0.05, # Learning rate for boosting
20+
"max_depth": -1, # Maximum tree depth; -1 presumably means "no limit" - confirm against LightGBM docs
21+
"subsample": 0.8, # Subsample ratio of the training instances
22+
"metric": "multi_logloss", # Evaluation metric reported on the validation sets
23+
"seed": 0, # Random seed
24+
"device": "gpu", # Device to use for training
25+
"num_class": 10, # Number of target classes
26+
"verbose": -1, # Verbosity level; -1 presumably suppresses informational output - confirm
2527
}
2628

29+
# Scikit-learn configuration
2730
# Maps each scikit-learn estimator class to a dict of parameter name -> list of
# candidate values. NOTE(review): the list-valued entries look like a
# parameter grid for a search; the consuming code is not fully visible here - confirm.
sklearn_config = {
28-
LogisticRegression: {},
31+
LogisticRegression: {}, # Logistic Regression: no parameters listed
2932
SVC: {
30-
"kernel": ["rbf"],
31-
"probability": [True],
33+
"kernel": ["rbf"], # Candidate kernel types for the Support Vector Classifier
34+
"probability": [True], # Whether to enable probability estimates
35+
},
36+
DecisionTreeClassifier: {}, # Decision Tree Classifier: no parameters listed
37+
RandomForestClassifier: {}, # Random Forest Classifier: no parameters listed
38+
KNeighborsClassifier: {}, # K-Nearest Neighbors Classifier: no parameters listed
39+
GaussianProcessClassifier: {}, # Gaussian Process Classifier: no parameters listed
40+
MLPClassifier: {
41+
"alpha": [1, 2, 3, 4, 5], # Candidate regularization strengths for the MLP Classifier
42+
"max_iter": [100, 200, 400, 800], # Candidate maximum numbers of iterations
3243
},
33-
DecisionTreeClassifier: {},
34-
RandomForestClassifier: {},
35-
KNeighborsClassifier: {},
36-
GaussianProcessClassifier: {},
37-
MLPClassifier: {"alpha": [1, 2, 3, 4, 5], "max_iter": [100, 200, 400, 800]},
38-
AdaBoostClassifier: {"algorithm": ["SAMME"]},
39-
GaussianNB: {},
40-
QuadraticDiscriminantAnalysis: {},
44+
AdaBoostClassifier: {
45+
"algorithm": ["SAMME"]
46+
}, # AdaBoost Classifier: boosting algorithm variant(s) to use
47+
GaussianNB: {}, # Gaussian Naive Bayes Classifier: no parameters listed
48+
QuadraticDiscriminantAnalysis: {}, # Quadratic Discriminant Analysis Classifier: no parameters listed
4149
}

DimRed/evaluation.py

Lines changed: 81 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,17 @@ def __init__(
1212
lgb_config: Dict[str, Union[str, int]] = lgb_config,
1313
xgb_config: Dict[str, Union[str, int]] = xgb_config,
1414
) -> None:
15+
"""Initialize the Evaluation class.
16+
17+
Args:
18+
_data: A dictionary containing the data.
19+
all_possible_variations: A dictionary containing all possible variations.
20+
labels: An array containing the labels.
21+
metric: The evaluation metric to use (default: "accuracy").
22+
sklearn_config: A dictionary containing the sklearn configuration.
23+
lgb_config: A dictionary containing the lightgbm configuration.
24+
xgb_config: A dictionary containing the xgboost configuration.
25+
"""
1526
self.sklearn_config = sklearn_config
1627
self.lgb_config = lgb_config
1728
self.xgb_config = xgb_config
@@ -30,9 +41,22 @@ def sklearn(
3041
results: Dict = {},
3142
dimred_technique: str = None,
3243
) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
44+
"""Perform evaluation using sklearn models.
45+
46+
Args:
47+
X_train: The training data.
48+
X_test: The testing data.
49+
y_train: The training labels.
50+
y_test: The testing labels.
51+
inner_iterator: The inner iterator.
52+
results: A dictionary to store the results.
53+
dimred_technique: The dimensionality reduction technique.
54+
55+
Returns:
56+
A tuple containing the results and the best model.
57+
"""
3358
best_model = [0, {}]
3459
for model in tqdm(self.sklearn_config):
35-
print(model)
3660
name = dimred_technique + model().__class__.__name__
3761
inner_iterator.set_description(name)
3862
model_config = self.sklearn_config[model]
@@ -56,18 +80,18 @@ def sklearn(
5680
metrics = classification_report(y_test, y_preds, output_dict=True)
5781
results[model.__class__.__name__] = metrics
5882
wandb.log(metrics)
59-
wandb.sklearn.plot_classifier(
60-
model,
61-
X_train,
62-
X_test,
63-
y_train,
64-
y_test,
65-
y_preds,
66-
y_probas,
67-
range(min(y_probas.shape)),
68-
model_name=name,
69-
feature_names=None,
70-
)
83+
# wandb.sklearn.plot_classifier(
84+
# model,
85+
# X_train,
86+
# X_test,
87+
# y_train,
88+
# y_test,
89+
# y_preds,
90+
# y_probas,
91+
# range(min(y_probas.shape)),
92+
# model_name=name,
93+
# feature_names=None,
94+
# )
7195
if metrics[self.metric] > best_model[0]:
7296
best_model[0] = metrics[self.metric]
7397
best_model[1] = metrics
@@ -85,6 +109,19 @@ def xgb(
85109
results: Dict = {},
86110
dimred_technique: str = None,
87111
) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
112+
"""Perform evaluation using xgboost model.
113+
114+
Args:
115+
X_train: The training data.
116+
X_test: The testing data.
117+
y_train: The training labels.
118+
y_test: The testing labels.
119+
results: A dictionary to store the results.
120+
dimred_technique: The dimensionality reduction technique.
121+
122+
Returns:
123+
A tuple containing the results and the metrics.
124+
"""
88125
model = xgb.XGBClassifier(**self.xgb_config)
89126
name = dimred_technique + model.__class__.__name__
90127
wandb.init(
@@ -102,7 +139,7 @@ def xgb(
102139
cp.asarray(X_train),
103140
cp.asarray(y_train),
104141
eval_set=[(cp.asarray(X_test), cp.asarray(y_test))],
105-
callbacks=[WandbCallback(log_model=True)],
142+
# callbacks=[WandbCallback(log_model=True)],
106143
)
107144
y_preds = model.predict(X_test)
108145
metrics = classification_report(y_test, y_preds, output_dict=True)
@@ -122,6 +159,19 @@ def lgb(
122159
results: Dict = {},
123160
dimred_technique: str = None,
124161
) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
162+
"""Perform evaluation using lightgbm model.
163+
164+
Args:
165+
X_train: The training data.
166+
X_test: The testing data.
167+
y_train: The training labels.
168+
y_test: The testing labels.
169+
results: A dictionary to store the results.
170+
dimred_technique: The dimensionality reduction technique.
171+
172+
Returns:
173+
A tuple containing the results and the metrics.
174+
"""
125175
name = dimred_technique + "LGBClf"
126176
wandb.init(
127177
project=PROJECT_NAME,
@@ -139,14 +189,14 @@ def lgb(
139189
self.lgb_config,
140190
train_data,
141191
valid_sets=[test_data],
142-
callbacks=[wandb_callback()],
192+
# callbacks=[wandb_callback()],
143193
)
144194
y_preds = model.predict(X_test)
145195
metrics = classification_report(
146196
y_test, np.argmax(y_preds, axis=1), output_dict=True
147197
)
148198
results[name] = metrics
149-
log_summary(model, save_model_checkpoint=True)
199+
# log_summary(model, save_model_checkpoint=True)
150200
wandb.log(metrics)
151201
wandb.finish()
152202
dirs = director_exist(os.path.join(os.getenv("MODEL_PATH"), run))
@@ -155,6 +205,11 @@ def lgb(
155205
return results, metrics
156206

157207
def evaluate(self) -> Dict[str, Dict[str, Dict[str, Union[str, int]]]]:
208+
"""Perform evaluation of all pipeline variations.
209+
210+
Returns:
211+
A dictionary containing all pipeline performances and the best performances.
212+
"""
158213
all_pipeline_performance = {}
159214
outer_iterator = tqdm(self.all_variations)
160215
best_performances = {
@@ -168,7 +223,7 @@ def evaluate(self) -> Dict[str, Dict[str, Dict[str, Union[str, int]]]]:
168223
specific_pipeline_variations = self.all_variations[pipeline_variation_name]
169224
inner_iterator = tqdm(specific_pipeline_variations, leave=False)
170225
for pipeline_variation in inner_iterator:
171-
name_of_pipeline = pipeline_variation.__class__.__name__
226+
name_of_pipeline = pipeline_variation.steps[-1][-1].__class__.__name__
172227
pipeline_performance = {}
173228
X_train = pipeline_variation.fit_transform(self._data["X_train"])
174229
X_test = pipeline_variation.transform(self._data["X_test"])
@@ -207,19 +262,17 @@ def evaluate(self) -> Dict[str, Dict[str, Dict[str, Union[str, int]]]]:
207262
avg_var = average_metric(
208263
self.metric, [sklearn_metrics, xgb_metrics, lgb_metrics]
209264
)
210-
if avg_var > best_performing_pipeline[0]:
211-
best_performing_pipeline[0] = avg_var
212-
best_performing_pipeline[1] = str(pipeline_variation).strip
213-
# best_performing_pipeline[2] = pipeline_performance
265+
if float(avg_var) > float(best_performing_pipeline[0]):
266+
best_performing_pipeline[0] = str(avg_var)
267+
best_performing_pipeline[1] = name_of_pipeline
214268
inner_iterator.set_description(f"{name_of_pipeline} Done :)")
215-
# best_performances[pipeline_variation_name] = best_performing_pipeline
216269
best_performances = add_to_dictionary(
217270
best_performances, best_performing_pipeline
218271
)
219-
with open(f'{os.getenv("DATA_PATH")}/all_performance_data.json', "w") as f:
220-
json.dump(all_pipeline_performance, f)
221-
with open(
222-
f'{os.getenv("DATA_PATH")}/best_performance_dimred.json', "w"
223-
) as json_f:
224-
json.dump(best_performances, json_f)
272+
with open(f'{os.getenv("DATA_PATH")}/all_performance_data.json', "w") as f:
273+
json.dump(all_pipeline_performance, f)
274+
with open(
275+
f'{os.getenv("DATA_PATH")}/best_performance_dimred.json', "w"
276+
) as json_f:
277+
json.dump(best_performances, json_f)
225278
return all_pipeline_performance, best_performances

DimRed/helper_functions.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@
44
def load_dataset(
55
test_split: float = 0.25,
66
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
7+
"""
8+
Load the digits dataset and split it into training and testing sets.
9+
Parameters:
10+
test_split (float): The proportion of the dataset to include in the test split.
11+
Returns:
12+
Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: A tuple containing the training and testing data and labels.
13+
"""
714
digits = datasets.load_digits()
815
X, y = digits.data, digits.target
916
X_train, X_test, y_train, y_test = train_test_split(
@@ -13,13 +20,29 @@ def load_dataset(
1320

1421

1522
def label_encoding(y_train: np.ndarray, y_test: np.ndarray) -> Tuple:
    """
    Convert class labels to integer codes using one shared encoder.
    Parameters:
        y_train (np.ndarray): The training labels; the encoder is fitted on these.
        y_test (np.ndarray): The testing labels; transformed with the fitted encoder.
    Returns:
        Tuple: (encoded training labels, encoded testing labels).
    """
    encoder = LabelEncoder()
    encoded_train = encoder.fit_transform(y_train)
    # Reuse the mapping learned from the training labels for the test labels.
    encoded_test = encoder.transform(y_test)
    return encoded_train, encoded_test
2035

2136

2237
def average_metric(metric, dictionaries: List[Dict[str, Union[str, int]]]) -> float:
38+
"""
39+
Calculate the average value of a given metric from a list of dictionaries.
40+
Parameters:
41+
metric (str): The metric to calculate the average for.
42+
dictionaries (List[Dict[str, Union[str, int]]]): A list of dictionaries containing the metrics.
43+
Returns:
44+
float: The average value of the metric.
45+
"""
2346
avg = 0
2447
for dictionary in dictionaries:
2548
avg += dictionary[metric]
@@ -29,13 +52,27 @@ def average_metric(metric, dictionaries: List[Dict[str, Union[str, int]]]) -> fl
2952
def add_to_dictionary(
    dictionary: Dict[str, List[Union[str, int]]], list_of_values: List[Union[str, int]]
) -> Dict[str, List[Union[str, int]]]:
    """
    Append one value to each list in a dictionary, matched by position.

    The i-th element of ``list_of_values`` is appended to the list stored
    under the i-th key of ``dictionary`` (in the dictionary's iteration
    order). Values beyond the number of keys are ignored.

    Parameters:
        dictionary (Dict[str, List[Union[str, int]]]): The dictionary whose lists are extended in place.
        list_of_values (List[Union[str, int]]): The values to append, one per key.
    Returns:
        Dict[str, List[Union[str, int]]]: The same dictionary object, updated.
    Raises:
        IndexError: If there are fewer values than keys. The length is checked
            before anything is appended, so the dictionary is never left
            half-updated.
    """
    if len(list_of_values) < len(dictionary):
        raise IndexError(
            f"expected at least {len(dictionary)} values, got {len(list_of_values)}"
        )
    # zip stops at the shorter input, so surplus values are silently ignored.
    for bucket, value in zip(dictionary.values(), list_of_values):
        bucket.append(value)
    return dictionary
3566

3667

3768
def director_exist(path):
    """
    Create the directory (including missing parents) if it does not exist.
    Parameters:
        path (str): The path of the directory.
    Returns:
        str: The same path that was passed in.
    """
    if not os.path.exists(path):
        # exist_ok=True: another process may create the directory between
        # the check above and this call; that must not raise.
        os.makedirs(path, exist_ok=True)
    return path

0 commit comments

Comments
 (0)