Programmer-RD-AI
diff --git a/DimRed/analysis.py b/DimRed/analysis.py
Lines changed: 18 additions & 0 deletions
diff --git a/DimRed/config.py b/DimRed/config.py
Lines changed: 38 additions & 30 deletions
diff --git a/DimRed/evaluation.py b/DimRed/evaluation.py
Lines changed: 81 additions & 28 deletions
diff --git a/DimRed/helper_functions.py b/DimRed/helper_functions.py
Lines changed: 38 additions & 1 deletion
@@ -1,26 +1,44 @@
 from DimRed import *
+from DimRed import *
+import math
 
 
 class Analysis:
     def __init__(self, X: np.array, y: np.array):
+        """Initialize Analysis class.
+
+        Args:
+            X (np.array): Input data.
+            y (np.array): Target labels.
+        """
+
         self.X = X
         self.size_per_side = int(math.sqrt(len(self.X)))
         self.y = y
 
     def produce_combinations(
         self, name: str, standard_pipeline: Pipeline, pipeline: Pipeline
     ) -> None:
+        """Produce combinations of scatter plots.
+
+        Args:
+            name (str): Prefix of the figure title.
+            standard_pipeline (Pipeline): Standard pipeline object.
+            pipeline (Pipeline): Custom pipeline object.
+        """
         standard_pipeline.fit(self.X, self.y)
         pipeline.fit(self.X, self.y)
         X_standard_embedded = standard_pipeline.transform(self.X)
         X_custom_pipeline_embedded = pipeline.transform(self.X)
         fig = plt.figure(figsize=(12, 10))
         fig.suptitle(f"{name}-{standard_pipeline}")
+
         ax_custom = (
             fig.add_subplot(111, projection="3d")
             if X_custom_pipeline_embedded.shape[-1] > 2
             else fig.add_subplot(111)
         )
+
         if X_custom_pipeline_embedded.shape[-1] == 1:
             x, y = (
                 X_custom_pipeline_embedded[:, 0],
 
@@ -1,41 +1,49 @@
 from DimRed import *
 
-
+# XGBoost configuration
 xgb_config = {
-    "objective": "multi:softmax",
-    "n_estimators": 100,
-    "max_depth": 3,
-    "learning_rate": 0.1,
-    "subsample": 0.8,
-    "seed": 42,
-    "tree_method": "gpu_hist",
-    "num_class": 10,
+    "objective": "multi:softmax",  # Objective function for XGBoost
+    "n_estimators": 100,  # Number of boosting rounds
+    "max_depth": 3,  # Maximum depth of each tree
+    "learning_rate": 0.1,  # Learning rate for boosting
+    "subsample": 0.8,  # Subsample ratio of the training instances
+    "seed": 0,  # Random seed
+    "tree_method": "gpu_hist",  # GPU histogram tree builder (deprecated in XGBoost 2.0; use "hist" with device="cuda")
+    "num_class": 10,  # Number of classes
 }
+
+# LightGBM configuration
 lgb_config = {
-    "objective": "multiclass",
-    "num_leaves": 31,
-    "learning_rate": 0.05,
-    "max_depth": -1,
-    "subsample": 0.8,
-    "metric": "multi_logloss",
-    "seed": 42,
-    "device": "gpu",
-    "num_class": 10,
-    "verbose": -1,
+    "objective": "multiclass",  # Objective function for LightGBM
+    "num_leaves": 31,  # Maximum number of leaves in one tree
+    "learning_rate": 0.05,  # Learning rate for boosting
+    "max_depth": -1,  # Maximum depth of each tree (<= 0 means no limit)
+    "subsample": 0.8,  # Row-sampling ratio (alias of bagging_fraction; needs bagging_freq > 0 to take effect)
+    "metric": "multi_logloss",  # Metric to be used for evaluation
+    "seed": 0,  # Random seed
+    "device": "gpu",  # Device to use for training
+    "num_class": 10,  # Number of classes
+    "verbose": -1,  # Verbosity (< 0 logs fatal errors only)
 }
 
+# Scikit-learn configuration
 sklearn_config = {
-    LogisticRegression: {},
+    LogisticRegression: {},  # Configuration for Logistic Regression
     SVC: {
-        "kernel": ["rbf"],
-        "probability": [True],
+        "kernel": ["rbf"],  # Kernel type for Support Vector Classifier
+        "probability": [True],  # Whether to enable probability estimates
+    },
+    DecisionTreeClassifier: {},  # Configuration for Decision Tree Classifier
+    RandomForestClassifier: {},  # Configuration for Random Forest Classifier
+    KNeighborsClassifier: {},  # Configuration for K-Nearest Neighbors Classifier
+    GaussianProcessClassifier: {},  # Configuration for Gaussian Process Classifier
+    MLPClassifier: {
+        "alpha": [1, 2, 3, 4, 5],  # Regularization parameter for MLP Classifier
+        "max_iter": [100, 200, 400, 800],  # Maximum number of iterations
     },
-    DecisionTreeClassifier: {},
-    RandomForestClassifier: {},
-    KNeighborsClassifier: {},
-    GaussianProcessClassifier: {},
-    MLPClassifier: {"alpha": [1, 2, 3, 4, 5], "max_iter": [100, 200, 400, 800]},
-    AdaBoostClassifier: {"algorithm": ["SAMME"]},
-    GaussianNB: {},
-    QuadraticDiscriminantAnalysis: {},
+    AdaBoostClassifier: {
+        "algorithm": ["SAMME"]
+    },  # Configuration for AdaBoost Classifier
+    GaussianNB: {},  # Configuration for Gaussian Naive Bayes Classifier
+    QuadraticDiscriminantAnalysis: {},  # Configuration for Quadratic Discriminant Analysis Classifier
 }
@@ -12,6 +12,17 @@ def __init__(
         lgb_config: Dict[str, Union[str, int]] = lgb_config,
         xgb_config: Dict[str, Union[str, int]] = xgb_config,
     ) -> None:
+        """Initialize the Evaluation class.
+
+        Args:
+            _data: A dictionary containing the data.
+            all_possible_variations: A dictionary containing all possible variations.
+            labels: An array containing the labels.
+            metric: The evaluation metric to use (default: "accuracy").
+            sklearn_config: A dictionary containing the sklearn configuration.
+            lgb_config: A dictionary containing the lightgbm configuration.
+            xgb_config: A dictionary containing the xgboost configuration.
+        """
         self.sklearn_config = sklearn_config
         self.lgb_config = lgb_config
         self.xgb_config = xgb_config
@@ -30,9 +41,22 @@ def sklearn(
         results: Dict = {},
         dimred_technique: str = None,
     ) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
+        """Perform evaluation using sklearn models.
+
+        Args:
+            X_train: The training data.
+            X_test: The testing data.
+            y_train: The training labels.
+            y_test: The testing labels.
+            inner_iterator: The inner iterator.
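+            results: Dictionary updated in place with each model's metrics. The default is a single shared dict, so pass a fresh one per call.
+            dimred_technique: Name of the dimensionality reduction technique, prefixed to each model's class name. Must be a string; the None default cannot be concatenated.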
+
+        Returns:
+            A tuple containing the results and the best model.
+        """
         best_model = [0, {}]
         for model in tqdm(self.sklearn_config):
-            print(model)
             name = dimred_technique + model().__class__.__name__
             inner_iterator.set_description(name)
             model_config = self.sklearn_config[model]
@@ -56,18 +80,18 @@ def sklearn(
             metrics = classification_report(y_test, y_preds, output_dict=True)
             results[model.__class__.__name__] = metrics
             wandb.log(metrics)
-            wandb.sklearn.plot_classifier(
-                model,
-                X_train,
-                X_test,
-                y_train,
-                y_test,
-                y_preds,
-                y_probas,
-                range(min(y_probas.shape)),
-                model_name=name,
-                feature_names=None,
-            )
+            # wandb.sklearn.plot_classifier(
+            #     model,
+            #     X_train,
+            #     X_test,
+            #     y_train,
+            #     y_test,
+            #     y_preds,
+            #     y_probas,
+            #     range(min(y_probas.shape)),
+            #     model_name=name,
+            #     feature_names=None,
+            # )
             if metrics[self.metric] > best_model[0]:
                 best_model[0] = metrics[self.metric]
                 best_model[1] = metrics
@@ -85,6 +109,19 @@ def xgb(
         results: Dict = {},
         dimred_technique: str = None,
     ) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
+        """Perform evaluation using xgboost model.
+
+        Args:
+            X_train: The training data.
+            X_test: The testing data.
+            y_train: The training labels.
+            y_test: The testing labels.
+            results: A dictionary to store the results.
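+            dimred_technique: Name of the dimensionality reduction technique, prefixed to the model's class name. Must be a string; the None default cannot be concatenated.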
+
+        Returns:
+            A tuple containing the results and the metrics.
+        """
         model = xgb.XGBClassifier(**self.xgb_config)
         name = dimred_technique + model.__class__.__name__
         wandb.init(
@@ -102,7 +139,7 @@ def xgb(
             cp.asarray(X_train),
             cp.asarray(y_train),
             eval_set=[(cp.asarray(X_test), cp.asarray(y_test))],
-            callbacks=[WandbCallback(log_model=True)],
+            # callbacks=[WandbCallback(log_model=True)],
         )
         y_preds = model.predict(X_test)
         metrics = classification_report(y_test, y_preds, output_dict=True)
@@ -122,6 +159,19 @@ def lgb(
         results: Dict = {},
         dimred_technique: str = None,
     ) -> Tuple[Dict[str, Union[str, int]], Dict[str, int]]:
+        """Perform evaluation using lightgbm model.
+
+        Args:
+            X_train: The training data.
+            X_test: The testing data.
+            y_train: The training labels.
+            y_test: The testing labels.
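+            results: Dictionary updated in place with this model's metrics. The default is a single shared dict, so pass a fresh one per call.
+            dimred_technique: Name of the dimensionality reduction technique, prefixed to "LGBClf" to form the result key. Must be a string; the None default cannot be concatenated.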
+
+        Returns:
+            A tuple containing the results and the metrics.
+        """
         name = dimred_technique + "LGBClf"
         wandb.init(
             project=PROJECT_NAME,
@@ -139,14 +189,14 @@ def lgb(
             self.lgb_config,
             train_data,
             valid_sets=[test_data],
-            callbacks=[wandb_callback()],
+            # callbacks=[wandb_callback()],
         )
         y_preds = model.predict(X_test)
         metrics = classification_report(
             y_test, np.argmax(y_preds, axis=1), output_dict=True
         )
         results[name] = metrics
-        log_summary(model, save_model_checkpoint=True)
+        # log_summary(model, save_model_checkpoint=True)
         wandb.log(metrics)
         wandb.finish()
         dirs = director_exist(os.path.join(os.getenv("MODEL_PATH"), run))
@@ -155,6 +205,11 @@ def lgb(
         return results, metrics
 
     def evaluate(self) -> Dict[str, Dict[str, Dict[str, Union[str, int]]]]:
+        """Perform evaluation of all pipeline variations.
+
+        Returns:
+            A tuple of (all pipeline performances, best performances); both are also dumped as JSON under DATA_PATH.
+        """
         all_pipeline_performance = {}
         outer_iterator = tqdm(self.all_variations)
         best_performances = {
@@ -168,7 +223,7 @@ def evaluate(self) -> Dict[str, Dict[str, Dict[str, Union[str, int]]]]:
             specific_pipeline_variations = self.all_variations[pipeline_variation_name]
             inner_iterator = tqdm(specific_pipeline_variations, leave=False)
             for pipeline_variation in inner_iterator:
-                name_of_pipeline = pipeline_variation.__class__.__name__
+                name_of_pipeline = pipeline_variation.steps[-1][-1].__class__.__name__
                 pipeline_performance = {}
                 X_train = pipeline_variation.fit_transform(self._data["X_train"])
                 X_test = pipeline_variation.transform(self._data["X_test"])
@@ -207,19 +262,17 @@ def evaluate(self) -> Dict[str, Dict[str, Dict[str, Union[str, int]]]]:
                 avg_var = average_metric(
                     self.metric, [sklearn_metrics, xgb_metrics, lgb_metrics]
                 )
-                if avg_var > best_performing_pipeline[0]:
-                    best_performing_pipeline[0] = avg_var
-                    best_performing_pipeline[1] = str(pipeline_variation).strip
-                    # best_performing_pipeline[2] = pipeline_performance
+                if float(avg_var) > float(best_performing_pipeline[0]):
+                    best_performing_pipeline[0] = str(avg_var)
+                    best_performing_pipeline[1] = name_of_pipeline
                 inner_iterator.set_description(f"{name_of_pipeline} Done :)")
-            # best_performances[pipeline_variation_name] = best_performing_pipeline
             best_performances = add_to_dictionary(
                 best_performances, best_performing_pipeline
             )
-        with open(f'{os.getenv("DATA_PATH")}/all_performance_data.json', "w") as f:
-            json.dump(all_pipeline_performance, f)
-        with open(
-            f'{os.getenv("DATA_PATH")}/best_performance_dimred.json', "w"
-        ) as json_f:
-            json.dump(best_performances, json_f)
+            with open(f'{os.getenv("DATA_PATH")}/all_performance_data.json', "w") as f:
+                json.dump(all_pipeline_performance, f)
+            with open(
+                f'{os.getenv("DATA_PATH")}/best_performance_dimred.json', "w"
+            ) as json_f:
+                json.dump(best_performances, json_f)
         return all_pipeline_performance, best_performances
@@ -4,6 +4,13 @@
 def load_dataset(
     test_split: float = 0.25,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Load the digits dataset and split it into training and testing sets.
+    Parameters:
+        test_split (float): The proportion of the dataset to include in the test split.
+    Returns:
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: A tuple containing the training and testing data and labels.
+    """
     digits = datasets.load_digits()
     X, y = digits.data, digits.target
     X_train, X_test, y_train, y_test = train_test_split(
@@ -13,13 +20,29 @@ def load_dataset(
 
 
 def label_encoding(y_train: np.ndarray, y_test: np.ndarray) -> Tuple:
+    """
+    Fit a LabelEncoder on the training labels and use it to encode both label sets.
+    Parameters:
+        y_train (np.ndarray): The training labels.
+        y_test (np.ndarray): The testing labels.
+    Returns:
+        Tuple: A tuple containing the encoded training and testing labels.
+    """
     le = LabelEncoder()
     y_train = le.fit_transform(y_train)
     y_test = le.transform(y_test)
     return y_train, y_test
 
 
 def average_metric(metric, dictionaries: List[Dict[str, Union[str, int]]]) -> float:
+    """
+    Calculate the average value of a given metric from a list of dictionaries.
+    Parameters:
+        metric (str): The metric to calculate the average for.
+        dictionaries (List[Dict[str, Union[str, int]]]): A list of dictionaries containing the metrics.
+    Returns:
+        float: The average value of the metric.
+    """
     avg = 0
     for dictionary in dictionaries:
         avg += dictionary[metric]
@@ -29,13 +52,27 @@ def average_metric(metric, dictionaries: List[Dict[str, Union[str, int]]]) -> fl
 def add_to_dictionary(
     dictionary: Dict[str, List[Union[str, int]]], list_of_values: List[Union[str, int]]
 ) -> Dict[str, List[Union[str, int]]]:
+    """
+    Append the i-th value of the list to the list stored under the dictionary's i-th key.
+    Parameters:
+        dictionary (Dict[str, List[Union[str, int]]]): The dictionary to add the values to.
+        list_of_values (List[Union[str, int]]): The list of values to add.
+    Returns:
+        Dict[str, List[Union[str, int]]]: The updated dictionary.
+    """
     for idx, key in enumerate(dictionary):
         dictionary[key].append(list_of_values[idx])
     return dictionary
 
 
 def director_exist(path):
-    # Create the directory if it does not exist
+    """
+    Create the directory if it does not exist.
+    Parameters:
+        path (str): The path of the directory.
+    Returns:
+        str: The path of the directory.
+    """
     if not os.path.exists(path):
         os.makedirs(path)
     return path