From 5d65358ddf37da0f6604c6887d051f03d1737af7 Mon Sep 17 00:00:00 2001
From: Diego Marvid
Date: Mon, 18 Mar 2024 21:04:08 -0300
Subject: [PATCH] improve readme and fix minor issues

---
 README.md                                     | 133 ++++++++++++++++--
 .../core/steps/explainer_dashboard.py         |   6 +-
 .../implementation/tabular/xgboost/predict.py |   7 +-
 3 files changed, 133 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 44a3b56..1066d54 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,17 @@
 # Pipeline Library
 
-The Pipeline Library is designed to simplify the creation of machine learning pipelines. Currently, it supports XGBoost models, with plans to expand support for more models in the future.
+The Pipeline Library is a powerful and flexible tool designed to simplify the creation and management of machine learning pipelines. It provides a high-level interface for defining and executing pipelines, allowing users to focus on the core aspects of their machine learning projects. The library currently supports XGBoost models, with plans to expand support for more models in the future.
+
+## Features
+
+* Intuitive and easy-to-use API for defining pipeline steps and configurations
+* Support for various data loading formats, including CSV and Parquet
+* Flexible data preprocessing steps, such as data cleaning, feature calculation, and encoding
+* Seamless integration with XGBoost for model training and prediction
+* Hyperparameter optimization using Optuna for fine-tuning models
+* Evaluation metrics calculation and reporting
+* Explainable AI (XAI) dashboard for model interpretability
+* Extensible architecture for adding custom pipeline steps
 
 ## Installation
 
@@ -45,25 +56,48 @@ Here's an example of how to use the library to run an XGBoost pipeline:
 
 ```json
 {
-    "custom_steps_path": "examples/ocf/",
     "pipeline": {
         "name": "XGBoostTrainingPipeline",
         "description": "Training pipeline for XGBoost models.",
         "steps": [
             {
-                "step_type": "OCFGenerateStep",
+                "step_type": "GenerateStep",
                 "parameters": {
                     "path": "examples/ocf/data/trainset_new.parquet"
                 }
             },
             {
-                "step_type": "OCFCleanStep",
-                "parameters": {}
+                "step_type": "CleanStep",
+                "parameters": {
+                    "drop_na_columns": [
+                        "average_power_kw"
+                    ],
+                    "drop_ids": {
+                        "ss_id": [
+                            7759,
+                            7061
+                        ]
+                    }
+                }
+            },
+            {
+                "step_type": "CalculateFeaturesStep",
+                "parameters": {
+                    "datetime_columns": [
+                        "date"
+                    ],
+                    "features": [
+                        "year",
+                        "month",
+                        "day",
+                        "hour",
+                        "minute"
+                    ]
+                }
             },
             {
                 "step_type": "TabularSplitStep",
                 "parameters": {
-                    "id_column": "ss_id",
                     "train_percentage": 0.95
                 }
             },
@@ -72,7 +106,9 @@ Here's an example of how to use the library to run an XGBoost pipeline:
                 "parameters": {
                     "target": "average_power_kw",
                     "drop_columns": [
-                        "ss_id"
+                        "ss_id",
+                        "operational_at",
+                        "total_energy_kwh"
                     ],
                     "xgb_params": {
                         "max_depth": 12,
@@ -80,12 +116,13 @@ Here's an example of how to use the library to run an XGBoost pipeline:
                         "objective": "reg:squarederror",
                         "eval_metric": "mae",
                         "n_jobs": -1,
-                        "n_estimators": 2,
+                        "n_estimators": 672,
                         "min_child_weight": 7,
                         "subsample": 0.8057743223537057,
-                        "colsample_bytree": 0.6316852278944352
+                        "colsample_bytree": 0.6316852278944352,
+                        "early_stopping_rounds": 10
                     },
-                    "save_model": true
+                    "save_path": "model.joblib"
                 }
             }
         ]
@@ -105,4 +142,78 @@ logging.basicConfig(level=logging.INFO)
 Pipeline.from_json("train.json").run()
 ```
 
-The library allows users to define custom steps for generating and cleaning their own data, which can be used in the pipeline.
+3. Create a `predict.json` file with the pipeline configuration for prediction:
+
+```json
+{
+    "pipeline": {
+        "name": "XGBoostPredictionPipeline",
+        "description": "Prediction pipeline for XGBoost models.",
+        "steps": [
+            {
+                "step_type": "GenerateStep",
+                "parameters": {
+                    "path": "examples/ocf/data/testset_new.parquet"
+                }
+            },
+            {
+                "step_type": "CleanStep",
+                "parameters": {
+                    "drop_na_columns": [
+                        "average_power_kw"
+                    ]
+                }
+            },
+            {
+                "step_type": "CalculateFeaturesStep",
+                "parameters": {
+                    "datetime_columns": [
+                        "date"
+                    ],
+                    "features": [
+                        "year",
+                        "month",
+                        "day",
+                        "hour",
+                        "minute"
+                    ]
+                }
+            },
+            {
+                "step_type": "XGBoostPredictStep",
+                "parameters": {
+                    "target": "average_power_kw",
+                    "drop_columns": [
+                        "ss_id",
+                        "operational_at",
+                        "total_energy_kwh"
+                    ],
+                    "load_path": "model.joblib"
+                }
+            },
+            {
+                "step_type": "CalculateMetricsStep",
+                "parameters": {}
+            },
+            {
+                "step_type": "ExplainerDashboardStep",
+                "parameters": {
+                    "max_samples": 1000
+                }
+            }
+        ]
+    }
+}
+```
+
+4. Run the prediction pipeline:
+
+```python
+Pipeline.from_json("predict.json").run()
+```
+
+The library allows users to define custom steps for data generation, cleaning, and preprocessing, which can be seamlessly integrated into the pipeline.
+
+## Contributing
+
+Contributions to the Pipeline Library are welcome! If you encounter any issues, have suggestions for improvements, or want to add new features, please open an issue or submit a pull request on the GitHub repository.
\ No newline at end of file
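The two source changes below share one behavior change: when choosing the DataFrame to operate on, prefer the feature-engineered dataset (`DataContainer.FEATURES`) if an upstream `CalculateFeaturesStep` produced one, and fall back to the cleaned dataset (`DataContainer.CLEAN`) otherwise. A minimal, self-contained sketch of that lookup, assuming `DataContainer` supports dict-style `in` and `[]` access as the diffs imply; the `select_model_input` helper and the stand-in class are illustrative, not part of the library:

```python
import pandas as pd


class DataContainer(dict):
    """Dict-like stand-in for pipeline_lib's DataContainer (illustrative only)."""

    CLEAN = "clean"
    FEATURES = "features"


def select_model_input(data: DataContainer) -> pd.DataFrame:
    # Prefer features produced by CalculateFeaturesStep; otherwise use clean data.
    if DataContainer.FEATURES in data:
        return data[DataContainer.FEATURES]
    return data[DataContainer.CLEAN]


# With only CLEAN present the fallback is used; once FEATURES exists it wins.
data = DataContainer()
data[DataContainer.CLEAN] = pd.DataFrame({"average_power_kw": [1.0, 2.0]})
assert select_model_input(data) is data[DataContainer.CLEAN]

data[DataContainer.FEATURES] = data[DataContainer.CLEAN].assign(hour=[0, 1])
assert select_model_input(data) is data[DataContainer.FEATURES]
```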
diff --git a/pipeline_lib/core/steps/explainer_dashboard.py b/pipeline_lib/core/steps/explainer_dashboard.py
index 5a652c5..6863b95 100644
--- a/pipeline_lib/core/steps/explainer_dashboard.py
+++ b/pipeline_lib/core/steps/explainer_dashboard.py
@@ -25,7 +25,11 @@ def execute(self, data: DataContainer) -> DataContainer:
         if target is None:
             raise ValueError("Target column not found in any parameter.")
 
-        df = data.get(DataContainer.CLEAN)
+        df = (
+            data[DataContainer.FEATURES]
+            if DataContainer.FEATURES in data
+            else data[DataContainer.CLEAN]
+        )
 
         if len(df) > self.max_samples:
             # Randomly sample a subset of data points if the dataset is larger than max_samples
diff --git a/pipeline_lib/implementation/tabular/xgboost/predict.py b/pipeline_lib/implementation/tabular/xgboost/predict.py
index ce9b62f..e52e9a1 100644
--- a/pipeline_lib/implementation/tabular/xgboost/predict.py
+++ b/pipeline_lib/implementation/tabular/xgboost/predict.py
@@ -30,9 +30,14 @@ def __init__(
     def execute(self, data: DataContainer) -> DataContainer:
         self.logger.debug("Obtaining predictions for XGBoost model.")
 
-        model_input = data[DataContainer.CLEAN]
+        model_input = (
+            data[DataContainer.FEATURES]
+            if DataContainer.FEATURES in data
+            else data[DataContainer.CLEAN]
+        )
 
         if self.drop_columns:
+            self.logger.info(f"Dropping columns: {self.drop_columns}")
             model_input = model_input.drop(columns=self.drop_columns)
 
         predictions = self.model.predict(model_input.drop(columns=[self.target]))
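For readers extending the library, the README's note about custom steps maps onto the `execute(self, data: DataContainer) -> DataContainer` signature visible in both diffs above. Below is a minimal sketch of a custom cleaning step under stated assumptions: the `PipelineStep` base class, its `logger` attribute, and the mapping from `step_type` to class name are inferred from this patch and the config examples, not confirmed APIs.

```python
import logging

import pandas as pd


class DataContainer(dict):
    CLEAN = "clean"  # key name assumed; see the diffs above


class PipelineStep:
    """Assumed base class; the library's real base may differ."""

    def __init__(self) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)


class DropDuplicatesStep(PipelineStep):
    """Hypothetical custom step that removes duplicate rows from the clean data."""

    def __init__(self, subset: list | None = None) -> None:
        super().__init__()
        self.subset = subset  # columns that define a duplicate; None means all

    def execute(self, data: DataContainer) -> DataContainer:
        df: pd.DataFrame = data[DataContainer.CLEAN]
        deduped = df.drop_duplicates(subset=self.subset)
        self.logger.info("Dropped %d duplicate rows", len(df) - len(deduped))
        data[DataContainer.CLEAN] = deduped
        return data
```

In a pipeline config this would presumably appear as `{"step_type": "DropDuplicatesStep", "parameters": {"subset": ["ss_id", "date"]}}`, mirroring how the built-in steps are referenced by class name in `train.json` and `predict.json`.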