
Feature/week2 #2

Merged
merged 22 commits on Oct 30, 2024
Changes from 17 commits
Commits (22)
ea58f16
refactor(project_configs.yml-config.py): update project config
Garett601 Oct 24, 2024
a44a2eb
refactor(utils.py,-rf_model.py-data_preprocessor.py-processed_data.py…
Garett601 Oct 24, 2024
0b2660d
test(test_rf_model.py-test_data_preprocessor.py): update tests for re…
Garett601 Oct 24, 2024
4573671
refactor(week1/week_1.ipynb): update repo structure
Garett601 Oct 24, 2024
06445d5
refactor(week_1.py): refactor week 1 notebook
Garett601 Oct 24, 2024
507c239
refactor(week_1.py): refactor week 1 notebook
Garett601 Oct 24, 2024
b35f12f
feat(data_preprocessor.py): add save_to_catalog method
Garett601 Oct 24, 2024
6a62cea
style(data_preprocessor.py): update docstring
Garett601 Oct 24, 2024
b2e8238
test(test_data_preprocessor.py): add test for save_to_catalog method
Garett601 Oct 24, 2024
30d57af
feat(01_prepare_data.py): first task for week2 homework
Garett601 Oct 24, 2024
1107246
feat(02_mlflow_experiment.py): week 2 notebook 2 code
Garett601 Oct 25, 2024
444e285
feat(03_log_and_register_model.py): week2 notebook 3 code
Garett601 Oct 25, 2024
65d3bf3
feat(04_log_and_register_custom_model.py): week2 notebook 4 code
Garett601 Oct 25, 2024
a2c9e31
chore: save latest code - why do I keep doing this?
Garett601 Oct 25, 2024
65b5fe8
refactor(01_prepare_dataset.py): maintain DateTime column when saving…
Garett601 Oct 25, 2024
e7f7120
feat(05_log_and_register_fe_model.py): week2 notebook 5 code
Garett601 Oct 26, 2024
c612a55
docs(README.md): Update readme with changes
Garett601 Oct 26, 2024
7aeeb1d
ci(ci.yml): add PySpark to CI run for tests
Garett601 Oct 26, 2024
beccea3
ci(ci.yml): add databricks-sdk to testing step
Garett601 Oct 26, 2024
6a8146a
ci(ci.yml): add pyspark AND databricks-sdk to tests step
Garett601 Oct 26, 2024
f25a887
test(test_data_preprocessor.py): mock pyspark.sql.functions
Garett601 Oct 26, 2024
ea29214
test(test_data_preprocessor.py): mock all spark behaviour
Garett601 Oct 26, 2024
6 changes: 6 additions & 0 deletions .gitignore
@@ -98,3 +98,9 @@ dmypy.json

.databricks
.ruff_cache/

# JSON files
*.json

# MLFlow
mlruns/
52 changes: 52 additions & 0 deletions README.md
@@ -29,6 +29,10 @@ uv lock
# Updates, Issues, Workarounds, Notes

---
<h1 align="center">
week 1
</h1>

## 15/10/2024 - Workaround for environment setup
> **Note**
- loosened python-version in pyproject.toml to `requires-python = ">=3.11, <3.12"`
@@ -99,3 +103,51 @@ uv lock
- The dataset is not included in the repository to keep the repo small. The loader first attempts to fetch the data from the UCI ML Repository.
- If that fails, the dataset is expected at `data/Tetuan City power consumption.csv`; you can download it from [here](https://www.kaggle.com/datasets/gmkeshav/tetuan-city-power-consumption). A sketch of this fallback is shown below.
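
A minimal sketch of this load-then-fallback behaviour. This is illustrative only, not the project's actual loader; the `ucimlrepo` call and the column handling are assumptions:
```python
# Sketch only: try the UCI ML Repository first, fall back to the local CSV.
import pandas as pd

try:
    from ucimlrepo import fetch_ucirepo

    # 849 is the dataset id used for the Tetouan City power consumption data in the project config
    repo_data = fetch_ucirepo(id=849)
    data = pd.concat([repo_data.data.features, repo_data.data.targets], axis=1)
except Exception:
    # Fallback: local copy downloaded from Kaggle
    data = pd.read_csv("data/Tetuan City power consumption.csv")
```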
---
---
<h1 align="center">
week 2
</h1>

## 26/10/2024 - Feature Engineering
> **Note**
- Dataset is now available in UC as a table
- Updated the DataPreprocessor and separated data loading from preprocessing (see the sketch below)
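
A rough sketch of the new split between loading and preprocessing, mirroring `notebooks/week2/01_prepare_dataset.py` in this PR (paths and config fields as in this repo):
```python
# Sketch: loading happens outside the processor, which now receives a pandas DataFrame.
from pyspark.sql import SparkSession

from power_consumption.config import Config
from power_consumption.preprocessing.data_preprocessor import DataProcessor

spark = SparkSession.builder.getOrCreate()
config = Config.from_yaml("configs/project_configs.yml")

# Load the raw UC table, then hand the DataFrame to the processor
raw = spark.table(f"{config.catalog_name}.{config.schema_name}.{config.dataset.raw_data_table}")
data_processor = DataProcessor(config, raw.toPandas())
data_processor.preprocess_data()
train_set, test_set = data_processor.split_data()
data_processor.save_to_catalog(train_set=train_set, test_set=test_set, spark=spark)
```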

> **Issue**
- Feature engineering was not working when run from within the IDE:
```shell
Exception: {'error_code': 'PERMISSION_DENIED', 'message': "Request failed access control checks. Permission check failed for 'heiaepgah71pwedmld01001.power_consumption.power_consumption_features'."}
```
- In the example code, the features generated at runtime were not used in the fe model
> **Workaround**
- Ran the feature engineering notebook from the Databricks workspace, which resolved the permission issues
- Applied the feature engineering function to both the training and testing sets and included the new features in the fe model
```python
testing_set = fe.create_training_set(
df=test_set,
label=target,
feature_lookups=[
FeatureFunction(
udf_name=function_name,
output_name="weather_interaction",
input_bindings={
"temperature": "Temperature",
"humidity": "Humidity",
"wind_speed": "Wind_Speed"
},
),
],
exclude_columns=["update_timestamp_utc"]
)
```
```python
training_df = training_set.load_df().toPandas()
testing_df = testing_set.load_df().toPandas()

X_train = training_df[num_features + cat_features + ["weather_interaction"]]
y_train = training_df[target]

X_test = testing_df[num_features + cat_features + ["weather_interaction"]]
y_test = testing_df[target]
```
---
26 changes: 22 additions & 4 deletions configs/project_configs.yml
@@ -6,7 +6,7 @@ hyperparameters:
n_estimators: 1000
max_depth: 6

features:
processed_features:
num_features:
- Temperature
- Humidity
@@ -18,8 +18,13 @@ features:
- diffuse_flows

cat_features:
- DayOfWeek
- IsWeekend
- DayOfWeek_1
- DayOfWeek_2
- DayOfWeek_3
- DayOfWeek_4
- DayOfWeek_5
- DayOfWeek_6
- IsWeekend_1

target:
target:
@@ -28,4 +33,17 @@ target:
- Zone_3_Power_Consumption

dataset:
id: 849
raw_data_table: tetuan_city_power_consumption
num_features:
- Temperature
- Humidity
- Wind_Speed
- Hour
- Day
- Month
- general_diffuse_flows
- diffuse_flows

cat_features:
- DayOfWeek
- IsWeekend
Binary file not shown.
68 changes: 68 additions & 0 deletions notebooks/week1/week_1.py
@@ -0,0 +1,68 @@
# Databricks notebook source
from power_consumption.preprocessing.data_preprocessor import DataProcessor
from power_consumption.model.rf_model import ConsumptionModel
from power_consumption.utils import visualise_results, plot_actual_vs_predicted, plot_feature_importance
from power_consumption.config import Config
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()


# COMMAND ----------

config = Config.from_yaml("../../configs/project_configs.yml")

# COMMAND ----------
catalog_name = config.catalog_name
schema_name = config.schema_name
raw_data_table = config.dataset.raw_data_table
# COMMAND ----------
data_spark = spark.table(f"{catalog_name}.{schema_name}.{raw_data_table}")
# COMMAND ----------
data_pandas = data_spark.toPandas()
# COMMAND ----------
data_processor = DataProcessor(config, data_pandas)
# COMMAND ----------
data_processor.preprocess_data()
# COMMAND ----------
train_set, test_set = data_processor.split_data()

# COMMAND ----------
target_columns = config.target.target
feature_columns = config.processed_features.num_features + config.processed_features.cat_features

X_train = train_set[feature_columns]
y_train = train_set[target_columns]
X_test = test_set[feature_columns]
y_test = test_set[target_columns]

# COMMAND ----------
model = ConsumptionModel(config)
model.train(X_train, y_train)

# COMMAND ----------

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
mse, r2 = model.evaluate(X_test, y_test)

# COMMAND ----------

# Visualize results as time series
visualise_results(y_test, y_pred, target_columns)

# COMMAND ----------

# Get feature importance
feature_importance, feature_names = model.get_feature_importance()
# COMMAND ----------

# Plot actual vs predicted values
plot_actual_vs_predicted(y_test.values, y_pred, target_columns)

# COMMAND ----------

# Plot feature importance
plot_feature_importance(feature_importance, feature_names, top_n=15)

# COMMAND ----------
32 changes: 32 additions & 0 deletions notebooks/week2/01_prepare_dataset.py
@@ -0,0 +1,32 @@
# Databricks notebook source
from power_consumption.preprocessing.data_preprocessor import DataProcessor
from power_consumption.config import Config
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()


# COMMAND ----------

config = Config.from_yaml("../../configs/project_configs.yml")

# COMMAND ----------
catalog_name = config.catalog_name
schema_name = config.schema_name
raw_data_table = config.dataset.raw_data_table
# COMMAND ----------
data_spark = spark.table(f"{catalog_name}.{schema_name}.{raw_data_table}")
# COMMAND ----------
data_pandas = data_spark.toPandas()
# COMMAND ----------
data_processor = DataProcessor(config, data_pandas)
# COMMAND ----------
data_processor.preprocess_data()
# COMMAND ----------
train_set, test_set = data_processor.split_data()
# COMMAND ----------
train_set.reset_index(inplace=True)
test_set.reset_index(inplace=True)
# COMMAND ----------
data_processor.save_to_catalog(train_set=train_set, test_set=test_set, spark=spark)
# COMMAND ----------
53 changes: 53 additions & 0 deletions notebooks/week2/02_mlfow_experiment.py
@@ -0,0 +1,53 @@
# Databricks notebook source
import json

import mlflow

mlflow.set_tracking_uri("databricks")

mlflow.set_experiment(experiment_name="/Shared/power-consumption")
mlflow.set_experiment_tags({"repository_name": "power-consumption"})

# COMMAND ----------
experiments = mlflow.search_experiments(
filter_string="tags.repository_name='power-consumption'"
)

print(experiments)

# COMMAND ----------
with open("mlflow_experiment.json", "w") as json_file:
json.dump(experiments[0].__dict__, json_file, indent=4)

# COMMAND ----------
with mlflow.start_run(
run_name="test-run",
tags={
"git_sha": "30d57afb2efca70cede3061d00f2a553c2b4779b"
}
) as run:
mlflow.log_params({"type": "demo"})
mlflow.log_metrics(
{
"metric_1": 1.0,
"metric_2": 2.0
}
)
# COMMAND ----------
run_id = mlflow.search_runs(
experiment_names=["/Shared/power-consumption"],
filter_string="tags.git_sha='30d57afb2efca70cede3061d00f2a553c2b4779b'",
).run_id[0]
run_info = mlflow.get_run(run_id=f"{run_id}").to_dictionary()
print(run_info)

# COMMAND ----------
with open("run_info.json", "w") as json_file:
json.dump(run_info, json_file, indent=4)

# COMMAND ----------
print(run_info["data"]["metrics"])

# COMMAND ----------
print(run_info["data"]["params"])
# COMMAND ----------
104 changes: 104 additions & 0 deletions notebooks/week2/03_log_and_register_model.py
@@ -0,0 +1,104 @@
# Databricks notebook source
import mlflow
from mlflow.models import infer_signature

from pyspark.sql import SparkSession
from power_consumption.config import Config

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")
# COMMAND ----------
config = Config.from_yaml("../../configs/project_configs.yml")
# COMMAND ----------
num_features = config.processed_features.num_features
cat_features = config.processed_features.cat_features
target = config.target.target
parameters = config.hyperparameters.__dict__

catalog_name = config.catalog_name
schema_name = config.schema_name
# COMMAND ----------
spark = SparkSession.builder.getOrCreate()

train_set_spark = spark.table(f"{catalog_name}.{schema_name}.train_set")
train_set = spark.table(f"{catalog_name}.{schema_name}.train_set").toPandas()
test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas()
# COMMAND ----------
X_train = train_set[num_features + cat_features]
y_train = train_set[target]

X_test = test_set[num_features + cat_features]
y_test = test_set[target]
# COMMAND ----------
# Define the preprocessor for categorical features
preprocessor = ColumnTransformer(
transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)],
remainder='passthrough'
)

# Create the pipeline with preprocessing and the multi-output LightGBM regressor
pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('regressor', MultiOutputRegressor(LGBMRegressor(**parameters)))
])
# COMMAND ----------
mlflow.set_experiment(experiment_name='/Shared/power-consumption')
git_sha = "30d57afb2efca70cede3061d00f2a553c2b4779b"

# Start an MLflow run to track the training process
with mlflow.start_run(
tags={"git_sha": f"{git_sha}",
"branch": "feature/week2"},
) as run:
run_id = run.info.run_id

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R2 Score: {r2}")

# Log parameters, metrics, and the model to MLflow
mlflow.log_param("model_type", "LightGBM with preprocessing")
mlflow.log_params(parameters)
mlflow.log_metric("mse", mse)
mlflow.log_metric("mae", mae)
mlflow.log_metric("r2_score", r2)
signature = infer_signature(model_input=X_train, model_output=y_pred)

dataset = mlflow.data.from_spark(
train_set_spark, table_name=f"{catalog_name}.{schema_name}.train_set",
version="0")
mlflow.log_input(dataset, context="training")

mlflow.sklearn.log_model(
sk_model=pipeline,
artifact_path="lightgbm-pipeline-model",
signature=signature
)
# COMMAND ----------
model_version = mlflow.register_model(
model_uri=f'runs:/{run_id}/lightgbm-pipeline-model',
name=f"{catalog_name}.{schema_name}.power_consumption_model",
tags={"git_sha": f"{git_sha}"})

# COMMAND ----------
run = mlflow.get_run(run_id)
dataset_info = run.inputs.dataset_inputs[0].dataset
dataset_source = mlflow.data.get_source(dataset_info)
dataset_source.load()

# COMMAND ----------