From e7d6bcaf9f9972746abb11d2cb261fcdb82e1678 Mon Sep 17 00:00:00 2001
From: Artur Makowski
Date: Sat, 23 Nov 2024 20:11:38 +0100
Subject: [PATCH 1/2] feat: add CI/CD workflow for Databricks deployment

---
 .github/workflows/cd.yaml | 39 ++++++
 databricks.yml | 124 ++++++++++++++++--
 pyproject.toml | 2 +
 run.sh | 25 +++-
 .../data_preprocessing/dataclasses.py | 2 +-
 .../pipeline/__init__.py | 0
 .../pipeline/deploy_model.py | 27 ++++
 .../pipeline/evaluate_model.py | 121 +++++++++++++++++
 .../pipeline/generate_data.py | 75 +++++++++++
 .../pipeline/preprocess.py | 56 ++++++++
 .../pipeline/train_model.py | 117 +++++++++++++++++
 uv.lock | 67 ++++++++--
 12 files changed, 634 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/cd.yaml
 create mode 100644 src/mlops_with_databricks/pipeline/__init__.py
 create mode 100644 src/mlops_with_databricks/pipeline/deploy_model.py
 create mode 100644 src/mlops_with_databricks/pipeline/evaluate_model.py
 create mode 100644 src/mlops_with_databricks/pipeline/generate_data.py
 create mode 100644 src/mlops_with_databricks/pipeline/preprocess.py
 create mode 100644 src/mlops_with_databricks/pipeline/train_model.py

diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
new file mode 100644
index 0000000..5c2db08
--- /dev/null
+++ b/.github/workflows/cd.yaml
@@ -0,0 +1,39 @@
+name: MLOps with Databricks
+
+on:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - '[0-9]+.[0-9]+.[0-9]+'
+
+
+jobs:
+  setup-validate:
+    name: Set Up Environment
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Source Code
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          ref: ${{ github.ref_name }}
+
+      - name: Set Up Python
+        uses: actions/setup-python@b64ffcaf5b410884ad320a9cfac8866006a109aa
+        with:
+          python-version: 3.11
+
+      - name: Install UV
+        uses: astral-sh/setup-uv@2e657c127d5b1635d5a8e3fa40e0ac50a5bf6992
+
+      - name: Install Databricks CLI
+        uses: databricks/setup-cli@948d7379a31615a4c8e9ccbbc5445a12d6b32736
+        with:
+          version: 0.221.1
+
+      - name: Deploy to Databricks
+        env:
+          DATABRICKS_BUNDLE_ENV: prod # bundle target
+          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
+        run: databricks bundle deploy --var="git_sha=${{ github.sha }}"
diff --git a/databricks.yml b/databricks.yml
index 04bc364..63cf547 100644
--- a/databricks.yml
+++ b/databricks.yml
@@ -1,9 +1,109 @@
-# This is a Databricks asset bundle definition for module-code-examples.
-# The Databricks extension requires databricks.yml configuration file.
-# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
-
 bundle:
   name: armak-bundle
+  cluster_id: 1020-170922-owfkgqkz
+
+artifacts:
+  default:
+    type: whl
+    build: python -m build
+    path: .
+
+variables:
+  root_path:
+    description: root_path for the target
+    default: /Shared/.bundle/${bundle.target}/${bundle.name}
+  git_sha:
+    description: git_sha
+    default: ${bundle.git.commit}
+  git_branch:
+    description: git_branch
+    default: ${bundle.git.branch}
+
+
+resources:
+  jobs:
+    ad-click:
+      name: ad-click-workflow
+      schedule:
+        quartz_cron_expression: "0 0 6 ?
* MON" + timezone_id: "Europe/Warsaw" + pause_status: UNPAUSED + tags: + project_name: "ad-click" + job_clusters: + - job_cluster_key: "ad-click-cluster" + new_cluster: + spark_version: "15.4.x-scala2.12" + data_security_mode: "SINGLE_USER" + node_type_id: "i3.xlarge" + driver_node_type_id: "i3.xlarge" + autoscale: + min_workers: 1 + max_workers: 1 + + tasks: + - task_key: "preprocessing" + job_cluster_key: "had-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/preprocess.py" + libraries: + - whl: ./dist/*.whl + - task_key: if_refreshed + condition_task: + op: "EQUAL_TO" + left: "{{tasks.preprocessing.values.refreshed}}" + right: "1" + depends_on: + - task_key: "preprocessing" + - task_key: "train_model" + depends_on: + - task_key: "if_refreshed" + outcome: "true" + job_cluster_key: "ad-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/train_model.py" + parameters: + - "--git_sha" + - ${var.git_sha} + - "--git_branch" + - ${var.git_branch} + - "--job_run_id" + - "{{job.id}}" + libraries: + - whl: ./dist/*.whl + - task_key: "evaluate_model" + depends_on: + - task_key: "train_model" + job_cluster_key: "ad-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/evaluate_model.py" + parameters: + - "--new_model_uri" + - "{{tasks.train_model.values.new_model_uri}}" + - "--job_run_id" + - "{{job.id}}" + - "--git_sha" + - ${var.git_sha} + - "--git_branch" + - ${var.git_branch} + libraries: + - whl: ./dist/*.whl + - task_key: model_update + condition_task: + op: "EQUAL_TO" + left: "{{tasks.evaluate_model.values.model_update}}" + right: "1" + depends_on: + - task_key: "evaluate_model" + - task_key: "deploy_model" + depends_on: + - task_key: "model_update" + outcome: "true" + job_cluster_key: "ad-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/deploy_model.py" + libraries: + - whl: ./dist/*.whl targets: dev: @@ -11,9 +111,15 @@ targets: default: true workspace: host: https://dbc-643c4c2b-d6c9.cloud.databricks.com + root_path: /Workspace/Users/armak58@gmail.com/.bundle/${bundle.target}/${bundle.name} - ## Optionally, there could be 'staging' or 'prod' targets here. - # - # prod: - # workspace: - # host: https://dbc-45ad9c70-3532.cloud.databricks.com + stage: + mode: development + workspace: + host: https://dbc-643c4c2b-d6c9.cloud.databricks.com + root_path: /Workspace/Users/armak58@gmail.com/.bundle/${bundle.target}/${bundle.name} + + prod: + workspace: + host: https://dbc-643c4c2b-d6c9.cloud.databricks.com + root_path: ${var.root_path} diff --git a/pyproject.toml b/pyproject.toml index 25e0709..ec1d56d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,8 @@ dependencies = [ "matplotlib>=3.9.2, <4", "databricks-feature-engineering>=0.6, <1", "pre-commit>=4.0.1", + "loguru>=0.7.2", + "build>=1.2.2.post1", ] [project.optional-dependencies] diff --git a/run.sh b/run.sh index d5a7af9..5ae3e2f 100755 --- a/run.sh +++ b/run.sh @@ -22,5 +22,26 @@ function deploy_package() { fi } -# Run the function -deploy_package +function run_databricks_bundle_stage() { + echo "Running Databricks bundle deploy..." + databricks bundle deploy --profile DEFAULT --target stage + echo "Databricks bundle deploy successful" + echo "Generating data..." 
+ "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/.venv/bin/python" "/Users/arturmakowski/.vscode/extensions/databricks.databricks-2.4.8-darwin-arm64/resources/python/dbconnect-bootstrap.py" "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/src/mlops_with_databricks/pipeline/generate_data.py" + echo "Data generated successfully" + echo "Running Databricks bundle run..." + databricks bundle run --profile DEFAULT --target stage + echo "Databricks bundle run successful" +} + +function run_databricks_bundle_prod() { + echo "Running Databricks bundle deploy..." + databricks bundle deploy --profile DEFAULT --target prod + echo "Databricks bundle deploy successful" + echo "Generating data..." + "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/.venv/bin/python" "/Users/arturmakowski/.vscode/extensions/databricks.databricks-2.4.8-darwin-arm64/resources/python/dbconnect-bootstrap.py" "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/src/mlops_with_databricks/pipeline/generate_data.py" + echo "Data generated successfully" + echo "Running Databricks bundle run..." + databricks bundle run --profile DEFAULT --target prod + echo "Databricks bundle run successful" +} diff --git a/src/mlops_with_databricks/data_preprocessing/dataclasses.py b/src/mlops_with_databricks/data_preprocessing/dataclasses.py index baa430a..2ec4e86 100644 --- a/src/mlops_with_databricks/data_preprocessing/dataclasses.py +++ b/src/mlops_with_databricks/data_preprocessing/dataclasses.py @@ -105,4 +105,4 @@ class LightGBMConfig(TypedDict): max_depth: int -light_gbm_config = LightGBMConfig(learning_rate=0.001, n_estimators=200, max_depth=10) +light_gbm_config = LightGBMConfig(learning_rate=0.1, n_estimators=400, max_depth=15) diff --git a/src/mlops_with_databricks/pipeline/__init__.py b/src/mlops_with_databricks/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mlops_with_databricks/pipeline/deploy_model.py b/src/mlops_with_databricks/pipeline/deploy_model.py new file mode 100644 index 0000000..571ff59 --- /dev/null +++ b/src/mlops_with_databricks/pipeline/deploy_model.py @@ -0,0 +1,27 @@ +"""This script is used to deploy the model to the serving endpoint. 
The model version is fetched from the evaluate_model task.""" + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.serving import ServedEntityInput + +from mlops_with_databricks.data_preprocessing.dataclasses import DatabricksConfig, ModelConfig, ModelServingConfig + +workspace = WorkspaceClient() + + +model_version = workspace.dbutils.jobs.taskValues.get(taskKey="evaluate_model", key="model_version") + + +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +workspace.serving_endpoints.update_config_and_wait( + name=ModelServingConfig.serving_endpoint_name, + served_entities=[ + ServedEntityInput( + entity_name=f"{catalog_name}.{schema_name}.{ModelConfig.model_name}", + scale_to_zero_enabled=True, + workload_size="Small", + entity_version=model_version, + ) + ], +) diff --git a/src/mlops_with_databricks/pipeline/evaluate_model.py b/src/mlops_with_databricks/pipeline/evaluate_model.py new file mode 100644 index 0000000..832176e --- /dev/null +++ b/src/mlops_with_databricks/pipeline/evaluate_model.py @@ -0,0 +1,121 @@ +"""Evaluate the model and register it if it performs better than the previous model.""" + +import argparse +import sys + +import mlflow +import mlflow.sklearn +from databricks import feature_engineering +from databricks.sdk import WorkspaceClient +from loguru import logger +from pyspark.sql import SparkSession +from sklearn.metrics import f1_score + +from mlops_with_databricks.data_preprocessing.dataclasses import ( + DatabricksConfig, + ModelServingConfig, + ProcessedAdClickDataConfig, +) + +logger.remove() + +logger.add(sink=sys.stderr, level="DEBUG") + +parser = argparse.ArgumentParser() +parser.add_argument( + "--new_model_uri", + action="store", + default=None, + type=str, + required=True, +) + +parser.add_argument( + "--job_run_id", + action="store", + default=None, + type=str, + required=True, +) + +parser.add_argument( + "--git_sha", + action="store", + default=None, + type=str, + required=True, +) + +parser.add_argument( + "--git_branch", + action="store", + default=None, + type=str, + required=True, +) + + +args = parser.parse_args() +new_model_uri = args.new_model_uri +job_run_id = args.job_run_id +git_sha = args.git_sha +git_branch = args.git_branch + + +spark = SparkSession.builder.getOrCreate() +workspace = WorkspaceClient() +fe = feature_engineering.FeatureEngineeringClient() + +mlflow.set_registry_uri("databricks-uc") +mlflow.set_tracking_uri("databricks") + +num_features = ProcessedAdClickDataConfig.num_features +cat_features = ProcessedAdClickDataConfig.cat_features +target = ProcessedAdClickDataConfig.target +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +serving_endpoint_name = ModelServingConfig.serving_endpoint_name +serving_endpoint = workspace.serving_endpoints.get(serving_endpoint_name) +model_name = serving_endpoint.config.served_models[0].model_name +model_version = serving_endpoint.config.served_models[0].model_version +previous_model_uri = f"models:/{model_name}/{model_version}" + +test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas() + +X_test = test_set[list(num_features) + list(cat_features)] +y_test = test_set[target] + +logger.debug(f"New Model URI: {new_model_uri}") +logger.debug(f"Previous Model URI: {previous_model_uri}") + +model_new = mlflow.sklearn.load_model(new_model_uri) +predictions_new = model_new.predict(X_test) + +model_previous = mlflow.sklearn.load_model(previous_model_uri) +predictions_previous = 
model_previous.predict(X_test) + +logger.info(f"Predictions for New Model: {predictions_new}") +logger.info(f"Previous for Old Model: {predictions_previous}") + + +# Calculate F1 scores +f1_new = f1_score(y_test, predictions_new) +f1_previous = f1_score(y_test, predictions_previous) + +logger.info(f"F1 Score for New Model: {f1_new}") +logger.info(f"F1 Score for Old Model: {f1_previous}") + +if f1_new > f1_previous: + logger.info("New model performs better. Registering...") + model_version = mlflow.register_model( + model_uri=new_model_uri, + name=f"{catalog_name}.{schema_name}.ad_click_model_basic", + tags={"branch": git_branch, "git_sha": f"{git_sha}", "job_run_id": job_run_id}, + ) + workspace.dbutils.jobs.taskValues.set(key="model_version", value=model_version.version) + workspace.dbutils.jobs.taskValues.set(key="model_update", value=1) + logger.info(f"New model registered with version: {model_version.version}") +else: + logger.info("Previous model performs better. No update needed.") + workspace.dbutils.jobs.taskValues.set(key="model_update", value=0) diff --git a/src/mlops_with_databricks/pipeline/generate_data.py b/src/mlops_with_databricks/pipeline/generate_data.py new file mode 100644 index 0000000..dc8cb96 --- /dev/null +++ b/src/mlops_with_databricks/pipeline/generate_data.py @@ -0,0 +1,75 @@ +"""Generate synthetic data and save it to the source_data table.""" + +import numpy as np +import pandas as pd +from loguru import logger +from pyspark.sql import SparkSession +from pyspark.sql.functions import current_timestamp, to_utc_timestamp + +from mlops_with_databricks.data_preprocessing.dataclasses import ( + DatabricksConfig, +) + +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +spark = SparkSession.builder.getOrCreate() + +train_set_spark = spark.table(f"{catalog_name}.{schema_name}.train_set") +train_set = spark.table(f"{catalog_name}.{schema_name}.train_set").toPandas() +test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas() +combined_set = pd.concat([train_set, test_set], ignore_index=True) + + +def create_synthetic_data(df: pd.DataFrame, num_rows=100) -> pd.DataFrame: + synthetic_data = pd.DataFrame() + + for column in df.columns: + logger.info(f"Creating synthetic data for column: {column}") + if column == "click": + synthetic_data[column] = np.random.choice([0, 1], num_rows, p=[0.5, 0.5]) + else: + if pd.api.types.is_numeric_dtype(df[column]): + max, min = df[column].max(), df[column].min() + synthetic_data[column] = np.random.randint(min, max, num_rows) + + elif pd.api.types.is_object_dtype(df[column]): + synthetic_data[column] = np.random.choice( + df[column].unique(), num_rows, p=df[column].value_counts(normalize=True) + ) + + elif isinstance(df[column].dtype, pd.CategoricalDtype) or isinstance(df[column].dtype, pd.StringDtype): + synthetic_data[column] = np.random.choice( + df[column].unique(), num_rows, p=df[column].value_counts(normalize=True) + ) + + elif pd.api.types.is_datetime64_any_dtype(df[column]): + min_date, max_date = df[column].min(), df[column].max() + if min_date < max_date: + synthetic_data[column] = pd.to_datetime(np.random.randint(min_date.value, max_date.value, num_rows)) + else: + synthetic_data[column] = [min_date] * num_rows + + else: + synthetic_data[column] = np.random.choice(df[column], num_rows) + + return synthetic_data + + +synthetic_df = create_synthetic_data(combined_set) + +existing_schema = spark.table(f"{catalog_name}.{schema_name}.train_set").schema + +synthetic_spark_df = 
spark.createDataFrame(synthetic_df, schema=existing_schema) + +train_set_with_timestamp = synthetic_spark_df.withColumn( + "update_timestamp_utc", to_utc_timestamp(current_timestamp(), "UTC") +) + +train_set_with_timestamp.show(5) +train_set_with_timestamp.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.source_data") + +spark.sql( + f"ALTER TABLE {DatabricksConfig.catalog_name}.{DatabricksConfig.schema_name}.source_data " + "SET TBLPROPERTIES (delta.enableChangeDataFeed = true);" +) diff --git a/src/mlops_with_databricks/pipeline/preprocess.py b/src/mlops_with_databricks/pipeline/preprocess.py new file mode 100644 index 0000000..c6df80a --- /dev/null +++ b/src/mlops_with_databricks/pipeline/preprocess.py @@ -0,0 +1,56 @@ +"""Preprocess data and update train and test sets.""" + +import argparse + +from databricks.sdk import WorkspaceClient +from pyspark.sql import SparkSession +from pyspark.sql.functions import col +from pyspark.sql.functions import max as spark_max + +from mlops_with_databricks.data_preprocessing.dataclasses import DatabricksConfig + +workspace = WorkspaceClient() + + +parser = argparse.ArgumentParser() + +args = parser.parse_args() + +spark = SparkSession.builder.getOrCreate() + +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + + +source_data = spark.table(f"{catalog_name}.{schema_name}.source_data") + +max_train_timestamp = ( + spark.table(f"{catalog_name}.{schema_name}.train_set") + .select(spark_max("update_timestamp_utc").alias("max_update_timestamp")) + .collect()[0]["max_update_timestamp"] +) + +max_test_timestamp = ( + spark.table(f"{catalog_name}.{schema_name}.test_set") + .select(spark_max("update_timestamp_utc").alias("max_update_timestamp")) + .collect()[0]["max_update_timestamp"] +) + +latest_timestamp = max(max_train_timestamp, max_test_timestamp) + +new_data = source_data.filter(col("update_timestamp_utc") > latest_timestamp) + +new_data_train, new_data_test = new_data.randomSplit([0.8, 0.2], seed=42) + +new_data_train.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.train_set") +new_data_test.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.test_set") + +affected_rows_train = new_data_train.count() +affected_rows_test = new_data_test.count() + +if affected_rows_train > 0 or affected_rows_test > 0: + refreshed = 1 +else: + refreshed = 0 + +workspace.dbutils.jobs.taskValues.set(key="refreshed", value=refreshed) diff --git a/src/mlops_with_databricks/pipeline/train_model.py b/src/mlops_with_databricks/pipeline/train_model.py new file mode 100644 index 0000000..dfbe7fb --- /dev/null +++ b/src/mlops_with_databricks/pipeline/train_model.py @@ -0,0 +1,117 @@ +"""Train a LightGBM model with preprocessing and log the model to MLflow.""" + +import argparse + +import mlflow +from databricks.sdk import WorkspaceClient +from lightgbm import LGBMClassifier +from mlflow.models import infer_signature +from pyspark.sql import SparkSession +from sklearn.compose import ColumnTransformer +from sklearn.metrics import ( + accuracy_score, + f1_score, + precision_score, + recall_score, + roc_auc_score, +) +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +from mlops_with_databricks.data_preprocessing.dataclasses import ( + DatabricksConfig, + ProcessedAdClickDataConfig, + light_gbm_config, +) + +parser = argparse.ArgumentParser() +parser.add_argument( + "--git_sha", + action="store", + default=None, + type=str, + required=True, +) +parser.add_argument( + 
"--git_branch", + action="store", + default=None, + type=str, + required=True, +) +parser.add_argument( + "--job_run_id", + action="store", + default=None, + type=str, + required=True, +) + +args = parser.parse_args() +git_sha = args.git_sha +git_branch = args.git_branch +job_run_id = args.job_run_id + + +spark = SparkSession.builder.getOrCreate() +workspace = WorkspaceClient() + +mlflow.set_registry_uri("databricks-uc") +mlflow.set_tracking_uri("databricks") + + +num_features = ProcessedAdClickDataConfig.num_features +cat_features = ProcessedAdClickDataConfig.cat_features +target = ProcessedAdClickDataConfig.target +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +train_set_spark = spark.table(f"{catalog_name}.{schema_name}.train_set") +train_set = spark.table(f"{catalog_name}.{schema_name}.train_set").toPandas() +test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas() + +X_train = train_set[list(num_features) + list(cat_features)] +y_train = train_set[target] + +X_test = test_set[list(num_features) + list(cat_features)] +y_test = test_set[target] + +preprocessor = ColumnTransformer( + transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)], remainder="passthrough" +) +pipeline = Pipeline(steps=[("onehot", preprocessor), ("classifier", LGBMClassifier(**light_gbm_config))]) + +mlflow.set_experiment(experiment_name="/Shared/ad-click") + +with mlflow.start_run(tags={"branch": git_branch, "git_sha": f"{git_sha}", "job_run_id": job_run_id}) as run: + run_id = run.info.run_id + + pipeline.fit(X_train, y_train) + y_pred = pipeline.predict(X_test) + + f1 = f1_score(y_test, y_pred) + precision = precision_score(y_test, y_pred) + recall = recall_score(y_test, y_pred) + roc_auc = roc_auc_score(y_test, y_pred) + accuracy = accuracy_score(y_test, y_pred) + + mlflow.log_param("model_type", "LightGBM with preprocessing") + + parameters = { + "classifier__learning_rate": light_gbm_config["learning_rate"], + "classifier__n_estimators": light_gbm_config["n_estimators"], + "classifier__max_depth": light_gbm_config["max_depth"], + } + + mlflow.log_params(parameters) + mlflow.log_metrics({"f1": f1, "accuracy": accuracy, "precision": precision, "recall": recall, "roc_auc": roc_auc}) + signature = infer_signature(model_input=X_test, model_output=y_pred) + + dataset = mlflow.data.from_spark(train_set_spark, table_name=f"{catalog_name}.{schema_name}.train_set", version="0") + mlflow.log_input(dataset, context="training") + + mlflow.sklearn.log_model(sk_model=pipeline, artifact_path="lightgbm-pipeline-model", signature=signature) + + +model_uri = f"runs:/{run_id}/lightgbm-pipeline-model" +workspace.dbutils.jobs.taskValues.set(key="new_model_uri", value=model_uri) diff --git a/uv.lock b/uv.lock index 8597d17..cadbc10 100644 --- a/uv.lock +++ b/uv.lock @@ -1,8 +1,10 @@ version = 1 -requires-python = "==3.11.10" +requires-python = ">=3.11" resolution-markers = [ - "platform_system != 'Windows'", - "platform_system == 'Windows'", + "python_full_version < '3.12' and platform_system != 'Windows'", + "python_full_version >= '3.12' and platform_system != 'Windows'", + "python_full_version < '3.12' and platform_system == 'Windows'", + "python_full_version >= '3.12' and platform_system == 'Windows'", ] [[package]] @@ -135,13 +137,27 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, - { name = "urllib3", marker = "python_full_version == '3.11.10'" }, + { name = "urllib3" 
}, ] sdist = { url = "https://files.pythonhosted.org/packages/d3/0c/2bcd566397ab06661b222b9b5156ba0c40d5a97d3727c88ccaefea275cb4/botocore-1.35.42.tar.gz", hash = "sha256:af348636f73dc24b7e2dc760a34d08c8f2f94366e9b4c78d877307b128abecef", size = 12835012 } wheels = [ { url = "https://files.pythonhosted.org/packages/2e/f5/0e67c7e6a7f5f8c068cf444dc25d03097a22428380587542978d7ad9d86a/botocore-1.35.42-py3-none-any.whl", hash = "sha256:05af0bb8b9cea7ce7bc589c332348d338a21b784e9d088a588fd10ec145007ff", size = 12621471 }, ] +[[package]] +name = "build" +version = "1.2.2.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "os_name == 'nt'" }, + { name = "packaging" }, + { name = "pyproject-hooks" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/46/aeab111f8e06793e4f0e421fcad593d547fb8313b50990f31681ee2fb1ad/build-1.2.2.post1.tar.gz", hash = "sha256:b36993e92ca9375a219c99e606a122ff365a760a2d4bba0caa09bd5278b608b7", size = 46701 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5", size = 22950 }, +] + [[package]] name = "cachetools" version = "5.5.0" @@ -925,7 +941,7 @@ dependencies = [ { name = "pygments" }, { name = "stack-data" }, { name = "traitlets" }, - { name = "typing-extensions", marker = "python_full_version == '3.11.10'" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f7/21/48db7d9dd622b9692575004c7c98f85f5629428f58596c59606d36c51b58/ipython-8.28.0.tar.gz", hash = "sha256:0d0d15ca1e01faeb868ef56bc7ee5a0de5bd66885735682e8a322ae289a13d1a", size = 5495762 } wheels = [ @@ -1095,6 +1111,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/28/3be76b591a2e14a031b681b8283acf1dec2ad521f6f1701b7957df68c466/lightgbm-4.5.0-py3-none-win_amd64.whl", hash = "sha256:7ccb73ee9fb74fbbf89ad24c57a6edad505aa8f2165d02b999a082dbbbb0ee57", size = 1444319 }, ] +[[package]] +name = "loguru" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/30/d87a423766b24db416a46e9335b9602b054a72b96a88a241f2b09b560fa8/loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac", size = 145103 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/0a/4f6fed21aa246c6b49b561ca55facacc2a44b87d65b8b92362a8e99ba202/loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb", size = 62549 }, +] + [[package]] name = "mako" version = "1.3.5" @@ -1282,10 +1311,12 @@ name = "mlops-with-databricks" version = "0.0.1" source = { editable = "." 
} dependencies = [ + { name = "build" }, { name = "cffi" }, { name = "cloudpickle" }, { name = "databricks-feature-engineering" }, { name = "lightgbm" }, + { name = "loguru" }, { name = "matplotlib" }, { name = "mlflow" }, { name = "numpy" }, @@ -1306,6 +1337,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "build", specifier = ">=1.2.2.post1" }, { name = "cffi", specifier = ">=1.17.1,<2" }, { name = "cloudpickle", specifier = ">=3.0.0,<4" }, { name = "databricks-connect", marker = "extra == 'dev'", specifier = ">=15.4.1,<16" }, @@ -1313,6 +1345,7 @@ requires-dist = [ { name = "databricks-sdk", marker = "extra == 'dev'", specifier = ">=0.32.0,<0.33" }, { name = "ipykernel", marker = "extra == 'dev'", specifier = ">=6.29.5,<7" }, { name = "lightgbm", specifier = ">=4.5.0,<5" }, + { name = "loguru", specifier = ">=0.7.2" }, { name = "matplotlib", specifier = ">=3.9.2,<4" }, { name = "mlflow", specifier = ">=2.16.0,<3" }, { name = "numpy", specifier = ">=1.26.4,<2" }, @@ -1417,7 +1450,7 @@ name = "pandas" version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", marker = "python_full_version == '3.11.10'" }, + { name = "numpy" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -1601,8 +1634,6 @@ version = "6.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/18/c7/8c6872f7372eb6a6b2e4708b88419fb46b857f7a2e1892966b851cc79fc9/psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2", size = 508067 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/66/78c9c3020f573c58101dc43a44f6855d01bbbd747e24da2f0c4491200ea3/psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35", size = 249766 }, - { url = "https://files.pythonhosted.org/packages/e1/3f/2403aa9558bea4d3854b0e5e567bc3dd8e9fbc1fc4453c0aa9aafeb75467/psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1", size = 253024 }, { url = "https://files.pythonhosted.org/packages/0b/37/f8da2fbd29690b3557cca414c1949f92162981920699cd62095a984983bf/psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0", size = 250961 }, { url = "https://files.pythonhosted.org/packages/35/56/72f86175e81c656a01c4401cd3b1c923f891b31fbcebe98985894176d7c9/psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0", size = 287478 }, { url = "https://files.pythonhosted.org/packages/19/74/f59e7e0d392bc1070e9a70e2f9190d652487ac115bb16e2eff6b22ad1d24/psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd", size = 290455 }, @@ -1712,6 +1743,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/be/ec/2eb3cd785efd67806c46c13a17339708ddc346cbb684eade7a6e6f79536a/pyparsing-3.2.0-py3-none-any.whl", hash = "sha256:93d9577b88da0bbea8cc8334ee8b918ed014968fd2ec383e868fb8afb1ccef84", size = 106921 }, ] +[[package]] +name = "pyproject-hooks" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e7/82/28175b2414effca1cdac8dc99f76d660e7a4fb0ceefa4b4ab8f5f6742925/pyproject_hooks-1.2.0.tar.gz", hash = "sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8", size = 19228 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216 }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -1975,7 +2015,7 @@ name = "sqlalchemy" version = "2.0.36" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "greenlet", marker = "(platform_machine == 'AMD64' and python_full_version == '3.11.10') or (platform_machine == 'WIN32' and python_full_version == '3.11.10') or (platform_machine == 'aarch64' and python_full_version == '3.11.10') or (platform_machine == 'amd64' and python_full_version == '3.11.10') or (platform_machine == 'ppc64le' and python_full_version == '3.11.10') or (platform_machine == 'win32' and python_full_version == '3.11.10') or (platform_machine == 'x86_64' and python_full_version == '3.11.10')" }, + { name = "greenlet", marker = "(python_full_version < '3.13' and platform_machine == 'AMD64') or (python_full_version < '3.13' and platform_machine == 'WIN32') or (python_full_version < '3.13' and platform_machine == 'aarch64') or (python_full_version < '3.13' and platform_machine == 'amd64') or (python_full_version < '3.13' and platform_machine == 'ppc64le') or (python_full_version < '3.13' and platform_machine == 'win32') or (python_full_version < '3.13' and platform_machine == 'x86_64')" }, { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/65/9cbc9c4c3287bed2499e05033e207473504dc4df999ce49385fb1f8b058a/sqlalchemy-2.0.36.tar.gz", hash = "sha256:7f2767680b6d2398aea7082e45a774b2b0767b5c8d8ffb9c8b683088ea9b29c5", size = 9574485 } @@ -2137,6 +2177,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/84/997bbf7c2bf2dc3f09565c6d0b4959fefe5355c18c4096cfd26d83e0785b/werkzeug-3.0.4-py3-none-any.whl", hash = "sha256:02c9eb92b7d6c06f31a782811505d2157837cea66aaede3e217c7c27c039476c", size = 227554 }, ] +[[package]] +name = "win32-setctime" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/dd/f95a13d2b235a28d613ba23ebad55191514550debb968b46aab99f2e3a30/win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2", size = 3676 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/e6/a7d828fef907843b2a5773ebff47fb79ac0c1c88d60c0ca9530ee941e248/win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad", size = 3604 }, +] + [[package]] name = "wrapt" version = "1.16.0" From 0c5b4a4ed1e1192906aba0ac19a5eded704417d8 Mon Sep 17 00:00:00 2001 From: Artur Makowski Date: Sun, 24 Nov 2024 16:13:18 +0100 Subject: [PATCH 2/2] fix: databrick.yaml --- databricks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databricks.yml b/databricks.yml index 63cf547..33bd5c9 100644 --- a/databricks.yml +++ b/databricks.yml @@ -43,7 +43,7 @@ resources: tasks: - task_key: "preprocessing" - job_cluster_key: "had-click-cluster" + job_cluster_key: "ad-click-cluster" spark_python_task: python_file: 
"src/mlops_with_databricks/pipeline/preprocess.py" libraries: @@ -122,4 +122,4 @@ targets: prod: workspace: host: https://dbc-643c4c2b-d6c9.cloud.databricks.com - root_path: ${var.root_path} + root_path: /Workspace/Users/armak58@gmail.com/.bundle/${bundle.target}/${bundle.name}