
feat: add CI/CD workflow for Databricks deployment #4

Open · wants to merge 2 commits into base: main
39 changes: 39 additions & 0 deletions .github/workflows/cd.yaml
@@ -0,0 +1,39 @@
name: MLOps with Databricks

Comment on lines +1 to +2

⚠️ Potential issue

Add security hardening measures

The workflow needs additional security configurations:

  1. Declare minimum required permissions
  2. Consider using OIDC federation instead of static token
 name: MLOps with Databricks
 
+permissions:
+  contents: read
+  id-token: write  # Required for OIDC
 
+# For OIDC authentication
+# Remove DATABRICKS_TOKEN and use:
+# configure-databricks-auth:
+#   uses: databricks/configure-auth@main
+#   with:
+#     databricks-environment: prod

Committable suggestion skipped: line range outside the PR's diff.

on:
push:
branches:
- 'main'
tags:
- '[0-9]+.[0-9]+.[0-9]+'


jobs:
setup-validate:
name: Set Up Environment
runs-on: ubuntu-latest
Comment on lines +11 to +14

⚠️ Potential issue

Enhance job configuration with deployment safeguards

The job configuration needs several improvements:

  1. The job name "setup-validate" doesn't reflect its deployment purpose
  2. Missing environment protection for production deployment
  3. Missing concurrency control to prevent parallel deployments
 jobs:
-  setup-validate:
-    name: Set Up Environment
+  deploy:
+    name: Deploy to Databricks
     runs-on: ubuntu-latest
+    environment:
+      name: production
+      url: ${{ vars.DATABRICKS_WORKSPACE_URL }}
+    concurrency:
+      group: databricks-${{ github.ref }}
+      cancel-in-progress: false


steps:
- name: Checkout Source Code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
ref: ${{ github.ref_name }}

- name: Set Up Python
uses: actions/setup-python@b64ffcaf5b410884ad320a9cfac8866006a109aa
with:
python-version: 3.11

- name: Install UV
uses: astral-sh/setup-uv@2e657c127d5b1635d5a8e3fa40e0ac50a5bf6992

Comment on lines +27 to +29

🛠️ Refactor suggestion

Remove unused UV installation step

The UV package manager is installed but never used in the workflow. Either remove this step or utilize UV for dependency management.

- name: Install Databricks CLI
uses: databricks/setup-cli@948d7379a31615a4c8e9ccbbc5445a12d6b32736
with:
version: 0.221.1

- name: Deploy to Databricks
env:
DATABRICKS_BUNDLE_ENV: prod # bundle target
DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
run: databricks bundle deploy --var="git_sha=${{ github.sha }}"
Comment on lines +35 to +39

⚠️ Potential issue

Enhance deployment step with validation and error handling

The deployment step needs several improvements:

  1. Missing pre-deployment validation
  2. No error handling or retries
  3. Missing workspace URL configuration
  4. No post-deployment verification
+      - name: Validate Bundle
+        env:
+          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
+        run: databricks bundle validate
+
       - name: Deploy to Databricks
         env:
           DATABRICKS_BUNDLE_ENV: prod # bundle target
           DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
-        run: databricks bundle deploy --var="git_sha=${{ github.sha }}"
+          DATABRICKS_HOST: ${{ vars.DATABRICKS_WORKSPACE_URL }}
+        run: |
+          for i in {1..3}; do
+            if databricks bundle deploy \
+              --var="git_sha=${{ github.sha }}" \
+              --var="environment=prod"; then
+              exit 0
+            fi
+            echo "Deployment attempt $i failed. Retrying..."
+            sleep 10
+          done
+          exit 1
+
+      - name: Verify Deployment
+        if: success()
+        env:
+          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
+          DATABRICKS_HOST: ${{ vars.DATABRICKS_WORKSPACE_URL }}
+        run: |
+          # Verify the deployment status
+          databricks bundle validate --target prod
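The bash retry loop in the suggestion above is a generic bounded-retry pattern; as a hedged sketch (hypothetical helper, not part of this PR), the same idea in Python looks like:

```python
import time


def retry(fn, attempts=3, delay=0.0):
    """Call fn up to `attempts` times, sleeping `delay` seconds between failures.

    Returns fn()'s result on the first success; raises RuntimeError chaining
    the last exception if every attempt fails.
    """
    last_exc = None
    for i in range(1, attempts + 1):
        try:
            return fn()
        except Exception as exc:  # broad on purpose: a deploy wrapper retries anything
            last_exc = exc
            print(f"Attempt {i} failed: {exc}. Retrying...")
            if delay:
                time.sleep(delay)
    raise RuntimeError(f"All {attempts} attempts failed") from last_exc
```

Wrapping the deploy call this way keeps the retry policy (attempt count, backoff) in one place instead of embedding it in each shell step.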

124 changes: 115 additions & 9 deletions databricks.yml
@@ -1,19 +1,125 @@
# This is a Databricks asset bundle definition for module-code-examples.
# The Databricks extension requires databricks.yml configuration file.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.

bundle:
name: armak-bundle
cluster_id: 1020-170922-owfkgqkz

💡 Codebase verification

Hardcoded cluster ID should be replaced with job cluster configuration

The cluster ID is only used once in the codebase, and interestingly, the same file already demonstrates a better practice by using a job cluster configuration (job_clusters) with a logical key "ad-click-cluster". You should:

  • Remove the hardcoded cluster_id: 1020-170922-owfkgqkz
  • Use the job cluster configuration pattern that's already defined in the file, which provides better maintainability and environment independence
  • Define cluster requirements (like spark_version and data_security_mode) instead of binding to a specific cluster ID
🔗 Analysis chain

Consider using cluster policies instead of hardcoded cluster IDs

Hardcoding cluster IDs in configuration files can be problematic as clusters may be terminated or recreated, breaking the workflow. Consider using cluster policies or templates to define cluster configurations consistently across environments.

Let's verify if this cluster ID is used elsewhere in the codebase:


Let me gather more context about the Databricks configuration patterns in the codebase:

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Search for the cluster ID usage across the codebase
rg "1020-170922-owfkgqkz"

Length of output: 75


Script:

#!/bin/bash
# Look for other Databricks configuration files and cluster configurations
fd -e yml -e yaml | xargs rg -l "cluster"

# Search for any cluster policy references
rg -i "cluster.?policy"

# Check if there are any template or variable patterns for cluster configurations
rg "cluster_id:|cluster:|CLUSTER" -A 2 -B 2

Length of output: 538


artifacts:
default:
type: whl
build: python -m build
path: .

variables:
root_path:
description: root_path for the target
default: /Shared/.bundle/${bundle.target}/${bundle.name}
git_sha:
description: git_sha
default: ${bundle.git.commit}
git_branch:
description: git_branch
default: ${bundle.git.branch}


resources:
jobs:
ad-click:
name: ad-click-workflow
schedule:
quartz_cron_expression: "0 0 6 ? * MON"
timezone_id: "Europe/Warsaw"
pause_status: UNPAUSED
tags:
project_name: "ad-click"
job_clusters:
- job_cluster_key: "ad-click-cluster"
new_cluster:
spark_version: "15.4.x-scala2.12"
data_security_mode: "SINGLE_USER"
node_type_id: "i3.xlarge"
driver_node_type_id: "i3.xlarge"
autoscale:
min_workers: 1
max_workers: 1

tasks:
- task_key: "preprocessing"
job_cluster_key: "ad-click-cluster"
spark_python_task:
python_file: "src/mlops_with_databricks/pipeline/preprocess.py"
libraries:
- whl: ./dist/*.whl
- task_key: if_refreshed
condition_task:
op: "EQUAL_TO"
left: "{{tasks.preprocessing.values.refreshed}}"
right: "1"
depends_on:
- task_key: "preprocessing"
- task_key: "train_model"
depends_on:
- task_key: "if_refreshed"
outcome: "true"
job_cluster_key: "ad-click-cluster"
spark_python_task:
python_file: "src/mlops_with_databricks/pipeline/train_model.py"
parameters:
- "--git_sha"
- ${var.git_sha}
- "--git_branch"
- ${var.git_branch}
- "--job_run_id"
- "{{job.id}}"
Comment on lines +65 to +71

💡 Codebase verification

Parameters are validated but documentation is missing

The script validates the parameters through argparse with required=True flag, ensuring they cannot be empty. However, there's no documentation about the expected formats. Consider:

  • git_sha: Add documentation specifying the expected Git commit hash format
  • git_branch: Document the branch name format/constraints
  • job_run_id: Document the expected format of Databricks job run ID
🔗 Analysis chain

Validate task parameters

The training task parameters include sensitive information (git_sha, git_branch, job_run_id). Consider:

  1. Adding parameter validation in the Python script
  2. Documenting the expected parameter formats

Let's check if the Python script validates these parameters:

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Check for parameter validation in train_model.py
ast-grep --pattern 'def main($$$):
  $$$
  parser.add_argument($$$)
  $$$' src/mlops_with_databricks/pipeline/train_model.py

Length of output: 126


Script:

#!/bin/bash
# Let's first check if the file exists and see its content
fd "train_model.py" --type f --exec cat {}

Length of output: 3677
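The documentation gap aside, format validation can be layered onto the existing argparse setup. A sketch of what that might look like — the SHA and branch-name rules here are assumptions for illustration, not project conventions:

```python
import argparse
import re


def git_sha_type(value: str) -> str:
    """Accept full (40-char) or abbreviated (>=7-char) lowercase hex commit hashes."""
    if not re.fullmatch(r"[0-9a-f]{7,40}", value):
        raise argparse.ArgumentTypeError(f"not a valid git SHA: {value!r}")
    return value


def git_branch_type(value: str) -> str:
    """Reject names git itself forbids: empty, leading '-', '..', or whitespace."""
    if not value or value.startswith("-") or ".." in value or any(c.isspace() for c in value):
        raise argparse.ArgumentTypeError(f"not a valid branch name: {value!r}")
    return value


parser = argparse.ArgumentParser()
parser.add_argument("--git_sha", type=git_sha_type, required=True)
parser.add_argument("--git_branch", type=git_branch_type, required=True)
parser.add_argument("--job_run_id", required=True)
```

With `type=` validators, a malformed value fails fast at parse time with a clear message instead of propagating into MLflow tags downstream.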

libraries:
- whl: ./dist/*.whl
- task_key: "evaluate_model"
depends_on:
- task_key: "train_model"
job_cluster_key: "ad-click-cluster"
spark_python_task:
python_file: "src/mlops_with_databricks/pipeline/evaluate_model.py"
parameters:
- "--new_model_uri"
- "{{tasks.train_model.values.new_model_uri}}"
- "--job_run_id"
- "{{job.id}}"
- "--git_sha"
- ${var.git_sha}
- "--git_branch"
- ${var.git_branch}
libraries:
- whl: ./dist/*.whl
- task_key: model_update
condition_task:
op: "EQUAL_TO"
left: "{{tasks.evaluate_model.values.model_update}}"
right: "1"
depends_on:
- task_key: "evaluate_model"
- task_key: "deploy_model"
depends_on:
- task_key: "model_update"
outcome: "true"
job_cluster_key: "ad-click-cluster"
spark_python_task:
python_file: "src/mlops_with_databricks/pipeline/deploy_model.py"
libraries:
- whl: ./dist/*.whl

targets:
dev:
mode: development
default: true
workspace:
host: https://dbc-643c4c2b-d6c9.cloud.databricks.com
root_path: /Workspace/Users/[email protected]/.bundle/${bundle.target}/${bundle.name}

⚠️ Potential issue

Review environment configuration

Several architectural concerns in the environment setup:

  1. All environments (dev/stage/prod) use the same Databricks host, which doesn't follow separation of concerns
  2. The workspace paths are using a personal email path ([email protected]), which isn't suitable for production
  3. Consider using environment-specific workspace paths and hosts

Consider restructuring the targets to use:

  1. Separate Databricks workspaces for each environment
  2. Organization-wide workspace paths instead of personal ones
  3. Environment-specific configurations through variables

Also applies to: 120-120, 125-125


## Optionally, there could be 'staging' or 'prod' targets here.
#
# prod:
# workspace:
# host: https://dbc-45ad9c70-3532.cloud.databricks.com
stage:
mode: development
workspace:
host: https://dbc-643c4c2b-d6c9.cloud.databricks.com
root_path: /Workspace/Users/[email protected]/.bundle/${bundle.target}/${bundle.name}

prod:
workspace:
host: https://dbc-643c4c2b-d6c9.cloud.databricks.com
root_path: /Workspace/Users/[email protected]/.bundle/${bundle.target}/${bundle.name}
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -17,6 +17,8 @@ dependencies = [
"matplotlib>=3.9.2, <4",
"databricks-feature-engineering>=0.6, <1",
"pre-commit>=4.0.1",
"loguru>=0.7.2",
"build>=1.2.2.post1",
]

[project.optional-dependencies]
25 changes: 23 additions & 2 deletions run.sh
@@ -22,5 +22,26 @@ function deploy_package() {
fi
}

# Run the function
deploy_package
function run_databricks_bundle_stage() {
echo "Running Databricks bundle deploy..."
databricks bundle deploy --profile DEFAULT --target stage
echo "Databricks bundle deploy successful"
echo "Generating data..."
"/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/.venv/bin/python" "/Users/arturmakowski/.vscode/extensions/databricks.databricks-2.4.8-darwin-arm64/resources/python/dbconnect-bootstrap.py" "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/src/mlops_with_databricks/pipeline/generate_data.py"
echo "Data generated successfully"
echo "Running Databricks bundle run..."
databricks bundle run --profile DEFAULT --target stage
echo "Databricks bundle run successful"
}
Comment on lines +25 to +35

⚠️ Potential issue

Critical: Improve script robustness and portability

The staging deployment function has several issues that need to be addressed:

  1. Hardcoded absolute paths make the script non-portable and environment-dependent
  2. Missing error handling for critical operations
  3. Duplicated data generation logic between stage and prod functions

Consider refactoring using this approach:

+# Define common variables
+PYTHON_PATH="python"  # Or use environment variable
+DATA_GENERATOR_SCRIPT="src/mlops_with_databricks/pipeline/generate_data.py"

+function run_databricks_command() {
+    local command=$1
+    local target=$2
+    
+    echo "Running Databricks bundle $command..."
+    if ! databricks bundle $command --profile DEFAULT --target $target; then
+        echo "Error: Databricks bundle $command failed"
+        return 1
+    fi
+    echo "Databricks bundle $command successful"
+}

+function generate_data() {
+    echo "Generating data..."
+    if ! $PYTHON_PATH "$DATA_GENERATOR_SCRIPT"; then
+        echo "Error: Data generation failed"
+        return 1
+    fi
+    echo "Data generated successfully"
+}

 function run_databricks_bundle_stage() {
-    echo "Running Databricks bundle deploy..."
-    databricks bundle deploy --profile DEFAULT --target stage
-    echo "Databricks bundle deploy successful"
-    echo "Generating data..."
-    "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/.venv/bin/python" "/Users/arturmakowski/.vscode/extensions/databricks.databricks-2.4.8-darwin-arm64/resources/python/dbconnect-bootstrap.py" "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/src/mlops_with_databricks/pipeline/generate_data.py"
-    echo "Data generated successfully"
-    echo "Running Databricks bundle run..."
-    databricks bundle run --profile DEFAULT --target stage
-    echo "Databricks bundle run successful"
+    run_databricks_command "deploy" "stage" || return 1
+    generate_data || return 1
+    run_databricks_command "run" "stage" || return 1
 }


function run_databricks_bundle_prod() {
echo "Running Databricks bundle deploy..."
databricks bundle deploy --profile DEFAULT --target prod
echo "Databricks bundle deploy successful"
echo "Generating data..."
"/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/.venv/bin/python" "/Users/arturmakowski/.vscode/extensions/databricks.databricks-2.4.8-darwin-arm64/resources/python/dbconnect-bootstrap.py" "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/src/mlops_with_databricks/pipeline/generate_data.py"
echo "Data generated successfully"
echo "Running Databricks bundle run..."
databricks bundle run --profile DEFAULT --target prod
echo "Databricks bundle run successful"
}
@@ -105,4 +105,4 @@ class LightGBMConfig(TypedDict):
max_depth: int


light_gbm_config = LightGBMConfig(learning_rate=0.001, n_estimators=200, max_depth=10)
light_gbm_config = LightGBMConfig(learning_rate=0.1, n_estimators=400, max_depth=15)
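The hyperparameter bump above lives in a TypedDict; a self-contained sketch of the pattern (class shape and values mirrored from the diff):

```python
from typing import TypedDict


class LightGBMConfig(TypedDict):
    """Hyperparameters passed through to the LightGBM estimator."""

    learning_rate: float
    n_estimators: int
    max_depth: int


# Updated values from this PR: a higher learning rate with more, deeper trees.
light_gbm_config = LightGBMConfig(learning_rate=0.1, n_estimators=400, max_depth=15)
```

Worth noting: a TypedDict documents expected keys and types but enforces nothing at runtime — a type checker such as mypy catches mismatches statically, so out-of-range values (e.g. a negative `max_depth`) would still need explicit validation.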
Empty file.
27 changes: 27 additions & 0 deletions src/mlops_with_databricks/pipeline/deploy_model.py
@@ -0,0 +1,27 @@
"""This script is used to deploy the model to the serving endpoint. The model version is fetched from the evaluate_model task."""

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import ServedEntityInput

from mlops_with_databricks.data_preprocessing.dataclasses import DatabricksConfig, ModelConfig, ModelServingConfig

workspace = WorkspaceClient()

Comment on lines +8 to +9

🛠️ Refactor suggestion

Add error handling for workspace initialization

The workspace client initialization should include error handling and potentially accept configuration parameters for different environments.

Consider this improvement:

-workspace = WorkspaceClient()
+try:
+    workspace = WorkspaceClient(
+        host=DatabricksConfig.workspace_url,  # Add this to DatabricksConfig
+        token=DatabricksConfig.access_token   # Add this to DatabricksConfig
+    )
+except Exception as e:
+    raise RuntimeError(f"Failed to initialize Databricks workspace client: {str(e)}")

Committable suggestion skipped: line range outside the PR's diff.


model_version = workspace.dbutils.jobs.taskValues.get(taskKey="evaluate_model", key="model_version")
Comment on lines +10 to +11

⚠️ Potential issue

Add validation for model version retrieval

The model version retrieval lacks error handling and validation. This could lead to deployment failures if the task value is missing or invalid.

Suggested improvement:

-model_version = workspace.dbutils.jobs.taskValues.get(taskKey="evaluate_model", key="model_version")
+try:
+    model_version = workspace.dbutils.jobs.taskValues.get(taskKey="evaluate_model", key="model_version")
+    if not model_version:
+        raise ValueError("Model version not found in task values")
+    # Validate model version format if applicable
+    if not isinstance(model_version, (int, str)):
+        raise TypeError(f"Invalid model version type: {type(model_version)}")
+except Exception as e:
+    raise RuntimeError(f"Failed to retrieve model version: {str(e)}")



catalog_name = DatabricksConfig.catalog_name
schema_name = DatabricksConfig.schema_name

Comment on lines +14 to +16

🛠️ Refactor suggestion

Validate configuration values

Configuration values should be validated to ensure they are not empty or invalid before use.

Suggested improvement:

+def validate_config_value(name: str, value: str) -> str:
+    if not value or not isinstance(value, str):
+        raise ValueError(f"Invalid {name}: {value}")
+    return value
+
-catalog_name = DatabricksConfig.catalog_name
-schema_name = DatabricksConfig.schema_name
+catalog_name = validate_config_value("catalog_name", DatabricksConfig.catalog_name)
+schema_name = validate_config_value("schema_name", DatabricksConfig.schema_name)

workspace.serving_endpoints.update_config_and_wait(
name=ModelServingConfig.serving_endpoint_name,
served_entities=[
ServedEntityInput(
entity_name=f"{catalog_name}.{schema_name}.{ModelConfig.model_name}",
scale_to_zero_enabled=True,
workload_size="Small",
entity_version=model_version,
)
],
)
Comment on lines +17 to +27

⚠️ Potential issue

Enhance deployment robustness and monitoring

The endpoint update lacks error handling, timeout configuration, and deployment validation.

Consider these improvements:

+import time
+from databricks.sdk.service.serving import EndpointStateResponse
+
+def wait_for_endpoint_ready(workspace: WorkspaceClient, endpoint_name: str, timeout_seconds: int = 300) -> None:
+    start_time = time.time()
+    while time.time() - start_time < timeout_seconds:
+        state = workspace.serving_endpoints.get_state(name=endpoint_name)
+        if state.ready:
+            return
+        time.sleep(10)
+    raise TimeoutError(f"Endpoint {endpoint_name} not ready after {timeout_seconds} seconds")
+
+try:
     workspace.serving_endpoints.update_config_and_wait(
         name=ModelServingConfig.serving_endpoint_name,
         served_entities=[
             ServedEntityInput(
                 entity_name=f"{catalog_name}.{schema_name}.{ModelConfig.model_name}",
                 scale_to_zero_enabled=True,
                 workload_size="Small",
                 entity_version=model_version,
             )
         ],
+        timeout=300  # 5 minutes timeout
     )
+    # Validate deployment
+    wait_for_endpoint_ready(workspace, ModelServingConfig.serving_endpoint_name)
+    print(f"Successfully deployed model version {model_version} to endpoint {ModelServingConfig.serving_endpoint_name}")
+except Exception as e:
+    raise RuntimeError(f"Failed to deploy model to serving endpoint: {str(e)}")

121 changes: 121 additions & 0 deletions src/mlops_with_databricks/pipeline/evaluate_model.py
@@ -0,0 +1,121 @@
"""Evaluate the model and register it if it performs better than the previous model."""

import argparse
import sys

import mlflow
import mlflow.sklearn
from databricks import feature_engineering
from databricks.sdk import WorkspaceClient
from loguru import logger
from pyspark.sql import SparkSession
from sklearn.metrics import f1_score

from mlops_with_databricks.data_preprocessing.dataclasses import (
DatabricksConfig,
ModelServingConfig,
ProcessedAdClickDataConfig,
)

logger.remove()

logger.add(sink=sys.stderr, level="DEBUG")

parser = argparse.ArgumentParser()
parser.add_argument(
"--new_model_uri",
action="store",
default=None,
type=str,
required=True,
)

parser.add_argument(
"--job_run_id",
action="store",
default=None,
type=str,
required=True,
)

parser.add_argument(
"--git_sha",
action="store",
default=None,
type=str,
required=True,
)

parser.add_argument(
"--git_branch",
action="store",
default=None,
type=str,
required=True,
)


args = parser.parse_args()
new_model_uri = args.new_model_uri
job_run_id = args.job_run_id
git_sha = args.git_sha
git_branch = args.git_branch


spark = SparkSession.builder.getOrCreate()
workspace = WorkspaceClient()
fe = feature_engineering.FeatureEngineeringClient()

mlflow.set_registry_uri("databricks-uc")
mlflow.set_tracking_uri("databricks")

num_features = ProcessedAdClickDataConfig.num_features
cat_features = ProcessedAdClickDataConfig.cat_features
target = ProcessedAdClickDataConfig.target
catalog_name = DatabricksConfig.catalog_name
schema_name = DatabricksConfig.schema_name

serving_endpoint_name = ModelServingConfig.serving_endpoint_name
serving_endpoint = workspace.serving_endpoints.get(serving_endpoint_name)
model_name = serving_endpoint.config.served_models[0].model_name
model_version = serving_endpoint.config.served_models[0].model_version
previous_model_uri = f"models:/{model_name}/{model_version}"

test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas()

X_test = test_set[list(num_features) + list(cat_features)]
y_test = test_set[target]

logger.debug(f"New Model URI: {new_model_uri}")
logger.debug(f"Previous Model URI: {previous_model_uri}")

model_new = mlflow.sklearn.load_model(new_model_uri)
predictions_new = model_new.predict(X_test)

model_previous = mlflow.sklearn.load_model(previous_model_uri)
predictions_previous = model_previous.predict(X_test)

Comment on lines +92 to +97

🛠️ Refactor suggestion

Add exception handling when loading models and making predictions.

Loading models and making predictions might fail due to issues like incorrect URIs or model incompatibilities. Adding exception handling will improve the robustness of the script.

You can modify the code to include try-except blocks:

try:
    model_new = mlflow.sklearn.load_model(new_model_uri)
    predictions_new = model_new.predict(X_test)
except Exception as e:
    logger.error(f"Failed to load or predict with new model: {e}")
    sys.exit(1)

try:
    model_previous = mlflow.sklearn.load_model(previous_model_uri)
    predictions_previous = model_previous.predict(X_test)
except Exception as e:
    logger.error(f"Failed to load or predict with previous model: {e}")
    sys.exit(1)

logger.info(f"Predictions for New Model: {predictions_new}")
logger.info(f"Predictions for Old Model: {predictions_previous}")


# Calculate F1 scores
f1_new = f1_score(y_test, predictions_new)
f1_previous = f1_score(y_test, predictions_previous)

logger.info(f"F1 Score for New Model: {f1_new}")
logger.info(f"F1 Score for Old Model: {f1_previous}")

if f1_new > f1_previous:
logger.info("New model performs better. Registering...")
model_version = mlflow.register_model(
model_uri=new_model_uri,
name=f"{catalog_name}.{schema_name}.ad_click_model_basic",
tags={"branch": git_branch, "git_sha": f"{git_sha}", "job_run_id": job_run_id},
)
workspace.dbutils.jobs.taskValues.set(key="model_version", value=model_version.version)
workspace.dbutils.jobs.taskValues.set(key="model_update", value=1)
logger.info(f"New model registered with version: {model_version.version}")
else:
logger.info("Previous model performs better. No update needed.")
workspace.dbutils.jobs.taskValues.set(key="model_update", value=0)
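The promotion gate above hinges entirely on `f1_score`. For reference, F1 is the harmonic mean of precision and recall, and a pure-Python version is handy as a sanity check against the sklearn result (illustrative sketch, not part of the pipeline):

```python
def f1(y_true, y_pred):
    """F1 for binary labels: harmonic mean of precision and recall."""
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    if tp == 0:
        # No true positives: precision and recall are both zero (or undefined).
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)
```

Because the comparison is a strict `>`, a tie keeps the previous model — a reasonable default that avoids churning the registry on equal performance.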