From e7d6bcaf9f9972746abb11d2cb261fcdb82e1678 Mon Sep 17 00:00:00 2001
From: Artur Makowski
Date: Sat, 23 Nov 2024 20:11:38 +0100
Subject: [PATCH 1/2] feat: add CI/CD workflow for Databricks deployment

---
 .github/workflows/cd.yaml | 39 ++++++
 databricks.yml | 124 ++++++++++++++++--
 pyproject.toml | 2 +
 run.sh | 25 +++-
 .../data_preprocessing/dataclasses.py | 2 +-
 .../pipeline/__init__.py | 0
 .../pipeline/deploy_model.py | 27 ++++
 .../pipeline/evaluate_model.py | 121 +++++++++++++++++
 .../pipeline/generate_data.py | 75 +++++++++++
 .../pipeline/preprocess.py | 56 ++++++++
 .../pipeline/train_model.py | 117 +++++++++++++++++
 uv.lock | 67 ++++++++--
 12 files changed, 634 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/cd.yaml
 create mode 100644 src/mlops_with_databricks/pipeline/__init__.py
 create mode 100644 src/mlops_with_databricks/pipeline/deploy_model.py
 create mode 100644 src/mlops_with_databricks/pipeline/evaluate_model.py
 create mode 100644 src/mlops_with_databricks/pipeline/generate_data.py
 create mode 100644 src/mlops_with_databricks/pipeline/preprocess.py
 create mode 100644 src/mlops_with_databricks/pipeline/train_model.py

diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
new file mode 100644
index 0000000..5c2db08
--- /dev/null
+++ b/.github/workflows/cd.yaml
@@ -0,0 +1,39 @@
+name: MLOps with Databricks
+
+on:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - '[0-9]+.[0-9]+.[0-9]+'
+
+
+jobs:
+  setup-validate:
+    name: Set Up Environment
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Source Code
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          ref: ${{ github.ref_name }}
+
+      - name: Set Up Python
+        uses: actions/setup-python@b64ffcaf5b410884ad320a9cfac8866006a109aa
+        with:
+          python-version: 3.11
+
+      - name: Install UV
+        uses: astral-sh/setup-uv@2e657c127d5b1635d5a8e3fa40e0ac50a5bf6992
+
+      - name: Install Databricks CLI
+        uses: databricks/setup-cli@948d7379a31615a4c8e9ccbbc5445a12d6b32736
+        with:
+          version: 0.221.1
+
+      - name: Deploy to Databricks
+        env:
+          DATABRICKS_BUNDLE_ENV: prod # bundle target
+          DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
+        run: databricks bundle deploy --var="git_sha=${{ github.sha }}"
diff --git a/databricks.yml b/databricks.yml
index 04bc364..63cf547 100644
--- a/databricks.yml
+++ b/databricks.yml
@@ -1,9 +1,109 @@
-# This is a Databricks asset bundle definition for module-code-examples.
-# The Databricks extension requires databricks.yml configuration file.
-# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
-
 bundle:
   name: armak-bundle
+  cluster_id: 1020-170922-owfkgqkz
+
+artifacts:
+  default:
+    type: whl
+    build: python -m build
+    path: .
+
+variables:
+  root_path:
+    description: root_path for the target
+    default: /Shared/.bundle/${bundle.target}/${bundle.name}
+  git_sha:
+    description: git_sha
+    default: ${bundle.git.commit}
+  git_branch:
+    description: git_branch
+    default: ${bundle.git.branch}
+
+
+resources:
+  jobs:
+    ad-click:
+      name: ad-click-workflow
+      schedule:
+        quartz_cron_expression: "0 0 6 ?
* MON" + timezone_id: "Europe/Warsaw" + pause_status: UNPAUSED + tags: + project_name: "ad-click" + job_clusters: + - job_cluster_key: "ad-click-cluster" + new_cluster: + spark_version: "15.4.x-scala2.12" + data_security_mode: "SINGLE_USER" + node_type_id: "i3.xlarge" + driver_node_type_id: "i3.xlarge" + autoscale: + min_workers: 1 + max_workers: 1 + + tasks: + - task_key: "preprocessing" + job_cluster_key: "had-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/preprocess.py" + libraries: + - whl: ./dist/*.whl + - task_key: if_refreshed + condition_task: + op: "EQUAL_TO" + left: "{{tasks.preprocessing.values.refreshed}}" + right: "1" + depends_on: + - task_key: "preprocessing" + - task_key: "train_model" + depends_on: + - task_key: "if_refreshed" + outcome: "true" + job_cluster_key: "ad-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/train_model.py" + parameters: + - "--git_sha" + - ${var.git_sha} + - "--git_branch" + - ${var.git_branch} + - "--job_run_id" + - "{{job.id}}" + libraries: + - whl: ./dist/*.whl + - task_key: "evaluate_model" + depends_on: + - task_key: "train_model" + job_cluster_key: "ad-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/evaluate_model.py" + parameters: + - "--new_model_uri" + - "{{tasks.train_model.values.new_model_uri}}" + - "--job_run_id" + - "{{job.id}}" + - "--git_sha" + - ${var.git_sha} + - "--git_branch" + - ${var.git_branch} + libraries: + - whl: ./dist/*.whl + - task_key: model_update + condition_task: + op: "EQUAL_TO" + left: "{{tasks.evaluate_model.values.model_update}}" + right: "1" + depends_on: + - task_key: "evaluate_model" + - task_key: "deploy_model" + depends_on: + - task_key: "model_update" + outcome: "true" + job_cluster_key: "ad-click-cluster" + spark_python_task: + python_file: "src/mlops_with_databricks/pipeline/deploy_model.py" + libraries: + - whl: ./dist/*.whl targets: dev: @@ -11,9 +111,15 @@ targets: default: true workspace: host: https://dbc-643c4c2b-d6c9.cloud.databricks.com + root_path: /Workspace/Users/armak58@gmail.com/.bundle/${bundle.target}/${bundle.name} - ## Optionally, there could be 'staging' or 'prod' targets here. - # - # prod: - # workspace: - # host: https://dbc-45ad9c70-3532.cloud.databricks.com + stage: + mode: development + workspace: + host: https://dbc-643c4c2b-d6c9.cloud.databricks.com + root_path: /Workspace/Users/armak58@gmail.com/.bundle/${bundle.target}/${bundle.name} + + prod: + workspace: + host: https://dbc-643c4c2b-d6c9.cloud.databricks.com + root_path: ${var.root_path} diff --git a/pyproject.toml b/pyproject.toml index 25e0709..ec1d56d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,8 @@ dependencies = [ "matplotlib>=3.9.2, <4", "databricks-feature-engineering>=0.6, <1", "pre-commit>=4.0.1", + "loguru>=0.7.2", + "build>=1.2.2.post1", ] [project.optional-dependencies] diff --git a/run.sh b/run.sh index d5a7af9..5ae3e2f 100755 --- a/run.sh +++ b/run.sh @@ -22,5 +22,26 @@ function deploy_package() { fi } -# Run the function -deploy_package +function run_databricks_bundle_stage() { + echo "Running Databricks bundle deploy..." + databricks bundle deploy --profile DEFAULT --target stage + echo "Databricks bundle deploy successful" + echo "Generating data..." 
+ "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/.venv/bin/python" "/Users/arturmakowski/.vscode/extensions/databricks.databricks-2.4.8-darwin-arm64/resources/python/dbconnect-bootstrap.py" "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/src/mlops_with_databricks/pipeline/generate_data.py" + echo "Data generated successfully" + echo "Running Databricks bundle run..." + databricks bundle run --profile DEFAULT --target stage + echo "Databricks bundle run successful" +} + +function run_databricks_bundle_prod() { + echo "Running Databricks bundle deploy..." + databricks bundle deploy --profile DEFAULT --target prod + echo "Databricks bundle deploy successful" + echo "Generating data..." + "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/.venv/bin/python" "/Users/arturmakowski/.vscode/extensions/databricks.databricks-2.4.8-darwin-arm64/resources/python/dbconnect-bootstrap.py" "/Users/arturmakowski/Documents/Python_projects/marvelous-databricks-course-ArturMakowski/src/mlops_with_databricks/pipeline/generate_data.py" + echo "Data generated successfully" + echo "Running Databricks bundle run..." + databricks bundle run --profile DEFAULT --target prod + echo "Databricks bundle run successful" +} diff --git a/src/mlops_with_databricks/data_preprocessing/dataclasses.py b/src/mlops_with_databricks/data_preprocessing/dataclasses.py index baa430a..2ec4e86 100644 --- a/src/mlops_with_databricks/data_preprocessing/dataclasses.py +++ b/src/mlops_with_databricks/data_preprocessing/dataclasses.py @@ -105,4 +105,4 @@ class LightGBMConfig(TypedDict): max_depth: int -light_gbm_config = LightGBMConfig(learning_rate=0.001, n_estimators=200, max_depth=10) +light_gbm_config = LightGBMConfig(learning_rate=0.1, n_estimators=400, max_depth=15) diff --git a/src/mlops_with_databricks/pipeline/__init__.py b/src/mlops_with_databricks/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mlops_with_databricks/pipeline/deploy_model.py b/src/mlops_with_databricks/pipeline/deploy_model.py new file mode 100644 index 0000000..571ff59 --- /dev/null +++ b/src/mlops_with_databricks/pipeline/deploy_model.py @@ -0,0 +1,27 @@ +"""This script is used to deploy the model to the serving endpoint. 
The model version is fetched from the evaluate_model task.""" + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.serving import ServedEntityInput + +from mlops_with_databricks.data_preprocessing.dataclasses import DatabricksConfig, ModelConfig, ModelServingConfig + +workspace = WorkspaceClient() + + +model_version = workspace.dbutils.jobs.taskValues.get(taskKey="evaluate_model", key="model_version") + + +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +workspace.serving_endpoints.update_config_and_wait( + name=ModelServingConfig.serving_endpoint_name, + served_entities=[ + ServedEntityInput( + entity_name=f"{catalog_name}.{schema_name}.{ModelConfig.model_name}", + scale_to_zero_enabled=True, + workload_size="Small", + entity_version=model_version, + ) + ], +) diff --git a/src/mlops_with_databricks/pipeline/evaluate_model.py b/src/mlops_with_databricks/pipeline/evaluate_model.py new file mode 100644 index 0000000..832176e --- /dev/null +++ b/src/mlops_with_databricks/pipeline/evaluate_model.py @@ -0,0 +1,121 @@ +"""Evaluate the model and register it if it performs better than the previous model.""" + +import argparse +import sys + +import mlflow +import mlflow.sklearn +from databricks import feature_engineering +from databricks.sdk import WorkspaceClient +from loguru import logger +from pyspark.sql import SparkSession +from sklearn.metrics import f1_score + +from mlops_with_databricks.data_preprocessing.dataclasses import ( + DatabricksConfig, + ModelServingConfig, + ProcessedAdClickDataConfig, +) + +logger.remove() + +logger.add(sink=sys.stderr, level="DEBUG") + +parser = argparse.ArgumentParser() +parser.add_argument( + "--new_model_uri", + action="store", + default=None, + type=str, + required=True, +) + +parser.add_argument( + "--job_run_id", + action="store", + default=None, + type=str, + required=True, +) + +parser.add_argument( + "--git_sha", + action="store", + default=None, + type=str, + required=True, +) + +parser.add_argument( + "--git_branch", + action="store", + default=None, + type=str, + required=True, +) + + +args = parser.parse_args() +new_model_uri = args.new_model_uri +job_run_id = args.job_run_id +git_sha = args.git_sha +git_branch = args.git_branch + + +spark = SparkSession.builder.getOrCreate() +workspace = WorkspaceClient() +fe = feature_engineering.FeatureEngineeringClient() + +mlflow.set_registry_uri("databricks-uc") +mlflow.set_tracking_uri("databricks") + +num_features = ProcessedAdClickDataConfig.num_features +cat_features = ProcessedAdClickDataConfig.cat_features +target = ProcessedAdClickDataConfig.target +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +serving_endpoint_name = ModelServingConfig.serving_endpoint_name +serving_endpoint = workspace.serving_endpoints.get(serving_endpoint_name) +model_name = serving_endpoint.config.served_models[0].model_name +model_version = serving_endpoint.config.served_models[0].model_version +previous_model_uri = f"models:/{model_name}/{model_version}" + +test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas() + +X_test = test_set[list(num_features) + list(cat_features)] +y_test = test_set[target] + +logger.debug(f"New Model URI: {new_model_uri}") +logger.debug(f"Previous Model URI: {previous_model_uri}") + +model_new = mlflow.sklearn.load_model(new_model_uri) +predictions_new = model_new.predict(X_test) + +model_previous = mlflow.sklearn.load_model(previous_model_uri) +predictions_previous = 
model_previous.predict(X_test) + +logger.info(f"Predictions for New Model: {predictions_new}") +logger.info(f"Previous for Old Model: {predictions_previous}") + + +# Calculate F1 scores +f1_new = f1_score(y_test, predictions_new) +f1_previous = f1_score(y_test, predictions_previous) + +logger.info(f"F1 Score for New Model: {f1_new}") +logger.info(f"F1 Score for Old Model: {f1_previous}") + +if f1_new > f1_previous: + logger.info("New model performs better. Registering...") + model_version = mlflow.register_model( + model_uri=new_model_uri, + name=f"{catalog_name}.{schema_name}.ad_click_model_basic", + tags={"branch": git_branch, "git_sha": f"{git_sha}", "job_run_id": job_run_id}, + ) + workspace.dbutils.jobs.taskValues.set(key="model_version", value=model_version.version) + workspace.dbutils.jobs.taskValues.set(key="model_update", value=1) + logger.info(f"New model registered with version: {model_version.version}") +else: + logger.info("Previous model performs better. No update needed.") + workspace.dbutils.jobs.taskValues.set(key="model_update", value=0) diff --git a/src/mlops_with_databricks/pipeline/generate_data.py b/src/mlops_with_databricks/pipeline/generate_data.py new file mode 100644 index 0000000..dc8cb96 --- /dev/null +++ b/src/mlops_with_databricks/pipeline/generate_data.py @@ -0,0 +1,75 @@ +"""Generate synthetic data and save it to the source_data table.""" + +import numpy as np +import pandas as pd +from loguru import logger +from pyspark.sql import SparkSession +from pyspark.sql.functions import current_timestamp, to_utc_timestamp + +from mlops_with_databricks.data_preprocessing.dataclasses import ( + DatabricksConfig, +) + +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +spark = SparkSession.builder.getOrCreate() + +train_set_spark = spark.table(f"{catalog_name}.{schema_name}.train_set") +train_set = spark.table(f"{catalog_name}.{schema_name}.train_set").toPandas() +test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas() +combined_set = pd.concat([train_set, test_set], ignore_index=True) + + +def create_synthetic_data(df: pd.DataFrame, num_rows=100) -> pd.DataFrame: + synthetic_data = pd.DataFrame() + + for column in df.columns: + logger.info(f"Creating synthetic data for column: {column}") + if column == "click": + synthetic_data[column] = np.random.choice([0, 1], num_rows, p=[0.5, 0.5]) + else: + if pd.api.types.is_numeric_dtype(df[column]): + max, min = df[column].max(), df[column].min() + synthetic_data[column] = np.random.randint(min, max, num_rows) + + elif pd.api.types.is_object_dtype(df[column]): + synthetic_data[column] = np.random.choice( + df[column].unique(), num_rows, p=df[column].value_counts(normalize=True) + ) + + elif isinstance(df[column].dtype, pd.CategoricalDtype) or isinstance(df[column].dtype, pd.StringDtype): + synthetic_data[column] = np.random.choice( + df[column].unique(), num_rows, p=df[column].value_counts(normalize=True) + ) + + elif pd.api.types.is_datetime64_any_dtype(df[column]): + min_date, max_date = df[column].min(), df[column].max() + if min_date < max_date: + synthetic_data[column] = pd.to_datetime(np.random.randint(min_date.value, max_date.value, num_rows)) + else: + synthetic_data[column] = [min_date] * num_rows + + else: + synthetic_data[column] = np.random.choice(df[column], num_rows) + + return synthetic_data + + +synthetic_df = create_synthetic_data(combined_set) + +existing_schema = spark.table(f"{catalog_name}.{schema_name}.train_set").schema + +synthetic_spark_df = 
spark.createDataFrame(synthetic_df, schema=existing_schema) + +train_set_with_timestamp = synthetic_spark_df.withColumn( + "update_timestamp_utc", to_utc_timestamp(current_timestamp(), "UTC") +) + +train_set_with_timestamp.show(5) +train_set_with_timestamp.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.source_data") + +spark.sql( + f"ALTER TABLE {DatabricksConfig.catalog_name}.{DatabricksConfig.schema_name}.source_data " + "SET TBLPROPERTIES (delta.enableChangeDataFeed = true);" +) diff --git a/src/mlops_with_databricks/pipeline/preprocess.py b/src/mlops_with_databricks/pipeline/preprocess.py new file mode 100644 index 0000000..c6df80a --- /dev/null +++ b/src/mlops_with_databricks/pipeline/preprocess.py @@ -0,0 +1,56 @@ +"""Preprocess data and update train and test sets.""" + +import argparse + +from databricks.sdk import WorkspaceClient +from pyspark.sql import SparkSession +from pyspark.sql.functions import col +from pyspark.sql.functions import max as spark_max + +from mlops_with_databricks.data_preprocessing.dataclasses import DatabricksConfig + +workspace = WorkspaceClient() + + +parser = argparse.ArgumentParser() + +args = parser.parse_args() + +spark = SparkSession.builder.getOrCreate() + +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + + +source_data = spark.table(f"{catalog_name}.{schema_name}.source_data") + +max_train_timestamp = ( + spark.table(f"{catalog_name}.{schema_name}.train_set") + .select(spark_max("update_timestamp_utc").alias("max_update_timestamp")) + .collect()[0]["max_update_timestamp"] +) + +max_test_timestamp = ( + spark.table(f"{catalog_name}.{schema_name}.test_set") + .select(spark_max("update_timestamp_utc").alias("max_update_timestamp")) + .collect()[0]["max_update_timestamp"] +) + +latest_timestamp = max(max_train_timestamp, max_test_timestamp) + +new_data = source_data.filter(col("update_timestamp_utc") > latest_timestamp) + +new_data_train, new_data_test = new_data.randomSplit([0.8, 0.2], seed=42) + +new_data_train.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.train_set") +new_data_test.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.test_set") + +affected_rows_train = new_data_train.count() +affected_rows_test = new_data_test.count() + +if affected_rows_train > 0 or affected_rows_test > 0: + refreshed = 1 +else: + refreshed = 0 + +workspace.dbutils.jobs.taskValues.set(key="refreshed", value=refreshed) diff --git a/src/mlops_with_databricks/pipeline/train_model.py b/src/mlops_with_databricks/pipeline/train_model.py new file mode 100644 index 0000000..dfbe7fb --- /dev/null +++ b/src/mlops_with_databricks/pipeline/train_model.py @@ -0,0 +1,117 @@ +"""Train a LightGBM model with preprocessing and log the model to MLflow.""" + +import argparse + +import mlflow +from databricks.sdk import WorkspaceClient +from lightgbm import LGBMClassifier +from mlflow.models import infer_signature +from pyspark.sql import SparkSession +from sklearn.compose import ColumnTransformer +from sklearn.metrics import ( + accuracy_score, + f1_score, + precision_score, + recall_score, + roc_auc_score, +) +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +from mlops_with_databricks.data_preprocessing.dataclasses import ( + DatabricksConfig, + ProcessedAdClickDataConfig, + light_gbm_config, +) + +parser = argparse.ArgumentParser() +parser.add_argument( + "--git_sha", + action="store", + default=None, + type=str, + required=True, +) +parser.add_argument( + 
"--git_branch", + action="store", + default=None, + type=str, + required=True, +) +parser.add_argument( + "--job_run_id", + action="store", + default=None, + type=str, + required=True, +) + +args = parser.parse_args() +git_sha = args.git_sha +git_branch = args.git_branch +job_run_id = args.job_run_id + + +spark = SparkSession.builder.getOrCreate() +workspace = WorkspaceClient() + +mlflow.set_registry_uri("databricks-uc") +mlflow.set_tracking_uri("databricks") + + +num_features = ProcessedAdClickDataConfig.num_features +cat_features = ProcessedAdClickDataConfig.cat_features +target = ProcessedAdClickDataConfig.target +catalog_name = DatabricksConfig.catalog_name +schema_name = DatabricksConfig.schema_name + +train_set_spark = spark.table(f"{catalog_name}.{schema_name}.train_set") +train_set = spark.table(f"{catalog_name}.{schema_name}.train_set").toPandas() +test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas() + +X_train = train_set[list(num_features) + list(cat_features)] +y_train = train_set[target] + +X_test = test_set[list(num_features) + list(cat_features)] +y_test = test_set[target] + +preprocessor = ColumnTransformer( + transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)], remainder="passthrough" +) +pipeline = Pipeline(steps=[("onehot", preprocessor), ("classifier", LGBMClassifier(**light_gbm_config))]) + +mlflow.set_experiment(experiment_name="/Shared/ad-click") + +with mlflow.start_run(tags={"branch": git_branch, "git_sha": f"{git_sha}", "job_run_id": job_run_id}) as run: + run_id = run.info.run_id + + pipeline.fit(X_train, y_train) + y_pred = pipeline.predict(X_test) + + f1 = f1_score(y_test, y_pred) + precision = precision_score(y_test, y_pred) + recall = recall_score(y_test, y_pred) + roc_auc = roc_auc_score(y_test, y_pred) + accuracy = accuracy_score(y_test, y_pred) + + mlflow.log_param("model_type", "LightGBM with preprocessing") + + parameters = { + "classifier__learning_rate": light_gbm_config["learning_rate"], + "classifier__n_estimators": light_gbm_config["n_estimators"], + "classifier__max_depth": light_gbm_config["max_depth"], + } + + mlflow.log_params(parameters) + mlflow.log_metrics({"f1": f1, "accuracy": accuracy, "precision": precision, "recall": recall, "roc_auc": roc_auc}) + signature = infer_signature(model_input=X_test, model_output=y_pred) + + dataset = mlflow.data.from_spark(train_set_spark, table_name=f"{catalog_name}.{schema_name}.train_set", version="0") + mlflow.log_input(dataset, context="training") + + mlflow.sklearn.log_model(sk_model=pipeline, artifact_path="lightgbm-pipeline-model", signature=signature) + + +model_uri = f"runs:/{run_id}/lightgbm-pipeline-model" +workspace.dbutils.jobs.taskValues.set(key="new_model_uri", value=model_uri) diff --git a/uv.lock b/uv.lock index 8597d17..cadbc10 100644 --- a/uv.lock +++ b/uv.lock @@ -1,8 +1,10 @@ version = 1 -requires-python = "==3.11.10" +requires-python = ">=3.11" resolution-markers = [ - "platform_system != 'Windows'", - "platform_system == 'Windows'", + "python_full_version < '3.12' and platform_system != 'Windows'", + "python_full_version >= '3.12' and platform_system != 'Windows'", + "python_full_version < '3.12' and platform_system == 'Windows'", + "python_full_version >= '3.12' and platform_system == 'Windows'", ] [[package]] @@ -135,13 +137,27 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, - { name = "urllib3", marker = "python_full_version == '3.11.10'" }, + { name = "urllib3" 
}, ] sdist = { url = "https://files.pythonhosted.org/packages/d3/0c/2bcd566397ab06661b222b9b5156ba0c40d5a97d3727c88ccaefea275cb4/botocore-1.35.42.tar.gz", hash = "sha256:af348636f73dc24b7e2dc760a34d08c8f2f94366e9b4c78d877307b128abecef", size = 12835012 } wheels = [ { url = "https://files.pythonhosted.org/packages/2e/f5/0e67c7e6a7f5f8c068cf444dc25d03097a22428380587542978d7ad9d86a/botocore-1.35.42-py3-none-any.whl", hash = "sha256:05af0bb8b9cea7ce7bc589c332348d338a21b784e9d088a588fd10ec145007ff", size = 12621471 }, ] +[[package]] +name = "build" +version = "1.2.2.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "os_name == 'nt'" }, + { name = "packaging" }, + { name = "pyproject-hooks" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/46/aeab111f8e06793e4f0e421fcad593d547fb8313b50990f31681ee2fb1ad/build-1.2.2.post1.tar.gz", hash = "sha256:b36993e92ca9375a219c99e606a122ff365a760a2d4bba0caa09bd5278b608b7", size = 46701 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5", size = 22950 }, +] + [[package]] name = "cachetools" version = "5.5.0" @@ -925,7 +941,7 @@ dependencies = [ { name = "pygments" }, { name = "stack-data" }, { name = "traitlets" }, - { name = "typing-extensions", marker = "python_full_version == '3.11.10'" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f7/21/48db7d9dd622b9692575004c7c98f85f5629428f58596c59606d36c51b58/ipython-8.28.0.tar.gz", hash = "sha256:0d0d15ca1e01faeb868ef56bc7ee5a0de5bd66885735682e8a322ae289a13d1a", size = 5495762 } wheels = [ @@ -1095,6 +1111,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/28/3be76b591a2e14a031b681b8283acf1dec2ad521f6f1701b7957df68c466/lightgbm-4.5.0-py3-none-win_amd64.whl", hash = "sha256:7ccb73ee9fb74fbbf89ad24c57a6edad505aa8f2165d02b999a082dbbbb0ee57", size = 1444319 }, ] +[[package]] +name = "loguru" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/30/d87a423766b24db416a46e9335b9602b054a72b96a88a241f2b09b560fa8/loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac", size = 145103 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/0a/4f6fed21aa246c6b49b561ca55facacc2a44b87d65b8b92362a8e99ba202/loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb", size = 62549 }, +] + [[package]] name = "mako" version = "1.3.5" @@ -1282,10 +1311,12 @@ name = "mlops-with-databricks" version = "0.0.1" source = { editable = "." 
} dependencies = [ + { name = "build" }, { name = "cffi" }, { name = "cloudpickle" }, { name = "databricks-feature-engineering" }, { name = "lightgbm" }, + { name = "loguru" }, { name = "matplotlib" }, { name = "mlflow" }, { name = "numpy" }, @@ -1306,6 +1337,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "build", specifier = ">=1.2.2.post1" }, { name = "cffi", specifier = ">=1.17.1,<2" }, { name = "cloudpickle", specifier = ">=3.0.0,<4" }, { name = "databricks-connect", marker = "extra == 'dev'", specifier = ">=15.4.1,<16" }, @@ -1313,6 +1345,7 @@ requires-dist = [ { name = "databricks-sdk", marker = "extra == 'dev'", specifier = ">=0.32.0,<0.33" }, { name = "ipykernel", marker = "extra == 'dev'", specifier = ">=6.29.5,<7" }, { name = "lightgbm", specifier = ">=4.5.0,<5" }, + { name = "loguru", specifier = ">=0.7.2" }, { name = "matplotlib", specifier = ">=3.9.2,<4" }, { name = "mlflow", specifier = ">=2.16.0,<3" }, { name = "numpy", specifier = ">=1.26.4,<2" }, @@ -1417,7 +1450,7 @@ name = "pandas" version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", marker = "python_full_version == '3.11.10'" }, + { name = "numpy" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -1601,8 +1634,6 @@ version = "6.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/18/c7/8c6872f7372eb6a6b2e4708b88419fb46b857f7a2e1892966b851cc79fc9/psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2", size = 508067 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/66/78c9c3020f573c58101dc43a44f6855d01bbbd747e24da2f0c4491200ea3/psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35", size = 249766 }, - { url = "https://files.pythonhosted.org/packages/e1/3f/2403aa9558bea4d3854b0e5e567bc3dd8e9fbc1fc4453c0aa9aafeb75467/psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1", size = 253024 }, { url = "https://files.pythonhosted.org/packages/0b/37/f8da2fbd29690b3557cca414c1949f92162981920699cd62095a984983bf/psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0", size = 250961 }, { url = "https://files.pythonhosted.org/packages/35/56/72f86175e81c656a01c4401cd3b1c923f891b31fbcebe98985894176d7c9/psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0", size = 287478 }, { url = "https://files.pythonhosted.org/packages/19/74/f59e7e0d392bc1070e9a70e2f9190d652487ac115bb16e2eff6b22ad1d24/psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd", size = 290455 }, @@ -1712,6 +1743,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/be/ec/2eb3cd785efd67806c46c13a17339708ddc346cbb684eade7a6e6f79536a/pyparsing-3.2.0-py3-none-any.whl", hash = "sha256:93d9577b88da0bbea8cc8334ee8b918ed014968fd2ec383e868fb8afb1ccef84", size = 106921 }, ] +[[package]] +name = "pyproject-hooks" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e7/82/28175b2414effca1cdac8dc99f76d660e7a4fb0ceefa4b4ab8f5f6742925/pyproject_hooks-1.2.0.tar.gz", hash = "sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8", size = 19228 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216 }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -1975,7 +2015,7 @@ name = "sqlalchemy" version = "2.0.36" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "greenlet", marker = "(platform_machine == 'AMD64' and python_full_version == '3.11.10') or (platform_machine == 'WIN32' and python_full_version == '3.11.10') or (platform_machine == 'aarch64' and python_full_version == '3.11.10') or (platform_machine == 'amd64' and python_full_version == '3.11.10') or (platform_machine == 'ppc64le' and python_full_version == '3.11.10') or (platform_machine == 'win32' and python_full_version == '3.11.10') or (platform_machine == 'x86_64' and python_full_version == '3.11.10')" }, + { name = "greenlet", marker = "(python_full_version < '3.13' and platform_machine == 'AMD64') or (python_full_version < '3.13' and platform_machine == 'WIN32') or (python_full_version < '3.13' and platform_machine == 'aarch64') or (python_full_version < '3.13' and platform_machine == 'amd64') or (python_full_version < '3.13' and platform_machine == 'ppc64le') or (python_full_version < '3.13' and platform_machine == 'win32') or (python_full_version < '3.13' and platform_machine == 'x86_64')" }, { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/65/9cbc9c4c3287bed2499e05033e207473504dc4df999ce49385fb1f8b058a/sqlalchemy-2.0.36.tar.gz", hash = "sha256:7f2767680b6d2398aea7082e45a774b2b0767b5c8d8ffb9c8b683088ea9b29c5", size = 9574485 } @@ -2137,6 +2177,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/84/997bbf7c2bf2dc3f09565c6d0b4959fefe5355c18c4096cfd26d83e0785b/werkzeug-3.0.4-py3-none-any.whl", hash = "sha256:02c9eb92b7d6c06f31a782811505d2157837cea66aaede3e217c7c27c039476c", size = 227554 }, ] +[[package]] +name = "win32-setctime" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6b/dd/f95a13d2b235a28d613ba23ebad55191514550debb968b46aab99f2e3a30/win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2", size = 3676 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/e6/a7d828fef907843b2a5773ebff47fb79ac0c1c88d60c0ca9530ee941e248/win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad", size = 3604 }, +] + [[package]] name = "wrapt" version = "1.16.0" From 0c5b4a4ed1e1192906aba0ac19a5eded704417d8 Mon Sep 17 00:00:00 2001 From: Artur Makowski Date: Sun, 24 Nov 2024 16:13:18 +0100 Subject: [PATCH 2/2] fix: databrick.yaml --- databricks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databricks.yml b/databricks.yml index 63cf547..33bd5c9 100644 --- a/databricks.yml +++ b/databricks.yml @@ -43,7 +43,7 @@ resources: tasks: - task_key: "preprocessing" - job_cluster_key: "had-click-cluster" + job_cluster_key: "ad-click-cluster" spark_python_task: python_file: 
"src/mlops_with_databricks/pipeline/preprocess.py" libraries: @@ -122,4 +122,4 @@ targets: prod: workspace: host: https://dbc-643c4c2b-d6c9.cloud.databricks.com - root_path: ${var.root_path} + root_path: /Workspace/Users/armak58@gmail.com/.bundle/${bundle.target}/${bundle.name}