
🚀 week-01-environment-setup #1

Closed · wants to merge 19 commits
6 changes: 5 additions & 1 deletion .github/CODEOWNERS
@@ -1 +1,5 @@
-* @end-to-end-mlops-databricks/teachers @MahaAmin
+# These owners will be the default owners for everything in
+# the repo. Unless a later match takes precedence,
+# @global-owner1 and @global-owner2 will be requested for
+# review when someone opens a pull request.
+* @end-to-end-mlops-databricks/teachers @MahaAmin
5 changes: 3 additions & 2 deletions .github/workflows/ci.yml
@@ -18,8 +18,9 @@ jobs:
         run: uv python install 3.11

       - name: Install the dependencies
-        run: uv sync
+        run: uv sync

       - name: Run pre-commit checks
         run: |
-          pre-commit run --all-files
+          uv pip install pre-commit
+          uv run pre-commit run --all-files
Comment on lines +25 to +26

🛠️ Refactor suggestion

Consider managing pre-commit through project dependencies

The separate installation of pre-commit can lead to version conflicts and adds unnecessary build-time overhead. Instead:

  1. Add pre-commit to your project dependencies (pyproject.toml)
  2. Remove the separate installation step, since uv sync will handle it

Apply this diff to streamline the workflow:

      - name: Run pre-commit checks
        run: |
-          uv pip install pre-commit
-          uv run pre-commit run --all-files
+          uv run pre-commit run --all-files

And add pre-commit to your development dependencies in pyproject.toml:

[project.optional-dependencies]
dev = [
    "pre-commit",
    # ... other dev dependencies
]
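An alternative worth noting: uv also supports PEP 735 dependency groups, which `uv sync` installs by default, so pre-commit would be available in CI without extra flags. A sketch (the version bound is illustrative):

```
[dependency-groups]
dev = [
    "pre-commit>=3.5",
]
```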

6 changes: 5 additions & 1 deletion .gitignore
@@ -10,7 +10,8 @@ __pycache__/
 *.so

 # Folders
-# data/
+catboost_info/
+data/

 # Distribution / packaging
 .Python
@@ -30,6 +31,7 @@ wheels/
 .installed.cfg
 *.egg
 MANIFEST
+venv/

 # PyInstaller
 # Usually these files are written by a python script from a template
@@ -95,3 +97,5 @@ dmypy.json
 # VS code configuration
 .vscode
 .history
+
+.databricks
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -13,5 +13,4 @@ repos:
     rev: v0.6.9
     hooks:
       - id: ruff
-        args: [--fix, --exit-non-zero-on-fix, --show-fixes]
       - id: ruff-format
54 changes: 42 additions & 12 deletions README.md
@@ -1,26 +1,56 @@
<h1 align="center">
Marvelous MLOps End-to-end MLOps with Databricks course

## Practical information
- Weekly lectures on Wednesdays 16:00-18:00 CET.
- Code for the lecture is shared before the lecture.
- Presentation and lecture materials are shared right after the lecture.
- Video of the lecture is uploaded within 24 hours after the lecture.
## Course Project Description

- Every week we set up a deliverable, and you implement it with your own dataset.
- To submit the deliverable, create a feature branch in that repository, and a PR to main branch. The code can be merged after we review & approve & CI pipeline runs successfully.
- The deliverables can be submitted with a delay (for example, lecture 1 & 2 together), but we expect you to finish all assignments for the course before the 25th of November.
- **Dataset:** [Kaggle - Credit Card Fraud Detection Dataset 2023](https://www.kaggle.com/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023)


### Course Deliverables

#### PR #1

- Azure Databricks Environment Setup
- Select dataset [Kaggle - Credit Card Fraud Detection Dataset 2023](https://www.kaggle.com/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023)
- Run Python notebooks on a Databricks cluster for the fraud_credit_cards use case
- Create "DataProcessor" and "FraudModel" classes
- Push data.csv to a Databricks volume
- Push package.whl to a Databricks volume (see the CLI sketch below)
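The last two items can be done from a terminal with the Databricks CLI. A minimal sketch, assuming the Unity Catalog volume path used by the notebook in this PR (`/Volumes/fraud_credit_cards/data/credit_cards_2023`); the wheel destination is illustrative:

```
# Illustrative commands: copy the dataset and the built wheel into a Unity Catalog volume.
# The volume path is taken from the notebook's read path; adjust catalog/schema/volume as needed.
databricks fs cp data/creditcard_2023.csv dbfs:/Volumes/fraud_credit_cards/data/credit_cards_2023/creditcard_2023.csv
databricks fs cp dist/fraud_credit_cards-0.0.1-py3-none-any.whl dbfs:/Volumes/fraud_credit_cards/data/credit_cards_2023/fraud_credit_cards-0.0.1-py3-none-any.whl
```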


## Set up your environment
In this course, we use Databricks 15.4 LTS runtime, which uses Python 3.11.
In our examples, we use UV. Check out the documentation on how to install it: https://docs.astral.sh/uv/getting-started/installation/
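For reference, the installer documented at that link is currently a one-line script; verify against the uv docs before running:

```
curl -LsSf https://astral.sh/uv/install.sh | sh
```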

-To create a new environment and create a lockfile, run:
+### To create a new environment and create a lockfile, run:

```
-uv venv -p 3.11.0 venv
-source venv/bin/activate
+uv venv -p 3.11.11 .venv
+source .venv/bin/activate
uv pip install -r pyproject.toml --all-extras
uv lock
```

### To build the fraud_credit_cards package

```
uv build
```
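`uv build` relies on the build backend declared in pyproject.toml, which is not shown in this diff. A minimal sketch of such a table, assuming hatchling as the backend:

```
# Illustrative [build-system] table; the actual pyproject.toml may declare a different backend.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
```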

To install and run the fraud_credit_cards package:

```
uv pip install dist/fraud_credit_cards-0.0.1-py3-none-any.whl
```

```
uv run python main.py
```

### Pre-Commit Checks

To run pre-commit checks:

```
uv run pre-commit run --all-files
```
3 changes: 3 additions & 0 deletions config.json
@@ -0,0 +1,3 @@
{
"target": "Class"
}
19 changes: 19 additions & 0 deletions databricks.yml
@@ -0,0 +1,19 @@
# This is a Databricks asset bundle definition for marvelous-databricks-course-MahaAmin.
# The Databricks extension requires a databricks.yml configuration file.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.

bundle:
name: marvelous-databricks-course-MahaAmin

targets:
dev:
mode: development
default: true
workspace:
host: https://adb-3537333413571968.8.azuredatabricks.net

## Optionally, there could be 'staging' or 'prod' targets here.
#
# prod:
# workspace:
# host: https://adb-3537333413571968.8.azuredatabricks.net
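With this definition in place, the bundle can be validated and deployed to the dev target using the Databricks CLI; a minimal sketch, assuming the CLI is already authenticated against the workspace above:

```
databricks bundle validate
databricks bundle deploy -t dev
```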
5 changes: 5 additions & 0 deletions dbconnect_example.py
@@ -0,0 +1,5 @@
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder.profile("adb-3537333413571968").getOrCreate()
df = spark.read.table("samples.nyctaxi.trips")
df.show(5)
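The `profile("adb-3537333413571968")` call assumes a profile with that name in `~/.databrickscfg`. A sketch of what such a profile might look like (token and cluster id are placeholders; the host is the one from databricks.yml):

```
# ~/.databrickscfg (illustrative); cluster_id is needed for classic compute with Databricks Connect.
[adb-3537333413571968]
host       = https://adb-3537333413571968.8.azuredatabricks.net
token      = <personal-access-token>
cluster_id = <existing-cluster-id>
```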
63 changes: 63 additions & 0 deletions main.py
@@ -0,0 +1,63 @@
import logging

import yaml
from colorama import Back, Fore, Style
from sklearn.metrics import classification_report

from fraud_credit_cards.data_processor import DataProcessor
from fraud_credit_cards.fraud_model import FraudModel


def print_evaluation(y_test, y_pred, accuracy):
print("Accuracy:", accuracy)
print("\n" + Back.BLUE + Fore.WHITE + "Classification Report" + Style.RESET_ALL)
report = classification_report(y_test, y_pred, output_dict=True)
for key, value in report.items():
if key in ["0", "1"]:
color = Fore.GREEN if value["precision"] > 0.8 else Fore.RED
print(f"Class {key}:")
print(f" Precision: {color}{value['precision']:.2f}{Style.RESET_ALL}")
color = Fore.GREEN if value["recall"] > 0.8 else Fore.RED
print(f" Recall: {color}{value['recall']:.2f}{Style.RESET_ALL}")
color = Fore.GREEN if value["f1-score"] > 0.8 else Fore.RED
print(f" F1-score: {color}{value['f1-score']:.2f}{Style.RESET_ALL}")
print(f" Support: {value['support']}")
else:
print(key + ":", value)


# configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# load configurations
with open("project_config.yml", "r") as file:
config = yaml.safe_load(file)

logger.info("Configuration loaded: ")
print(yaml.dump(config, default_flow_style=False))

# initialize DataProcessor
data_processor = DataProcessor("data/creditcard_2023.csv", config)

🛠️ Refactor suggestion

Avoid hardcoding file paths

The data file path should be configurable:

-data_processor = DataProcessor("data/creditcard_2023.csv", config)
+data_processor = DataProcessor(config.get('data_path', 'data/creditcard_2023.csv'), config)

Consider adding the data path to your configuration file.
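If that route is taken, the file main.py actually loads is project_config.yml, so the entry would live there. A sketch (only the target key is confirmed elsewhere in this PR, via config.json; data_path is the reviewer's suggested addition):

```
target: Class
data_path: data/creditcard_2023.csv
```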


logging.info("DataProcessor Initialized ...")

# preprocess the data
data_processor.preprocess_data()
logging.info("Data preprocessed ...")

# Split the data
X_train, X_test, y_train, y_test = data_processor.split_data()
logger.info("Data split into training and test sets.")
logger.debug(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

# Initialize and train model
model = FraudModel(data_processor.preprocessor)
model.train(X_train, y_train)
logger.info("Model training completed.")

# evaluate model
y_pred, accuracy, precision, recall, mse, f1 = model.evaluate(X_test, y_test)
logging.info("Model evaluation completed. ")

# print evaluation report
print_evaluation(y_test, y_pred, accuracy)
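The DataProcessor and FraudModel classes referenced above are not part of this view; inferred purely from how main.py calls them, their interfaces look roughly like the sketch below (parameter names and defaults are assumptions, not the actual implementation):

```
# Interface sketch inferred from main.py usage; the real classes live in the
# fraud_credit_cards package and may differ.
import pandas as pd
from sklearn.compose import ColumnTransformer


class DataProcessor:
    def __init__(self, csv_path: str, config: dict) -> None:
        self.df = pd.read_csv(csv_path)
        self.config = config
        self.preprocessor: ColumnTransformer | None = None  # built in preprocess_data()

    def preprocess_data(self) -> None:
        """Clean self.df and build the ColumnTransformer stored in self.preprocessor."""
        ...

    def split_data(self, test_size: float = 0.2, random_state: int = 42):
        """Split self.df into X_train, X_test, y_train, y_test using config["target"]."""
        ...


class FraudModel:
    def __init__(self, preprocessor: ColumnTransformer) -> None:
        self.preprocessor = preprocessor

    def train(self, X_train, y_train) -> None:
        """Fit the underlying classifier on the preprocessed features."""
        ...

    def evaluate(self, X_test, y_test):
        """Return y_pred, accuracy, precision, recall, mse, f1 on the test set."""
        ...
```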
Binary file not shown.
102 changes: 102 additions & 0 deletions notebooks/fraud_credit_cards.py
@@ -0,0 +1,102 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Credit Card 2023 Fraud Detection
# MAGIC
# MAGIC **Dataset:** [Kaggle-Credit-Card-Fraud-Detection-Dataset-2023](https://www.kaggle.com/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023/data)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Installing Packages

# COMMAND ----------

# MAGIC %pip install colorama==0.4.6 catboost==1.2.0 gecs==0.1.1

# COMMAND ----------

# MAGIC %md
# MAGIC ### Import Libraries

# COMMAND ----------

import warnings

import pandas as pd
from catboost import CatBoostClassifier
from colorama import Back, Fore, Style
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Reading Data From DB Catalog Volume

# COMMAND ----------

df_tr = pd.read_csv("/Volumes/fraud_credit_cards/data/credit_cards_2023/creditcard_2023.csv")


# COMMAND ----------

df_tr.head()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Data Preprocessing and Modeling

# COMMAND ----------

# Splitting the data into features and target
X = df_tr.drop("Class", axis=1)
y = df_tr["Class"]

# Define numeric features (remove categorical columns)
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Define preprocessing steps
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features)])

# Define the model
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", CatBoostClassifier(verbose=False))])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report with colors and heading
print("\n" + Back.BLUE + Fore.WHITE + "Classification Report" + Style.RESET_ALL)
report = classification_report(y_test, y_pred, output_dict=True)
for key, value in report.items():
if key in ["0", "1"]:
color = Fore.GREEN if value["precision"] > 0.8 else Fore.RED
print(f"Class {key}:")
print(f" Precision: {color}{value['precision']:.2f}{Style.RESET_ALL}")
color = Fore.GREEN if value["recall"] > 0.8 else Fore.RED
print(f" Recall: {color}{value['recall']:.2f}{Style.RESET_ALL}")
color = Fore.GREEN if value["f1-score"] > 0.8 else Fore.RED
print(f" F1-score: {color}{value['f1-score']:.2f}{Style.RESET_ALL}")
print(f" Support: {value['support']}")
else:
print(key + ":", value)

# COMMAND ----------