Commit 8a9dcac

updating group scripts

clachevv committed Sep 20, 2023
1 parent d2bea37 commit 8a9dcac
Showing 6 changed files with 348 additions and 9 deletions.
69 changes: 69 additions & 0 deletions notebooks/group_01/evidently_run.py
@@ -0,0 +1,69 @@
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Run Evidently tests comparing a reference dataset with a current dataset.
"""

import sys
import os

sys.path.append(os.path.abspath("../.."))

from src.evidently_impl import (
    load_reference_current_data,
    return_data_quality_metrics,
    return_data_drift_metrics,
    return_target_drift_metrics,
)

import pandas as pd
import numpy as np
import pytest

# COMMAND ----------

path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
return_data_quality_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_01"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo (uncomment the chdir if the working
# directory is not already the repo folder).
# os.chdir(repo_name)
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."

# COMMAND ----------

#
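
The notebook above leans on helpers from src/evidently_impl.py, which the diff further down shows only in part. A minimal sketch of the pattern those helpers presumably follow, assuming Evidently ~0.4 (TestSuite/Report, DataQualityTestPreset, and DataDriftPreset are the library's names; the relative paths here are placeholders for the repo's real ones):

# Hedged sketch, not the repo's actual implementation.
import pandas as pd

from evidently.metric_preset import DataDriftPreset
from evidently.report import Report
from evidently.test_preset import DataQualityTestPreset
from evidently.test_suite import TestSuite


def load_reference_current_data(path_reference: str, path_current: str) -> dict:
    """Load both CSVs and downsample so the comparison stays cheap."""
    return {
        name: pd.read_csv(f"data/processed/{path}.csv").sample(n=5000, replace=False)
        for name, path in zip(["reference", "current"], [path_reference, path_current])
    }


def return_data_quality_metrics(dataset_dict: dict) -> TestSuite:
    """Run the data-quality test preset and persist its JSON verdicts."""
    suite = TestSuite(tests=[DataQualityTestPreset()])
    suite.run(
        reference_data=dataset_dict["reference"],
        current_data=dataset_dict["current"],
    )
    suite.save_json("data/output/data_quality.json")
    return suite


def return_data_drift_metrics(dataset_dict: dict) -> Report:
    """Run the data-drift preset report and persist its JSON output."""
    report = Report(metrics=[DataDriftPreset()])
    report.run(
        reference_data=dataset_dict["reference"],
        current_data=dataset_dict["current"],
    )
    report.save_json("data/output/data_drift.json")
    return report

Returning the object is what lets the notebook cells chain .show(mode="inline") directly; the target-drift helper would follow the same shape with a TargetDriftPreset report.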
92 changes: 92 additions & 0 deletions notebooks/group_01/test_drift.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python3

"""
Purpose: Load the Evidently JSON outputs and run data tests against them.
"""

import sys
import os

project_root_dir = os.path.abspath("../../..")
sys.path.append(project_root_dir)
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"

import pandas as pd
import pytest
import json

@pytest.fixture()
def loading_data_quality_json_as_dict() -> dict:
"""Returns "Data Quality" results from Evidently.ai lib
Returns:
dict: data quality metrics.
"""

    file_path = os.path.join(root_dir, "data/output", "data_quality.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
"""Returns "Data Drift" results from Evidently.ai lib
Returns:
dict: data drift metrics.
"""

    file_path = os.path.join(root_dir, "data/output", "data_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
"""Returns "Target Drift" results from Evidently.ai lib
Returns:
dict: target drift metrics.
"""

    file_path = os.path.join(root_dir, "data/output", "target_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
    """Checks whether the share of failed tests is above 50% of the total tests.
Args:
loading_data_quality_json_as_dict (dict): evidently output json info
"""
#### WRITE YOUR CODE HERE


def test_pickup_date_drift(loading_data_drift_json_as_dict: dict) -> None:
"""Checks whether the feature pickup_day data drift is above threshold.
Args:
loading_data_drift_json_as_dict (dict): evidently output json info
"""
#### WRITE YOUR CODE HERE


def test_negative_kendall_feature_target_correlation(loading_target_drift_json_as_dict: dict) -> None:
"""Checks whether any feature in the current dataset has passed the
kendall test for being negatively correlated with the target.
Args:
loading_target_drift_json_as_dict (dict): evidently output json info
"""
#### WRITE YOUR CODE HERE
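
The three stubs above are left as an exercise. One possible way to fill them in, sketched against the JSON layout Evidently ~0.4 emits; the key paths ("tests"/"status", "drift_by_columns"/"drift_score", and the "kendall" correlation series) and the 0.1 threshold are assumptions to verify against the actual output files:

# Hedged sketch, not the official solution; key paths assume Evidently ~0.4 JSON.
import pytest

DRIFT_SCORE_THRESHOLD = 0.1  # assumed; the exercise does not pin a value


def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
    # TestSuite JSON keeps one entry per test under "tests", each with a "status";
    # fail the cell when more than half of the quality tests failed.
    tests = loading_data_quality_json_as_dict["tests"]
    failed = sum(1 for t in tests if t["status"] == "FAIL")
    assert failed / len(tests) <= 0.5, "more than half of the data-quality tests failed"


def test_pickup_date_drift(loading_data_drift_json_as_dict: dict) -> None:
    # DataDriftPreset nests per-column results under "drift_by_columns".
    for metric in loading_data_drift_json_as_dict["metrics"]:
        columns = metric.get("result", {}).get("drift_by_columns")
        if columns and "pickup_day" in columns:
            assert columns["pickup_day"]["drift_score"] <= DRIFT_SCORE_THRESHOLD
            return
    pytest.fail("pickup_day not found in the data-drift report")


def test_negative_kendall_feature_target_correlation(
    loading_target_drift_json_as_dict: dict,
) -> None:
    # Assumes a correlations entry whose "current" block maps the method name
    # ("kendall") to feature names in values["x"] and coefficients in values["y"].
    for metric in loading_target_drift_json_as_dict["metrics"]:
        kendall = metric.get("result", {}).get("current", {}).get("kendall")
        if not kendall:
            continue
        pairs = zip(kendall["values"]["x"], kendall["values"]["y"])
        negative = [feature for feature, coef in pairs if coef < 0]
        assert negative, "no feature is negatively correlated with the target"
        return
    pytest.fail("no Kendall correlations found in the target-drift report")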

71 changes: 71 additions & 0 deletions notebooks/group_02/evidently_run.py
@@ -0,0 +1,71 @@
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Run Evidently tests comparing a reference dataset with a current dataset.
"""

import sys
import os

sys.path.append(os.path.abspath("../.."))
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"

from src.evidently_impl import (
    load_reference_current_data,
    return_data_quality_metrics,
    return_data_drift_metrics,
    return_target_drift_metrics,
)

import pandas as pd
import numpy as np
import pytest
import json

# COMMAND ----------

path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
return_data_quality_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_02"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo (uncomment the chdir if the working
# directory is not already the repo folder).
# os.chdir(repo_name)
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."

# COMMAND ----------


97 changes: 97 additions & 0 deletions notebooks/group_02/test_drift.py
@@ -0,0 +1,97 @@
#!/usr/bin/env python3

"""
Purpose: Load the Evidently JSON outputs and run data tests against them.
"""

import sys
import os

project_root_dir = os.path.abspath("../../..")
sys.path.append(project_root_dir)
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"

import pandas as pd
import pytest
import json


@pytest.fixture()
def loading_data_quality_json_as_dict() -> dict:
"""Returns "Data Quality" results from Evidently.ai lib
Returns:
dict: data quality metrics.
"""

file_path = os.path.join(root_dir, "data/output", "data_quality.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
"""Returns "Data Drift" results from Evidently.ai lib
Returns:
dict: data drift metrics.
"""

file_path = os.path.join(root_dir, "data/output", "data_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
"""Returns "Target Drift" results from Evidently.ai lib
Returns:
dict: target drift metrics.
"""

file_path = os.path.join(root_dir, "data/output", "target_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
"""Checks whether the amount of failed tests are below 50% of the total tests.
Args:
loading_data_quality (dict): evidently output json info
"""

##### WRITE YOUR CODE HERE


def test_dropoff_latitude_data_drift(loading_data_drift_json_as_dict: dict) -> None:
    """Checks whether data drift for the dropoff_latitude feature is above the threshold.
Args:
loading_data_drift_json_as_dict (dict): evidently output json info
"""

##### WRITE YOUR CODE HERE


def test_negative_kendall_feature_target_correlation(
loading_target_drift_json_as_dict: dict,
) -> None:
"""Checks whether any feature in the current dataset has passed the
kendall test for being negatively correlated with the target.
Args:
loading_target_drift_json_as_dict (dict): evidently output json info
"""

##### WRITE YOUR CODE HERE
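
Group 02's stubs mirror group 01's, so only the dropoff_latitude check differs enough to sketch here, reusing the assumed "drift_by_columns" layout from the group_01 sketch above (the boolean "drift_detected" flag is likewise an assumption):

# Hedged sketch; same assumed Evidently ~0.4 JSON layout as the group_01 sketch.
import pytest


def test_dropoff_latitude_data_drift(loading_data_drift_json_as_dict: dict) -> None:
    # Use the per-column boolean verdict instead of a hand-picked threshold.
    for metric in loading_data_drift_json_as_dict["metrics"]:
        columns = metric.get("result", {}).get("drift_by_columns")
        if columns and "dropoff_latitude" in columns:
            assert not columns["dropoff_latitude"]["drift_detected"], (
                "dropoff_latitude drifted beyond the preset threshold"
            )
            return
    pytest.fail("dropoff_latitude not found in the data-drift report")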
15 changes: 11 additions & 4 deletions src/evidently_impl.py
@@ -42,7 +42,7 @@ def load_reference_current_data(path_reference: str, path_current: str) -> dict:
    dataset_dict = {}
    for name, path in zip(["reference", "current"], [path_reference, path_current]):
        dataset = pd.read_csv(
-            f"data/processed/{path}.csv"
+            f"/Workspace/Repos/[email protected]/evidently_implementation/data/processed/{path}.csv"
        )

        dataset_sample = dataset.sample(n=5000, replace=False)
@@ -69,9 +69,11 @@ def return_data_quality_metrics(dataset_dict: dict) -> None:
        reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
    )
    data_quality_test_suite.save_json(
-        "data/output/data_quality.json"
+        "/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_quality.json"
    )
+
+    return data_quality_test_suite


def return_data_drift_metrics(dataset_dict: dict) -> None:
"""Returns data drift metrics
@@ -85,9 +87,11 @@ def return_data_drift_metrics(dataset_dict: dict) -> None:
        reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
    )
    report.save_json(
-        "data/output/data_drift.json"
+        "/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_drift.json"
    )
+
+    return report


def return_target_drift_metrics(dataset_dict: dict) -> None:
"""Returns target drift metrics
@@ -103,12 +107,15 @@ def return_target_drift_metrics(dataset_dict: dict) -> None:
    )

    num_target_drift_report.run(
-        reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
+        reference_data=dataset_dict["reference"],
+        current_data=dataset_dict["current"]
    )
    num_target_drift_report.save_json(
        "data/output/target_drift.json"
    )
+
+    return num_target_drift_report


def create_all_presets_metrics():
"""Run all presets metrics

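One loose end in this diff: the helpers now return the suite/report objects, but their annotations still read -> None. A sketch of the corrected signatures, assuming Evidently ~0.4 class names:

# Hedged sketch of the annotations that would match the new return statements.
from evidently.report import Report
from evidently.test_suite import TestSuite


def return_data_quality_metrics(dataset_dict: dict) -> TestSuite: ...
def return_data_drift_metrics(dataset_dict: dict) -> Report: ...
def return_target_drift_metrics(dataset_dict: dict) -> Report: ...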