-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
clachevv
committed
Sep 20, 2023
1 parent
d2bea37
commit 8a9dcac
Showing
6 changed files
with
348 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Build the Evidently outputs (data quality, data drift, target drift)
for a reference/current dataset pair, display them inline, and then run the
pytest suite found in the current working directory.
"""

import sys
import os

# Make the `src` package importable. os.path.abspath resolves "../.." against
# the current working directory, not against this file's location.
sys.path.append(os.path.abspath("../.."))

from src.evidently_impl import load_reference_current_data, return_data_quality_metrics, return_data_drift_metrics, return_target_drift_metrics

import pandas as pd
import numpy as np
import pytest

# COMMAND ----------

# Dataset names handed to the loader as the reference and current inputs.
path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
    path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
return_data_quality_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# NOTE: repo_name, notebook_path and repo_root are computed below but are not
# used by any active statement in this cell (the os.chdir call is disabled).
repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_01"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo.
# os.chdir(f"{repo_name}")
# With the chdir disabled, pytest collects from whatever directory is printed here.
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
# "." collects tests from the current working directory, "-v" is verbose
# output, and "-p no:cacheprovider" disables pytest's cache plugin.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."

# COMMAND ----------
|
||
# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
Purpose: Loads json test files and performs data tests. | ||
""" | ||
|
||
import sys | ||
import os | ||
|
||
# Add the directory three levels above the current working directory to
# sys.path (os.path.abspath resolves relative paths against the CWD).
project_root_dir = os.path.abspath("../../..")
sys.path.append(project_root_dir)
# Absolute workspace location of the repository; the fixtures in this module
# read the Evidently JSON outputs from "data/output" beneath it.
root_dir = '/Workspace/Repos/[email protected]/evidently_implementation'
|
||
import pandas as pd | ||
import pytest | ||
import json | ||
|
||
@pytest.fixture
def loading_data_quality_json_as_dict() -> dict:
    """Load the "Data Quality" results produced by the Evidently.ai lib.

    Reads ``data/output/data_quality.json`` located under ``root_dir``.

    Returns:
        dict: data quality metrics parsed from the JSON file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_quality.json")

    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
    """Load the "Data Drift" results produced by the Evidently.ai lib.

    Reads ``data/output/data_drift.json`` located under ``root_dir``.

    Returns:
        dict: data drift metrics parsed from the JSON file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_drift.json")

    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
    """Load the "Target Drift" results produced by the Evidently.ai lib.

    Reads ``data/output/target_drift.json`` located under ``root_dir``.

    Returns:
        dict: target drift metrics parsed from the JSON file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "target_drift.json")

    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
def test_failed_tests_percentage(loading_data_quality_json_as_dict:dict) -> None:
    """Checks whether the number of failed tests is above 50% of the total tests.

    Args:
        loading_data_quality_json_as_dict (dict): evidently output json info
    """
    # Placeholder: the body holds no assertions yet, so this test always passes.
    #### WRITE YOUR CODE HERE
|
||
|
||
def test_pickup_date_drift(loading_data_drift_json_as_dict: dict) -> None:
    """Checks whether the data drift of the feature pickup_day is above the threshold.

    Args:
        loading_data_drift_json_as_dict (dict): evidently output json info
    """
    # NOTE(review): the function name says "pickup_date" while the docstring
    # refers to "pickup_day" -- confirm the intended feature name.
    # Placeholder: the body holds no assertions yet, so this test always passes.
    #### WRITE YOUR CODE HERE
|
||
|
||
def test_negative_kendall_feature_target_correlation(loading_target_drift_json_as_dict: dict) -> None:
    """Checks whether any feature in the current dataset has passed the
    kendall test for being negatively correlated with the target.

    Args:
        loading_target_drift_json_as_dict (dict): evidently output json info
    """
    # Placeholder: the body holds no assertions yet, so this test always passes.
    #### WRITE YOUR CODE HERE
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Build the Evidently outputs (data quality, data drift, target drift)
for a reference/current dataset pair, display them inline, and then run the
pytest suite found in the current working directory.
"""

import sys
import os

# Make the `src` package importable. os.path.abspath resolves "../.." against
# the current working directory, not against this file's location.
sys.path.append(os.path.abspath("../.."))
# NOTE: root_dir is assigned here but not referenced again in this notebook.
root_dir = '/Workspace/Repos/[email protected]/evidently_implementation'

from src.evidently_impl import load_reference_current_data, return_data_quality_metrics, return_data_drift_metrics, return_target_drift_metrics

import pandas as pd
import numpy as np
import pytest
import json

# COMMAND ----------

# Dataset names handed to the loader as the reference and current inputs.
path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
    path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
return_data_quality_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# NOTE: repo_name, notebook_path and repo_root are computed below but are not
# used by any active statement in this cell (the os.chdir call is disabled).
repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_02"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo.
# os.chdir(f"{repo_name}")
# With the chdir disabled, pytest collects from whatever directory is printed here.
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
# "." collects tests from the current working directory, "-v" is verbose
# output, and "-p no:cacheprovider" disables pytest's cache plugin.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."

# COMMAND ----------
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
Purpose: Loads json test files and performs data tests. | ||
""" | ||
|
||
import sys | ||
import os | ||
|
||
# Add the directory three levels above the current working directory to
# sys.path (os.path.abspath resolves relative paths against the CWD).
project_root_dir = os.path.abspath("../../..")
sys.path.append(project_root_dir)
# Absolute workspace location of the repository; the fixtures in this module
# read the Evidently JSON outputs from "data/output" beneath it.
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"
|
||
import pandas as pd | ||
import pytest | ||
import json | ||
|
||
|
||
@pytest.fixture
def loading_data_quality_json_as_dict() -> dict:
    """Load the "Data Quality" results produced by the Evidently.ai lib.

    Reads ``data/output/data_quality.json`` located under ``root_dir``.

    Returns:
        dict: data quality metrics parsed from the JSON file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_quality.json")

    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
    """Load the "Data Drift" results produced by the Evidently.ai lib.

    Reads ``data/output/data_drift.json`` located under ``root_dir``.

    Returns:
        dict: data drift metrics parsed from the JSON file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_drift.json")

    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
    """Load the "Target Drift" results produced by the Evidently.ai lib.

    Reads ``data/output/target_drift.json`` located under ``root_dir``.

    Returns:
        dict: target drift metrics parsed from the JSON file.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "target_drift.json")

    # JSON is UTF-8 by specification; be explicit so the read does not depend
    # on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
|
||
|
||
def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
    """Checks whether the number of failed tests is below 50% of the total tests.

    Args:
        loading_data_quality_json_as_dict (dict): evidently output json info
    """

    # Placeholder: the body holds no assertions yet, so this test always passes.
    ##### WRITE YOUR CODE HERE
|
||
|
||
def test_pickup_date_data_drift(loading_data_drift_json_as_dict: dict) -> None:
    """Checks whether the data drift of the feature dropoff_latitude is above the threshold.

    Args:
        loading_data_drift_json_as_dict (dict): evidently output json info
    """

    # NOTE(review): the function name says "pickup_date" while the docstring
    # refers to "dropoff_latitude" -- confirm which feature is intended.
    # Placeholder: the body holds no assertions yet, so this test always passes.
    ##### WRITE YOUR CODE HERE
|
||
|
||
def test_negative_kendall_feature_target_correlation(
    loading_target_drift_json_as_dict: dict,
) -> None:
    """Checks whether any feature in the current dataset has passed the
    kendall test for being negatively correlated with the target.

    Args:
        loading_target_drift_json_as_dict (dict): evidently output json info
    """

    # Placeholder: the body holds no assertions yet, so this test always passes.
    ##### WRITE YOUR CODE HERE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,7 +42,7 @@ def load_reference_current_data(path_reference: str, path_current: str) -> dict: | |
dataset_dict = {} | ||
for name, path in zip(["reference", "current"], [path_reference, path_current]): | ||
dataset = pd.read_csv( | ||
f"data/processed/{path}.csv" | ||
f"/Workspace/Repos/[email protected]/evidently_implementation/data/processed/{path}.csv" | ||
) | ||
|
||
dataset_sample = dataset.sample(n=5000, replace=False) | ||
|
@@ -69,9 +69,11 @@ def return_data_quality_metrics(dataset_dict: dict) -> None: | |
reference_data=dataset_dict["reference"], current_data=dataset_dict["current"] | ||
) | ||
data_quality_test_suite.save_json( | ||
"data/output/data_quality.json" | ||
"/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_quality.json" | ||
) | ||
|
||
return data_quality_test_suite | ||
|
||
|
||
def return_data_drift_metrics(dataset_dict: dict) -> None: | ||
"""Returns data drift metrics | ||
|
@@ -85,9 +87,11 @@ def return_data_drift_metrics(dataset_dict: dict) -> None: | |
reference_data=dataset_dict["reference"], current_data=dataset_dict["current"] | ||
) | ||
report.save_json( | ||
"data/output/data_drift.json" | ||
"/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_drift.json" | ||
) | ||
|
||
return report | ||
|
||
|
||
def return_target_drift_metrics(dataset_dict: dict) -> None: | ||
"""Returns target drift metrics | ||
|
@@ -103,12 +107,15 @@ def return_target_drift_metrics(dataset_dict: dict) -> None: | |
) | ||
|
||
num_target_drift_report.run( | ||
reference_data=dataset_dict["reference"], current_data=dataset_dict["current"] | ||
reference_data=dataset_dict["reference"], | ||
current_data=dataset_dict["current"] | ||
) | ||
num_target_drift_report.save_json( | ||
"data/output/target_drift.json" | ||
) | ||
|
||
return num_target_drift_report | ||
|
||
|
||
def create_all_presets_metrics(): | ||
"""Run all presets metrics | ||
|
Oops, something went wrong.