Skip to content

Commit 8a9dcac

Browse files
author
clachevv
committed
updating group scripts
1 parent d2bea37 commit 8a9dcac

File tree

6 files changed

+348
-9
lines changed

6 files changed

+348
-9
lines changed

notebooks/group_01/evidently_run.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Run the Evidently data quality, data drift and target drift outputs
for a reference and a current dataset, then run pytest from the current
working directory and fail the notebook if pytest reports a failure.
"""

import sys
import os

# Extend the import path so that the `src` package imported below can be found.
# NOTE(review): "../.." is resolved against the current working directory,
# so this only works when the notebook runs from its own folder - confirm.
sys.path.append(os.path.abspath("../.."))

from src.evidently_impl import load_reference_current_data, return_data_quality_metrics, return_data_drift_metrics, return_target_drift_metrics

import pandas as pd
import numpy as np
import pytest

# COMMAND ----------

# Names of the datasets to compare; they are passed to
# load_reference_current_data, which decides how they map to files.
path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
    path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
# The call is the last expression of the cell, so its result is what the
# notebook displays.
return_data_quality_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# NOTE(review): repo_name is only referenced by the commented-out os.chdir
# call further down, so it currently has no effect.
repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_01"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
# `dbutils` is not imported here; it is expected to be provided by the
# Databricks notebook runtime.
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
# NOTE(review): two dirname() calls give the parent of the notebook's folder.
# For a notebook stored under notebooks/group_01 that looks like the
# notebooks/ folder rather than the repo root - confirm. The value is not
# used anywhere below.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo.
# The chdir is disabled, so pytest runs from whatever directory is printed here.
# os.chdir(f"{repo_name}")
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
# "." collects tests from the current working directory, "-v" is verbose
# output, and "-p no:cacheprovider" disables pytest's cache plugin so that no
# .pytest_cache directory is written.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."
66+
67+
# COMMAND ----------
68+
69+
#

notebooks/group_01/test_drift.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Purpose: Loads json test files and performs data tests.
5+
"""
6+
7+
import sys
8+
import os
9+
10+
project_root_dir = os.path.abspath("../../..")
11+
sys.path.append(project_root_dir)
12+
root_dir = '/Workspace/Repos/[email protected]/evidently_implementation'
13+
14+
import pandas as pd
15+
import pytest
16+
import json
17+
18+
@pytest.fixture
def loading_data_quality_json_as_dict() -> dict:
    """Load the "Data Quality" results written by the Evidently run.

    The file is read from ``<root_dir>/data/output/data_quality.json``, where
    ``root_dir`` is the module-level repository path.

    Returns:
        dict: parsed content of ``data_quality.json``.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_quality.json")

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which varies between environments.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
32+
33+
34+
@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
    """Load the "Data Drift" results written by the Evidently run.

    The file is read from ``<root_dir>/data/output/data_drift.json``, where
    ``root_dir`` is the module-level repository path.

    Returns:
        dict: parsed content of ``data_drift.json``.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_drift.json")

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which varies between environments.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
48+
49+
50+
@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
    """Load the "Target Drift" results written by the Evidently run.

    The file is read from ``<root_dir>/data/output/target_drift.json``, where
    ``root_dir`` is the module-level repository path.

    Returns:
        dict: parsed content of ``target_drift.json``.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "target_drift.json")

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which varies between environments.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
64+
65+
66+
def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
    """Compare the share of failed data quality tests with the 50% mark.

    NOTE(review): the check itself is still to be written, so this test
    currently passes without asserting anything. Whether the share must stay
    below or above 50% needs to be confirmed.

    Args:
        loading_data_quality_json_as_dict (dict): evidently output json info
    """
    #### WRITE YOUR CODE HERE
73+
74+
75+
def test_pickup_date_drift(loading_data_drift_json_as_dict: dict) -> None:
    """Compare the data drift of the pickup date feature with its threshold.

    NOTE(review): the check itself is still to be written, so this test
    currently passes without asserting anything. The feature was previously
    described as ``pickup_day``; confirm the exact column name in the drift
    output.

    Args:
        loading_data_drift_json_as_dict (dict): evidently output json info
    """
    #### WRITE YOUR CODE HERE
82+
83+
84+
def test_negative_kendall_feature_target_correlation(
    loading_target_drift_json_as_dict: dict,
) -> None:
    """Look for features of the current dataset whose Kendall correlation
    with the target is negative.

    NOTE(review): the check itself is still to be written, so this test
    currently passes without asserting anything.

    Args:
        loading_target_drift_json_as_dict (dict): evidently output json info
    """
    #### WRITE YOUR CODE HERE
92+

notebooks/group_02/evidently_run.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Run the Evidently data quality, data drift and target drift outputs
for a reference and a current dataset, then run pytest from the current
working directory and fail the notebook if pytest reports a failure.
"""

import sys
import os

# Extend the import path so that the `src` package imported below can be found.
# NOTE(review): "../.." is resolved against the current working directory,
# so this only works when the notebook runs from its own folder - confirm.
sys.path.append(os.path.abspath("../.."))
# NOTE(review): root_dir is not referenced again in this script.
root_dir = '/Workspace/Repos/[email protected]/evidently_implementation'

from src.evidently_impl import load_reference_current_data, return_data_quality_metrics, return_data_drift_metrics, return_target_drift_metrics

import pandas as pd
import numpy as np
import pytest
import json

# COMMAND ----------

# Names of the datasets to compare; they are passed to
# load_reference_current_data, which decides how they map to files.
path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
    path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
# The call is the last expression of the cell, so its result is what the
# notebook displays.
return_data_quality_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict = dataset_dict).show(mode='inline')

# COMMAND ----------

# NOTE(review): repo_name is only referenced by the commented-out os.chdir
# call further down, so it currently has no effect.
repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_02"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
# `dbutils` is not imported here; it is expected to be provided by the
# Databricks notebook runtime.
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
# NOTE(review): two dirname() calls give the parent of the notebook's folder.
# For a notebook stored under notebooks/group_02 that looks like the
# notebooks/ folder rather than the repo root - confirm. The value is not
# used anywhere below.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo.
# The chdir is disabled, so pytest runs from whatever directory is printed here.
# os.chdir(f"{repo_name}")
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
# "." collects tests from the current working directory, "-v" is verbose
# output, and "-p no:cacheprovider" disables pytest's cache plugin so that no
# .pytest_cache directory is written.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."
68+
69+
# COMMAND ----------
70+
71+

notebooks/group_02/test_drift.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Purpose: Loads json test files and performs data tests.
5+
"""
6+
7+
import sys
8+
import os
9+
10+
project_root_dir = os.path.abspath("../../..")
11+
sys.path.append(project_root_dir)
12+
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"
13+
14+
import pandas as pd
15+
import pytest
16+
import json
17+
18+
19+
@pytest.fixture
def loading_data_quality_json_as_dict() -> dict:
    """Load the "Data Quality" results written by the Evidently run.

    The file is read from ``<root_dir>/data/output/data_quality.json``, where
    ``root_dir`` is the module-level repository path.

    Returns:
        dict: parsed content of ``data_quality.json``.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_quality.json")

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which varies between environments.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
33+
34+
35+
@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
    """Load the "Data Drift" results written by the Evidently run.

    The file is read from ``<root_dir>/data/output/data_drift.json``, where
    ``root_dir`` is the module-level repository path.

    Returns:
        dict: parsed content of ``data_drift.json``.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "data_drift.json")

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which varies between environments.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
49+
50+
51+
@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
    """Load the "Target Drift" results written by the Evidently run.

    The file is read from ``<root_dir>/data/output/target_drift.json``, where
    ``root_dir`` is the module-level repository path.

    Returns:
        dict: parsed content of ``target_drift.json``.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    file_path = os.path.join(root_dir, "data/output", "target_drift.json")

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which varies between environments.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
65+
66+
67+
def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
    """Compare the share of failed data quality tests with the 50% mark.

    NOTE(review): the check itself is still to be written, so this test
    currently passes without asserting anything. The stated intent is that
    failed tests stay below 50% of the total; confirm before implementing.

    Args:
        loading_data_quality_json_as_dict (dict): evidently output json info
    """

    ##### WRITE YOUR CODE HERE
75+
76+
77+
def test_pickup_date_data_drift(loading_data_drift_json_as_dict: dict) -> None:
    """Compare the data drift of a single feature with its threshold.

    NOTE(review): the check itself is still to be written, so this test
    currently passes without asserting anything. The test name refers to the
    pickup date while the earlier description named ``dropoff_latitude``;
    confirm which feature is meant before implementing.

    Args:
        loading_data_drift_json_as_dict (dict): evidently output json info
    """

    ##### WRITE YOUR CODE HERE
85+
86+
87+
def test_negative_kendall_feature_target_correlation(
    loading_target_drift_json_as_dict: dict,
) -> None:
    """Look for features of the current dataset whose Kendall correlation
    with the target is negative.

    NOTE(review): the check itself is still to be written, so this test
    currently passes without asserting anything.

    Args:
        loading_target_drift_json_as_dict (dict): evidently output json info
    """

    ##### WRITE YOUR CODE HERE

src/evidently_impl.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def load_reference_current_data(path_reference: str, path_current: str) -> dict:
4242
dataset_dict = {}
4343
for name, path in zip(["reference", "current"], [path_reference, path_current]):
4444
dataset = pd.read_csv(
45-
f"data/processed/{path}.csv"
45+
f"/Workspace/Repos/[email protected]/evidently_implementation/data/processed/{path}.csv"
4646
)
4747

4848
dataset_sample = dataset.sample(n=5000, replace=False)
@@ -69,9 +69,11 @@ def return_data_quality_metrics(dataset_dict: dict) -> None:
6969
reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
7070
)
7171
data_quality_test_suite.save_json(
72-
"data/output/data_quality.json"
72+
"/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_quality.json"
7373
)
7474

75+
return data_quality_test_suite
76+
7577

7678
def return_data_drift_metrics(dataset_dict: dict) -> None:
7779
"""Returns data drift metrics
@@ -85,9 +87,11 @@ def return_data_drift_metrics(dataset_dict: dict) -> None:
8587
reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
8688
)
8789
report.save_json(
88-
"data/output/data_drift.json"
90+
"/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_drift.json"
8991
)
9092

93+
return report
94+
9195

9296
def return_target_drift_metrics(dataset_dict: dict) -> None:
9397
"""Returns target drift metrics
@@ -103,12 +107,15 @@ def return_target_drift_metrics(dataset_dict: dict) -> None:
103107
)
104108

105109
num_target_drift_report.run(
106-
reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
110+
reference_data=dataset_dict["reference"],
111+
current_data=dataset_dict["current"]
107112
)
108113
num_target_drift_report.save_json(
109114
"data/output/target_drift.json"
110115
)
111116

117+
return num_target_drift_report
118+
112119

113120
def create_all_presets_metrics():
114121
"""Run all presets metrics

0 commit comments

Comments
 (0)