Commit 8a9dcac

updating group scripts

clachevv committed Sep 20, 2023
1 parent d2bea37 commit 8a9dcac
Showing 6 changed files with 348 additions and 9 deletions.
69 changes: 69 additions & 0 deletions notebooks/group_01/evidently_run.py
@@ -0,0 +1,69 @@
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Run Evidently tests comparing a reference dataset with a current dataset.
"""

import sys
import os

sys.path.append(os.path.abspath("../.."))

from src.evidently_impl import (
    load_reference_current_data,
    return_data_quality_metrics,
    return_data_drift_metrics,
    return_target_drift_metrics,
)

import pandas as pd
import numpy as np
import pytest

# COMMAND ----------

path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
return_data_quality_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_01"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo (uncomment the chdir if the working
# directory is not already the repo folder).
# os.chdir(repo_name)
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."

# COMMAND ----------

#
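
The notebook above leans on helpers from src/evidently_impl.py, which the diff further down shows only in part. A minimal sketch of the pattern those helpers presumably follow, assuming Evidently ~0.4 (TestSuite/Report, DataQualityTestPreset, and DataDriftPreset are the library's names; the relative paths here are placeholders for the repo's real ones):

# Hedged sketch, not the repo's actual implementation.
import pandas as pd

from evidently.metric_preset import DataDriftPreset
from evidently.report import Report
from evidently.test_preset import DataQualityTestPreset
from evidently.test_suite import TestSuite


def load_reference_current_data(path_reference: str, path_current: str) -> dict:
    """Load both CSVs and downsample so the comparison stays cheap."""
    return {
        name: pd.read_csv(f"data/processed/{path}.csv").sample(n=5000, replace=False)
        for name, path in zip(["reference", "current"], [path_reference, path_current])
    }


def return_data_quality_metrics(dataset_dict: dict) -> TestSuite:
    """Run the data-quality test preset and persist its JSON verdicts."""
    suite = TestSuite(tests=[DataQualityTestPreset()])
    suite.run(
        reference_data=dataset_dict["reference"],
        current_data=dataset_dict["current"],
    )
    suite.save_json("data/output/data_quality.json")
    return suite


def return_data_drift_metrics(dataset_dict: dict) -> Report:
    """Run the data-drift preset report and persist its JSON output."""
    report = Report(metrics=[DataDriftPreset()])
    report.run(
        reference_data=dataset_dict["reference"],
        current_data=dataset_dict["current"],
    )
    report.save_json("data/output/data_drift.json")
    return report

Returning the object is what lets the notebook cells chain .show(mode="inline") directly; the target-drift helper would follow the same shape with a TargetDriftPreset report.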
92 changes: 92 additions & 0 deletions notebooks/group_01/test_drift.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python3

"""
Purpose: Load the Evidently JSON outputs and run data tests against them.
"""

import sys
import os

project_root_dir = os.path.abspath("../../..")
sys.path.append(project_root_dir)
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"

import pandas as pd
import pytest
import json

@pytest.fixture()
def loading_data_quality_json_as_dict() -> dict:
"""Returns "Data Quality" results from Evidently.ai lib
Returns:
dict: data quality metrics.
"""

    file_path = os.path.join(root_dir, "data/output", "data_quality.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
"""Returns "Data Drift" results from Evidently.ai lib
Returns:
dict: data drift metrics.
"""

    file_path = os.path.join(root_dir, "data/output", "data_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
"""Returns "Target Drift" results from Evidently.ai lib
Returns:
dict: target drift metrics.
"""

    file_path = os.path.join(root_dir, "data/output", "target_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
    """Checks whether the share of failed tests is above 50% of the total tests.
Args:
loading_data_quality_json_as_dict (dict): evidently output json info
"""
#### WRITE YOUR CODE HERE


def test_pickup_date_drift(loading_data_drift_json_as_dict: dict) -> None:
"""Checks whether the feature pickup_day data drift is above threshold.
Args:
loading_data_drift_json_as_dict (dict): evidently output json info
"""
#### WRITE YOUR CODE HERE


def test_negative_kendall_feature_target_correlation(loading_target_drift_json_as_dict: dict) -> None:
"""Checks whether any feature in the current dataset has passed the
kendall test for being negatively correlated with the target.
Args:
loading_target_drift_json_as_dict (dict): evidently output json info
"""
#### WRITE YOUR CODE HERE
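
The three stubs above are left as an exercise. One possible way to fill them in, sketched against the JSON layout Evidently ~0.4 emits; the key paths ("tests"/"status", "drift_by_columns"/"drift_score", and the "kendall" correlation series) and the 0.1 threshold are assumptions to verify against the actual output files:

# Hedged sketch, not the official solution; key paths assume Evidently ~0.4 JSON.
import pytest

DRIFT_SCORE_THRESHOLD = 0.1  # assumed; the exercise does not pin a value


def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
    # TestSuite JSON keeps one entry per test under "tests", each with a "status";
    # fail the cell when more than half of the quality tests failed.
    tests = loading_data_quality_json_as_dict["tests"]
    failed = sum(1 for t in tests if t["status"] == "FAIL")
    assert failed / len(tests) <= 0.5, "more than half of the data-quality tests failed"


def test_pickup_date_drift(loading_data_drift_json_as_dict: dict) -> None:
    # DataDriftPreset nests per-column results under "drift_by_columns".
    for metric in loading_data_drift_json_as_dict["metrics"]:
        columns = metric.get("result", {}).get("drift_by_columns")
        if columns and "pickup_day" in columns:
            assert columns["pickup_day"]["drift_score"] <= DRIFT_SCORE_THRESHOLD
            return
    pytest.fail("pickup_day not found in the data-drift report")


def test_negative_kendall_feature_target_correlation(
    loading_target_drift_json_as_dict: dict,
) -> None:
    # Assumes a correlations entry whose "current" block maps the method name
    # ("kendall") to feature names in values["x"] and coefficients in values["y"].
    for metric in loading_target_drift_json_as_dict["metrics"]:
        kendall = metric.get("result", {}).get("current", {}).get("kendall")
        if not kendall:
            continue
        pairs = zip(kendall["values"]["x"], kendall["values"]["y"])
        negative = [feature for feature, coef in pairs if coef < 0]
        assert negative, "no feature is negatively correlated with the target"
        return
    pytest.fail("no Kendall correlations found in the target-drift report")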

71 changes: 71 additions & 0 deletions notebooks/group_02/evidently_run.py
@@ -0,0 +1,71 @@
# Databricks notebook source
#!/usr/bin/env python3

"""
Purpose: Run Evidently tests comparing a reference dataset with a current dataset.
"""

import sys
import os

sys.path.append(os.path.abspath("../.."))
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"

from src.evidently_impl import (
    load_reference_current_data,
    return_data_quality_metrics,
    return_data_drift_metrics,
    return_target_drift_metrics,
)

import pandas as pd
import numpy as np
import pytest
import json

# COMMAND ----------

path_reference = "train_01"
path_current = "train_02"

# loading reference and current data
dataset_dict = load_reference_current_data(
path_reference=path_reference, path_current=path_current
)

# COMMAND ----------

# DATA QUALITY EVIDENTLY OUTPUT
return_data_quality_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# DATA DRIFT EVIDENTLY OUTPUT
return_data_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

# TARGET DRIFT EVIDENTLY OUTPUT
return_target_drift_metrics(dataset_dict=dataset_dict).show(mode="inline")

# COMMAND ----------

repo_name = "/Repos/[email protected]/evidently_implementation/notebooks/group_02"

# Get the path to this notebook, for example "/Workspace/Repos/{username}/{repo-name}".
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Get the repo's root directory name.
repo_root = os.path.dirname(os.path.dirname(notebook_path))

# Prepare to run pytest from the repo (uncomment the chdir if the working
# directory is not already the repo folder).
# os.chdir(repo_name)
print(os.getcwd())

# Skip writing pyc files on a readonly filesystem.
sys.dont_write_bytecode = True

# Run pytest.
retcode = pytest.main([".", "-v", "-p", "no:cacheprovider"])

# Fail the cell execution if there are any test failures.
assert retcode == 0, "The pytest invocation failed. See the log for details."

# COMMAND ----------


97 changes: 97 additions & 0 deletions notebooks/group_02/test_drift.py
@@ -0,0 +1,97 @@
#!/usr/bin/env python3

"""
Purpose: Load the Evidently JSON outputs and run data tests against them.
"""

import sys
import os

project_root_dir = os.path.abspath("../../..")
sys.path.append(project_root_dir)
root_dir = "/Workspace/Repos/[email protected]/evidently_implementation"

import pandas as pd
import pytest
import json


@pytest.fixture()
def loading_data_quality_json_as_dict() -> dict:
"""Returns "Data Quality" results from Evidently.ai lib
Returns:
dict: data quality metrics.
"""

file_path = os.path.join(root_dir, "data/output", "data_quality.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_data_drift_json_as_dict() -> dict:
"""Returns "Data Drift" results from Evidently.ai lib
Returns:
dict: data drift metrics.
"""

file_path = os.path.join(root_dir, "data/output", "data_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


@pytest.fixture()
def loading_target_drift_json_as_dict() -> dict:
"""Returns "Target Drift" results from Evidently.ai lib
Returns:
dict: target drift metrics.
"""

file_path = os.path.join(root_dir, "data/output", "target_drift.json")

with open(file_path, "r") as file:
data = json.load(file)

return data


def test_failed_tests_percentage(loading_data_quality_json_as_dict: dict) -> None:
"""Checks whether the amount of failed tests are below 50% of the total tests.
Args:
loading_data_quality (dict): evidently output json info
"""

##### WRITE YOUR CODE HERE


def test_dropoff_latitude_data_drift(loading_data_drift_json_as_dict: dict) -> None:
    """Checks whether data drift for the dropoff_latitude feature is above the threshold.
Args:
loading_data_drift_json_as_dict (dict): evidently output json info
"""

##### WRITE YOUR CODE HERE


def test_negative_kendall_feature_target_correlation(
loading_target_drift_json_as_dict: dict,
) -> None:
"""Checks whether any feature in the current dataset has passed the
kendall test for being negatively correlated with the target.
Args:
loading_target_drift_json_as_dict (dict): evidently output json info
"""

##### WRITE YOUR CODE HERE
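
Group 02's stubs mirror group 01's, so only the dropoff_latitude check differs enough to sketch here, reusing the assumed "drift_by_columns" layout from the group_01 sketch above (the boolean "drift_detected" flag is likewise an assumption):

# Hedged sketch; same assumed Evidently ~0.4 JSON layout as the group_01 sketch.
import pytest


def test_dropoff_latitude_data_drift(loading_data_drift_json_as_dict: dict) -> None:
    # Use the per-column boolean verdict instead of a hand-picked threshold.
    for metric in loading_data_drift_json_as_dict["metrics"]:
        columns = metric.get("result", {}).get("drift_by_columns")
        if columns and "dropoff_latitude" in columns:
            assert not columns["dropoff_latitude"]["drift_detected"], (
                "dropoff_latitude drifted beyond the preset threshold"
            )
            return
    pytest.fail("dropoff_latitude not found in the data-drift report")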
15 changes: 11 additions & 4 deletions src/evidently_impl.py
@@ -42,7 +42,7 @@ def load_reference_current_data(path_reference: str, path_current: str) -> dict:
    dataset_dict = {}
    for name, path in zip(["reference", "current"], [path_reference, path_current]):
        dataset = pd.read_csv(
-            f"data/processed/{path}.csv"
+            f"/Workspace/Repos/[email protected]/evidently_implementation/data/processed/{path}.csv"
        )

        dataset_sample = dataset.sample(n=5000, replace=False)
@@ -69,9 +69,11 @@ def return_data_quality_metrics(dataset_dict: dict) -> None:
        reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
    )
    data_quality_test_suite.save_json(
-        "data/output/data_quality.json"
+        "/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_quality.json"
    )
+
+    return data_quality_test_suite


def return_data_drift_metrics(dataset_dict: dict) -> None:
"""Returns data drift metrics
@@ -85,9 +87,11 @@ def return_data_drift_metrics(dataset_dict: dict) -> None:
        reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
    )
    report.save_json(
-        "data/output/data_drift.json"
+        "/Workspace/Repos/[email protected]/evidently_implementation/data/output/data_drift.json"
    )
+
+    return report


def return_target_drift_metrics(dataset_dict: dict) -> None:
"""Returns target drift metrics
@@ -103,12 +107,15 @@ def return_target_drift_metrics(dataset_dict: dict) -> None:
    )

    num_target_drift_report.run(
-        reference_data=dataset_dict["reference"], current_data=dataset_dict["current"]
+        reference_data=dataset_dict["reference"],
+        current_data=dataset_dict["current"]
    )
    num_target_drift_report.save_json(
        "data/output/target_drift.json"
    )
+
+    return num_target_drift_report


def create_all_presets_metrics():
"""Run all presets metrics

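One loose end in this diff: the helpers now return the suite/report objects, but their annotations still read -> None. A sketch of the corrected signatures, assuming Evidently ~0.4 class names:

# Hedged sketch of the annotations that would match the new return statements.
from evidently.report import Report
from evidently.test_suite import TestSuite


def return_data_quality_metrics(dataset_dict: dict) -> TestSuite: ...
def return_data_drift_metrics(dataset_dict: dict) -> Report: ...
def return_target_drift_metrics(dataset_dict: dict) -> Report: ...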