refactor(data_preprocessor.py): improve data path resolution

Remove the hardcoded relative paths and use pathlib to resolve the data/ directory for the local CSV fallback.
end-to-end-mlops-databricks · Oct 23, 2024 · 7277287 · 7277287
1 parent b599df7
commit 7277287
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 71 deletions.
diff --git a/power_consumption/preprocessing/data_preprocessor.py b/power_consumption/preprocessing/data_preprocessor.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-import os
+from pathlib import Path
 from typing import Optional, Tuple
 
 import numpy as np
@@ -34,15 +34,21 @@ def load_data(self) -> None:
         """
         Load the dataset from UCI ML Repository or local CSV file.
 
-        Parameters
-        ----------
-        dataset_id : int
-            The ID of the dataset to fetch from UCI ML Repository.
+        This method attempts to load the dataset from the UCI ML Repository using the dataset ID
+        specified in the configuration. If that fails, it falls back to loading from a local CSV file.
+
+        The dataset ID is obtained from the configuration (self.config.dataset.id).
 
         Notes
         -----
         If loading from UCI ML Repository fails, the method will attempt to load
-        the data from '../data/Tetuan City power consumption.csv'.
+        the data from 'data/Tetuan City power consumption.csv' in the project root
+        (resolved from this file's location), then in the current working directory.
+
+        Raises
+        ------
+        Exception
+            If both UCI ML Repository fetch and local CSV file loading fail.
         """
         dataset_id = self.config.dataset.id
         try:
@@ -54,12 +60,19 @@ def load_data(self) -> None:
         except Exception as e:
             logger.warning(f"Failed to load data from UCI ML Repository: {e}")
             logger.info("Attempting to load data from local CSV file")
-            csv_path = "../data/Tetuan City power consumption.csv"
-            if not os.path.exists(csv_path):
-                csv_path = "./data/Tetuan City power consumption.csv"
+            data_dir = Path(__file__).resolve().parents[2] / "data"
+            csv_filename = "Tetuan City power consumption.csv"
+            csv_path = data_dir / csv_filename
+
+            if not csv_path.exists():
+                csv_path = Path.cwd() / "data" / csv_filename
+
             try:
                 self.data = pd.read_csv(csv_path)
                 logger.info(f"Successfully loaded data from {csv_path}")
+            except FileNotFoundError:
+                logger.error(f"CSV file not found at {csv_path}")
+                raise
             except Exception as e:
                 logger.error(f"Failed to load data from {csv_path}: {e}")
                 raise

diff --git a/tests/preprocessing/test_data_preprocessor.py b/tests/preprocessing/test_data_preprocessor.py
@@ -3,6 +3,7 @@
 import pytest
 from power_consumption.preprocessing.data_preprocessor import DataProcessor
 from power_consumption.config import Config
+from pathlib import Path
 
 
 @pytest.fixture
@@ -54,17 +55,23 @@ def test_load_data(project_config, mocker):
 
 
 def test_load_data_fallback(project_config, mocker):
-    mock_fetch = mocker.patch(
+    mocker.patch(
         "power_consumption.preprocessing.data_preprocessor.fetch_ucirepo",
         side_effect=Exception("UCI fetch failed"),
     )
+
     mock_read_csv = mocker.patch("pandas.read_csv")
     mock_read_csv.return_value = pd.DataFrame({"B": [4, 5, 6]})
 
+    mock_path = mocker.patch("power_consumption.preprocessing.data_preprocessor.Path")
+    mock_path.return_value.exists.return_value = True
+    mock_path.return_value.resolve.return_value.parents.__getitem__.return_value = Path("/mocked/project/root")
+
     processor = DataProcessor(config=project_config)
 
-    mock_fetch.assert_called_once_with(id=849)
-    mock_read_csv.assert_called_once_with("./data/Tetuan City power consumption.csv")
+    mock_read_csv.assert_called_once()
+
+    assert processor.data is not None
     assert processor.data.equals(pd.DataFrame({"B": [4, 5, 6]}))
 
 

diff --git a/uv.lock b/uv.lock