Skip to content

Commit

Permalink
refactor(data_preprocessor.py): improve data path resolution
Browse files Browse the repository at this point in the history
Remove the hardcoded relative paths and use pathlib to resolve the data/ directory when falling back to the local CSV file.
  • Loading branch information
Garett601 committed Oct 23, 2024
1 parent b599df7 commit 7277287
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 71 deletions.
31 changes: 22 additions & 9 deletions power_consumption/preprocessing/data_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import os
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
Expand Down Expand Up @@ -34,15 +34,21 @@ def load_data(self) -> None:
"""
Load the dataset from UCI ML Repository or local CSV file.
Parameters
----------
dataset_id : int
The ID of the dataset to fetch from UCI ML Repository.
This method attempts to load the dataset from the UCI ML Repository using the dataset ID
specified in the configuration. If that fails, it falls back to loading from a local CSV file.
The dataset ID is obtained from the configuration (self.config.dataset.id).
Notes
-----
If loading from UCI ML Repository fails, the method will attempt to load
the data from '../data/Tetuan City power consumption.csv'.
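the data from 'data/Tetuan City power consumption.csv' under the project root (resolved
relative to this module's location) or, if that file does not exist, from
'data/Tetuan City power consumption.csv' under the current working directory.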
Raises
------
Exception
If both UCI ML Repository fetch and local CSV file loading fail.
"""
dataset_id = self.config.dataset.id
try:
Expand All @@ -54,12 +60,19 @@ def load_data(self) -> None:
except Exception as e:
logger.warning(f"Failed to load data from UCI ML Repository: {e}")
logger.info("Attempting to load data from local CSV file")
csv_path = "../data/Tetuan City power consumption.csv"
if not os.path.exists(csv_path):
csv_path = "./data/Tetuan City power consumption.csv"
data_dir = Path(__file__).resolve().parents[2] / "data"
csv_filename = "Tetuan City power consumption.csv"
csv_path = data_dir / csv_filename

if not csv_path.exists():
csv_path = Path.cwd() / "data" / csv_filename

try:
self.data = pd.read_csv(csv_path)
logger.info(f"Successfully loaded data from {csv_path}")
except FileNotFoundError:
logger.error(f"CSV file not found at {csv_path}")
raise
except Exception as e:
logger.error(f"Failed to load data from {csv_path}: {e}")
raise
Expand Down
13 changes: 10 additions & 3 deletions tests/preprocessing/test_data_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest
from power_consumption.preprocessing.data_preprocessor import DataProcessor
from power_consumption.config import Config
from pathlib import Path


@pytest.fixture
Expand Down Expand Up @@ -54,17 +55,23 @@ def test_load_data(project_config, mocker):


def test_load_data_fallback(project_config, mocker):
mock_fetch = mocker.patch(
mocker.patch(
"power_consumption.preprocessing.data_preprocessor.fetch_ucirepo",
side_effect=Exception("UCI fetch failed"),
)

mock_read_csv = mocker.patch("pandas.read_csv")
mock_read_csv.return_value = pd.DataFrame({"B": [4, 5, 6]})

mock_path = mocker.patch("power_consumption.preprocessing.data_preprocessor.Path")
mock_path.return_value.exists.return_value = True
mock_path.return_value.resolve.return_value.parents.__getitem__.return_value = Path("/mocked/project/root")

processor = DataProcessor(config=project_config)

mock_fetch.assert_called_once_with(id=849)
mock_read_csv.assert_called_once_with("./data/Tetuan City power consumption.csv")
mock_read_csv.assert_called_once()

assert processor.data is not None
assert processor.data.equals(pd.DataFrame({"B": [4, 5, 6]}))


Expand Down
118 changes: 59 additions & 59 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7277287

Please sign in to comment.