-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add generate and clean steps to core
- Loading branch information
1 parent
67978af
commit 86edc65
Showing
21 changed files
with
1,572 additions
and
1,549 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,10 @@ __pycache__/ | |
# C extensions | ||
*.so | ||
|
||
*.joblib | ||
*.bin | ||
*.json | ||
|
||
# ignore examples folder | ||
examples/ | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,2 @@ | ||
from .data_container import DataContainer # noqa: F401 | ||
from .pipeline import Pipeline # noqa: F401 | ||
from .steps import PipelineStep # noqa: F401 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
from typing import Optional | ||
|
||
from pipeline_lib.core import DataContainer | ||
from pipeline_lib.core.steps.base import PipelineStep | ||
|
||
|
||
class CleanStep(PipelineStep):
    """Pipeline step that cleans the raw tabular data.

    Reads the frame stored under ``DataContainer.RAW``, optionally fills
    missing values, handles outliers and converts column dtypes, and stores
    the result under ``DataContainer.CLEAN``.
    """

    def __init__(
        self,
        fill_missing: Optional[dict] = None,
        remove_outliers: Optional[dict] = None,
        convert_dtypes: Optional[dict] = None,
    ):
        """Configure the cleaning operations.

        Args:
            fill_missing: Mapping of column name to the value used to fill
                missing entries in that column.
            remove_outliers: Mapping of column name to the outlier handling
                method, either ``"clip"`` or ``"drop"``. Outliers are values
                outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.
            convert_dtypes: Mapping of column name to the dtype passed to
                ``Series.astype``.
        """
        self.init_logger()
        self.fill_missing = fill_missing
        self.remove_outliers = remove_outliers
        self.convert_dtypes = convert_dtypes

    @staticmethod
    def _iqr_bounds(series):
        """Return the ``(lower, upper)`` 1.5 * IQR fences of *series*."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return q1 - (1.5 * iqr), q3 + (1.5 * iqr)

    def execute(self, data: DataContainer) -> DataContainer:
        """Clean the raw frame and store it under ``DataContainer.CLEAN``.

        Columns named in the configuration but absent from the frame are
        skipped with a warning, as are unsupported outlier methods.

        Args:
            data: Container holding the raw frame under ``DataContainer.RAW``
                (assumed to be a pandas DataFrame - TODO confirm).

        Returns:
            The same container, with the cleaned frame added.
        """
        self.logger.info("Cleaning tabular data...")

        # Work on a copy so the frame stored under RAW is not modified by
        # the column assignments below.
        df = data[DataContainer.RAW].copy()

        if self.fill_missing:
            for column, fill_value in self.fill_missing.items():
                if column in df.columns:
                    # Assign the result back instead of calling
                    # ``fillna(inplace=True)`` on the selected column: the
                    # chained in-place form is not guaranteed to update the
                    # parent frame (it is a no-op under Copy-on-Write).
                    df[column] = df[column].fillna(fill_value)
                    self.logger.info(
                        f"Filled missing values in column '{column}' with {fill_value}"
                    )
                else:
                    self.logger.warning(f"Column '{column}' not found in the DataFrame")

        if self.remove_outliers:
            for column, method in self.remove_outliers.items():
                if column not in df.columns:
                    self.logger.warning(f"Column '{column}' not found in the DataFrame")
                    continue

                if method == "clip":
                    lower_bound, upper_bound = self._iqr_bounds(df[column])
                    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
                    self.logger.info(f"Clipped outliers in column '{column}'")
                elif method == "drop":
                    lower_bound, upper_bound = self._iqr_bounds(df[column])
                    outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
                    # Copy the filtered rows so later column assignments act
                    # on an independent frame rather than on a slice.
                    df = df[~outliers].copy()
                    self.logger.info(f"Dropped outliers in column '{column}'")
                else:
                    self.logger.warning(f"Unsupported outlier removal method '{method}'")

        if self.convert_dtypes:
            for column, dtype in self.convert_dtypes.items():
                if column in df.columns:
                    df[column] = df[column].astype(dtype)
                    self.logger.info(f"Converted column '{column}' to {dtype}")
                else:
                    self.logger.warning(f"Column '{column}' not found in the DataFrame")

        data[DataContainer.CLEAN] = df

        return data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import os | ||
from enum import Enum | ||
|
||
import pandas as pd | ||
|
||
from pipeline_lib.core import DataContainer | ||
from pipeline_lib.core.steps.base import PipelineStep | ||
|
||
|
||
class FileType(Enum):
    """Supported input file types, keyed by lower-case file extension."""

    CSV = ".csv"
    PARQUET = ".parquet"
|
||
|
||
class GenerateStep(PipelineStep):
    """Pipeline step that loads a CSV or Parquet file into the container.

    The file type is inferred from the file extension and the loaded frame
    is stored under ``DataContainer.RAW``.
    """

    def __init__(self, path: str, **kwargs):
        """Configure the step.

        Args:
            path: Path of the file to load.
            **kwargs: Extra keyword arguments forwarded to the pandas reader.
                The special key ``index`` is not forwarded; when given, it is
                used with ``DataFrame.set_index`` after loading.
        """
        self.init_logger()
        self.file_path = path
        self.kwargs = kwargs

    def execute(self, data: DataContainer) -> DataContainer:
        """Load the configured file and store it under ``DataContainer.RAW``.

        Args:
            data: Container that receives the loaded frame.

        Returns:
            The same container, with the loaded frame added.

        Raises:
            FileNotFoundError: If the configured path does not exist.
            ValueError: If the file extension is not supported.
        """
        self.logger.info(f"Generating data from file: {self.file_path}")

        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"File not found: {self.file_path}")

        file_type = self._infer_file_type()

        if file_type == FileType.CSV:
            df = self._read_csv()
        elif file_type == FileType.PARQUET:
            df = self._read_parquet()
        else:
            # Guards against a FileType member added without a reader.
            raise ValueError(f"Unsupported file type: {file_type}")

        data[DataContainer.RAW] = df

        self.logger.info(f"Generated DataFrame with shape: {df.shape}")

        return data

    def _infer_file_type(self) -> FileType:
        """Map the (case-insensitive) file extension to a ``FileType``.

        Raises:
            ValueError: If the extension has no matching ``FileType``.
        """
        _, file_extension = os.path.splitext(self.file_path)
        file_extension = file_extension.lower()

        try:
            return FileType(file_extension)
        except ValueError as err:
            # Chain explicitly so the traceback shows this is a translation
            # of the enum lookup failure, not a second error.
            raise ValueError(f"Unsupported file extension: {file_extension}") from err

    def _read_csv(self) -> pd.DataFrame:
        """Read the file with ``pandas.read_csv``."""
        return self._read_with(pd.read_csv)

    def _read_parquet(self) -> pd.DataFrame:
        """Read the file with ``pandas.read_parquet``."""
        return self._read_with(pd.read_parquet)

    def _read_with(self, reader) -> pd.DataFrame:
        """Load the file with *reader* and apply the optional ``index`` key.

        Args:
            reader: Callable taking the file path plus keyword arguments and
                returning a DataFrame (e.g. ``pd.read_csv``).
        """
        # Copy so popping ``index`` does not alter the stored kwargs and the
        # step can be executed more than once.
        kwargs = self.kwargs.copy()
        index_col = kwargs.pop("index", None)
        df = reader(self.file_path, **kwargs)
        if index_col is not None:
            df = df.set_index(index_col)
        return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.