Skip to content

Commit 86edc65

Browse files
committed
add generate and clean steps to core
1 parent 67978af commit 86edc65

21 files changed

+1572
-1549
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ __pycache__/
66
# C extensions
77
*.so
88

9+
*.joblib
10+
*.bin
11+
*.json
12+
913
# ignore examples folder
1014
examples/
1115

pipeline_lib/core/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
from .data_container import DataContainer # noqa: F401
2-
from .pipeline import Pipeline # noqa: F401
32
from .steps import PipelineStep # noqa: F401

pipeline_lib/core/step_registry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
import pkgutil
55

6-
from pipeline_lib.core.steps import PipelineStep
6+
from pipeline_lib.core.steps.base import PipelineStep
77

88

99
class StepClassNotFoundError(Exception):

pipeline_lib/core/steps/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
from .calculate_features import CalculateFeaturesStep # noqa: F401
44
from .calculate_metrics import CalculateMetricsStep # noqa: F401
55
from .calculate_reports import CalculateReportsStep # noqa: F401
6+
from .clean import CleanStep # noqa: F401
67
from .encode import EncodeStep # noqa: F401
78
from .explainer_dashboard import ExplainerDashboardStep # noqa: F401
89
from .fit_encoders import FitEncodersStep # noqa: F401
910
from .fit_model import FitModelStep # noqa: F401
11+
from .generate import GenerateStep # noqa: F401
1012
from .input_scaling import InputScalingStep # noqa: F401
1113
from .predict import PredictStep # noqa: F401
1214
from .tabular_split import TabularSplitStep # noqa: F401

pipeline_lib/core/steps/augment.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
from typing import Optional
22

33
from pipeline_lib.core import DataContainer
4-
5-
from .base import PipelineStep
4+
from pipeline_lib.core.steps.base import PipelineStep
65

76

87
class AugmentStep(PipelineStep):

pipeline_lib/core/steps/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from abc import ABC, abstractmethod
33
from typing import Optional
44

5-
from pipeline_lib.core import DataContainer
5+
from pipeline_lib.core.data_container import DataContainer
66

77

88
class PipelineStep(ABC):

pipeline_lib/core/steps/calculate_features.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
from typing import Optional
22

33
from pipeline_lib.core import DataContainer
4-
5-
from .base import PipelineStep
4+
from pipeline_lib.core.steps.base import PipelineStep
65

76

87
class CalculateFeaturesStep(PipelineStep):

pipeline_lib/core/steps/calculate_metrics.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
from sklearn.metrics import mean_absolute_error, mean_squared_error
33

44
from pipeline_lib.core import DataContainer
5-
6-
from .base import PipelineStep
5+
from pipeline_lib.core.steps.base import PipelineStep
76

87

98
class CalculateMetricsStep(PipelineStep):

pipeline_lib/core/steps/calculate_reports.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
from typing import Optional
22

33
from pipeline_lib.core import DataContainer
4-
5-
from .base import PipelineStep
4+
from pipeline_lib.core.steps.base import PipelineStep
65

76

87
class CalculateReportsStep(PipelineStep):

pipeline_lib/core/steps/clean.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from typing import Optional
2+
3+
from pipeline_lib.core import DataContainer
4+
from pipeline_lib.core.steps.base import PipelineStep
5+
6+
7+
class CleanStep(PipelineStep):
    """Pipeline step that cleans the raw tabular data.

    The step reads the frame stored under ``DataContainer.RAW``, applies the
    configured operations in a fixed order (fill missing values, handle
    outliers, convert dtypes) and stores the result under
    ``DataContainer.CLEAN``.

    NOTE(review): the frame is assumed to be a pandas ``DataFrame`` (the code
    relies on ``.columns``, ``.quantile``, ``.clip``, ``.fillna`` and
    ``.astype``) -- confirm against the step that populates ``RAW``.
    """

    # Multiplier applied to the inter-quartile range when computing the
    # lower/upper fences used for outlier handling.
    _IQR_FACTOR = 1.5

    def __init__(
        self,
        fill_missing: Optional[dict] = None,
        remove_outliers: Optional[dict] = None,
        convert_dtypes: Optional[dict] = None,
    ):
        """Configure the cleaning operations.

        Args:
            fill_missing: Mapping of column name -> value used to replace
                missing entries in that column.
            remove_outliers: Mapping of column name -> method, where method
                is ``"clip"`` (clamp values to the IQR fences) or ``"drop"``
                (remove rows outside the IQR fences).
            convert_dtypes: Mapping of column name -> dtype passed to
                ``astype`` for that column.
        """
        self.init_logger()
        self.fill_missing = fill_missing
        self.remove_outliers = remove_outliers
        self.convert_dtypes = convert_dtypes

    def execute(self, data: DataContainer) -> DataContainer:
        """Clean ``data[DataContainer.RAW]`` and store it as ``CLEAN``.

        Args:
            data: Container holding the raw frame under ``DataContainer.RAW``.

        Returns:
            The same container, with the cleaned frame stored under
            ``DataContainer.CLEAN``.
        """
        self.logger.info("Cleaning tabular data...")

        # Work on a copy so the RAW frame is left untouched; otherwise the
        # column assignments below would also rewrite the raw data and RAW
        # and CLEAN would alias the same object.
        df = data[DataContainer.RAW].copy()

        df = self._fill_missing(df)
        df = self._remove_outliers(df)
        df = self._convert_dtypes(df)

        data[DataContainer.CLEAN] = df

        return data

    def _fill_missing(self, df):
        """Replace missing values column by column per ``self.fill_missing``."""
        if not self.fill_missing:
            return df

        for column, fill_value in self.fill_missing.items():
            if column not in df.columns:
                self.logger.warning(f"Column '{column}' not found in the DataFrame")
                continue
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selected column: the chained in-place form does not
            # update ``df`` when pandas Copy-on-Write is active.
            df[column] = df[column].fillna(fill_value)
            self.logger.info(
                f"Filled missing values in column '{column}' with {fill_value}"
            )
        return df

    def _remove_outliers(self, df):
        """Clip or drop IQR outliers column by column per ``self.remove_outliers``."""
        if not self.remove_outliers:
            return df

        for column, method in self.remove_outliers.items():
            if column not in df.columns:
                self.logger.warning(f"Column '{column}' not found in the DataFrame")
                continue

            if method == "clip":
                lower_bound, upper_bound = self._iqr_bounds(df[column])
                df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
                self.logger.info(f"Clipped outliers in column '{column}'")
            elif method == "drop":
                lower_bound, upper_bound = self._iqr_bounds(df[column])
                outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
                # Explicit copy so later column assignments operate on an
                # independent frame rather than on a filtered selection.
                df = df[~outliers].copy()
                self.logger.info(f"Dropped outliers in column '{column}'")
            else:
                self.logger.warning(f"Unsupported outlier removal method '{method}'")
        return df

    def _convert_dtypes(self, df):
        """Cast columns to the dtypes listed in ``self.convert_dtypes``."""
        if not self.convert_dtypes:
            return df

        for column, dtype in self.convert_dtypes.items():
            if column not in df.columns:
                self.logger.warning(f"Column '{column}' not found in the DataFrame")
                continue
            df[column] = df[column].astype(dtype)
            self.logger.info(f"Converted column '{column}' to {dtype}")
        return df

    @classmethod
    def _iqr_bounds(cls, series):
        """Return ``(lower, upper)`` fences: Q1/Q3 -/+ ``_IQR_FACTOR`` * IQR."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return q1 - (cls._IQR_FACTOR * iqr), q3 + (cls._IQR_FACTOR * iqr)

0 commit comments

Comments
 (0)