Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create Encoding step #4

Merged
merged 25 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
fb35cc9
first encoding solution
diegomarvid Apr 2, 2024
5d34015
handle high cardinality with ordinal encoder
diegomarvid Apr 2, 2024
62da964
improve logging
diegomarvid Apr 2, 2024
846adf4
swap ordinal encoding and target encoding, ensure output is in order
diegomarvid Apr 2, 2024
9f9b353
correctly handle dtypes and refactor class
diegomarvid Apr 3, 2024
42d8246
convert ordinal int to smallest possible int
diegomarvid Apr 3, 2024
045fc18
fix dtype preservation
diegomarvid Apr 3, 2024
aa85c96
remove unnecessary print from Generate Step
diegomarvid Apr 3, 2024
2a5c135
personalize encoders in json
diegomarvid Apr 4, 2024
7e6aa5d
minor change
diegomarvid Apr 4, 2024
ec50baa
check supported encoders in __init__ instead of execute
diegomarvid Apr 4, 2024
b5bd9a1
change saving method to save fitted encoders
diegomarvid Apr 4, 2024
a50bf60
remove unnecessary print
diegomarvid Apr 4, 2024
742eea5
fix fit encoders for predicting
diegomarvid Apr 4, 2024
e96f281
fix predict without target
diegomarvid Apr 4, 2024
69e5d4d
improve error message in explainer
diegomarvid Apr 4, 2024
7a41eeb
fix drop columns modification
diegomarvid Apr 4, 2024
17423cc
add feature map in json for configuration
diegomarvid Apr 5, 2024
4630a42
update default cardinality threshold
diegomarvid Apr 5, 2024
beecfa8
improve error handling in data container for explainer at training
diegomarvid Apr 5, 2024
6e9316c
update poetry
diegomarvid Apr 10, 2024
9328910
fix encode index issue
diegomarvid Apr 11, 2024
d0d1f43
change downcast to palf function
diegomarvid Apr 12, 2024
611e68e
improve docstring in column transformer function
diegomarvid Apr 12, 2024
095ef05
change target param to Generate Step
diegomarvid Apr 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion pipeline_lib/core/data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import pandas as pd
import yaml
from sklearn.compose import ColumnTransformer

from pipeline_lib.core.model import Model

Expand Down Expand Up @@ -166,7 +167,7 @@ def save(self, file_path: str, keys: Optional[Union[str, list[str]]] = None):
if isinstance(keys, str):
keys = [keys]

data_to_save = {k: self.data[k] for k in keys} if keys else self.data
data_to_save = {k: self.data.get(k) for k in keys} if keys else self.data

serialized_data = pickle.dumps(data_to_save)
data_size_bytes = sys.getsizeof(serialized_data)
Expand Down Expand Up @@ -485,6 +486,11 @@ def explainer(self) -> Any:
Any
The explainer stored in the DataContainer.
"""
if self.is_train:
raise ValueError(
"Explainer is only available for prediction. Pipeline was executed on training"
" mode."
)
return self["explainer"]

@explainer.setter
Expand Down Expand Up @@ -619,6 +625,30 @@ def is_train(self, value: bool):
"""
self["is_train"] = value

@property
def _encoder(self) -> ColumnTransformer:
    """
    Return the fitted encoder held by this DataContainer.

    Returns
    -------
    ColumnTransformer
        The encoder stored in the DataContainer.
    """
    # Stored under a fixed key; lookup goes through the container's
    # item access (defined elsewhere in this class).
    key = "encoder"
    return self[key]

@_encoder.setter
def _encoder(self, value: ColumnTransformer):
    """
    Store an encoder in this DataContainer.

    Parameters
    ----------
    value
        The encoder to be stored in the DataContainer.
    """
    key = "encoder"
    self[key] = value

def __eq__(self, other) -> bool:
"""
Compare this DataContainer with another for equality.
Expand Down
38 changes: 18 additions & 20 deletions pipeline_lib/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ class Pipeline:
step_registry = StepRegistry()
model_registry = ModelRegistry()

KEYS_TO_SAVE = ["model", "encoder", "_drop_columns", "target"]

def __init__(self, initial_data: Optional[DataContainer] = None):
self.steps = []
self.initial_data = initial_data
self.save_path = None
self.load_path = None
self.model_path = None
self.config = None
self.save_data_path = None

def add_steps(self, steps: list[PipelineStep]):
"""Add steps to the pipeline."""
Expand All @@ -35,22 +35,32 @@ def add_steps(self, steps: list[PipelineStep]):
def run(self, is_train: bool, save: bool = True) -> DataContainer:
"""Run the pipeline on the given data."""

data = DataContainer.from_pickle(self.load_path) if self.load_path else DataContainer()
data.is_train = is_train
if not self.save_data_path:
raise ValueError(
"A path for saving the data must be provided. Use the `save_data_path` attribute."
)

data = DataContainer()

if is_train:
steps_to_run = [step for step in self.steps if step.used_for_training]
self.logger.info("Training the pipeline")
else:
data = DataContainer.from_pickle(self.save_data_path)
steps_to_run = [step for step in self.steps if step.used_for_prediction]
self.logger.info("Predicting with the pipeline")

data.is_train = is_train

for i, step in enumerate(steps_to_run):
Pipeline.logger.info(
f"Running {step.__class__.__name__} - {i + 1} / {len(steps_to_run)}"
)
data = step.execute(data)

if is_train:
data.save(self.save_data_path, keys=self.KEYS_TO_SAVE)

if save:
self.save_run(data)

Expand Down Expand Up @@ -78,18 +88,15 @@ def from_json(cls, path: str) -> Pipeline:
if custom_steps_path:
cls.step_registry.load_and_register_custom_steps(custom_steps_path)

save_data_path = config["pipeline"].get("save_data_path")

pipeline = Pipeline()

pipeline.load_path = config.get("load_path")
pipeline.save_path = config.get("save_path")
pipeline.config = config
pipeline.save_data_path = save_data_path

steps = []

model_path = None
drop_columns = None
target = None

for step_config in config["pipeline"]["steps"]:
step_type = step_config["step_type"]
parameters = step_config.get("parameters", {})
Expand All @@ -103,15 +110,6 @@ def from_json(cls, path: str) -> Pipeline:
model_class_name = parameters.pop("model_class")
model_class = cls.model_registry.get_model_class(model_class_name)
parameters["model_class"] = model_class
model_path = parameters.get("save_path")
drop_columns = parameters.get("drop_columns")
target = parameters.get("target")

# if step type is prediction, add model path
if step_type == "PredictStep":
parameters["load_path"] = model_path
parameters["drop_columns"] = drop_columns
parameters["target"] = target

step_class = cls.step_registry.get_step_class(step_type)
step = step_class(**parameters)
Expand Down
Loading
Loading