Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

converted OneHotEncodingTransformer to narwhals #356

Merged
merged 4 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Changed
^^^^^^^

- converted DropOriginalMixin to narwhals `#352 <https://github.com/lvgig/tubular/issues/352>`_
- placeholder
- converted OneHotEncodingTransformer to narwhals `#355 <https://github.com/lvgig/tubular/issues/355>`_
- placeholder
- placeholder
- placeholder
Expand Down
254 changes: 155 additions & 99 deletions tests/nominal/test_OneHotEncodingTransformer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import numpy as np
import pandas as pd
import narwhals as nw
import pytest
import test_aide as ta
from test_BaseNominalTransformer import GenericNominalTransformTests

import tests.test_data as d
Expand All @@ -13,6 +11,7 @@
GenericTransformTests,
SeparatorInitMixintests,
)
from tests.utils import assert_frame_equal_dispatch, dataframe_init_dispatch
from tubular.nominal import OneHotEncodingTransformer


Expand All @@ -35,9 +34,13 @@ class TestFit(GenericFitTests):
def setup_class(cls):
cls.transformer_name = "OneHotEncodingTransformer"

def test_nulls_in_X_error(self):
@pytest.mark.parametrize(
"library",
["pandas", "polars"],
)
def test_nulls_in_X_error(self, library):
"""Test that an exception is raised if X has nulls in column to be fit on."""
df = d.create_df_2()
df = d.create_df_2(library=library)

x = OneHotEncodingTransformer(columns=["b", "c"])

Expand All @@ -47,10 +50,15 @@ def test_nulls_in_X_error(self):
):
x.fit(df)

def test_fields_with_over_100_levels_error(self):
@pytest.mark.parametrize(
"library",
["pandas", "polars"],
)
def test_fields_with_over_100_levels_error(self, library):
"""Test that OneHotEncodingTransformer.fit on fields with more than 100 levels raises error."""
df = pd.DataFrame({"b": list(range(101))})
df["a"] = 1
df_dict = {"a": [1] * 101, "b": list(range(101))}

df = dataframe_init_dispatch(library=library, dataframe_dict=df_dict)

x = OneHotEncodingTransformer(columns=["a", "b"])

Expand All @@ -72,66 +80,82 @@ class TestTransform(
def setup_class(cls):
cls.transformer_name = "OneHotEncodingTransformer"

def create_OneHotEncoderTransformer_test_df_1():
def create_OneHotEncoderTransformer_test_df_1(self, library="pandas"):
"""Create DataFrame to test OneHotEncoderTransformer

binary columns are representative of transformed output of column b

Parameters
----------
library : str
Whether to return polars or pandas df

"""
df = pd.DataFrame(
{
"a": [4, 2, 2, 1, 3],
"b": ["x", "z", "y", "x", "x"],
"c": ["c", "a", "a", "c", "b"],
},
)

df["c"] = df["c"].astype("category")
df_dict = {
"a": [4, 2, 2, 1, 3],
"b": ["x", "z", "y", "x", "x"],
"c": ["c", "a", "a", "c", "b"],
"b_x": [1.0, 0.0, 0.0, 1.0, 1.0],
"b_y": [0.0, 0.0, 1.0, 0.0, 0.0],
"b_z": [0.0, 1.0, 0.0, 0.0, 0.0],
}

df = dataframe_init_dispatch(library=library, dataframe_dict=df_dict)

df["b_x"] = [1.0, 0.0, 0.0, 1.0, 1.0]
df["b_y"] = [0.0, 0.0, 1.0, 0.0, 0.0]
df["b_z"] = [0.0, 1.0, 0.0, 0.0, 0.0]
df = nw.from_native(df)
df = df.with_columns(nw.col("c").cast(nw.dtypes.Categorical))

return df
return df.to_native()

def create_OneHotEncoderTransformer_test_df_2():
def create_OneHotEncoderTransformer_test_df_2(self, library="pandas"):
"""Create DataFrame to test OneHotEncoderTransformer

binary columns are representative of transformed output of all columns

"""
df = pd.DataFrame(
{
"a": [1, 5, 2, 3, 3],
"b": ["w", "w", "z", "y", "x"],
"c": ["a", "a", "c", "b", "a"],
},
)
Parameters
----------
library : str
Whether to return polars or pandas df

df["c"] = df["c"].astype("category")

df["a_1"] = [1.0, 0.0, 0.0, 0.0, 0.0]
df["a_2"] = [0.0, 0.0, 1.0, 0.0, 0.0]
df["a_3"] = [0.0, 0.0, 0.0, 1.0, 1.0]
df["a_4"] = [0.0, 0.0, 0.0, 0.0, 0.0]
df["b_x"] = [0.0, 0.0, 0.0, 0.0, 1.0]
df["b_y"] = [0.0, 0.0, 0.0, 1.0, 0.0]
df["b_z"] = [0.0, 0.0, 1.0, 0.0, 0.0]
df["c_a"] = [1.0, 1.0, 0.0, 0.0, 1.0]
df["c_b"] = [0.0, 0.0, 0.0, 1.0, 0.0]
df["c_c"] = [0.0, 0.0, 1.0, 0.0, 0.0]
"""

return df
df_dict = {
"a": [1, 5, 2, 3, 3],
"b": ["w", "w", "z", "y", "x"],
"c": ["a", "a", "c", "b", "a"],
"a_1": [1.0, 0.0, 0.0, 0.0, 0.0],
"a_2": [0.0, 0.0, 1.0, 0.0, 0.0],
"a_3": [0.0, 0.0, 0.0, 1.0, 1.0],
"a_4": [0.0, 0.0, 0.0, 0.0, 0.0],
"b_x": [0.0, 0.0, 0.0, 0.0, 1.0],
"b_y": [0.0, 0.0, 0.0, 1.0, 0.0],
"b_z": [0.0, 0.0, 1.0, 0.0, 0.0],
"c_a": [1.0, 1.0, 0.0, 0.0, 1.0],
"c_b": [0.0, 0.0, 0.0, 1.0, 0.0],
"c_c": [0.0, 0.0, 1.0, 0.0, 0.0],
}

df = dataframe_init_dispatch(dataframe_dict=df_dict, library=library)

df = nw.from_native(df)
df = df.with_columns(nw.col("c").cast(nw.dtypes.Categorical))

return df.to_native()

def test_non_mappable_rows_exception_raised(self):
"""Test inherited from GenericBaseNominalTransformerTests needs to be overwritten,
inherited test tests the mapping attribute, which OHE transformer doesn't have.
"""

def test_non_numeric_column_error_1(self):
@pytest.mark.parametrize(
"library",
["pandas", "polars"],
)
def test_non_numeric_column_error_1(self, library):
"""Test that transform will raise an error if a column to transform has nulls."""
df_train = d.create_df_1()
df_test = d.create_df_2()
df_train = d.create_df_1(library=library)
df_test = d.create_df_2(library=library)

x = OneHotEncodingTransformer(columns=["b"])

Expand All @@ -144,42 +168,57 @@ def test_non_numeric_column_error_1(self):
x.transform(df_test)

@pytest.mark.parametrize(
("df_test", "expected"),
ta.pandas.adjusted_dataframe_params(
d.create_df_7(),
create_OneHotEncoderTransformer_test_df_1(),
),
"library",
["pandas", "polars"],
)
def test_expected_output(self, df_test, expected):
def test_expected_output(self, library):
"""Test that OneHotEncodingTransformer.transform encodes the feature correctly.

Also tests that OneHotEncodingTransformer.transform does not modify unrelated columns.
"""
# transformer is fit on the whole dataset separately from the input df to work with the decorators
columns = ["b"]
df_train = d.create_df_7()
df_train = d.create_df_7(library=library)
df_train = nw.from_native(df_train)

df_test = df_train.clone()
expected = self.create_OneHotEncoderTransformer_test_df_1(library=library)

x = OneHotEncodingTransformer(columns=columns)
x.fit(df_train)

df_transformed = x.transform(df_test)
df_transformed = x.transform(df_test.to_native())

expected = nw.from_native(expected)
for col in [
column + f"_{value}"
for column in columns
for value in df_train[column].unique().tolist()
for value in df_train.select(nw.col(column).unique())
.get_column(column)
.to_list()
]:
expected[col] = expected[col].astype(np.int8)
expected = expected.with_columns(nw.col(col).cast(nw.Boolean))

ta.equality.assert_frame_equal_msg(
expected=expected,
actual=df_transformed,
msg_tag="Unspecified columns changed in transform",
)
assert_frame_equal_dispatch(expected.to_native(), df_transformed)

# also test single row transform
for i in range(len(df_test)):
df_transformed_row = x.transform(df_test[[i]].to_native())
df_expected_row = expected[[i]].to_native()

def test_categories_not_modified(self):
assert_frame_equal_dispatch(
df_transformed_row,
df_expected_row,
)

@pytest.mark.parametrize(
"library",
["pandas", "polars"],
)
def test_categories_not_modified(self, library):
"""Test that the categories from fit are not changed in transform."""
df_train = d.create_df_1()
df_test = d.create_df_7()
df_train = d.create_df_1(library=library)
df_test = d.create_df_7(library=library)

x = OneHotEncodingTransformer(columns=["a", "b"], verbose=False)
x2 = OneHotEncodingTransformer(columns=["a", "b"], verbose=False)
Expand All @@ -189,21 +228,17 @@ def test_categories_not_modified(self):

x.transform(df_test)

ta.equality.assert_equal_dispatch(
expected=list(x2.categories_[0]),
actual=list(x.categories_[0]),
msg="categories_ (index 0) modified during transform",
)

ta.equality.assert_equal_dispatch(
expected=list(x2.categories_[1]),
actual=list(x.categories_[1]),
msg="categories_ (index 1) modified during transform",
)
assert (
x2.categories_ == x.categories_
), f"categories_ modified during transform, pre transform had {x2.categories_} but post transform has {x.categories_}"

def test_renaming_feature_works_as_expected(self):
@pytest.mark.parametrize(
"library",
["pandas", "polars"],
)
def test_renaming_feature_works_as_expected(self, library):
"""Test OneHotEncodingTransformer.transform() is renaming features correctly."""
df = d.create_df_7()
df = d.create_df_7(library=library)
df = df[["b", "c"]]

x = OneHotEncodingTransformer(
Expand All @@ -216,16 +251,23 @@ def test_renaming_feature_works_as_expected(self):

df_transformed = x.transform(df)

ta.equality.assert_equal_dispatch(
expected=["b|x", "b|y", "b|z", "c|a", "c|b", "c|c"],
actual=list(df_transformed.columns.values),
msg="renaming columns feature in OneHotEncodingTransformer.transform",
)
expected_columns = ["b|x", "b|y", "b|z", "c|a", "c|b", "c|c"]

df_transformed = nw.from_native(df_transformed)
actual_columns = df_transformed.columns

def test_warning_generated_by_unseen_categories(self):
assert (
set(expected_columns) == set(actual_columns)
), f"renaming columns feature in OneHotEncodingTransformer.transform, expected {expected_columns} but got {actual_columns}"

@pytest.mark.parametrize(
"library",
["pandas", "polars"],
)
def test_warning_generated_by_unseen_categories(self, library):
"""Test OneHotEncodingTransformer.transform triggers a warning for unseen categories."""
df_train = d.create_df_7()
df_test = d.create_df_8()
df_train = d.create_df_7(library=library)
df_test = d.create_df_8(library=library)

x = OneHotEncodingTransformer(columns=["a", "b", "c"], verbose=True)

Expand All @@ -235,31 +277,45 @@ def test_warning_generated_by_unseen_categories(self):
x.transform(df_test)

@pytest.mark.parametrize(
("df_test", "expected"),
ta.pandas.adjusted_dataframe_params(
d.create_df_8(),
create_OneHotEncoderTransformer_test_df_2(),
),
"library",
["pandas", "polars"],
)
def test_unseen_categories_encoded_as_all_zeroes(self, df_test, expected):
def test_unseen_categories_encoded_as_all_zeroes(self, library):
"""Test OneHotEncodingTransformer.transform encodes unseen categories correctly (all 0s)."""
# transformer is fit on the whole dataset separately from the input df to work with the decorators
df_train = d.create_df_7()
df_train = d.create_df_7(library=library)

columns = ["a", "b", "c"]
x = OneHotEncodingTransformer(columns=columns, verbose=False)
x.fit(df_train)

df_test = d.create_df_8(library=library)
expected = self.create_OneHotEncoderTransformer_test_df_2(library=library)

df_transformed = x.transform(df_test)

df_train = nw.from_native(df_train)
expected = nw.from_native(expected)

for col in [
column + f"_{value}"
for column in columns
for value in df_train[column].unique().tolist()
for value in df_train.select(nw.col(column).unique())
.get_column(column)
.to_list()
]:
expected[col] = expected[col].astype(np.int8)
expected = expected.with_columns(nw.col(col).cast(nw.Boolean))

ta.equality.assert_equal_dispatch(
expected=expected,
actual=df_transformed,
msg="unseen category rows not encoded as 0s",
)
column_order = expected.columns
assert_frame_equal_dispatch(expected.to_native(), df_transformed[column_order])

# also test single row transform
df_test = nw.from_native(df_test)
for i in range(len(df_test)):
df_transformed_row = x.transform(df_test[[i]].to_native())
df_expected_row = expected[[i]].to_native()

assert_frame_equal_dispatch(
df_transformed_row[column_order],
df_expected_row,
)
Loading