Merge pull request #388 from azukds/feature/ohe_values_2

Boluwatife28 · web-flow · commit 9308197248d2 · 2025-03-04T14:49:27.000Z
Optional wanted_values feature added to OHE
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -46,6 +46,8 @@ functionality, as this is more complicated when transform is opinionated on type
 - narwhalified GroupRareLevelsTransformer. As part of this, had to make transformer more opinionated
 and refuse columns with nulls (raises an error directing to imputers.) `#372 <https://github.com/lvgig/tubular/issues/372>_`
 - narwhalified BaseDatetimeTransformer `#375 <https://github.com/azukds/tubular/issues/375>`
+- Optional wanted_values feature has been integrated into the OneHotEncodingTransformer, which allows users to specify which levels in a column they wish to encode. `#384 <https://github.com/azukds/tubular/issues/384>`_
+- Created unit tests to check if the values provided for wanted_values are as expected and if the output is as expected.
 - placeholder
 - placeholder
 - placeholder
diff --git a/tests/nominal/test_OneHotEncodingTransformer.py b/tests/nominal/test_OneHotEncodingTransformer.py
@@ -26,6 +26,78 @@ class TestInit(
     def setup_class(cls):
         cls.transformer_name = "OneHotEncodingTransformer"
 
+    # Tests for wanted_values parameter
+
+    @pytest.mark.parametrize(
+        "values",
+        ["a", ["a", "b"], 123, True],
+    )
+    def test_wanted_values_is_dict(self, values, minimal_attribute_dict):
+        args = minimal_attribute_dict[self.transformer_name]
+        args["wanted_values"] = values
+
+        with pytest.raises(
+            TypeError,
+            match="OneHotEncodingTransformer: wanted_values should be a dictionary",
+        ):
+            OneHotEncodingTransformer(**args)
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            {1: ["a", "b"]},
+            {True: ["a"]},
+            {("a",): ["b", "c"]},
+        ],
+    )
+    def test_wanted_values_key_is_str(self, values, minimal_attribute_dict):
+        args = minimal_attribute_dict[self.transformer_name]
+        args["wanted_values"] = values
+
+        with pytest.raises(
+            TypeError,
+            match="OneHotEncodingTransformer:  Key in 'wanted_values' should be a string",
+        ):
+            OneHotEncodingTransformer(**args)
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            {"a": "b"},
+            {"a": ("a", "b")},
+            {"a": True},
+            {"a": 123},
+        ],
+    )
+    def test_wanted_values_value_is_list(self, values, minimal_attribute_dict):
+        args = minimal_attribute_dict[self.transformer_name]
+        args["wanted_values"] = values
+
+        with pytest.raises(
+            TypeError,
+            match="OneHotEncodingTransformer: Values in the 'wanted_values' dictionary should be a list",
+        ):
+            OneHotEncodingTransformer(**args)
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            {"a": ["b", 123]},
+            {"a": ["b", True]},
+            {"a": ["b", None]},
+            {"a": ["b", ["a", "b"]]},
+        ],
+    )
+    def test_wanted_values_entries_are_str(self, values, minimal_attribute_dict):
+        args = minimal_attribute_dict[self.transformer_name]
+        args["wanted_values"] = values
+
+        with pytest.raises(
+            TypeError,
+            match="OneHotEncodingTransformer: Entries in 'wanted_values' list should be a string",
+        ):
+            OneHotEncodingTransformer(**args)
+
 
 class TestFit(GenericFitTests):
     """Generic tests for transformer.fit()"""
@@ -50,6 +122,27 @@ def test_nulls_in_X_error(self, library):
         ):
             transformer.fit(df)
 
+    @pytest.mark.parametrize(
+        "library",
+        ["pandas", "polars"],
+    )
+    def test_fit_missing_levels_warning(self, library):
+        """Test OneHotEncodingTransformer.fit triggers a warning for missing levels."""
+        df = d.create_df_1(library=library)
+
+        transformer = OneHotEncodingTransformer(
+            columns=["b"],
+            wanted_values={"b": ["f", "g"]},
+        )
+
+        with pytest.warns(
+            UserWarning,
+            match=(
+                r"OneHotEncodingTransformer: column b includes user-specified values \['g'\] not found in the dataset"
+            ),
+        ):
+            transformer.fit(df)
+
     @pytest.mark.parametrize(
         "library",
         ["pandas", "polars"],
@@ -68,6 +161,24 @@ def test_fields_with_over_100_levels_error(self, library):
         ):
             transformer.fit(df)
 
+    @pytest.mark.parametrize(
+        "library",
+        ["pandas", "polars"],
+    )
+    def test_fit_no_warning_if_all_wanted_values_present(self, library, recwarn):
+        """Test that OneHotEncodingTransformer.fit does NOT raise a warning when all levels in wanted_values are present in the data."""
+        df = d.create_df_1(library=library)
+
+        transformer = OneHotEncodingTransformer(
+            columns=["b"],
+            wanted_values={"b": ["a", "b", "c", "d", "e", "f"]},
+        )
+
+        transformer.fit(df)
+        assert (
+            len(recwarn) == 0
+        ), "OneHotEncodingTransformer.fit is raising unexpected warnings"
+
 
 class TestTransform(
     DropOriginalTransformMixinTests,
@@ -276,6 +387,28 @@ def test_warning_generated_by_unseen_categories(self, library):
         with pytest.warns(UserWarning, match="unseen categories"):
             transformer.transform(df_test)
 
+    @pytest.mark.parametrize(
+        "library",
+        ["pandas", "polars"],
+    )
+    def test_transform_missing_levels_warning(self, library):
+        """Test OneHotEncodingTransformer.transform triggers a warning for missing levels."""
+        df_train = d.create_df_7(library=library)
+        df_test = d.create_df_8(library=library)
+
+        transformer = OneHotEncodingTransformer(
+            columns=["b"],
+            wanted_values={"b": ["v", "x", "z"]},
+        )
+
+        transformer.fit(df_train)
+
+        with pytest.warns(
+            UserWarning,
+            match=r"OneHotEncodingTransformer: column b includes user-specified values \['v'\] not found in the dataset",
+        ):
+            transformer.transform(df_test)
+
     @pytest.mark.parametrize(
         "library",
         ["pandas", "polars"],
@@ -319,3 +452,68 @@ def test_unseen_categories_encoded_as_all_zeroes(self, library):
                 df_transformed_row[column_order],
                 df_expected_row,
             )
+
+    @pytest.mark.parametrize(
+        "library",
+        ["pandas", "polars"],
+    )
+    def test_transform_output_with_wanted_values_arg(self, library):
+        """
+        Test to verify OneHotEncodingTransformer.transform zero-fills missing levels from user-specified "wanted_values" and encodes only those listed in "wanted_values".
+
+        """
+        df_train = d.create_df_7(library=library)
+        df_test = d.create_df_8(library=library)
+
+        transformer = OneHotEncodingTransformer(
+            columns=["b"],
+            wanted_values={"b": ["v", "x", "z"]},
+        )
+
+        transformer.fit(df_train)
+        df_transformed = transformer.transform(df_test)
+
+        expected_df_dict = {
+            "a": [1, 5, 2, 3, 3],
+            "b": ["w", "w", "z", "y", "x"],
+            "c": ["a", "a", "c", "b", "a"],
+            "b_v": [0] * 5,
+            "b_x": [0, 0, 0, 0, 1],
+            "b_z": [0, 0, 1, 0, 0],
+        }
+        expected_df = dataframe_init_dispatch(
+            library=library,
+            dataframe_dict=expected_df_dict,
+        )
+        expected_df = nw.from_native(expected_df)
+        # cast the columns
+        boolean_cols = ["b_v", "b_x", "b_z"]
+        for col_name in boolean_cols:
+            expected_df = expected_df.with_columns(
+                nw.col(col_name).cast(nw.Boolean),
+            )
+        expected_df = expected_df.with_columns(
+            nw.col("c").cast(nw.Categorical),
+        )
+
+        assert_frame_equal_dispatch(df_transformed, expected_df.to_native())
+
+    @pytest.mark.parametrize(
+        "library",
+        ["pandas", "polars"],
+    )
+    def test_transform_no_warning_if_all_wanted_values_present(self, library, recwarn):
+        """Test that OneHotEncodingTransformer.transform does NOT raise a warning when all levels in wanted_values are present in the data."""
+        df_train = d.create_df_8(library=library)
+        df_test = d.create_df_7(library=library)
+
+        transformer = OneHotEncodingTransformer(
+            columns=["b"],
+            wanted_values={"b": ["z", "y", "x"]},
+        )
+        transformer.fit(df_train)
+        transformer.transform(df_test)
+
+        assert (
+            len(recwarn) == 0
+        ), "OneHotEncodingTransformer.transform is raising unexpected warnings"
diff --git a/tubular/nominal.py b/tubular/nominal.py
@@ -1134,6 +1134,9 @@ class OneHotEncodingTransformer(
         Names of columns to transform. If the default of None is supplied all object and category
         columns in X are used.
 
+    wanted_values: dict[str, list[str]] or None, default = None
+        Optional parameter to select specific column levels to be transformed. If it is None, all levels in the categorical column will be encoded. It will take the format {col1: [level_1, level_2, ...]}.
+
     separator : str
         Used to create dummy column names, the name will take
         the format [categorical feature][separator][category level]
@@ -1170,6 +1173,7 @@ class attribute, indicates whether transformer has been converted to polars/pand
     def __init__(
         self,
         columns: str | list[str] | None = None,
+        wanted_values: dict[str, list[str]] | None = None,
         separator: str = "_",
         drop_original: bool = False,
         copy: bool | None = None,
@@ -1184,6 +1188,29 @@ def __init__(
             **kwargs,
         )
 
+        if wanted_values is not None:
+            if not isinstance(wanted_values, dict):
+                msg = f"{self.classname()}: wanted_values should be a dictionary"
+                raise TypeError(msg)
+
+            for key, val_list in wanted_values.items():
+                # check key is a string
+                if not isinstance(key, str):
+                    msg = f"{self.classname()}:  Key in 'wanted_values' should be a string"
+                    raise TypeError(msg)
+
+                # check value is a list
+                if not isinstance(val_list, list):
+                    msg = f"{self.classname()}: Values in the 'wanted_values' dictionary should be a list"
+                    raise TypeError(msg)
+
+                # check if each value within the list is a string
+                for val in val_list:
+                    if not isinstance(val, str):
+                        msg = f"{self.classname()}: Entries in 'wanted_values' list should be a string"
+                        raise TypeError(msg)
+
+        self.wanted_values = wanted_values
         self.set_drop_original_column(drop_original)
         self.check_and_set_separator_column(separator)
 
@@ -1214,6 +1241,7 @@ def fit(self, X: FrameT, y: nw.Series | None = None) -> FrameT:
         self.categories_ = {}
         self.new_feature_names_ = {}
         # Check each field has less than 100 categories/levels
+        missing_levels = {}
         for c in self.columns:
             levels = X.select(nw.col(c).unique())
 
@@ -1231,12 +1259,60 @@ def fit(self, X: FrameT, y: nw.Series | None = None) -> FrameT:
             # for consistency
             levels_list.sort()
 
-            self.categories_[c] = levels_list
+            # use the user-specified levels for this column if 'wanted_values' provides them
+            selected_values = (
+                self.wanted_values.get(c, None) if self.wanted_values else None
+            )
+
+            if selected_values is None:
+                final_categories = levels_list
+            else:
+                final_categories = selected_values
 
+            self.categories_[c] = final_categories
             self.new_feature_names_[c] = self._get_feature_names(column=c)
 
+            present_levels = set(X.get_column(c).unique().to_list())
+            missing_levels = self._warn_missing_levels(
+                present_levels,
+                c,
+                missing_levels,
+            )
+
         return self
 
+    def _warn_missing_levels(
+        self,
+        present_levels: list,
+        c: str,
+        missing_levels: dict[str, list[str]],
+    ) -> dict[str, list[str]]:
+        """Emits a warning for user-specified levels that are not found in the dataset and updates "missing_levels[c]" with those missing levels.
+
+        Parameters
+        ----------
+        present_levels: list
+            Collection of levels observed in the data (fit and transform pass a set).
+        c: str
+            The column name being checked for missing user-specified levels.
+        missing_levels: dict[str, list[str]]
+            Dictionary containing missing user-specified levels for each column.
+        Returns
+        -------
+        missing_levels : dict[str, list[str]]
+            Dictionary updated to reflect new missing levels for column c
+
+        """
+        # print warning for missing levels
+        missing_levels[c] = list(
+            set(self.categories_[c]).difference(present_levels),
+        )
+        if len(missing_levels[c]) > 0:
+            warning_msg = f"{self.classname()}: column {c} includes user-specified values {missing_levels[c]} not found in the dataset"
+            warnings.warn(warning_msg, UserWarning, stacklevel=2)
+
+        return missing_levels
+
     def _get_feature_names(
         self,
         column: str,
@@ -1287,17 +1363,14 @@ def transform(self, X: FrameT) -> FrameT:
                 )
 
             # print warning for unseen levels
-            present_levels = set(X.select(nw.col(c).unique()).get_column(c).to_list())
+            present_levels = set(X.get_column(c).unique().to_list())
             unseen_levels = present_levels.difference(set(self.categories_[c]))
-            missing_levels[c] = list(
-                set(self.categories_[c]).difference(present_levels),
-            )
             if len(unseen_levels) > 0:
-                warnings.warn(
-                    f"{self.classname()}: column {c} has unseen categories: {unseen_levels}",
-                    UserWarning,
-                    stacklevel=2,
-                )
+                warning_msg = f"{self.classname()}: column {c} has unseen categories: {unseen_levels}"
+                warnings.warn(warning_msg, UserWarning, stacklevel=2)
+
+            # print warning for missing levels
+            self._warn_missing_levels(present_levels, c, missing_levels)
 
             dummies = X.get_column(c).to_dummies(separator=self.separator)