BUG Better in-out support with different arrays type (#681)

chkoar · glemaitre · commit 09359a41dbc4 · 2020-02-16T12:09:02.000+01:00
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -14,6 +14,7 @@
 from sklearn.utils.multiclass import check_classification_targets
 
 from .utils import check_sampling_strategy, check_target_type
+from .utils._validation import ArraysTransformer
 
 
 class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -72,6 +73,7 @@ def fit_resample(self, X, y):
             The corresponding label of `X_resampled`.
         """
         check_classification_targets(y)
+        arrays_transformer = ArraysTransformer(X, y)
         X, y, binarize_y = self._check_X_y(X, y)
 
         self.sampling_strategy_ = check_sampling_strategy(
@@ -80,21 +82,10 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        if self._X_columns is not None or self._y_name is not None:
-            import pandas as pd
-
-        if self._X_columns is not None:
-            X_ = pd.DataFrame(output[0], columns=self._X_columns)
-            X_ = X_.astype(self._X_dtypes)
-        else:
-            X_ = output[0]
-
         y_ = (label_binarize(output[1], np.unique(y))
               if binarize_y else output[1])
 
-        if self._y_name is not None:
-            y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)
-
+        X_, y_ = arrays_transformer.transform(output[0], y_)
         return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
 
     #  define an alias for back-compatibility
@@ -137,22 +128,6 @@ def __init__(self, sampling_strategy="auto"):
         self.sampling_strategy = sampling_strategy
 
     def _check_X_y(self, X, y, accept_sparse=None):
-        if hasattr(X, "loc"):
-            # store information to build dataframe
-            self._X_columns = X.columns
-            self._X_dtypes = X.dtypes
-        else:
-            self._X_columns = None
-            self._X_dtypes = None
-
-        if hasattr(y, "loc"):
-            # store information to build a series
-            self._y_name = y.name
-            self._y_dtype = y.dtype
-        else:
-            self._y_name = None
-            self._y_dtype = None
-
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
@@ -265,8 +240,8 @@ def fit_resample(self, X, y):
         y_resampled : array-like of shape (n_samples_new,)
             The corresponding label of `X_resampled`.
         """
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        arrays_transformer = ArraysTransformer(X, y)
+
         if self.validate:
             check_classification_targets(y)
             X, y, binarize_y = self._check_X_y(
@@ -280,22 +255,12 @@ def fit_resample(self, X, y):
         output = self._fit_resample(X, y)
 
         if self.validate:
-            if self._X_columns is not None or self._y_name is not None:
-                import pandas as pd
-
-            if self._X_columns is not None:
-                X_ = pd.DataFrame(output[0], columns=self._X_columns)
-                X_ = X_.astype(self._X_dtypes)
-            else:
-                X_ = output[0]
 
             y_ = (label_binarize(output[1], np.unique(y))
                   if binarize_y else output[1])
-
-            if self._y_name is not None:
-                y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)
-
+            X_, y_ = arrays_transformer.transform(output[0], y_)
             return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
+
         return output
 
     def _fit_resample(self, X, y):
diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py
@@ -16,7 +16,6 @@
 from ..utils import Substitution
 from ..utils._docstring import _random_state_docstring
 
-
 @Substitution(
     sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring,
@@ -75,22 +74,6 @@ def __init__(self, sampling_strategy="auto", random_state=None):
         self.random_state = random_state
 
     def _check_X_y(self, X, y):
-        if hasattr(X, "loc"):
-            # store information to build dataframe
-            self._X_columns = X.columns
-            self._X_dtypes = X.dtypes
-        else:
-            self._X_columns = None
-            self._X_dtypes = None
-
-        if hasattr(y, "loc"):
-            # store information to build a series
-            self._y_name = y.name
-            self._y_dtype = y.dtype
-        else:
-            self._y_name = None
-            self._y_dtype = None
-
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                         force_all_finite=False)
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
@@ -891,22 +891,6 @@ def _check_X_y(self, X, y):
         """Overwrite the checking to let pass some string for categorical
         features.
         """
-        if hasattr(X, "loc"):
-            # store information to build dataframe
-            self._X_columns = X.columns
-            self._X_dtypes = X.dtypes
-        else:
-            self._X_columns = None
-            self._X_dtypes = None
-
-        if hasattr(y, "loc"):
-            # store information to build a series
-            self._y_name = y.name
-            self._y_dtype = y.dtype
-        else:
-            self._y_name = None
-            self._y_dtype = None
-
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
         return X, y, binarize_y
diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py
@@ -247,8 +247,10 @@ def _fit_resample(self, X, y):
                         _safe_indexing(X, minority_class_indices)
                     )
                     idx_vec_farthest = np.unique(idx_vec.reshape(-1))
-                    X_class_selected = _safe_indexing(X_class, idx_vec_farthest)
-                    y_class_selected = _safe_indexing(y_class, idx_vec_farthest)
+                    X_class_selected = _safe_indexing(
+                        X_class, idx_vec_farthest)
+                    y_class_selected = _safe_indexing(
+                        y_class, idx_vec_farthest)
 
                     dist_vec, idx_vec = self.nn_.kneighbors(
                         X_class_selected, n_neighbors=self.nn_.n_neighbors
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -81,22 +81,6 @@ def __init__(
         self.replacement = replacement
 
     def _check_X_y(self, X, y):
-        if hasattr(X, "loc"):
-            # store information to build dataframe
-            self._X_columns = X.columns
-            self._X_dtypes = X.dtypes
-        else:
-            self._X_columns = None
-            self._X_dtypes = None
-
-        if hasattr(y, "loc"):
-            # store information to build a series
-            self._y_name = y.name
-            self._y_dtype = y.dtype
-        else:
-            self._y_name = None
-            self._y_dtype = None
-
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                         force_all_finite=False)
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
@@ -27,6 +27,42 @@
 TARGET_KIND = ("binary", "multiclass", "multilabel-indicator")
 
 
+class ArraysTransformer:
+    """A class to convert sampler output arrays to their original types."""
+
+    def __init__(self, X, y):
+        self.x_props = self._gets_props(X)
+        self.y_props = self._gets_props(y)
+
+    def transform(self, X, y):
+        X = self._transfrom_one(X, self.x_props)
+        y = self._transfrom_one(y, self.y_props)
+        return X, y
+
+    def _gets_props(self, array):
+        props = {}
+        props["type"] = array.__class__.__name__
+        props["columns"] = getattr(array, "columns", None)
+        props["name"] = getattr(array, "name", None)
+        props["dtypes"] = getattr(array, "dtypes", None)
+        return props
+
+    def _transfrom_one(self, array, props):
+        type_ = props["type"].lower()
+        if type_ == "list":
+            ret = array.tolist()
+        elif type_ == "dataframe":
+            import pandas as pd
+            ret = pd.DataFrame(array, columns=props["columns"])
+            ret = ret.astype(props["dtypes"])
+        elif type_ == "series":
+            import pandas as pd
+            ret = pd.Series(array, dtype=props["dtypes"], name=props["name"])
+        else:
+            ret = array
+        return ret
+
+
 def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
     """Check the objects is consistent to be a NN.
 
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -41,6 +41,7 @@ def _yield_sampler_checks(name, Estimator):
     yield check_samplers_sampling_strategy_fit_resample
     yield check_samplers_sparse
     yield check_samplers_pandas
+    yield check_samplers_list
     yield check_samplers_multiclass_ova
     yield check_samplers_preserve_dtype
     yield check_samplers_sample_indices
@@ -242,8 +243,9 @@ def check_samplers_pandas(name, Sampler):
         weights=[0.2, 0.3, 0.5],
         random_state=0,
     )
-    X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
-    y_pd = pd.Series(y, name="class")
+    X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
+    y_df = pd.DataFrame(y)
+    y_s = pd.Series(y, name="class")
     sampler = Sampler()
     if isinstance(Sampler(), NearMiss):
         samplers = [Sampler(version=version) for version in (1, 2, 3)]
@@ -253,16 +255,52 @@ def check_samplers_pandas(name, Sampler):
 
     for sampler in samplers:
         set_random_state(sampler)
-        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
+        X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
+        X_res_df, y_res_df = sampler.fit_resample(X_df, y_df)
         X_res, y_res = sampler.fit_resample(X, y)
 
-        # check that we return a pandas dataframe if a dataframe was given in
-        assert isinstance(X_res_pd, pd.DataFrame)
-        assert isinstance(y_res_pd, pd.Series)
-        assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
-        assert y_pd.name == y_res_pd.name
-        assert_allclose(X_res_pd.to_numpy(), X_res)
-        assert_allclose(y_res_pd.to_numpy(), y_res)
+        # check that we return the same type for dataframes or series types
+        assert isinstance(X_res_df, pd.DataFrame)
+        assert isinstance(y_res_df, pd.DataFrame)
+        assert isinstance(y_res_s, pd.Series)
+
+        assert X_df.columns.to_list() == X_res_df.columns.to_list()
+        assert y_df.columns.to_list() == y_res_df.columns.to_list()
+        assert y_s.name == y_res_s.name
+
+        assert_allclose(X_res_df.to_numpy(), X_res)
+        assert_allclose(y_res_df.to_numpy().ravel(), y_res)
+        assert_allclose(y_res_s.to_numpy(), y_res)
+
+
+def check_samplers_list(name, Sampler):
+    # Check that the samplers can handle simple lists
+    X, y = make_classification(
+        n_samples=1000,
+        n_classes=3,
+        n_informative=4,
+        weights=[0.2, 0.3, 0.5],
+        random_state=0,
+    )
+    X_list = X.tolist()
+    y_list = y.tolist()
+    sampler = Sampler()
+    if isinstance(sampler, NearMiss):
+        samplers = [Sampler(version=version) for version in (1, 2, 3)]
+
+    else:
+        samplers = [sampler]
+
+    for sampler in samplers:
+        set_random_state(sampler)
+        X_res, y_res = sampler.fit_resample(X, y)
+        X_res_list, y_res_list = sampler.fit_resample(X_list, y_list)
+
+        assert isinstance(X_res_list, list)
+        assert isinstance(y_res_list, list)
+
+        assert_allclose(X_res, X_res_list)
+        assert_allclose(y_res, y_res_list)
 
 
 def check_samplers_multiclass_ova(name, Sampler):
diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
@@ -17,6 +17,7 @@
 from imblearn.utils import check_neighbors_object
 from imblearn.utils import check_sampling_strategy
 from imblearn.utils import check_target_type
+from imblearn.utils._validation import ArraysTransformer
 
 multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25)
 binary_target = np.array([1] * 25 + [0] * 100)
@@ -315,3 +316,53 @@ def test_sampling_strategy_check_order(
         sampling_strategy, y, sampling_type
     )
     assert sampling_strategy_ == expected_result
+
+
+def test_arrays_transformer_plain_list():
+    X = np.array([[0, 0], [1, 1]])
+    y = np.array([[0, 0], [1, 1]])
+
+    arrays_transformer = ArraysTransformer(X.tolist(), y.tolist())
+    X_res, y_res = arrays_transformer.transform(X, y)
+    assert isinstance(X_res, list)
+    assert isinstance(y_res, list)
+
+
+def test_arrays_transformer_numpy():
+    X = np.array([[0, 0], [1, 1]])
+    y = np.array([[0, 0], [1, 1]])
+
+    arrays_transformer = ArraysTransformer(X, y)
+    X_res, y_res = arrays_transformer.transform(X, y)
+    assert isinstance(X_res, np.ndarray)
+    assert isinstance(y_res, np.ndarray)
+
+
+def test_arrays_transformer_pandas():
+    pd = pytest.importorskip("pandas")
+
+    X = np.array([[0, 0], [1, 1]])
+    y = np.array([0, 1])
+
+    X_df = pd.DataFrame(X, columns=["a", "b"])
+    X_df = X_df.astype(int)
+    y_df = pd.DataFrame(y, columns=["target", ])
+    y_df = y_df.astype(int)
+    y_s = pd.Series(y, name="target", dtype=int)
+
+    # DataFrame and DataFrame case
+    arrays_transformer = ArraysTransformer(X_df, y_df)
+    X_res, y_res = arrays_transformer.transform(X, y)
+    assert isinstance(X_res, pd.DataFrame)
+    assert_array_equal(X_res.columns, X_df.columns)
+    assert_array_equal(X_res.dtypes, X_df.dtypes)
+    assert isinstance(y_res, pd.DataFrame)
+    assert_array_equal(y_res.columns, y_df.columns)
+    assert_array_equal(y_res.dtypes, y_df.dtypes)
+
+    # DataFrames and Series case
+    arrays_transformer = ArraysTransformer(X_df, y_s)
+    _, y_res = arrays_transformer.transform(X, y)
+    assert isinstance(y_res, pd.Series)
+    assert_array_equal(y_res.name, y_s.name)
+    assert_array_equal(y_res.dtype, y_s.dtype)