From 55d8f279d512e44861407086761079464699d74d Mon Sep 17 00:00:00 2001 From: chkoar Date: Mon, 3 Feb 2020 12:29:55 +0200 Subject: [PATCH 1/9] Better in-out support for pandas. --- imblearn/base.py | 49 +++---------------- .../over_sampling/_random_over_sampler.py | 18 +------ imblearn/over_sampling/_smote.py | 18 +------ .../_random_under_sampler.py | 18 +------ imblearn/utils/_validation.py | 45 +++++++++++++++++ imblearn/utils/estimator_checks.py | 27 ++++++---- 6 files changed, 75 insertions(+), 100 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index c5d6b0185..13033ca40 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -14,6 +14,7 @@ from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type +from .utils._validation import _OutputReconstructor class SamplerMixin(BaseEstimator, metaclass=ABCMeta): @@ -80,21 +81,10 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - if self._X_columns is not None or self._y_name is not None: - import pandas as pd - - if self._X_columns is not None: - X_ = pd.DataFrame(output[0], columns=self._X_columns) - X_ = X_.astype(self._X_dtypes) - else: - X_ = output[0] - y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - if self._y_name is not None: - y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - + X_, y_ = self._reconstructor.reconstruct(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) # define an alias for back-compatibility @@ -137,22 +127,7 @@ def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy def _check_X_y(self, X, y, accept_sparse=None): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = 
y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -265,8 +240,8 @@ def fit_resample(self, X, y): y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. """ - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + self._reconstructor = _OutputReconstructor(X, y) + if self.validate: check_classification_targets(y) X, y, binarize_y = self._check_X_y( @@ -280,22 +255,12 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) if self.validate: - if self._X_columns is not None or self._y_name is not None: - import pandas as pd - - if self._X_columns is not None: - X_ = pd.DataFrame(output[0], columns=self._X_columns) - X_ = X_.astype(self._X_dtypes) - else: - X_ = output[0] y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - - if self._y_name is not None: - y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - + X_, y_ = self._reconstructor.reconstruct(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) + return output def _fit_resample(self, X, y): diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index afcb89da5..8addb2a87 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -15,6 +15,7 @@ from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring +from ..utils._validation import _OutputReconstructor @Substitution( @@ -75,22 +76,7 @@ def __init__(self, sampling_strategy="auto", random_state=None): self.random_state = random_state def _check_X_y(self, X, y): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns 
= X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index cea14cfd2..8d8de9a0a 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -31,6 +31,7 @@ from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring +from ..utils._validation import _OutputReconstructor class BaseSMOTE(BaseOverSampler): @@ -891,22 +892,7 @@ def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical features. 
""" - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 8d7c08c93..fb90f2bfb 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -15,6 +15,7 @@ from ...utils import check_target_type from ...utils import Substitution from ...utils._docstring import _random_state_docstring +from ...utils._validation import _OutputReconstructor @Substitution( @@ -81,22 +82,7 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): - if hasattr(X, "loc"): - # store information to build dataframe - self._X_columns = X.columns - self._X_dtypes = X.dtypes - else: - self._X_columns = None - self._X_dtypes = None - - if hasattr(y, "loc"): - # store information to build a series - self._y_name = y.name - self._y_dtype = y.dtype - else: - self._y_name = None - self._y_dtype = None - + self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index d1b0069b7..0f9f2f9c3 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -27,6 +27,51 @@ TARGET_KIND = ("binary", "multiclass", 
"multilabel-indicator") +class _OutputReconstructor: + """A class for converting input types to numpy and back.""" + + def __init__(self, X, y): + self.x_props = self._gets_props(X) + self.y_props = self._gets_props(y) + + def reconstruct(self, X, y): + X = self._transfrom(X, self.x_props) + y = self._transfrom(y, self.y_props) + return X, y + + def _gets_props(self, array): + props = {} + props["type"] = array.__class__.__name__ + props["columns"] = getattr(array, "columns", None) + props["name"] = getattr(array, "name", None) + props["dtypes"] = getattr(array, "dtypes", None) + return props + + def _transfrom(self, array, props): + type_ = props["type"].lower() + msg="Could not convert to {}".format(type_) + if type_ == "list": + ret = array.tolist() + elif type_ == "dataframe": + try: + import pandas as pd + ret = pd.DataFrame(array, columns=props["columns"]) + ret = ret.astype(props["dtypes"]) + except Exception: + warnings.warn(msg) + elif type_ == "series": + try: + import pandas as pd + ret = pd.Series(array, + dtype=props["dtypes"], + name=props["name"]) + except Exception: + warnings.warn(msg) + else: + ret = array + return ret + + def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. 
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 51a039f85..8f094397d 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -242,8 +242,9 @@ def check_samplers_pandas(name, Sampler): weights=[0.2, 0.3, 0.5], random_state=0, ) - X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) - y_pd = pd.Series(y, name="class") + X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) + y_df = pd.DataFrame(y) + y_s = pd.Series(y, name="class") sampler = Sampler() if isinstance(Sampler(), NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] @@ -253,16 +254,22 @@ def check_samplers_pandas(name, Sampler): for sampler in samplers: set_random_state(sampler) - X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd) + X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) + X_res_df, y_res_df = sampler.fit_resample(X_df, y_df) X_res, y_res = sampler.fit_resample(X, y) - # check that we return a pandas dataframe if a dataframe was given in - assert isinstance(X_res_pd, pd.DataFrame) - assert isinstance(y_res_pd, pd.Series) - assert X_pd.columns.to_list() == X_res_pd.columns.to_list() - assert y_pd.name == y_res_pd.name - assert_allclose(X_res_pd.to_numpy(), X_res) - assert_allclose(y_res_pd.to_numpy(), y_res) + # check that we return the same type for dataframes or seires types + assert isinstance(X_res_df, pd.DataFrame) + assert isinstance(y_res_df, pd.DataFrame) + assert isinstance(y_res_s, pd.Series) + + assert X_df.columns.to_list() == X_res_df.columns.to_list() + assert y_df.columns.to_list() == y_res_df.columns.to_list() + assert y_s.name == y_res_s.name + + assert_allclose(X_res_df.to_numpy(), X_res) + assert_allclose(y_res_df.to_numpy().ravel(), y_res) + assert_allclose(y_res_s.to_numpy(), y_res) def check_samplers_multiclass_ova(name, Sampler): From 92dab4786fa368130d56021ef48097e169dd7435 Mon Sep 17 00:00:00 2001 From: chkoar Date: Mon, 3 Feb 2020 
13:45:07 +0200 Subject: [PATCH 2/9] Make pep8 happy! --- imblearn/utils/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 0f9f2f9c3..0d02ea28b 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -49,7 +49,7 @@ def _gets_props(self, array): def _transfrom(self, array, props): type_ = props["type"].lower() - msg="Could not convert to {}".format(type_) + msg = "Could not convert to {}".format(type_) if type_ == "list": ret = array.tolist() elif type_ == "dataframe": From 35d7af936aac6471c4097b14a8d958c2e3086040 Mon Sep 17 00:00:00 2001 From: chkoar Date: Mon, 3 Feb 2020 14:04:35 +0200 Subject: [PATCH 3/9] Relax reconstructor checks. Add test for simple lists. --- imblearn/utils/_validation.py | 19 +++++------------- imblearn/utils/estimator_checks.py | 32 +++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 0d02ea28b..df25a52a0 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -49,24 +49,15 @@ def _gets_props(self, array): def _transfrom(self, array, props): type_ = props["type"].lower() - msg = "Could not convert to {}".format(type_) if type_ == "list": ret = array.tolist() elif type_ == "dataframe": - try: - import pandas as pd - ret = pd.DataFrame(array, columns=props["columns"]) - ret = ret.astype(props["dtypes"]) - except Exception: - warnings.warn(msg) + import pandas as pd + ret = pd.DataFrame(array, columns=props["columns"]) + ret = ret.astype(props["dtypes"]) elif type_ == "series": - try: - import pandas as pd - ret = pd.Series(array, - dtype=props["dtypes"], - name=props["name"]) - except Exception: - warnings.warn(msg) + import pandas as pd + ret = pd.Series(array, dtype=props["dtypes"], name=props["name"]) else: ret = array return ret diff --git a/imblearn/utils/estimator_checks.py 
b/imblearn/utils/estimator_checks.py index 8f094397d..5b5ef34fe 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -258,7 +258,7 @@ def check_samplers_pandas(name, Sampler): X_res_df, y_res_df = sampler.fit_resample(X_df, y_df) X_res, y_res = sampler.fit_resample(X, y) - # check that we return the same type for dataframes or seires types + # check that we return the same type for dataframes or series types assert isinstance(X_res_df, pd.DataFrame) assert isinstance(y_res_df, pd.DataFrame) assert isinstance(y_res_s, pd.Series) @@ -272,6 +272,36 @@ def check_samplers_pandas(name, Sampler): assert_allclose(y_res_s.to_numpy(), y_res) +def check_samplers_list(name, Sampler): + # Check that the samplers can handle simple lists + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + X_list = X.tolist() + y_list = y.tolist() + sampler = Sampler() + if isinstance(Sampler(), NearMiss): + samplers = [Sampler(version=version) for version in (1, 2, 3)] + + else: + samplers = [Sampler()] + + for sampler in samplers: + set_random_state(sampler) + X_res, y_res = sampler.fit_resample(X, y) + X_res_list, y_res_list = sampler.fit_resample(X_list, y_list) + + assert isinstance(X_res_list, list) + assert isinstance(y_res_list, list) + + assert_allclose(X_res, X_res_list) + assert_allclose(y_res, y_res_list) + + def check_samplers_multiclass_ova(name, Sampler): # Check that multiclass target lead to the same results than OVA encoding X, y = make_classification( From 182dc6e0266cc966fbbe73847636a49f20dde19f Mon Sep 17 00:00:00 2001 From: chkoar Date: Mon, 3 Feb 2020 14:17:54 +0200 Subject: [PATCH 4/9] Add missing estimator check --- imblearn/utils/estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 5b5ef34fe..7d7901fa7 100644 --- a/imblearn/utils/estimator_checks.py +++
b/imblearn/utils/estimator_checks.py @@ -41,6 +41,7 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_sampling_strategy_fit_resample yield check_samplers_sparse yield check_samplers_pandas + yield check_samplers_list yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype yield check_samplers_sample_indices From 1fd289ae11170549f19fa7f564e571084b5d6c64 Mon Sep 17 00:00:00 2001 From: chkoar Date: Mon, 3 Feb 2020 14:23:15 +0200 Subject: [PATCH 5/9] Fix a check --- imblearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 7d7901fa7..7bd77c2f3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -285,11 +285,11 @@ def check_samplers_list(name, Sampler): X_list = X.tolist() y_list = y.tolist() sampler = Sampler() - if isinstance(Sampler(), NearMiss): + if isinstance(sampler, NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] else: - samplers = [Sampler()] + samplers = [sampler] for sampler in samplers: set_random_state(sampler) From fa8c96830da07c66639b0e326d0a44c6167e06a9 Mon Sep 17 00:00:00 2001 From: chkoar Date: Wed, 5 Feb 2020 11:25:46 +0200 Subject: [PATCH 6/9] Refactor --- imblearn/base.py | 10 +++++----- imblearn/over_sampling/_random_over_sampler.py | 3 --- imblearn/over_sampling/_smote.py | 2 -- .../under_sampling/_prototype_selection/_nearmiss.py | 6 ++++-- .../_prototype_selection/_random_under_sampler.py | 2 -- imblearn/utils/_validation.py | 4 ++-- 6 files changed, 11 insertions(+), 16 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 13033ca40..63bab5d74 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -14,7 +14,7 @@ from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type -from .utils._validation import _OutputReconstructor +from 
.utils._validation import OutputFormater class SamplerMixin(BaseEstimator, metaclass=ABCMeta): @@ -73,6 +73,7 @@ def fit_resample(self, X, y): The corresponding label of `X_resampled`. """ check_classification_targets(y) + self._formater = OutputFormater(X, y) X, y, binarize_y = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( @@ -84,7 +85,7 @@ def fit_resample(self, X, y): y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - X_, y_ = self._reconstructor.reconstruct(output[0], y_) + X_, y_ = self._formater.format(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) # define an alias for back-compatibility @@ -127,7 +128,6 @@ def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy def _check_X_y(self, X, y, accept_sparse=None): - self._reconstructor = _OutputReconstructor(X, y) if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -240,7 +240,7 @@ def fit_resample(self, X, y): y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. 
""" - self._reconstructor = _OutputReconstructor(X, y) + self._formater = OutputFormater(X, y) if self.validate: check_classification_targets(y) @@ -258,7 +258,7 @@ def fit_resample(self, X, y): y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - X_, y_ = self._reconstructor.reconstruct(output[0], y_) + X_, y_ = self._formater.format(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) return output diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 8addb2a87..fbe2f17f9 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -15,8 +15,6 @@ from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring -from ..utils._validation import _OutputReconstructor - @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, @@ -76,7 +74,6 @@ def __init__(self, sampling_strategy="auto", random_state=None): self.random_state = random_state def _check_X_y(self, X, y): - self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 8d8de9a0a..961ce55c5 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -31,7 +31,6 @@ from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring -from ..utils._validation import _OutputReconstructor class BaseSMOTE(BaseOverSampler): @@ -892,7 +891,6 @@ def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical features. 
""" - self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index af8a13dde..386463d5c 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -247,8 +247,10 @@ def _fit_resample(self, X, y): _safe_indexing(X, minority_class_indices) ) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) - X_class_selected = _safe_indexing(X_class, idx_vec_farthest) - y_class_selected = _safe_indexing(y_class, idx_vec_farthest) + X_class_selected = _safe_indexing( + X_class, idx_vec_farthest) + y_class_selected = _safe_indexing( + y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index fb90f2bfb..900d8e3fe 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -15,7 +15,6 @@ from ...utils import check_target_type from ...utils import Substitution from ...utils._docstring import _random_state_docstring -from ...utils._validation import _OutputReconstructor @Substitution( @@ -82,7 +81,6 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): - self._reconstructor = _OutputReconstructor(X, y) y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index df25a52a0..c07474d95 100644 --- a/imblearn/utils/_validation.py +++ 
b/imblearn/utils/_validation.py @@ -27,14 +27,14 @@ TARGET_KIND = ("binary", "multiclass", "multilabel-indicator") -class _OutputReconstructor: +class OutputFormater: """A class for converting input types to numpy and back.""" def __init__(self, X, y): self.x_props = self._gets_props(X) self.y_props = self._gets_props(y) - def reconstruct(self, X, y): + def format(self, X, y): X = self._transfrom(X, self.x_props) y = self._transfrom(y, self.y_props) return X, y From 8cb2a92dcc4badc6b1f744d2ebf85338859584fb Mon Sep 17 00:00:00 2001 From: chkoar Date: Wed, 5 Feb 2020 11:35:15 +0200 Subject: [PATCH 7/9] Add tests --- imblearn/utils/tests/test_validation.py | 41 +++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 634f502f0..76955d254 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -17,6 +17,7 @@ from imblearn.utils import check_neighbors_object from imblearn.utils import check_sampling_strategy from imblearn.utils import check_target_type +from imblearn.utils._validation import OutputFormater multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) binary_target = np.array([1] * 25 + [0] * 100) @@ -315,3 +316,43 @@ def test_sampling_strategy_check_order( sampling_strategy, y, sampling_type ) assert sampling_strategy_ == expected_result + + +def test_output_formater_plain_list(): + X = np.array([[0, 0], [1, 1]]) + y = np.array([[0, 0], [1, 1]]) + + formater = OutputFormater(X.tolist(), y.tolist()) + X_res, y_res = formater.format(X, y) + assert isinstance(X_res, list) + assert isinstance(y_res, list) + + +def test_output_formater_pandas(): + pd = pytest.importorskip("pandas") + + X = np.array([[0, 0], [1, 1]]) + y = np.array([0, 1]) + + X_df = pd.DataFrame(X, columns=["a", "b"]) + X_df = X_df.astype(int) + y_df = pd.DataFrame(y, columns=["target", ]) + y_df = y_df.astype(int) + y_s = pd.Series(y, 
name="target", dtype=int) + + # DataFrame and DataFrame case + formater = OutputFormater(X_df, y_df) + X_res, y_res = formater.format(X, y) + assert isinstance(X_res, pd.DataFrame) + assert_array_equal(X_res.columns, X_df.columns) + assert_array_equal(X_res.dtypes, X_df.dtypes) + assert isinstance(y_res, pd.DataFrame) + assert_array_equal(y_res.columns, y_df.columns) + assert_array_equal(y_res.dtypes, y_df.dtypes) + + # DataFrames and Series case + formater = OutputFormater(X_df, y_s) + _, y_res = formater.format(X, y) + assert isinstance(y_res, pd.Series) + assert_array_equal(y_res.name, y_s.name) + assert_array_equal(y_res.dtype, y_s.dtype) From d9edfc4fc86c0126bf5245bf0a48b26abc36ecce Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 7 Feb 2020 15:59:49 +0200 Subject: [PATCH 8/9] Add test, rename class and detach the instance from the sampler --- imblearn/base.py | 10 ++++----- imblearn/utils/_validation.py | 12 +++++------ imblearn/utils/tests/test_validation.py | 28 +++++++++++++++++-------- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 63bab5d74..014e4dd9f 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -14,7 +14,7 @@ from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type -from .utils._validation import OutputFormater +from .utils._validation import ArraysTransformer class SamplerMixin(BaseEstimator, metaclass=ABCMeta): @@ -73,7 +73,7 @@ def fit_resample(self, X, y): The corresponding label of `X_resampled`. 
""" check_classification_targets(y) - self._formater = OutputFormater(X, y) + arrays_transformer = ArraysTransformer(X, y) X, y, binarize_y = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( @@ -85,7 +85,7 @@ def fit_resample(self, X, y): y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - X_, y_ = self._formater.format(output[0], y_) + X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) # define an alias for back-compatibility @@ -240,7 +240,7 @@ def fit_resample(self, X, y): y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. """ - self._formater = OutputFormater(X, y) + arrays_transformer = ArraysTransformer(X, y) if self.validate: check_classification_targets(y) @@ -258,7 +258,7 @@ def fit_resample(self, X, y): y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) - X_, y_ = self._formater.format(output[0], y_) + X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) return output diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index c07474d95..dccc0dd4d 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -27,16 +27,16 @@ TARGET_KIND = ("binary", "multiclass", "multilabel-indicator") -class OutputFormater: - """A class for converting input types to numpy and back.""" +class ArraysTransformer: + """A class to convert sampler output arrays to their original types.""" def __init__(self, X, y): self.x_props = self._gets_props(X) self.y_props = self._gets_props(y) - def format(self, X, y): - X = self._transfrom(X, self.x_props) - y = self._transfrom(y, self.y_props) + def transform(self, X, y): + X = self._transfrom_one(X, self.x_props) + y = self._transfrom_one(y, self.y_props) return X, y def _gets_props(self, array): @@ -47,7 +47,7 @@ def _gets_props(self, array): props["dtypes"]
= getattr(array, "dtypes", None) return props - def _transfrom(self, array, props): + def _transfrom_one(self, array, props): type_ = props["type"].lower() if type_ == "list": ret = array.tolist() diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 76955d254..3b4571862 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -17,7 +17,7 @@ from imblearn.utils import check_neighbors_object from imblearn.utils import check_sampling_strategy from imblearn.utils import check_target_type -from imblearn.utils._validation import OutputFormater +from imblearn.utils._validation import ArraysTransformer multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) binary_target = np.array([1] * 25 + [0] * 100) @@ -318,17 +318,27 @@ def test_sampling_strategy_check_order( assert sampling_strategy_ == expected_result -def test_output_formater_plain_list(): +def test_arrays_transformer_plain_list(): X = np.array([[0, 0], [1, 1]]) y = np.array([[0, 0], [1, 1]]) - formater = OutputFormater(X.tolist(), y.tolist()) - X_res, y_res = formater.format(X, y) + arrays_transformer = ArraysTransformer(X.tolist(), y.tolist()) + X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, list) assert isinstance(y_res, list) -def test_output_formater_pandas(): +def test_arrays_transformer_numpy(): + X = np.array([[0, 0], [1, 1]]) + y = np.array([[0, 0], [1, 1]]) + + arrays_transformer = ArraysTransformer(X, y) + X_res, y_res = arrays_transformer.transform(X, y) + assert isinstance(X_res, np.array) + assert isinstance(y_res, np.array) + + +def test_arrays_transformer_pandas(): pd = pytest.importorskip("pandas") X = np.array([[0, 0], [1, 1]]) @@ -341,8 +351,8 @@ def test_output_formater_pandas(): y_s = pd.Series(y, name="target", dtype=int) # DataFrame and DataFrame case - formater = OutputFormater(X_df, y_df) - X_res, y_res = formater.format(X, y) + arrays_transformer = 
ArraysTransformer(X_df, y_df) + X_res, y_res = arrays_transformer.transform(X, y) assert isinstance(X_res, pd.DataFrame) assert_array_equal(X_res.columns, X_df.columns) assert_array_equal(X_res.dtypes, X_df.dtypes) @@ -351,8 +361,8 @@ def test_output_formater_pandas(): assert_array_equal(y_res.dtypes, y_df.dtypes) # DataFrames and Series case - formater = OutputFormater(X_df, y_s) - _, y_res = formater.format(X, y) + arrays_transformer = ArraysTransformer(X_df, y_s) + _, y_res = arrays_transformer.transform(X, y) assert isinstance(y_res, pd.Series) assert_array_equal(y_res.name, y_s.name) assert_array_equal(y_res.dtype, y_s.dtype) From c5f40b46bff8cf7a4ed2fc3fc869b2c378c27efd Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 7 Feb 2020 16:21:26 +0200 Subject: [PATCH 9/9] Fix type --- imblearn/utils/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 3b4571862..a40b47f4b 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -334,8 +334,8 @@ def test_arrays_transformer_numpy(): arrays_transformer = ArraysTransformer(X, y) X_res, y_res = arrays_transformer.transform(X, y) - assert isinstance(X_res, np.array) - assert isinstance(y_res, np.array) + assert isinstance(X_res, np.ndarray) + assert isinstance(y_res, np.ndarray) def test_arrays_transformer_pandas():