
Commit e49c30a

glemaitre authored and massich committed
[MRG+1] EHN: scikit-learn API transition towards fit_resample (#462)
closes #460

This PR implements:

- [x] Removing `sample`.
- [x] Having a single `fit_resample`.

In addition, we kept an alias `fit_sample` for backward compatibility.
1 parent 5730192 commit e49c30a
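A minimal sketch of the transition this commit makes (the toy dataset and the choice of `RandomUnderSampler` are illustrative, not part of the commit):

```python
from collections import Counter

from sklearn.datasets import make_classification

from imblearn.under_sampling import RandomUnderSampler

# Illustrative imbalanced dataset: ~90% class 0, ~10% class 1.
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
print(sorted(Counter(y).items()))

rus = RandomUnderSampler(random_state=0)

# Before this commit:
#   X_res, y_res = rus.fit_sample(X, y)   # or rus.fit(X, y).sample(X, y)
# After this commit, the single entry point is fit_resample
# (fit_sample is kept as a backward-compatible alias):
X_res, y_res = rus.fit_resample(X, y)
print(sorted(Counter(y_res).items()))
```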

File tree: 78 files changed (+465 -746 lines)


doc/api.rst (-1)

@@ -247,4 +247,3 @@ Imbalance-learn provides some fast-prototyping tools.
    utils.check_neighbors_object
    utils.check_ratio
    utils.check_sampling_strategy
-   utils.hash_X_y

doc/combine.rst (+2 -2)

@@ -33,12 +33,12 @@ to their former samplers::
     [(0, 64), (1, 262), (2, 4674)]
     >>> from imblearn.combine import SMOTEENN
     >>> smote_enn = SMOTEENN(random_state=0)
-    >>> X_resampled, y_resampled = smote_enn.fit_sample(X, y)
+    >>> X_resampled, y_resampled = smote_enn.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4060), (1, 4381), (2, 3502)]
     >>> from imblearn.combine import SMOTETomek
     >>> smote_tomek = SMOTETomek(random_state=0)
-    >>> X_resampled, y_resampled = smote_tomek.fit_sample(X, y)
+    >>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4499), (1, 4566), (2, 4413)]

doc/ensemble.rst (+2 -2)

@@ -33,7 +33,7 @@ under-sampling the original set::
     [(0, 64), (1, 262), (2, 4674)]
     >>> from imblearn.ensemble import EasyEnsemble
     >>> ee = EasyEnsemble(random_state=0, n_subsets=10)
-    >>> X_resampled, y_resampled = ee.fit_sample(X, y)
+    >>> X_resampled, y_resampled = ee.fit_resample(X, y)
     >>> print(X_resampled.shape)
     (10, 192, 2)
     >>> print(sorted(Counter(y_resampled[0]).items()))

@@ -55,7 +55,7 @@ parameter ``n_max_subset`` and an additional bootstraping can be activated with
     >>> bc = BalanceCascade(random_state=0,
     ...                     estimator=LogisticRegression(random_state=0),
     ...                     n_max_subset=4)
-    >>> X_resampled, y_resampled = bc.fit_sample(X, y)
+    >>> X_resampled, y_resampled = bc.fit_resample(X, y)
     >>> print(X_resampled.shape)
     (4, 192, 2)
     >>> print(sorted(Counter(y_resampled[0]).items()))

doc/introduction.rst (+2 -6)

@@ -18,15 +18,11 @@ and adding a sampling functionality through the ``sample`` method:

     estimator = obj.fit(data, targets)

-:Sampler:
+:Resampler:

 To resample a data sets, each sampler implements::

-    data_resampled, targets_resampled = obj.sample(data, targets)
-
-Fitting and sampling can also be done in one step::
-
-    data_resampled, targets_resampled = obj.fit_sample(data, targets)
+    data_resampled, targets_resampled = obj.fit_resample(data, targets)

 Imbalanced-learn samplers accept the same inputs that in scikit-learn:
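Concretely, the one-step API described in this hunk reads as follows; a short sketch assuming a toy dataset (the sampler choice is illustrative):

```python
from sklearn.datasets import make_classification

from imblearn.over_sampling import RandomOverSampler

data, targets = make_classification(n_samples=200, weights=[0.9, 0.1],
                                    random_state=0)
obj = RandomOverSampler(random_state=0)

# Fitting and resampling happen in the single call kept by this commit.
data_resampled, targets_resampled = obj.fit_resample(data, targets)
```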

doc/miscellaneous.rst (+1 -1)

@@ -28,7 +28,7 @@ to retain the 10 first elements of the array ``X`` and ``y``::
     >>> def func(X, y):
     ...     return X[:10], y[:10]
     >>> sampler = FunctionSampler(func=func)
-    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> X_res, y_res = sampler.fit_resample(X, y)
     >>> np.all(X_res == X[:10])
     True
     >>> np.all(y_res == y[:10])

doc/over_sampling.rst (+5 -5)

@@ -27,7 +27,7 @@ randomly sampling with replacement the current available samples. The
     ...                            class_sep=0.8, random_state=0)
     >>> from imblearn.over_sampling import RandomOverSampler
     >>> ros = RandomOverSampler(random_state=0)
-    >>> X_resampled, y_resampled = ros.fit_sample(X, y)
+    >>> X_resampled, y_resampled = ros.fit_resample(X, y)
     >>> from collections import Counter
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4674), (1, 4674), (2, 4674)]

@@ -59,7 +59,7 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data
     >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
     ...                     dtype=np.object)
     >>> y_hetero = np.array([0, 0, 1])
-    >>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero)
+    >>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero)
     >>> print(X_resampled)
     [['xxx' 1 1.0]
      ['yyy' 2 2.0]

@@ -82,11 +82,11 @@ to over-sample minority classes: (i) the Synthetic Minority Oversampling Technique
 can be used in the same manner::

     >>> from imblearn.over_sampling import SMOTE, ADASYN
-    >>> X_resampled, y_resampled = SMOTE().fit_sample(X, y)
+    >>> X_resampled, y_resampled = SMOTE().fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4674), (1, 4674), (2, 4674)]
     >>> clf_smote = LinearSVC().fit(X_resampled, y_resampled)
-    >>> X_resampled, y_resampled = ADASYN().fit_sample(X, y)
+    >>> X_resampled, y_resampled = ADASYN().fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4673), (1, 4662), (2, 4674)]
     >>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled)

@@ -147,7 +147,7 @@ The :class:`BorderlineSMOTE` and :class:`SVMSMOTE` offer some variant of the SMOTE
 algorithm::

     >>> from imblearn.over_sampling import BorderlineSMOTE
-    >>> X_resampled, y_resampled = BorderlineSMOTE().fit_sample(X, y)
+    >>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 4674), (1, 4674), (2, 4674)]

doc/under_sampling.rst (+12 -12)

@@ -32,7 +32,7 @@ K-means method instead of the original samples::
     [(0, 64), (1, 262), (2, 4674)]
     >>> from imblearn.under_sampling import ClusterCentroids
     >>> cc = ClusterCentroids(random_state=0)
-    >>> X_resampled, y_resampled = cc.fit_sample(X, y)
+    >>> X_resampled, y_resampled = cc.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 64), (2, 64)]

@@ -82,7 +82,7 @@ randomly selecting a subset of data for the targeted classes::

     >>> from imblearn.under_sampling import RandomUnderSampler
     >>> rus = RandomUnderSampler(random_state=0)
-    >>> X_resampled, y_resampled = rus.fit_sample(X, y)
+    >>> X_resampled, y_resampled = rus.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 64), (2, 64)]

@@ -99,7 +99,7 @@ by considering independently each targeted class::
     >>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
     (192, 2)
     >>> rus = RandomUnderSampler(random_state=0, replacement=True)
-    >>> X_resampled, y_resampled = rus.fit_sample(X, y)
+    >>> X_resampled, y_resampled = rus.fit_resample(X, y)
     >>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
     (181, 2)

@@ -109,7 +109,7 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data
     >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
     ...                     dtype=np.object)
     >>> y_hetero = np.array([0, 0, 1])
-    >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero)
+    >>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero)
     >>> print(X_resampled)
     [['xxx' 1 1.0]
      ['zzz' 3 3.0]]

@@ -126,7 +126,7 @@ be selected with the parameter ``version``::

     >>> from imblearn.under_sampling import NearMiss
     >>> nm1 = NearMiss(version=1)
-    >>> X_resampled_nm1, y_resampled = nm1.fit_sample(X, y)
+    >>> X_resampled_nm1, y_resampled = nm1.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 64), (2, 64)]

@@ -261,7 +261,7 @@ the sample inspected to keep it in the dataset::
     [(0, 64), (1, 262), (2, 4674)]
     >>> from imblearn.under_sampling import EditedNearestNeighbours
     >>> enn = EditedNearestNeighbours()
-    >>> X_resampled, y_resampled = enn.fit_sample(X, y)
+    >>> X_resampled, y_resampled = enn.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 213), (2, 4568)]

@@ -275,7 +275,7 @@ Generally, repeating the algorithm will delete more data::

     >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours
     >>> renn = RepeatedEditedNearestNeighbours()
-    >>> X_resampled, y_resampled = renn.fit_sample(X, y)
+    >>> X_resampled, y_resampled = renn.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 208), (2, 4551)]

@@ -285,7 +285,7 @@ internal nearest neighbors algorithm is increased at each iteration::

     >>> from imblearn.under_sampling import AllKNN
     >>> allknn = AllKNN()
-    >>> X_resampled, y_resampled = allknn.fit_sample(X, y)
+    >>> X_resampled, y_resampled = allknn.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 220), (2, 4601)]

@@ -323,7 +323,7 @@ The :class:`CondensedNearestNeighbour` can be used in the following manner::

     >>> from imblearn.under_sampling import CondensedNearestNeighbour
     >>> cnn = CondensedNearestNeighbour(random_state=0)
-    >>> X_resampled, y_resampled = cnn.fit_sample(X, y)
+    >>> X_resampled, y_resampled = cnn.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 24), (2, 115)]

@@ -338,7 +338,7 @@ used as::

     >>> from imblearn.under_sampling import OneSidedSelection
     >>> oss = OneSidedSelection(random_state=0)
-    >>> X_resampled, y_resampled = oss.fit_sample(X, y)
+    >>> X_resampled, y_resampled = oss.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 174), (2, 4403)]

@@ -352,7 +352,7 @@ neighbors classifier. The class can be used as::

     >>> from imblearn.under_sampling import NeighbourhoodCleaningRule
     >>> ncr = NeighbourhoodCleaningRule()
-    >>> X_resampled, y_resampled = ncr.fit_sample(X, y)
+    >>> X_resampled, y_resampled = ncr.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 234), (2, 4666)]

@@ -380,7 +380,7 @@ removed. The class can be used as::
     >>> from imblearn.under_sampling import InstanceHardnessThreshold
     >>> iht = InstanceHardnessThreshold(random_state=0,
     ...                                 estimator=LogisticRegression())
-    >>> X_resampled, y_resampled = iht.fit_sample(X, y)
+    >>> X_resampled, y_resampled = iht.fit_resample(X, y)
     >>> print(sorted(Counter(y_resampled).items()))
     [(0, 64), (1, 64), (2, 64)]

doc/whats_new/v0.0.4.rst (+5)

@@ -18,6 +18,11 @@ API
 - Enable to use a ``list`` for the cleaning methods to specify the class to
   sample. :issue:`411` by :user:`Guillaume Lemaitre <glemaitre>`.

+- Replace ``fit_sample`` by ``fit_resample``. An alias is still available for
+  backward compatibility. In addition, ``sample`` has been removed to avoid
+  resampling on different set of data.
+  :issue:`462` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 New features
 ............
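As the entry above states, existing code keeps working through the alias; a hedged sketch of the equivalence (deterministic here because `random_state` is fixed):

```python
import numpy as np

from imblearn.under_sampling import RandomUnderSampler

X = np.arange(12, dtype=float).reshape(6, 2)
y = np.array([0, 0, 0, 0, 1, 1])

rus = RandomUnderSampler(random_state=0)
X_new, y_new = rus.fit_resample(X, y)  # new canonical name
X_old, y_old = rus.fit_sample(X, y)    # backward-compatible alias

assert np.array_equal(X_new, X_old) and np.array_equal(y_new, y_old)
```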

examples/applications/plot_over_sampling_benchmark_lfw.py (+1 -1)

@@ -39,7 +39,7 @@ def sample(self, X, y):
     def fit(self, X, y):
         return self

-    def fit_sample(self, X, y):
+    def fit_resample(self, X, y):
         return self.sample(X, y)

examples/applications/porto_seguro_keras_under_sampling.py (+1 -1)

@@ -49,7 +49,7 @@
 ###############################################################################

 from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import FunctionTransformer

examples/combine/plot_comparison_combine.py (+1 -1)

@@ -47,7 +47,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,


 def plot_resampling(X, y, sampling, ax):
-    X_res, y_res = sampling.fit_sample(X, y)
+    X_res, y_res = sampling.fit_resample(X, y)
     ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
     # make nice plotting
     ax.spines['top'].set_visible(False)

examples/combine/plot_smote_enn.py (+1 -1)

@@ -32,7 +32,7 @@

 # Apply SMOTE + ENN
 sm = SMOTEENN()
-X_resampled, y_resampled = sm.fit_sample(X, y)
+X_resampled, y_resampled = sm.fit_resample(X, y)
 X_res_vis = pca.transform(X_resampled)

 # Two subplots, unpack the axes array immediately

examples/combine/plot_smote_tomek.py (+1 -1)

@@ -32,7 +32,7 @@

 # Apply SMOTE + Tomek links
 sm = SMOTETomek()
-X_resampled, y_resampled = sm.fit_sample(X, y)
+X_resampled, y_resampled = sm.fit_resample(X, y)
 X_res_vis = pca.transform(X_resampled)

 # Two subplots, unpack the axes array immediately

examples/ensemble/plot_balance_cascade.py (+1 -1)

@@ -32,7 +32,7 @@

 # Apply Balance Cascade method
 bc = BalanceCascade()
-X_resampled, y_resampled = bc.fit_sample(X, y)
+X_resampled, y_resampled = bc.fit_resample(X, y)
 X_res_vis = []
 for X_res in X_resampled:
     X_res_vis.append(pca.transform(X_res))

examples/ensemble/plot_easy_ensemble.py (+1 -1)

@@ -32,7 +32,7 @@

 # Apply Easy Ensemble
 ee = EasyEnsemble(n_subsets=3)
-X_resampled, y_resampled = ee.fit_sample(X, y)
+X_resampled, y_resampled = ee.fit_resample(X, y)
 X_res_vis = []
 for X_res in X_resampled:
     X_res_vis.append(pca.transform(X_res))

examples/over-sampling/plot_adasyn.py (+1 -1)

@@ -33,7 +33,7 @@

 # Apply the random over-sampling
 ada = ADASYN()
-X_resampled, y_resampled = ada.fit_sample(X, y)
+X_resampled, y_resampled = ada.fit_resample(X, y)
 X_res_vis = pca.transform(X_resampled)

 # Two subplots, unpack the axes array immediately

examples/over-sampling/plot_comparison_over_sampling.py (+5 -15)

@@ -23,8 +23,7 @@
 from imblearn.over_sampling import ADASYN
 from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
 from imblearn.over_sampling import RandomOverSampler
-from imblearn.base import SamplerMixin
-from imblearn.utils import hash_X_y
+from imblearn.base import BaseSampler

 print(__doc__)

@@ -49,7 +48,7 @@ def create_dataset(n_samples=1000, weights=(0.01, 0.01, 0.98), n_classes=3,


 def plot_resampling(X, y, sampling, ax):
-    X_res, y_res = sampling.fit_sample(X, y)
+    X_res, y_res = sampling.fit_resample(X, y)
     ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
     # make nice plotting
     ax.spines['top'].set_visible(False)

@@ -131,20 +130,11 @@ def plot_decision_function(X, y, clf, ax):


 # Make an identity sampler
-class FakeSampler(SamplerMixin):
+class FakeSampler(BaseSampler):

-    def fit(self, X, y):
-        self.ratio_ = 1
-        self.X_hash_ = hash_X_y(X, y)
-        return self
+    _sampling_type = 'bypass'

-    def sample(self, X, y):
-        return X,
-
-    def _sample(self, X, y):
-        pass
-
-    def fit_sample(self, X, y):
+    def _fit_resample(self, X, y):
         return X, y
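The `FakeSampler` rewrite above shows the new extension pattern: derive from `BaseSampler`, declare a `_sampling_type`, and implement only `_fit_resample`; the public `fit_resample` (and the `fit_sample` alias) come from the base class. A minimal sketch of a custom sampler in the same style (the class name and the trimming rule are illustrative, not part of the commit):

```python
import numpy as np

from imblearn.base import BaseSampler


class FirstTenSampler(BaseSampler):
    """Illustrative sampler that keeps only the first ten samples."""

    # 'bypass' skips the sampling-strategy validation, as FakeSampler does.
    _sampling_type = 'bypass'

    def _fit_resample(self, X, y):
        # The public fit_resample is inherited from BaseSampler.
        return X[:10], y[:10]


X = np.arange(40, dtype=float).reshape(20, 2)
y = np.array([0] * 15 + [1] * 5)
X_res, y_res = FirstTenSampler().fit_resample(X, y)
print(X_res.shape)  # (10, 2)
```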

examples/over-sampling/plot_random_over_sampling.py (+1 -1)

@@ -32,7 +32,7 @@

 # Apply the random over-sampling
 ros = RandomOverSampler()
-X_resampled, y_resampled = ros.fit_sample(X, y)
+X_resampled, y_resampled = ros.fit_resample(X, y)
 X_res_vis = pca.transform(X_resampled)

 # Two subplots, unpack the axes array immediately

examples/over-sampling/plot_smote.py (+1 -1)

@@ -57,7 +57,7 @@ def plot_resampling(ax, X, y, title):
     y_resampled = []
     X_res_vis = []
     for method in sm:
-        X_res, y_res = method.fit_sample(X, y)
+        X_res, y_res = method.fit_resample(X, y)
         X_resampled.append(X_res)
         y_resampled.append(y_res)
         X_res_vis.append(pca.transform(X_res))

examples/plot_outlier_rejections.py (+1 -1)

@@ -73,7 +73,7 @@ def outlier_rejection(X, y):


 reject_sampler = FunctionSampler(func=outlier_rejection)
-X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
+X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
 plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

 pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
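With `sample` gone, pipelines drive samplers exclusively through `fit_resample` during `fit`; a hedged sketch of the pattern in this example file (the stand-in `keep_first_half` replaces the file's `outlier_rejection` for brevity, and the dataset and classifier are illustrative):

```python
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from imblearn import FunctionSampler
from imblearn.pipeline import make_pipeline


def keep_first_half(X, y):
    # Stand-in for the example's outlier_rejection function.
    n_keep = len(X) // 2
    return X[:n_keep], y[:n_keep]


X, y = make_moons(n_samples=200, noise=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# During fit, the pipeline calls fit_resample on the sampler step;
# at predict time the sampler step is skipped entirely.
pipe = make_pipeline(FunctionSampler(func=keep_first_half),
                     LogisticRegression(random_state=0))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
```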
