1
1
"""Class to perform over-sampling using Geometric SMOTE."""
2
2
3
3
# Author: Georgios Douzas <[email protected] >
4
+ # Joao Fonseca <[email protected] >
4
5
# License: BSD 3 clause
5
6
6
7
import numpy as np
7
8
from numpy .linalg import norm
9
+ from scipy import sparse
8
10
from sklearn .utils import check_random_state
9
- from imblearn . over_sampling .base import BaseOverSampler
11
+ from . .base import BaseOverSampler
10
12
from imblearn .utils import check_neighbors_object , Substitution
11
13
from imblearn .utils ._docstring import _random_state_docstring
12
14
13
- SELECTION_STRATEGY = (' combined' , ' majority' , ' minority' )
15
+ SELECTION_STRATEGY = (" combined" , " majority" , " minority" )
14
16
15
17
16
18
def _make_geometric_sample (
@@ -119,6 +121,33 @@ class GeometricSMOTE(BaseOverSampler):
119
121
n_jobs : int, optional (default=1)
120
122
The number of threads to open if possible.
121
123
124
+ Attributes
125
+ ----------
126
+
127
+ sampling_strategy_ : dict
128
+ Dictionary containing the information to sample the dataset. The keys
129
+ correspond to the class labels from which to sample and the values
130
+ are the number of samples to sample.
131
+
132
+ n_features_in_ : int
133
+ Number of features in the input dataset.
134
+
135
+ nns_pos_ : estimator object
136
+ Validated k-nearest neighbours created from the `k_neighbors` parameter. It is
137
+ used to find the nearest neighbors of the same class of a selected
138
+ observation.
139
+
140
+ nn_neg_ : estimator object
141
+ Validated nearest neighbours object fixed to a single neighbor. It is
142
+ used to find the nearest neighbor of the remaining classes (k=1) of a selected
143
+ observation.
144
+
145
+ random_state_ : instance of RandomState
146
+ If the `random_state` parameter is None, it is a RandomState singleton used by
147
+ np.random. If `random_state` is an int, it is a RandomState instance seeded with
148
+ seed. If `random_state` is already a RandomState instance, it is the same
149
+ object.
150
+
122
151
Notes
123
152
-----
124
153
See the original paper: [1]_ for more details.
@@ -142,7 +171,8 @@ class GeometricSMOTE(BaseOverSampler):
142
171
143
172
>>> from collections import Counter
144
173
>>> from sklearn.datasets import make_classification
145
- >>> from gsmote import GeometricSMOTE # doctest: +NORMALIZE_WHITESPACE
174
+ >>> from imblearn.over_sampling import \
175
+ ... GeometricSMOTE # doctest: +NORMALIZE_WHITESPACE
146
176
>>> X, y = make_classification(n_classes=2, class_sep=2,
147
177
... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
148
178
... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
@@ -157,11 +187,11 @@ class GeometricSMOTE(BaseOverSampler):
157
187
158
188
def __init__ (
159
189
self ,
160
- sampling_strategy = ' auto' ,
190
+ sampling_strategy = " auto" ,
161
191
random_state = None ,
162
192
truncation_factor = 1.0 ,
163
193
deformation_factor = 0.0 ,
164
- selection_strategy = ' combined' ,
194
+ selection_strategy = " combined" ,
165
195
k_neighbors = 5 ,
166
196
n_jobs = 1 ,
167
197
):
@@ -182,23 +212,23 @@ def _validate_estimator(self):
182
212
# Validate strategy
183
213
if self .selection_strategy not in SELECTION_STRATEGY :
184
214
error_msg = (
185
- ' Unknown selection_strategy for Geometric SMOTE algorithm. '
186
- ' Choices are {}. Got {} instead.'
215
+ " Unknown selection_strategy for Geometric SMOTE algorithm. "
216
+ " Choices are {}. Got {} instead."
187
217
)
188
218
raise ValueError (
189
219
error_msg .format (SELECTION_STRATEGY , self .selection_strategy )
190
220
)
191
221
192
222
# Create nearest neighbors object for positive class
193
- if self .selection_strategy in (' minority' , ' combined' ):
223
+ if self .selection_strategy in (" minority" , " combined" ):
194
224
self .nns_pos_ = check_neighbors_object (
195
- ' nns_positive' , self .k_neighbors , additional_neighbor = 1
225
+ " nns_positive" , self .k_neighbors , additional_neighbor = 1
196
226
)
197
227
self .nns_pos_ .set_params (n_jobs = self .n_jobs )
198
228
199
229
# Create nearest neighbors object for negative class
200
- if self .selection_strategy in (' majority' , ' combined' ):
201
- self .nn_neg_ = check_neighbors_object (' nn_negative' , nn_object = 1 )
230
+ if self .selection_strategy in (" majority" , " combined" ):
231
+ self .nn_neg_ = check_neighbors_object (" nn_negative" , nn_object = 1 )
202
232
self .nn_neg_ .set_params (n_jobs = self .n_jobs )
203
233
204
234
def _make_geometric_samples (self , X , y , pos_class_label , n_samples ):
@@ -237,11 +267,11 @@ def _make_geometric_samples(self, X, y, pos_class_label, n_samples):
237
267
238
268
# Force minority strategy if no negative class samples are present
239
269
self .selection_strategy_ = (
240
- ' minority' if len ( X ) == len ( X_pos ) else self .selection_strategy
270
+ " minority" if X . shape [ 0 ] == X_pos . shape [ 0 ] else self .selection_strategy
241
271
)
242
272
243
273
# Minority or combined strategy
244
- if self .selection_strategy_ in (' minority' , ' combined' ):
274
+ if self .selection_strategy_ in (" minority" , " combined" ):
245
275
self .nns_pos_ .fit (X_pos )
246
276
points_pos = self .nns_pos_ .kneighbors (X_pos )[1 ][:, 1 :]
247
277
samples_indices = self .random_state_ .randint (
@@ -251,11 +281,11 @@ def _make_geometric_samples(self, X, y, pos_class_label, n_samples):
251
281
cols = np .mod (samples_indices , points_pos .shape [1 ])
252
282
253
283
# Majority or combined strategy
254
- if self .selection_strategy_ in (' majority' , ' combined' ):
284
+ if self .selection_strategy_ in (" majority" , " combined" ):
255
285
X_neg = X [y != pos_class_label ]
256
286
self .nn_neg_ .fit (X_neg )
257
287
points_neg = self .nn_neg_ .kneighbors (X_pos )[1 ]
258
- if self .selection_strategy_ == ' majority' :
288
+ if self .selection_strategy_ == " majority" :
259
289
samples_indices = self .random_state_ .randint (
260
290
low = 0 , high = len (points_neg .flatten ()), size = n_samples
261
291
)
@@ -270,11 +300,11 @@ def _make_geometric_samples(self, X, y, pos_class_label, n_samples):
270
300
center = X_pos [row ]
271
301
272
302
# Minority strategy
273
- if self .selection_strategy_ == ' minority' :
303
+ if self .selection_strategy_ == " minority" :
274
304
surface_point = X_pos [points_pos [row , col ]]
275
305
276
306
# Majority strategy
277
- elif self .selection_strategy_ == ' majority' :
307
+ elif self .selection_strategy_ == " majority" :
278
308
surface_point = X_neg [points_neg [row , col ]]
279
309
280
310
# Combined strategy
@@ -306,19 +336,28 @@ def _fit_resample(self, X, y):
306
336
# Validate estimator's parameters
307
337
self ._validate_estimator ()
308
338
339
+ # Ensure the input data is dense
340
+ X_dense = X .toarray () if sparse .issparse (X ) else X
341
+
309
342
# Copy data
310
- X_resampled , y_resampled = X .copy (), y .copy ()
343
+ X_resampled , y_resampled = [ X_dense .copy ()], [ y .copy ()]
311
344
312
345
# Resample data
313
346
for class_label , n_samples in self .sampling_strategy_ .items ():
314
347
315
348
# Apply gsmote mechanism
316
- X_new , y_new = self ._make_geometric_samples (X , y , class_label , n_samples )
317
-
318
- # Append new data
319
- X_resampled , y_resampled = (
320
- np .vstack ((X_resampled , X_new )),
321
- np .hstack ((y_resampled , y_new )),
349
+ X_new , y_new = self ._make_geometric_samples (
350
+ X_dense , y , class_label , n_samples
322
351
)
323
352
353
+ X_resampled .append (X_new )
354
+ y_resampled .append (y_new )
355
+
356
+ # Append new data
357
+ if sparse .issparse (X ):
358
+ X_resampled = sparse .vstack (X_resampled , format = X .format )
359
+ else :
360
+ X_resampled = np .vstack (X_resampled ).astype (X .dtype )
361
+ y_resampled = np .hstack (y_resampled ).astype (y .dtype )
362
+
324
363
return X_resampled , y_resampled
0 commit comments