1
- """Class to perform random over-sampling."""
1
+ """Class to perform over-sampling using ADASYN ."""
2
2
3
3
# Authors: Guillaume Lemaitre <[email protected] >
4
4
# Christos Aridas
@@ -104,8 +104,8 @@ def _fit_resample(self, X, y):
104
104
self ._validate_estimator ()
105
105
random_state = check_random_state (self .random_state )
106
106
107
- X_resampled = X .copy ()
108
- y_resampled = y .copy ()
107
+ X_resampled = [ X .copy ()]
108
+ y_resampled = [ y .copy ()]
109
109
110
110
for class_sample , n_samples in self .sampling_strategy_ .items ():
111
111
if n_samples == 0 :
@@ -114,13 +114,12 @@ def _fit_resample(self, X, y):
114
114
X_class = _safe_indexing (X , target_class_indices )
115
115
116
116
self .nn_ .fit (X )
117
- _ , nn_index = self .nn_ .kneighbors (X_class )
117
+ nns = self .nn_ .kneighbors (X_class , return_distance = False )[:, 1 :]
118
118
# The ratio is computed using a one-vs-rest manner. Using majority
119
119
# in multi-class would lead to slightly different results at the
120
120
# cost of introducing a new parameter.
121
- ratio_nn = np .sum (y [nn_index [:, 1 :]] != class_sample , axis = 1 ) / (
122
- self .nn_ .n_neighbors - 1
123
- )
121
+ n_neighbors = self .nn_ .n_neighbors - 1
122
+ ratio_nn = np .sum (y [nns ] != class_sample , axis = 1 ) / n_neighbors
124
123
if not np .sum (ratio_nn ):
125
124
raise RuntimeError (
126
125
"Not any neigbours belong to the majority"
@@ -131,7 +130,9 @@ def _fit_resample(self, X, y):
131
130
)
132
131
ratio_nn /= np .sum (ratio_nn )
133
132
n_samples_generate = np .rint (ratio_nn * n_samples ).astype (int )
134
- if not np .sum (n_samples_generate ):
133
+ # rounding can change the total, so recompute the number of samples to generate
134
+ n_samples = np .sum (n_samples_generate )
135
+ if not n_samples :
135
136
raise ValueError (
136
137
"No samples will be generated with the"
137
138
" provided ratio settings."
@@ -140,66 +141,30 @@ def _fit_resample(self, X, y):
140
141
# the nearest neighbors need to be fitted only on the current class
141
142
# to find the class NN to generate new samples
142
143
self .nn_ .fit (X_class )
143
- _ , nn_index = self .nn_ .kneighbors (X_class )
144
+ nns = self .nn_ .kneighbors (X_class , return_distance = False )[:, 1 :]
144
145
145
- if sparse .issparse (X ):
146
- row_indices , col_indices , samples = [], [], []
147
- n_samples_generated = 0
148
- for x_i , x_i_nn , num_sample_i in zip (
149
- X_class , nn_index , n_samples_generate
150
- ):
151
- if num_sample_i == 0 :
152
- continue
153
- nn_zs = random_state .randint (
154
- 1 , high = self .nn_ .n_neighbors , size = num_sample_i
155
- )
156
- steps = random_state .uniform (size = len (nn_zs ))
157
- if x_i .nnz :
158
- for step , nn_z in zip (steps , nn_zs ):
159
- sample = x_i + step * (
160
- X_class [x_i_nn [nn_z ], :] - x_i
161
- )
162
- row_indices += [n_samples_generated ] * len (
163
- sample .indices
164
- )
165
- col_indices += sample .indices .tolist ()
166
- samples += sample .data .tolist ()
167
- n_samples_generated += 1
168
- X_new = sparse .csr_matrix (
169
- (samples , (row_indices , col_indices )),
170
- [np .sum (n_samples_generate ), X .shape [1 ]],
171
- dtype = X .dtype ,
172
- )
173
- y_new = np .array (
174
- [class_sample ] * np .sum (n_samples_generate ), dtype = y .dtype
175
- )
176
- else :
177
- x_class_gen = []
178
- for x_i , x_i_nn , num_sample_i in zip (
179
- X_class , nn_index , n_samples_generate
180
- ):
181
- if num_sample_i == 0 :
182
- continue
183
- nn_zs = random_state .randint (
184
- 1 , high = self .nn_ .n_neighbors , size = num_sample_i
185
- )
186
- steps = random_state .uniform (size = len (nn_zs ))
187
- x_class_gen .append (
188
- [
189
- x_i + step * (X_class [x_i_nn [nn_z ], :] - x_i )
190
- for step , nn_z in zip (steps , nn_zs )
191
- ]
192
- )
193
-
194
- X_new = np .concatenate (x_class_gen ).astype (X .dtype )
195
- y_new = np .array (
196
- [class_sample ] * np .sum (n_samples_generate ), dtype = y .dtype
197
- )
146
+ enumerated_class_indices = np .arange (len (target_class_indices ))
147
+ rows = np .repeat (enumerated_class_indices , n_samples_generate )
148
+ cols = random_state .choice (n_neighbors , size = n_samples )
149
+ diffs = X_class [nns [rows , cols ]] - X_class [rows ]
150
+ steps = random_state .uniform (size = (n_samples , 1 ))
198
151
199
- if sparse .issparse (X_new ):
200
- X_resampled = sparse .vstack ([X_resampled , X_new ])
152
+ if sparse .issparse (X ):
153
+ sparse_func = type (X ).__name__
154
+ steps = getattr (sparse , sparse_func )(steps )
155
+ X_new = X_class [rows ] + steps .multiply (diffs )
201
156
else :
202
- X_resampled = np .vstack ((X_resampled , X_new ))
203
- y_resampled = np .hstack ((y_resampled , y_new ))
157
+ X_new = X_class [rows ] + steps * diffs
158
+
159
+ X_new = X_new .astype (X .dtype )
160
+ y_new = np .full (n_samples , fill_value = class_sample , dtype = y .dtype )
161
+ X_resampled .append (X_new )
162
+ y_resampled .append (y_new )
163
+
164
+ if sparse .issparse (X ):
165
+ X_resampled = sparse .vstack (X_resampled , format = X .format )
166
+ else :
167
+ X_resampled = np .vstack (X_resampled )
168
+ y_resampled = np .hstack (y_resampled )
204
169
205
170
return X_resampled , y_resampled
0 commit comments