FIX: get the right index when tie breaking in SMOTE NC (#497)

glemaitre · web-flow · commit 07b4253036a5 · 2018-11-06T17:57:30.000+01:00
closes #494
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
@@ -188,11 +188,11 @@ features or a boolean mask marking these features::
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 30), (1, 30)]
   >>> print(X_resampled[-5:])
-  [['B' 0.5246469549655818 0]
-   ['A' -0.3657680728116921 0]
-   ['B' 0.9344237230779993 0]
-   ['A' 0.3710891618824609 0]
-   ['A' 0.3327240726719727 0]]
+  [['A' 0.5246469549655818 2]
+   ['B' -0.3657680728116921 2]
+   ['A' 0.9344237230779993 2]
+   ['B' 0.3710891618824609 2]
+   ['B' 0.3327240726719727 2]]
 
 Therefore, it can be seen that the samples generated in the first and last
 columns are belonging to the same categories originally presented without any
diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst
@@ -18,10 +18,14 @@ Bug fixes
   target or multilabel targets. Imbalanced-learn does not support this case.
   By :user:`Guillaume Lemaitre <glemaitre>` in :issue:`490`.
 
-- Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which an sparse
+- Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which sparse
   matrices were densify during ``inverse_transform``.
   By :user:`Guillaume Lemaitre <glemaitre>` in :issue:`495`.
 
+- Fix a bug in :class:`imblearn.over_sampling.SMOTENC` in which the tie
+  breaking was selecting a wrong category index.
+  By :user:`Guillaume Lemaitre <glemaitre>` in :issue:`497`.
+
 Version 0.4
 ===========
 
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
@@ -1032,11 +1032,13 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step):
 
         categories_size = ([self.continuous_features_.size] +
                            [cat.size for cat in self.ohe_.categories_])
+
         for start_idx, end_idx in zip(np.cumsum(categories_size)[:-1],
                                       np.cumsum(categories_size)[1:]):
             col_max = all_neighbors[:, start_idx:end_idx].sum(axis=0)
             # tie breaking argmax
-            col_sel = rng.choice(col_max == col_max.max())
+            col_sel = rng.choice(np.flatnonzero(
+                np.isclose(col_max, col_max.max())))
             sample[start_idx:end_idx] = 0
             sample[start_idx + col_sel] = 1