scikit-learn-contrib · tianlinhe · Jun 10, 2021 · Jun 18, 2021
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
@@ -102,7 +102,7 @@ def _make_samples(
 
         X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps)
         y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
-        return X_new, y_new
+        return X_new, y_new, rows, cols
 
     def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
         r"""Generate a synthetic sample.
@@ -299,6 +299,9 @@ def _fit_resample(self, X, y):
         X_resampled = [X.copy()]
         y_resampled = [y.copy()]
 
+        self.real_indices = [i for i in range(len(y))]
+        self.which_neighbors = [0]*len(y)
+
         for class_sample, n_samples in self.sampling_strategy_.items():
             if n_samples == 0:
                 continue
@@ -307,19 +310,39 @@ def _fit_resample(self, X, y):
 
             self.nn_k_.fit(X_class)
             nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
-            X_new, y_new = self._make_samples(
+            X_new, y_new, rows, cols = self._make_samples(
                 X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
             )
             X_resampled.append(X_new)
             y_resampled.append(y_new)
+            self.real_indices.append(target_class_indices[rows])
+            self.which_neighbors.append(cols)
 
         if sparse.issparse(X):
             X_resampled = sparse.vstack(X_resampled, format=X.format)
         else:
             X_resampled = np.vstack(X_resampled)
         y_resampled = np.hstack(y_resampled)
+        self.real_indices = np.hstack(self.real_indices)
+        self.which_neighbors = np.hstack(self.which_neighbors)
 
         return X_resampled, y_resampled
+
+    def sample_indices(self, get_which_neighbors=False):
+        """Return the originating real-sample index of every resampled row.
+
+        A real sample maps to its own index; a synthetic sample maps to the
+        index of the real ("mother") sample it was generated from.
+
+        Parameters
+        ----------
+        get_which_neighbors : bool, default=False
+            If truthy, return ``(index, neighbor)`` pairs instead; ``neighbor``
+            is the nearest neighbor used, and 0 for samples not generated.
+        """
+        if get_which_neighbors:
+            return list(zip(self.real_indices, self.which_neighbors))
+        return self.real_indices
 
 
 @Substitution(
@@ -518,7 +541,7 @@ def _fit_resample(self, X, y):
         X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
 
         # reverse the encoding of the categorical features
-        X_res_cat = X_resampled[:, self.continuous_features_.size :]
+        X_res_cat = X_resampled[:, self.continuous_features_.size:]
         X_res_cat.data = np.ones_like(X_res_cat.data)
         X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)
 
@@ -573,7 +596,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
         # create non-null entry based on the encoded of OHE
         if math.isclose(self.median_std_, 0):
             nn_data[
-                :, self.continuous_features_.size :
+                :, self.continuous_features_.size:
             ] = self._X_categorical_minority_encoded
 
         all_neighbors = nn_data[nn_num[rows]]