From 6df29fe8e30cf3cb10ff231a6eda3b4545237532 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 10:03:59 -0700 Subject: [PATCH 01/33] Add support for MLSMOTE Co-authored-by: Bruno Alvisio Co-authored-by: Simon Ermler --- imblearn/over_sampling/__init__.py | 2 + imblearn/over_sampling/_mlsmote.py | 281 +++++++++++++++++++ imblearn/over_sampling/tests/test_mlsmote.py | 126 +++++++++ 3 files changed, 409 insertions(+) create mode 100644 imblearn/over_sampling/_mlsmote.py create mode 100644 imblearn/over_sampling/tests/test_mlsmote.py diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py index a959cbb43..54ab07e18 100644 --- a/imblearn/over_sampling/__init__.py +++ b/imblearn/over_sampling/__init__.py @@ -11,6 +11,7 @@ from ._smote import SVMSMOTE from ._smote import SMOTENC from ._smote import SMOTEN +from ._mlsmote import MLSMOTE __all__ = [ "ADASYN", @@ -21,4 +22,5 @@ "SVMSMOTE", "SMOTENC", "SMOTEN", + "MLSMOTE", ] diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py new file mode 100644 index 000000000..24ba43098 --- /dev/null +++ b/imblearn/over_sampling/_mlsmote.py @@ -0,0 +1,281 @@ +import numpy as np +import itertools +import collections +import random +from scipy import sparse +class MLSMOTE: + """Over-sampling using MLSMOTE. + + Parameters + ---------- + sampling_strategy: 'ranking','union' or 'intersection' default: 'ranking' + Strategy to generate labelsets + + + k_neighbors : int or object, default=5 + If ``int``, number of nearest neighbours to used to construct synthetic + samples. + + categorical_features : ndarray of shape (n_cat_features,) or (n_features,) + Specified which features are categorical. Can either be: + + - array of indices specifying the categorical features; + - mask array of shape (n_features, ) and ``bool`` dtype for which + ``True`` indicates the categorical features. + + Notes + ----- + See the original papers: [1]_ for more details. + + + References + ---------- + .. [1] Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, Francisco. (2015). + MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. + Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. 
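A minimal usage sketch (illustrative only, not part of the patch): it exercises the constructor and ``fit_resample`` the same way the tests added below do, with two continuous and two categorical columns and ``y`` given as a list of label lists.

    import numpy as np
    from imblearn.over_sampling import MLSMOTE

    rng = np.random.RandomState(42)
    X = np.empty((30, 4), dtype=object)
    X[:, :2] = rng.randn(30, 2)                                    # continuous features
    X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)  # categorical feature
    X[:, 3] = rng.randint(3, size=30)                              # categorical feature
    y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20

    smote = MLSMOTE(categorical_features=[2, 3])
    X_resampled, y_resampled = smote.fit_resample(X, y)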
+ + """ + + def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'): + self.k_neighbors = k_neighbors + self.sampling_strategy_ = sampling_strategy + self.categorical_features = categorical_features + self.continuous_features_ = None + self.unique_labels = [] + self.labels = [] + self.features = [] + + def fit_resample(self, X, y): + self.n_features_ = X.shape[1] + + self._validate_estimator() + + X_resampled = X.copy() + y_resampled = y.copy() + + if sparse.issparse(y): + self.labels = y + self.unique_labels = range(0, y.shape[1]) + else: + self.labels = np.array([np.array(xi) for xi in y]) + self.unique_labels = self._collect_unique_labels(y) + self.features = X + + X_synth = [] + y_synth = [] + + append_X_synth = X_synth.append + append_y_synth = y_synth.append + mean_ir = self._get_mean_imbalance_ratio() + + if sparse.issparse(y): + y_synth = None + + for label in self.unique_labels: + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + if irlbl > mean_ir: + min_bag = self._get_all_instances_of_label(label) + for sample in min_bag: + distances = self._calc_distances(sample, min_bag) + distances = np.sort(distances, order='distance') + neighbours = distances[:self.k_neighbors] + ref_neigh = np.random.choice(neighbours, 1)[0] + X_new, y_new = self._create_new_sample( + sample, ref_neigh[1], [x[1] for x in neighbours]) + append_X_synth(X_new) + y_resambled = sparse.vstack((y_resampled, y_new)) + return np.concatenate((X_resampled, np.array(X_synth))), y_resampled + else: + for index, label in np.ndenumerate(self.unique_labels): + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + if irlbl > mean_ir: + min_bag = self._get_all_instances_of_label(label) + for sample in min_bag: + distances = self._calc_distances(sample, min_bag) + distances = np.sort(distances, order='distance') + neighbours = distances[:self.k_neighbors] + ref_neigh = np.random.choice(neighbours, 1)[0] + X_new, y_new = self._create_new_sample( + sample, ref_neigh[1], [x[1] for x in neighbours]) + append_X_synth(X_new) + append_y_synth(y_new) + return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth) + + def _validate_estimator(self): + categorical_features = np.asarray(self.categorical_features) + if categorical_features.dtype.name == "bool": + self.categorical_features_ = np.flatnonzero(categorical_features) + else: + if any( + [ + cat not in np.arange(self.n_features_) + for cat in categorical_features + ] + ): + raise ValueError( + "Some of the categorical indices are out of range. 
Indices" + " should be between 0 and {}".format(self.n_features_) + ) + self.categorical_features_ = categorical_features + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) + + def _collect_unique_labels(self, y): + """A support function that flattens the labelsets and return one set of unique labels""" + return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) + + def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): + sample = self.features[sample_id] + synth_sample = np.copy(sample) + ref_neigh = self.features[ref_neigh_id] + sample_labels = self.labels[sample_id] + + for i in range(synth_sample.shape[0]): + if i in self.continuous_features_: + diff = ref_neigh[i]-sample[i] + offset = diff*random.uniform(0, 1) + synth_sample[i] = sample[i]+offset + if i in self.categorical_features_: + synth_sample[i] = self._get_most_frequent_value( + self.features[neighbour_ids, i]) + X = synth_sample + + if sparse.issparse(self.labels): + neighbours_labels = self.labels[neighbour_ids] + possible_labels = neighbours_labels.sum(axis=0) + y = np.zeros((1, len(self.unique_labels))) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + choosen_labels = possible_labels.nonzero()[1][:head_index] + y[0, choosen_labels] = 1 + if self.sampling_strategy_ == 'union': + choosen_labels = possible_labels.nonzero()[0] + y[choosen_labels] = 1 + if self.sampling_strategy_ == 'intersection': + choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) + y[choosen_labels] = 1 + y = sparse.csr_matrix(y) + + else: + neighbours_labels = [] + for ni in neighbour_ids: + neighbours_labels.append(self.labels[ni].tolist()) + + labels = [] # sample_labels.tolist() + labels += [a for x in neighbours_labels for a in ( + x if isinstance(x, list) else [x])] + labels = list(set(labels)) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + y = labels[:head_index] + if self.sampling_strategy_ == 'union': + y = labels[:] + if self.sampling_strategy_ == 'intersection': + y = list(set.intersection(*neighbours_labels)) + + return X, y + + def _calc_distances(self, sample, min_bag): + def calc_dist(bag_sample): + nominal_distance = sum([self._get_vdm( + self.features[sample, cat], self.features[bag_sample, cat], cat)for cat in self.categorical_features_]) + ordinal_distance = sum([self._get_euclidean_distance( + self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_]) + dist = sum([nominal_distance, ordinal_distance]) + return (dist, bag_sample) + distances = [calc_dist(bag_sample) for bag_sample in min_bag] + dtype = np.dtype([('distance', float), ('index', int)]) + return np.array(distances, dtype=dtype) + + def _get_euclidean_distance(self, first, second): + euclidean_distance = np.linalg.norm(first-second) + return euclidean_distance + + def _get_vdm(self, first, second, category): + """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" + if sparse.issparse(self.features): + def f_sparse(c): + N_ax = len(sparse.find(self.features[:, category] == first)[0]) + N_ay = len(sparse.find( + self.features[:, category] == second)[0]) + c_instances = self._get_all_instances_of_label(c) + N_axc = len(sparse.find( + self.features[c_instances, category] == first)[0]) + N_ayc = len(sparse.find( + self.features[c_instances, category] == second)[0]) + p = 
np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay))) + return p + + vdm = np.sum(np.array([f_sparse(c)for c in self.unique_labels])) + return vdm + + category_rows = self.features[:, category] + N_ax = len(np.where(category_rows == first)) + N_ay = len(np.where(category_rows == second)) + + def f(c): + class_instances = self._get_all_instances_of_label(c) + class_instance_rows = category_rows[class_instances] + N_axc = len(np.where(class_instance_rows == first)[0]) + N_ayc = len(np.where(class_instance_rows == second)[0]) + p = abs((N_axc/N_ax)-(N_ayc/N_ay)) + return p + + vdm = np.array([f(c)for c in self.unique_labels]).sum() + return vdm + + def _get_all_instances_of_label(self, label): + if sparse.issparse(self.labels): + return self.labels[:, label].nonzero()[0] + instance_ids = [] + append_instance_id = instance_ids.append + for i, label_set in enumerate(self.labels): + if label in label_set: + append_instance_id(i) + return np.array(instance_ids) + + def _get_mean_imbalance_ratio(self): + ratio_sum = np.sum(np.array( + list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))) + return ratio_sum/len(self.unique_labels) + + def _get_imbalance_ratio_per_label(self, label, labels=None): + sum_h = self._sum_h + if labels is None: + sum_array = np.array([sum_h(l, self.labels) + for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, self.labels) + else: + sum_array = np.array([sum_h(l, labels)for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, labels) + + return ratio + + def _sum_h(self, label, labels): + if sparse.issparse(labels): + return labels[:, label].count_nonzero() + + h_sum = 0 + + def h(l, Y): + if l in Y: + return 1 + else: + return 0 + + for label_set in labels: + h_sum += h(label, label_set) + + return h_sum + + def _get_label_frequencies(self, labels): + """"A support function to get the frequencies of labels""" + frequency_map = np.array(np.unique(labels, return_counts=True)).T + frequencies = np.array([x[1] for x in frequency_map]) + return frequencies + + def _get_most_frequent_value(self, values): + """"A support function to get most frequent value if a list of values""" + uniques, indices = np.unique(values, return_inverse=True) + return uniques[np.argmax(np.bincount(indices))] diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py new file mode 100644 index 000000000..49f1c0317 --- /dev/null +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -0,0 +1,126 @@ +"""Test the module MLSMOTE.""" + + +from collections import Counter + +import pytest + +import numpy as np +from scipy import sparse +from sklearn.preprocessing import MultiLabelBinarizer + + +from imblearn.over_sampling import MLSMOTE + + +def data_heterogneous_ordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, :2] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + # return the categories + return X, y, [2, 3] + + +def data_heterogneous_unordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # 
create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + # return the categories + return X, y, [0, 3] + + +def data_heterogneous_masked(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + # return the categories + return X, y, [True, False, True] + + +def data_sparse(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=np.float64) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.randint(3, size=30) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + labelBinarizer = MultiLabelBinarizer() + y = labelBinarizer.fit_transform(y) + y = sparse.csr_matrix(y) + return X, y, [0, 3] + + +def test_mlsmote_error(): + X, y, _ = data_heterogneous_unordered() + categorical_features = [0, 10] + smote = MLSMOTE(categorical_features=categorical_features) + with pytest.raises(ValueError, match="indices are out of range"): + smote.fit_resample(X, y) + + +@pytest.mark.parametrize( + "data", + [ + data_heterogneous_ordered(), + data_heterogneous_unordered(), + data_heterogneous_masked(), + data_sparse() + ], +) +def test_mlsmote(data): + X, y, categorical_features = data + smote = MLSMOTE(categorical_features=categorical_features) + X_resampled, y_resampled = smote.fit_resample(X, y) + + assert X_resampled.dtype == X.dtype + + categorical_features = np.array(categorical_features) + if categorical_features.dtype == bool: + categorical_features = np.flatnonzero(categorical_features) + for cat_idx in categorical_features: + assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx]) + assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype + + +def test_mlsmote_fit(): + X, y, categorical_features = data_heterogneous_unordered() + smote = MLSMOTE(categorical_features=categorical_features) + smote.fit_resample(X, y) + assert hasattr( + smote, "sampling_strategy_" + ), "No fitted attribute sampling_strategy_" + + +def test_mlsmote_fit_resample(): + X, y, categorical_features = data_heterogneous_unordered() + target_stats = Counter(np.unique( + np.array([a for x in y for a in (x if isinstance(x, list) else [x])]))) + smote = MLSMOTE(categorical_features=categorical_features) + _, y_res = smote.fit_resample(X, y) + classes_res = np.unique( + np.array([a for x in y_res + for a in (x if isinstance(x, list) else [x])])) + _ = Counter(classes_res) + n_samples = max(target_stats.values()) + assert all(value >= n_samples for value in Counter(classes_res).values()) From c049468b2af8c438e51302d3f815bc17b02ea15f Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 14:10:44 -0700 Subject: [PATCH 02/33] Added random_state, documentation and formatting --- imblearn/over_sampling/_mlsmote.py | 233 ++++++++++++++++++++--------- 1 file changed, 159 insertions(+), 74 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 24ba43098..13ce060e1 100644 --- 
a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,14 +1,17 @@ +"""Class to perfrom over-sampling using MLSMOTE.""" + import numpy as np -import itertools -import collections -import random from scipy import sparse + +from sklearn.utils import check_random_state + + class MLSMOTE: """Over-sampling using MLSMOTE. Parameters ---------- - sampling_strategy: 'ranking','union' or 'intersection' default: 'ranking' + sampling_strategy: 'ranking', 'union' or 'intersection' default: 'ranking' Strategy to generate labelsets @@ -17,7 +20,7 @@ class MLSMOTE: samples. categorical_features : ndarray of shape (n_cat_features,) or (n_features,) - Specified which features are categorical. Can either be: + Specifies which features are categorical. Can either be: - array of indices specifying the categorical features; - mask array of shape (n_features, ) and ``bool`` dtype for which @@ -25,18 +28,29 @@ class MLSMOTE: Notes ----- - See the original papers: [1]_ for more details. - + The implementation is based on [1]_. References ---------- - .. [1] Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, Francisco. (2015). - MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. - Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. - + .. [1] Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, + Francisco. (2015). "MLSMOTE: Approaching imbalanced multilabel learning + through synthetic instance generation." + Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. + + Examples + -------- + >>> from sklearn.datasets import make_multilabel_classification """ - def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'): + def __init__( + self, + *, + sampling_strategy="ranking", + categorical_features, + random_state=None, + k_neighbors=5, + ): + self.random_state = random_state self.k_neighbors = k_neighbors self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features @@ -46,19 +60,52 @@ def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranki self.features = [] def fit_resample(self, X, y): + """Resample the dataset. + + Parameters + ---------- + X : {array-like, dataframe, sparse matrix} of shape \ + (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : {array-like, sparse matrix of shape \ + (n_samples, n_labels) + or a list of lists of labels. + See "sklearn.datasets.make_multilabel_classification" and \ + the "return_indicate" input parameter for more \ + information on possible label sets formats. + + Corresponding label sets for each sample in X. Sparse matrix \ + should be of CSR format. + + Returns + ------- + X_resampled : {array-like, dataframe, sparse matrix} of shape \ + (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : array-like of shape (n_samples_new, n_labels) + The corresponding label sets of `X_resampled`. 
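As an illustrative aside (not part of the patch): the label-set formats this docstring describes can be produced from one another, and the tests build the sparse variant the same way.

    from scipy import sparse
    from sklearn.preprocessing import MultiLabelBinarizer

    y_lists = [[0, 2], [1], [0, 1, 2]]                      # list of label lists
    y_dense = MultiLabelBinarizer().fit_transform(y_lists)  # (n_samples, n_labels) indicator array
    y_sparse = sparse.csr_matrix(y_dense)                   # CSR indicator matrix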
+ """ self.n_features_ = X.shape[1] self._validate_estimator() + random_state = check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() - if sparse.issparse(y): + if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: self.labels = y self.unique_labels = range(0, y.shape[1]) - else: + elif type(y) == list: self.labels = np.array([np.array(xi) for xi in y]) self.unique_labels = self._collect_unique_labels(y) + else: + raise TypeError( + "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" + " or 'list'" + ) self.features = X X_synth = [] @@ -67,7 +114,7 @@ def fit_resample(self, X, y): append_X_synth = X_synth.append append_y_synth = y_synth.append mean_ir = self._get_mean_imbalance_ratio() - + if sparse.issparse(y): y_synth = None @@ -77,11 +124,15 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label) for sample in min_bag: distances = self._calc_distances(sample, min_bag) - distances = np.sort(distances, order='distance') - neighbours = distances[:self.k_neighbors] - ref_neigh = np.random.choice(neighbours, 1)[0] + distances = np.sort(distances, order="distance") + neighbours = distances[: self.k_neighbors] + ref_neigh = random_state.choice(neighbours, 1)[0] X_new, y_new = self._create_new_sample( - sample, ref_neigh[1], [x[1] for x in neighbours]) + sample, + ref_neigh[1], + [x[1] for x in neighbours], + random_state, + ) append_X_synth(X_new) y_resambled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled @@ -92,29 +143,32 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label) for sample in min_bag: distances = self._calc_distances(sample, min_bag) - distances = np.sort(distances, order='distance') - neighbours = distances[:self.k_neighbors] - ref_neigh = np.random.choice(neighbours, 1)[0] + distances = np.sort(distances, order="distance") + neighbours = distances[: self.k_neighbors] + ref_neigh = random_state.choice(neighbours, 1)[0] X_new, y_new = self._create_new_sample( - sample, ref_neigh[1], [x[1] for x in neighbours]) + sample, + ref_neigh[1], + [x[1] for x in neighbours], + random_state, + ) append_X_synth(X_new) append_y_synth(y_new) - return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth) + return np.concatenate((X_resampled, np.array(X_synth))), np.array( + y_resampled.tolist() + y_synth + ) def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.name == "bool": + if categorical_features.dtype.name == bool: self.categorical_features_ = np.flatnonzero(categorical_features) else: if any( - [ - cat not in np.arange(self.n_features_) - for cat in categorical_features - ] + [cat not in np.arange(self.n_features_) for cat in categorical_features] ): raise ValueError( "Some of the categorical indices are out of range. 
Indices" - " should be between 0 and {}".format(self.n_features_) + f" should be between 0 and {self.n_features_ - 1}" ) self.categorical_features_ = categorical_features self.continuous_features_ = np.setdiff1d( @@ -122,10 +176,22 @@ def _validate_estimator(self): ) def _collect_unique_labels(self, y): - """A support function that flattens the labelsets and return one set of unique labels""" - return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) + """A support function that flattens the labelsets and return one set of unique + labels + """ + return np.unique( + np.array( + [ + label + for label_set in y + for label in ( + label_set if isinstance(label_set, list) else [label_set] + ) + ] + ) + ) - def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): + def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_state): sample = self.features[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] @@ -133,26 +199,27 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): for i in range(synth_sample.shape[0]): if i in self.continuous_features_: - diff = ref_neigh[i]-sample[i] - offset = diff*random.uniform(0, 1) - synth_sample[i] = sample[i]+offset + diff = ref_neigh[i] - sample[i] + offset = diff * random_state.uniform(0, 1) + synth_sample[i] = sample[i] + offset if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - self.features[neighbour_ids, i]) + self.features[neighbour_ids, i] + ) X = synth_sample if sparse.issparse(self.labels): neighbours_labels = self.labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) y = np.zeros((1, len(self.unique_labels))) - if self.sampling_strategy_ == 'ranking': - head_index = int((self.k_neighbors + 1)/2) + if self.sampling_strategy_ == "ranking": + head_index = int((self.k_neighbors + 1) / 2) choosen_labels = possible_labels.nonzero()[1][:head_index] y[0, choosen_labels] = 1 - if self.sampling_strategy_ == 'union': + if self.sampling_strategy_ == "union": choosen_labels = possible_labels.nonzero()[0] y[choosen_labels] = 1 - if self.sampling_strategy_ == 'intersection': + if self.sampling_strategy_ == "intersection": choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) y[choosen_labels] = 1 y = sparse.csr_matrix(y) @@ -160,54 +227,72 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): else: neighbours_labels = [] for ni in neighbour_ids: - neighbours_labels.append(self.labels[ni].tolist()) + neighbours_labels.append(self.labels[ni].tolist()) labels = [] # sample_labels.tolist() - labels += [a for x in neighbours_labels for a in ( - x if isinstance(x, list) else [x])] + labels += [ + a + for x in neighbours_labels + for a in (x if isinstance(x, list) else [x]) + ] labels = list(set(labels)) - if self.sampling_strategy_ == 'ranking': - head_index = int((self.k_neighbors + 1)/2) + if self.sampling_strategy_ == "ranking": + head_index = int((self.k_neighbors + 1) / 2) y = labels[:head_index] - if self.sampling_strategy_ == 'union': + if self.sampling_strategy_ == "union": y = labels[:] - if self.sampling_strategy_ == 'intersection': + if self.sampling_strategy_ == "intersection": y = list(set.intersection(*neighbours_labels)) return X, y def _calc_distances(self, sample, min_bag): def calc_dist(bag_sample): - nominal_distance = sum([self._get_vdm( - self.features[sample, cat], self.features[bag_sample, cat], cat)for cat in self.categorical_features_]) - 
ordinal_distance = sum([self._get_euclidean_distance( - self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_]) + nominal_distance = sum( + [ + self._get_vdm( + self.features[sample, cat], self.features[bag_sample, cat], cat + ) + for cat in self.categorical_features_ + ] + ) + ordinal_distance = sum( + [ + self._get_euclidean_distance( + self.features[sample, num], self.features[bag_sample, num] + ) + for num in self.continuous_features_ + ] + ) dist = sum([nominal_distance, ordinal_distance]) return (dist, bag_sample) + distances = [calc_dist(bag_sample) for bag_sample in min_bag] - dtype = np.dtype([('distance', float), ('index', int)]) + dtype = np.dtype([("distance", float), ("index", int)]) return np.array(distances, dtype=dtype) def _get_euclidean_distance(self, first, second): - euclidean_distance = np.linalg.norm(first-second) + euclidean_distance = np.linalg.norm(first - second) return euclidean_distance def _get_vdm(self, first, second, category): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if sparse.issparse(self.features): + def f_sparse(c): N_ax = len(sparse.find(self.features[:, category] == first)[0]) - N_ay = len(sparse.find( - self.features[:, category] == second)[0]) + N_ay = len(sparse.find(self.features[:, category] == second)[0]) c_instances = self._get_all_instances_of_label(c) - N_axc = len(sparse.find( - self.features[c_instances, category] == first)[0]) - N_ayc = len(sparse.find( - self.features[c_instances, category] == second)[0]) - p = np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay))) + N_axc = len( + sparse.find(self.features[c_instances, category] == first)[0] + ) + N_ayc = len( + sparse.find(self.features[c_instances, category] == second)[0] + ) + p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p - vdm = np.sum(np.array([f_sparse(c)for c in self.unique_labels])) + vdm = np.sum(np.array([f_sparse(c) for c in self.unique_labels])) return vdm category_rows = self.features[:, category] @@ -219,10 +304,10 @@ def f(c): class_instance_rows = category_rows[class_instances] N_axc = len(np.where(class_instance_rows == first)[0]) N_ayc = len(np.where(class_instance_rows == second)[0]) - p = abs((N_axc/N_ax)-(N_ayc/N_ay)) + p = abs((N_axc / N_ax) - (N_ayc / N_ay)) return p - vdm = np.array([f(c)for c in self.unique_labels]).sum() + vdm = np.array([f(c) for c in self.unique_labels]).sum() return vdm def _get_all_instances_of_label(self, label): @@ -236,19 +321,19 @@ def _get_all_instances_of_label(self, label): return np.array(instance_ids) def _get_mean_imbalance_ratio(self): - ratio_sum = np.sum(np.array( - list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))) - return ratio_sum/len(self.unique_labels) + ratio_sum = np.sum( + np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))) + ) + return ratio_sum / len(self.unique_labels) def _get_imbalance_ratio_per_label(self, label, labels=None): sum_h = self._sum_h if labels is None: - sum_array = np.array([sum_h(l, self.labels) - for l in self.unique_labels]) - ratio = sum_array.max()/sum_h(label, self.labels) + sum_array = np.array([sum_h(l, self.labels) for l in self.unique_labels]) + ratio = sum_array.max() / sum_h(label, self.labels) else: - sum_array = np.array([sum_h(l, labels)for l in self.unique_labels]) - ratio = sum_array.max()/sum_h(label, labels) + sum_array = np.array([sum_h(l, labels) for l in self.unique_labels]) + ratio = sum_array.max() / 
sum_h(label, labels) return ratio @@ -270,12 +355,12 @@ def h(l, Y): return h_sum def _get_label_frequencies(self, labels): - """"A support function to get the frequencies of labels""" + """A support function to get the frequencies of labels""" frequency_map = np.array(np.unique(labels, return_counts=True)).T frequencies = np.array([x[1] for x in frequency_map]) return frequencies def _get_most_frequent_value(self, values): - """"A support function to get most frequent value if a list of values""" + """A support function to get most frequent value if a list of values""" uniques, indices = np.unique(values, return_inverse=True) return uniques[np.argmax(np.bincount(indices))] From 192519b429ba3aa799aaaacda5ef6fd5847a8a50 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 15:27:17 -0700 Subject: [PATCH 03/33] Refactor code to remove self.unique_labels --- imblearn/over_sampling/_mlsmote.py | 75 ++++++++++++++++++------------ 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 13ce060e1..989dd8286 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,5 +1,6 @@ """Class to perfrom over-sampling using MLSMOTE.""" +import itertools import numpy as np from scipy import sparse @@ -55,7 +56,6 @@ def __init__( self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features self.continuous_features_ = None - self.unique_labels = [] self.labels = [] self.features = [] @@ -96,11 +96,11 @@ def fit_resample(self, X, y): y_resampled = y.copy() if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: - self.labels = y - self.unique_labels = range(0, y.shape[1]) + labels = y + unique_labels = range(0, y.shape[1]) elif type(y) == list: - self.labels = np.array([np.array(xi) for xi in y]) - self.unique_labels = self._collect_unique_labels(y) + labels = np.array([np.array(xi) for xi in y], dtype=object) + unique_labels = self._collect_unique_labels(y) else: raise TypeError( "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" @@ -113,17 +113,19 @@ def fit_resample(self, X, y): append_X_synth = X_synth.append append_y_synth = y_synth.append - mean_ir = self._get_mean_imbalance_ratio() + mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) if sparse.issparse(y): y_synth = None - for label in self.unique_labels: - irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + for label in unique_labels: + irlbl = self._get_imbalance_ratio_per_label( + label, unique_labels, y_resampled + ) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) for sample in min_bag: - distances = self._calc_distances(sample, min_bag) + distances = self._calc_distances(sample, min_bag, unique_labels) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -131,18 +133,21 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + unique_labels, random_state, ) append_X_synth(X_new) y_resambled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: - for index, label in np.ndenumerate(self.unique_labels): - irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + for index, label in np.ndenumerate(unique_labels): + irlbl = self._get_imbalance_ratio_per_label( + label, unique_labels, y_resampled + ) if irlbl > mean_ir: min_bag = 
self._get_all_instances_of_label(label) for sample in min_bag: - distances = self._calc_distances(sample, min_bag) + distances = self._calc_distances(sample, min_bag, unique_labels) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -150,6 +155,7 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + unique_labels, random_state, ) append_X_synth(X_new) @@ -191,7 +197,9 @@ def _collect_unique_labels(self, y): ) ) - def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_state): + def _create_new_sample( + self, sample_id, ref_neigh_id, neighbour_ids, unique_labels, random_state + ): sample = self.features[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] @@ -211,7 +219,7 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_stat if sparse.issparse(self.labels): neighbours_labels = self.labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) - y = np.zeros((1, len(self.unique_labels))) + y = np.zeros((1, len(unique_labels))) if self.sampling_strategy_ == "ranking": head_index = int((self.k_neighbors + 1) / 2) choosen_labels = possible_labels.nonzero()[1][:head_index] @@ -246,12 +254,15 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_stat return X, y - def _calc_distances(self, sample, min_bag): + def _calc_distances(self, sample, min_bag, unique_labels): def calc_dist(bag_sample): nominal_distance = sum( [ self._get_vdm( - self.features[sample, cat], self.features[bag_sample, cat], cat + self.features[sample, cat], + self.features[bag_sample, cat], + cat, + unique_labels, ) for cat in self.categorical_features_ ] @@ -275,7 +286,7 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, category): + def _get_vdm(self, first, second, category, unique_labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if sparse.issparse(self.features): @@ -292,7 +303,7 @@ def f_sparse(c): p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p - vdm = np.sum(np.array([f_sparse(c) for c in self.unique_labels])) + vdm = np.sum(np.array([f_sparse(c) for c in unique_labels])) return vdm category_rows = self.features[:, category] @@ -307,7 +318,7 @@ def f(c): p = abs((N_axc / N_ax) - (N_ayc / N_ay)) return p - vdm = np.array([f(c) for c in self.unique_labels]).sum() + vdm = np.array([f(c) for c in unique_labels]).sum() return vdm def _get_all_instances_of_label(self, label): @@ -320,21 +331,25 @@ def _get_all_instances_of_label(self, label): append_instance_id(i) return np.array(instance_ids) - def _get_mean_imbalance_ratio(self): + def _get_mean_imbalance_ratio(self, unique_labels, labels): ratio_sum = np.sum( - np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))) + np.array( + list( + map( + self._get_imbalance_ratio_per_label, + unique_labels, + itertools.repeat(unique_labels), + itertools.repeat(labels), + ) + ) + ) ) - return ratio_sum / len(self.unique_labels) + return ratio_sum / len(unique_labels) - def _get_imbalance_ratio_per_label(self, label, labels=None): + def _get_imbalance_ratio_per_label(self, label, unique_labels, labels): sum_h = self._sum_h - if labels is None: - sum_array = np.array([sum_h(l, self.labels) for l in self.unique_labels]) - 
ratio = sum_array.max() / sum_h(label, self.labels) - else: - sum_array = np.array([sum_h(l, labels) for l in self.unique_labels]) - ratio = sum_array.max() / sum_h(label, labels) - + sum_array = np.array([sum_h(l, labels) for l in unique_labels]) + ratio = sum_array.max() / sum_h(label, labels) return ratio def _sum_h(self, label, labels): From cefac5367e9b95608dc9d38cc7c0e7018e3e1c99 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 18:53:43 -0700 Subject: [PATCH 04/33] Refactor code to avoid redundant calculations of the IRLbl numerator --- imblearn/over_sampling/_mlsmote.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 989dd8286..53fedc8ad 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -119,8 +119,11 @@ def fit_resample(self, X, y): y_synth = None for label in unique_labels: + irlbl_num = self._get_imbalance_ratio_numerator( + unique_labels, y_resampled + ) irlbl = self._get_imbalance_ratio_per_label( - label, unique_labels, y_resampled + label, irlbl_num, y_resampled ) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) @@ -141,8 +144,11 @@ def fit_resample(self, X, y): return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: for index, label in np.ndenumerate(unique_labels): + irlbl_num = self._get_imbalance_ratio_numerator( + unique_labels, y_resampled + ) irlbl = self._get_imbalance_ratio_per_label( - label, unique_labels, y_resampled + label, irlbl_num, y_resampled ) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) @@ -332,13 +338,14 @@ def _get_all_instances_of_label(self, label): return np.array(instance_ids) def _get_mean_imbalance_ratio(self, unique_labels, labels): + irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, labels) ratio_sum = np.sum( np.array( list( map( self._get_imbalance_ratio_per_label, unique_labels, - itertools.repeat(unique_labels), + itertools.repeat(irlbl_num), itertools.repeat(labels), ) ) @@ -346,11 +353,12 @@ def _get_mean_imbalance_ratio(self, unique_labels, labels): ) return ratio_sum / len(unique_labels) - def _get_imbalance_ratio_per_label(self, label, unique_labels, labels): - sum_h = self._sum_h - sum_array = np.array([sum_h(l, labels) for l in unique_labels]) - ratio = sum_array.max() / sum_h(label, labels) - return ratio + def _get_imbalance_ratio_numerator(self, unique_labels, labels): + sum_array = np.array([self._sum_h(label, labels) for label in unique_labels]) + return sum_array.max() + + def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): + return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): if sparse.issparse(labels): From 600d51ad8c1a8128272b44df23a2743634ea6fb2 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 09:14:40 -0700 Subject: [PATCH 05/33] Refactor code to remove self.labels --- imblearn/over_sampling/_mlsmote.py | 58 ++++++++++++++++++------------ 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 53fedc8ad..ba4ae7d9f 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -56,7 +56,6 @@ def __init__( self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features self.continuous_features_ = None - self.labels = [] self.features = [] def 
fit_resample(self, X, y): @@ -126,9 +125,11 @@ def fit_resample(self, X, y): label, irlbl_num, y_resampled ) if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label) + min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: - distances = self._calc_distances(sample, min_bag, unique_labels) + distances = self._calc_distances( + sample, min_bag, unique_labels, labels + ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -137,6 +138,7 @@ def fit_resample(self, X, y): ref_neigh[1], [x[1] for x in neighbours], unique_labels, + labels, random_state, ) append_X_synth(X_new) @@ -151,9 +153,11 @@ def fit_resample(self, X, y): label, irlbl_num, y_resampled ) if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label) + min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: - distances = self._calc_distances(sample, min_bag, unique_labels) + distances = self._calc_distances( + sample, min_bag, unique_labels, labels + ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -162,6 +166,7 @@ def fit_resample(self, X, y): ref_neigh[1], [x[1] for x in neighbours], unique_labels, + labels, random_state, ) append_X_synth(X_new) @@ -204,12 +209,18 @@ def _collect_unique_labels(self, y): ) def _create_new_sample( - self, sample_id, ref_neigh_id, neighbour_ids, unique_labels, random_state + self, + sample_id, + ref_neigh_id, + neighbour_ids, + unique_labels, + labels, + random_state, ): sample = self.features[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] - sample_labels = self.labels[sample_id] + sample_labels = labels[sample_id] for i in range(synth_sample.shape[0]): if i in self.continuous_features_: @@ -222,8 +233,8 @@ def _create_new_sample( ) X = synth_sample - if sparse.issparse(self.labels): - neighbours_labels = self.labels[neighbour_ids] + if sparse.issparse(labels): + neighbours_labels = labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) y = np.zeros((1, len(unique_labels))) if self.sampling_strategy_ == "ranking": @@ -241,26 +252,26 @@ def _create_new_sample( else: neighbours_labels = [] for ni in neighbour_ids: - neighbours_labels.append(self.labels[ni].tolist()) + neighbours_labels.append(labels[ni].tolist()) - labels = [] # sample_labels.tolist() - labels += [ + new_labels = [] # sample_labels.tolist() + new_labels += [ a for x in neighbours_labels for a in (x if isinstance(x, list) else [x]) ] - labels = list(set(labels)) + new_labels = list(set(new_labels)) if self.sampling_strategy_ == "ranking": head_index = int((self.k_neighbors + 1) / 2) - y = labels[:head_index] + y = new_labels[:head_index] if self.sampling_strategy_ == "union": - y = labels[:] + y = new_labels[:] if self.sampling_strategy_ == "intersection": y = list(set.intersection(*neighbours_labels)) return X, y - def _calc_distances(self, sample, min_bag, unique_labels): + def _calc_distances(self, sample, min_bag, unique_labels, labels): def calc_dist(bag_sample): nominal_distance = sum( [ @@ -269,6 +280,7 @@ def calc_dist(bag_sample): self.features[bag_sample, cat], cat, unique_labels, + labels, ) for cat in self.categorical_features_ ] @@ -292,14 +304,14 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, 
second, category, unique_labels): + def _get_vdm(self, first, second, category, unique_labels, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if sparse.issparse(self.features): def f_sparse(c): N_ax = len(sparse.find(self.features[:, category] == first)[0]) N_ay = len(sparse.find(self.features[:, category] == second)[0]) - c_instances = self._get_all_instances_of_label(c) + c_instances = self._get_all_instances_of_label(c, labels) N_axc = len( sparse.find(self.features[c_instances, category] == first)[0] ) @@ -317,7 +329,7 @@ def f_sparse(c): N_ay = len(np.where(category_rows == second)) def f(c): - class_instances = self._get_all_instances_of_label(c) + class_instances = self._get_all_instances_of_label(c, labels) class_instance_rows = category_rows[class_instances] N_axc = len(np.where(class_instance_rows == first)[0]) N_ayc = len(np.where(class_instance_rows == second)[0]) @@ -327,12 +339,12 @@ def f(c): vdm = np.array([f(c) for c in unique_labels]).sum() return vdm - def _get_all_instances_of_label(self, label): - if sparse.issparse(self.labels): - return self.labels[:, label].nonzero()[0] + def _get_all_instances_of_label(self, label, labels): + if sparse.issparse(labels): + return labels[:, label].nonzero()[0] instance_ids = [] append_instance_id = instance_ids.append - for i, label_set in enumerate(self.labels): + for i, label_set in enumerate(labels): if label in label_set: append_instance_id(i) return np.array(instance_ids) From a96cefa3d13adf16253d338c47ed40f4204305d6 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 09:57:36 -0700 Subject: [PATCH 06/33] Return over-sampled y as a list of lists --- imblearn/over_sampling/_mlsmote.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index ba4ae7d9f..c3b304bab 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -114,7 +114,7 @@ def fit_resample(self, X, y): append_y_synth = y_synth.append mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) - if sparse.issparse(y): + if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: y_synth = None for label in unique_labels: @@ -171,9 +171,8 @@ def fit_resample(self, X, y): ) append_X_synth(X_new) append_y_synth(y_new) - return np.concatenate((X_resampled, np.array(X_synth))), np.array( - y_resampled.tolist() + y_synth - ) + y_resampled.extend(y_synth) + return np.concatenate((X_resampled, np.array(X_synth))), y_resampled def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) From 735a8925d6aa4c775cfc0082b88b76bda5b14572 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 10:15:03 -0700 Subject: [PATCH 07/33] Fix small bug on variable name --- imblearn/over_sampling/_mlsmote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index c3b304bab..1661a30ab 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -142,7 +142,7 @@ def fit_resample(self, X, y): random_state, ) append_X_synth(X_new) - y_resambled = sparse.vstack((y_resampled, y_new)) + y_resampled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: for index, label in np.ndenumerate(unique_labels): From 783486fc29f820df280134b63499886df66c70b6 Mon Sep 
17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 10:16:23 -0700 Subject: [PATCH 08/33] Remove unnecessary index variable --- imblearn/over_sampling/_mlsmote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1661a30ab..c5165206b 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -145,7 +145,7 @@ def fit_resample(self, X, y): y_resampled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: - for index, label in np.ndenumerate(unique_labels): + for label in unique_labels: irlbl_num = self._get_imbalance_ratio_numerator( unique_labels, y_resampled ) From 6541cdb7c18dd50c605916af598fb174c9803e88 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 11:03:12 -0700 Subject: [PATCH 09/33] Handle case where y is a dense array when calculating mean IR and IRLbl --- imblearn/over_sampling/_mlsmote.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index c5165206b..410c81d82 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -232,7 +232,7 @@ def _create_new_sample( ) X = synth_sample - if sparse.issparse(labels): + if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: neighbours_labels = labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) y = np.zeros((1, len(unique_labels))) @@ -305,7 +305,7 @@ def _get_euclidean_distance(self, first, second): def _get_vdm(self, first, second, category, unique_labels, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" - if sparse.issparse(self.features): + if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: def f_sparse(c): N_ax = len(sparse.find(self.features[:, category] == first)[0]) @@ -339,7 +339,7 @@ def f(c): return vdm def _get_all_instances_of_label(self, label, labels): - if sparse.issparse(labels): + if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: return labels[:, label].nonzero()[0] instance_ids = [] append_instance_id = instance_ids.append @@ -372,21 +372,23 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): - if sparse.issparse(labels): + if type(labels) == sparse._csr.csr_matrix: return labels[:, label].count_nonzero() + elif type(labels) == np.ndarray: + return np.count_nonzero(labels[:, label]) + else: + h_sum = 0 - h_sum = 0 - - def h(l, Y): - if l in Y: - return 1 - else: - return 0 + def h(l, Y): + if l in Y: + return 1 + else: + return 0 - for label_set in labels: - h_sum += h(label, label_set) + for label_set in labels: + h_sum += h(label, label_set) - return h_sum + return h_sum def _get_label_frequencies(self, labels): """A support function to get the frequencies of labels""" From 9f2d7cf706b25f9f7afca22575b341dec0172e8d Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 12:07:11 -0700 Subject: [PATCH 10/33] Refactor code to remove self.features --- imblearn/over_sampling/_mlsmote.py | 40 ++++++++++++++---------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 410c81d82..156e2d0a3 
100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -56,7 +56,6 @@ def __init__( self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features self.continuous_features_ = None - self.features = [] def fit_resample(self, X, y): """Resample the dataset. @@ -105,7 +104,6 @@ def fit_resample(self, X, y): "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" " or 'list'" ) - self.features = X X_synth = [] y_synth = [] @@ -128,7 +126,7 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: distances = self._calc_distances( - sample, min_bag, unique_labels, labels + sample, min_bag, X, unique_labels, labels ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] @@ -137,6 +135,7 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + X, unique_labels, labels, random_state, @@ -156,7 +155,7 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: distances = self._calc_distances( - sample, min_bag, unique_labels, labels + sample, min_bag, X, unique_labels, labels ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] @@ -165,6 +164,7 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + X, unique_labels, labels, random_state, @@ -212,13 +212,14 @@ def _create_new_sample( sample_id, ref_neigh_id, neighbour_ids, + features, unique_labels, labels, random_state, ): - sample = self.features[sample_id] + sample = features[sample_id] synth_sample = np.copy(sample) - ref_neigh = self.features[ref_neigh_id] + ref_neigh = features[ref_neigh_id] sample_labels = labels[sample_id] for i in range(synth_sample.shape[0]): @@ -228,7 +229,7 @@ def _create_new_sample( synth_sample[i] = sample[i] + offset if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - self.features[neighbour_ids, i] + features[neighbour_ids, i] ) X = synth_sample @@ -270,13 +271,14 @@ def _create_new_sample( return X, y - def _calc_distances(self, sample, min_bag, unique_labels, labels): + def _calc_distances(self, sample, min_bag, features, unique_labels, labels): def calc_dist(bag_sample): nominal_distance = sum( [ self._get_vdm( - self.features[sample, cat], - self.features[bag_sample, cat], + features[sample, cat], + features[bag_sample, cat], + features, cat, unique_labels, labels, @@ -287,7 +289,7 @@ def calc_dist(bag_sample): ordinal_distance = sum( [ self._get_euclidean_distance( - self.features[sample, num], self.features[bag_sample, num] + features[sample, num], features[bag_sample, num] ) for num in self.continuous_features_ ] @@ -303,27 +305,23 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, category, unique_labels, labels): + def _get_vdm(self, first, second, features, category, unique_labels, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: def f_sparse(c): - N_ax = len(sparse.find(self.features[:, category] == first)[0]) - N_ay = len(sparse.find(self.features[:, category] == second)[0]) + N_ax = len(sparse.find(features[:, category] == first)[0]) + N_ay = len(sparse.find(features[:, category] 
== second)[0]) c_instances = self._get_all_instances_of_label(c, labels) - N_axc = len( - sparse.find(self.features[c_instances, category] == first)[0] - ) - N_ayc = len( - sparse.find(self.features[c_instances, category] == second)[0] - ) + N_axc = len(sparse.find(features[c_instances, category] == first)[0]) + N_ayc = len(sparse.find(features[c_instances, category] == second)[0]) p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p vdm = np.sum(np.array([f_sparse(c) for c in unique_labels])) return vdm - category_rows = self.features[:, category] + category_rows = features[:, category] N_ax = len(np.where(category_rows == first)) N_ay = len(np.where(category_rows == second)) From 60fbcd129b9aec19de748c22a6d565b8f7fe3109 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 13:06:27 -0700 Subject: [PATCH 11/33] Add TODO to fix case where mean IR is infinity --- imblearn/over_sampling/_mlsmote.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 156e2d0a3..e904b4606 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -110,6 +110,10 @@ def fit_resample(self, X, y): append_X_synth = X_synth.append append_y_synth = y_synth.append + + """TODO: Handle the case where 'mean_ir' is infinity. Happens when one label has + no samples + """ mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: From 04aae006a207ff29efe4ecdc37713f608f72c77d Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 13:22:42 -0700 Subject: [PATCH 12/33] Handle/fix case where X_synth is empty --- imblearn/over_sampling/_mlsmote.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index e904b4606..9af4bcf3d 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -105,10 +105,9 @@ def fit_resample(self, X, y): " or 'list'" ) - X_synth = [] + X_synth = np.array([]).reshape(0, self.n_features_) y_synth = [] - append_X_synth = X_synth.append append_y_synth = y_synth.append """TODO: Handle the case where 'mean_ir' is infinity. Happens when one label has @@ -144,7 +143,7 @@ def fit_resample(self, X, y): labels, random_state, ) - append_X_synth(X_new) + X_synth = np.vstack((X_synth, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: @@ -173,7 +172,7 @@ def fit_resample(self, X, y): labels, random_state, ) - append_X_synth(X_new) + X_synth = np.vstack((X_synth, X_new)) append_y_synth(y_new) y_resampled.extend(y_synth) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled From a4e70fd5a0f1520fdc80a458527636cdf8963fa8 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 09:17:01 -0700 Subject: [PATCH 13/33] Refactored code to reduce branching. 
Input label always converted to sparse matrix --- imblearn/over_sampling/_mlsmote.py | 224 +++++++++-------------------- 1 file changed, 67 insertions(+), 157 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 9af4bcf3d..25b6da201 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -15,9 +15,8 @@ class MLSMOTE: sampling_strategy: 'ranking', 'union' or 'intersection' default: 'ranking' Strategy to generate labelsets - k_neighbors : int or object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic + If ``int``, number of nearest neighbors used to construct synthetic samples. categorical_features : ndarray of shape (n_cat_features,) or (n_features,) @@ -91,91 +90,55 @@ def fit_resample(self, X, y): random_state = check_random_state(self.random_state) X_resampled = X.copy() - y_resampled = y.copy() - if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: - labels = y - unique_labels = range(0, y.shape[1]) + # Convert 'y' to a sparse matrix + if type(y) == sparse._csr.csr_matrix: + y_resampled = y.copy() + unique_labels = range(0, y_resampled.shape[1]) + elif type(y) == np.ndarray: + y_resampled = sparse.csr_matrix(y, dtype=int) + unique_labels = range(0, y_resampled.shape[1]) elif type(y) == list: - labels = np.array([np.array(xi) for xi in y], dtype=object) unique_labels = self._collect_unique_labels(y) + y_resampled = sparse.csr_matrix((len(y), len(unique_labels))) + for i, sample in enumerate(y): + for label in sample: + y_resampled[i, label] = 1 else: raise TypeError( - "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" - " or 'list'" + "'y' can only be of type 'numpy.ndarray', " + "'scipy.sparse._csr.csr_matrix' or 'list'" ) - X_synth = np.array([]).reshape(0, self.n_features_) - y_synth = [] - - append_y_synth = y_synth.append - """TODO: Handle the case where 'mean_ir' is infinity. 
Happens when one label has no samples """ - mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) - - if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: - y_synth = None - - for label in unique_labels: - irlbl_num = self._get_imbalance_ratio_numerator( - unique_labels, y_resampled - ) - irlbl = self._get_imbalance_ratio_per_label( - label, irlbl_num, y_resampled - ) - if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label, labels) - for sample in min_bag: - distances = self._calc_distances( - sample, min_bag, X, unique_labels, labels - ) - distances = np.sort(distances, order="distance") - neighbours = distances[: self.k_neighbors] - ref_neigh = random_state.choice(neighbours, 1)[0] - X_new, y_new = self._create_new_sample( - sample, - ref_neigh[1], - [x[1] for x in neighbours], - X, - unique_labels, - labels, - random_state, - ) - X_synth = np.vstack((X_synth, X_new)) - y_resampled = sparse.vstack((y_resampled, y_new)) - return np.concatenate((X_resampled, np.array(X_synth))), y_resampled - else: - for label in unique_labels: - irlbl_num = self._get_imbalance_ratio_numerator( - unique_labels, y_resampled - ) - irlbl = self._get_imbalance_ratio_per_label( - label, irlbl_num, y_resampled - ) - if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label, labels) - for sample in min_bag: - distances = self._calc_distances( - sample, min_bag, X, unique_labels, labels - ) - distances = np.sort(distances, order="distance") - neighbours = distances[: self.k_neighbors] - ref_neigh = random_state.choice(neighbours, 1)[0] - X_new, y_new = self._create_new_sample( - sample, - ref_neigh[1], - [x[1] for x in neighbours], - X, - unique_labels, - labels, - random_state, - ) - X_synth = np.vstack((X_synth, X_new)) - append_y_synth(y_new) - y_resampled.extend(y_synth) - return np.concatenate((X_resampled, np.array(X_synth))), y_resampled + mean_ir = self._get_mean_imbalance_ratio(unique_labels, y_resampled) + + for label in unique_labels: + irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, y_resampled) + irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) + if irlbl > mean_ir: + min_bag = self._get_all_instances_of_label(label, y_resampled) + for sample in min_bag: + distances = self._calc_distances( + sample, min_bag, X_resampled, unique_labels, y_resampled + ) + distances = np.sort(distances, order="distance") + neighbors = distances[: self.k_neighbors] + ref_neigh = random_state.choice(neighbors, 1)[0] + X_new, y_new = self._create_new_sample( + sample, + ref_neigh[1], + [x[1] for x in neighbors], + X_resampled, + unique_labels, + y_resampled, + random_state, + ) + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = sparse.vstack((y_resampled, y_new)) + return X_resampled, y_resampled def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) @@ -194,27 +157,11 @@ def _validate_estimator(self): np.arange(self.n_features_), self.categorical_features_ ) - def _collect_unique_labels(self, y): - """A support function that flattens the labelsets and return one set of unique - labels - """ - return np.unique( - np.array( - [ - label - for label_set in y - for label in ( - label_set if isinstance(label_set, list) else [label_set] - ) - ] - ) - ) - def _create_new_sample( self, sample_id, ref_neigh_id, - neighbour_ids, + neighbor_ids, features, unique_labels, labels, @@ -223,7 +170,6 @@ def _create_new_sample( sample = features[sample_id] synth_sample = np.copy(sample) ref_neigh = 
features[ref_neigh_id] - sample_labels = labels[sample_id] for i in range(synth_sample.shape[0]): if i in self.continuous_features_: @@ -232,48 +178,33 @@ def _create_new_sample( synth_sample[i] = sample[i] + offset if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - features[neighbour_ids, i] + features[neighbor_ids, i] ) X = synth_sample - if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: - neighbours_labels = labels[neighbour_ids] - possible_labels = neighbours_labels.sum(axis=0) - y = np.zeros((1, len(unique_labels))) - if self.sampling_strategy_ == "ranking": - head_index = int((self.k_neighbors + 1) / 2) - choosen_labels = possible_labels.nonzero()[1][:head_index] - y[0, choosen_labels] = 1 - if self.sampling_strategy_ == "union": - choosen_labels = possible_labels.nonzero()[0] - y[choosen_labels] = 1 - if self.sampling_strategy_ == "intersection": - choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) - y[choosen_labels] = 1 - y = sparse.csr_matrix(y) - - else: - neighbours_labels = [] - for ni in neighbour_ids: - neighbours_labels.append(labels[ni].tolist()) - - new_labels = [] # sample_labels.tolist() - new_labels += [ - a - for x in neighbours_labels - for a in (x if isinstance(x, list) else [x]) - ] - new_labels = list(set(new_labels)) - if self.sampling_strategy_ == "ranking": - head_index = int((self.k_neighbors + 1) / 2) - y = new_labels[:head_index] - if self.sampling_strategy_ == "union": - y = new_labels[:] - if self.sampling_strategy_ == "intersection": - y = list(set.intersection(*neighbours_labels)) + neighbors_labels = labels[neighbor_ids] + possible_labels = neighbors_labels.sum(axis=0) + y = np.zeros((1, len(unique_labels))) + if self.sampling_strategy_ == "ranking": + head_index = int((self.k_neighbors + 1) / 2) + choosen_labels = possible_labels.nonzero()[1][:head_index] + y[0, choosen_labels] = 1 + if self.sampling_strategy_ == "union": + choosen_labels = possible_labels.nonzero()[0] + y[choosen_labels] = 1 + if self.sampling_strategy_ == "intersection": + choosen_labels = sparse.find(possible_labels == len(neighbors_labels)) + y[choosen_labels] = 1 + y = sparse.csr_matrix(y) return X, y + def _collect_unique_labels(self, y): + """A support function that flattens the labelsets and return one set of unique + labels + """ + return np.unique(np.array([label for label_set in y for label in label_set])) + def _calc_distances(self, sample, min_bag, features, unique_labels, labels): def calc_dist(bag_sample): nominal_distance = sum( @@ -309,7 +240,9 @@ def _get_euclidean_distance(self, first, second): return euclidean_distance def _get_vdm(self, first, second, features, category, unique_labels, labels): - """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" + """A support function to compute the Value Difference Metric(VDM) discribed in + https://arxiv.org/pdf/cs/9701101.pdf + """ if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: def f_sparse(c): @@ -340,14 +273,7 @@ def f(c): return vdm def _get_all_instances_of_label(self, label, labels): - if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: - return labels[:, label].nonzero()[0] - instance_ids = [] - append_instance_id = instance_ids.append - for i, label_set in enumerate(labels): - if label in label_set: - append_instance_id(i) - return np.array(instance_ids) + return labels[:, label].nonzero()[0] def _get_mean_imbalance_ratio(self, 
unique_labels, labels): irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, labels) @@ -373,23 +299,7 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): - if type(labels) == sparse._csr.csr_matrix: - return labels[:, label].count_nonzero() - elif type(labels) == np.ndarray: - return np.count_nonzero(labels[:, label]) - else: - h_sum = 0 - - def h(l, Y): - if l in Y: - return 1 - else: - return 0 - - for label_set in labels: - h_sum += h(label, label_set) - - return h_sum + return labels[:, label].count_nonzero() def _get_label_frequencies(self, labels): """A support function to get the frequencies of labels""" From 0d70b3dd8187badcc50545d0a30f66b7789f0508 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 09:28:37 -0700 Subject: [PATCH 14/33] Fix bug where 'sample' was included in the neighbor set --- imblearn/over_sampling/_mlsmote.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 25b6da201..4cadcc53e 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -125,7 +125,9 @@ def fit_resample(self, X, y): sample, min_bag, X_resampled, unique_labels, y_resampled ) distances = np.sort(distances, order="distance") - neighbors = distances[: self.k_neighbors] + neighbors = distances[ + 1 : self.k_neighbors + 1 + ] # Remove 'sample' from neighbor set ref_neigh = random_state.choice(neighbors, 1)[0] X_new, y_new = self._create_new_sample( sample, From 1a6b2492ca8c88b40610c202c026a0ac290cb331 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 09:36:20 -0700 Subject: [PATCH 15/33] Handle the case (skip generating synth samples) when there is only one sample for a given label --- imblearn/over_sampling/_mlsmote.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 4cadcc53e..6ec901ac8 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -120,6 +120,10 @@ def fit_resample(self, X, y): irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) + if ( + len(min_bag) <= 1 + ): # If there is only one sample, the neighbor set will be empty + continue for sample in min_bag: distances = self._calc_distances( sample, min_bag, X_resampled, unique_labels, y_resampled From aa212af33a7644686e49421c09832e65619cba1a Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 10:13:23 -0700 Subject: [PATCH 16/33] Reorganized code to make it more consistent with other implementations --- imblearn/over_sampling/_mlsmote.py | 50 +++++++++++++++++------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 6ec901ac8..9afe1cdd9 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -42,19 +42,44 @@ class MLSMOTE: >>> from sklearn.datasets import make_multilabel_classification """ + _required_parameters = ["categorical_features"] + _sampling_strategies = ["intersection", "ranking", "union"] + def __init__( self, + categorical_features, *, sampling_strategy="ranking", - categorical_features, random_state=None, k_neighbors=5, ): + if sampling_strategy not in MLSMOTE._sampling_strategies:
raise ValueError( + "Sampling Strategy can only be one of: 'ranking', 'union' or " + "'intersection'" + ) + + self.categorical_features = categorical_features + self.sampling_strategy_ = sampling_strategy self.random_state = random_state self.k_neighbors = k_neighbors - self.sampling_strategy_ = sampling_strategy - self.categorical_features = categorical_features - self.continuous_features_ = None + + def _validate_estimator(self): + categorical_features = np.asarray(self.categorical_features) + if categorical_features.dtype.name == "bool": + self.categorical_features_ = np.flatnonzero(categorical_features) + else: + if any( + [cat not in np.arange(self.n_features_) for cat in categorical_features] + ): + raise ValueError( + "Some of the categorical indices are out of range. Indices" + f" should be between 0 and {self.n_features_ - 1}" + ) + self.categorical_features_ = categorical_features + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) def fit_resample(self, X, y): """Resample the dataset. @@ -146,23 +171,6 @@ def fit_resample(self, X, y): y_resampled = sparse.vstack((y_resampled, y_new)) return X_resampled, y_resampled - def _validate_estimator(self): - categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.name == bool: - self.categorical_features_ = np.flatnonzero(categorical_features) - else: - if any( - [cat not in np.arange(self.n_features_) for cat in categorical_features] - ): - raise ValueError( - "Some of the categorical indices are out of range. Indices" - f" should be between 0 and {self.n_features_ - 1}" - ) - self.categorical_features_ = categorical_features - self.continuous_features_ = np.setdiff1d( - np.arange(self.n_features_), self.categorical_features_ - ) - def _create_new_sample( self, sample_id, From 7fccdd636101365b8db372805e1c6caf29e359f2 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 10:58:17 -0700 Subject: [PATCH 17/33] Improve code readability --- imblearn/over_sampling/_mlsmote.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 9afe1cdd9..4ae72fdf3 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -149,9 +149,9 @@ def fit_resample(self, X, y): len(min_bag) <= 1 ): # If there is only one sample, the neighbor set will be empty continue - for sample in min_bag: + for sample_id in min_bag: distances = self._calc_distances( - sample, min_bag, X_resampled, unique_labels, y_resampled + sample_id, min_bag, X_resampled, unique_labels, y_resampled ) distances = np.sort(distances, order="distance") neighbors = distances[ @@ -159,7 +159,7 @@ def fit_resample(self, X, y): ] # Remove 'sample' from neighbor set ref_neigh = random_state.choice(neighbors, 1)[0] X_new, y_new = self._create_new_sample( - sample, + sample_id, ref_neigh[1], [x[1] for x in neighbors], X_resampled, @@ -176,42 +176,41 @@ def _create_new_sample( sample_id, ref_neigh_id, neighbor_ids, - features, + X_resampled, unique_labels, - labels, + y_resampled, random_state, ): - sample = features[sample_id] - synth_sample = np.copy(sample) - ref_neigh = features[ref_neigh_id] + sample = X_resampled[sample_id] + synth_sample = np.zeros_like(sample) + ref_neigh = X_resampled[ref_neigh_id] for i in range(synth_sample.shape[0]): if i in self.continuous_features_: diff = ref_neigh[i] - sample[i] offset = diff * random_state.uniform(0, 1)
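# The surrounding lines are the plain SMOTE interpolation step: 'offset' is
# a uniform fraction of the gap to the reference neighbor, so the synthetic
# value below always lands on the segment between sample[i] and
# ref_neigh[i]; e.g. sample[i] = 2.0 and ref_neigh[i] = 5.0 can yield any
# value in [2.0, 5.0].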
synth_sample[i] = sample[i] + offset - if i in self.categorical_features_: + elif i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - features[neighbor_ids, i] + X_resampled[neighbor_ids, i] ) - X = synth_sample - neighbors_labels = labels[neighbor_ids] + neighbors_labels = y_resampled[neighbor_ids] possible_labels = neighbors_labels.sum(axis=0) y = np.zeros((1, len(unique_labels))) if self.sampling_strategy_ == "ranking": head_index = int((self.k_neighbors + 1) / 2) choosen_labels = possible_labels.nonzero()[1][:head_index] y[0, choosen_labels] = 1 - if self.sampling_strategy_ == "union": + elif self.sampling_strategy_ == "union": choosen_labels = possible_labels.nonzero()[0] y[choosen_labels] = 1 - if self.sampling_strategy_ == "intersection": + elif self.sampling_strategy_ == "intersection": choosen_labels = sparse.find(possible_labels == len(neighbors_labels)) y[choosen_labels] = 1 y = sparse.csr_matrix(y) - return X, y + return synth_sample, y def _collect_unique_labels(self, y): """A support function that flattens the labelsets and return one set of unique From c4ccaeb433d2da70dc8bcea508b3bd4f64202c14 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 11:22:09 -0700 Subject: [PATCH 18/33] Simplify _get_most_frequent_value --- imblearn/over_sampling/_mlsmote.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 4ae72fdf3..041a59321 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -321,6 +321,9 @@ def _get_label_frequencies(self, labels): return frequencies def _get_most_frequent_value(self, values): - """A support function to get most frequent value if a list of values""" - uniques, indices = np.unique(values, return_inverse=True) - return uniques[np.argmax(np.bincount(indices))] + """A support function to get most frequent value if a list of values + TODO: We might want to randomize 'unique' and 'counts' to avoid always returning + the first occurrence when multiple occurrences of the maximum value. + """ + uniques, counts = np.unique(values, return_counts=True) + return uniques[np.argmax(counts)] From d7e4e42e34db47af17fff90ed16be9179706261a Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 12:32:48 -0700 Subject: [PATCH 19/33] Fixed erroneous implementation of ranking and intersection strategies. 
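(On the randomization TODO in PATCH 18 above: np.unique returns its values sorted, so np.argmax silently resolves ties toward the smallest value; a minimal sketch of the issue with one possible seeded tie-break, illustrative only:)

import numpy as np

values = np.array([3, 1, 1, 3])            # values 1 and 3 both occur twice
uniques, counts = np.unique(values, return_counts=True)
print(uniques[np.argmax(counts)])          # always prints 1, never 3

rng = np.random.default_rng(0)             # assumed seeded Generator
tied = uniques[counts == counts.max()]     # array([1, 3])
print(rng.choice(tied))                    # uniform pick among the tied values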
Improved code readability --- imblearn/over_sampling/_mlsmote.py | 41 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 041a59321..56f041ae2 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -43,13 +43,17 @@ class MLSMOTE: """ _required_parameters = ["categorical_features"] - _sampling_strategies = ["intersection", "ranking", "union"] + + INTERSECTION = "intersection" + RANKING = "ranking" + UNION = "union" + _sampling_strategies = [INTERSECTION, RANKING, UNION] def __init__( self, categorical_features, *, - sampling_strategy="ranking", + sampling_strategy=RANKING, random_state=None, k_neighbors=5, ): @@ -196,21 +200,24 @@ def _create_new_sample( ) neighbors_labels = y_resampled[neighbor_ids] - possible_labels = neighbors_labels.sum(axis=0) - y = np.zeros((1, len(unique_labels))) - if self.sampling_strategy_ == "ranking": - head_index = int((self.k_neighbors + 1) / 2) - choosen_labels = possible_labels.nonzero()[1][:head_index] - y[0, choosen_labels] = 1 - elif self.sampling_strategy_ == "union": - choosen_labels = possible_labels.nonzero()[0] - y[choosen_labels] = 1 - elif self.sampling_strategy_ == "intersection": - choosen_labels = sparse.find(possible_labels == len(neighbors_labels)) - y[choosen_labels] = 1 - y = sparse.csr_matrix(y) - - return synth_sample, y + label_counts = np.squeeze( + np.asarray(y_resampled[sample_id] + neighbors_labels.sum(axis=0)) + ) + synth_sample_labels = sparse.csr_matrix((1, len(unique_labels))) + if self.sampling_strategy_ == MLSMOTE.RANKING: + # Note: Paper states "present in half or more of the instances considered" + # but pseudocode shows: "labels lblCounts > (k + 1)/2" instead of '>='. We + # follow the pseudocode for now. 
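# As a worked illustration (assumed values; with k_neighbors=5 the seed
# sample plus its 5 neighbors contribute 6 label vectors):
#     label_counts = [6, 4, 2, 0]  ->  quorum = int((5 + 1) / 2) = 3
#     ranking:      counts > 3   ->  labels 0 and 1 are kept
#     union:        counts != 0  ->  labels 0, 1 and 2 are kept
#     intersection: counts == 6  ->  only label 0 is kept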
+ quorum = int((len(neighbor_ids) + 1) / 2) + chosen_labels = label_counts > quorum + elif self.sampling_strategy_ == MLSMOTE.UNION: + chosen_labels = label_counts.nonzero() + elif self.sampling_strategy_ == MLSMOTE.INTERSECTION: + chosen_labels = label_counts == len(neighbor_ids) + 1 + + synth_sample_labels[0, chosen_labels] = 1 + + return synth_sample, synth_sample_labels def _collect_unique_labels(self, y): """A support function that flattens the labelsets and return one set of unique From 8d1c4c966802439050f7930815d7c350938366c8 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 13:43:19 -0700 Subject: [PATCH 20/33] Add support function to return labels equal to their input type --- imblearn/over_sampling/_mlsmote.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 56f041ae2..d35346816 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -173,7 +173,7 @@ def fit_resample(self, X, y): ) X_resampled = np.vstack((X_resampled, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) - return X_resampled, y_resampled + return X_resampled, self.convert_to_input_type(y_resampled, unique_labels, type(y)) def _create_new_sample( self, @@ -334,3 +334,16 @@ def _get_most_frequent_value(self, values): """ uniques, counts = np.unique(values, return_counts=True) return uniques[np.argmax(counts)] + + def convert_to_input_type(self, y_resampled, unique_labels, input_type): + """A support function that converts the labels back to its input format""" + if input_type == sparse._csr.csr_matrix: + return y_resampled + elif input_type == np.ndarray: + return np.asarray(y_resampled.todense()) + elif input_type == list: + labels = [[] for _ in range(y_resampled.shape[0])] + rows, cols = y_resampled.nonzero() + for row, col in zip(rows, cols): + labels[row].append(unique_labels[col]) + return labels From 66566c6f8fa78b200a30913a1a722104b96dbcfe Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 15:57:19 -0700 Subject: [PATCH 21/33] Refactor use of unique_labels --- imblearn/over_sampling/_mlsmote.py | 50 ++++++++++++++++-------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index d35346816..cd424d0e1 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -120,32 +120,33 @@ def fit_resample(self, X, y): X_resampled = X.copy() + unique_labels = None # Convert 'y' to a sparse matrix if type(y) == sparse._csr.csr_matrix: y_resampled = y.copy() - unique_labels = range(0, y_resampled.shape[1]) elif type(y) == np.ndarray: y_resampled = sparse.csr_matrix(y, dtype=int) - unique_labels = range(0, y_resampled.shape[1]) elif type(y) == list: unique_labels = self._collect_unique_labels(y) y_resampled = sparse.csr_matrix((len(y), len(unique_labels))) - for i, sample in enumerate(y): - for label in sample: - y_resampled[i, label] = 1 + for i, sample_labels in enumerate(y): + for label in sample_labels: + y_resampled[i, np.where(unique_labels == label)] = 1 else: raise TypeError( "'y' can only be of type 'numpy.ndarray', " "'scipy.sparse._csr.csr_matrix' or 'list'" ) + self.n_classes_ = y_resampled.shape[1] + """TODO: Handle the case where 'mean_ir' is infinity. 
Happens when one label has no samples """ - mean_ir = self._get_mean_imbalance_ratio(unique_labels, y_resampled) + mean_ir = self._get_mean_imbalance_ratio(y_resampled) - for label in unique_labels: - irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, y_resampled) + for label in range(self.n_classes_): + irlbl_num = self._get_imbalance_ratio_numerator(y_resampled) irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) @@ -155,7 +156,7 @@ def fit_resample(self, X, y): continue for sample_id in min_bag: distances = self._calc_distances( - sample_id, min_bag, X_resampled, unique_labels, y_resampled + sample_id, min_bag, X_resampled, y_resampled ) distances = np.sort(distances, order="distance") neighbors = distances[ @@ -167,13 +168,14 @@ def fit_resample(self, X, y): ref_neigh[1], [x[1] for x in neighbors], X_resampled, - unique_labels, y_resampled, random_state, ) X_resampled = np.vstack((X_resampled, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) - return X_resampled, self.convert_to_input_type(y_resampled, unique_labels, type(y)) + return X_resampled, self.convert_to_input_type( + y_resampled, unique_labels, type(y) + ) def _create_new_sample( self, @@ -181,7 +183,6 @@ def _create_new_sample( ref_neigh_id, neighbor_ids, X_resampled, - unique_labels, y_resampled, random_state, ): @@ -203,7 +204,7 @@ def _create_new_sample( label_counts = np.squeeze( np.asarray(y_resampled[sample_id] + neighbors_labels.sum(axis=0)) ) - synth_sample_labels = sparse.csr_matrix((1, len(unique_labels))) + synth_sample_labels = sparse.csr_matrix((1, self.n_classes_), dtype=int) if self.sampling_strategy_ == MLSMOTE.RANKING: # Note: Paper states "present in half or more of the instances considered" # but pseudocode shows: "labels lblCounts > (k + 1)/2" instead of '>='. 
We @@ -225,7 +226,7 @@ def _collect_unique_labels(self, y): """ return np.unique(np.array([label for label_set in y for label in label_set])) - def _calc_distances(self, sample, min_bag, features, unique_labels, labels): + def _calc_distances(self, sample, min_bag, features, labels): def calc_dist(bag_sample): nominal_distance = sum( [ @@ -234,7 +235,6 @@ def calc_dist(bag_sample): features[bag_sample, cat], features, cat, - unique_labels, labels, ) for cat in self.categorical_features_ @@ -259,7 +259,7 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, features, category, unique_labels, labels): + def _get_vdm(self, first, second, features, category, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf """ @@ -274,7 +274,7 @@ def f_sparse(c): p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p - vdm = np.sum(np.array([f_sparse(c) for c in unique_labels])) + vdm = np.sum(np.array([f_sparse(c) for c in range(self.n_classes_)])) return vdm category_rows = features[:, category] @@ -289,30 +289,32 @@ def f(c): p = abs((N_axc / N_ax) - (N_ayc / N_ay)) return p - vdm = np.array([f(c) for c in unique_labels]).sum() + vdm = np.array([f(c) for c in range(self.n_classes_)]).sum() return vdm def _get_all_instances_of_label(self, label, labels): return labels[:, label].nonzero()[0] - def _get_mean_imbalance_ratio(self, unique_labels, labels): - irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, labels) + def _get_mean_imbalance_ratio(self, labels): + irlbl_num = self._get_imbalance_ratio_numerator(labels) ratio_sum = np.sum( np.array( list( map( self._get_imbalance_ratio_per_label, - unique_labels, + range(self.n_classes_), itertools.repeat(irlbl_num), itertools.repeat(labels), ) ) ) ) - return ratio_sum / len(unique_labels) + return ratio_sum / self.n_classes_ - def _get_imbalance_ratio_numerator(self, unique_labels, labels): - sum_array = np.array([self._sum_h(label, labels) for label in unique_labels]) + def _get_imbalance_ratio_numerator(self, labels): + sum_array = np.array( + [self._sum_h(label, labels) for label in range(self.n_classes_)] + ) return sum_array.max() def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): From 3aa029fec5decd1eca65974e599b07b1c3516218 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 18:23:00 -0700 Subject: [PATCH 22/33] Optimize _get_vdm --- imblearn/over_sampling/_mlsmote.py | 40 +++++++++--------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index cd424d0e1..1382b2ed6 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -248,7 +248,7 @@ def calc_dist(bag_sample): for num in self.continuous_features_ ] ) - dist = sum([nominal_distance, ordinal_distance]) + dist = nominal_distance + ordinal_distance return (dist, bag_sample) distances = [calc_dist(bag_sample) for bag_sample in min_bag] @@ -259,37 +259,21 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, features, category, labels): - """A support function to compute the Value Difference Metric(VDM) discribed in + def _get_vdm(self, x_attr_val, y_attr_val, features, category, labels): + """A support function to 
compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf """ - if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: - - def f_sparse(c): - N_ax = len(sparse.find(features[:, category] == first)[0]) - N_ay = len(sparse.find(features[:, category] == second)[0]) - c_instances = self._get_all_instances_of_label(c, labels) - N_axc = len(sparse.find(features[c_instances, category] == first)[0]) - N_ayc = len(sparse.find(features[c_instances, category] == second)[0]) - p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) - return p - - vdm = np.sum(np.array([f_sparse(c) for c in range(self.n_classes_)])) - return vdm - - category_rows = features[:, category] - N_ax = len(np.where(category_rows == first)) - N_ay = len(np.where(category_rows == second)) - - def f(c): - class_instances = self._get_all_instances_of_label(c, labels) - class_instance_rows = category_rows[class_instances] - N_axc = len(np.where(class_instance_rows == first)[0]) - N_ayc = len(np.where(class_instance_rows == second)[0]) - p = abs((N_axc / N_ax) - (N_ayc / N_ay)) + + def f_sparse(_class): + c_instances = self._get_all_instances_of_label(_class, labels) + N_axc = np.count_nonzero(features[c_instances, category] == x_attr_val) + N_ayc = np.count_nonzero(features[c_instances, category] == y_attr_val) + p = abs((N_axc / N_ax) - (N_ayc / N_ay)) ** 2 return p - vdm = np.array([f(c) for c in range(self.n_classes_)]).sum() + N_ax = np.count_nonzero(features[:, category] == x_attr_val) + N_ay = np.count_nonzero(features[:, category] == y_attr_val) + vdm = sum([f_sparse(_class) for _class in range(self.n_classes_)]) return vdm def _get_all_instances_of_label(self, label, labels): From feb2f98aaa854e51747bdbda3e9d8ed2fd3cc42f Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 19:10:13 -0700 Subject: [PATCH 23/33] Add documentation --- README.rst | 3 +++ imblearn/over_sampling/_mlsmote.py | 35 +++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index dee6419cb..36936712f 100644 --- a/README.rst +++ b/README.rst @@ -185,6 +185,7 @@ Below is a list of the methods currently implemented in this module. 7. ADASYN - Adaptive synthetic sampling approach for imbalanced learning [15]_ 8. KMeans-SMOTE [17]_ 9. ROSE - Random OverSampling Examples [19]_ + 10. MLSMOTE - Multilabel Synthetic Minority Over-sampling Technique [20]_ * Over-sampling followed by under-sampling 1. SMOTE + Tomek links [12]_ @@ -243,3 +244,5 @@ References: .. [18] : Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197. .. [19] : Menardi, G., Torelli, N.: "Training and assessing classification rules with unbalanced data", Data Mining and Knowledge Discovery, 28, (2014): 92–122 + +.. [20] : Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, Francisco. (2015). MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. 
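(A compact usage sketch to go with the reference above and the docstring example added below; calls mirror the documented API and outputs are not shown here:)

from sklearn.datasets import make_multilabel_classification
from imblearn.over_sampling import MLSMOTE

X, y = make_multilabel_classification(n_classes=5, n_features=20, random_state=42)

# All features are continuous here, so no categorical indices are passed.
mlsmote = MLSMOTE(categorical_features=[], random_state=42)
X_res, y_res = mlsmote.fit_resample(X, y)

# The label container type is round-tripped: a dense indicator array goes
# in, a dense indicator array of shape (n_samples_new, n_labels) comes out.
print(X_res.shape, y_res.shape)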
diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1382b2ed6..646e3ac20 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -39,7 +39,36 @@ class MLSMOTE: Examples -------- + >>> import numpy as np >>> from sklearn.datasets import make_multilabel_classification + >>> from imblearn.over_sampling import MLSMOTE + >>> X, y = make_multilabel_classification(n_classes=5, n_features=20, + ... random_state=42) + >>> print("Original Dataset") + Original Dataset + >>> print(f"Samples: {X.shape[0]}") + Samples: 100 + >>> for _class in range(y.shape[1]): + ... print(f"Class {_class} count: {np.count_nonzero(y[:, _class])}") + Class 0 count: 30 + Class 1 count: 54 + Class 2 count: 48 + Class 3 count: 33 + Class 4 count: 14 + >>> categorical_features = np.full((20,), True) + >>> mlsmote = MLSMOTE(categorical_features, random_state=42) + >>> X_res, y_res = mlsmote.fit_resample(X, y) + >>> print("Resampled Dataset") + Resampled Dataset + >>> print(f"Samples: {X_res.shape[0]}") + Samples: 114 + >>> for _class in range(y_res.shape[1]): + ... print(f"Class {_class} count: {np.count_nonzero(y_res[:, _class])}") + Class 0 count: 30 + Class 1 count: 60 + Class 2 count: 56 + Class 3 count: 33 + Class 4 count: 28 """ _required_parameters = ["categorical_features"] @@ -95,8 +124,7 @@ def fit_resample(self, X, y): Matrix containing the data which have to be sampled. y : {array-like, sparse matrix} of shape \ - (n_samples, n_labels) - or a list of lists of labels. + (n_samples, n_labels) or a list of lists of labels. See "sklearn.datasets.make_multilabel_classification" and \ the "return_indicator" input parameter for more \ information on possible label set formats. @@ -110,7 +138,8 @@ def fit_resample(self, X, y): (n_samples_new, n_features) The array containing the resampled data. - y_resampled : array-like of shape (n_samples_new, n_labels) + y_resampled : array-like of shape (n_samples_new, n_labels) \ + or a list of lists of labels. The corresponding label sets of `X_resampled`.
""" self.n_features_ = X.shape[1] From ff89a0494d043f51a00fedd7e5d68a9e8e327418 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 20:33:29 -0700 Subject: [PATCH 24/33] Add and update tests --- imblearn/over_sampling/tests/test_mlsmote.py | 216 +++++++++++++++---- 1 file changed, 176 insertions(+), 40 deletions(-) diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py index 49f1c0317..5174c197e 100644 --- a/imblearn/over_sampling/tests/test_mlsmote.py +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -1,17 +1,16 @@ """Test the module MLSMOTE.""" - -from collections import Counter - -import pytest - import numpy as np -from scipy import sparse -from sklearn.preprocessing import MultiLabelBinarizer +import pytest +from sklearn.datasets import make_multilabel_classification +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import MLSMOTE +R_TOL = 1e-4 + def data_heterogneous_ordered(): rng = np.random.RandomState(42) @@ -22,7 +21,7 @@ def data_heterogneous_ordered(): X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20 # return the categories return X, y, [2, 3] @@ -36,7 +35,7 @@ def data_heterogneous_unordered(): X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20 # return the categories return X, y, [0, 3] @@ -50,28 +49,33 @@ def data_heterogneous_masked(): X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20 # return the categories return X, y, [True, False, True] def data_sparse(): - rng = np.random.RandomState(42) - X = np.empty((30, 4), dtype=np.float64) - # create 2 random continuous feature - X[:, [1, 2]] = rng.randn(30, 2) - # create a categorical feature using some string - X[:, 0] = rng.randint(3, size=30) - # create a categorical feature using some integer - X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) - labelBinarizer = MultiLabelBinarizer() - y = labelBinarizer.fit_transform(y) - y = sparse.csr_matrix(y) - return X, y, [0, 3] + X, y = make_multilabel_classification( + n_samples=20, n_features=5, return_indicator="sparse", random_state=42 + ) + return X, y, [] + + +def data_dense(): + X, y = make_multilabel_classification( + n_samples=20, n_features=5, return_indicator="dense", random_state=42 + ) + return X, y, [] -def test_mlsmote_error(): +def data_list_of_lists(): + X, y = make_multilabel_classification( + n_samples=20, n_features=5, return_indicator=False, random_state=42 + ) + return X, y, [] + + +def test_mlsmote_categorical_features_error(): X, y, _ = data_heterogneous_unordered() categorical_features = [0, 10] smote = MLSMOTE(categorical_features=categorical_features) @@ -79,21 +83,32 @@ def test_mlsmote_error(): smote.fit_resample(X, y) +def 
test_mlsmote_invalid_strategy_error(): + _, _, categorical_features = data_heterogneous_unordered() + with pytest.raises( + ValueError, + match="Sampling Strategy can only be one of:", + ): + _ = MLSMOTE(categorical_features=categorical_features, sampling_strategy="foo") + + @pytest.mark.parametrize( "data", [ data_heterogneous_ordered(), data_heterogneous_unordered(), data_heterogneous_masked(), - data_sparse() + data_sparse(), + data_dense(), + data_list_of_lists(), ], ) def test_mlsmote(data): X, y, categorical_features = data smote = MLSMOTE(categorical_features=categorical_features) X_resampled, y_resampled = smote.fit_resample(X, y) - assert X_resampled.dtype == X.dtype + assert type(y) == type(y_resampled) categorical_features = np.array(categorical_features) if categorical_features.dtype == bool: @@ -103,24 +118,145 @@ def test_mlsmote(data): assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype -def test_mlsmote_fit(): +def test_mlsmote_fit_resample_1(): X, y, categorical_features = data_heterogneous_unordered() + classes = set([a for x in y for a in x]) smote = MLSMOTE(categorical_features=categorical_features) - smote.fit_resample(X, y) + _, y_res = smote.fit_resample(X, y) + classes_res = set([a for x in y_res for a in x]) + + assert classes == classes_res assert hasattr( smote, "sampling_strategy_" ), "No fitted attribute sampling_strategy_" -def test_mlsmote_fit_resample(): - X, y, categorical_features = data_heterogneous_unordered() - target_stats = Counter(np.unique( - np.array([a for x in y for a in (x if isinstance(x, list) else [x])]))) - smote = MLSMOTE(categorical_features=categorical_features) - _, y_res = smote.fit_resample(X, y) - classes_res = np.unique( - np.array([a for x in y_res - for a in (x if isinstance(x, list) else [x])])) - _ = Counter(classes_res) - n_samples = max(target_stats.values()) - assert all(value >= n_samples for value in Counter(classes_res).values()) +def test_mlsmote_fit_resample_2(): + X = np.array( + [ + [25.0, 34.0], + [38.0, 10.0], + [47.0, 7.0], + [32.0, 15.0], + [23.0, 27.0], + [36.0, 9.0], + [45.0, 10.0], + [39.0, 7.0], + [29.0, 26.0], + [31.0, 18.0], + [36.0, 6.0], + [37.0, 7.0], + [44.0, 10.0], + [42.0, 16.0], + [39.0, 5.0], + [44.0, 9.0], + [33.0, 13.0], + [36.0, 12.0], + [32.0, 6.0], + [28.0, 9.0], + ] + ) + + y = np.array( + [ + [0, 0], + [1, 1], + [1, 0], + [1, 1], + [0, 0], + [1, 1], + [1, 1], + [0, 1], + [0, 0], + [0, 0], + [0, 1], + [1, 0], + [1, 1], + [0, 1], + [1, 1], + [1, 1], + [1, 1], + [0, 1], + [1, 1], + [0, 1], + ] + ) + + X_resampled_exp = np.array( + [ + [25.0, 34.0], + [38.0, 10.0], + [47.0, 7.0], + [32.0, 15.0], + [23.0, 27.0], + [36.0, 9.0], + [45.0, 10.0], + [39.0, 7.0], + [29.0, 26.0], + [31.0, 18.0], + [36.0, 6.0], + [37.0, 7.0], + [44.0, 10.0], + [42.0, 16.0], + [39.0, 5.0], + [44.0, 9.0], + [33.0, 13.0], + [36.0, 12.0], + [32.0, 6.0], + [28.0, 9.0], + [38.95071431, 6.34003029], + [42.22519874, 6.10833449], + [33.83699557, 12.99774833], + [36.06175348, 5.12036059], + [38.43013104, 10.0], + [36.08297745, 6.69575776], + [40.54443985, 9.70877086], + [37.80041708, 5.18666265], + [41.80182894, 9.45606998], + [34.91230996, 10.05030734], + [32.23225206, 6.60754485], + ] + ) + + y_resampled_exp = np.array( + [ + [0, 0], + [1, 1], + [1, 0], + [1, 1], + [0, 0], + [1, 1], + [1, 1], + [0, 1], + [0, 0], + [0, 0], + [0, 1], + [1, 0], + [1, 1], + [0, 1], + [1, 1], + [1, 1], + [1, 1], + [0, 1], + [1, 1], + [0, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 
1], + ] + ) + + smote = MLSMOTE(categorical_features=[], random_state=42) + X_resampled, y_resampled = smote.fit_resample(X, y) + print(X_resampled) + print(y_resampled) + assert_allclose(X_resampled, X_resampled_exp, rtol=R_TOL) + assert_array_equal(y_resampled, y_resampled_exp) From 1484f948b9c825d81fba657f3189309d8b23fe38 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 21 Sep 2022 08:51:14 -0700 Subject: [PATCH 25/33] Remove unused '_get_label_frequencies' function --- imblearn/over_sampling/_mlsmote.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 646e3ac20..23289cb3e 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -336,12 +336,6 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): def _sum_h(self, label, labels): return labels[:, label].count_nonzero() - def _get_label_frequencies(self, labels): - """A support function to get the frequencies of labels""" - frequency_map = np.array(np.unique(labels, return_counts=True)).T - frequencies = np.array([x[1] for x in frequency_map]) - return frequencies - def _get_most_frequent_value(self, values): """A support function to get most frequent value if a list of values TODO: We might want to randomize 'unique' and 'counts' to avoid always returning From 4938543e9ebbe66befd61dcbb2a100d19e1f8895 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 21 Sep 2022 08:52:49 -0700 Subject: [PATCH 26/33] Rename function _convert_to_input_type --- imblearn/over_sampling/_mlsmote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 23289cb3e..eb1553784 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -202,7 +202,7 @@ def fit_resample(self, X, y): ) X_resampled = np.vstack((X_resampled, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) - return X_resampled, self.convert_to_input_type( + return X_resampled, self._convert_to_input_type( y_resampled, unique_labels, type(y) ) @@ -344,7 +344,7 @@ def _get_most_frequent_value(self, values): uniques, counts = np.unique(values, return_counts=True) return uniques[np.argmax(counts)] - def convert_to_input_type(self, y_resampled, unique_labels, input_type): + def _convert_to_input_type(self, y_resampled, unique_labels, input_type): """A support function that converts the labels back to its input format""" if input_type == sparse._csr.csr_matrix: return y_resampled From 8c1319f0f7a24074da968d47ae26942b48cb00e0 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 10 Oct 2022 10:04:49 -0600 Subject: [PATCH 27/33] Simplified calculation of Mean IR --- imblearn/over_sampling/_mlsmote.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index eb1553784..1d6bfa2ee 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,6 +1,5 @@ """Class to perfrom over-sampling using MLSMOTE.""" -import itertools import numpy as np from scipy import sparse @@ -309,19 +308,11 @@ def _get_all_instances_of_label(self, label, labels): return labels[:, label].nonzero()[0] def _get_mean_imbalance_ratio(self, labels): - irlbl_num = self._get_imbalance_ratio_numerator(labels) - ratio_sum = np.sum( - np.array( - list( - map( - self._get_imbalance_ratio_per_label, - range(self.n_classes_), - 
itertools.repeat(irlbl_num), - itertools.repeat(labels), - ) - ) - ) + sum_per_label = np.array( + [self._sum_h(label, labels) for label in range(self.n_classes_)] ) + irlbl_num = sum_per_label.max() + ratio_sum = np.sum(irlbl_num / sum_per_label) return ratio_sum / self.n_classes_ def _get_imbalance_ratio_numerator(self, labels): From 2b82cda90921d34aab53a4b75864fd316550304f Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 13 Oct 2022 11:20:56 -0500 Subject: [PATCH 28/33] Simplify get_euclidean_distance --- imblearn/over_sampling/_mlsmote.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1d6bfa2ee..b71e05e05 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -284,8 +284,10 @@ def calc_dist(bag_sample): return np.array(distances, dtype=dtype) def _get_euclidean_distance(self, first, second): - euclidean_distance = np.linalg.norm(first - second) - return euclidean_distance + """Since the inputs are of type 'float' the euclidean distance is just + the absolute value of their difference. + """ + return abs(first - second) def _get_vdm(self, x_attr_val, y_attr_val, features, category, labels): """A support function to compute the Value Difference Metric(VDM) described in From 07a2cc67cbd96c2731da4a48efc7758c299cae00 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 12 Oct 2022 09:32:38 -0600 Subject: [PATCH 29/33] Use a cache to calculate euclidean distance --- imblearn/over_sampling/_mlsmote.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index b71e05e05..1811e1993 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,5 +1,6 @@ """Class to perfrom over-sampling using MLSMOTE.""" +from itertools import combinations import numpy as np from scipy import sparse @@ -178,13 +179,22 @@ def fit_resample(self, X, y): irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) + euclidean_dist_cache = np.zeros((y_resampled.shape[0], y_resampled.shape[0])) + X_sliced = X_resampled[:][:,self.continuous_features_] + pairs = list(combinations(min_bag, 2)) + for m, n in pairs: + distance = sum(self._get_euclidean_distance( + X_sliced[m, :], X_sliced[n, :] + )) + euclidean_dist_cache[m, n] = distance + euclidean_dist_cache[n, m] = distance if ( len(min_bag) <= 1 ): # If there is only one sample, the neighbor set will be empty continue for sample_id in min_bag: distances = self._calc_distances( - sample_id, min_bag, X_resampled, y_resampled + sample_id, min_bag, X_resampled, y_resampled, euclidean_dist_cache, ) distances = np.sort(distances, order="distance") neighbors = distances[ @@ -254,7 +264,7 @@ def _collect_unique_labels(self, y): """ return np.unique(np.array([label for label_set in y for label in label_set])) - def _calc_distances(self, sample, min_bag, features, labels): + def _calc_distances(self, sample, min_bag, features, labels, euclidean_dist_cache): def calc_dist(bag_sample): nominal_distance = sum( [ @@ -268,14 +278,7 @@ def calc_dist(bag_sample): for cat in self.categorical_features_ ] ) - ordinal_distance = sum( - [ - self._get_euclidean_distance( - features[sample, num], features[bag_sample, num] - ) - for num in self.continuous_features_ - ] - ) + ordinal_distance = 
euclidean_dist_cache[sample, bag_sample] dist = nominal_distance + ordinal_distance return (dist, bag_sample) From d36a132a93dad72a588e97ef20ab1e0274672cdd Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 27 Oct 2022 12:45:08 -0700 Subject: [PATCH 30/33] Avoid repeated calculation of 'c_instances' --- imblearn/over_sampling/_mlsmote.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1811e1993..a717f2325 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -273,7 +273,7 @@ def calc_dist(bag_sample): features[bag_sample, cat], features, cat, - labels, + c_instances, ) for cat in self.categorical_features_ ] @@ -282,6 +282,9 @@ def calc_dist(bag_sample): dist = nominal_distance + ordinal_distance return (dist, bag_sample) + c_instances = [ + self._get_all_instances_of_label(_class, labels) for _class in range(self.n_classes_) + ] distances = [calc_dist(bag_sample) for bag_sample in min_bag] dtype = np.dtype([("distance", float), ("index", int)]) return np.array(distances, dtype=dtype) @@ -292,15 +295,14 @@ def _get_euclidean_distance(self, first, second): """ return abs(first - second) - def _get_vdm(self, x_attr_val, y_attr_val, features, category, labels): + def _get_vdm(self, x_attr_val, y_attr_val, features, category, c_instances): """A support function to compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf """ def f_sparse(_class): - c_instances = self._get_all_instances_of_label(_class, labels) - N_axc = np.count_nonzero(features[c_instances, category] == x_attr_val) - N_ayc = np.count_nonzero(features[c_instances, category] == y_attr_val) + N_axc = np.count_nonzero(features[c_instances[_class], category] == x_attr_val) + N_ayc = np.count_nonzero(features[c_instances[_class], category] == y_attr_val) p = abs((N_axc / N_ax) - (N_ayc / N_ay)) ** 2 return p From b593edf87bbf8a798f7069583d250e9491b8008c Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 27 Oct 2022 14:50:39 -0700 Subject: [PATCH 31/33] Minor format and code reorganization --- imblearn/over_sampling/_mlsmote.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index a717f2325..a3ca95d29 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -179,19 +179,19 @@ def fit_resample(self, X, y): irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) + if ( + len(min_bag) <= 1 + ): # If there is only one sample, the neighbor set will be empty + continue euclidean_dist_cache = np.zeros((y_resampled.shape[0], y_resampled.shape[0])) - X_sliced = X_resampled[:][:,self.continuous_features_] + X_cont = X_resampled[:][:, self.continuous_features_] pairs = list(combinations(min_bag, 2)) for m, n in pairs: distance = sum(self._get_euclidean_distance( - X_sliced[m, :], X_sliced[n, :] + X_cont[m, :], X_cont[n, :] )) euclidean_dist_cache[m, n] = distance euclidean_dist_cache[n, m] = distance - if ( - len(min_bag) <= 1 - ): # If there is only one sample, the neighbor set will be empty - continue for sample_id in min_bag: distances = self._calc_distances( sample_id, min_bag, X_resampled, y_resampled, euclidean_dist_cache, From c135cfa5b4e5619798ca6abcf21a249628ee2126 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio 
Date: Thu, 27 Oct 2022 15:28:34 -0700 Subject: [PATCH 32/33] Add note about calculation of VDM distances --- imblearn/over_sampling/_mlsmote.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index a3ca95d29..608b72967 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -183,6 +183,15 @@ def fit_resample(self, X, y): len(min_bag) <= 1 ): # If there is only one sample, the neighbor set will be empty continue + # Note: Only the distance for numeric attributes can be + # cached. The Value Difference Metric (VDM) distance for + # categorical/nominal attributes CANNOT be cached because VDMs + # are dependent on the total number of samples in the dataset + # that have specific values for the different attributes. + # Given that each synthetic sample is added to the dataset in + # the inner loop (line 17 of 'Algorithm 1' of the MLSMOTE, + # Charte, F. et al. paper), the VDM between samples has to be + # computed in every inner iteration. euclidean_dist_cache = np.zeros((y_resampled.shape[0], y_resampled.shape[0])) X_cont = X_resampled[:][:, self.continuous_features_] pairs = list(combinations(min_bag, 2)) From 70e1f966414cdf9425d764c52a0a92d99c732051 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 27 Oct 2022 16:05:49 -0700 Subject: [PATCH 33/33] Use numpy array instead of sparse matrix --- imblearn/over_sampling/_mlsmote.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 608b72967..a7133eb41 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -150,14 +150,14 @@ def fit_resample(self, X, y): X_resampled = X.copy() unique_labels = None - # Convert 'y' to a sparse matrix + # Convert 'y' to a numpy array if type(y) == sparse._csr.csr_matrix: - y_resampled = y.copy() + y_resampled = y.toarray() elif type(y) == np.ndarray: - y_resampled = sparse.csr_matrix(y, dtype=int) + y_resampled = np.copy(y) elif type(y) == list: unique_labels = self._collect_unique_labels(y) - y_resampled = sparse.csr_matrix((len(y), len(unique_labels))) + y_resampled = np.zeros((len(y), len(unique_labels))) for i, sample_labels in enumerate(y): for label in sample_labels: y_resampled[i, np.where(unique_labels == label)] = 1 @@ -219,7 +219,7 @@ def fit_resample(self, X, y): random_state, ) X_resampled = np.vstack((X_resampled, X_new)) - y_resampled = sparse.vstack((y_resampled, y_new)) + y_resampled = np.vstack((y_resampled, y_new)) return X_resampled, self._convert_to_input_type( y_resampled, unique_labels, type(y) ) @@ -251,7 +251,7 @@ def _create_new_sample( label_counts = np.squeeze( np.asarray(y_resampled[sample_id] + neighbors_labels.sum(axis=0)) ) - synth_sample_labels = sparse.csr_matrix((1, self.n_classes_), dtype=int) + synth_sample_labels = np.zeros((1, self.n_classes_), dtype=int) if self.sampling_strategy_ == MLSMOTE.RANKING: # Note: Paper states "present in half or more of the instances considered" # but pseudocode shows: "labels lblCounts > (k + 1)/2" instead of '>='. 
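# An aside on the ndarray switch in this patch (performance reasoning, not
# measured here): the indicator matrix is grown row by row inside the
# sampling loop, and np.vstack on a small dense array avoids rebuilding CSR
# structure on every sparse.vstack call; sparsity is restored only once on
# the way out, via sparse.csr_matrix(y_resampled, dtype=int) in
# _convert_to_input_type.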
We @@ -321,7 +321,7 @@ def f_sparse(_class): return vdm def _get_all_instances_of_label(self, label, labels): - return labels[:, label].nonzero()[0] + return np.nonzero(labels[:, label])[0] def _get_mean_imbalance_ratio(self, labels): sum_per_label = np.array( @@ -341,7 +341,7 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): - return labels[:, label].count_nonzero() + return np.count_nonzero(labels[:, label]) def _get_most_frequent_value(self, values): """A support function to get most frequent value if a list of values @@ -354,9 +354,9 @@ def _get_most_frequent_value(self, values): def _convert_to_input_type(self, y_resampled, unique_labels, input_type): """A support function that converts the labels back to its input format""" if input_type == sparse._csr.csr_matrix: - return y_resampled + return sparse.csr_matrix(y_resampled, dtype=int) elif input_type == np.ndarray: - return np.asarray(y_resampled.todense()) + return y_resampled elif input_type == list: labels = [[] for _ in range(y_resampled.shape[0])] rows, cols = y_resampled.nonzero()
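# Example of the reconstruction performed here: with
#     y_resampled   = [[1, 0, 1], [0, 1, 0]]
#     unique_labels = ['a', 'b', 'c']
# nonzero() yields rows (0, 0, 1) and cols (0, 2, 1), and zipping them back
# appends one label per nonzero entry, giving [['a', 'c'], ['b']].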