From 802caae2e04861198aea16e86d4cb3b5f85ce448 Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Mon, 11 May 2020 00:51:37 +0200 Subject: [PATCH 01/10] add basic tests for mlsmote --- imblearn/over_sampling/tests/test_mlsmote.py | 104 +++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 imblearn/over_sampling/tests/test_mlsmote.py diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py new file mode 100644 index 000000000..eda1f0708 --- /dev/null +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -0,0 +1,104 @@ +"""Test the module MLSMOTE.""" + + +from collections import Counter + +import pytest + +import numpy as np + + +from imblearn.over_sampling import MLSMOTE + + +def data_heterogneous_ordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, :2] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20) + # return the categories + return X, y, [2, 3] + + +def data_heterogneous_unordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20) + # return the categories + return X, y, [0, 3] + + +def data_heterogneous_masked(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20) + # return the categories + return X, y, [True, False, True] + + +def test_mlsmote_error(): + X, y, _ = data_heterogneous_unordered() + categorical_features = [0, 10] + smote = MLSMOTE(categorical_features=categorical_features) + with pytest.raises(ValueError, match="indices are out of range"): + smote.fit_resample(X, y) + + +@pytest.mark.parametrize( + "data", + [ + data_heterogneous_ordered(), + data_heterogneous_unordered(), + data_heterogneous_masked(), + ], +) +def test_mlsmote(data): + X, y, categorical_features = data + smote = MLSMOTE(categorical_features=categorical_features) + X_resampled, y_resampled = smote.fit_resample(X, y) + + assert X_resampled.dtype == X.dtype + + categorical_features = np.array(categorical_features) + if categorical_features.dtype == bool: + categorical_features = np.flatnonzero(categorical_features) + for cat_idx in categorical_features: + assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx]) + assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype + +def test_mlsmote_fit(): + X, y, categorical_features = data_heterogneous_unordered() + smote = MLSMOTE(categorical_features=categorical_features) + smote.fit_resample(X, y) + assert hasattr( + smote, "sampling_strategy_" + ), "No fitted attribute sampling_strategy_" + + +def test_mlsmote_fit_resample(): + X, y, categorical_features = 
data_heterogneous_unordered()
+    target_stats = Counter(np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
+    smote = MLSMOTE(categorical_features=categorical_features)
+    _, y_res = smote.fit_resample(X, y)
+    classes_res=np.unique(np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
+    _ = Counter(classes_res)
+    n_samples = max(target_stats.values())
+    assert all(value >= n_samples for value in Counter(classes_res).values())
+
From 9b2ec7f1ae3a5708121d2004662d7f8bdb98195b Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Mon, 11 May 2020 00:52:07 +0200
Subject: [PATCH 02/10] add mlsmote implementation

---
 imblearn/over_sampling/__init__.py |   2 +
 imblearn/over_sampling/_mlsmote.py | 202 +++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)
 create mode 100644 imblearn/over_sampling/_mlsmote.py

diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py
index bd20b76ea..07f10a3f6 100644
--- a/imblearn/over_sampling/__init__.py
+++ b/imblearn/over_sampling/__init__.py
@@ -10,6 +10,7 @@
 from ._smote import KMeansSMOTE
 from ._smote import SVMSMOTE
 from ._smote import SMOTENC
+from ._mlsmote import MLSMOTE

 __all__ = [
     "ADASYN",
@@ -19,4 +20,5 @@
     "BorderlineSMOTE",
     "SVMSMOTE",
     "SMOTENC",
+    "MLSMOTE"
 ]
diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
new file mode 100644
index 000000000..15b19ae02
--- /dev/null
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -0,0 +1,202 @@
+import numpy as np
+import itertools
+import collections
+import random
+
+class MLSMOTE:
+    """Over-sampling using MLSMOTE.
+
+    Parameters
+    ----------
+    sampling_strategy : {'ranking', 'union', 'intersection'}, default='ranking'
+        Strategy used to generate the labelsets of the synthetic samples.
+
+    k_neighbors : int, default=5
+        Number of nearest neighbours used to construct synthetic samples.
+
+    categorical_features : ndarray of shape (n_cat_features,) or (n_features,)
+        Specifies which features are categorical. Can either be:
+
+        - array of indices specifying the categorical features;
+        - mask array of shape (n_features, ) and ``bool`` dtype for which
+          ``True`` indicates the categorical features.
+
+    Notes
+    -----
+    See the original paper [1]_ for more details.
+
+    References
+    ----------
+    .. [1] Charte, F., Rivera, A. J., del Jesus, M. J., Herrera, F. (2015).
+       MLSMOTE: Approaching imbalanced multilabel learning through synthetic
+       instance generation. Knowledge-Based Systems, 89, 385-397.
+       doi:10.1016/j.knosys.2015.07.019
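+
+    Examples
+    --------
+    A minimal usage sketch (illustrative only; it assumes ``y`` is given as
+    an array of label-lists, mirroring the accompanying tests):
+
+    >>> import numpy as np
+    >>> from imblearn.over_sampling import MLSMOTE
+    >>> rng = np.random.RandomState(42)
+    >>> X = np.empty((30, 4), dtype=object)
+    >>> X[:, :2] = rng.randn(30, 2)
+    >>> X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)
+    >>> X[:, 3] = rng.randint(3, size=30)
+    >>> y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2
+    ...              + [[1, 2]] * 3 + [[1]] * 20)
+    >>> smote = MLSMOTE(categorical_features=[2, 3])
+    >>> X_res, y_res = smote.fit_resample(X, y)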
+ + """ + def __init__(self,categorical_features,k_neighbors=5 ,sampling_strategy='ranking'): + self.k_neighbors=k_neighbors + self.sampling_strategy_=sampling_strategy + self.categorical_features = categorical_features + self.continuous_features_= None + self.unique_labels = [] + self.labels=[] + self.features=[] + + def fit_resample(self,X,y): + self.n_features_ = X.shape[1] + self.labels=np.array([np.array(xi) for xi in y]) + + self._validate_estimator() + + X_resampled = X.copy() + y_resampled = y.copy() + + self.unique_labels = self._collect_unique_labels(y) + self.features=X + + X_synth=[] + y_synth=[] + + append_X_synth=X_synth.append + append_y_synth=y_synth.append + mean_ir=self._get_mean_imbalance_ratio() + for label in self.unique_labels: + irlbl=self._get_imbalance_ratio_per_label(label) + if irlbl > mean_ir: + min_bag=self._get_all_instances_of_label(label) + for sample in min_bag: + distances=self._calc_distances(sample,min_bag) + distances=np.sort(distances,order='distance') + neighbours=distances[:self.k_neighbors] + ref_neigh=np.random.choice(neighbours,1)[0] + X_new,y_new=self._create_new_sample(sample,ref_neigh[1],[x[1] for x in neighbours]) + append_X_synth(X_new) + append_y_synth(y_new) + + return np.concatenate((X_resampled,np.array(X_synth))),np.array(y_resampled.tolist()+y_synth) + + def _validate_estimator(self): + categorical_features = np.asarray(self.categorical_features) + if categorical_features.dtype.name == "bool": + self.categorical_features_ = np.flatnonzero(categorical_features) + else: + if any( + [ + cat not in np.arange(self.n_features_) + for cat in categorical_features + ] + ): + raise ValueError( + "Some of the categorical indices are out of range. Indices" + " should be between 0 and {}".format(self.n_features_) + ) + self.categorical_features_ = categorical_features + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) + + def _collect_unique_labels(self, y): + """A support function that flattens the labelsets and return one set of unique labels""" + return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) + + def _create_new_sample(self,sample_id,ref_neigh_id,neighbour_ids): + sample=self.features[sample_id] + sample_labels=self.labels[sample_id] + synth_sample=np.copy(sample) + ref_neigh=self.features[ref_neigh_id] + neighbours_labels=[] + for ni in neighbour_ids: + neighbours_labels.append(self.labels[ni].tolist()) + for i in range(synth_sample.shape[0]): + if i in self.continuous_features_: + diff=ref_neigh[i]-sample[i] + offset=diff*random.uniform(0,1) + synth_sample[i]=sample[i]+offset + if i in self.categorical_features_: + synth_sample[i]=self._get_most_frequent_value(self.features[neighbour_ids,i]) + + labels=sample_labels.tolist() + labels+=[a for x in neighbours_labels for a in (x if isinstance(x, list) else [x])] + labels=list(set(labels)) + if self.sampling_strategy_=='ranking': + head_index=int((self.k_neighbors+ 1)/2) + y=labels[:head_index] + if self.sampling_strategy_=='union': + y=labels[:] + if self.sampling_strategy_=='intersection': + y=list(set.intersection(*neighbours_labels)) + + X=synth_sample + return X,y + + + def _calc_distances(self,sample,min_bag): + distances=[] + append_distances=distances.append + for bag_sample in min_bag: + nominal_distances=np.array([self._get_vdm(self.features[sample,cat],self.features[bag_sample,cat])for cat in self.categorical_features_]) + 
ordinal_distances=np.array([self._get_euclidean_distance(self.features[sample,num],self.features[bag_sample,num])for num in self.continuous_features_])
+            dists=np.array([nominal_distances.sum(),ordinal_distances.sum()])
+            append_distances((dists.sum(),bag_sample))
+        dtype = np.dtype([('distance', float), ('index', int)])
+        return np.array(distances,dtype=dtype)
+
+
+    def _get_euclidean_distance(self,first,second):
+        euclidean_distance=np.linalg.norm(first-second)
+        return euclidean_distance
+
+    def _get_vdm(self,first,second):
+        """A support function to compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf"""
+        def f(c):
+            N_ax=len(np.where(self.features[:,self.categorical_features_]==first))
+            N_ay=len(np.where(self.features[:,self.categorical_features_]==second))
+            c_instances=self._get_all_instances_of_label(c)
+            N_axc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==first)[0])
+            N_ayc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==second)[0])
+            return np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
+
+        return np.sum(np.array([f(c)for c in self.unique_labels]))
+
+    def _get_all_instances_of_label(self,label):
+        instance_ids=[]
+        append_instance_id=instance_ids.append
+        for i,label_set in enumerate(self.labels):
+            if label in label_set:
+                append_instance_id(i)
+        return np.array(instance_ids)
+
+    def _get_mean_imbalance_ratio(self):
+        ratio_sum=np.sum(np.array(list(map(self._get_imbalance_ratio_per_label,self.unique_labels))))
+        return ratio_sum/self.unique_labels.shape[0]
+
+    def _get_imbalance_ratio_per_label(self,label):
+        sum_array=list(map(self._sum_h,self.unique_labels))
+        sum_array=np.array(sum_array)
+        return sum_array.max()/self._sum_h(label)
+
+    def _sum_h(self,label):
+        h_sum=0
+        def h(l,Y):
+            if l in Y:
+                return 1
+            else:
+                return 0
+
+        for label_set in self.labels:
+            h_sum+=h(label,label_set)
+        return h_sum
+
+
+    def _get_label_frequencies(self,labels):
+        """A support function to get the frequencies of labels"""
+        frequency_map=np.array(np.unique(labels, return_counts=True)).T
+        frequencies=np.array([x[1] for x in count_map])
+        return frequencies
+
+    def _get_most_frequent_value(self, values):
+        """A support function to get the most frequent value of a list of values"""
+        uniques, indices = np.unique(values, return_inverse=True)
+        return uniques[np.argmax(np.bincount(indices))]
\ No newline at end of file
From 948da4af21d37072b9acba950b4d19d58b93fa6a Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Mon, 11 May 2020 01:11:27 +0200
Subject: [PATCH 03/10] format code

---
 imblearn/over_sampling/_mlsmote.py           | 183 ++++++++++---------
 imblearn/over_sampling/tests/test_mlsmote.py |  14 +-
 2 files changed, 105 insertions(+), 92 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index 15b19ae02..b943bcc52 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -3,6 +3,7 @@
 import collections
 import random
+
 class MLSMOTE:
     """Over-sampling using MLSMOTE.

@@ -35,18 +36,19 @@ class MLSMOTE:
        doi:10.1016/j.knosys.2015.07.019
""" - def __init__(self,categorical_features,k_neighbors=5 ,sampling_strategy='ranking'): - self.k_neighbors=k_neighbors - self.sampling_strategy_=sampling_strategy + + def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'): + self.k_neighbors = k_neighbors + self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features - self.continuous_features_= None + self.continuous_features_ = None self.unique_labels = [] - self.labels=[] - self.features=[] + self.labels = [] + self.features = [] - def fit_resample(self,X,y): + def fit_resample(self, X, y): self.n_features_ = X.shape[1] - self.labels=np.array([np.array(xi) for xi in y]) + self.labels = np.array([np.array(xi) for xi in y]) self._validate_estimator() @@ -54,28 +56,29 @@ def fit_resample(self,X,y): y_resampled = y.copy() self.unique_labels = self._collect_unique_labels(y) - self.features=X + self.features = X - X_synth=[] - y_synth=[] + X_synth = [] + y_synth = [] - append_X_synth=X_synth.append - append_y_synth=y_synth.append - mean_ir=self._get_mean_imbalance_ratio() + append_X_synth = X_synth.append + append_y_synth = y_synth.append + mean_ir = self._get_mean_imbalance_ratio() for label in self.unique_labels: - irlbl=self._get_imbalance_ratio_per_label(label) + irlbl = self._get_imbalance_ratio_per_label(label) if irlbl > mean_ir: - min_bag=self._get_all_instances_of_label(label) + min_bag = self._get_all_instances_of_label(label) for sample in min_bag: - distances=self._calc_distances(sample,min_bag) - distances=np.sort(distances,order='distance') - neighbours=distances[:self.k_neighbors] - ref_neigh=np.random.choice(neighbours,1)[0] - X_new,y_new=self._create_new_sample(sample,ref_neigh[1],[x[1] for x in neighbours]) + distances = self._calc_distances(sample, min_bag) + distances = np.sort(distances, order='distance') + neighbours = distances[:self.k_neighbors] + ref_neigh = np.random.choice(neighbours, 1)[0] + X_new, y_new = self._create_new_sample( + sample, ref_neigh[1], [x[1] for x in neighbours]) append_X_synth(X_new) append_y_synth(y_new) - return np.concatenate((X_resampled,np.array(X_synth))),np.array(y_resampled.tolist()+y_synth) + return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth) def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) @@ -101,102 +104,110 @@ def _collect_unique_labels(self, y): """A support function that flattens the labelsets and return one set of unique labels""" return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) - def _create_new_sample(self,sample_id,ref_neigh_id,neighbour_ids): - sample=self.features[sample_id] - sample_labels=self.labels[sample_id] - synth_sample=np.copy(sample) - ref_neigh=self.features[ref_neigh_id] - neighbours_labels=[] + def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): + sample = self.features[sample_id] + sample_labels = self.labels[sample_id] + synth_sample = np.copy(sample) + ref_neigh = self.features[ref_neigh_id] + neighbours_labels = [] for ni in neighbour_ids: neighbours_labels.append(self.labels[ni].tolist()) for i in range(synth_sample.shape[0]): if i in self.continuous_features_: - diff=ref_neigh[i]-sample[i] - offset=diff*random.uniform(0,1) - synth_sample[i]=sample[i]+offset + diff = ref_neigh[i]-sample[i] + offset = diff*random.uniform(0, 1) + synth_sample[i] = sample[i]+offset if i in self.categorical_features_: - 
synth_sample[i]=self._get_most_frequent_value(self.features[neighbour_ids,i])
-
-        labels=sample_labels.tolist()
-        labels+=[a for x in neighbours_labels for a in (x if isinstance(x, list) else [x])]
-        labels=list(set(labels))
-        if self.sampling_strategy_=='ranking':
-            head_index=int((self.k_neighbors+ 1)/2)
-            y=labels[:head_index]
-        if self.sampling_strategy_=='union':
-            y=labels[:]
-        if self.sampling_strategy_=='intersection':
-            y=list(set.intersection(*neighbours_labels))
-
-        X=synth_sample
-        return X,y
-
-
-    def _calc_distances(self,sample,min_bag):
-        distances=[]
-        append_distances=distances.append
+                synth_sample[i] = self._get_most_frequent_value(
+                    self.features[neighbour_ids, i])
+
+        labels = sample_labels.tolist()
+        labels += [a for x in neighbours_labels for a in (
+            x if isinstance(x, list) else [x])]
+        labels = list(set(labels))
+        if self.sampling_strategy_ == 'ranking':
+            head_index = int((self.k_neighbors + 1)/2)
+            y = labels[:head_index]
+        if self.sampling_strategy_ == 'union':
+            y = labels[:]
+        if self.sampling_strategy_ == 'intersection':
+            y = list(set.intersection(*neighbours_labels))
+
+        X = synth_sample
+        return X, y
+
+    def _calc_distances(self, sample, min_bag):
+        distances = []
+        append_distances = distances.append
         for bag_sample in min_bag:
-            nominal_distances=np.array([self._get_vdm(self.features[sample,cat],self.features[bag_sample,cat])for cat in self.categorical_features_])
-            ordinal_distances=np.array([self._get_euclidean_distance(self.features[sample,num],self.features[bag_sample,num])for num in self.continuous_features_])
-            dists=np.array([nominal_distances.sum(),ordinal_distances.sum()])
-            append_distances((dists.sum(),bag_sample))
-        dtype = np.dtype([('distance', float), ('index', int)])
-        return np.array(distances,dtype=dtype)
-
-
-    def _get_euclidean_distance(self,first,second):
-        euclidean_distance=np.linalg.norm(first-second)
+            nominal_distances = np.array([self._get_vdm(
+                self.features[sample, cat], self.features[bag_sample, cat])for cat in self.categorical_features_])
+            ordinal_distances = np.array([self._get_euclidean_distance(
+                self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_])
+            dists = np.array(
+                [nominal_distances.sum(), ordinal_distances.sum()])
+            append_distances((dists.sum(), bag_sample))
+        dtype = np.dtype([('distance', float), ('index', int)])
+        return np.array(distances, dtype=dtype)
+
+    def _get_euclidean_distance(self, first, second):
+        euclidean_distance = np.linalg.norm(first-second)
         return euclidean_distance

-    def _get_vdm(self,first,second):
+    def _get_vdm(self, first, second):
         """A support function to compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf"""
         def f(c):
-            N_ax=len(np.where(self.features[:,self.categorical_features_]==first))
-            N_ay=len(np.where(self.features[:,self.categorical_features_]==second))
-            c_instances=self._get_all_instances_of_label(c)
-            N_axc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==first)[0])
-            N_ayc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==second)[0])
+            N_ax = len(
+                np.where(self.features[:, self.categorical_features_] == first))
+            N_ay = len(
+                np.where(self.features[:, self.categorical_features_] == second))
+            c_instances = self._get_all_instances_of_label(c)
+            N_axc = len(np.where(self.features[np.ix_(
+                c_instances, self.categorical_features_)] == first)[0])
+            N_ayc = len(np.where(self.features[np.ix_(
+                c_instances, self.categorical_features_)] ==
second)[0])
             return np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
-
+
         return np.sum(np.array([f(c)for c in self.unique_labels]))

-    def _get_all_instances_of_label(self,label):
-        instance_ids=[]
-        append_instance_id=instance_ids.append
-        for i,label_set in enumerate(self.labels):
+    def _get_all_instances_of_label(self, label):
+        instance_ids = []
+        append_instance_id = instance_ids.append
+        for i, label_set in enumerate(self.labels):
             if label in label_set:
                 append_instance_id(i)
         return np.array(instance_ids)

     def _get_mean_imbalance_ratio(self):
-        ratio_sum=np.sum(np.array(list(map(self._get_imbalance_ratio_per_label,self.unique_labels))))
+        ratio_sum = np.sum(
+            np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))))
         return ratio_sum/self.unique_labels.shape[0]

-    def _get_imbalance_ratio_per_label(self,label):
-        sum_array=list(map(self._sum_h,self.unique_labels))
-        sum_array=np.array(sum_array)
+    def _get_imbalance_ratio_per_label(self, label):
+        sum_array = list(map(self._sum_h, self.unique_labels))
+        sum_array = np.array(sum_array)
         return sum_array.max()/self._sum_h(label)

-    def _sum_h(self,label):
-        h_sum=0
-        def h(l,Y):
+    def _sum_h(self, label):
+        h_sum = 0
+
+        def h(l, Y):
             if l in Y:
                 return 1
             else:
                 return 0

         for label_set in self.labels:
-            h_sum+=h(label,label_set)
+            h_sum += h(label, label_set)
         return h_sum

-
-    def _get_label_frequencies(self,labels):
+    def _get_label_frequencies(self, labels):
         """A support function to get the frequencies of labels"""
-        frequency_map=np.array(np.unique(labels, return_counts=True)).T
-        frequencies=np.array([x[1] for x in count_map])
+        frequency_map = np.array(np.unique(labels, return_counts=True)).T
+        frequencies = np.array([x[1] for x in count_map])
         return frequencies
-
+
     def _get_most_frequent_value(self, values):
         """A support function to get the most frequent value of a list of values"""
         uniques, indices = np.unique(values, return_inverse=True)
-        return uniques[np.argmax(np.bincount(indices))]
\ No newline at end of file
+        return uniques[np.argmax(np.bincount(indices))]
diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py
index eda1f0708..2fc53cb0e 100644
--- a/imblearn/over_sampling/tests/test_mlsmote.py
+++ b/imblearn/over_sampling/tests/test_mlsmote.py
@@ -20,7 +20,7 @@ def data_heterogneous_ordered():
     X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)
     # create a categorical feature using some integer
     X[:, 3] = rng.randint(3, size=30)
-    y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
+    y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
     # return the categories
     return X, y, [2, 3]
@@ -34,7 +34,7 @@ def data_heterogneous_unordered():
     X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object)
     # create a categorical feature using some integer
     X[:, 3] = rng.randint(3, size=30)
-    y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
+    y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
     # return the categories
     return X, y, [0, 3]
@@ -48,7 +48,7 @@ def data_heterogneous_masked():
     X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object)
     # create a categorical feature using some integer
     X[:, 3] = rng.randint(3, size=30)
-    y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
+    y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
     # return the categories
     return X, y, [True, False, True]
@@ -83,6 +83,7 @@ def test_mlsmote(data):
     assert set(X[:, cat_idx]) ==
set(X_resampled[:, cat_idx])
     assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype
+
 def test_mlsmote_fit():
     X, y, categorical_features = data_heterogneous_unordered()
     smote = MLSMOTE(categorical_features=categorical_features)
@@ -94,11 +95,12 @@ def test_mlsmote_fit():

 def test_mlsmote_fit_resample():
     X, y, categorical_features = data_heterogneous_unordered()
-    target_stats = Counter(np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
+    target_stats = Counter(np.unique(
+        np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
     smote = MLSMOTE(categorical_features=categorical_features)
     _, y_res = smote.fit_resample(X, y)
-    classes_res=np.unique(np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
+    classes_res = np.unique(
+        np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
     _ = Counter(classes_res)
     n_samples = max(target_stats.values())
     assert all(value >= n_samples for value in Counter(classes_res).values())
-
From bef048749b997f8e03f21a49a11b49826e290a52 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Mon, 11 May 2020 23:13:44 +0200
Subject: [PATCH 04/10] fix refactor error

---
 imblearn/over_sampling/_mlsmote.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index b943bcc52..c150d13eb 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -204,7 +204,7 @@ def h(l, Y):
     def _get_label_frequencies(self, labels):
         """A support function to get the frequencies of labels"""
         frequency_map = np.array(np.unique(labels, return_counts=True)).T
-        frequencies = np.array([x[1] for x in count_map])
+        frequencies = np.array([x[1] for x in frequency_map])
         return frequencies

     def _get_most_frequent_value(self, values):
From 25eb158b4e22e33b987452a62e8bec0b7261abb4 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Wed, 13 May 2020 13:25:20 +0200
Subject: [PATCH 05/10] compute imbalance_ratio_per_label just once

---
 imblearn/over_sampling/_mlsmote.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index c150d13eb..ecab4687e 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -56,6 +56,8 @@ def fit_resample(self, X, y):
         y_resampled = y.copy()

         self.unique_labels = self._collect_unique_labels(y)
+        self.imbalance_ratio_per_label = np.array(
+            list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))
         self.features = X

@@ -64,8 +66,8 @@ def fit_resample(self, X, y):
         append_X_synth = X_synth.append
         append_y_synth = y_synth.append
         mean_ir = self._get_mean_imbalance_ratio()
-        for label in self.unique_labels:
-            irlbl = self._get_imbalance_ratio_per_label(label)
+        for index, label in np.ndenumerate(self.unique_labels):
+            irlbl = self.imbalance_ratio_per_label[index]
             if irlbl > mean_ir:
                 min_bag = self._get_all_instances_of_label(label)
                 for sample in min_bag:
@@ -179,8 +181,7 @@ def _get_all_instances_of_label(self, label):
         return np.array(instance_ids)

     def _get_mean_imbalance_ratio(self):
-        ratio_sum = np.sum(
-            np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))))
+        ratio_sum = np.sum(self.imbalance_ratio_per_label)
         return ratio_sum/self.unique_labels.shape[0]
From e6c847e57e18da8660cb03cbb815d42d9693d5c7 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Wed, 13
May 2020 22:56:38 +0200
Subject: [PATCH 06/10] add sparse matrix support for labels

---
 imblearn/over_sampling/_mlsmote.py           | 123 +++++++++++++------
 imblearn/over_sampling/tests/test_mlsmote.py |  22 +++-
 2 files changed, 108 insertions(+), 37 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index ecab4687e..2311e0f13 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -2,8 +2,7 @@ import numpy as np
 import itertools
 import collections
 import random
-
-
+from scipy import sparse

 class MLSMOTE:
     """Over-sampling using MLSMOTE.
@@ -48,14 +47,18 @@ def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranki
     def fit_resample(self, X, y):
         self.n_features_ = X.shape[1]
-        self.labels = np.array([np.array(xi) for xi in y])

         self._validate_estimator()

         X_resampled = X.copy()
         y_resampled = y.copy()

-        self.unique_labels = self._collect_unique_labels(y)
+        if sparse.issparse(y):
+            self.labels = y
+            self.unique_labels = range(0, y.shape[1])
+        else:
+            self.labels = np.array([np.array(xi) for xi in y])
+            self.unique_labels = self._collect_unique_labels(y)
         self.imbalance_ratio_per_label = np.array(
             list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))
         self.features = X
@@ -66,21 +69,39 @@ def fit_resample(self, X, y):
         append_X_synth = X_synth.append
         append_y_synth = y_synth.append
         mean_ir = self._get_mean_imbalance_ratio()
-        for index, label in np.ndenumerate(self.unique_labels):
-            irlbl = self.imbalance_ratio_per_label[index]
-            if irlbl > mean_ir:
-                min_bag = self._get_all_instances_of_label(label)
-                for sample in min_bag:
-                    distances = self._calc_distances(sample, min_bag)
-                    distances = np.sort(distances, order='distance')
-                    neighbours = distances[:self.k_neighbors]
-                    ref_neigh = np.random.choice(neighbours, 1)[0]
-                    X_new, y_new = self._create_new_sample(
-                        sample, ref_neigh[1], [x[1] for x in neighbours])
-                    append_X_synth(X_new)
-                    append_y_synth(y_new)
-
-        return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth)
+
+        if sparse.issparse(y):
+            y_synth = None
+
+            for label in self.unique_labels:
+                irlbl = self.imbalance_ratio_per_label[label]
+                if irlbl > mean_ir:
+                    min_bag = self._get_all_instances_of_label(label)
+                    for sample in min_bag:
+                        distances = self._calc_distances(sample, min_bag)
+                        distances = np.sort(distances, order='distance')
+                        neighbours = distances[:self.k_neighbors]
+                        ref_neigh = np.random.choice(neighbours, 1)[0]
+                        X_new, y_new = self._create_new_sample(
+                            sample, ref_neigh[1], [x[1] for x in neighbours])
+                        append_X_synth(X_new)
+                        y_resampled = sparse.vstack((y_resampled, y_new))
+            return np.concatenate((X_resampled, np.array(X_synth))), y_resampled
+        else:
+            for index, label in np.ndenumerate(self.unique_labels):
+                irlbl = self.imbalance_ratio_per_label[index]
+                if irlbl > mean_ir:
+                    min_bag = self._get_all_instances_of_label(label)
+                    for sample in min_bag:
+                        distances = self._calc_distances(sample, min_bag)
+                        distances = np.sort(distances, order='distance')
+                        neighbours = distances[:self.k_neighbors]
+                        ref_neigh = np.random.choice(neighbours, 1)[0]
+                        X_new, y_new = self._create_new_sample(
+                            sample, ref_neigh[1], [x[1] for x in neighbours])
+                        append_X_synth(X_new)
+                        append_y_synth(y_new)
+            return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth)

     def _validate_estimator(self):
         categorical_features = np.asarray(self.categorical_features)
@@ -108,12 +129,10 @@ def _collect_unique_labels(self, y):
     def
_create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): sample = self.features[sample_id] - sample_labels = self.labels[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] - neighbours_labels = [] - for ni in neighbour_ids: - neighbours_labels.append(self.labels[ni].tolist()) + sample_labels = self.labels[sample_id] + for i in range(synth_sample.shape[0]): if i in self.continuous_features_: diff = ref_neigh[i]-sample[i] @@ -122,20 +141,46 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( self.features[neighbour_ids, i]) + X = synth_sample - labels = sample_labels.tolist() - labels += [a for x in neighbours_labels for a in ( - x if isinstance(x, list) else [x])] - labels = list(set(labels)) - if self.sampling_strategy_ == 'ranking': - head_index = int((self.k_neighbors + 1)/2) - y = labels[:head_index] - if self.sampling_strategy_ == 'union': - y = labels[:] - if self.sampling_strategy_ == 'intersection': - y = list(set.intersection(*neighbours_labels)) + if sparse.issparse(self.labels): + neighbours_labels = self.labels[neighbour_ids] + print("ns", neighbours_labels) + possible_labels = neighbours_labels.sum(axis=0) + print("possible", possible_labels) + y = np.zeros((1, len(self.unique_labels))) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + print("choosen_nonz", possible_labels.nonzero()) + choosen_labels = possible_labels.nonzero()[1][:head_index] + print("choosen", choosen_labels) + y[0, choosen_labels] = 1 + print("y", y) + if self.sampling_strategy_ == 'union': + choosen_labels = possible_labels.nonzero()[0] + y[choosen_labels] = 1 + if self.sampling_strategy_ == 'intersection': + choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) + y[choosen_labels] = 1 + y = sparse.csr_matrix(y) + + else: + neighbours_labels = [] + for ni in neighbour_ids: + neighbours_labels.append(self.labels[ni].tolist()) + + labels = [] # sample_labels.tolist() + labels += [a for x in neighbours_labels for a in ( + x if isinstance(x, list) else [x])] + labels = list(set(labels)) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + y = labels[:head_index] + if self.sampling_strategy_ == 'union': + y = labels[:] + if self.sampling_strategy_ == 'intersection': + y = list(set.intersection(*neighbours_labels)) - X = synth_sample return X, y def _calc_distances(self, sample, min_bag): @@ -173,6 +218,8 @@ def f(c): return np.sum(np.array([f(c)for c in self.unique_labels])) def _get_all_instances_of_label(self, label): + if sparse.issparse(self.labels): + return self.labels[:, label].nonzero()[0] instance_ids = [] append_instance_id = instance_ids.append for i, label_set in enumerate(self.labels): @@ -182,7 +229,7 @@ def _get_all_instances_of_label(self, label): def _get_mean_imbalance_ratio(self): ratio_sum = np.sum(self.imbalance_ratio_per_label) - return ratio_sum/self.unique_labels.shape[0] + return ratio_sum/len(self.unique_labels) def _get_imbalance_ratio_per_label(self, label): sum_array = list(map(self._sum_h, self.unique_labels)) @@ -190,6 +237,9 @@ def _get_imbalance_ratio_per_label(self, label): return sum_array.max()/self._sum_h(label) def _sum_h(self, label): + if sparse.issparse(self.labels): + return self.labels[:, label].count_nonzero() + h_sum = 0 def h(l, Y): @@ -200,6 +250,7 @@ def h(l, Y): for label_set in self.labels: h_sum += h(label, label_set) + 
return h_sum def _get_label_frequencies(self, labels): diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py index 2fc53cb0e..49f1c0317 100644 --- a/imblearn/over_sampling/tests/test_mlsmote.py +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -6,6 +6,8 @@ import pytest import numpy as np +from scipy import sparse +from sklearn.preprocessing import MultiLabelBinarizer from imblearn.over_sampling import MLSMOTE @@ -53,6 +55,22 @@ def data_heterogneous_masked(): return X, y, [True, False, True] +def data_sparse(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=np.float64) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.randint(3, size=30) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + labelBinarizer = MultiLabelBinarizer() + y = labelBinarizer.fit_transform(y) + y = sparse.csr_matrix(y) + return X, y, [0, 3] + + def test_mlsmote_error(): X, y, _ = data_heterogneous_unordered() categorical_features = [0, 10] @@ -67,6 +85,7 @@ def test_mlsmote_error(): data_heterogneous_ordered(), data_heterogneous_unordered(), data_heterogneous_masked(), + data_sparse() ], ) def test_mlsmote(data): @@ -100,7 +119,8 @@ def test_mlsmote_fit_resample(): smote = MLSMOTE(categorical_features=categorical_features) _, y_res = smote.fit_resample(X, y) classes_res = np.unique( - np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])])) + np.array([a for x in y_res + for a in (x if isinstance(x, list) else [x])])) _ = Counter(classes_res) n_samples = max(target_stats.values()) assert all(value >= n_samples for value in Counter(classes_res).values()) From 4bb3474e3ac261bd592ffe0e2d79ee534eaddb5b Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Tue, 16 Jun 2020 19:08:25 +0200 Subject: [PATCH 07/10] calculate imbalance ratio on every run instead once --- imblearn/over_sampling/_mlsmote.py | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 2311e0f13..7a435445e 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -59,8 +59,6 @@ def fit_resample(self, X, y): else: self.labels = np.array([np.array(xi) for xi in y]) self.unique_labels = self._collect_unique_labels(y) - self.imbalance_ratio_per_label = np.array( - list(map(self._get_imbalance_ratio_per_label, self.unique_labels))) self.features = X X_synth = [] @@ -74,7 +72,7 @@ def fit_resample(self, X, y): y_synth = None for label in self.unique_labels: - irlbl = self.imbalance_ratio_per_label[label] + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) for sample in min_bag: @@ -89,7 +87,7 @@ def fit_resample(self, X, y): return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: for index, label in np.ndenumerate(self.unique_labels): - irlbl = self.imbalance_ratio_per_label[index] + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) for sample in min_bag: @@ -228,17 +226,25 @@ def _get_all_instances_of_label(self, label): return np.array(instance_ids) def _get_mean_imbalance_ratio(self): - ratio_sum = np.sum(self.imbalance_ratio_per_label) 
+ ratio_sum = np.sum(np.array( + list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))) return ratio_sum/len(self.unique_labels) - def _get_imbalance_ratio_per_label(self, label): - sum_array = list(map(self._sum_h, self.unique_labels)) - sum_array = np.array(sum_array) - return sum_array.max()/self._sum_h(label) + def _get_imbalance_ratio_per_label(self, label, labels=None): + sum_h = self._sum_h + if labels is None: + sum_array = np.array([sum_h(l, self.labels) + for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, self.labels) + else: + sum_array = np.array([sum_h(l, labels)for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, labels) - def _sum_h(self, label): - if sparse.issparse(self.labels): - return self.labels[:, label].count_nonzero() + return ratio + + def _sum_h(self, label, labels): + if sparse.issparse(labels): + return labels[:, label].count_nonzero() h_sum = 0 @@ -248,7 +254,7 @@ def h(l, Y): else: return 0 - for label_set in self.labels: + for label_set in labels: h_sum += h(label, label_set) return h_sum From 32a7b5552d298a6508fde7c1b74cd6227d824806 Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Tue, 16 Jun 2020 19:11:31 +0200 Subject: [PATCH 08/10] use list comprehension for better performance --- imblearn/over_sampling/_mlsmote.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 7a435445e..c90827125 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -182,16 +182,14 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): return X, y def _calc_distances(self, sample, min_bag): - distances = [] - append_distances = distances.append - for bag_sample in min_bag: - nominal_distances = np.array([self._get_vdm( - self.features[sample, cat], self.features[bag_sample, cat])for cat in self.categorical_features_]) - ordinal_distances = np.array([self._get_euclidean_distance( + def calc_dist(bag_sample): + nominal_distance = sum([self._get_vdm( + self.features[sample, cat], self.features[bag_sample, cat], cat)for cat in self.categorical_features_]) + ordinal_distance = sum([self._get_euclidean_distance( self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_]) - dists = np.array( - [nominal_distances.sum(), ordinal_distances.sum()]) - append_distances((dists.sum(), bag_sample)) + dist = sum([nominal_distance, ordinal_distance]) + return (dist, bag_sample) + distances = [calc_dist(bag_sample) for bag_sample in min_bag] dtype = np.dtype([('distance', float), ('index', int)]) return np.array(distances, dtype=dtype) From f1343496a2e922afda5c3372f4bc9efc5b25197e Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Tue, 16 Jun 2020 19:15:19 +0200 Subject: [PATCH 09/10] fix vdm and add experimental sparse implementation --- imblearn/over_sampling/_mlsmote.py | 43 +++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index c90827125..b3a055cba 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -197,21 +197,38 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first-second) return euclidean_distance - def _get_vdm(self, first, second): + def _get_vdm(self, first, second, category): """A support function to compute the Value Difference Metric(VDM) 
described in https://arxiv.org/pdf/cs/9701101.pdf"""
+        if sparse.issparse(self.features):
+            def f_sparse(c):
+                N_ax = len(sparse.find(self.features[:, category] == first)[0])
+                N_ay = len(sparse.find(
+                    self.features[:, category] == second)[0])
+                c_instances = self._get_all_instances_of_label(c)
+                N_axc = len(sparse.find(
+                    self.features[c_instances, category] == first)[0])
+                N_ayc = len(sparse.find(
+                    self.features[c_instances, category] == second)[0])
+                p = np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
+                return p
+
+            vdm = np.sum(np.array([f_sparse(c)for c in self.unique_labels]))
+            return vdm
+
+        category_rows = self.features[:, category]
+        N_ax = len(np.where(category_rows == first)[0])  # [0]: count of matches, not tuple length
+        N_ay = len(np.where(category_rows == second)[0])
+
         def f(c):
-            N_ax = len(
-                np.where(self.features[:, self.categorical_features_] == first))
-            N_ay = len(
-                np.where(self.features[:, self.categorical_features_] == second))
-            c_instances = self._get_all_instances_of_label(c)
-            N_axc = len(np.where(self.features[np.ix_(
-                c_instances, self.categorical_features_)] == first)[0])
-            N_ayc = len(np.where(self.features[np.ix_(
-                c_instances, self.categorical_features_)] == second)[0])
-            return np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
-
-        return np.sum(np.array([f(c)for c in self.unique_labels]))
+            class_instances = self._get_all_instances_of_label(c)
+            class_instance_rows = category_rows[class_instances]
+            N_axc = len(np.where(class_instance_rows == first)[0])
+            N_ayc = len(np.where(class_instance_rows == second)[0])
+            p = abs((N_axc/N_ax)-(N_ayc/N_ay))
+            return p
+
+        vdm = np.array([f(c)for c in self.unique_labels]).sum()
+        return vdm

     def _get_all_instances_of_label(self, label):
         if sparse.issparse(self.labels):
From 3361578e469e8817bb3af356e78408bf5b3a54f2 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Tue, 16 Jun 2020 19:15:39 +0200
Subject: [PATCH 10/10] remove prints

---
 imblearn/over_sampling/_mlsmote.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index b3a055cba..24ba43098 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -143,17 +143,12 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids):

         if sparse.issparse(self.labels):
             neighbours_labels = self.labels[neighbour_ids]
-            print("ns", neighbours_labels)
             possible_labels = neighbours_labels.sum(axis=0)
-            print("possible", possible_labels)
             y = np.zeros((1, len(self.unique_labels)))
             if self.sampling_strategy_ == 'ranking':
                 head_index = int((self.k_neighbors + 1)/2)
-                print("choosen_nonz", possible_labels.nonzero())
                 choosen_labels = possible_labels.nonzero()[1][:head_index]
-                print("choosen", choosen_labels)
                 y[0, choosen_labels] = 1
-                print("y", y)
             if self.sampling_strategy_ == 'union':
                 choosen_labels = possible_labels.nonzero()[0]
                 y[choosen_labels] = 1
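
A note on the minority-label selection used throughout the patches above: ``fit_resample`` only oversamples labels whose per-label imbalance ratio (IRLbl) exceeds the mean ratio over all labels (MeanIR), as in Charte et al. The standalone sketch below (plain Python; not part of the patches, and reusing the labelsets from the test fixtures) reproduces that bookkeeping outside the class:

    from collections import Counter

    # labelsets from the test fixtures above
    y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20

    # _sum_h: number of samples carrying each label
    # (here: 0 -> 5, 1 -> 25, 2 -> 10, 3 -> 7, 4 -> 2)
    counts = Counter(label for labelset in y for label in labelset)

    # _get_imbalance_ratio_per_label: IRLbl(l) = max count / count(l)
    irlbl = {label: max(counts.values()) / n for label, n in counts.items()}

    # _get_mean_imbalance_ratio: MeanIR = mean of all IRLbl values (~4.91)
    mean_ir = sum(irlbl.values()) / len(irlbl)

    # only labels with IRLbl > MeanIR form a minority bag and get new samples
    print(sorted(label for label, r in irlbl.items() if r > mean_ir))  # [0, 4]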
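The sparse-label path added in [PATCH 06/10] can be exercised the same way; a second sketch (again illustrative, mirroring the ``data_sparse`` test fixture rather than a settled public API):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import MultiLabelBinarizer
    from imblearn.over_sampling import MLSMOTE

    rng = np.random.RandomState(42)
    X = np.empty((30, 4), dtype=np.float64)
    X[:, [1, 2]] = rng.randn(30, 2)      # two continuous features
    X[:, 0] = rng.randint(3, size=30)    # categorical, encoded as integers
    X[:, 3] = rng.randint(3, size=30)
    y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20
    # binary indicator matrix, stored sparse
    y_bin = sparse.csr_matrix(MultiLabelBinarizer().fit_transform(y))

    smote = MLSMOTE(categorical_features=[0, 3])
    X_res, y_res = smote.fit_resample(X, y_bin)  # y_res stays a sparse matrix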