From 802caae2e04861198aea16e86d4cb3b5f85ce448 Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Mon, 11 May 2020 00:51:37 +0200 Subject: [PATCH 01/10] add basic tests for mlsmote --- imblearn/over_sampling/tests/test_mlsmote.py | 104 +++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 imblearn/over_sampling/tests/test_mlsmote.py diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py new file mode 100644 index 000000000..eda1f0708 --- /dev/null +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -0,0 +1,104 @@ +"""Test the module MLSMOTE.""" + + +from collections import Counter + +import pytest + +import numpy as np + + +from imblearn.over_sampling import MLSMOTE + + +def data_heterogneous_ordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, :2] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20) + # return the categories + return X, y, [2, 3] + + +def data_heterogneous_unordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20) + # return the categories + return X, y, [0, 3] + + +def data_heterogneous_masked(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20) + # return the categories + return X, y, [True, False, True] + + +def test_mlsmote_error(): + X, y, _ = data_heterogneous_unordered() + categorical_features = [0, 10] + smote = MLSMOTE(categorical_features=categorical_features) + with pytest.raises(ValueError, match="indices are out of range"): + smote.fit_resample(X, y) + + +@pytest.mark.parametrize( + "data", + [ + data_heterogneous_ordered(), + data_heterogneous_unordered(), + data_heterogneous_masked(), + ], +) +def test_mlsmote(data): + X, y, categorical_features = data + smote = MLSMOTE(categorical_features=categorical_features) + X_resampled, y_resampled = smote.fit_resample(X, y) + + assert X_resampled.dtype == X.dtype + + categorical_features = np.array(categorical_features) + if categorical_features.dtype == bool: + categorical_features = np.flatnonzero(categorical_features) + for cat_idx in categorical_features: + assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx]) + assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype + +def test_mlsmote_fit(): + X, y, categorical_features = data_heterogneous_unordered() + smote = MLSMOTE(categorical_features=categorical_features) + smote.fit_resample(X, y) + assert hasattr( + smote, "sampling_strategy_" + ), "No fitted attribute sampling_strategy_" + + +def test_mlsmote_fit_resample(): + X, y, categorical_features = 
data_heterogneous_unordered()
+    target_stats = Counter(np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
+    smote = MLSMOTE(categorical_features=categorical_features)
+    _, y_res = smote.fit_resample(X, y)
+    classes_res=np.unique(np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
+    _ = Counter(classes_res)
+    n_samples = max(target_stats.values())
+    assert all(value >= n_samples for value in Counter(classes_res).values())
+
From 9b2ec7f1ae3a5708121d2004662d7f8bdb98195b Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Mon, 11 May 2020 00:52:07 +0200
Subject: [PATCH 02/10] add mlsmote implementation

---
 imblearn/over_sampling/__init__.py |   2 +
 imblearn/over_sampling/_mlsmote.py | 202 +++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)
 create mode 100644 imblearn/over_sampling/_mlsmote.py

diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py
index bd20b76ea..07f10a3f6 100644
--- a/imblearn/over_sampling/__init__.py
+++ b/imblearn/over_sampling/__init__.py
@@ -10,6 +10,7 @@
 from ._smote import KMeansSMOTE
 from ._smote import SVMSMOTE
 from ._smote import SMOTENC
+from ._mlsmote import MLSMOTE

 __all__ = [
     "ADASYN",
@@ -19,4 +20,5 @@
     "BorderlineSMOTE",
     "SVMSMOTE",
     "SMOTENC",
+    "MLSMOTE"
 ]
diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
new file mode 100644
index 000000000..15b19ae02
--- /dev/null
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -0,0 +1,202 @@
+import numpy as np
+import itertools
+import collections
+import random
+
+class MLSMOTE:
+    """Over-sampling using MLSMOTE.
+
+    Parameters
+    ----------
+    sampling_strategy : {'ranking', 'union', 'intersection'}, default='ranking'
+        Strategy used to generate the labelsets of the synthetic samples.
+
+    k_neighbors : int, default=5
+        Number of nearest neighbours used to construct synthetic samples.
+
+    categorical_features : ndarray of shape (n_cat_features,) or (n_features,)
+        Specifies which features are categorical. Can either be:
+
+        - array of indices specifying the categorical features;
+        - mask array of shape (n_features, ) and ``bool`` dtype for which
+          ``True`` indicates the categorical features.
+
+    Notes
+    -----
+    See the original paper [1]_ for more details.
+
+    References
+    ----------
+    .. [1] Charte, F., Rivera, A. J., del Jesus, M. J., Herrera, F. (2015).
+       MLSMOTE: Approaching imbalanced multilabel learning through synthetic
+       instance generation. Knowledge-Based Systems, 89, 385-397.
+       doi:10.1016/j.knosys.2015.07.019
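+
+    Examples
+    --------
+    A minimal usage sketch (illustrative only; it assumes ``y`` is given as
+    an array of label-lists, mirroring the accompanying tests):
+
+    >>> import numpy as np
+    >>> from imblearn.over_sampling import MLSMOTE
+    >>> rng = np.random.RandomState(42)
+    >>> X = np.empty((30, 4), dtype=object)
+    >>> X[:, :2] = rng.randn(30, 2)
+    >>> X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)
+    >>> X[:, 3] = rng.randint(3, size=30)
+    >>> y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2
+    ...              + [[1, 2]] * 3 + [[1]] * 20)
+    >>> smote = MLSMOTE(categorical_features=[2, 3])
+    >>> X_res, y_res = smote.fit_resample(X, y)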
+ + """ + def __init__(self,categorical_features,k_neighbors=5 ,sampling_strategy='ranking'): + self.k_neighbors=k_neighbors + self.sampling_strategy_=sampling_strategy + self.categorical_features = categorical_features + self.continuous_features_= None + self.unique_labels = [] + self.labels=[] + self.features=[] + + def fit_resample(self,X,y): + self.n_features_ = X.shape[1] + self.labels=np.array([np.array(xi) for xi in y]) + + self._validate_estimator() + + X_resampled = X.copy() + y_resampled = y.copy() + + self.unique_labels = self._collect_unique_labels(y) + self.features=X + + X_synth=[] + y_synth=[] + + append_X_synth=X_synth.append + append_y_synth=y_synth.append + mean_ir=self._get_mean_imbalance_ratio() + for label in self.unique_labels: + irlbl=self._get_imbalance_ratio_per_label(label) + if irlbl > mean_ir: + min_bag=self._get_all_instances_of_label(label) + for sample in min_bag: + distances=self._calc_distances(sample,min_bag) + distances=np.sort(distances,order='distance') + neighbours=distances[:self.k_neighbors] + ref_neigh=np.random.choice(neighbours,1)[0] + X_new,y_new=self._create_new_sample(sample,ref_neigh[1],[x[1] for x in neighbours]) + append_X_synth(X_new) + append_y_synth(y_new) + + return np.concatenate((X_resampled,np.array(X_synth))),np.array(y_resampled.tolist()+y_synth) + + def _validate_estimator(self): + categorical_features = np.asarray(self.categorical_features) + if categorical_features.dtype.name == "bool": + self.categorical_features_ = np.flatnonzero(categorical_features) + else: + if any( + [ + cat not in np.arange(self.n_features_) + for cat in categorical_features + ] + ): + raise ValueError( + "Some of the categorical indices are out of range. Indices" + " should be between 0 and {}".format(self.n_features_) + ) + self.categorical_features_ = categorical_features + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) + + def _collect_unique_labels(self, y): + """A support function that flattens the labelsets and return one set of unique labels""" + return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) + + def _create_new_sample(self,sample_id,ref_neigh_id,neighbour_ids): + sample=self.features[sample_id] + sample_labels=self.labels[sample_id] + synth_sample=np.copy(sample) + ref_neigh=self.features[ref_neigh_id] + neighbours_labels=[] + for ni in neighbour_ids: + neighbours_labels.append(self.labels[ni].tolist()) + for i in range(synth_sample.shape[0]): + if i in self.continuous_features_: + diff=ref_neigh[i]-sample[i] + offset=diff*random.uniform(0,1) + synth_sample[i]=sample[i]+offset + if i in self.categorical_features_: + synth_sample[i]=self._get_most_frequent_value(self.features[neighbour_ids,i]) + + labels=sample_labels.tolist() + labels+=[a for x in neighbours_labels for a in (x if isinstance(x, list) else [x])] + labels=list(set(labels)) + if self.sampling_strategy_=='ranking': + head_index=int((self.k_neighbors+ 1)/2) + y=labels[:head_index] + if self.sampling_strategy_=='union': + y=labels[:] + if self.sampling_strategy_=='intersection': + y=list(set.intersection(*neighbours_labels)) + + X=synth_sample + return X,y + + + def _calc_distances(self,sample,min_bag): + distances=[] + append_distances=distances.append + for bag_sample in min_bag: + nominal_distances=np.array([self._get_vdm(self.features[sample,cat],self.features[bag_sample,cat])for cat in self.categorical_features_]) + 
ordinal_distances=np.array([self._get_euclidean_distance(self.features[sample,num],self.features[bag_sample,num])for num in self.continuous_features_])
+            dists=np.array([nominal_distances.sum(),ordinal_distances.sum()])
+            append_distances((dists.sum(),bag_sample))
+        dtype = np.dtype([('distance', float), ('index', int)])
+        return np.array(distances,dtype=dtype)
+
+
+    def _get_euclidean_distance(self,first,second):
+        euclidean_distance=np.linalg.norm(first-second)
+        return euclidean_distance
+
+    def _get_vdm(self,first,second):
+        """A support function to compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf"""
+        def f(c):
+            N_ax=len(np.where(self.features[:,self.categorical_features_]==first))
+            N_ay=len(np.where(self.features[:,self.categorical_features_]==second))
+            c_instances=self._get_all_instances_of_label(c)
+            N_axc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==first)[0])
+            N_ayc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==second)[0])
+            return np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
+
+        return np.sum(np.array([f(c)for c in self.unique_labels]))
+
+    def _get_all_instances_of_label(self,label):
+        instance_ids=[]
+        append_instance_id=instance_ids.append
+        for i,label_set in enumerate(self.labels):
+            if label in label_set:
+                append_instance_id(i)
+        return np.array(instance_ids)
+
+    def _get_mean_imbalance_ratio(self):
+        ratio_sum=np.sum(np.array(list(map(self._get_imbalance_ratio_per_label,self.unique_labels))))
+        return ratio_sum/self.unique_labels.shape[0]
+
+    def _get_imbalance_ratio_per_label(self,label):
+        sum_array=list(map(self._sum_h,self.unique_labels))
+        sum_array=np.array(sum_array)
+        return sum_array.max()/self._sum_h(label)
+
+    def _sum_h(self,label):
+        h_sum=0
+        def h(l,Y):
+            if l in Y:
+                return 1
+            else:
+                return 0
+
+        for label_set in self.labels:
+            h_sum+=h(label,label_set)
+        return h_sum
+
+
+    def _get_label_frequencies(self,labels):
+        """A support function to get the frequencies of labels"""
+        frequency_map=np.array(np.unique(labels, return_counts=True)).T
+        frequencies=np.array([x[1] for x in count_map])
+        return frequencies
+
+    def _get_most_frequent_value(self, values):
+        """A support function to get the most frequent value of a list of values"""
+        uniques, indices = np.unique(values, return_inverse=True)
+        return uniques[np.argmax(np.bincount(indices))]
\ No newline at end of file
From 948da4af21d37072b9acba950b4d19d58b93fa6a Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Mon, 11 May 2020 01:11:27 +0200
Subject: [PATCH 03/10] format code

---
 imblearn/over_sampling/_mlsmote.py           | 183 ++++++++++---------
 imblearn/over_sampling/tests/test_mlsmote.py |  14 +-
 2 files changed, 105 insertions(+), 92 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index 15b19ae02..b943bcc52 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -3,6 +3,7 @@
 import collections
 import random
+
 class MLSMOTE:
     """Over-sampling using MLSMOTE.

@@ -35,18 +36,19 @@ class MLSMOTE:
        doi:10.1016/j.knosys.2015.07.019
""" - def __init__(self,categorical_features,k_neighbors=5 ,sampling_strategy='ranking'): - self.k_neighbors=k_neighbors - self.sampling_strategy_=sampling_strategy + + def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'): + self.k_neighbors = k_neighbors + self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features - self.continuous_features_= None + self.continuous_features_ = None self.unique_labels = [] - self.labels=[] - self.features=[] + self.labels = [] + self.features = [] - def fit_resample(self,X,y): + def fit_resample(self, X, y): self.n_features_ = X.shape[1] - self.labels=np.array([np.array(xi) for xi in y]) + self.labels = np.array([np.array(xi) for xi in y]) self._validate_estimator() @@ -54,28 +56,29 @@ def fit_resample(self,X,y): y_resampled = y.copy() self.unique_labels = self._collect_unique_labels(y) - self.features=X + self.features = X - X_synth=[] - y_synth=[] + X_synth = [] + y_synth = [] - append_X_synth=X_synth.append - append_y_synth=y_synth.append - mean_ir=self._get_mean_imbalance_ratio() + append_X_synth = X_synth.append + append_y_synth = y_synth.append + mean_ir = self._get_mean_imbalance_ratio() for label in self.unique_labels: - irlbl=self._get_imbalance_ratio_per_label(label) + irlbl = self._get_imbalance_ratio_per_label(label) if irlbl > mean_ir: - min_bag=self._get_all_instances_of_label(label) + min_bag = self._get_all_instances_of_label(label) for sample in min_bag: - distances=self._calc_distances(sample,min_bag) - distances=np.sort(distances,order='distance') - neighbours=distances[:self.k_neighbors] - ref_neigh=np.random.choice(neighbours,1)[0] - X_new,y_new=self._create_new_sample(sample,ref_neigh[1],[x[1] for x in neighbours]) + distances = self._calc_distances(sample, min_bag) + distances = np.sort(distances, order='distance') + neighbours = distances[:self.k_neighbors] + ref_neigh = np.random.choice(neighbours, 1)[0] + X_new, y_new = self._create_new_sample( + sample, ref_neigh[1], [x[1] for x in neighbours]) append_X_synth(X_new) append_y_synth(y_new) - return np.concatenate((X_resampled,np.array(X_synth))),np.array(y_resampled.tolist()+y_synth) + return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth) def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) @@ -101,102 +104,110 @@ def _collect_unique_labels(self, y): """A support function that flattens the labelsets and return one set of unique labels""" return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) - def _create_new_sample(self,sample_id,ref_neigh_id,neighbour_ids): - sample=self.features[sample_id] - sample_labels=self.labels[sample_id] - synth_sample=np.copy(sample) - ref_neigh=self.features[ref_neigh_id] - neighbours_labels=[] + def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): + sample = self.features[sample_id] + sample_labels = self.labels[sample_id] + synth_sample = np.copy(sample) + ref_neigh = self.features[ref_neigh_id] + neighbours_labels = [] for ni in neighbour_ids: neighbours_labels.append(self.labels[ni].tolist()) for i in range(synth_sample.shape[0]): if i in self.continuous_features_: - diff=ref_neigh[i]-sample[i] - offset=diff*random.uniform(0,1) - synth_sample[i]=sample[i]+offset + diff = ref_neigh[i]-sample[i] + offset = diff*random.uniform(0, 1) + synth_sample[i] = sample[i]+offset if i in self.categorical_features_: - 
synth_sample[i]=self._get_most_frequent_value(self.features[neighbour_ids,i])
-
-        labels=sample_labels.tolist()
-        labels+=[a for x in neighbours_labels for a in (x if isinstance(x, list) else [x])]
-        labels=list(set(labels))
-        if self.sampling_strategy_=='ranking':
-            head_index=int((self.k_neighbors+ 1)/2)
-            y=labels[:head_index]
-        if self.sampling_strategy_=='union':
-            y=labels[:]
-        if self.sampling_strategy_=='intersection':
-            y=list(set.intersection(*neighbours_labels))
-
-        X=synth_sample
-        return X,y
-
-
-    def _calc_distances(self,sample,min_bag):
-        distances=[]
-        append_distances=distances.append
+                synth_sample[i] = self._get_most_frequent_value(
+                    self.features[neighbour_ids, i])
+
+        labels = sample_labels.tolist()
+        labels += [a for x in neighbours_labels for a in (
+            x if isinstance(x, list) else [x])]
+        labels = list(set(labels))
+        if self.sampling_strategy_ == 'ranking':
+            head_index = int((self.k_neighbors + 1)/2)
+            y = labels[:head_index]
+        if self.sampling_strategy_ == 'union':
+            y = labels[:]
+        if self.sampling_strategy_ == 'intersection':
+            y = list(set.intersection(*neighbours_labels))
+
+        X = synth_sample
+        return X, y
+
+    def _calc_distances(self, sample, min_bag):
+        distances = []
+        append_distances = distances.append
         for bag_sample in min_bag:
-            nominal_distances=np.array([self._get_vdm(self.features[sample,cat],self.features[bag_sample,cat])for cat in self.categorical_features_])
-            ordinal_distances=np.array([self._get_euclidean_distance(self.features[sample,num],self.features[bag_sample,num])for num in self.continuous_features_])
-            dists=np.array([nominal_distances.sum(),ordinal_distances.sum()])
-            append_distances((dists.sum(),bag_sample))
-        dtype = np.dtype([('distance', float), ('index', int)])
-        return np.array(distances,dtype=dtype)
-
-
-    def _get_euclidean_distance(self,first,second):
-        euclidean_distance=np.linalg.norm(first-second)
+            nominal_distances = np.array([self._get_vdm(
+                self.features[sample, cat], self.features[bag_sample, cat])for cat in self.categorical_features_])
+            ordinal_distances = np.array([self._get_euclidean_distance(
+                self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_])
+            dists = np.array(
+                [nominal_distances.sum(), ordinal_distances.sum()])
+            append_distances((dists.sum(), bag_sample))
+        dtype = np.dtype([('distance', float), ('index', int)])
+        return np.array(distances, dtype=dtype)
+
+    def _get_euclidean_distance(self, first, second):
+        euclidean_distance = np.linalg.norm(first-second)
         return euclidean_distance

-    def _get_vdm(self,first,second):
+    def _get_vdm(self, first, second):
         """A support function to compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf"""
         def f(c):
-            N_ax=len(np.where(self.features[:,self.categorical_features_]==first))
-            N_ay=len(np.where(self.features[:,self.categorical_features_]==second))
-            c_instances=self._get_all_instances_of_label(c)
-            N_axc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==first)[0])
-            N_ayc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==second)[0])
+            N_ax = len(
+                np.where(self.features[:, self.categorical_features_] == first))
+            N_ay = len(
+                np.where(self.features[:, self.categorical_features_] == second))
+            c_instances = self._get_all_instances_of_label(c)
+            N_axc = len(np.where(self.features[np.ix_(
+                c_instances, self.categorical_features_)] == first)[0])
+            N_ayc = len(np.where(self.features[np.ix_(
+                c_instances, self.categorical_features_)] ==
second)[0])
             return np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
-
+
         return np.sum(np.array([f(c)for c in self.unique_labels]))

-    def _get_all_instances_of_label(self,label):
-        instance_ids=[]
-        append_instance_id=instance_ids.append
-        for i,label_set in enumerate(self.labels):
+    def _get_all_instances_of_label(self, label):
+        instance_ids = []
+        append_instance_id = instance_ids.append
+        for i, label_set in enumerate(self.labels):
             if label in label_set:
                 append_instance_id(i)
         return np.array(instance_ids)

     def _get_mean_imbalance_ratio(self):
-        ratio_sum=np.sum(np.array(list(map(self._get_imbalance_ratio_per_label,self.unique_labels))))
+        ratio_sum = np.sum(
+            np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))))
         return ratio_sum/self.unique_labels.shape[0]

-    def _get_imbalance_ratio_per_label(self,label):
-        sum_array=list(map(self._sum_h,self.unique_labels))
-        sum_array=np.array(sum_array)
+    def _get_imbalance_ratio_per_label(self, label):
+        sum_array = list(map(self._sum_h, self.unique_labels))
+        sum_array = np.array(sum_array)
         return sum_array.max()/self._sum_h(label)

-    def _sum_h(self,label):
-        h_sum=0
-        def h(l,Y):
+    def _sum_h(self, label):
+        h_sum = 0
+
+        def h(l, Y):
             if l in Y:
                 return 1
             else:
                 return 0

         for label_set in self.labels:
-            h_sum+=h(label,label_set)
+            h_sum += h(label, label_set)
         return h_sum

-
-    def _get_label_frequencies(self,labels):
+    def _get_label_frequencies(self, labels):
         """A support function to get the frequencies of labels"""
-        frequency_map=np.array(np.unique(labels, return_counts=True)).T
-        frequencies=np.array([x[1] for x in count_map])
+        frequency_map = np.array(np.unique(labels, return_counts=True)).T
+        frequencies = np.array([x[1] for x in count_map])
         return frequencies
-
+
     def _get_most_frequent_value(self, values):
         """A support function to get the most frequent value of a list of values"""
         uniques, indices = np.unique(values, return_inverse=True)
-        return uniques[np.argmax(np.bincount(indices))]
\ No newline at end of file
+        return uniques[np.argmax(np.bincount(indices))]
diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py
index eda1f0708..2fc53cb0e 100644
--- a/imblearn/over_sampling/tests/test_mlsmote.py
+++ b/imblearn/over_sampling/tests/test_mlsmote.py
@@ -20,7 +20,7 @@ def data_heterogneous_ordered():
     X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)
     # create a categorical feature using some integer
     X[:, 3] = rng.randint(3, size=30)
-    y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
+    y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
     # return the categories
     return X, y, [2, 3]
@@ -34,7 +34,7 @@ def data_heterogneous_unordered():
     X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object)
     # create a categorical feature using some integer
     X[:, 3] = rng.randint(3, size=30)
-    y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
+    y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
     # return the categories
     return X, y, [0, 3]
@@ -48,7 +48,7 @@ def data_heterogneous_masked():
     X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object)
     # create a categorical feature using some integer
     X[:, 3] = rng.randint(3, size=30)
-    y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
+    y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
     # return the categories
     return X, y, [True, False, True]
@@ -83,6 +83,7 @@ def test_mlsmote(data):
     assert set(X[:, cat_idx]) ==
set(X_resampled[:, cat_idx])
     assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype
+
 def test_mlsmote_fit():
     X, y, categorical_features = data_heterogneous_unordered()
     smote = MLSMOTE(categorical_features=categorical_features)
@@ -94,11 +95,12 @@ def test_mlsmote_fit():

 def test_mlsmote_fit_resample():
     X, y, categorical_features = data_heterogneous_unordered()
-    target_stats = Counter(np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
+    target_stats = Counter(np.unique(
+        np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
     smote = MLSMOTE(categorical_features=categorical_features)
     _, y_res = smote.fit_resample(X, y)
-    classes_res=np.unique(np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
+    classes_res = np.unique(
+        np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
     _ = Counter(classes_res)
     n_samples = max(target_stats.values())
     assert all(value >= n_samples for value in Counter(classes_res).values())
-
From bef048749b997f8e03f21a49a11b49826e290a52 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Mon, 11 May 2020 23:13:44 +0200
Subject: [PATCH 04/10] fix refactor error

---
 imblearn/over_sampling/_mlsmote.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index b943bcc52..c150d13eb 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -204,7 +204,7 @@ def h(l, Y):
     def _get_label_frequencies(self, labels):
         """A support function to get the frequencies of labels"""
         frequency_map = np.array(np.unique(labels, return_counts=True)).T
-        frequencies = np.array([x[1] for x in count_map])
+        frequencies = np.array([x[1] for x in frequency_map])
         return frequencies

     def _get_most_frequent_value(self, values):
From 25eb158b4e22e33b987452a62e8bec0b7261abb4 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Wed, 13 May 2020 13:25:20 +0200
Subject: [PATCH 05/10] compute imbalance_ratio_per_label just once

---
 imblearn/over_sampling/_mlsmote.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index c150d13eb..ecab4687e 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -56,6 +56,8 @@ def fit_resample(self, X, y):
         y_resampled = y.copy()

         self.unique_labels = self._collect_unique_labels(y)
+        self.imbalance_ratio_per_label = np.array(
+            list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))
         self.features = X

@@ -64,8 +66,8 @@ def fit_resample(self, X, y):
         append_X_synth = X_synth.append
         append_y_synth = y_synth.append
         mean_ir = self._get_mean_imbalance_ratio()
-        for label in self.unique_labels:
-            irlbl = self._get_imbalance_ratio_per_label(label)
+        for index, label in np.ndenumerate(self.unique_labels):
+            irlbl = self.imbalance_ratio_per_label[index]
             if irlbl > mean_ir:
                 min_bag = self._get_all_instances_of_label(label)
                 for sample in min_bag:
@@ -179,8 +181,7 @@ def _get_all_instances_of_label(self, label):
         return np.array(instance_ids)

     def _get_mean_imbalance_ratio(self):
-        ratio_sum = np.sum(
-            np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))))
+        ratio_sum = np.sum(self.imbalance_ratio_per_label)
         return ratio_sum/self.unique_labels.shape[0]
From e6c847e57e18da8660cb03cbb815d42d9693d5c7 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Wed, 13
May 2020 22:56:38 +0200
Subject: [PATCH 06/10] add sparse matrix support for labels

---
 imblearn/over_sampling/_mlsmote.py           | 123 +++++++++++++------
 imblearn/over_sampling/tests/test_mlsmote.py |  22 +++-
 2 files changed, 108 insertions(+), 37 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index ecab4687e..2311e0f13 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -2,8 +2,7 @@ import numpy as np
 import itertools
 import collections
 import random
-
-
+from scipy import sparse

 class MLSMOTE:
     """Over-sampling using MLSMOTE.
@@ -48,14 +47,18 @@ def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranki
     def fit_resample(self, X, y):
         self.n_features_ = X.shape[1]
-        self.labels = np.array([np.array(xi) for xi in y])

         self._validate_estimator()

         X_resampled = X.copy()
         y_resampled = y.copy()

-        self.unique_labels = self._collect_unique_labels(y)
+        if sparse.issparse(y):
+            self.labels = y
+            self.unique_labels = range(0, y.shape[1])
+        else:
+            self.labels = np.array([np.array(xi) for xi in y])
+            self.unique_labels = self._collect_unique_labels(y)
         self.imbalance_ratio_per_label = np.array(
             list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))
         self.features = X
@@ -66,21 +69,39 @@ def fit_resample(self, X, y):
         append_X_synth = X_synth.append
         append_y_synth = y_synth.append
         mean_ir = self._get_mean_imbalance_ratio()
-        for index, label in np.ndenumerate(self.unique_labels):
-            irlbl = self.imbalance_ratio_per_label[index]
-            if irlbl > mean_ir:
-                min_bag = self._get_all_instances_of_label(label)
-                for sample in min_bag:
-                    distances = self._calc_distances(sample, min_bag)
-                    distances = np.sort(distances, order='distance')
-                    neighbours = distances[:self.k_neighbors]
-                    ref_neigh = np.random.choice(neighbours, 1)[0]
-                    X_new, y_new = self._create_new_sample(
-                        sample, ref_neigh[1], [x[1] for x in neighbours])
-                    append_X_synth(X_new)
-                    append_y_synth(y_new)
-
-        return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth)
+
+        if sparse.issparse(y):
+            y_synth = None
+
+            for label in self.unique_labels:
+                irlbl = self.imbalance_ratio_per_label[label]
+                if irlbl > mean_ir:
+                    min_bag = self._get_all_instances_of_label(label)
+                    for sample in min_bag:
+                        distances = self._calc_distances(sample, min_bag)
+                        distances = np.sort(distances, order='distance')
+                        neighbours = distances[:self.k_neighbors]
+                        ref_neigh = np.random.choice(neighbours, 1)[0]
+                        X_new, y_new = self._create_new_sample(
+                            sample, ref_neigh[1], [x[1] for x in neighbours])
+                        append_X_synth(X_new)
+                        y_resampled = sparse.vstack((y_resampled, y_new))
+            return np.concatenate((X_resampled, np.array(X_synth))), y_resampled
+        else:
+            for index, label in np.ndenumerate(self.unique_labels):
+                irlbl = self.imbalance_ratio_per_label[index]
+                if irlbl > mean_ir:
+                    min_bag = self._get_all_instances_of_label(label)
+                    for sample in min_bag:
+                        distances = self._calc_distances(sample, min_bag)
+                        distances = np.sort(distances, order='distance')
+                        neighbours = distances[:self.k_neighbors]
+                        ref_neigh = np.random.choice(neighbours, 1)[0]
+                        X_new, y_new = self._create_new_sample(
+                            sample, ref_neigh[1], [x[1] for x in neighbours])
+                        append_X_synth(X_new)
+                        append_y_synth(y_new)
+            return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth)

     def _validate_estimator(self):
         categorical_features = np.asarray(self.categorical_features)
@@ -108,12 +129,10 @@ def _collect_unique_labels(self, y):
     def
_create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): sample = self.features[sample_id] - sample_labels = self.labels[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] - neighbours_labels = [] - for ni in neighbour_ids: - neighbours_labels.append(self.labels[ni].tolist()) + sample_labels = self.labels[sample_id] + for i in range(synth_sample.shape[0]): if i in self.continuous_features_: diff = ref_neigh[i]-sample[i] @@ -122,20 +141,46 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( self.features[neighbour_ids, i]) + X = synth_sample - labels = sample_labels.tolist() - labels += [a for x in neighbours_labels for a in ( - x if isinstance(x, list) else [x])] - labels = list(set(labels)) - if self.sampling_strategy_ == 'ranking': - head_index = int((self.k_neighbors + 1)/2) - y = labels[:head_index] - if self.sampling_strategy_ == 'union': - y = labels[:] - if self.sampling_strategy_ == 'intersection': - y = list(set.intersection(*neighbours_labels)) + if sparse.issparse(self.labels): + neighbours_labels = self.labels[neighbour_ids] + print("ns", neighbours_labels) + possible_labels = neighbours_labels.sum(axis=0) + print("possible", possible_labels) + y = np.zeros((1, len(self.unique_labels))) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + print("choosen_nonz", possible_labels.nonzero()) + choosen_labels = possible_labels.nonzero()[1][:head_index] + print("choosen", choosen_labels) + y[0, choosen_labels] = 1 + print("y", y) + if self.sampling_strategy_ == 'union': + choosen_labels = possible_labels.nonzero()[0] + y[choosen_labels] = 1 + if self.sampling_strategy_ == 'intersection': + choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) + y[choosen_labels] = 1 + y = sparse.csr_matrix(y) + + else: + neighbours_labels = [] + for ni in neighbour_ids: + neighbours_labels.append(self.labels[ni].tolist()) + + labels = [] # sample_labels.tolist() + labels += [a for x in neighbours_labels for a in ( + x if isinstance(x, list) else [x])] + labels = list(set(labels)) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + y = labels[:head_index] + if self.sampling_strategy_ == 'union': + y = labels[:] + if self.sampling_strategy_ == 'intersection': + y = list(set.intersection(*neighbours_labels)) - X = synth_sample return X, y def _calc_distances(self, sample, min_bag): @@ -173,6 +218,8 @@ def f(c): return np.sum(np.array([f(c)for c in self.unique_labels])) def _get_all_instances_of_label(self, label): + if sparse.issparse(self.labels): + return self.labels[:, label].nonzero()[0] instance_ids = [] append_instance_id = instance_ids.append for i, label_set in enumerate(self.labels): @@ -182,7 +229,7 @@ def _get_all_instances_of_label(self, label): def _get_mean_imbalance_ratio(self): ratio_sum = np.sum(self.imbalance_ratio_per_label) - return ratio_sum/self.unique_labels.shape[0] + return ratio_sum/len(self.unique_labels) def _get_imbalance_ratio_per_label(self, label): sum_array = list(map(self._sum_h, self.unique_labels)) @@ -190,6 +237,9 @@ def _get_imbalance_ratio_per_label(self, label): return sum_array.max()/self._sum_h(label) def _sum_h(self, label): + if sparse.issparse(self.labels): + return self.labels[:, label].count_nonzero() + h_sum = 0 def h(l, Y): @@ -200,6 +250,7 @@ def h(l, Y): for label_set in self.labels: h_sum += h(label, label_set) + 
return h_sum def _get_label_frequencies(self, labels): diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py index 2fc53cb0e..49f1c0317 100644 --- a/imblearn/over_sampling/tests/test_mlsmote.py +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -6,6 +6,8 @@ import pytest import numpy as np +from scipy import sparse +from sklearn.preprocessing import MultiLabelBinarizer from imblearn.over_sampling import MLSMOTE @@ -53,6 +55,22 @@ def data_heterogneous_masked(): return X, y, [True, False, True] +def data_sparse(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=np.float64) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.randint(3, size=30) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + labelBinarizer = MultiLabelBinarizer() + y = labelBinarizer.fit_transform(y) + y = sparse.csr_matrix(y) + return X, y, [0, 3] + + def test_mlsmote_error(): X, y, _ = data_heterogneous_unordered() categorical_features = [0, 10] @@ -67,6 +85,7 @@ def test_mlsmote_error(): data_heterogneous_ordered(), data_heterogneous_unordered(), data_heterogneous_masked(), + data_sparse() ], ) def test_mlsmote(data): @@ -100,7 +119,8 @@ def test_mlsmote_fit_resample(): smote = MLSMOTE(categorical_features=categorical_features) _, y_res = smote.fit_resample(X, y) classes_res = np.unique( - np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])])) + np.array([a for x in y_res + for a in (x if isinstance(x, list) else [x])])) _ = Counter(classes_res) n_samples = max(target_stats.values()) assert all(value >= n_samples for value in Counter(classes_res).values()) From 4bb3474e3ac261bd592ffe0e2d79ee534eaddb5b Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Tue, 16 Jun 2020 19:08:25 +0200 Subject: [PATCH 07/10] calculate imbalance ratio on every run instead once --- imblearn/over_sampling/_mlsmote.py | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 2311e0f13..7a435445e 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -59,8 +59,6 @@ def fit_resample(self, X, y): else: self.labels = np.array([np.array(xi) for xi in y]) self.unique_labels = self._collect_unique_labels(y) - self.imbalance_ratio_per_label = np.array( - list(map(self._get_imbalance_ratio_per_label, self.unique_labels))) self.features = X X_synth = [] @@ -74,7 +72,7 @@ def fit_resample(self, X, y): y_synth = None for label in self.unique_labels: - irlbl = self.imbalance_ratio_per_label[label] + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) for sample in min_bag: @@ -89,7 +87,7 @@ def fit_resample(self, X, y): return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: for index, label in np.ndenumerate(self.unique_labels): - irlbl = self.imbalance_ratio_per_label[index] + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) for sample in min_bag: @@ -228,17 +226,25 @@ def _get_all_instances_of_label(self, label): return np.array(instance_ids) def _get_mean_imbalance_ratio(self): - ratio_sum = np.sum(self.imbalance_ratio_per_label) 
+ ratio_sum = np.sum(np.array( + list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))) return ratio_sum/len(self.unique_labels) - def _get_imbalance_ratio_per_label(self, label): - sum_array = list(map(self._sum_h, self.unique_labels)) - sum_array = np.array(sum_array) - return sum_array.max()/self._sum_h(label) + def _get_imbalance_ratio_per_label(self, label, labels=None): + sum_h = self._sum_h + if labels is None: + sum_array = np.array([sum_h(l, self.labels) + for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, self.labels) + else: + sum_array = np.array([sum_h(l, labels)for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, labels) - def _sum_h(self, label): - if sparse.issparse(self.labels): - return self.labels[:, label].count_nonzero() + return ratio + + def _sum_h(self, label, labels): + if sparse.issparse(labels): + return labels[:, label].count_nonzero() h_sum = 0 @@ -248,7 +254,7 @@ def h(l, Y): else: return 0 - for label_set in self.labels: + for label_set in labels: h_sum += h(label, label_set) return h_sum From 32a7b5552d298a6508fde7c1b74cd6227d824806 Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Tue, 16 Jun 2020 19:11:31 +0200 Subject: [PATCH 08/10] use list comprehension for better performance --- imblearn/over_sampling/_mlsmote.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 7a435445e..c90827125 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -182,16 +182,14 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): return X, y def _calc_distances(self, sample, min_bag): - distances = [] - append_distances = distances.append - for bag_sample in min_bag: - nominal_distances = np.array([self._get_vdm( - self.features[sample, cat], self.features[bag_sample, cat])for cat in self.categorical_features_]) - ordinal_distances = np.array([self._get_euclidean_distance( + def calc_dist(bag_sample): + nominal_distance = sum([self._get_vdm( + self.features[sample, cat], self.features[bag_sample, cat], cat)for cat in self.categorical_features_]) + ordinal_distance = sum([self._get_euclidean_distance( self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_]) - dists = np.array( - [nominal_distances.sum(), ordinal_distances.sum()]) - append_distances((dists.sum(), bag_sample)) + dist = sum([nominal_distance, ordinal_distance]) + return (dist, bag_sample) + distances = [calc_dist(bag_sample) for bag_sample in min_bag] dtype = np.dtype([('distance', float), ('index', int)]) return np.array(distances, dtype=dtype) From f1343496a2e922afda5c3372f4bc9efc5b25197e Mon Sep 17 00:00:00 2001 From: Simon Ermler Date: Tue, 16 Jun 2020 19:15:19 +0200 Subject: [PATCH 09/10] fix vdm and add experimental sparse implementation --- imblearn/over_sampling/_mlsmote.py | 43 +++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index c90827125..b3a055cba 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -197,21 +197,38 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first-second) return euclidean_distance - def _get_vdm(self, first, second): + def _get_vdm(self, first, second, category): """A support function to compute the Value Difference Metric(VDM) 
described in https://arxiv.org/pdf/cs/9701101.pdf"""
+        if sparse.issparse(self.features):
+            def f_sparse(c):
+                N_ax = len(sparse.find(self.features[:, category] == first)[0])
+                N_ay = len(sparse.find(
+                    self.features[:, category] == second)[0])
+                c_instances = self._get_all_instances_of_label(c)
+                N_axc = len(sparse.find(
+                    self.features[c_instances, category] == first)[0])
+                N_ayc = len(sparse.find(
+                    self.features[c_instances, category] == second)[0])
+                p = np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
+                return p
+
+            vdm = np.sum(np.array([f_sparse(c)for c in self.unique_labels]))
+            return vdm
+
+        category_rows = self.features[:, category]
+        N_ax = len(np.where(category_rows == first)[0])  # [0]: count of matches, not tuple length
+        N_ay = len(np.where(category_rows == second)[0])
+
         def f(c):
-            N_ax = len(
-                np.where(self.features[:, self.categorical_features_] == first))
-            N_ay = len(
-                np.where(self.features[:, self.categorical_features_] == second))
-            c_instances = self._get_all_instances_of_label(c)
-            N_axc = len(np.where(self.features[np.ix_(
-                c_instances, self.categorical_features_)] == first)[0])
-            N_ayc = len(np.where(self.features[np.ix_(
-                c_instances, self.categorical_features_)] == second)[0])
-            return np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
-
-        return np.sum(np.array([f(c)for c in self.unique_labels]))
+            class_instances = self._get_all_instances_of_label(c)
+            class_instance_rows = category_rows[class_instances]
+            N_axc = len(np.where(class_instance_rows == first)[0])
+            N_ayc = len(np.where(class_instance_rows == second)[0])
+            p = abs((N_axc/N_ax)-(N_ayc/N_ay))
+            return p
+
+        vdm = np.array([f(c)for c in self.unique_labels]).sum()
+        return vdm

     def _get_all_instances_of_label(self, label):
         if sparse.issparse(self.labels):
From 3361578e469e8817bb3af356e78408bf5b3a54f2 Mon Sep 17 00:00:00 2001
From: Simon Ermler
Date: Tue, 16 Jun 2020 19:15:39 +0200
Subject: [PATCH 10/10] remove prints

---
 imblearn/over_sampling/_mlsmote.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
index b3a055cba..24ba43098 100644
--- a/imblearn/over_sampling/_mlsmote.py
+++ b/imblearn/over_sampling/_mlsmote.py
@@ -143,17 +143,12 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids):

         if sparse.issparse(self.labels):
             neighbours_labels = self.labels[neighbour_ids]
-            print("ns", neighbours_labels)
             possible_labels = neighbours_labels.sum(axis=0)
-            print("possible", possible_labels)
             y = np.zeros((1, len(self.unique_labels)))
             if self.sampling_strategy_ == 'ranking':
                 head_index = int((self.k_neighbors + 1)/2)
-                print("choosen_nonz", possible_labels.nonzero())
                 choosen_labels = possible_labels.nonzero()[1][:head_index]
-                print("choosen", choosen_labels)
                 y[0, choosen_labels] = 1
-                print("y", y)
             if self.sampling_strategy_ == 'union':
                 choosen_labels = possible_labels.nonzero()[0]
                 y[choosen_labels] = 1
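
A note on the minority-label selection used throughout the patches above: ``fit_resample`` only oversamples labels whose per-label imbalance ratio (IRLbl) exceeds the mean ratio over all labels (MeanIR), as in Charte et al. The standalone sketch below (plain Python; not part of the patches, and reusing the labelsets from the test fixtures) reproduces that bookkeeping outside the class:

    from collections import Counter

    # labelsets from the test fixtures above
    y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20

    # _sum_h: number of samples carrying each label
    # (here: 0 -> 5, 1 -> 25, 2 -> 10, 3 -> 7, 4 -> 2)
    counts = Counter(label for labelset in y for label in labelset)

    # _get_imbalance_ratio_per_label: IRLbl(l) = max count / count(l)
    irlbl = {label: max(counts.values()) / n for label, n in counts.items()}

    # _get_mean_imbalance_ratio: MeanIR = mean of all IRLbl values (~4.91)
    mean_ir = sum(irlbl.values()) / len(irlbl)

    # only labels with IRLbl > MeanIR form a minority bag and get new samples
    print(sorted(label for label, r in irlbl.items() if r > mean_ir))  # [0, 4]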
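The sparse-label path added in [PATCH 06/10] can be exercised the same way; a second sketch (again illustrative, mirroring the ``data_sparse`` test fixture rather than a settled public API):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import MultiLabelBinarizer
    from imblearn.over_sampling import MLSMOTE

    rng = np.random.RandomState(42)
    X = np.empty((30, 4), dtype=np.float64)
    X[:, [1, 2]] = rng.randn(30, 2)      # two continuous features
    X[:, 0] = rng.randint(3, size=30)    # categorical, encoded as integers
    X[:, 3] = rng.randint(3, size=30)
    y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20
    # binary indicator matrix, stored sparse
    y_bin = sparse.csr_matrix(MultiLabelBinarizer().fit_transform(y))

    smote = MLSMOTE(categorical_features=[0, 3])
    X_res, y_res = smote.fit_resample(X, y_bin)  # y_res stays a sparse matrix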