From 6df29fe8e30cf3cb10ff231a6eda3b4545237532 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 10:03:59 -0700 Subject: [PATCH 01/33] Add support for MLSMOTE Co-authored-by: Bruno Alvisio Co-authored-by: Simon Ermler --- imblearn/over_sampling/__init__.py | 2 + imblearn/over_sampling/_mlsmote.py | 281 +++++++++++++++++++ imblearn/over_sampling/tests/test_mlsmote.py | 126 +++++++++ 3 files changed, 409 insertions(+) create mode 100644 imblearn/over_sampling/_mlsmote.py create mode 100644 imblearn/over_sampling/tests/test_mlsmote.py diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py index a959cbb43..54ab07e18 100644 --- a/imblearn/over_sampling/__init__.py +++ b/imblearn/over_sampling/__init__.py @@ -11,6 +11,7 @@ from ._smote import SVMSMOTE from ._smote import SMOTENC from ._smote import SMOTEN +from ._mlsmote import MLSMOTE __all__ = [ "ADASYN", @@ -21,4 +22,5 @@ "SVMSMOTE", "SMOTENC", "SMOTEN", + "MLSMOTE", ] diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py new file mode 100644 index 000000000..24ba43098 --- /dev/null +++ b/imblearn/over_sampling/_mlsmote.py @@ -0,0 +1,281 @@ +import numpy as np +import itertools +import collections +import random +from scipy import sparse +class MLSMOTE: + """Over-sampling using MLSMOTE. + + Parameters + ---------- + sampling_strategy: 'ranking','union' or 'intersection' default: 'ranking' + Strategy to generate labelsets + + + k_neighbors : int or object, default=5 + If ``int``, number of nearest neighbours to used to construct synthetic + samples. + + categorical_features : ndarray of shape (n_cat_features,) or (n_features,) + Specified which features are categorical. Can either be: + + - array of indices specifying the categorical features; + - mask array of shape (n_features, ) and ``bool`` dtype for which + ``True`` indicates the categorical features. + + Notes + ----- + See the original papers: [1]_ for more details. + + + References + ---------- + .. [1] Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, Francisco. (2015). + MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. + Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. 
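A minimal usage sketch (illustrative only, not part of the patch): it exercises the constructor and ``fit_resample`` the same way the tests added below do, with two continuous and two categorical columns and ``y`` given as a list of label lists.

    import numpy as np
    from imblearn.over_sampling import MLSMOTE

    rng = np.random.RandomState(42)
    X = np.empty((30, 4), dtype=object)
    X[:, :2] = rng.randn(30, 2)                                    # continuous features
    X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)  # categorical feature
    X[:, 3] = rng.randint(3, size=30)                              # categorical feature
    y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20

    smote = MLSMOTE(categorical_features=[2, 3])
    X_resampled, y_resampled = smote.fit_resample(X, y)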
+ + """ + + def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'): + self.k_neighbors = k_neighbors + self.sampling_strategy_ = sampling_strategy + self.categorical_features = categorical_features + self.continuous_features_ = None + self.unique_labels = [] + self.labels = [] + self.features = [] + + def fit_resample(self, X, y): + self.n_features_ = X.shape[1] + + self._validate_estimator() + + X_resampled = X.copy() + y_resampled = y.copy() + + if sparse.issparse(y): + self.labels = y + self.unique_labels = range(0, y.shape[1]) + else: + self.labels = np.array([np.array(xi) for xi in y]) + self.unique_labels = self._collect_unique_labels(y) + self.features = X + + X_synth = [] + y_synth = [] + + append_X_synth = X_synth.append + append_y_synth = y_synth.append + mean_ir = self._get_mean_imbalance_ratio() + + if sparse.issparse(y): + y_synth = None + + for label in self.unique_labels: + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + if irlbl > mean_ir: + min_bag = self._get_all_instances_of_label(label) + for sample in min_bag: + distances = self._calc_distances(sample, min_bag) + distances = np.sort(distances, order='distance') + neighbours = distances[:self.k_neighbors] + ref_neigh = np.random.choice(neighbours, 1)[0] + X_new, y_new = self._create_new_sample( + sample, ref_neigh[1], [x[1] for x in neighbours]) + append_X_synth(X_new) + y_resambled = sparse.vstack((y_resampled, y_new)) + return np.concatenate((X_resampled, np.array(X_synth))), y_resampled + else: + for index, label in np.ndenumerate(self.unique_labels): + irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + if irlbl > mean_ir: + min_bag = self._get_all_instances_of_label(label) + for sample in min_bag: + distances = self._calc_distances(sample, min_bag) + distances = np.sort(distances, order='distance') + neighbours = distances[:self.k_neighbors] + ref_neigh = np.random.choice(neighbours, 1)[0] + X_new, y_new = self._create_new_sample( + sample, ref_neigh[1], [x[1] for x in neighbours]) + append_X_synth(X_new) + append_y_synth(y_new) + return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth) + + def _validate_estimator(self): + categorical_features = np.asarray(self.categorical_features) + if categorical_features.dtype.name == "bool": + self.categorical_features_ = np.flatnonzero(categorical_features) + else: + if any( + [ + cat not in np.arange(self.n_features_) + for cat in categorical_features + ] + ): + raise ValueError( + "Some of the categorical indices are out of range. 
Indices" + " should be between 0 and {}".format(self.n_features_) + ) + self.categorical_features_ = categorical_features + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) + + def _collect_unique_labels(self, y): + """A support function that flattens the labelsets and return one set of unique labels""" + return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) + + def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): + sample = self.features[sample_id] + synth_sample = np.copy(sample) + ref_neigh = self.features[ref_neigh_id] + sample_labels = self.labels[sample_id] + + for i in range(synth_sample.shape[0]): + if i in self.continuous_features_: + diff = ref_neigh[i]-sample[i] + offset = diff*random.uniform(0, 1) + synth_sample[i] = sample[i]+offset + if i in self.categorical_features_: + synth_sample[i] = self._get_most_frequent_value( + self.features[neighbour_ids, i]) + X = synth_sample + + if sparse.issparse(self.labels): + neighbours_labels = self.labels[neighbour_ids] + possible_labels = neighbours_labels.sum(axis=0) + y = np.zeros((1, len(self.unique_labels))) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + choosen_labels = possible_labels.nonzero()[1][:head_index] + y[0, choosen_labels] = 1 + if self.sampling_strategy_ == 'union': + choosen_labels = possible_labels.nonzero()[0] + y[choosen_labels] = 1 + if self.sampling_strategy_ == 'intersection': + choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) + y[choosen_labels] = 1 + y = sparse.csr_matrix(y) + + else: + neighbours_labels = [] + for ni in neighbour_ids: + neighbours_labels.append(self.labels[ni].tolist()) + + labels = [] # sample_labels.tolist() + labels += [a for x in neighbours_labels for a in ( + x if isinstance(x, list) else [x])] + labels = list(set(labels)) + if self.sampling_strategy_ == 'ranking': + head_index = int((self.k_neighbors + 1)/2) + y = labels[:head_index] + if self.sampling_strategy_ == 'union': + y = labels[:] + if self.sampling_strategy_ == 'intersection': + y = list(set.intersection(*neighbours_labels)) + + return X, y + + def _calc_distances(self, sample, min_bag): + def calc_dist(bag_sample): + nominal_distance = sum([self._get_vdm( + self.features[sample, cat], self.features[bag_sample, cat], cat)for cat in self.categorical_features_]) + ordinal_distance = sum([self._get_euclidean_distance( + self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_]) + dist = sum([nominal_distance, ordinal_distance]) + return (dist, bag_sample) + distances = [calc_dist(bag_sample) for bag_sample in min_bag] + dtype = np.dtype([('distance', float), ('index', int)]) + return np.array(distances, dtype=dtype) + + def _get_euclidean_distance(self, first, second): + euclidean_distance = np.linalg.norm(first-second) + return euclidean_distance + + def _get_vdm(self, first, second, category): + """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" + if sparse.issparse(self.features): + def f_sparse(c): + N_ax = len(sparse.find(self.features[:, category] == first)[0]) + N_ay = len(sparse.find( + self.features[:, category] == second)[0]) + c_instances = self._get_all_instances_of_label(c) + N_axc = len(sparse.find( + self.features[c_instances, category] == first)[0]) + N_ayc = len(sparse.find( + self.features[c_instances, category] == second)[0]) + p = 
np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay))) + return p + + vdm = np.sum(np.array([f_sparse(c)for c in self.unique_labels])) + return vdm + + category_rows = self.features[:, category] + N_ax = len(np.where(category_rows == first)) + N_ay = len(np.where(category_rows == second)) + + def f(c): + class_instances = self._get_all_instances_of_label(c) + class_instance_rows = category_rows[class_instances] + N_axc = len(np.where(class_instance_rows == first)[0]) + N_ayc = len(np.where(class_instance_rows == second)[0]) + p = abs((N_axc/N_ax)-(N_ayc/N_ay)) + return p + + vdm = np.array([f(c)for c in self.unique_labels]).sum() + return vdm + + def _get_all_instances_of_label(self, label): + if sparse.issparse(self.labels): + return self.labels[:, label].nonzero()[0] + instance_ids = [] + append_instance_id = instance_ids.append + for i, label_set in enumerate(self.labels): + if label in label_set: + append_instance_id(i) + return np.array(instance_ids) + + def _get_mean_imbalance_ratio(self): + ratio_sum = np.sum(np.array( + list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))) + return ratio_sum/len(self.unique_labels) + + def _get_imbalance_ratio_per_label(self, label, labels=None): + sum_h = self._sum_h + if labels is None: + sum_array = np.array([sum_h(l, self.labels) + for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, self.labels) + else: + sum_array = np.array([sum_h(l, labels)for l in self.unique_labels]) + ratio = sum_array.max()/sum_h(label, labels) + + return ratio + + def _sum_h(self, label, labels): + if sparse.issparse(labels): + return labels[:, label].count_nonzero() + + h_sum = 0 + + def h(l, Y): + if l in Y: + return 1 + else: + return 0 + + for label_set in labels: + h_sum += h(label, label_set) + + return h_sum + + def _get_label_frequencies(self, labels): + """"A support function to get the frequencies of labels""" + frequency_map = np.array(np.unique(labels, return_counts=True)).T + frequencies = np.array([x[1] for x in frequency_map]) + return frequencies + + def _get_most_frequent_value(self, values): + """"A support function to get most frequent value if a list of values""" + uniques, indices = np.unique(values, return_inverse=True) + return uniques[np.argmax(np.bincount(indices))] diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py new file mode 100644 index 000000000..49f1c0317 --- /dev/null +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -0,0 +1,126 @@ +"""Test the module MLSMOTE.""" + + +from collections import Counter + +import pytest + +import numpy as np +from scipy import sparse +from sklearn.preprocessing import MultiLabelBinarizer + + +from imblearn.over_sampling import MLSMOTE + + +def data_heterogneous_ordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, :2] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + # return the categories + return X, y, [2, 3] + + +def data_heterogneous_unordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # 
create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + # return the categories + return X, y, [0, 3] + + +def data_heterogneous_masked(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + # return the categories + return X, y, [True, False, True] + + +def data_sparse(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=np.float64) + # create 2 random continuous feature + X[:, [1, 2]] = rng.randn(30, 2) + # create a categorical feature using some string + X[:, 0] = rng.randint(3, size=30) + # create a categorical feature using some integer + X[:, 3] = rng.randint(3, size=30) + y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + labelBinarizer = MultiLabelBinarizer() + y = labelBinarizer.fit_transform(y) + y = sparse.csr_matrix(y) + return X, y, [0, 3] + + +def test_mlsmote_error(): + X, y, _ = data_heterogneous_unordered() + categorical_features = [0, 10] + smote = MLSMOTE(categorical_features=categorical_features) + with pytest.raises(ValueError, match="indices are out of range"): + smote.fit_resample(X, y) + + +@pytest.mark.parametrize( + "data", + [ + data_heterogneous_ordered(), + data_heterogneous_unordered(), + data_heterogneous_masked(), + data_sparse() + ], +) +def test_mlsmote(data): + X, y, categorical_features = data + smote = MLSMOTE(categorical_features=categorical_features) + X_resampled, y_resampled = smote.fit_resample(X, y) + + assert X_resampled.dtype == X.dtype + + categorical_features = np.array(categorical_features) + if categorical_features.dtype == bool: + categorical_features = np.flatnonzero(categorical_features) + for cat_idx in categorical_features: + assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx]) + assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype + + +def test_mlsmote_fit(): + X, y, categorical_features = data_heterogneous_unordered() + smote = MLSMOTE(categorical_features=categorical_features) + smote.fit_resample(X, y) + assert hasattr( + smote, "sampling_strategy_" + ), "No fitted attribute sampling_strategy_" + + +def test_mlsmote_fit_resample(): + X, y, categorical_features = data_heterogneous_unordered() + target_stats = Counter(np.unique( + np.array([a for x in y for a in (x if isinstance(x, list) else [x])]))) + smote = MLSMOTE(categorical_features=categorical_features) + _, y_res = smote.fit_resample(X, y) + classes_res = np.unique( + np.array([a for x in y_res + for a in (x if isinstance(x, list) else [x])])) + _ = Counter(classes_res) + n_samples = max(target_stats.values()) + assert all(value >= n_samples for value in Counter(classes_res).values()) From c049468b2af8c438e51302d3f815bc17b02ea15f Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 14:10:44 -0700 Subject: [PATCH 02/33] Added random_state, documentation and formatting --- imblearn/over_sampling/_mlsmote.py | 233 ++++++++++++++++++++--------- 1 file changed, 159 insertions(+), 74 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 24ba43098..13ce060e1 100644 --- 
a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,14 +1,17 @@ +"""Class to perfrom over-sampling using MLSMOTE.""" + import numpy as np -import itertools -import collections -import random from scipy import sparse + +from sklearn.utils import check_random_state + + class MLSMOTE: """Over-sampling using MLSMOTE. Parameters ---------- - sampling_strategy: 'ranking','union' or 'intersection' default: 'ranking' + sampling_strategy: 'ranking', 'union' or 'intersection' default: 'ranking' Strategy to generate labelsets @@ -17,7 +20,7 @@ class MLSMOTE: samples. categorical_features : ndarray of shape (n_cat_features,) or (n_features,) - Specified which features are categorical. Can either be: + Specifies which features are categorical. Can either be: - array of indices specifying the categorical features; - mask array of shape (n_features, ) and ``bool`` dtype for which @@ -25,18 +28,29 @@ class MLSMOTE: Notes ----- - See the original papers: [1]_ for more details. - + The implementation is based on [1]_. References ---------- - .. [1] Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, Francisco. (2015). - MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. - Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. - + .. [1] Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, + Francisco. (2015). "MLSMOTE: Approaching imbalanced multilabel learning + through synthetic instance generation." + Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. + + Examples + -------- + >>> from sklearn.datasets import make_multilabel_classification """ - def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'): + def __init__( + self, + *, + sampling_strategy="ranking", + categorical_features, + random_state=None, + k_neighbors=5, + ): + self.random_state = random_state self.k_neighbors = k_neighbors self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features @@ -46,19 +60,52 @@ def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranki self.features = [] def fit_resample(self, X, y): + """Resample the dataset. + + Parameters + ---------- + X : {array-like, dataframe, sparse matrix} of shape \ + (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : {array-like, sparse matrix of shape \ + (n_samples, n_labels) + or a list of lists of labels. + See "sklearn.datasets.make_multilabel_classification" and \ + the "return_indicate" input parameter for more \ + information on possible label sets formats. + + Corresponding label sets for each sample in X. Sparse matrix \ + should be of CSR format. + + Returns + ------- + X_resampled : {array-like, dataframe, sparse matrix} of shape \ + (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : array-like of shape (n_samples_new, n_labels) + The corresponding label sets of `X_resampled`. 
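As an illustrative aside (not part of the patch): the label-set formats this docstring describes can be produced from one another, and the tests build the sparse variant the same way.

    from scipy import sparse
    from sklearn.preprocessing import MultiLabelBinarizer

    y_lists = [[0, 2], [1], [0, 1, 2]]                      # list of label lists
    y_dense = MultiLabelBinarizer().fit_transform(y_lists)  # (n_samples, n_labels) indicator array
    y_sparse = sparse.csr_matrix(y_dense)                   # CSR indicator matrix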
+ """ self.n_features_ = X.shape[1] self._validate_estimator() + random_state = check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() - if sparse.issparse(y): + if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: self.labels = y self.unique_labels = range(0, y.shape[1]) - else: + elif type(y) == list: self.labels = np.array([np.array(xi) for xi in y]) self.unique_labels = self._collect_unique_labels(y) + else: + raise TypeError( + "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" + " or 'list'" + ) self.features = X X_synth = [] @@ -67,7 +114,7 @@ def fit_resample(self, X, y): append_X_synth = X_synth.append append_y_synth = y_synth.append mean_ir = self._get_mean_imbalance_ratio() - + if sparse.issparse(y): y_synth = None @@ -77,11 +124,15 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label) for sample in min_bag: distances = self._calc_distances(sample, min_bag) - distances = np.sort(distances, order='distance') - neighbours = distances[:self.k_neighbors] - ref_neigh = np.random.choice(neighbours, 1)[0] + distances = np.sort(distances, order="distance") + neighbours = distances[: self.k_neighbors] + ref_neigh = random_state.choice(neighbours, 1)[0] X_new, y_new = self._create_new_sample( - sample, ref_neigh[1], [x[1] for x in neighbours]) + sample, + ref_neigh[1], + [x[1] for x in neighbours], + random_state, + ) append_X_synth(X_new) y_resambled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled @@ -92,29 +143,32 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label) for sample in min_bag: distances = self._calc_distances(sample, min_bag) - distances = np.sort(distances, order='distance') - neighbours = distances[:self.k_neighbors] - ref_neigh = np.random.choice(neighbours, 1)[0] + distances = np.sort(distances, order="distance") + neighbours = distances[: self.k_neighbors] + ref_neigh = random_state.choice(neighbours, 1)[0] X_new, y_new = self._create_new_sample( - sample, ref_neigh[1], [x[1] for x in neighbours]) + sample, + ref_neigh[1], + [x[1] for x in neighbours], + random_state, + ) append_X_synth(X_new) append_y_synth(y_new) - return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth) + return np.concatenate((X_resampled, np.array(X_synth))), np.array( + y_resampled.tolist() + y_synth + ) def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.name == "bool": + if categorical_features.dtype.name == bool: self.categorical_features_ = np.flatnonzero(categorical_features) else: if any( - [ - cat not in np.arange(self.n_features_) - for cat in categorical_features - ] + [cat not in np.arange(self.n_features_) for cat in categorical_features] ): raise ValueError( "Some of the categorical indices are out of range. 
Indices" - " should be between 0 and {}".format(self.n_features_) + f" should be between 0 and {self.n_features_ - 1}" ) self.categorical_features_ = categorical_features self.continuous_features_ = np.setdiff1d( @@ -122,10 +176,22 @@ def _validate_estimator(self): ) def _collect_unique_labels(self, y): - """A support function that flattens the labelsets and return one set of unique labels""" - return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])) + """A support function that flattens the labelsets and return one set of unique + labels + """ + return np.unique( + np.array( + [ + label + for label_set in y + for label in ( + label_set if isinstance(label_set, list) else [label_set] + ) + ] + ) + ) - def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): + def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_state): sample = self.features[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] @@ -133,26 +199,27 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): for i in range(synth_sample.shape[0]): if i in self.continuous_features_: - diff = ref_neigh[i]-sample[i] - offset = diff*random.uniform(0, 1) - synth_sample[i] = sample[i]+offset + diff = ref_neigh[i] - sample[i] + offset = diff * random_state.uniform(0, 1) + synth_sample[i] = sample[i] + offset if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - self.features[neighbour_ids, i]) + self.features[neighbour_ids, i] + ) X = synth_sample if sparse.issparse(self.labels): neighbours_labels = self.labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) y = np.zeros((1, len(self.unique_labels))) - if self.sampling_strategy_ == 'ranking': - head_index = int((self.k_neighbors + 1)/2) + if self.sampling_strategy_ == "ranking": + head_index = int((self.k_neighbors + 1) / 2) choosen_labels = possible_labels.nonzero()[1][:head_index] y[0, choosen_labels] = 1 - if self.sampling_strategy_ == 'union': + if self.sampling_strategy_ == "union": choosen_labels = possible_labels.nonzero()[0] y[choosen_labels] = 1 - if self.sampling_strategy_ == 'intersection': + if self.sampling_strategy_ == "intersection": choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) y[choosen_labels] = 1 y = sparse.csr_matrix(y) @@ -160,54 +227,72 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids): else: neighbours_labels = [] for ni in neighbour_ids: - neighbours_labels.append(self.labels[ni].tolist()) + neighbours_labels.append(self.labels[ni].tolist()) labels = [] # sample_labels.tolist() - labels += [a for x in neighbours_labels for a in ( - x if isinstance(x, list) else [x])] + labels += [ + a + for x in neighbours_labels + for a in (x if isinstance(x, list) else [x]) + ] labels = list(set(labels)) - if self.sampling_strategy_ == 'ranking': - head_index = int((self.k_neighbors + 1)/2) + if self.sampling_strategy_ == "ranking": + head_index = int((self.k_neighbors + 1) / 2) y = labels[:head_index] - if self.sampling_strategy_ == 'union': + if self.sampling_strategy_ == "union": y = labels[:] - if self.sampling_strategy_ == 'intersection': + if self.sampling_strategy_ == "intersection": y = list(set.intersection(*neighbours_labels)) return X, y def _calc_distances(self, sample, min_bag): def calc_dist(bag_sample): - nominal_distance = sum([self._get_vdm( - self.features[sample, cat], self.features[bag_sample, cat], cat)for cat in self.categorical_features_]) - 
ordinal_distance = sum([self._get_euclidean_distance( - self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_]) + nominal_distance = sum( + [ + self._get_vdm( + self.features[sample, cat], self.features[bag_sample, cat], cat + ) + for cat in self.categorical_features_ + ] + ) + ordinal_distance = sum( + [ + self._get_euclidean_distance( + self.features[sample, num], self.features[bag_sample, num] + ) + for num in self.continuous_features_ + ] + ) dist = sum([nominal_distance, ordinal_distance]) return (dist, bag_sample) + distances = [calc_dist(bag_sample) for bag_sample in min_bag] - dtype = np.dtype([('distance', float), ('index', int)]) + dtype = np.dtype([("distance", float), ("index", int)]) return np.array(distances, dtype=dtype) def _get_euclidean_distance(self, first, second): - euclidean_distance = np.linalg.norm(first-second) + euclidean_distance = np.linalg.norm(first - second) return euclidean_distance def _get_vdm(self, first, second, category): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if sparse.issparse(self.features): + def f_sparse(c): N_ax = len(sparse.find(self.features[:, category] == first)[0]) - N_ay = len(sparse.find( - self.features[:, category] == second)[0]) + N_ay = len(sparse.find(self.features[:, category] == second)[0]) c_instances = self._get_all_instances_of_label(c) - N_axc = len(sparse.find( - self.features[c_instances, category] == first)[0]) - N_ayc = len(sparse.find( - self.features[c_instances, category] == second)[0]) - p = np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay))) + N_axc = len( + sparse.find(self.features[c_instances, category] == first)[0] + ) + N_ayc = len( + sparse.find(self.features[c_instances, category] == second)[0] + ) + p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p - vdm = np.sum(np.array([f_sparse(c)for c in self.unique_labels])) + vdm = np.sum(np.array([f_sparse(c) for c in self.unique_labels])) return vdm category_rows = self.features[:, category] @@ -219,10 +304,10 @@ def f(c): class_instance_rows = category_rows[class_instances] N_axc = len(np.where(class_instance_rows == first)[0]) N_ayc = len(np.where(class_instance_rows == second)[0]) - p = abs((N_axc/N_ax)-(N_ayc/N_ay)) + p = abs((N_axc / N_ax) - (N_ayc / N_ay)) return p - vdm = np.array([f(c)for c in self.unique_labels]).sum() + vdm = np.array([f(c) for c in self.unique_labels]).sum() return vdm def _get_all_instances_of_label(self, label): @@ -236,19 +321,19 @@ def _get_all_instances_of_label(self, label): return np.array(instance_ids) def _get_mean_imbalance_ratio(self): - ratio_sum = np.sum(np.array( - list(map(self._get_imbalance_ratio_per_label, self.unique_labels)))) - return ratio_sum/len(self.unique_labels) + ratio_sum = np.sum( + np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))) + ) + return ratio_sum / len(self.unique_labels) def _get_imbalance_ratio_per_label(self, label, labels=None): sum_h = self._sum_h if labels is None: - sum_array = np.array([sum_h(l, self.labels) - for l in self.unique_labels]) - ratio = sum_array.max()/sum_h(label, self.labels) + sum_array = np.array([sum_h(l, self.labels) for l in self.unique_labels]) + ratio = sum_array.max() / sum_h(label, self.labels) else: - sum_array = np.array([sum_h(l, labels)for l in self.unique_labels]) - ratio = sum_array.max()/sum_h(label, labels) + sum_array = np.array([sum_h(l, labels) for l in self.unique_labels]) + ratio = sum_array.max() / 
sum_h(label, labels) return ratio @@ -270,12 +355,12 @@ def h(l, Y): return h_sum def _get_label_frequencies(self, labels): - """"A support function to get the frequencies of labels""" + """A support function to get the frequencies of labels""" frequency_map = np.array(np.unique(labels, return_counts=True)).T frequencies = np.array([x[1] for x in frequency_map]) return frequencies def _get_most_frequent_value(self, values): - """"A support function to get most frequent value if a list of values""" + """A support function to get most frequent value if a list of values""" uniques, indices = np.unique(values, return_inverse=True) return uniques[np.argmax(np.bincount(indices))] From 192519b429ba3aa799aaaacda5ef6fd5847a8a50 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 15:27:17 -0700 Subject: [PATCH 03/33] Refactor code to remove self.unique_labels --- imblearn/over_sampling/_mlsmote.py | 75 ++++++++++++++++++------------ 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 13ce060e1..989dd8286 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,5 +1,6 @@ """Class to perfrom over-sampling using MLSMOTE.""" +import itertools import numpy as np from scipy import sparse @@ -55,7 +56,6 @@ def __init__( self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features self.continuous_features_ = None - self.unique_labels = [] self.labels = [] self.features = [] @@ -96,11 +96,11 @@ def fit_resample(self, X, y): y_resampled = y.copy() if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: - self.labels = y - self.unique_labels = range(0, y.shape[1]) + labels = y + unique_labels = range(0, y.shape[1]) elif type(y) == list: - self.labels = np.array([np.array(xi) for xi in y]) - self.unique_labels = self._collect_unique_labels(y) + labels = np.array([np.array(xi) for xi in y], dtype=object) + unique_labels = self._collect_unique_labels(y) else: raise TypeError( "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" @@ -113,17 +113,19 @@ def fit_resample(self, X, y): append_X_synth = X_synth.append append_y_synth = y_synth.append - mean_ir = self._get_mean_imbalance_ratio() + mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) if sparse.issparse(y): y_synth = None - for label in self.unique_labels: - irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + for label in unique_labels: + irlbl = self._get_imbalance_ratio_per_label( + label, unique_labels, y_resampled + ) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) for sample in min_bag: - distances = self._calc_distances(sample, min_bag) + distances = self._calc_distances(sample, min_bag, unique_labels) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -131,18 +133,21 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + unique_labels, random_state, ) append_X_synth(X_new) y_resambled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: - for index, label in np.ndenumerate(self.unique_labels): - irlbl = self._get_imbalance_ratio_per_label(label, y_resampled) + for index, label in np.ndenumerate(unique_labels): + irlbl = self._get_imbalance_ratio_per_label( + label, unique_labels, y_resampled + ) if irlbl > mean_ir: min_bag = 
self._get_all_instances_of_label(label) for sample in min_bag: - distances = self._calc_distances(sample, min_bag) + distances = self._calc_distances(sample, min_bag, unique_labels) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -150,6 +155,7 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + unique_labels, random_state, ) append_X_synth(X_new) @@ -191,7 +197,9 @@ def _collect_unique_labels(self, y): ) ) - def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_state): + def _create_new_sample( + self, sample_id, ref_neigh_id, neighbour_ids, unique_labels, random_state + ): sample = self.features[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] @@ -211,7 +219,7 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_stat if sparse.issparse(self.labels): neighbours_labels = self.labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) - y = np.zeros((1, len(self.unique_labels))) + y = np.zeros((1, len(unique_labels))) if self.sampling_strategy_ == "ranking": head_index = int((self.k_neighbors + 1) / 2) choosen_labels = possible_labels.nonzero()[1][:head_index] @@ -246,12 +254,15 @@ def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids, random_stat return X, y - def _calc_distances(self, sample, min_bag): + def _calc_distances(self, sample, min_bag, unique_labels): def calc_dist(bag_sample): nominal_distance = sum( [ self._get_vdm( - self.features[sample, cat], self.features[bag_sample, cat], cat + self.features[sample, cat], + self.features[bag_sample, cat], + cat, + unique_labels, ) for cat in self.categorical_features_ ] @@ -275,7 +286,7 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, category): + def _get_vdm(self, first, second, category, unique_labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if sparse.issparse(self.features): @@ -292,7 +303,7 @@ def f_sparse(c): p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p - vdm = np.sum(np.array([f_sparse(c) for c in self.unique_labels])) + vdm = np.sum(np.array([f_sparse(c) for c in unique_labels])) return vdm category_rows = self.features[:, category] @@ -307,7 +318,7 @@ def f(c): p = abs((N_axc / N_ax) - (N_ayc / N_ay)) return p - vdm = np.array([f(c) for c in self.unique_labels]).sum() + vdm = np.array([f(c) for c in unique_labels]).sum() return vdm def _get_all_instances_of_label(self, label): @@ -320,21 +331,25 @@ def _get_all_instances_of_label(self, label): append_instance_id(i) return np.array(instance_ids) - def _get_mean_imbalance_ratio(self): + def _get_mean_imbalance_ratio(self, unique_labels, labels): ratio_sum = np.sum( - np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))) + np.array( + list( + map( + self._get_imbalance_ratio_per_label, + unique_labels, + itertools.repeat(unique_labels), + itertools.repeat(labels), + ) + ) + ) ) - return ratio_sum / len(self.unique_labels) + return ratio_sum / len(unique_labels) - def _get_imbalance_ratio_per_label(self, label, labels=None): + def _get_imbalance_ratio_per_label(self, label, unique_labels, labels): sum_h = self._sum_h - if labels is None: - sum_array = np.array([sum_h(l, self.labels) for l in self.unique_labels]) - 
ratio = sum_array.max() / sum_h(label, self.labels) - else: - sum_array = np.array([sum_h(l, labels) for l in self.unique_labels]) - ratio = sum_array.max() / sum_h(label, labels) - + sum_array = np.array([sum_h(l, labels) for l in unique_labels]) + ratio = sum_array.max() / sum_h(label, labels) return ratio def _sum_h(self, label, labels): From cefac5367e9b95608dc9d38cc7c0e7018e3e1c99 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Sun, 18 Sep 2022 18:53:43 -0700 Subject: [PATCH 04/33] Refactor code to avoid redundant calculations of the IRLbl numerator --- imblearn/over_sampling/_mlsmote.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 989dd8286..53fedc8ad 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -119,8 +119,11 @@ def fit_resample(self, X, y): y_synth = None for label in unique_labels: + irlbl_num = self._get_imbalance_ratio_numerator( + unique_labels, y_resampled + ) irlbl = self._get_imbalance_ratio_per_label( - label, unique_labels, y_resampled + label, irlbl_num, y_resampled ) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) @@ -141,8 +144,11 @@ def fit_resample(self, X, y): return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: for index, label in np.ndenumerate(unique_labels): + irlbl_num = self._get_imbalance_ratio_numerator( + unique_labels, y_resampled + ) irlbl = self._get_imbalance_ratio_per_label( - label, unique_labels, y_resampled + label, irlbl_num, y_resampled ) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label) @@ -332,13 +338,14 @@ def _get_all_instances_of_label(self, label): return np.array(instance_ids) def _get_mean_imbalance_ratio(self, unique_labels, labels): + irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, labels) ratio_sum = np.sum( np.array( list( map( self._get_imbalance_ratio_per_label, unique_labels, - itertools.repeat(unique_labels), + itertools.repeat(irlbl_num), itertools.repeat(labels), ) ) @@ -346,11 +353,12 @@ def _get_mean_imbalance_ratio(self, unique_labels, labels): ) return ratio_sum / len(unique_labels) - def _get_imbalance_ratio_per_label(self, label, unique_labels, labels): - sum_h = self._sum_h - sum_array = np.array([sum_h(l, labels) for l in unique_labels]) - ratio = sum_array.max() / sum_h(label, labels) - return ratio + def _get_imbalance_ratio_numerator(self, unique_labels, labels): + sum_array = np.array([self._sum_h(label, labels) for label in unique_labels]) + return sum_array.max() + + def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): + return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): if sparse.issparse(labels): From 600d51ad8c1a8128272b44df23a2743634ea6fb2 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 09:14:40 -0700 Subject: [PATCH 05/33] Refactor code to remove self.labels --- imblearn/over_sampling/_mlsmote.py | 58 ++++++++++++++++++------------ 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 53fedc8ad..ba4ae7d9f 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -56,7 +56,6 @@ def __init__( self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features self.continuous_features_ = None - self.labels = [] self.features = [] def 
fit_resample(self, X, y): @@ -126,9 +125,11 @@ def fit_resample(self, X, y): label, irlbl_num, y_resampled ) if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label) + min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: - distances = self._calc_distances(sample, min_bag, unique_labels) + distances = self._calc_distances( + sample, min_bag, unique_labels, labels + ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -137,6 +138,7 @@ def fit_resample(self, X, y): ref_neigh[1], [x[1] for x in neighbours], unique_labels, + labels, random_state, ) append_X_synth(X_new) @@ -151,9 +153,11 @@ def fit_resample(self, X, y): label, irlbl_num, y_resampled ) if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label) + min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: - distances = self._calc_distances(sample, min_bag, unique_labels) + distances = self._calc_distances( + sample, min_bag, unique_labels, labels + ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] ref_neigh = random_state.choice(neighbours, 1)[0] @@ -162,6 +166,7 @@ def fit_resample(self, X, y): ref_neigh[1], [x[1] for x in neighbours], unique_labels, + labels, random_state, ) append_X_synth(X_new) @@ -204,12 +209,18 @@ def _collect_unique_labels(self, y): ) def _create_new_sample( - self, sample_id, ref_neigh_id, neighbour_ids, unique_labels, random_state + self, + sample_id, + ref_neigh_id, + neighbour_ids, + unique_labels, + labels, + random_state, ): sample = self.features[sample_id] synth_sample = np.copy(sample) ref_neigh = self.features[ref_neigh_id] - sample_labels = self.labels[sample_id] + sample_labels = labels[sample_id] for i in range(synth_sample.shape[0]): if i in self.continuous_features_: @@ -222,8 +233,8 @@ def _create_new_sample( ) X = synth_sample - if sparse.issparse(self.labels): - neighbours_labels = self.labels[neighbour_ids] + if sparse.issparse(labels): + neighbours_labels = labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) y = np.zeros((1, len(unique_labels))) if self.sampling_strategy_ == "ranking": @@ -241,26 +252,26 @@ def _create_new_sample( else: neighbours_labels = [] for ni in neighbour_ids: - neighbours_labels.append(self.labels[ni].tolist()) + neighbours_labels.append(labels[ni].tolist()) - labels = [] # sample_labels.tolist() - labels += [ + new_labels = [] # sample_labels.tolist() + new_labels += [ a for x in neighbours_labels for a in (x if isinstance(x, list) else [x]) ] - labels = list(set(labels)) + new_labels = list(set(new_labels)) if self.sampling_strategy_ == "ranking": head_index = int((self.k_neighbors + 1) / 2) - y = labels[:head_index] + y = new_labels[:head_index] if self.sampling_strategy_ == "union": - y = labels[:] + y = new_labels[:] if self.sampling_strategy_ == "intersection": y = list(set.intersection(*neighbours_labels)) return X, y - def _calc_distances(self, sample, min_bag, unique_labels): + def _calc_distances(self, sample, min_bag, unique_labels, labels): def calc_dist(bag_sample): nominal_distance = sum( [ @@ -269,6 +280,7 @@ def calc_dist(bag_sample): self.features[bag_sample, cat], cat, unique_labels, + labels, ) for cat in self.categorical_features_ ] @@ -292,14 +304,14 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, 
second, category, unique_labels): + def _get_vdm(self, first, second, category, unique_labels, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if sparse.issparse(self.features): def f_sparse(c): N_ax = len(sparse.find(self.features[:, category] == first)[0]) N_ay = len(sparse.find(self.features[:, category] == second)[0]) - c_instances = self._get_all_instances_of_label(c) + c_instances = self._get_all_instances_of_label(c, labels) N_axc = len( sparse.find(self.features[c_instances, category] == first)[0] ) @@ -317,7 +329,7 @@ def f_sparse(c): N_ay = len(np.where(category_rows == second)) def f(c): - class_instances = self._get_all_instances_of_label(c) + class_instances = self._get_all_instances_of_label(c, labels) class_instance_rows = category_rows[class_instances] N_axc = len(np.where(class_instance_rows == first)[0]) N_ayc = len(np.where(class_instance_rows == second)[0]) @@ -327,12 +339,12 @@ def f(c): vdm = np.array([f(c) for c in unique_labels]).sum() return vdm - def _get_all_instances_of_label(self, label): - if sparse.issparse(self.labels): - return self.labels[:, label].nonzero()[0] + def _get_all_instances_of_label(self, label, labels): + if sparse.issparse(labels): + return labels[:, label].nonzero()[0] instance_ids = [] append_instance_id = instance_ids.append - for i, label_set in enumerate(self.labels): + for i, label_set in enumerate(labels): if label in label_set: append_instance_id(i) return np.array(instance_ids) From a96cefa3d13adf16253d338c47ed40f4204305d6 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 09:57:36 -0700 Subject: [PATCH 06/33] Return over-sampled y as a list of lists --- imblearn/over_sampling/_mlsmote.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index ba4ae7d9f..c3b304bab 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -114,7 +114,7 @@ def fit_resample(self, X, y): append_y_synth = y_synth.append mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) - if sparse.issparse(y): + if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: y_synth = None for label in unique_labels: @@ -171,9 +171,8 @@ def fit_resample(self, X, y): ) append_X_synth(X_new) append_y_synth(y_new) - return np.concatenate((X_resampled, np.array(X_synth))), np.array( - y_resampled.tolist() + y_synth - ) + y_resampled.extend(y_synth) + return np.concatenate((X_resampled, np.array(X_synth))), y_resampled def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) From 735a8925d6aa4c775cfc0082b88b76bda5b14572 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 10:15:03 -0700 Subject: [PATCH 07/33] Fix small bug on variable name --- imblearn/over_sampling/_mlsmote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index c3b304bab..1661a30ab 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -142,7 +142,7 @@ def fit_resample(self, X, y): random_state, ) append_X_synth(X_new) - y_resambled = sparse.vstack((y_resampled, y_new)) + y_resampled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: for index, label in np.ndenumerate(unique_labels): From 783486fc29f820df280134b63499886df66c70b6 Mon Sep 
17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 10:16:23 -0700 Subject: [PATCH 08/33] Remove unnecessary index variable --- imblearn/over_sampling/_mlsmote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1661a30ab..c5165206b 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -145,7 +145,7 @@ def fit_resample(self, X, y): y_resampled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: - for index, label in np.ndenumerate(unique_labels): + for label in unique_labels: irlbl_num = self._get_imbalance_ratio_numerator( unique_labels, y_resampled ) From 6541cdb7c18dd50c605916af598fb174c9803e88 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 11:03:12 -0700 Subject: [PATCH 09/33] Handle case where y is a dense array when calculating mean IR and IRLbl --- imblearn/over_sampling/_mlsmote.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index c5165206b..410c81d82 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -232,7 +232,7 @@ def _create_new_sample( ) X = synth_sample - if sparse.issparse(labels): + if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: neighbours_labels = labels[neighbour_ids] possible_labels = neighbours_labels.sum(axis=0) y = np.zeros((1, len(unique_labels))) @@ -305,7 +305,7 @@ def _get_euclidean_distance(self, first, second): def _get_vdm(self, first, second, category, unique_labels, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" - if sparse.issparse(self.features): + if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: def f_sparse(c): N_ax = len(sparse.find(self.features[:, category] == first)[0]) @@ -339,7 +339,7 @@ def f(c): return vdm def _get_all_instances_of_label(self, label, labels): - if sparse.issparse(labels): + if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: return labels[:, label].nonzero()[0] instance_ids = [] append_instance_id = instance_ids.append @@ -372,21 +372,23 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): - if sparse.issparse(labels): + if type(labels) == sparse._csr.csr_matrix: return labels[:, label].count_nonzero() + elif type(labels) == np.ndarray: + return np.count_nonzero(labels[:, label]) + else: + h_sum = 0 - h_sum = 0 - - def h(l, Y): - if l in Y: - return 1 - else: - return 0 + def h(l, Y): + if l in Y: + return 1 + else: + return 0 - for label_set in labels: - h_sum += h(label, label_set) + for label_set in labels: + h_sum += h(label, label_set) - return h_sum + return h_sum def _get_label_frequencies(self, labels): """A support function to get the frequencies of labels""" From 9f2d7cf706b25f9f7afca22575b341dec0172e8d Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 12:07:11 -0700 Subject: [PATCH 10/33] Refactor code to remove self.features --- imblearn/over_sampling/_mlsmote.py | 40 ++++++++++++++---------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 410c81d82..156e2d0a3 
100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -56,7 +56,6 @@ def __init__( self.sampling_strategy_ = sampling_strategy self.categorical_features = categorical_features self.continuous_features_ = None - self.features = [] def fit_resample(self, X, y): """Resample the dataset. @@ -105,7 +104,6 @@ def fit_resample(self, X, y): "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" " or 'list'" ) - self.features = X X_synth = [] y_synth = [] @@ -128,7 +126,7 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: distances = self._calc_distances( - sample, min_bag, unique_labels, labels + sample, min_bag, X, unique_labels, labels ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] @@ -137,6 +135,7 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + X, unique_labels, labels, random_state, @@ -156,7 +155,7 @@ def fit_resample(self, X, y): min_bag = self._get_all_instances_of_label(label, labels) for sample in min_bag: distances = self._calc_distances( - sample, min_bag, unique_labels, labels + sample, min_bag, X, unique_labels, labels ) distances = np.sort(distances, order="distance") neighbours = distances[: self.k_neighbors] @@ -165,6 +164,7 @@ def fit_resample(self, X, y): sample, ref_neigh[1], [x[1] for x in neighbours], + X, unique_labels, labels, random_state, @@ -212,13 +212,14 @@ def _create_new_sample( sample_id, ref_neigh_id, neighbour_ids, + features, unique_labels, labels, random_state, ): - sample = self.features[sample_id] + sample = features[sample_id] synth_sample = np.copy(sample) - ref_neigh = self.features[ref_neigh_id] + ref_neigh = features[ref_neigh_id] sample_labels = labels[sample_id] for i in range(synth_sample.shape[0]): @@ -228,7 +229,7 @@ def _create_new_sample( synth_sample[i] = sample[i] + offset if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - self.features[neighbour_ids, i] + features[neighbour_ids, i] ) X = synth_sample @@ -270,13 +271,14 @@ def _create_new_sample( return X, y - def _calc_distances(self, sample, min_bag, unique_labels, labels): + def _calc_distances(self, sample, min_bag, features, unique_labels, labels): def calc_dist(bag_sample): nominal_distance = sum( [ self._get_vdm( - self.features[sample, cat], - self.features[bag_sample, cat], + features[sample, cat], + features[bag_sample, cat], + features, cat, unique_labels, labels, @@ -287,7 +289,7 @@ def calc_dist(bag_sample): ordinal_distance = sum( [ self._get_euclidean_distance( - self.features[sample, num], self.features[bag_sample, num] + features[sample, num], features[bag_sample, num] ) for num in self.continuous_features_ ] @@ -303,27 +305,23 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, category, unique_labels, labels): + def _get_vdm(self, first, second, features, category, unique_labels, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: def f_sparse(c): - N_ax = len(sparse.find(self.features[:, category] == first)[0]) - N_ay = len(sparse.find(self.features[:, category] == second)[0]) + N_ax = len(sparse.find(features[:, category] == first)[0]) + N_ay = len(sparse.find(features[:, category] 
== second)[0]) c_instances = self._get_all_instances_of_label(c, labels) - N_axc = len( - sparse.find(self.features[c_instances, category] == first)[0] - ) - N_ayc = len( - sparse.find(self.features[c_instances, category] == second)[0] - ) + N_axc = len(sparse.find(features[c_instances, category] == first)[0]) + N_ayc = len(sparse.find(features[c_instances, category] == second)[0]) p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p vdm = np.sum(np.array([f_sparse(c) for c in unique_labels])) return vdm - category_rows = self.features[:, category] + category_rows = features[:, category] N_ax = len(np.where(category_rows == first)) N_ay = len(np.where(category_rows == second)) From 60fbcd129b9aec19de748c22a6d565b8f7fe3109 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 13:06:27 -0700 Subject: [PATCH 11/33] Add TODO to fix case where mean IR is infinity --- imblearn/over_sampling/_mlsmote.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 156e2d0a3..e904b4606 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -110,6 +110,10 @@ def fit_resample(self, X, y): append_X_synth = X_synth.append append_y_synth = y_synth.append + + """TODO: Handle the case where 'mean_ir' is infinity. Happens when one label has + no samples + """ mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: From 04aae006a207ff29efe4ecdc37713f608f72c77d Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 19 Sep 2022 13:22:42 -0700 Subject: [PATCH 12/33] Handle/fix case where X_synth is empty --- imblearn/over_sampling/_mlsmote.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index e904b4606..9af4bcf3d 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -105,10 +105,9 @@ def fit_resample(self, X, y): " or 'list'" ) - X_synth = [] + X_synth = np.array([]).reshape(0, self.n_features_) y_synth = [] - append_X_synth = X_synth.append append_y_synth = y_synth.append """TODO: Handle the case where 'mean_ir' is infinity. Happens when one label has @@ -144,7 +143,7 @@ def fit_resample(self, X, y): labels, random_state, ) - append_X_synth(X_new) + X_synth = np.vstack((X_synth, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled else: @@ -173,7 +172,7 @@ def fit_resample(self, X, y): labels, random_state, ) - append_X_synth(X_new) + X_synth = np.vstack((X_synth, X_new)) append_y_synth(y_new) y_resampled.extend(y_synth) return np.concatenate((X_resampled, np.array(X_synth))), y_resampled From a4e70fd5a0f1520fdc80a458527636cdf8963fa8 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 09:17:01 -0700 Subject: [PATCH 13/33] Refactored code to reduce branching. 
Input label always converted to sparse matrix --- imblearn/over_sampling/_mlsmote.py | 224 +++++++++-------------------- 1 file changed, 67 insertions(+), 157 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 9af4bcf3d..25b6da201 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -15,9 +15,8 @@ class MLSMOTE: sampling_strategy: 'ranking', 'union' or 'intersection' default: 'ranking' Strategy to generate labelsets - k_neighbors : int or object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic + If ``int``, number of nearest neighbors used to construct synthetic samples. categorical_features : ndarray of shape (n_cat_features,) or (n_features,) @@ -91,91 +90,55 @@ def fit_resample(self, X, y): random_state = check_random_state(self.random_state) X_resampled = X.copy() - y_resampled = y.copy() - if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: - labels = y - unique_labels = range(0, y.shape[1]) + # Convert 'y' to a sparse matrix + if type(y) == sparse._csr.csr_matrix: + y_resampled = y.copy() + unique_labels = range(0, y_resampled.shape[1]) + elif type(y) == np.ndarray: + y_resampled = sparse.csr_matrix(y, dtype=int) + unique_labels = range(0, y_resampled.shape[1]) elif type(y) == list: - labels = np.array([np.array(xi) for xi in y], dtype=object) unique_labels = self._collect_unique_labels(y) + y_resampled = sparse.csr_matrix((len(y), len(unique_labels))) + for i, sample in enumerate(y): + for label in sample: + y_resampled[i, label] = 1 else: raise TypeError( - "'y' can only be of type 'numpy.ndarray', 'scipy.sparse._csr.csr_matrix'" - " or 'list'" + "'y' can only be of type 'numpy.ndarray', " + "'scipy.sparse._csr.csr_matrix' or 'list'" ) - X_synth = np.array([]).reshape(0, self.n_features_) - y_synth = [] - - append_y_synth = y_synth.append - """TODO: Handle the case where 'mean_ir' is infinity. 
Happens when one label has no samples """ - mean_ir = self._get_mean_imbalance_ratio(unique_labels, labels) - - if type(y) == np.ndarray or type(y) == sparse._csr.csr_matrix: - y_synth = None - - for label in unique_labels: - irlbl_num = self._get_imbalance_ratio_numerator( - unique_labels, y_resampled - ) - irlbl = self._get_imbalance_ratio_per_label( - label, irlbl_num, y_resampled - ) - if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label, labels) - for sample in min_bag: - distances = self._calc_distances( - sample, min_bag, X, unique_labels, labels - ) - distances = np.sort(distances, order="distance") - neighbours = distances[: self.k_neighbors] - ref_neigh = random_state.choice(neighbours, 1)[0] - X_new, y_new = self._create_new_sample( - sample, - ref_neigh[1], - [x[1] for x in neighbours], - X, - unique_labels, - labels, - random_state, - ) - X_synth = np.vstack((X_synth, X_new)) - y_resampled = sparse.vstack((y_resampled, y_new)) - return np.concatenate((X_resampled, np.array(X_synth))), y_resampled - else: - for label in unique_labels: - irlbl_num = self._get_imbalance_ratio_numerator( - unique_labels, y_resampled - ) - irlbl = self._get_imbalance_ratio_per_label( - label, irlbl_num, y_resampled - ) - if irlbl > mean_ir: - min_bag = self._get_all_instances_of_label(label, labels) - for sample in min_bag: - distances = self._calc_distances( - sample, min_bag, X, unique_labels, labels - ) - distances = np.sort(distances, order="distance") - neighbours = distances[: self.k_neighbors] - ref_neigh = random_state.choice(neighbours, 1)[0] - X_new, y_new = self._create_new_sample( - sample, - ref_neigh[1], - [x[1] for x in neighbours], - X, - unique_labels, - labels, - random_state, - ) - X_synth = np.vstack((X_synth, X_new)) - append_y_synth(y_new) - y_resampled.extend(y_synth) - return np.concatenate((X_resampled, np.array(X_synth))), y_resampled + mean_ir = self._get_mean_imbalance_ratio(unique_labels, y_resampled) + + for label in unique_labels: + irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, y_resampled) + irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) + if irlbl > mean_ir: + min_bag = self._get_all_instances_of_label(label, y_resampled) + for sample in min_bag: + distances = self._calc_distances( + sample, min_bag, X_resampled, unique_labels, y_resampled + ) + distances = np.sort(distances, order="distance") + neighbors = distances[: self.k_neighbors] + ref_neigh = random_state.choice(neighbors, 1)[0] + X_new, y_new = self._create_new_sample( + sample, + ref_neigh[1], + [x[1] for x in neighbors], + X_resampled, + unique_labels, + y_resampled, + random_state, + ) + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = sparse.vstack((y_resampled, y_new)) + return X_resampled, y_resampled def _validate_estimator(self): categorical_features = np.asarray(self.categorical_features) @@ -194,27 +157,11 @@ def _validate_estimator(self): np.arange(self.n_features_), self.categorical_features_ ) - def _collect_unique_labels(self, y): - """A support function that flattens the labelsets and return one set of unique - labels - """ - return np.unique( - np.array( - [ - label - for label_set in y - for label in ( - label_set if isinstance(label_set, list) else [label_set] - ) - ] - ) - ) - def _create_new_sample( self, sample_id, ref_neigh_id, - neighbour_ids, + neighbor_ids, features, unique_labels, labels, @@ -223,7 +170,6 @@ def _create_new_sample( sample = features[sample_id] synth_sample = np.copy(sample) ref_neigh = 
features[ref_neigh_id] - sample_labels = labels[sample_id] for i in range(synth_sample.shape[0]): if i in self.continuous_features_: @@ -232,48 +178,33 @@ def _create_new_sample( synth_sample[i] = sample[i] + offset if i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - features[neighbour_ids, i] + features[neighbor_ids, i] ) X = synth_sample - if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: - neighbours_labels = labels[neighbour_ids] - possible_labels = neighbours_labels.sum(axis=0) - y = np.zeros((1, len(unique_labels))) - if self.sampling_strategy_ == "ranking": - head_index = int((self.k_neighbors + 1) / 2) - choosen_labels = possible_labels.nonzero()[1][:head_index] - y[0, choosen_labels] = 1 - if self.sampling_strategy_ == "union": - choosen_labels = possible_labels.nonzero()[0] - y[choosen_labels] = 1 - if self.sampling_strategy_ == "intersection": - choosen_labels = sparse.find(possible_labels == len(neighbours_labels)) - y[choosen_labels] = 1 - y = sparse.csr_matrix(y) - - else: - neighbours_labels = [] - for ni in neighbour_ids: - neighbours_labels.append(labels[ni].tolist()) - - new_labels = [] # sample_labels.tolist() - new_labels += [ - a - for x in neighbours_labels - for a in (x if isinstance(x, list) else [x]) - ] - new_labels = list(set(new_labels)) - if self.sampling_strategy_ == "ranking": - head_index = int((self.k_neighbors + 1) / 2) - y = new_labels[:head_index] - if self.sampling_strategy_ == "union": - y = new_labels[:] - if self.sampling_strategy_ == "intersection": - y = list(set.intersection(*neighbours_labels)) + neighbors_labels = labels[neighbor_ids] + possible_labels = neighbors_labels.sum(axis=0) + y = np.zeros((1, len(unique_labels))) + if self.sampling_strategy_ == "ranking": + head_index = int((self.k_neighbors + 1) / 2) + choosen_labels = possible_labels.nonzero()[1][:head_index] + y[0, choosen_labels] = 1 + if self.sampling_strategy_ == "union": + choosen_labels = possible_labels.nonzero()[0] + y[choosen_labels] = 1 + if self.sampling_strategy_ == "intersection": + choosen_labels = sparse.find(possible_labels == len(neighbors_labels)) + y[choosen_labels] = 1 + y = sparse.csr_matrix(y) return X, y + def _collect_unique_labels(self, y): + """A support function that flattens the labelsets and return one set of unique + labels + """ + return np.unique(np.array([label for label_set in y for label in label_set])) + def _calc_distances(self, sample, min_bag, features, unique_labels, labels): def calc_dist(bag_sample): nominal_distance = sum( @@ -309,7 +240,9 @@ def _get_euclidean_distance(self, first, second): return euclidean_distance def _get_vdm(self, first, second, features, category, unique_labels, labels): - """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf""" + """A support function to compute the Value Difference Metric(VDM) discribed in + https://arxiv.org/pdf/cs/9701101.pdf + """ if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: def f_sparse(c): @@ -340,14 +273,7 @@ def f(c): return vdm def _get_all_instances_of_label(self, label, labels): - if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: - return labels[:, label].nonzero()[0] - instance_ids = [] - append_instance_id = instance_ids.append - for i, label_set in enumerate(labels): - if label in label_set: - append_instance_id(i) - return np.array(instance_ids) + return labels[:, label].nonzero()[0] def _get_mean_imbalance_ratio(self, 
unique_labels, labels): irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, labels) @@ -373,23 +299,7 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): - if type(labels) == sparse._csr.csr_matrix: - return labels[:, label].count_nonzero() - elif type(labels) == np.ndarray: - return np.count_nonzero(labels[:, label]) - else: - h_sum = 0 - - def h(l, Y): - if l in Y: - return 1 - else: - return 0 - - for label_set in labels: - h_sum += h(label, label_set) - - return h_sum + return labels[:, label].count_nonzero() def _get_label_frequencies(self, labels): """A support function to get the frequencies of labels""" From 0d70b3dd8187badcc50545d0a30f66b7789f0508 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 09:28:37 -0700 Subject: [PATCH 14/33] Fix bug where 'sample' was included in the neighbor set --- imblearn/over_sampling/_mlsmote.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 25b6da201..4cadcc53e 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -125,7 +125,9 @@ def fit_resample(self, X, y): sample, min_bag, X_resampled, unique_labels, y_resampled ) distances = np.sort(distances, order="distance") - neighbors = distances[: self.k_neighbors] + neighbors = distances[ + 1 : self.k_neighbors + 1 + ] # Remove 'sample' from neighbor set ref_neigh = random_state.choice(neighbors, 1)[0] X_new, y_new = self._create_new_sample( sample, From 1a6b2492ca8c88b40610c202c026a0ac290cb331 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 09:36:20 -0700 Subject: [PATCH 15/33] Handle the case (skip generating synth samples) when there is only one sample for a given label --- imblearn/over_sampling/_mlsmote.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 4cadcc53e..6ec901ac8 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -120,6 +120,10 @@ def fit_resample(self, X, y): irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) + if ( + len(min_bag) <= 1 + ): # If there is only one sample, the neighbor set will be empty + continue for sample in min_bag: distances = self._calc_distances( sample, min_bag, X_resampled, unique_labels, y_resampled From aa212af33a7644686e49421c09832e65619cba1a Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 10:13:23 -0700 Subject: [PATCH 16/33] Reorganized code to make it more consistent with other implementations --- imblearn/over_sampling/_mlsmote.py | 50 +++++++++++++++++------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 6ec901ac8..9afe1cdd9 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -42,19 +42,44 @@ class MLSMOTE: >>> from sklearn.datasets import make_multilabel_classification """ + _required_parameters = ["categorical_features"] + _sampling_strategies = ["intersection", "ranking", "union"] + def __init__( self, + categorical_features, *, sampling_strategy="ranking", - categorical_features, random_state=None, k_neighbors=5, ): + if sampling_strategy not in MLSMOTE._sampling_strategies:
raise ValueError( + "Sampling Strategy can only be one of: 'ranking', 'union' or " + "'intersection'" + ) + + self.categorical_features = categorical_features + self.sampling_strategy_ = sampling_strategy self.random_state = random_state self.k_neighbors = k_neighbors - self.sampling_strategy_ = sampling_strategy - self.categorical_features = categorical_features - self.continuous_features_ = None + + def _validate_estimator(self): + categorical_features = np.asarray(self.categorical_features) + if categorical_features.dtype.name == "bool": + self.categorical_features_ = np.flatnonzero(categorical_features) + else: + if any( + [cat not in np.arange(self.n_features_) for cat in categorical_features] + ): + raise ValueError( + "Some of the categorical indices are out of range. Indices" + f" should be between 0 and {self.n_features_ - 1}" + ) + self.categorical_features_ = categorical_features + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) def fit_resample(self, X, y): """Resample the dataset. @@ -146,23 +171,6 @@ def fit_resample(self, X, y): y_resampled = sparse.vstack((y_resampled, y_new)) return X_resampled, y_resampled - def _validate_estimator(self): - categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.name == bool: - self.categorical_features_ = np.flatnonzero(categorical_features) - else: - if any( - [cat not in np.arange(self.n_features_) for cat in categorical_features] - ): - raise ValueError( - "Some of the categorical indices are out of range. Indices" - f" should be between 0 and {self.n_features_ - 1}" - ) - self.categorical_features_ = categorical_features - self.continuous_features_ = np.setdiff1d( - np.arange(self.n_features_), self.categorical_features_ - ) - def _create_new_sample( self, sample_id, From 7fccdd636101365b8db372805e1c6caf29e359f2 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 10:58:17 -0700 Subject: [PATCH 17/33] Improve code readability --- imblearn/over_sampling/_mlsmote.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 9afe1cdd9..4ae72fdf3 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -149,9 +149,9 @@ def fit_resample(self, X, y): len(min_bag) <= 1 ): # If there is only one sample, the neighbor set will be empty continue - for sample in min_bag: + for sample_id in min_bag: distances = self._calc_distances( - sample, min_bag, X_resampled, unique_labels, y_resampled + sample_id, min_bag, X_resampled, unique_labels, y_resampled ) distances = np.sort(distances, order="distance") neighbors = distances[ @@ -159,7 +159,7 @@ def fit_resample(self, X, y): ] # Remove 'sample' from neighbor set ref_neigh = random_state.choice(neighbors, 1)[0] X_new, y_new = self._create_new_sample( - sample, + sample_id, ref_neigh[1], [x[1] for x in neighbors], X_resampled, @@ -176,42 +176,41 @@ def _create_new_sample( sample_id, ref_neigh_id, neighbor_ids, - features, + X_resampled, unique_labels, - labels, + y_resampled, random_state, ): - sample = features[sample_id] - synth_sample = np.copy(sample) - ref_neigh = features[ref_neigh_id] + sample = X_resampled[sample_id] + synth_sample = np.zeros_like(sample) + ref_neigh = X_resampled[ref_neigh_id] for i in range(synth_sample.shape[0]): if i in self.continuous_features_: diff = ref_neigh[i] - sample[i] offset = diff * random_state.uniform(0, 1)
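# The surrounding lines are the plain SMOTE interpolation step: 'offset' is
# a uniform fraction of the gap to the reference neighbor, so the synthetic
# value below always lands on the segment between sample[i] and
# ref_neigh[i]; e.g. sample[i] = 2.0 and ref_neigh[i] = 5.0 can yield any
# value in [2.0, 5.0].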
synth_sample[i] = sample[i] + offset - if i in self.categorical_features_: + elif i in self.categorical_features_: synth_sample[i] = self._get_most_frequent_value( - features[neighbor_ids, i] + X_resampled[neighbor_ids, i] ) - X = synth_sample - neighbors_labels = labels[neighbor_ids] + neighbors_labels = y_resampled[neighbor_ids] possible_labels = neighbors_labels.sum(axis=0) y = np.zeros((1, len(unique_labels))) if self.sampling_strategy_ == "ranking": head_index = int((self.k_neighbors + 1) / 2) choosen_labels = possible_labels.nonzero()[1][:head_index] y[0, choosen_labels] = 1 - if self.sampling_strategy_ == "union": + elif self.sampling_strategy_ == "union": choosen_labels = possible_labels.nonzero()[0] y[choosen_labels] = 1 - if self.sampling_strategy_ == "intersection": + elif self.sampling_strategy_ == "intersection": choosen_labels = sparse.find(possible_labels == len(neighbors_labels)) y[choosen_labels] = 1 y = sparse.csr_matrix(y) - return X, y + return synth_sample, y def _collect_unique_labels(self, y): """A support function that flattens the labelsets and return one set of unique From c4ccaeb433d2da70dc8bcea508b3bd4f64202c14 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 11:22:09 -0700 Subject: [PATCH 18/33] Simplify _get_most_frequent_value --- imblearn/over_sampling/_mlsmote.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 4ae72fdf3..041a59321 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -321,6 +321,9 @@ def _get_label_frequencies(self, labels): return frequencies def _get_most_frequent_value(self, values): - """A support function to get most frequent value if a list of values""" - uniques, indices = np.unique(values, return_inverse=True) - return uniques[np.argmax(np.bincount(indices))] + """A support function to get most frequent value if a list of values + TODO: We might want to randomize 'unique' and 'counts' to avoid always returning + the first occurrence when multiple occurrences of the maximum value. + """ + uniques, counts = np.unique(values, return_counts=True) + return uniques[np.argmax(counts)] From d7e4e42e34db47af17fff90ed16be9179706261a Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 12:32:48 -0700 Subject: [PATCH 19/33] Fixed erroneous implementation of ranking and intersection strategies. 
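(On the randomization TODO in PATCH 18 above: np.unique returns its values sorted, so np.argmax silently resolves ties toward the smallest value; a minimal sketch of the issue with one possible seeded tie-break, illustrative only:)

import numpy as np

values = np.array([3, 1, 1, 3])            # values 1 and 3 both occur twice
uniques, counts = np.unique(values, return_counts=True)
print(uniques[np.argmax(counts)])          # always prints 1, never 3

rng = np.random.default_rng(0)             # assumed seeded Generator
tied = uniques[counts == counts.max()]     # array([1, 3])
print(rng.choice(tied))                    # uniform pick among the tied values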
Improved code readability --- imblearn/over_sampling/_mlsmote.py | 41 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 041a59321..56f041ae2 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -43,13 +43,17 @@ class MLSMOTE: """ _required_parameters = ["categorical_features"] - _sampling_strategies = ["intersection", "ranking", "union"] + + INTERSECTION = "intersection" + RANKING = "ranking" + UNION = "union" + _sampling_strategies = [INTERSECTION, RANKING, UNION] def __init__( self, categorical_features, *, - sampling_strategy="ranking", + sampling_strategy=RANKING, random_state=None, k_neighbors=5, ): @@ -196,21 +200,24 @@ def _create_new_sample( ) neighbors_labels = y_resampled[neighbor_ids] - possible_labels = neighbors_labels.sum(axis=0) - y = np.zeros((1, len(unique_labels))) - if self.sampling_strategy_ == "ranking": - head_index = int((self.k_neighbors + 1) / 2) - choosen_labels = possible_labels.nonzero()[1][:head_index] - y[0, choosen_labels] = 1 - elif self.sampling_strategy_ == "union": - choosen_labels = possible_labels.nonzero()[0] - y[choosen_labels] = 1 - elif self.sampling_strategy_ == "intersection": - choosen_labels = sparse.find(possible_labels == len(neighbors_labels)) - y[choosen_labels] = 1 - y = sparse.csr_matrix(y) - - return synth_sample, y + label_counts = np.squeeze( + np.asarray(y_resampled[sample_id] + neighbors_labels.sum(axis=0)) + ) + synth_sample_labels = sparse.csr_matrix((1, len(unique_labels))) + if self.sampling_strategy_ == MLSMOTE.RANKING: + # Note: Paper states "present in half or more of the instances considered" + # but pseudocode shows: "labels lblCounts > (k + 1)/2" instead of '>='. We + # follow the pseudocode for now. 
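# As a worked illustration (assumed values; with k_neighbors=5 the seed
# sample plus its 5 neighbors contribute 6 label vectors):
#     label_counts = [6, 4, 2, 0]  ->  quorum = int((5 + 1) / 2) = 3
#     ranking:      counts > 3   ->  labels 0 and 1 are kept
#     union:        counts != 0  ->  labels 0, 1 and 2 are kept
#     intersection: counts == 6  ->  only label 0 is kept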
+ quorum = int((len(neighbor_ids) + 1) / 2) + chosen_labels = label_counts > quorum + elif self.sampling_strategy_ == MLSMOTE.UNION: + chosen_labels = label_counts.nonzero() + elif self.sampling_strategy_ == MLSMOTE.INTERSECTION: + chosen_labels = label_counts == len(neighbor_ids) + 1 + + synth_sample_labels[0, chosen_labels] = 1 + + return synth_sample, synth_sample_labels def _collect_unique_labels(self, y): """A support function that flattens the labelsets and return one set of unique From 8d1c4c966802439050f7930815d7c350938366c8 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 13:43:19 -0700 Subject: [PATCH 20/33] Add support function to return labels equal to their input type --- imblearn/over_sampling/_mlsmote.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 56f041ae2..d35346816 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -173,7 +173,7 @@ def fit_resample(self, X, y): ) X_resampled = np.vstack((X_resampled, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) - return X_resampled, y_resampled + return X_resampled, self.convert_to_input_type(y_resampled, unique_labels, type(y)) def _create_new_sample( self, @@ -334,3 +334,16 @@ def _get_most_frequent_value(self, values): """ uniques, counts = np.unique(values, return_counts=True) return uniques[np.argmax(counts)] + + def convert_to_input_type(self, y_resampled, unique_labels, input_type): + """A support function that converts the labels back to its input format""" + if input_type == sparse._csr.csr_matrix: + return y_resampled + elif input_type == np.ndarray: + return np.asarray(y_resampled.todense()) + elif input_type == list: + labels = [[] for _ in range(y_resampled.shape[0])] + rows, cols = y_resampled.nonzero() + for row, col in zip(rows, cols): + labels[row].append(unique_labels[col]) + return labels From 66566c6f8fa78b200a30913a1a722104b96dbcfe Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 15:57:19 -0700 Subject: [PATCH 21/33] Refactor use of unique_labels --- imblearn/over_sampling/_mlsmote.py | 50 ++++++++++++++++-------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index d35346816..cd424d0e1 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -120,32 +120,33 @@ def fit_resample(self, X, y): X_resampled = X.copy() + unique_labels = None # Convert 'y' to a sparse matrix if type(y) == sparse._csr.csr_matrix: y_resampled = y.copy() - unique_labels = range(0, y_resampled.shape[1]) elif type(y) == np.ndarray: y_resampled = sparse.csr_matrix(y, dtype=int) - unique_labels = range(0, y_resampled.shape[1]) elif type(y) == list: unique_labels = self._collect_unique_labels(y) y_resampled = sparse.csr_matrix((len(y), len(unique_labels))) - for i, sample in enumerate(y): - for label in sample: - y_resampled[i, label] = 1 + for i, sample_labels in enumerate(y): + for label in sample_labels: + y_resampled[i, np.where(unique_labels == label)] = 1 else: raise TypeError( "'y' can only be of type 'numpy.ndarray', " "'scipy.sparse._csr.csr_matrix' or 'list'" ) + self.n_classes_ = y_resampled.shape[1] + """TODO: Handle the case where 'mean_ir' is infinity. 
Happens when one label has no samples """ - mean_ir = self._get_mean_imbalance_ratio(unique_labels, y_resampled) + mean_ir = self._get_mean_imbalance_ratio(y_resampled) - for label in unique_labels: - irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, y_resampled) + for label in range(self.n_classes_): + irlbl_num = self._get_imbalance_ratio_numerator(y_resampled) irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) @@ -155,7 +156,7 @@ def fit_resample(self, X, y): continue for sample_id in min_bag: distances = self._calc_distances( - sample_id, min_bag, X_resampled, unique_labels, y_resampled + sample_id, min_bag, X_resampled, y_resampled ) distances = np.sort(distances, order="distance") neighbors = distances[ @@ -167,13 +168,14 @@ def fit_resample(self, X, y): ref_neigh[1], [x[1] for x in neighbors], X_resampled, - unique_labels, y_resampled, random_state, ) X_resampled = np.vstack((X_resampled, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) - return X_resampled, self.convert_to_input_type(y_resampled, unique_labels, type(y)) + return X_resampled, self.convert_to_input_type( + y_resampled, unique_labels, type(y) + ) def _create_new_sample( self, @@ -181,7 +183,6 @@ def _create_new_sample( ref_neigh_id, neighbor_ids, X_resampled, - unique_labels, y_resampled, random_state, ): @@ -203,7 +204,7 @@ def _create_new_sample( label_counts = np.squeeze( np.asarray(y_resampled[sample_id] + neighbors_labels.sum(axis=0)) ) - synth_sample_labels = sparse.csr_matrix((1, len(unique_labels))) + synth_sample_labels = sparse.csr_matrix((1, self.n_classes_), dtype=int) if self.sampling_strategy_ == MLSMOTE.RANKING: # Note: Paper states "present in half or more of the instances considered" # but pseudocode shows: "labels lblCounts > (k + 1)/2" instead of '>='. 
We @@ -225,7 +226,7 @@ def _collect_unique_labels(self, y): """ return np.unique(np.array([label for label_set in y for label in label_set])) - def _calc_distances(self, sample, min_bag, features, unique_labels, labels): + def _calc_distances(self, sample, min_bag, features, labels): def calc_dist(bag_sample): nominal_distance = sum( [ @@ -234,7 +235,6 @@ def calc_dist(bag_sample): features[bag_sample, cat], features, cat, - unique_labels, labels, ) for cat in self.categorical_features_ @@ -259,7 +259,7 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, features, category, unique_labels, labels): + def _get_vdm(self, first, second, features, category, labels): """A support function to compute the Value Difference Metric(VDM) discribed in https://arxiv.org/pdf/cs/9701101.pdf """ @@ -274,7 +274,7 @@ def f_sparse(c): p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) return p - vdm = np.sum(np.array([f_sparse(c) for c in unique_labels])) + vdm = np.sum(np.array([f_sparse(c) for c in range(self.n_classes_)])) return vdm category_rows = features[:, category] @@ -289,30 +289,32 @@ def f(c): p = abs((N_axc / N_ax) - (N_ayc / N_ay)) return p - vdm = np.array([f(c) for c in unique_labels]).sum() + vdm = np.array([f(c) for c in range(self.n_classes_)]).sum() return vdm def _get_all_instances_of_label(self, label, labels): return labels[:, label].nonzero()[0] - def _get_mean_imbalance_ratio(self, unique_labels, labels): - irlbl_num = self._get_imbalance_ratio_numerator(unique_labels, labels) + def _get_mean_imbalance_ratio(self, labels): + irlbl_num = self._get_imbalance_ratio_numerator(labels) ratio_sum = np.sum( np.array( list( map( self._get_imbalance_ratio_per_label, - unique_labels, + range(self.n_classes_), itertools.repeat(irlbl_num), itertools.repeat(labels), ) ) ) ) - return ratio_sum / len(unique_labels) + return ratio_sum / self.n_classes_ - def _get_imbalance_ratio_numerator(self, unique_labels, labels): - sum_array = np.array([self._sum_h(label, labels) for label in unique_labels]) + def _get_imbalance_ratio_numerator(self, labels): + sum_array = np.array( + [self._sum_h(label, labels) for label in range(self.n_classes_)] + ) return sum_array.max() def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): From 3aa029fec5decd1eca65974e599b07b1c3516218 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 18:23:00 -0700 Subject: [PATCH 22/33] Optimize _get_vdm --- imblearn/over_sampling/_mlsmote.py | 40 +++++++++--------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index cd424d0e1..1382b2ed6 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -248,7 +248,7 @@ def calc_dist(bag_sample): for num in self.continuous_features_ ] ) - dist = sum([nominal_distance, ordinal_distance]) + dist = nominal_distance + ordinal_distance return (dist, bag_sample) distances = [calc_dist(bag_sample) for bag_sample in min_bag] @@ -259,37 +259,21 @@ def _get_euclidean_distance(self, first, second): euclidean_distance = np.linalg.norm(first - second) return euclidean_distance - def _get_vdm(self, first, second, features, category, labels): - """A support function to compute the Value Difference Metric(VDM) discribed in + def _get_vdm(self, x_attr_val, y_attr_val, features, category, labels): + """A support function to 
compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf """ - if type(labels) == np.ndarray or type(labels) == sparse._csr.csr_matrix: - - def f_sparse(c): - N_ax = len(sparse.find(features[:, category] == first)[0]) - N_ay = len(sparse.find(features[:, category] == second)[0]) - c_instances = self._get_all_instances_of_label(c, labels) - N_axc = len(sparse.find(features[c_instances, category] == first)[0]) - N_ayc = len(sparse.find(features[c_instances, category] == second)[0]) - p = np.square(np.abs((N_axc / N_ax) - (N_ayc / N_ay))) - return p - - vdm = np.sum(np.array([f_sparse(c) for c in range(self.n_classes_)])) - return vdm - - category_rows = features[:, category] - N_ax = len(np.where(category_rows == first)) - N_ay = len(np.where(category_rows == second)) - - def f(c): - class_instances = self._get_all_instances_of_label(c, labels) - class_instance_rows = category_rows[class_instances] - N_axc = len(np.where(class_instance_rows == first)[0]) - N_ayc = len(np.where(class_instance_rows == second)[0]) - p = abs((N_axc / N_ax) - (N_ayc / N_ay)) + + def f_sparse(_class): + c_instances = self._get_all_instances_of_label(_class, labels) + N_axc = np.count_nonzero(features[c_instances, category] == x_attr_val) + N_ayc = np.count_nonzero(features[c_instances, category] == y_attr_val) + p = abs((N_axc / N_ax) - (N_ayc / N_ay)) ** 2 return p - vdm = np.array([f(c) for c in range(self.n_classes_)]).sum() + N_ax = np.count_nonzero(features[:, category] == x_attr_val) + N_ay = np.count_nonzero(features[:, category] == y_attr_val) + vdm = sum([f_sparse(_class) for _class in range(self.n_classes_)]) return vdm def _get_all_instances_of_label(self, label, labels): From feb2f98aaa854e51747bdbda3e9d8ed2fd3cc42f Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 19:10:13 -0700 Subject: [PATCH 23/33] Add documentation --- README.rst | 3 +++ imblearn/over_sampling/_mlsmote.py | 35 +++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index dee6419cb..36936712f 100644 --- a/README.rst +++ b/README.rst @@ -185,6 +185,7 @@ Below is a list of the methods currently implemented in this module. 7. ADASYN - Adaptive synthetic sampling approach for imbalanced learning [15]_ 8. KMeans-SMOTE [17]_ 9. ROSE - Random OverSampling Examples [19]_ + 10. MLSMOTE - Multilabel Synthetic Minority Over-sampling Technique [20]_ * Over-sampling followed by under-sampling 1. SMOTE + Tomek links [12]_ @@ -243,3 +244,5 @@ References: .. [18] : Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197. .. [19] : Menardi, G., Torelli, N.: "Training and assessing classification rules with unbalanced data", Data Mining and Knowledge Discovery, 28, (2014): 92–122 + +.. [20] : Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, Francisco. (2015). MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019. 
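(A compact usage sketch to go with the reference above and the docstring example added below; calls mirror the documented API and outputs are not shown here:)

from sklearn.datasets import make_multilabel_classification
from imblearn.over_sampling import MLSMOTE

X, y = make_multilabel_classification(n_classes=5, n_features=20, random_state=42)

# All features are continuous here, so no categorical indices are passed.
mlsmote = MLSMOTE(categorical_features=[], random_state=42)
X_res, y_res = mlsmote.fit_resample(X, y)

# The label container type is round-tripped: a dense indicator array goes
# in, a dense indicator array of shape (n_samples_new, n_labels) comes out.
print(X_res.shape, y_res.shape)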
diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1382b2ed6..646e3ac20 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -39,7 +39,36 @@ class MLSMOTE: Examples -------- + >>> import numpy as np >>> from sklearn.datasets import make_multilabel_classification + >>> from imblearn.over_sampling import MLSMOTE + >>> X, y = make_multilabel_classification(n_classes=5, n_features=20, + ... random_state=42) + >>> print("Original Dataset") + Original Dataset + >>> print(f"Samples: {X.shape[0]}") + Samples: 100 + >>> for _class in range(y.shape[1]): + ... print(f"Class {_class} count: {np.count_nonzero(y[:, _class])}") + Class 0 count: 30 + Class 1 count: 54 + Class 2 count: 48 + Class 3 count: 33 + Class 4 count: 14 + >>> categorical_features = np.full((20,), True) + >>> mlsmote = MLSMOTE(categorical_features, random_state=42) + >>> X_res, y_res = mlsmote.fit_resample(X, y) + >>> print("Resampled Dataset") + Resampled Dataset + >>> print(f"Samples: {X_res.shape[0]}") + Samples: 114 + >>> for _class in range(y_res.shape[1]): + ... print(f"Class {_class} count: {np.count_nonzero(y_res[:, _class])}") + Class 0 count: 30 + Class 1 count: 60 + Class 2 count: 56 + Class 3 count: 33 + Class 4 count: 28 """ _required_parameters = ["categorical_features"] @@ -95,8 +124,7 @@ def fit_resample(self, X, y): Matrix containing the data which have to be sampled. y : {array-like, sparse matrix} of shape \ - (n_samples, n_labels) - or a list of lists of labels. + (n_samples, n_labels) or a list of lists of labels. See "sklearn.datasets.make_multilabel_classification" and \ the "return_indicator" input parameter for more \ information on possible label set formats. @@ -110,7 +138,8 @@ def fit_resample(self, X, y): (n_samples_new, n_features) The array containing the resampled data. - y_resampled : array-like of shape (n_samples_new, n_labels) + y_resampled : array-like of shape (n_samples_new, n_labels) \ + or a list of lists of labels. The corresponding label sets of `X_resampled`.
""" self.n_features_ = X.shape[1] From ff89a0494d043f51a00fedd7e5d68a9e8e327418 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Tue, 20 Sep 2022 20:33:29 -0700 Subject: [PATCH 24/33] Add and update tests --- imblearn/over_sampling/tests/test_mlsmote.py | 216 +++++++++++++++---- 1 file changed, 176 insertions(+), 40 deletions(-) diff --git a/imblearn/over_sampling/tests/test_mlsmote.py b/imblearn/over_sampling/tests/test_mlsmote.py index 49f1c0317..5174c197e 100644 --- a/imblearn/over_sampling/tests/test_mlsmote.py +++ b/imblearn/over_sampling/tests/test_mlsmote.py @@ -1,17 +1,16 @@ """Test the module MLSMOTE.""" - -from collections import Counter - -import pytest - import numpy as np -from scipy import sparse -from sklearn.preprocessing import MultiLabelBinarizer +import pytest +from sklearn.datasets import make_multilabel_classification +from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import assert_array_equal from imblearn.over_sampling import MLSMOTE +R_TOL = 1e-4 + def data_heterogneous_ordered(): rng = np.random.RandomState(42) @@ -22,7 +21,7 @@ def data_heterogneous_ordered(): X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20 # return the categories return X, y, [2, 3] @@ -36,7 +35,7 @@ def data_heterogneous_unordered(): X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20 # return the categories return X, y, [0, 3] @@ -50,28 +49,33 @@ def data_heterogneous_masked(): X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object) # create a categorical feature using some integer X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) + y = [[0, 2, 3]] * 5 + [[1, 2, 3, 4]] * 2 + [[1, 2]] * 3 + [[1]] * 20 # return the categories return X, y, [True, False, True] def data_sparse(): - rng = np.random.RandomState(42) - X = np.empty((30, 4), dtype=np.float64) - # create 2 random continuous feature - X[:, [1, 2]] = rng.randn(30, 2) - # create a categorical feature using some string - X[:, 0] = rng.randint(3, size=30) - # create a categorical feature using some integer - X[:, 3] = rng.randint(3, size=30) - y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20) - labelBinarizer = MultiLabelBinarizer() - y = labelBinarizer.fit_transform(y) - y = sparse.csr_matrix(y) - return X, y, [0, 3] + X, y = make_multilabel_classification( + n_samples=20, n_features=5, return_indicator="sparse", random_state=42 + ) + return X, y, [] + + +def data_dense(): + X, y = make_multilabel_classification( + n_samples=20, n_features=5, return_indicator="dense", random_state=42 + ) + return X, y, [] -def test_mlsmote_error(): +def data_list_of_lists(): + X, y = make_multilabel_classification( + n_samples=20, n_features=5, return_indicator=False, random_state=42 + ) + return X, y, [] + + +def test_mlsmote_categorical_features_error(): X, y, _ = data_heterogneous_unordered() categorical_features = [0, 10] smote = MLSMOTE(categorical_features=categorical_features) @@ -79,21 +83,32 @@ def test_mlsmote_error(): smote.fit_resample(X, y) +def 
test_mlsmote_invalid_strategy_error(): + _, _, categorical_features = data_heterogneous_unordered() + with pytest.raises( + ValueError, + match="Sampling Strategy can only be one of:", + ): + _ = MLSMOTE(categorical_features=categorical_features, sampling_strategy="foo") + + @pytest.mark.parametrize( "data", [ data_heterogneous_ordered(), data_heterogneous_unordered(), data_heterogneous_masked(), - data_sparse() + data_sparse(), + data_dense(), + data_list_of_lists(), ], ) def test_mlsmote(data): X, y, categorical_features = data smote = MLSMOTE(categorical_features=categorical_features) X_resampled, y_resampled = smote.fit_resample(X, y) - assert X_resampled.dtype == X.dtype + assert type(y) == type(y_resampled) categorical_features = np.array(categorical_features) if categorical_features.dtype == bool: @@ -103,24 +118,145 @@ def test_mlsmote(data): assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype -def test_mlsmote_fit(): +def test_mlsmote_fit_resample_1(): X, y, categorical_features = data_heterogneous_unordered() + classes = set([a for x in y for a in x]) smote = MLSMOTE(categorical_features=categorical_features) - smote.fit_resample(X, y) + _, y_res = smote.fit_resample(X, y) + classes_res = set([a for x in y_res for a in x]) + + assert classes == classes_res assert hasattr( smote, "sampling_strategy_" ), "No fitted attribute sampling_strategy_" -def test_mlsmote_fit_resample(): - X, y, categorical_features = data_heterogneous_unordered() - target_stats = Counter(np.unique( - np.array([a for x in y for a in (x if isinstance(x, list) else [x])]))) - smote = MLSMOTE(categorical_features=categorical_features) - _, y_res = smote.fit_resample(X, y) - classes_res = np.unique( - np.array([a for x in y_res - for a in (x if isinstance(x, list) else [x])])) - _ = Counter(classes_res) - n_samples = max(target_stats.values()) - assert all(value >= n_samples for value in Counter(classes_res).values()) +def test_mlsmote_fit_resample_2(): + X = np.array( + [ + [25.0, 34.0], + [38.0, 10.0], + [47.0, 7.0], + [32.0, 15.0], + [23.0, 27.0], + [36.0, 9.0], + [45.0, 10.0], + [39.0, 7.0], + [29.0, 26.0], + [31.0, 18.0], + [36.0, 6.0], + [37.0, 7.0], + [44.0, 10.0], + [42.0, 16.0], + [39.0, 5.0], + [44.0, 9.0], + [33.0, 13.0], + [36.0, 12.0], + [32.0, 6.0], + [28.0, 9.0], + ] + ) + + y = np.array( + [ + [0, 0], + [1, 1], + [1, 0], + [1, 1], + [0, 0], + [1, 1], + [1, 1], + [0, 1], + [0, 0], + [0, 0], + [0, 1], + [1, 0], + [1, 1], + [0, 1], + [1, 1], + [1, 1], + [1, 1], + [0, 1], + [1, 1], + [0, 1], + ] + ) + + X_resampled_exp = np.array( + [ + [25.0, 34.0], + [38.0, 10.0], + [47.0, 7.0], + [32.0, 15.0], + [23.0, 27.0], + [36.0, 9.0], + [45.0, 10.0], + [39.0, 7.0], + [29.0, 26.0], + [31.0, 18.0], + [36.0, 6.0], + [37.0, 7.0], + [44.0, 10.0], + [42.0, 16.0], + [39.0, 5.0], + [44.0, 9.0], + [33.0, 13.0], + [36.0, 12.0], + [32.0, 6.0], + [28.0, 9.0], + [38.95071431, 6.34003029], + [42.22519874, 6.10833449], + [33.83699557, 12.99774833], + [36.06175348, 5.12036059], + [38.43013104, 10.0], + [36.08297745, 6.69575776], + [40.54443985, 9.70877086], + [37.80041708, 5.18666265], + [41.80182894, 9.45606998], + [34.91230996, 10.05030734], + [32.23225206, 6.60754485], + ] + ) + + y_resampled_exp = np.array( + [ + [0, 0], + [1, 1], + [1, 0], + [1, 1], + [0, 0], + [1, 1], + [1, 1], + [0, 1], + [0, 0], + [0, 0], + [0, 1], + [1, 0], + [1, 1], + [0, 1], + [1, 1], + [1, 1], + [1, 1], + [0, 1], + [1, 1], + [0, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 1], + [1, 
1], + ] + ) + + smote = MLSMOTE(categorical_features=[], random_state=42) + X_resampled, y_resampled = smote.fit_resample(X, y) + print(X_resampled) + print(y_resampled) + assert_allclose(X_resampled, X_resampled_exp, rtol=R_TOL) + assert_array_equal(y_resampled, y_resampled_exp) From 1484f948b9c825d81fba657f3189309d8b23fe38 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 21 Sep 2022 08:51:14 -0700 Subject: [PATCH 25/33] Remove unused '_get_label_frequencies' function --- imblearn/over_sampling/_mlsmote.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 646e3ac20..23289cb3e 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -336,12 +336,6 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): def _sum_h(self, label, labels): return labels[:, label].count_nonzero() - def _get_label_frequencies(self, labels): - """A support function to get the frequencies of labels""" - frequency_map = np.array(np.unique(labels, return_counts=True)).T - frequencies = np.array([x[1] for x in frequency_map]) - return frequencies - def _get_most_frequent_value(self, values): """A support function to get most frequent value if a list of values TODO: We might want to randomize 'unique' and 'counts' to avoid always returning From 4938543e9ebbe66befd61dcbb2a100d19e1f8895 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 21 Sep 2022 08:52:49 -0700 Subject: [PATCH 26/33] Rename function _convert_to_input_type --- imblearn/over_sampling/_mlsmote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 23289cb3e..eb1553784 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -202,7 +202,7 @@ def fit_resample(self, X, y): ) X_resampled = np.vstack((X_resampled, X_new)) y_resampled = sparse.vstack((y_resampled, y_new)) - return X_resampled, self.convert_to_input_type( + return X_resampled, self._convert_to_input_type( y_resampled, unique_labels, type(y) ) @@ -344,7 +344,7 @@ def _get_most_frequent_value(self, values): uniques, counts = np.unique(values, return_counts=True) return uniques[np.argmax(counts)] - def convert_to_input_type(self, y_resampled, unique_labels, input_type): + def _convert_to_input_type(self, y_resampled, unique_labels, input_type): """A support function that converts the labels back to its input format""" if input_type == sparse._csr.csr_matrix: return y_resampled From 8c1319f0f7a24074da968d47ae26942b48cb00e0 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Mon, 10 Oct 2022 10:04:49 -0600 Subject: [PATCH 27/33] Simplified calculation of Mean IR --- imblearn/over_sampling/_mlsmote.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index eb1553784..1d6bfa2ee 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,6 +1,5 @@ """Class to perfrom over-sampling using MLSMOTE.""" -import itertools import numpy as np from scipy import sparse @@ -309,19 +308,11 @@ def _get_all_instances_of_label(self, label, labels): return labels[:, label].nonzero()[0] def _get_mean_imbalance_ratio(self, labels): - irlbl_num = self._get_imbalance_ratio_numerator(labels) - ratio_sum = np.sum( - np.array( - list( - map( - self._get_imbalance_ratio_per_label, - range(self.n_classes_), - 
itertools.repeat(irlbl_num), - itertools.repeat(labels), - ) - ) - ) + sum_per_label = np.array( + [self._sum_h(label, labels) for label in range(self.n_classes_)] ) + irlbl_num = sum_per_label.max() + ratio_sum = np.sum(irlbl_num / sum_per_label) return ratio_sum / self.n_classes_ def _get_imbalance_ratio_numerator(self, labels): From 2b82cda90921d34aab53a4b75864fd316550304f Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 13 Oct 2022 11:20:56 -0500 Subject: [PATCH 28/33] Simplify get_euclidean_distance --- imblearn/over_sampling/_mlsmote.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1d6bfa2ee..b71e05e05 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -284,8 +284,10 @@ def calc_dist(bag_sample): return np.array(distances, dtype=dtype) def _get_euclidean_distance(self, first, second): - euclidean_distance = np.linalg.norm(first - second) - return euclidean_distance + """Since the inputs are of type 'float' the euclidean distance is just + the absolute value of their difference. + """ + return abs(first - second) def _get_vdm(self, x_attr_val, y_attr_val, features, category, labels): """A support function to compute the Value Difference Metric(VDM) described in From 07a2cc67cbd96c2731da4a48efc7758c299cae00 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Wed, 12 Oct 2022 09:32:38 -0600 Subject: [PATCH 29/33] Use a cache to calculate euclidean distance --- imblearn/over_sampling/_mlsmote.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index b71e05e05..1811e1993 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -1,5 +1,6 @@ """Class to perfrom over-sampling using MLSMOTE.""" +from itertools import combinations import numpy as np from scipy import sparse @@ -178,13 +179,22 @@ def fit_resample(self, X, y): irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) + euclidean_dist_cache = np.zeros((y_resampled.shape[0], y_resampled.shape[0])) + X_sliced = X_resampled[:][:,self.continuous_features_] + pairs = list(combinations(min_bag, 2)) + for m, n in pairs: + distance = sum(self._get_euclidean_distance( + X_sliced[m, :], X_sliced[n, :] + )) + euclidean_dist_cache[m, n] = distance + euclidean_dist_cache[n, m] = distance if ( len(min_bag) <= 1 ): # If there is only one sample, the neighbor set will be empty continue for sample_id in min_bag: distances = self._calc_distances( - sample_id, min_bag, X_resampled, y_resampled + sample_id, min_bag, X_resampled, y_resampled, euclidean_dist_cache, ) distances = np.sort(distances, order="distance") neighbors = distances[ @@ -254,7 +264,7 @@ def _collect_unique_labels(self, y): """ return np.unique(np.array([label for label_set in y for label in label_set])) - def _calc_distances(self, sample, min_bag, features, labels): + def _calc_distances(self, sample, min_bag, features, labels, euclidean_dist_cache): def calc_dist(bag_sample): nominal_distance = sum( [ @@ -268,14 +278,7 @@ def calc_dist(bag_sample): for cat in self.categorical_features_ ] ) - ordinal_distance = sum( - [ - self._get_euclidean_distance( - features[sample, num], features[bag_sample, num] - ) - for num in self.continuous_features_ - ] - ) + ordinal_distance = 
euclidean_dist_cache[sample, bag_sample] dist = nominal_distance + ordinal_distance return (dist, bag_sample) From d36a132a93dad72a588e97ef20ab1e0274672cdd Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 27 Oct 2022 12:45:08 -0700 Subject: [PATCH 30/33] Avoid repeated calculation of 'c_instances' --- imblearn/over_sampling/_mlsmote.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 1811e1993..a717f2325 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -273,7 +273,7 @@ def calc_dist(bag_sample): features[bag_sample, cat], features, cat, - labels, + c_instances, ) for cat in self.categorical_features_ ] @@ -282,6 +282,9 @@ def calc_dist(bag_sample): dist = nominal_distance + ordinal_distance return (dist, bag_sample) + c_instances = [ + self._get_all_instances_of_label(_class, labels) for _class in range(self.n_classes_) + ] distances = [calc_dist(bag_sample) for bag_sample in min_bag] dtype = np.dtype([("distance", float), ("index", int)]) return np.array(distances, dtype=dtype) @@ -292,15 +295,14 @@ def _get_euclidean_distance(self, first, second): """ return abs(first - second) - def _get_vdm(self, x_attr_val, y_attr_val, features, category, labels): + def _get_vdm(self, x_attr_val, y_attr_val, features, category, c_instances): """A support function to compute the Value Difference Metric(VDM) described in https://arxiv.org/pdf/cs/9701101.pdf """ def f_sparse(_class): - c_instances = self._get_all_instances_of_label(_class, labels) - N_axc = np.count_nonzero(features[c_instances, category] == x_attr_val) - N_ayc = np.count_nonzero(features[c_instances, category] == y_attr_val) + N_axc = np.count_nonzero(features[c_instances[_class], category] == x_attr_val) + N_ayc = np.count_nonzero(features[c_instances[_class], category] == y_attr_val) p = abs((N_axc / N_ax) - (N_ayc / N_ay)) ** 2 return p From b593edf87bbf8a798f7069583d250e9491b8008c Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 27 Oct 2022 14:50:39 -0700 Subject: [PATCH 31/33] Minor format and code reorganization --- imblearn/over_sampling/_mlsmote.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index a717f2325..a3ca95d29 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -179,19 +179,19 @@ def fit_resample(self, X, y): irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled) if irlbl > mean_ir: min_bag = self._get_all_instances_of_label(label, y_resampled) + if ( + len(min_bag) <= 1 + ): # If there is only one sample, the neighbor set will be empty + continue euclidean_dist_cache = np.zeros((y_resampled.shape[0], y_resampled.shape[0])) - X_sliced = X_resampled[:][:,self.continuous_features_] + X_cont = X_resampled[:][:, self.continuous_features_] pairs = list(combinations(min_bag, 2)) for m, n in pairs: distance = sum(self._get_euclidean_distance( - X_sliced[m, :], X_sliced[n, :] + X_cont[m, :], X_cont[n, :] )) euclidean_dist_cache[m, n] = distance euclidean_dist_cache[n, m] = distance - if ( - len(min_bag) <= 1 - ): # If there is only one sample, the neighbor set will be empty - continue for sample_id in min_bag: distances = self._calc_distances( sample_id, min_bag, X_resampled, y_resampled, euclidean_dist_cache, From c135cfa5b4e5619798ca6abcf21a249628ee2126 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio 
Date: Thu, 27 Oct 2022 15:28:34 -0700 Subject: [PATCH 32/33] Add note about calculation of VDM distances --- imblearn/over_sampling/_mlsmote.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index a3ca95d29..608b72967 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -183,6 +183,15 @@ def fit_resample(self, X, y): len(min_bag) <= 1 ): # If there is only one sample, the neighbor set will be empty continue + # Note: Only the distance for numeric attributes can be + # cached. The Value Difference Metric (VDM) distance for + # categorical/nominal attributes CANNOT be cached because VDMs + # are dependent on the total number of samples in the dataset + # that have specific values for the different attributes. + # Given that each synthetic sample is added to the dataset in + # the inner loop (line 17 of 'Algorithm 1' of the MLSMOTE, + # Charte, F. et al. paper), the VDM between samples has to be + # computed in every inner iteration. euclidean_dist_cache = np.zeros((y_resampled.shape[0], y_resampled.shape[0])) X_cont = X_resampled[:][:, self.continuous_features_] pairs = list(combinations(min_bag, 2)) From 70e1f966414cdf9425d764c52a0a92d99c732051 Mon Sep 17 00:00:00 2001 From: Bruno Alvisio Date: Thu, 27 Oct 2022 16:05:49 -0700 Subject: [PATCH 33/33] Use numpy array instead of sparse matrix --- imblearn/over_sampling/_mlsmote.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py index 608b72967..a7133eb41 100644 --- a/imblearn/over_sampling/_mlsmote.py +++ b/imblearn/over_sampling/_mlsmote.py @@ -150,14 +150,14 @@ def fit_resample(self, X, y): X_resampled = X.copy() unique_labels = None - # Convert 'y' to a sparse matrix + # Convert 'y' to a numpy array if type(y) == sparse._csr.csr_matrix: - y_resampled = y.copy() + y_resampled = y.toarray() elif type(y) == np.ndarray: - y_resampled = sparse.csr_matrix(y, dtype=int) + y_resampled = np.copy(y) elif type(y) == list: unique_labels = self._collect_unique_labels(y) - y_resampled = sparse.csr_matrix((len(y), len(unique_labels))) + y_resampled = np.zeros((len(y), len(unique_labels))) for i, sample_labels in enumerate(y): for label in sample_labels: y_resampled[i, np.where(unique_labels == label)] = 1 @@ -219,7 +219,7 @@ def fit_resample(self, X, y): random_state, ) X_resampled = np.vstack((X_resampled, X_new)) - y_resampled = sparse.vstack((y_resampled, y_new)) + y_resampled = np.vstack((y_resampled, y_new)) return X_resampled, self._convert_to_input_type( y_resampled, unique_labels, type(y) ) @@ -251,7 +251,7 @@ def _create_new_sample( label_counts = np.squeeze( np.asarray(y_resampled[sample_id] + neighbors_labels.sum(axis=0)) ) - synth_sample_labels = sparse.csr_matrix((1, self.n_classes_), dtype=int) + synth_sample_labels = np.zeros((1, self.n_classes_), dtype=int) if self.sampling_strategy_ == MLSMOTE.RANKING: # Note: Paper states "present in half or more of the instances considered" # but pseudocode shows: "labels lblCounts > (k + 1)/2" instead of '>='. 
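# An aside on the ndarray switch in this patch (performance reasoning, not
# measured here): the indicator matrix is grown row by row inside the
# sampling loop, and np.vstack on a small dense array avoids rebuilding CSR
# structure on every sparse.vstack call; sparsity is restored only once on
# the way out, via sparse.csr_matrix(y_resampled, dtype=int) in
# _convert_to_input_type.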
We @@ -321,7 +321,7 @@ def f_sparse(_class): return vdm def _get_all_instances_of_label(self, label, labels): - return labels[:, label].nonzero()[0] + return np.nonzero(labels[:, label])[0] def _get_mean_imbalance_ratio(self, labels): sum_per_label = np.array( @@ -341,7 +341,7 @@ def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels): return irlbl_numerator / self._sum_h(label, labels) def _sum_h(self, label, labels): - return labels[:, label].count_nonzero() + return np.count_nonzero(labels[:, label]) def _get_most_frequent_value(self, values): """A support function to get most frequent value if a list of values @@ -354,9 +354,9 @@ def _get_most_frequent_value(self, values): def _convert_to_input_type(self, y_resampled, unique_labels, input_type): """A support function that converts the labels back to its input format""" if input_type == sparse._csr.csr_matrix: - return y_resampled + return sparse.csr_matrix(y_resampled, dtype=int) elif input_type == np.ndarray: - return np.asarray(y_resampled.todense()) + return y_resampled elif input_type == list: labels = [[] for _ in range(y_resampled.shape[0])] rows, cols = y_resampled.nonzero()
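# Example of the reconstruction performed here: with
#     y_resampled   = [[1, 0, 1], [0, 1, 0]]
#     unique_labels = ['a', 'b', 'c']
# nonzero() yields rows (0, 0, 1) and cols (0, 2, 1), and zipping them back
# appends one label per nonzero entry, giving [['a', 'c'], ['b']].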