-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy path_imbalance.py
121 lines (99 loc) · 4.09 KB
/
_imbalance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Transform a dataset into an imbalanced dataset."""
# Authors: Dayvid Oliveira
# Guillaume Lemaitre <[email protected]>
# Christos Aridas
# License: MIT
from collections import Counter
from ..under_sampling import RandomUnderSampler
from ..utils import check_sampling_strategy
from ..utils._validation import (
_deprecate_positional_args,
get_classes_counts,
)
@_deprecate_positional_args
def make_imbalance(
    X, y, *, sampling_strategy=None, random_state=None, verbose=False, **kwargs
):
    """Turn a dataset into an imbalanced dataset with a specific sampling
    strategy.

    The targeted classes are randomly under-sampled, without replacement,
    with a :class:`RandomUnderSampler` until they reach the requested number
    of samples.

    Read more in the :ref:`User Guide <make_imbalanced>`.

    Parameters
    ----------
    X : {array-like, dataframe} of shape (n_samples, n_features)
        Matrix containing the data to be imbalanced.

    y : ndarray of shape (n_samples,)
        Corresponding label for each sample in X.

    sampling_strategy : dict or callable, default=None
        Ratio to use for resampling the data set. Any other type, including
        the default ``None``, raises a ``ValueError``.

        - When ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples for each targeted
          class.

        - When callable, function taking ``y`` and returns a ``dict``. The keys
          correspond to the targeted classes. The values correspond to the
          desired number of samples for each class.

    random_state : int, RandomState instance or None, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    verbose : bool, default=False
        Show information regarding the sampling.

    kwargs : dict
        Dictionary of additional keyword arguments to pass to
        ``sampling_strategy``.

    Returns
    -------
    X_resampled : {ndarray, dataframe} of shape (n_samples_new, n_features)
        The array containing the imbalanced data.

    y_resampled : ndarray of shape (n_samples_new,)
        The corresponding label of `X_resampled`.

    Raises
    ------
    ValueError
        If ``sampling_strategy`` is neither a ``dict`` nor a callable.

    Notes
    -----
    See
    :ref:`sphx_glr_auto_examples_applications_plot_multi_class_under_sampling.py`,
    :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py`, and
    :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import load_iris
    >>> from imblearn.datasets import make_imbalance

    >>> data = load_iris()
    >>> X, y = data.data, data.target
    >>> print('Distribution before imbalancing: {}'.format(Counter(y)))
    Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
    >>> X_res, y_res = make_imbalance(X, y,
    ...                               sampling_strategy={0: 10, 1: 20, 2: 30},
    ...                               random_state=42)
    >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
    Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
    """
    target_stats = get_classes_counts(y)

    # Only a dict or a callable is accepted here; other kinds of
    # ``sampling_strategy`` (including the default ``None``) are rejected.
    if not (isinstance(sampling_strategy, dict) or callable(sampling_strategy)):
        raise ValueError(
            "'sampling_strategy' has to be a dictionary or a "
            "function returning a dictionary. Got {} instead.".format(
                type(sampling_strategy)
            )
        )

    sampling_strategy_ = check_sampling_strategy(
        sampling_strategy, target_stats, "under-sampling", **kwargs
    )

    if verbose:
        # ``print`` does not interpolate logging-style "%s" placeholders, so
        # the message is formatted explicitly before being printed.
        print(
            "The original target distribution in the dataset is: {}".format(
                target_stats
            )
        )

    rus = RandomUnderSampler(
        sampling_strategy=sampling_strategy_,
        replacement=False,
        random_state=random_state,
    )
    X_resampled, y_resampled = rus.fit_resample(X, y)

    if verbose:
        print(
            "Make the dataset imbalanced: {}".format(Counter(y_resampled))
        )

    return X_resampled, y_resampled