-
Notifications
You must be signed in to change notification settings - Fork 1.3k
[WIP] ENH: Class Sensitive Scaling #416
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
f612e09
2658dc7
1243bfc
d9d4410
b1afd23
8cbc0eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
""" | ||
The :mod:`imblearn.over_sampling` provides a set of method to | ||
perform over-sampling. | ||
""" | ||
|
||
from .css import CSS | ||
|
||
__all__ = ['CSS'] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
""" | ||
Base class for the over-sampling method. | ||
""" | ||
# Authors: Bernhard Schlegel <[email protected]> | ||
# License: MIT | ||
|
||
|
||
from ..base import BaseSampler | ||
|
||
|
||
class BaseScaler(BaseSampler): | ||
"""Base class for over-sampling algorithms. | ||
|
||
Warning: This class should not be used directly. Use the derive classes | ||
instead. | ||
""" | ||
|
||
_sampling_type = 'scaling' |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,249 @@ | ||
"""Class to perform sample scaling using class specific scaling (CSS).""" | ||
# Authors: Bernhard Schlegel <[email protected]> | ||
# License: MIT | ||
|
||
|
||
from __future__ import division, print_function | ||
from collections import Counter | ||
import random | ||
import numpy as np | ||
from .base import BaseScaler | ||
|
||
CSS_MODE = ('linear', 'constant') | ||
CSS_TARGET = ('minority', 'majority', 'both') | ||
|
||
|
||
class CSS(BaseScaler):
    """Class to perform sample scaling using class specific scaling (CSS).

    Samples of the targeted class(es) are moved towards their class
    specific center (the feature-wise mean of the class), which compacts
    each class.

    Parameters
    ----------
    mode : str (default = 'linear')
        Defines the scaling mode. Currently, two modes are implemented:
        `'constant'` and `'linear'`.

        In `'constant'` mode, all samples of the `target` class will be
        scaled by the same amount `c` to their class specific center. The
        following formula will be applied to calculate the new feature
        (`X`) values: `X * (1 - c) + class_means * c`

        In `'linear'` mode, all samples will be scaled in dependence on
        their distance and `c` to their class specific center. Samples
        that are one standard deviation away from the class center will
        be scaled with `c`. The following formula will be applied:
        `X * (1 - c * distances) + class_means * (c * distances)`

    target : str (default = 'minority')
        Defines which class to scale. Possible values are 'minority',
        'majority', and 'both'. Note that all samples are scaled to their
        corresponding class center.

    c : float (default = 0.25)
        Defines the amount of the scaling. Must lie in (0, 1].

    minority_class_value : int (default = None)
        Class label indicating the minority class. By default (`None`)
        the minority class will be automatically determined. Use any
        integer number (e.g. `0`, `1` or `-1`) to force the minority
        class.

    shuffle : Boolean (default = True)
        If True, results will be shuffled.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random. Only used when ``shuffle=True``.

    Examples
    --------

    >>> import numpy as np
    >>> from sklearn.utils import shuffle
    >>> from imblearn.scaling import CSS

    >>> rng = np.random.RandomState(42)
    >>> n_samples_1 = 50
    >>> n_samples_2 = 5
    >>> X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
    ...               0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
    >>> y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
    >>> X_syn, y_syn = shuffle(X_syn, y_syn)
    >>> css = CSS(mode="linear", target="both", c=0.1, shuffle=True)
    >>> X_train_res, y_train_res = css.fit_sample(X_syn, y_syn)

    References
    ----------
    .. [1] B. Schlegel, and B. Sick. "Dealing with class imbalance the
       scalable way: Evaluation of various techniques based on
       classification grade and computational complexity." 2017 IEEE
       International Conference on Data Mining Workshops, 2017.
    """

    def __init__(self,
                 mode='linear',
                 target='minority',
                 c=0.25,
                 minority_class_value=None,
                 shuffle=True,
                 random_state=None):
        super(CSS, self).__init__(ratio=1)
        self.mode = mode
        self.target = target
        self.c = c
        self.minority_class_value = minority_class_value
        self.shuffle = shuffle
        # BUG FIX: the original accepted ``random_state`` but never stored
        # it, so the shuffling was unseeded and irreproducible.
        self.random_state = random_state

    def _validate_estimator(self):
        """Check that the hyper-parameters are consistent.

        Raises
        ------
        ValueError
            If ``mode``, ``target``, ``c`` or ``minority_class_value``
            is invalid.
        """
        if self.mode not in CSS_MODE:
            raise ValueError('Unknown kind for CSS mode.'
                             " Choices are {}. Got '{}' instead.".format(
                                 CSS_MODE, self.mode))

        if self.target not in CSS_TARGET:
            raise ValueError('Unknown kind for CSS target.'
                             " Choices are {}. Got '{}' instead.".format(
                                 CSS_TARGET, self.target))

        if self.c < 0 or self.c > 1:
            raise ValueError('Received scaling factor c={}, which'
                             ' is outside the allowed range '
                             '(0-1].'.format(self.c))
        # BUG FIX: the original used ``self.c is 0`` — an identity test
        # that is not a reliable equality check for numbers.
        if self.c == 0:
            raise ValueError('Received scaling factor c={}, which is'
                             ' equal to no scaling at all.'.format(self.c))

        # Parentheses instead of a line-continuation backslash.
        if (self.minority_class_value is not None and
                not isinstance(self.minority_class_value, int)):
            # BUG FIX: the original message contained a '{}' placeholder
            # but never called .format() with the offending value.
            raise ValueError("Unallowed target class value '{}'."
                             ' Valid values include None to automatically'
                             ' infer the target class or any integer number'
                             ' corresponding to the value of the label'
                             ' in y'.format(self.minority_class_value))

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be scaled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        super(CSS, self).fit(X, y)

        # Fail fast: reject inconsistent hyper-parameters at fit time
        # instead of waiting until sampling.
        self._validate_estimator()

        return self

    def _shuffleTwo(self, a, b):
        """Shuffle two equally-long arrays in unison.

        Returns the shuffled arrays together with the permutation used,
        drawn from ``self.random_state`` for reproducibility.
        """
        # BUG FIX: the original used the unseeded global ``random``
        # module, ignoring the ``random_state`` parameter entirely.
        random_state = self.random_state
        if not isinstance(random_state, np.random.RandomState):
            random_state = np.random.RandomState(random_state)
        indexes = random_state.permutation(len(a))
        return a[indexes], b[indexes], indexes

    def _scale_towards_center(self, X_class):
        """Scale the samples of one class towards their feature-wise mean.

        Applies the 'constant' or 'linear' formula depending on
        ``self.mode`` (validated beforehand).
        """
        center = np.mean(X_class, axis=0)
        if self.mode == 'constant':
            # Every sample moves the same relative amount ``c``.
            return X_class * (1 - self.c) + center * self.c
        # 'linear' mode: the further a sample is from the center, the
        # stronger it is pulled towards it.
        distances = np.abs(X_class - center)
        return (X_class * (1 - self.c * distances) +
                center * (self.c * distances))

    def _sample(self, X, y):
        """Scale the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_scaled : ndarray, shape (n_samples, n_features)
            The array containing the resampled data.

        y_scaled : ndarray, shape (n_samples)
            The corresponding label of `X_scaled`

        """
        # Re-validate in case ``_sample`` is reached without ``fit``
        # having run the checks (e.g. parameters mutated after fit).
        self._validate_estimator()

        minority_class = self.minority_class_value
        if minority_class is None:
            # Infer the minority class as the least frequent label.
            target_stats = Counter(y)
            minority_class = target_stats.most_common()[-1][0]

        # NOTE(review): multiclass input is handled 1-vs-all — every
        # label different from the minority class is treated as majority.
        minority_mask = (y == minority_class)
        majority_mask = ~minority_mask

        # BUG FIX: the original compared strings with ``is`` which tests
        # identity, not equality, and only works through CPython string
        # interning. Use membership/equality tests instead.
        scale_majority = self.target in ('majority', 'both')
        scale_minority = self.target in ('minority', 'both')

        if scale_majority:
            X_scaled_majority = self._scale_towards_center(X[majority_mask])
        if scale_minority:
            X_scaled_minority = self._scale_towards_center(X[minority_mask])

        # Merge scaled and non-scaled parts; order is always
        # (majority, minority) to match ``y_assembled`` below.
        if self.target == 'majority':
            X_scaled = np.concatenate(
                [X_scaled_majority, X[minority_mask]], axis=0)
        elif self.target == 'minority':
            X_scaled = np.concatenate(
                [X[majority_mask], X_scaled_minority], axis=0)
        else:  # 'both'
            X_scaled = np.concatenate(
                [X_scaled_majority, X_scaled_minority], axis=0)

        # Make sure that y is in the same order as X_scaled.
        y_assembled = np.concatenate(
            [y[majority_mask], y[minority_mask]], axis=0)

        if self.shuffle:
            # Only pay the shuffling cost when it is actually requested
            # (the original shuffled unconditionally and then discarded
            # the result when ``shuffle=False``).
            X_shuffled, y_shuffled, _ = self._shuffleTwo(
                X_scaled, y_assembled)
            return X_shuffled, y_shuffled
        return X_scaled, y_assembled
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not a good idea. We can have label which are string and it will not work
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would not bother about this parameter