-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy path_cross_validation.py
111 lines (88 loc) · 3.72 KB
/
_cross_validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
class InstanceHardnessCV:
    """Instance-hardness CV splitter.

    CV splitter that distributes samples with large instance hardness equally
    over the folds, so every fold receives a comparable share of
    hard-to-classify samples of each class.

    Read more in the :ref:`User Guide <instance_hardness_threshold>`.

    Parameters
    ----------
    estimator : estimator object, default=None
        Classifier to be used to estimate instance hardness of the samples.
        This classifier should implement `predict_proba`. If `None`, a
        ``RandomForestClassifier(n_jobs=-1, class_weight="balanced")`` is
        used.

    n_splits : int, default=5
        Number of folds. Must be at least 2.

    random_state : int, RandomState instance, default=None
        Determines random_state for reproducible results across multiple calls.
        Only used to seed the default classifier constructed when
        `estimator` is `None`.

    Examples
    --------
    >>> from imblearn.cross_validation import InstanceHardnessCV
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import cross_validate
    >>> from sklearn.linear_model import LogisticRegression
    >>> X, y = make_classification(weights=[0.9, 0.1], class_sep=2,
    ... n_informative=3, n_redundant=1, flip_y=0.05, n_samples=1000, random_state=10)
    >>> estimator = LogisticRegression(random_state=10)
    >>> ih_cv = InstanceHardnessCV(estimator=estimator, n_splits=5,random_state=10)
    >>> cv_result = cross_validate(estimator, X, y, cv=ih_cv)
    >>> print(f"Standard deviation of test_scores: {cv_result['test_score'].std():.3f}")
    Standard deviation of test_scores: 0.004
    """

    def __init__(self, estimator=None, n_splits=5, random_state=None):
        self.n_splits = n_splits
        self.estimator = estimator
        self.random_state = random_state

    def split(self, X, y, groups=None):
        """
        Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples,)
            The target variable.

        groups : object
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        if self.estimator is not None:
            self.estimator_ = self.estimator
        else:
            # No estimator supplied: fall back to a balanced random forest so
            # hardness estimates are not dominated by the majority class.
            self.estimator_ = RandomForestClassifier(
                n_jobs=-1, class_weight="balanced", random_state=self.random_state
            )
        # Out-of-fold probabilities: each sample is scored by a model that was
        # not trained on it, giving an unbiased hardness estimate.
        probas = cross_val_predict(
            self.estimator_, X, y, cv=self.n_splits, method="predict_proba"
        )
        # By sorting first on y then on proba, rows are ordered by instance
        # hardness within the group having the same label.
        # NOTE(review): probas[:, 1] assumes a binary problem — confirm this
        # splitter is only used with two-class targets.
        sorted_indices = np.lexsort((probas[:, 1], y))
        # Deal fold ids round-robin over the hardness-sorted order so hard
        # samples are spread evenly across the folds. (Local renamed from
        # `groups` to avoid shadowing the ignored `groups` parameter.)
        fold_ids = np.zeros(len(X), dtype=int)
        fold_ids[sorted_indices] = np.arange(len(X)) % self.n_splits
        cv = LeaveOneGroupOut()
        for train_index, test_index in cv.split(X, y, fold_ids):
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """
        Returns the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits