import matplotlib.pyplot as plt
import numpy
from carl.distributions import Join
from carl.distributions import Mixture
from carl.distributions import Normal
from carl.distributions import Exponential
from carl.distributions import LinearTransform
from carl.ratios import ClassifierRatio
from sklearn.datasets import make_sparse_spd_matrix
from sklearn.base import clone
from sklearn.utils import check_random_state
import corner
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
hist_settings = {'bins': 50, 'density': True, 'alpha': 0.3}
# a helper function for plotting the reweighted data
def draw_distributions(original, target, original_weights, target_weights=None):
"""
Plot weighted distributions for original and target samples.
:param numpy.ndarray original: original samples
:param numpy.ndarray target: target samples
:param numpy.array original_weights: weights for original samples
:param numpy.array target_weights: weights for target samples
"""
if target_weights is None:
target_weights = numpy.ones(target.shape[0])
columns = range(original.shape[1])
if len(columns) > 3:
plt.figure(figsize=(16, 8))
else:
plt.figure(figsize=(16, 4))
for ind, column in enumerate(columns, 1):
        xlim1 = numpy.percentile(target[:, column], [0.01, 99.99])
        xlim2 = numpy.percentile(original[:, column], [0.01, 99.99])
xlim = (min(xlim1[0], xlim2[0]), max(xlim1[1], xlim2[1]))
if len(columns) > 3:
            plt.subplot(2, int(numpy.ceil(len(columns) / 2.)), ind)
else:
plt.subplot(1, len(columns), ind)
plt.hist(original[:, column], weights=original_weights, label='original',
range=xlim, color='b', **hist_settings)
plt.hist(target[:, column], range=xlim, weights=target_weights, color='orange', label='target',
**hist_settings)
        plt.xlabel('X%d' % column)
plt.legend(loc='best')
plt.show()
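# Example usage (a minimal sketch; the toy samples and names below are illustrative):
#
#     rng = numpy.random.RandomState(0)
#     sample_a = rng.normal(0.0, 1.0, size=(1000, 2))
#     sample_b = rng.normal(0.5, 1.0, size=(1000, 2))
#     draw_distributions(sample_a, sample_b, numpy.ones(len(sample_a)))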
def generate_samples(with_linear_transformation=False, add_variation=False, n_samples=50000, verbose=True):
"""
    Generate 5 independent variables: two Gaussians, a mixture of Gaussians, and two exponentials.
    The two Gaussians have different means for the original and target distributions.
    If with_linear_transformation is True, a linear transformation is also applied to the 5 generated variables.
    If add_variation is True, random values are added to the variances so that the Gaussian pdfs
    of the original and target samples differ not only in mean but also in variance.
    :param bool with_linear_transformation: whether to apply a linear transformation to the sample features
    :param bool add_variation: whether to use different variances for the Gaussian distributions of the original and target samples
    :param int n_samples: number of generated samples for the original/target distributions; 2*n_samples samples are generated for testing
:param bool verbose: print and plot additional info during generation.
:return: train original, train target, exact weights for train original, test original, test target, exact weights for test original
"""
# define linear transformation matrix
R = make_sparse_spd_matrix(5, alpha=0.5, random_state=7)
variation_origin, variation_target = (0, 0)
if add_variation:
r = check_random_state(42)
variation_origin, variation_target = r.uniform() / 3., r.uniform() / 3.
p0 = Join(components=[
Normal(mu=.5, sigma=1 + variation_origin),
Normal(mu=-.5, sigma=3 + variation_origin),
Mixture(components=[Normal(mu=-2, sigma=1),
Normal(mu=2, sigma=0.5)]),
Exponential(inverse_scale=3.0),
Exponential(inverse_scale=0.5)])
p1 = Join(components=[
Normal(mu=0, sigma=1 + variation_target),
Normal(mu=0, sigma=3 + variation_target),
Mixture(components=[Normal(mu=-2, sigma=1),
Normal(mu=2, sigma=0.5)]),
Exponential(inverse_scale=3.0),
Exponential(inverse_scale=0.5)])
if with_linear_transformation:
p0 = LinearTransform(p0, R)
p1 = LinearTransform(p1, R)
X0 = p0.rvs(n_samples, random_state=777)
X1 = p1.rvs(n_samples, random_state=777)
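    # exact weight = p1(x) / p0(x), since nll is the negative log-likelihood:
    # exp(nll_0(x) - nll_1(x)) = exp(log p1(x) - log p0(x))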
exact_weights = numpy.exp(p0.nll(X0) - p1.nll(X0))
exact_weights[numpy.isinf(exact_weights)] = 0.
# generate samples to test reweighting rule (to avoid overfitting)
X0_roc = p0.rvs(2 * n_samples, random_state=777 * 2)
X1_roc = p1.rvs(2 * n_samples, random_state=777 * 2)
# Weighted with true ratios
exact_weights_roc = numpy.exp(p0.nll(X0_roc) - p1.nll(X0_roc))
exact_weights_roc[numpy.isinf(exact_weights_roc)] = 0.
if verbose:
print "Original distribution"
fig = corner.corner(X0, bins=20, smooth=0.85, labels=["X0", "X1", "X2", "X3", "X4"])
plt.show()
print "Target distribution"
fig = corner.corner(X1, bins=20, smooth=0.85, labels=["X0", "X1", "X2", "X3", "X4"])
plt.show()
print "Exact reweighting"
# In this example, we know p0(x) and p1(x) exactly,
#so we can compare the other can compare the approximate reweighting approaches with the exact weights.
draw_distributions(X0, X1, exact_weights)
return X0, X1, exact_weights, X0_roc, X1_roc, exact_weights_roc
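# Example usage (a minimal sketch; assumes the `carl` and `corner` packages
# imported above are installed; variable names are illustrative):
#
#     X0, X1, w, X0_test, X1_test, w_test = generate_samples(
#         with_linear_transformation=True, n_samples=10000, verbose=False)
#     draw_distributions(X0, X1, w)  # original reweighted by exact weights vs target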
def generate_samples_for_blow_up_demo(n_samples=50000):
"""
    Generate 3 independent Gaussian variables and apply a linear transformation to them.
    The Gaussians have different means and different sigmas for the target and original distributions.
    This is an example of samples with regions that contain many target samples but zero original
    samples; in such regions the exact reweighting rule blows up, and the same happens for the algorithms.
    :param int n_samples: number of generated samples for the original/target distributions; 2*n_samples samples are generated for testing
:return: train original, train target, exact weights for train original, test original, test target, exact weights for test original
"""
p0 = Join(components=[
Normal(mu=1, sigma=0.7),
Normal(mu=-1, sigma=0.7),
Normal(mu=1, sigma=1.5)])
p1 = Join(components=[
Normal(mu=0, sigma=0.7),
Normal(mu=0, sigma=0.7),
Normal(mu=0, sigma=1.5)])
R = make_sparse_spd_matrix(3, alpha=0.5, random_state=7)
p0 = LinearTransform(p0, R)
p1 = LinearTransform(p1, R)
X0 = p0.rvs(n_samples, random_state=777)
X1 = p1.rvs(n_samples, random_state=777)
exact_weights = numpy.exp(p0.nll(X0) - p1.nll(X0))
exact_weights[numpy.isinf(exact_weights)] = 1.
# generate samples to test reweighting rule (to avoid overfitting)
X0_roc = p0.rvs(2 * n_samples, random_state=777 * 2)
X1_roc = p1.rvs(2 * n_samples, random_state=777 * 2)
# Weighted with true ratios
exact_weights_roc = numpy.exp(p0.nll(X0_roc) - p1.nll(X0_roc))
exact_weights_roc[numpy.isinf(exact_weights_roc)] = 1.
draw_distributions(X0, X1, numpy.ones(len(X0)))
print "Exact weights are used (inf weights are set to 1)"
draw_distributions(X0, X1, exact_weights)
return X0, X1, exact_weights, X0_roc, X1_roc, exact_weights_roc
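# Example usage (a sketch): inspect the heavy tail of the exact weights that
# causes the blow-up (regions with many target samples but almost no original ones):
#
#     X0, X1, w, _, _, _ = generate_samples_for_blow_up_demo(n_samples=10000)
#     print("largest exact weights:", numpy.sort(w)[-5:])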
def check_reweighting_by_ML_gb(original, target, original_weights, target_weights=None, n_iterations=1):
"""
    Compare multidimensional distributions after reweighting using a gradient boosting classifier.
    If the reweighting is perfect, the classifier cannot distinguish the samples and the ROC AUC score will be 0.5.
    For an sPlot target, where target weights can be negative, we move samples with negative weights into the original sample with positive weights.
:param numpy.ndarray original: original samples
:param numpy.ndarray target: target samples
:param numpy.array original_weights: weights for original samples
:param numpy.array target_weights: weights for target samples
    :param int n_iterations: number of bootstrap iterations (randomly split into train/test, compute the AUC on the test part, and repeat this n_iterations times)
:return: fpr, tpr, mean auc, std auc
"""
if target_weights is None:
target_weights = numpy.ones(target.shape[0])
aucs = []
data = numpy.concatenate([original, target])
labels = numpy.array([0] * original.shape[0] + [1] * target.shape[0])
W = numpy.concatenate([original_weights, target_weights])
for _ in range(n_iterations):
Xtr, Xts, Ytr, Yts, Wtr, Wts = train_test_split(data, labels, W, train_size=0.51)
original_w = Wtr[Ytr == 0]
target_w = Wtr[Ytr == 1]
original_w /= numpy.sum(original_w)
target_w /= numpy.sum(target_w)
original_tr = Xtr[Ytr == 0]
target_tr = Xtr[Ytr == 1]
        # move target events with negative weights into the original sample with flipped (positive) weights
data_neg = target_tr[target_w < 0]
weights_neg = -target_w[target_w < 0]
original_tr = numpy.concatenate((original_tr, data_neg))
original_w = numpy.concatenate((original_w, weights_neg))
target_tr = target_tr[target_w >= 0]
target_w = target_w[target_w >= 0]
Xtr = numpy.concatenate([original_tr, target_tr])
Ytr = numpy.array([0] * original_tr.shape[0] + [1] * target_tr.shape[0])
Wtr = numpy.concatenate([original_w, target_w])
clf = GradientBoostingClassifier(n_estimators=200, subsample=0.5,
min_samples_leaf=100, learning_rate=0.1,
max_depth=6).fit(Xtr, Ytr, sample_weight=Wtr)
proba = clf.predict_proba(Xts)[:, 1]
aucs.append(roc_auc_score(Yts, proba, sample_weight=Wts))
fpr, tpr, _ = roc_curve(Yts, proba, sample_weight=Wts)
return fpr, tpr, numpy.mean(aucs), numpy.std(aucs)
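# Example usage (a minimal sketch; `X0_test`, `X1_test` and `weights_test` stand
# for test samples and estimated weights, e.g. from `generate_samples`). An AUC
# close to 0.5 means the classifier cannot separate the reweighted original
# sample from the target, i.e. the reweighting is good:
#
#     fpr, tpr, auc, auc_std = check_reweighting_by_ML_gb(
#         X0_test, X1_test, weights_test, n_iterations=3)
#     plot_roc(fpr, tpr, auc, auc_std, name="after reweighting")
#     plt.show()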
def plot_roc(fpr, tpr, auc, auc_std, name=""):
"""
    Plot a ROC curve.
"""
    plt.plot(fpr, tpr, label=name + "\nAUC=%.3f$\\pm$%.3f" % (auc, auc_std))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Discriminator trained with weights')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.tight_layout()
def plot_scatter_weights(exact_weights, weights, title=""):
"""
    Scatter plot of exact weights versus reconstructed weights.
"""
    w1 = exact_weights / numpy.mean(exact_weights)
    w2 = weights / numpy.mean(weights)
plt.scatter(w1, w2, alpha=0.1)
plt.plot([0, 10], [0, 10], lw=2, c='r')
plt.xlim(0, 10)
plt.ylim(0, 10)
plt.xlabel('exact weights')
plt.ylabel('estimated weights')
plt.title(title)
def run_verbose_info(original_test, target_test, weights_test, exact_weights_test, cv_val=3):
draw_distributions(original_test, target_test, weights_test)
plt.show()
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.hist(weights_test, bins=numpy.exp(numpy.linspace(-4, 3, 50)), alpha=0.2, label="estimated")
    plt.hist(exact_weights_test, bins=numpy.exp(numpy.linspace(-4, 3, 50)), alpha=0.2, label="exact")
    plt.semilogx()
    plt.legend()
    plt.xlabel("weight")
    plt.subplot(1, 2, 2)
fpr, tpr, auc, auc_std = check_reweighting_by_ML_gb(original_test, target_test,
weights_test, n_iterations=cv_val)
plot_roc(fpr, tpr, auc, auc_std, 'with weights')
    fpr, tpr, auc, auc_std = check_reweighting_by_ML_gb(original_test, target_test,
                                                        exact_weights_test, n_iterations=cv_val)
plot_roc(fpr, tpr, auc, auc_std, 'with exact weights')
plt.show()
def reconstruct_ratio_using_estimated_pdfs(classifier, classifier_parameters, cv_val=3, with_linear_transformation=False,
add_variation=False, n_samples=50000, verbose=True, inverse_weights=False,
test_by_ML_GB=False):
"""
    Reconstruct weights with discriminative classifiers (calibrated and non-calibrated)
    from `carl` on the generated samples defined by the function `generate_samples`.
"""
original, target, exact_weights, original_test, target_test, exact_weights_test = \
generate_samples(with_linear_transformation=with_linear_transformation,
add_variation=add_variation, n_samples=n_samples, verbose=verbose)
predicted_weights = []
for params in classifier_parameters:
if verbose:
print "Used parameters ", params
classifier_clone = clone(classifier)
classifier_clone.set_params(**params)
ratio = ClassifierRatio(base_estimator=classifier_clone, random_state=42)
        # reformat original and target samples into training data
X = numpy.vstack((original, target))
y = numpy.array([1] * original.shape[0] + [0] * target.shape[0])
        # fit the ratio
ratio.fit(X, y)
carl_weights_test = ratio.predict(original_test, log=False)
carl_weights_test[numpy.isinf(carl_weights_test)] = 0.
predicted_weights.append(carl_weights_test)
# plot 1d distribution for test sample
if verbose:
run_verbose_info(original_test, target_test, carl_weights_test, exact_weights_test, cv_val=cv_val)
regime = [False]
if inverse_weights:
regime = [True, False]
for inverse in regime:
plt.figure(figsize=(len(classifier_parameters) * 5, 4))
m = len(predicted_weights)
for n, (weights, params) in enumerate(zip(predicted_weights, classifier_parameters)):
plt.subplot(1, m, n + 1)
if inverse:
plot_scatter_weights(1. / exact_weights_test, 1. / weights, title="Inverse weights for\n" + str(params))
else:
plot_scatter_weights(exact_weights_test, weights, title="Weights for\n" + str(params))
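# Example usage (a minimal sketch; the classifier and parameter grid are illustrative):
#
#     reconstruct_ratio_using_estimated_pdfs(
#         GradientBoostingClassifier(),
#         classifier_parameters=[{'n_estimators': 100, 'max_depth': 3},
#                                {'n_estimators': 200, 'max_depth': 6}],
#         n_samples=10000, verbose=False)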
def GBreweighter_fit(reweighter, reweighter_parameters, original, target,
                     exact_weights, original_test, target_test, exact_weights_test,
                     verbose=False, print_weights=False, cv_val=3):
predicted_weights = []
for params in reweighter_parameters:
if verbose:
print "Used parameters ", params
reweighter_clone = clone(reweighter)
reweighter_clone.set_params(**params)
reweighter_clone.fit(original, target, numpy.ones(len(original)))
gb_weights_test = reweighter_clone.predict_weights(original_test)
predicted_weights.append(gb_weights_test)
# plot 1d distribution for test sample
if verbose:
run_verbose_info(original_test, target_test, gb_weights_test, exact_weights_test, cv_val=cv_val)
if print_weights:
print "weights ", numpy.sort(gb_weights_test)
# plot 1d distribution for test sample
draw_distributions(original_test, target_test, gb_weights_test)
plt.figure(figsize=(len(reweighter_parameters) * 5, 4))
m = len(predicted_weights)
for n, (weights, params) in enumerate(zip(predicted_weights, reweighter_parameters)):
plt.subplot(1, m, n + 1)
plot_scatter_weights(exact_weights_test, weights, title="Weights for\n" + str(params))
def reconstruct_weights_by_GBreweighter(reweighter, reweighter_parameters, with_linear_transformation=False,
add_variation=False, n_samples=50000, verbose=True):
"""
Reconstruct weights by BDT reweighter from `hep_ml` on the generated
samples defined by function `generate_samples`
"""
original, target, exact_weights, original_test, target_test, exact_weights_test = \
generate_samples(with_linear_transformation=with_linear_transformation,
add_variation=add_variation, n_samples=n_samples, verbose=verbose)
GBreweighter_fit(reweighter, reweighter_parameters, original, target,
exact_weights, original_test, target_test, exact_weights_test,
verbose=verbose, print_weights=False)
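# Example usage (a minimal sketch; assumes `hep_ml` is installed and that its
# `GBReweighter` provides the `set_params`/`fit`/`predict_weights` API used in
# `GBreweighter_fit` above):
#
#     from hep_ml.reweight import GBReweighter
#     reconstruct_weights_by_GBreweighter(
#         GBReweighter(),
#         reweighter_parameters=[{'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}],
#         n_samples=10000, verbose=False)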
def reconstruct_weights_by_GBreweighter_with_blow_up(reweighter, reweighter_parameters, n_samples=50000):
"""
Reconstruct weights by BDT reweighter from `hep_ml` on the generated
samples defined by function `generate_samples_for_blow_up_demo`
"""
original, target, exact_weights, original_test, target_test, exact_weights_test = \
        generate_samples_for_blow_up_demo(n_samples=n_samples)
GBreweighter_fit(reweighter, reweighter_parameters, original, target,
exact_weights, original_test, target_test, exact_weights_test,
verbose=False, print_weights=True)
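# Example usage (a minimal sketch, mirroring the call above; `hep_ml` assumed installed):
#
#     from hep_ml.reweight import GBReweighter
#     reconstruct_weights_by_GBreweighter_with_blow_up(
#         GBReweighter(),
#         reweighter_parameters=[{'n_estimators': 50, 'max_depth': 4}])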