|
2 | 2 | closuretest/closure_results.py
|
3 | 3 |
|
4 | 4 | Module containing actiosn to calculate sigle closure test estimators.
|
5 |
| -This is useful for quickly checking the |
6 |
| -bias of a fit without having to run the full multiclosure analysis. |
| 5 | +This is useful for quickly checking the bias of a fit without having |
| 6 | +to run the full multiclosure analysis. |
7 | 7 |
|
8 | 8 | """
|
9 |
| -from collections import namedtuple |
10 | 9 |
|
11 | 10 | import numpy as np
|
12 | 11 | import pandas as pd
|
13 | 12 |
|
14 | 13 | from reportengine import collect
|
15 | 14 | from reportengine.table import table
|
16 |
| -from validphys.calcutils import bootstrap_values, calc_chi2 |
17 |
| -from validphys.checks import check_pdf_is_montecarlo |
18 | 15 | from validphys.closuretest.closure_checks import (
|
19 | 16 | check_fit_isclosure,
|
20 | 17 | check_fits_areclosures,
|
|
23 | 20 | check_use_fitcommondata,
|
24 | 21 | )
|
25 | 22 |
|
26 |
| -BiasData = namedtuple("BiasData", ("bias", "ndata")) |
27 | 23 |
|
28 |
| -underlying_results = collect("results", ("fitunderlyinglaw",)) |
29 |
| - |
30 |
| - |
31 |
| -@check_fit_isclosure |
32 |
| -@check_use_fitcommondata |
33 |
| -def bias_dataset(results, underlying_results, fit, use_fitcommondata): |
34 |
| - """Calculate the bias for a given dataset and fit. The bias is defined as |
35 |
| - chi2 between the prediction from the underlying PDF (which was used to |
36 |
| - generate the closure pseudodata), also known as level zero closure data, and |
37 |
| - the central prediction calculated from the fitted PDF. |
38 |
| -
|
39 |
| - we require that use_fitcommondata is true because the generated closure data |
40 |
| - is used to generate the multiplicative contributions to the covariance |
41 |
| - matrix |
42 |
| - """ |
43 |
| - dt_ct, th_ct = results |
44 |
| - # does collect need to collect a list even with one element? |
45 |
| - ((_, th_ul),) = underlying_results |
46 |
| - central_diff = th_ct.central_value - th_ul.central_value |
47 |
| - bias_out = calc_chi2(dt_ct.sqrtcovmat, central_diff) # unnormalised |
48 |
| - return BiasData(bias_out, len(th_ct)) |
49 |
| - |
50 |
| - |
51 |
| -underlying_dataset_inputs_results = collect("dataset_inputs_results", ("fitunderlyinglaw",)) |
52 |
| - |
53 |
| - |
54 |
| -@check_fit_isclosure |
55 |
| -@check_use_fitcommondata |
56 |
| -def bias_experiment( |
57 |
| - dataset_inputs_results, underlying_dataset_inputs_results, fit, use_fitcommondata |
58 |
| -): |
59 |
| - """Like `bias_dataset` but for a whole experiment.""" |
60 |
| - return bias_dataset( |
61 |
| - dataset_inputs_results, |
62 |
| - underlying_dataset_inputs_results, |
63 |
| - fit, |
64 |
| - use_fitcommondata, |
65 |
| - ) |
66 |
| - |
67 |
| - |
68 |
| -experiments_bias = collect("bias_experiment", ("group_dataset_inputs_by_experiment",)) |
69 |
| -fits_experiments_bias = collect("experiments_bias", ("fits", "fitcontext")) |
70 | 24 | fits_experiments = collect("experiments_data", ("fits", "fitcontext"))
|
71 | 25 |
|
72 | 26 |
|
73 |
| -@table |
74 |
| -@check_fits_same_filterseed |
75 |
| -@check_fits_underlying_law_match |
76 |
| -def biases_table(fits_experiments, fits_experiments_bias, fits, show_total: bool = False): |
77 |
| - """Creates a table with fits as the columns and the experiments from both |
78 |
| - fits as the row index. |
79 |
| - """ |
80 |
| - col = ["bias"] |
81 |
| - dfs = [] |
82 |
| - for fit, experiments, biases in zip(fits, fits_experiments, fits_experiments_bias): |
83 |
| - total = 0 |
84 |
| - total_points = 0 |
85 |
| - records = [] |
86 |
| - for biasres, experiment in zip(biases, experiments): |
87 |
| - records.append( |
88 |
| - dict( |
89 |
| - experiment=str(experiment), |
90 |
| - bias=biasres.bias / biasres.ndata, # normalised bias |
91 |
| - ) |
92 |
| - ) |
93 |
| - if show_total: |
94 |
| - total += biasres.bias |
95 |
| - total_points += biasres.ndata |
96 |
| - if show_total: |
97 |
| - total /= total_points |
98 |
| - records.append(dict(experiment="Total", bias=total)) |
99 |
| - |
100 |
| - df = pd.DataFrame.from_records( |
101 |
| - records, columns=("experiment", "bias"), index=("experiment") |
102 |
| - ) |
103 |
| - df.columns = pd.MultiIndex.from_product(([str(fit)], col)) |
104 |
| - dfs.append(df) |
105 |
| - return pd.concat(dfs, axis=1, sort=True) |
106 |
| - |
107 |
| - |
108 |
| -@check_pdf_is_montecarlo |
109 |
| -@check_fit_isclosure |
110 |
| -def bootstrap_bias_experiment( |
111 |
| - dataset_inputs_results, underlying_dataset_inputs_results, bootstrap_samples=500 |
112 |
| -): |
113 |
| - """Calculates bias as per `bias_experiment` but performs bootstrap sample |
114 |
| - across replicas. note that bias_experiment returns a named tuple like |
115 |
| - (unnormalised_bias, ndata) whereas this actions simply returns an array |
116 |
| - `boostrap_bias` with length bootstrap_samples. Each element of |
117 |
| - returned array is bias/n_data (bias normalised by number of datapoints) |
118 |
| - """ |
119 |
| - dt_ct, th_ct = dataset_inputs_results |
120 |
| - ((_, th_ul),) = underlying_dataset_inputs_results |
121 |
| - th_ct_boot_cv = bootstrap_values(th_ct.error_members, bootstrap_samples) |
122 |
| - boot_diffs = th_ct_boot_cv - th_ul.central_value[:, np.newaxis] |
123 |
| - boot_bias = calc_chi2(dt_ct.sqrtcovmat, boot_diffs) / len(dt_ct) |
124 |
| - return boot_bias |
125 |
| - |
126 |
| - |
127 |
| -experiments_bootstrap_bias = collect( |
128 |
| - "bootstrap_bias_experiment", ("group_dataset_inputs_by_experiment",) |
129 |
| -) |
130 |
| -fits_experiments_bootstrap_bias = collect("experiments_bootstrap_bias", ("fits", "fitcontext")) |
131 |
| - |
132 |
| - |
133 |
| -@table |
134 |
| -@check_use_fitcommondata |
135 |
| -@check_fits_areclosures |
136 |
| -@check_fits_same_filterseed |
137 |
| -@check_fits_underlying_law_match |
138 |
| -def fits_bootstrap_bias_table( |
139 |
| - fits_experiments_bootstrap_bias, |
140 |
| - fits_name_with_covmat_label, |
141 |
| - fits_experiments, |
142 |
| - fits, |
143 |
| - use_fitcommondata, |
144 |
| -): |
145 |
| - """Produce a table with bias for each experiment for each fit, along with |
146 |
| - variance calculated from doing a bootstrap sample |
147 |
| - """ |
148 |
| - col = ["bias", "bias std. dev."] |
149 |
| - dfs = [] |
150 |
| - for fit, experiments, biases in zip( |
151 |
| - fits_name_with_covmat_label, fits_experiments, fits_experiments_bootstrap_bias |
152 |
| - ): |
153 |
| - records = [] |
154 |
| - for biasres, experiment in zip(biases, experiments): |
155 |
| - records.append( |
156 |
| - dict( |
157 |
| - experiment=str(experiment), |
158 |
| - bias=biasres.mean(), |
159 |
| - bias_std=biasres.std(), |
160 |
| - ) |
161 |
| - ) |
162 |
| - df = pd.DataFrame.from_records( |
163 |
| - records, columns=("experiment", "bias", "bias_std"), index=("experiment") |
164 |
| - ) |
165 |
| - df.columns = pd.MultiIndex.from_product(([str(fit)], col)) |
166 |
| - dfs.append(df) |
167 |
| - return pd.concat(dfs, axis=1, sort=True) |
168 |
| - |
169 |
| - |
170 |
| -VarianceData = namedtuple("VarianceData", ("variance", "ndata")) |
171 |
| - |
172 |
| - |
173 |
| -@check_fit_isclosure |
174 |
| -@check_use_fitcommondata |
175 |
| -def variance_dataset(results, fit, use_fitcommondata): |
176 |
| - """calculate the variance for a given dataset, which is the spread of |
177 |
| - replicas measured in the space of the covariance matrix. Given by: |
178 |
| -
|
179 |
| - E_rep [ (T - E_rep[T])_i C^{-1}_ij (T - E_rep[T])_j ] |
180 |
| -
|
181 |
| - where E_rep is the expectation value across replicas. The quantity is the |
182 |
| - same as squaring `phi_data`, however it is redefined here in a way which can |
183 |
| - be made fully independent of the closure data. This is useful when checking |
184 |
| - the variance of data which was not included in the fit. |
185 |
| -
|
186 |
| - """ |
187 |
| - dt, th = results |
188 |
| - diff = th.central_value[:, np.newaxis] - th.error_members |
189 |
| - var_unnorm = calc_chi2(dt.sqrtcovmat, diff).mean() |
190 |
| - return VarianceData(var_unnorm, len(th)) |
191 |
| - |
192 |
| - |
193 |
| -@check_fit_isclosure |
194 |
| -@check_use_fitcommondata |
195 |
| -def variance_experiment(dataset_inputs_results, fit, use_fitcommondata): |
196 |
| - """Like variance_dataset but for a whole experiment""" |
197 |
| - return variance_dataset(dataset_inputs_results, fit, use_fitcommondata) |
198 |
| - |
199 |
| - |
200 |
| -@check_fit_isclosure |
201 |
| -def bootstrap_variance_experiment(dataset_inputs_results, bootstrap_samples=500): |
202 |
| - """Calculate the variance as in `variance_experiment` but performs bootstrap |
203 |
| - sample of the estimator. Returns an array of variance for each resample, |
204 |
| - normalised to the number of data in the experiment. |
205 |
| - """ |
206 |
| - dt_ct, th_ct = dataset_inputs_results |
207 |
| - diff = th_ct.central_value[:, np.newaxis] - th_ct.error_members |
208 |
| - var_unnorm_boot = bootstrap_values( |
209 |
| - diff, |
210 |
| - bootstrap_samples, |
211 |
| - apply_func=(lambda x, y: calc_chi2(y, x)), |
212 |
| - args=[dt_ct.sqrtcovmat], |
213 |
| - ).mean( |
214 |
| - axis=0 |
215 |
| - ) # mean across replicas |
216 |
| - return var_unnorm_boot / len(th_ct) # normalise by n_data |
217 |
| - |
218 |
| - |
219 |
| -experiments_boostrap_variance = collect( |
220 |
| - "bootstrap_variance_experiment", ("group_dataset_inputs_by_experiment",) |
221 |
| -) |
222 |
| - |
223 |
| -fits_exps_bootstrap_var = collect("experiments_boostrap_variance", ("fits", "fitcontext")) |
224 |
| - |
225 |
| - |
226 |
| -@table |
227 |
| -@check_use_fitcommondata |
228 |
| -@check_fits_areclosures |
229 |
| -@check_fits_same_filterseed |
230 |
| -@check_fits_underlying_law_match |
231 |
| -def fits_bootstrap_variance_table( |
232 |
| - fits_exps_bootstrap_var, |
233 |
| - fits_name_with_covmat_label, |
234 |
| - fits_experiments, |
235 |
| - fits, |
236 |
| - use_fitcommondata, |
237 |
| -): |
238 |
| - """Produce a table with variance and its error. Variance is defined as |
239 |
| -
|
240 |
| - var = sum_ij E_rep[(E_rep[T_i], T_i) invcov_ij (E_rep[T_j], T_j)] / N_data |
241 |
| -
|
242 |
| - which is the expectation value across replicas of the chi2 between the central |
243 |
| - theory predictions and replica theory predictions. It is the same as phi^2 |
244 |
| - and gives the variance of the theory predictions in units of the covariance |
245 |
| - matrix, normalised by the number of data points. |
246 |
| -
|
247 |
| - The error is the standard deviation across bootstrap samples. |
248 |
| -
|
249 |
| - """ |
250 |
| - col = ["variance", "variance std. dev."] |
251 |
| - dfs = [] |
252 |
| - for fit, experiments, fit_vars in zip( |
253 |
| - fits_name_with_covmat_label, fits_experiments, fits_exps_bootstrap_var |
254 |
| - ): |
255 |
| - records = [] |
256 |
| - for var_res, experiment in zip(fit_vars, experiments): |
257 |
| - records.append( |
258 |
| - dict( |
259 |
| - experiment=str(experiment), |
260 |
| - variance=var_res.mean(), |
261 |
| - variance_std=var_res.std(), |
262 |
| - ) |
263 |
| - ) |
264 |
| - df = pd.DataFrame.from_records( |
265 |
| - records, |
266 |
| - columns=("experiment", "variance", "variance_std"), |
267 |
| - index=("experiment"), |
268 |
| - ) |
269 |
| - df.columns = pd.MultiIndex.from_product(([str(fit)], col)) |
270 |
| - dfs.append(df) |
271 |
| - return pd.concat(dfs, axis=1, sort=True) |
272 |
| - |
273 |
| - |
274 | 27 | experiments_bootstrap_chi2_central = collect(
|
275 | 28 | "dataset_inputs_bootstrap_chi2_central", ("group_dataset_inputs_by_experiment",)
|
276 | 29 | )
|
277 | 30 |
|
| 31 | + |
278 | 32 | fits_exps_bootstrap_chi2_central = collect(
|
279 | 33 | "experiments_bootstrap_chi2_central", ("fits", "fitcontext")
|
280 | 34 | )
|
| 35 | + |
| 36 | + |
281 | 37 | fits_level_1_noise = collect("total_chi2_data", ("fits", "fitinputcontext", "fitunderlyinglaw"))
|
282 | 38 |
|
283 | 39 |
|
@@ -336,20 +92,15 @@ def delta_chi2_table(
|
336 | 92 | dfs = []
|
337 | 93 | cols = ("ndata", r"$\Delta_{chi^2}$ (normalised by ndata)")
|
338 | 94 | for label, experiments, exps_chi2, exps_level_1_noise in zip(
|
339 |
| - fits_name_with_covmat_label, |
340 |
| - fits_experiments, |
341 |
| - fits_exps_chi2, |
342 |
| - fits_exps_level_1_noise, |
| 95 | + fits_name_with_covmat_label, fits_experiments, fits_exps_chi2, fits_exps_level_1_noise |
343 | 96 | ):
|
344 | 97 | records = []
|
345 | 98 | for experiment, exp_chi2, level_1_noise in zip(experiments, exps_chi2, exps_level_1_noise):
|
346 | 99 | delta_chi2 = (exp_chi2.central_result - level_1_noise.central_result) / exp_chi2.ndata
|
347 | 100 | npoints = exp_chi2.ndata
|
348 | 101 | records.append(dict(experiment=str(experiment), npoints=npoints, delta_chi2=delta_chi2))
|
349 | 102 | df = pd.DataFrame.from_records(
|
350 |
| - records, |
351 |
| - columns=("experiment", "npoints", "delta_chi2"), |
352 |
| - index=("experiment",), |
| 103 | + records, columns=("experiment", "npoints", "delta_chi2"), index=("experiment",) |
353 | 104 | )
|
354 | 105 | df.columns = pd.MultiIndex.from_product(([label], cols))
|
355 | 106 | dfs.append(df)
|
|
0 commit comments