Skip to content

Commit ed75983

Browse files
Merge pull request #318 from rsagroup/small_sample
update to include small sample two factor bootstrap correction
2 parents e43546c + ddc65bd commit ed75983

File tree

3 files changed

+111
-36
lines changed

3 files changed

+111
-36
lines changed

Diff for: src/rsatoolbox/inference/evaluate.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,8 @@ def eval_dual_bootstrap(
173173
/ (matrix.shape[0] - 1)
174174
result = Result(models, evaluations, method=method,
175175
cv_method=cv_method, noise_ceiling=noise_ceil,
176-
variances=variances, dof=dof)
176+
variances=variances, dof=dof, n_rdm=data.n_rdm,
177+
n_pattern=data.n_cond)
177178
return result
178179

179180

@@ -201,15 +202,17 @@ def eval_fixed(models, data, theta=None, method='cosine'):
201202
noise_ceil = boot_noise_ceiling(
202203
data, method=method, rdm_descriptor='index')
203204
if data.n_rdm > 1:
204-
variances = np.cov(evaluations[0], ddof=1) \
205+
variances = np.cov(evaluations[0], ddof=0) \
205206
/ evaluations.shape[-1]
206207
dof = evaluations.shape[-1] - 1
207208
else:
208209
variances = None
209210
dof = 0
210211
result = Result(models, evaluations, method=method,
211212
cv_method='fixed', noise_ceiling=noise_ceil,
212-
variances=variances, dof=dof)
213+
variances=variances, dof=dof, n_rdm=data.n_rdm,
214+
n_pattern=None)
215+
result.n_pattern = data.n_cond
213216
return result
214217

215218

@@ -269,7 +272,8 @@ def eval_bootstrap(models, data, theta=None, method='cosine', N=1000,
269272
dof = min(data.n_rdm, data.n_cond) - 1
270273
result = Result(models, evaluations, method=method,
271274
cv_method='bootstrap', noise_ceiling=noise_ceil,
272-
variances=variances, dof=dof)
275+
variances=variances, dof=dof, n_rdm=data.n_rdm,
276+
n_pattern=data.n_cond)
273277
return result
274278

275279

@@ -329,7 +333,9 @@ def eval_bootstrap_pattern(models, data, theta=None, method='cosine', N=1000,
329333
dof = data.n_cond - 1
330334
result = Result(models, evaluations, method=method,
331335
cv_method='bootstrap_pattern', noise_ceiling=noise_ceil,
332-
variances=variances, dof=dof)
336+
variances=variances, dof=dof, n_rdm=None,
337+
n_pattern=data.n_cond)
338+
result.n_rdm = data.n_rdm
333339
return result
334340

335341

@@ -378,7 +384,9 @@ def eval_bootstrap_rdm(models, data, theta=None, method='cosine', N=1000,
378384
variances = np.cov(evaluations.T)
379385
result = Result(models, evaluations, method=method,
380386
cv_method='bootstrap_rdm', noise_ceiling=noise_ceil,
381-
variances=variances, dof=dof)
387+
variances=variances, dof=dof, n_rdm=data.n_rdm,
388+
n_pattern=None)
389+
result.n_pattern = data.n_cond
382390
return result
383391

384392

@@ -590,7 +598,8 @@ def bootstrap_crossval(models, data, method='cosine', fitter=None,
590598
variances = np.cov(np.concatenate([evals_nonan.T, noise_ceil_nonan]))
591599
result = Result(models, evaluations, method=method,
592600
cv_method=cv_method, noise_ceiling=noise_ceil,
593-
variances=variances, dof=dof)
601+
variances=variances, dof=dof, n_rdm=data.n_rdm,
602+
n_pattern=data.n_cond)
594603
return result
595604

596605

@@ -735,7 +744,8 @@ def eval_dual_bootstrap_random(
735744
variances = np.cov(np.concatenate([evals_nonan.T, noise_ceil_nonan]))
736745
result = Result(models, evaluations, method=method,
737746
cv_method=cv_method, noise_ceiling=noise_ceil,
738-
variances=variances, dof=dof)
747+
variances=variances, dof=dof, n_rdm=data.n_rdm,
748+
n_pattern=data.n_cond)
739749
return result
740750

741751

Diff for: src/rsatoolbox/inference/result.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class Result:
4040
"""
4141

4242
def __init__(self, models, evaluations, method, cv_method, noise_ceiling,
43-
variances=None, dof=1, fitter=None):
43+
variances=None, dof=1, fitter=None, n_rdm=None, n_pattern=None):
4444
if isinstance(models, rsatoolbox.model.Model):
4545
models = [models]
4646
assert len(models) == evaluations.shape[1], 'evaluations shape does' \
@@ -55,6 +55,8 @@ def __init__(self, models, evaluations, method, cv_method, noise_ceiling,
5555
self.dof = dof
5656
self.fitter = fitter
5757
self.n_bootstraps = evaluations.shape[0]
58+
self.n_rdm = n_rdm
59+
self.n_pattern = n_pattern
5860
if variances is not None:
5961
# if the variances only refer to the models this should have the
6062
# same number of entries as the models list.
@@ -63,7 +65,7 @@ def __init__(self, models, evaluations, method, cv_method, noise_ceiling,
6365
else:
6466
nc_included = variances.shape[-1] != len(models)
6567
self.model_var, self.diff_var, self.noise_ceil_var = \
66-
extract_variances(variances, nc_included)
68+
extract_variances(variances, nc_included, n_rdm, n_pattern)
6769
else:
6870
self.model_var = None
6971
self.diff_var = None

Diff for: src/rsatoolbox/util/inference_util.py

+89-26
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
"""
44
Inference module utilities
55
"""
6-
6+
from __future__ import annotations
77
from collections.abc import Iterable
8+
from typing import TYPE_CHECKING, Optional
9+
import warnings
810
import numpy as np
911
from scipy import stats
1012
from scipy.stats import rankdata, wilcoxon
@@ -13,6 +15,8 @@
1315
from rsatoolbox.rdm import RDMs
1416
from .matrix import pairwise_contrast
1517
from .rdm_utils import batch_to_matrices
18+
if TYPE_CHECKING:
19+
from numpy.typing import NDArray
1620

1721

1822
def input_check_model(models, theta=None, fitter=None, N=1):
@@ -68,7 +72,7 @@ def input_check_model(models, theta=None, fitter=None, N=1):
6872
return models, evaluations, theta, fitter
6973

7074

71-
def pool_rdm(rdms, method='cosine'):
75+
def pool_rdm(rdms, method: str = 'cosine'):
7276
"""pools multiple RDMs into the one with maximal performance under a given
7377
evaluation metric
7478
rdm_descriptors of the generated rdms are empty
@@ -114,11 +118,11 @@ def pool_rdm(rdms, method='cosine'):
114118
rdm_vec = np.array([_nan_rank_data(v) for v in rdm_vec])
115119
rdm_vec = _nan_mean(rdm_vec)
116120
elif method in ('kendall', 'tau-b'):
117-
Warning('Noise ceiling for tau based on averaged ranks!')
121+
warnings.warn('Noise ceiling for tau based on averaged ranks!')
118122
rdm_vec = np.array([_nan_rank_data(v) for v in rdm_vec])
119123
rdm_vec = _nan_mean(rdm_vec)
120124
elif method == 'tau-a':
121-
Warning('Noise ceiling for tau based on averaged ranks!')
125+
warnings.warn('Noise ceiling for tau based on averaged ranks!')
122126
rdm_vec = np.array([_nan_rank_data(v) for v in rdm_vec])
123127
rdm_vec = _nan_mean(rdm_vec)
124128
else:
@@ -130,7 +134,7 @@ def pool_rdm(rdms, method='cosine'):
130134
pattern_descriptors=rdms.pattern_descriptors)
131135

132136

133-
def _nan_mean(rdm_vector):
137+
def _nan_mean(rdm_vector: NDArray) -> NDArray:
134138
""" takes the average over a rdm_vector with nans for masked entries
135139
without a warning
136140
@@ -149,7 +153,7 @@ def _nan_mean(rdm_vector):
149153
return rdm_mean
150154

151155

152-
def _nan_rank_data(rdm_vector):
156+
def _nan_rank_data(rdm_vector: NDArray) -> NDArray:
153157
""" rank_data for vectors with nan entries
154158
155159
Args:
@@ -166,9 +170,14 @@ def _nan_rank_data(rdm_vector):
166170
return ranks
167171

168172

169-
def all_tests(evaluations, noise_ceil, test_type='t-test',
170-
model_var=None, diff_var=None, noise_ceil_var=None,
171-
dof=1):
173+
def all_tests(
174+
evaluations: NDArray,
175+
noise_ceil: NDArray,
176+
test_type: str = 't-test',
177+
model_var: Optional[NDArray] = None,
178+
diff_var: Optional[NDArray] = None,
179+
noise_ceil_var: Optional[NDArray] = None,
180+
dof: int = 1):
172181
"""wrapper running all tests necessary for the model plot
173182
-> pairwise tests, tests against 0 and against noise ceiling
174183
@@ -218,7 +227,11 @@ def all_tests(evaluations, noise_ceil, test_type='t-test',
218227
return p_pairwise, p_zero, p_noise
219228

220229

221-
def pair_tests(evaluations, test_type='t-test', diff_var=None, dof=1):
230+
def pair_tests(
231+
evaluations: NDArray,
232+
test_type: str = 't-test',
233+
diff_var: Optional[NDArray] = None,
234+
dof: int = 1):
222235
"""wrapper running pair tests
223236
224237
Args:
@@ -393,7 +406,7 @@ def bootstrap_pair_tests(evaluations):
393406
proportions = np.zeros((evaluations.shape[1], evaluations.shape[1]))
394407
while len(evaluations.shape) > 2:
395408
evaluations = np.mean(evaluations, axis=-1)
396-
for i_model in range(evaluations.shape[1]-1):
409+
for i_model in range(evaluations.shape[1] - 1):
397410
for j_model in range(i_model + 1, evaluations.shape[1]):
398411
proportions[i_model, j_model] = np.sum(
399412
evaluations[:, i_model] < evaluations[:, j_model]) \
@@ -499,7 +512,11 @@ def t_test_nc(evaluations, variances, noise_ceil, dof=1):
499512
return p
500513

501514

502-
def extract_variances(variance, nc_included=True):
515+
def extract_variances(
516+
variance,
517+
nc_included: bool = True,
518+
n_rdm: Optional[int] = None,
519+
n_pattern: Optional[int] = None):
503520
""" extracts the variances for the individual model evaluations,
504521
differences between model evaluations and for the comparison to
505522
the noise ceiling
@@ -516,6 +533,12 @@ def extract_variances(variance, nc_included=True):
516533
to the noise ceiling results
517534
518535
nc_included=False assumes that the noise ceiling is fixed instead.
536+
537+
To get more accurate estimates, the number of subjects
538
and/or the number of stimuli
539
can be passed as n_rdm and n_pattern respectively.
540+
This function corrects for all ns that are passed. If you bootstrapped
541
only one factor, pass only the N for that factor!
519542
"""
520543
if variance.ndim == 0:
521544
variance = np.array([variance])
@@ -532,6 +555,9 @@ def extract_variances(variance, nc_included=True):
532555
model_variances = variance
533556
nc_variances = np.array([variance, variance]).T
534557
diff_variances = np.diag(C @ np.diag(variance) @ C.T)
558+
model_variances = _correct_1d(model_variances, n_pattern, n_rdm)
559+
nc_variances = _correct_1d(nc_variances, n_pattern, n_rdm)
560+
diff_variances = _correct_1d(diff_variances, n_pattern, n_rdm)
535561
elif variance.ndim == 2:
536562
# a single covariance matrix
537563
if nc_included:
@@ -546,6 +572,9 @@ def extract_variances(variance, nc_included=True):
546572
model_variances = np.diag(variance)
547573
nc_variances = np.array([model_variances, model_variances]).T
548574
diff_variances = np.diag(C @ variance @ C.T)
575+
model_variances = _correct_1d(model_variances, n_pattern, n_rdm)
576+
nc_variances = _correct_1d(nc_variances, n_pattern, n_rdm)
577+
diff_variances = _correct_1d(diff_variances, n_pattern, n_rdm)
549578
elif variance.ndim == 3:
550579
# general transform for multiple covariance matrices
551580
if nc_included:
@@ -565,12 +594,30 @@ def extract_variances(variance, nc_included=True):
565594
).transpose(1, 2, 0)
566595
diff_variances = np.einsum('ij,kjl,il->ki', C, variance, C)
567596
# dual bootstrap variance estimate from 3 covariance matrices
568-
model_variances = _dual_bootstrap(model_variances)
569-
nc_variances = _dual_bootstrap(nc_variances)
570-
diff_variances = _dual_bootstrap(diff_variances)
597+
model_variances = _dual_bootstrap(model_variances, n_rdm, n_pattern)
598+
nc_variances = _dual_bootstrap(nc_variances, n_rdm, n_pattern)
599+
diff_variances = _dual_bootstrap(diff_variances, n_rdm, n_pattern)
571600
return model_variances, diff_variances, nc_variances
572601

573602

603+
def _correct_1d(
604+
variance: NDArray,
605+
n_pattern: Optional[int] = None,
606+
n_rdm: Optional[int] = None):
607+
if (n_pattern is not None) and (n_rdm is not None):
608+
# uncorrected dual bootstrap?
609+
n = min(n_rdm, n_pattern)
610+
elif n_pattern is not None:
611+
n = n_pattern
612+
elif n_rdm is not None:
613+
n = n_rdm
614+
else:
615+
n = None
616+
if n is not None:
617+
variance = (n / (n - 1)) * variance
618+
return variance
619+
620+
574621
def get_errorbars(model_var, evaluations, dof, error_bars='sem',
575622
test_type='t-test'):
576623
""" computes errorbars for the model-evaluations from a results object
@@ -617,31 +664,47 @@ def get_errorbars(model_var, evaluations, dof, error_bars='sem',
617664
errorbar_high = std_eval \
618665
* tdist.ppf(prop_cut, dof)
619666
else:
620-
raise Exception('computing errorbars: Argument ' +
621-
'error_bars is incorrectly defined as '
622-
+ str(error_bars) + '.')
667+
raise ValueError('computing errorbars: Argument ' +
668+
'error_bars is incorrectly defined as '
669+
+ str(error_bars) + '.')
623670
limits = np.stack((errorbar_low, errorbar_high))
624671
if np.isnan(limits).any() or (abs(limits) == np.inf).any():
625-
raise Exception(
672+
raise ValueError(
626673
'computing errorbars: Too few bootstrap samples for the ' +
627674
'requested confidence interval: ' + error_bars + '.')
628675
return limits
629676

630677

631-
def _dual_bootstrap(variances):
678+
def _dual_bootstrap(variances, n_rdm=None, n_pattern=None):
632679
""" helper function to perform the dual bootstrap
633680
634681
Takes a 3x... array of variances and computes the corrections assuming:
635682
variances[0] are the variances in the double bootstrap
636683
variances[1] are the variances in the rdm bootstrap
637684
variances[2] are the variances in the pattern bootstrap
685+
686+
If both n_rdm and n_pattern are given this uses
687+
the more accurate small sample formula.
638688
"""
639-
variance = 2 * (variances[1] + variances[2]) \
640-
- variances[0]
641-
variance = np.maximum(np.maximum(
642-
variance, variances[1]), variances[2])
643-
variance = np.minimum(
644-
variance, variances[0])
689+
if n_rdm is None or n_pattern is None:
690+
variance = 2 * (variances[1] + variances[2]) \
691+
- variances[0]
692+
variance = np.maximum(np.maximum(
693+
variance, variances[1]), variances[2])
694+
variance = np.minimum(
695+
variance, variances[0])
696+
else:
697+
variance = (
698+
(n_rdm / (n_rdm - 1)) * variances[1]
699+
+ (n_pattern / (n_pattern - 1)) * variances[2]
700+
- ((n_pattern * n_rdm / (n_pattern - 1) / (n_rdm - 1))
701+
* (variances[0] - variances[1] - variances[2])))
702+
variance = np.maximum(np.maximum(
703+
variance,
704+
(n_rdm / (n_rdm - 1)) * variances[1]),
705+
(n_pattern / (n_pattern - 1)) * variances[2])
706+
variance = np.minimum(
707+
variance, variances[0])
645708
return variance
646709

647710

0 commit comments

Comments
 (0)