Skip to content

Commit 44d39fc

Browse files
ENH: Added "seed" parameter to subsample-ids (#317)
Now allows the user to set a random seed, improving reproducibility. --- Co-authored-by: Greg Caporaso <[email protected]>
1 parent 4ca1f5f commit 44d39fc

File tree

3 files changed

+90
-30
lines changed

3 files changed

+90
-30
lines changed

q2_feature_table/_subsample_ids.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
#
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
8-
98
import biom
109

1110

12-
def subsample_ids(table: biom.Table, subsampling_depth: int,
13-
axis: str) -> biom.Table:
11+
def subsample_ids(table: biom.Table,
12+
subsampling_depth: int,
13+
axis: str,
14+
random_seed: int = None
15+
) -> biom.Table:
1416
if axis == 'feature':
1517
# we are transposing the table due to biocore/biom-format#759
1618
table = table.transpose()
@@ -21,7 +23,8 @@ def subsample_ids(table: biom.Table, subsampling_depth: int,
2123
'is: %d.' % len(table.ids()))
2224

2325
# the axis is always 'sample' due to the above transpose
24-
table = table.subsample(subsampling_depth, axis='sample', by_id=True)
26+
table = table.subsample(subsampling_depth, axis='sample',
27+
by_id=True, seed=random_seed)
2528

2629
# the inverted axis is always observation due to the above transpose
2730
invaxis = 'observation'

q2_feature_table/plugin_setup.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@
6666
function=q2_feature_table.subsample_ids,
6767
inputs={'table': FeatureTable[Frequency]},
6868
parameters={'subsampling_depth': Int % Range(1, None),
69-
'axis': Str % Choices(['sample', 'feature'])},
69+
'axis': Str % Choices(['sample', 'feature']),
70+
'random_seed': Int % Range(0, None),
71+
},
7072
outputs=[('sampled_table', FeatureTable[Frequency])],
7173
input_descriptions={'table': 'The feature table to be sampled.'},
7274
parameter_descriptions={
@@ -76,7 +78,10 @@
7678
'the resulting table.'),
7779
'axis': ('The axis to sample over. If "sample" then samples will be '
7880
'randomly selected to be retained. If "feature" then '
79-
'a random set of features will be selected to be retained.')
81+
'a random set of features will be selected to be retained.'),
82+
'random_seed': ('Set the seed for the subsampling. Using the same '
83+
'seed with the same table will always lead to the '
84+
'same result. Defaults to a random seed.')
8085
},
8186
output_descriptions={
8287
'sampled_table': 'The resulting subsampled feature table.'

q2_feature_table/tests/test_subsample.py

Lines changed: 76 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,32 @@
1717

1818
class SubsampleIDsTests(TestCase):
1919

20-
def test_subsample_samples(self):
20+
def test_subsample_samples_w_random_seed(self):
2121
t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
2222
['O1', 'O2'],
2323
['S1', 'S2', 'S3'])
2424
a = subsample_ids(t, 2, 'sample')
25-
self.assertEqual(a.shape, (2, 2))
26-
27-
sample_ids = frozenset(a.ids(axis='sample'))
28-
self.assertIn(sample_ids, set([frozenset(['S1', 'S2']),
29-
frozenset(['S1', 'S3']),
30-
frozenset(['S2', 'S3'])]))
31-
self.assertEqual(set(a.ids(axis='observation')), set(['O1', 'O2']))
32-
33-
for i in a.ids(axis='sample'):
34-
npt.assert_equal(t.data(i, axis='sample'),
35-
a.data(i, axis='sample'))
25+
a_eq_b = []
26+
n_iterations = 100
27+
for i in range(n_iterations):
28+
b = subsample_ids(t, 2, 'sample')
29+
self.assertEqual(a.shape, (2, 2))
30+
31+
sample_ids = frozenset(b.ids(axis='sample'))
32+
self.assertIn(sample_ids, set([frozenset(['S1', 'S2']),
33+
frozenset(['S1', 'S3']),
34+
frozenset(['S2', 'S3'])]))
35+
self.assertEqual(set(b.ids(axis='observation')), set(['O1', 'O2']))
36+
37+
for i in b.ids(axis='sample'):
38+
npt.assert_equal(t.data(i, axis='sample'),
39+
b.data(i, axis='sample'))
40+
a_eq_b.append(a == b)
41+
42+
self.assertTrue(False in a_eq_b,
43+
f"After {n_iterations} iterations, all feature tables "
44+
"were identical. It therefore seems that a randomized "
45+
"seed is not being used.")
3646

3747
def test_subsample_samples_drop_empty_feature(self):
3848
t = Table(np.array([[0, 0, 0], [1, 1, 2]]),
@@ -47,22 +57,64 @@ def test_subsample_samples_drop_empty_feature(self):
4757
frozenset(['S2', 'S3'])]))
4858
self.assertEqual(set(a.ids(axis='observation')), set(['O2']))
4959

50-
def test_subsample_features(self):
60+
def test_subsample_features_w_random_seed(self):
5161
t = Table(np.array([[0, 1, 3], [1, 1, 2]]).T,
5262
['O1', 'O2', 'O3'],
5363
['S1', 'S2'])
5464
a = subsample_ids(t, 2, 'feature')
55-
self.assertEqual(a.shape, (2, 2))
56-
57-
sample_ids = frozenset(a.ids(axis='observation'))
58-
self.assertIn(sample_ids, set([frozenset(['O1', 'O2']),
59-
frozenset(['O1', 'O3']),
60-
frozenset(['O2', 'O3'])]))
61-
self.assertEqual(set(a.ids(axis='sample')), set(['S1', 'S2']))
62-
63-
for i in a.ids(axis='observation'):
64-
npt.assert_equal(t.data(i, axis='observation'),
65-
a.data(i, axis='observation'))
65+
a_eq_b = []
66+
n_iterations = 100
67+
for i in range(n_iterations):
68+
b = subsample_ids(t, 2, 'feature')
69+
self.assertEqual(b.shape, (2, 2))
70+
71+
feature_ids = frozenset(b.ids(axis='observation'))
72+
self.assertIn(feature_ids, set([frozenset(['O1', 'O2']),
73+
frozenset(['O1', 'O3']),
74+
frozenset(['O2', 'O3'])]))
75+
self.assertEqual(set(b.ids(axis='sample')), set(['S1', 'S2']))
76+
77+
for i in b.ids(axis='observation'):
78+
npt.assert_equal(t.data(i, axis='observation'),
79+
b.data(i, axis='observation'))
80+
a_eq_b.append(a == b)
81+
82+
self.assertTrue(False in a_eq_b,
83+
f"After {n_iterations} iterations, all feature tables "
84+
"were identical. It therefore seems that a randomized "
85+
"seed is not being used.")
86+
87+
def test_subsample_samples_uses_nonrandom_seed(self):
88+
t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
89+
['O1', 'O2'],
90+
['S1', 'S2', 'S3'])
91+
a = subsample_ids(t, 2, 'sample', random_seed=1)
92+
a_eq_b = []
93+
n_iterations = 100
94+
for i in range(n_iterations):
95+
b = subsample_ids(t, 2, 'sample', random_seed=1)
96+
a_eq_b.append(a == b)
97+
self.assertFalse(False in a_eq_b,
98+
f"After {n_iterations} iterations, at least one "
99+
"feature table differed from the others. It "
100+
"therefore seems that a randomized seed is being "
101+
"used.")
102+
103+
def test_subsample_features_uses_nonrandom_seed(self):
104+
t = Table(np.array([[0, 1, 3], [1, 1, 2]]).T,
105+
['O1', 'O2', 'O3'],
106+
['S1', 'S2'])
107+
a = subsample_ids(t, 2, 'feature', random_seed=1)
108+
a_eq_b = []
109+
n_iterations = 100
110+
for i in range(n_iterations):
111+
b = subsample_ids(t, 2, 'feature', random_seed=1)
112+
a_eq_b.append(a == b)
113+
self.assertFalse(False in a_eq_b,
114+
f"After {n_iterations} iterations, at least one "
115+
"feature table differed from the others. It "
116+
"therefore seems that a randomized seed is being "
117+
"used.")
66118

67119
def test_subsample_features_drop_empty_samples(self):
68120
t = Table(np.array([[0, 0, 0], [1, 1, 2]]).T,

0 commit comments

Comments
 (0)