ENH: Added "seed" parameter to subsample-ids (#317)

VinzentRisch · gregcaporaso · web-flow · commit 44d39fc6324c · 2025-10-16T14:33:42.000-07:00
Now allows the user to set a random seed, improving reproducibility.

--- 
Co-authored-by: Greg Caporaso <jgcap@fastmail.com>
diff --git a/q2_feature_table/_subsample_ids.py b/q2_feature_table/_subsample_ids.py
@@ -5,12 +5,14 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
 import biom
 
 
-def subsample_ids(table: biom.Table, subsampling_depth: int,
-                  axis: str) -> biom.Table:
+def subsample_ids(table: biom.Table,
+                  subsampling_depth: int,
+                  axis: str,
+                  random_seed: int = None
+                  ) -> biom.Table:
     if axis == 'feature':
         # we are transposing the table due to biocore/biom-format#759
         table = table.transpose()
@@ -21,7 +23,8 @@ def subsample_ids(table: biom.Table, subsampling_depth: int,
                          'is: %d.' % len(table.ids()))
 
     # the axis is always 'sample' due to the above transpose
-    table = table.subsample(subsampling_depth, axis='sample', by_id=True)
+    table = table.subsample(subsampling_depth, axis='sample',
+                            by_id=True, seed=random_seed)
 
     # the inverted axis is always observation due to the above transpose
     invaxis = 'observation'
diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py
@@ -66,7 +66,9 @@
     function=q2_feature_table.subsample_ids,
     inputs={'table': FeatureTable[Frequency]},
     parameters={'subsampling_depth': Int % Range(1, None),
-                'axis': Str % Choices(['sample', 'feature'])},
+                'axis': Str % Choices(['sample', 'feature']),
+                'random_seed': Int % Range(0, None),
+                },
     outputs=[('sampled_table', FeatureTable[Frequency])],
     input_descriptions={'table': 'The feature table to be sampled.'},
     parameter_descriptions={
@@ -76,7 +78,10 @@
                               'the resulting table.'),
         'axis': ('The axis to sample over. If "sample" then samples will be '
                  'randomly selected to be retained. If "feature" then '
-                 'a random set of features will be selected to be retained.')
+                 'a random set of features will be selected to be retained.'),
+        'random_seed': ('Set the seed for the subsampling. Using the same '
+                        'seed with the same table will always lead to the '
+                        'same result. Defaults to a random seed.')
     },
     output_descriptions={
         'sampled_table': 'The resulting subsampled feature table.'
diff --git a/q2_feature_table/tests/test_subsample.py b/q2_feature_table/tests/test_subsample.py
@@ -17,22 +17,32 @@
 
 class SubsampleIDsTests(TestCase):
 
-    def test_subsample_samples(self):
+    def test_subsample_samples_w_random_seed(self):
         t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
                   ['O1', 'O2'],
                   ['S1', 'S2', 'S3'])
         a = subsample_ids(t, 2, 'sample')
-        self.assertEqual(a.shape, (2, 2))
-
-        sample_ids = frozenset(a.ids(axis='sample'))
-        self.assertIn(sample_ids, set([frozenset(['S1', 'S2']),
-                                       frozenset(['S1', 'S3']),
-                                       frozenset(['S2', 'S3'])]))
-        self.assertEqual(set(a.ids(axis='observation')), set(['O1', 'O2']))
-
-        for i in a.ids(axis='sample'):
-            npt.assert_equal(t.data(i, axis='sample'),
-                             a.data(i, axis='sample'))
+        a_eq_b = []  # one `a == b` result per repeated unseeded draw
+        n_iterations = 100
+        for _ in range(n_iterations):
+            b = subsample_ids(t, 2, 'sample')
+            self.assertEqual(b.shape, (2, 2))  # validate the fresh draw
+
+            sample_ids = frozenset(b.ids(axis='sample'))
+            self.assertIn(sample_ids, set([frozenset(['S1', 'S2']),
+                                           frozenset(['S1', 'S3']),
+                                           frozenset(['S2', 'S3'])]))
+            self.assertEqual(set(b.ids(axis='observation')), set(['O1', 'O2']))
+
+            for sample_id in b.ids(axis='sample'):  # data must match input
+                npt.assert_equal(t.data(sample_id, axis='sample'),
+                                 b.data(sample_id, axis='sample'))
+            a_eq_b.append(a == b)
+
+        self.assertTrue(False in a_eq_b,  # some draw must differ from `a`
+                        f"After {n_iterations} iterations, all feature tables "
+                        "were identical. It therefore seems that a randomized "
+                        "seed is not being used.")
 
     def test_subsample_samples_drop_empty_feature(self):
         t = Table(np.array([[0, 0, 0], [1, 1, 2]]),
@@ -47,22 +57,64 @@ def test_subsample_samples_drop_empty_feature(self):
                                        frozenset(['S2', 'S3'])]))
         self.assertEqual(set(a.ids(axis='observation')), set(['O2']))
 
-    def test_subsample_features(self):
+    def test_subsample_features_w_random_seed(self):
         t = Table(np.array([[0, 1, 3], [1, 1, 2]]).T,
                   ['O1', 'O2', 'O3'],
                   ['S1', 'S2'])
         a = subsample_ids(t, 2, 'feature')
-        self.assertEqual(a.shape, (2, 2))
-
-        sample_ids = frozenset(a.ids(axis='observation'))
-        self.assertIn(sample_ids, set([frozenset(['O1', 'O2']),
-                                       frozenset(['O1', 'O3']),
-                                       frozenset(['O2', 'O3'])]))
-        self.assertEqual(set(a.ids(axis='sample')), set(['S1', 'S2']))
-
-        for i in a.ids(axis='observation'):
-            npt.assert_equal(t.data(i, axis='observation'),
-                             a.data(i, axis='observation'))
+        a_eq_b = []  # one `a == b` result per repeated unseeded draw
+        n_iterations = 100
+        for _ in range(n_iterations):
+            b = subsample_ids(t, 2, 'feature')
+            self.assertEqual(b.shape, (2, 2))  # validate the fresh draw
+
+            feature_ids = frozenset(b.ids(axis='observation'))
+            self.assertIn(feature_ids, set([frozenset(['O1', 'O2']),
+                                            frozenset(['O1', 'O3']),
+                                            frozenset(['O2', 'O3'])]))
+            self.assertEqual(set(b.ids(axis='sample')), set(['S1', 'S2']))
+
+            for feature_id in b.ids(axis='observation'):  # data must match
+                npt.assert_equal(t.data(feature_id, axis='observation'),
+                                 b.data(feature_id, axis='observation'))
+            a_eq_b.append(a == b)
+
+        self.assertTrue(False in a_eq_b,  # some draw must differ from `a`
+                        f"After {n_iterations} iterations, all feature tables "
+                        "were identical. It therefore seems that a randomized "
+                        "seed is not being used.")
+
+    def test_subsample_samples_uses_nonrandom_seed(self):
+        t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
+                  ['O1', 'O2'],
+                  ['S1', 'S2', 'S3'])
+        a = subsample_ids(t, 2, 'sample', random_seed=1)  # reference draw
+        a_eq_b = []  # one `a == b` result per repeated seeded draw
+        n_iterations = 100
+        for i in range(n_iterations):  # repeat the identical seeded call
+            b = subsample_ids(t, 2, 'sample', random_seed=1)
+            a_eq_b.append(a == b)
+        self.assertFalse(False in a_eq_b,  # fail if any repeat differed
+                         f"After {n_iterations} iterations, at least one "
+                         "feature table differed from the others. It "
+                         "therefore seems that a randomized seed is being "
+                         "used.")
+
+    def test_subsample_features_uses_nonrandom_seed(self):
+        t = Table(np.array([[0, 1, 3], [1, 1, 2]]).T,
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2'])
+        a = subsample_ids(t, 2, 'feature', random_seed=1)  # reference draw
+        a_eq_b = []  # one `a == b` result per repeated seeded draw
+        n_iterations = 100
+        for i in range(n_iterations):  # repeat the identical seeded call
+            b = subsample_ids(t, 2, 'feature', random_seed=1)
+            a_eq_b.append(a == b)
+        self.assertFalse(False in a_eq_b,  # fail if any repeat differed
+                         f"After {n_iterations} iterations, at least one "
+                         "feature table differed from the others. It "
+                         "therefore seems that a randomized seed is being "
+                         "used.")
 
     def test_subsample_features_drop_empty_samples(self):
         t = Table(np.array([[0, 0, 0], [1, 1, 2]]).T,