MAINT: various updates to support better usage of biom.Table API (qiime2#323)

gregcaporaso · wasade · gregcaporaso · commit 31e09f61d843 · 2025-01-23T09:28:30.000-07:00
Fixes qiime2#149. Also adds a subsample test that should have existed (prior to this PR). --------- Co-authored-by: Daniel McDonald <danielmcdonald@ucsd.edu>
diff --git a/q2_feature_table/_filter.py b/q2_feature_table/_filter.py
@@ -66,10 +66,7 @@ def _filter_table(table, min_frequency, max_frequency, min_nonzero,
     # filter on the opposite axis to remove any entities that now have a
     # frequency of zero
     if filter_opposite_axis:
-        filter_fn2 = _get_biom_filter_function(
-            ids_to_keep=table.ids(axis=_other_axis_map[axis]), min_frequency=0,
-            max_frequency=None, min_nonzero=1, max_nonzero=None)
-        table.filter(filter_fn2, axis=_other_axis_map[axis], inplace=True)
+        table.remove_empty(axis=_other_axis_map[axis], inplace=True)
 
     if not allow_empty_table:
         _validate_nonempty_table(table)
diff --git a/q2_feature_table/_split.py b/q2_feature_table/_split.py
@@ -15,12 +15,14 @@ def split(table: biom.Table,
           filter_empty_features: bool = True) -> biom.Table:
     metadata = metadata.filter_ids(table.ids(axis='sample'))
     metadata_df = metadata.drop_missing_values().to_dataframe()
+    lookup = metadata_df[metadata.name].to_dict()
 
-    indices = metadata_df.reset_index(
-        ).groupby(metadata.name)[metadata_df.index.name].apply(list).to_dict()
+    def partition_f(i, m):
+        return lookup.get(i)
 
+    unique_grps = sorted(set(lookup.values()))
     try:
-        qiime2.sdk.util.validate_result_collection_keys(*indices.keys())
+        qiime2.sdk.util.validate_result_collection_keys(*unique_grps)
     except KeyError as e:
         raise KeyError(
             "One or more invalid metadata column values identified during "
@@ -29,9 +31,11 @@ def split(table: biom.Table,
             f"table. The original error message is as follows: {str(e)}")
 
     result = {}
-    for group, sample_ids in indices.items():
-        t = table.filter(sample_ids, axis='sample', inplace=False)
+    for group, tab in table.partition(partition_f):
+        if group is None:
+            continue
+
         if filter_empty_features:
-            t.remove_empty(axis='observation', inplace=True)
-        result[group] = t
+            tab.remove_empty(axis='observation', inplace=True)
+        result[group] = tab
     return result
diff --git a/q2_feature_table/_subsample_ids.py b/q2_feature_table/_subsample_ids.py
@@ -25,7 +25,7 @@ def subsample_ids(table: biom.Table, subsampling_depth: int,
 
     # the inverted axis is always observation due to the above transpose
     invaxis = 'observation'
-    table.filter(lambda v, i, m: v.sum() > 0, axis=invaxis)
+    table = table.remove_empty(axis=invaxis, inplace=False)
 
     if axis == 'feature':
         # reverse the transpose necessary due to biocore/biom-format#759
diff --git a/q2_feature_table/tests/test_subsample.py b/q2_feature_table/tests/test_subsample.py
@@ -34,6 +34,19 @@ def test_subsample_samples(self):
             npt.assert_equal(t.data(i, axis='sample'),
                              a.data(i, axis='sample'))
 
+    def test_subsample_samples_drop_empty_feature(self):
+        t = Table(np.array([[0, 0, 0], [1, 1, 2]]),  # row O1 is all zeros
+                  ['O1', 'O2'],
+                  ['S1', 'S2', 'S3'])
+        a = subsample_ids(t, 2, 'sample')
+        self.assertEqual(a.shape, (1, 2))  # only O2 x 2 samples remain
+
+        sample_ids = frozenset(a.ids(axis='sample'))  # any 2 of the 3
+        self.assertIn(sample_ids, set([frozenset(['S1', 'S2']),
+                                       frozenset(['S1', 'S3']),
+                                       frozenset(['S2', 'S3'])]))
+        self.assertEqual(set(a.ids(axis='observation')), set(['O2']))
+
     def test_subsample_features(self):
         t = Table(np.array([[0, 1, 3], [1, 1, 2]]).T,
                   ['O1', 'O2', 'O3'],
@@ -51,6 +64,19 @@ def test_subsample_features(self):
             npt.assert_equal(t.data(i, axis='observation'),
                              a.data(i, axis='observation'))
 
+    def test_subsample_features_drop_empty_samples(self):
+        t = Table(np.array([[0, 0, 0], [1, 1, 2]]).T,  # S1 is all zeros
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2'])
+        a = subsample_ids(t, 2, 'feature')
+        self.assertEqual(a.shape, (2, 1))  # 2 features x only S2 remain
+
+        feature_ids = frozenset(a.ids(axis='observation'))  # any 2 of the 3
+        self.assertIn(feature_ids, set([frozenset(['O1', 'O2']),
+                                        frozenset(['O1', 'O3']),
+                                        frozenset(['O2', 'O3'])]))
+        self.assertEqual(set(a.ids(axis='sample')), set(['S2']))
+
     def test_subsample_samples_oversample(self):
         t = Table(np.array([[0, 1, 3], [1, 1, 2]]).T,
                   ['O1', 'O2', 'O3'],