Merge branch 'master' into slurm-runs

CellArr · Dec 12, 2024 · 908e763 · 908e763
2 parents 5d6302e + d679fa5
commit 908e763
Show file tree

Hide file tree

Showing 7 changed files with 127 additions and 15 deletions.
diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml
@@ -14,10 +14,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
-    - name: Set up Python 3.9
+    - name: Set up Python 3.11
       uses: actions/setup-python@v5
       with:
-        python-version: 3.9
+        python-version: 3.11
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ]
+        python-version: [ '3.9', '3.10', '3.11', '3.12' ]
 
     name: Python ${{ matrix.python-version }}
     steps:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ exclude: '^docs/conf.py'
 
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v5.0.0
   hooks:
   - id: trailing-whitespace
   - id: check-added-large-files
@@ -33,7 +33,7 @@ repos:
 
 - repo: https://github.com/astral-sh/ruff-pre-commit
   # Ruff version.
-  rev: v0.6.8
+  rev: v0.8.2
   hooks:
     - id: ruff
       args: [--fix, --exit-non-zero-on-fix]

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,16 @@
 # Changelog
 
-## Version 0.3.0
+## Version 0.4.0
+
+- chore: Remove Python 3.8 (EOL).
+- precommit: Replace docformatter with ruff's formatter.
+
+## Version 0.3.2
+
+- Functionality to iterate over samples and cells.
+- Explicitly mention that slicing defaults to TileDB's behavior, inclusive of upper bounds.
+
+## Version 0.3.0 - 0.3.1
 
 This version introduces major improvements to matrix handling, storage, and performance, including support for multiple matrices in H5AD/AnnData workflows and optimizations for ingestion and querying.
 

diff --git a/setup.cfg b/setup.cfg
@@ -41,7 +41,7 @@ package_dir =
     =src
 
 # Require a min/specific Python version (comma-separated conditions)
-python_requires = >=3.8
+python_requires = >=3.9
 
 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
 # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in

diff --git a/src/cellarr/CellArrDataset.py b/src/cellarr/CellArrDataset.py
@@ -27,7 +27,6 @@
         print(result1)
 """
 
-import os
 from functools import lru_cache
 from typing import List, Sequence, Union
 
@@ -42,6 +41,58 @@
 __license__ = "MIT"
 
 
+class CellArrSampleIterator:
+    """Sample iterator to a :py:class:`~cellarr.CellArrDataset` object."""
+
+    def __init__(self, obj: "CellArrDataset") -> None:
+        """Initialize the iterator.
+
+        Args:
+            obj:
+                Source object to iterate.
+        """
+        self._obj = obj
+        self._current_index = 0
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self._current_index < self._obj.get_number_of_samples():
+            iter_row_index = self._obj.get_sample_metadata_index()[self._current_index]
+
+            iter_slice = self._obj.get_cells_for_sample(self._current_index)
+            self._current_index += 1
+            return (iter_row_index, iter_slice)
+
+        raise StopIteration
+
+
+class CellArrCellIterator:
+    """Cell iterator to a :py:class:`~cellarr.CellArrDataset` object."""
+
+    def __init__(self, obj: "CellArrDataset") -> None:
+        """Initialize the iterator.
+
+        Args:
+            obj:
+                Source object to iterate.
+        """
+        self._obj = obj
+        self._current_index = 0
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self._current_index < self._obj.get_number_of_cells():
+            iter_slice = self._obj[self._current_index, :]
+            self._current_index += 1
+            return (self._current_index, iter_slice)
+
+        raise StopIteration
+
+
 class CellArrDataset:
     """A class that represent a collection of cells and their associated metadata in a TileDB backed store."""
 
@@ -63,17 +114,22 @@ def __init__(
                 Usually the ``output_path`` from the
                 :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
 
-            assay_group:
+                You may provide any tiledb compatible base path (e.g. local
+                directory, S3, minio etc.).
+
+            assay_tiledb_group:
                 TileDB group containing the assay matrices.
 
                 If the provided build process was used, the matrices are stored
                 in the "assay" TileDB group.
 
-                May be an empty string to specify no group.
+                May be an empty string or `None` to specify no group. This is
+                mostly for backwards compatibility of cellarr builds for versions
+                before 0.3.
 
             assay_uri:
                 Relative path to matrix store.
-                Must be in tiledb group specified by ``assay_group``.
+                Must be in tiledb group specified by ``assay_tiledb_group``.
 
             gene_annotation_uri:
                 Relative path to gene annotation store.
@@ -87,10 +143,6 @@ def __init__(
             config:
                 Custom TileDB configuration. If None, defaults will be used.
         """
-
-        if not os.path.isdir(dataset_path):
-            raise ValueError("'dataset_path' is not a directory.")
-
         if config is None:
             config = tiledb.Config()
 
@@ -192,6 +244,10 @@ def get_cell_subset(self, subset: Union[slice, tiledb.QueryCondition], columns=N
             self._cell_metadata_tdb, subset=subset, columns=columns, primary_key_column_name="cellarr_sample"
         )
 
+    def get_number_of_cells(self) -> int:
+        """Get number of cells."""
+        return self._cell_metadata_tdb.nonempty_domain()[0][1] + 1
+
     ####
     ## Subset methods for the `gene_annotation` TileDB file.
     ####
@@ -275,6 +331,10 @@ def get_gene_subset(self, subset: Union[slice, List[str], tiledb.QueryCondition]
             self._gene_annotation_tdb, subset=subset, columns=columns, primary_key_column_name="cellarr_gene_index"
         )
 
+    def get_number_of_features(self) -> int:
+        """Get number of features."""
+        return self._gene_annotation_tdb.nonempty_domain()[0][1] + 1
+
     ####
     ## Subset methods for the `sample_metadata` TileDB file.
     ####
@@ -337,6 +397,20 @@ def get_sample_subset(self, subset: Union[slice, tiledb.QueryCondition], columns
             self._sample_metadata_tdb, subset=subset, columns=columns, primary_key_column_name="cellarr_sample"
         )
 
+    def get_number_of_samples(self) -> int:
+        """Get number of samples."""
+        return self._sample_metadata_tdb.nonempty_domain()[0][1] + 1
+
+    @lru_cache(maxsize=128)
+    def get_sample_metadata_index(self) -> List[str]:
+        """Get index of the ``sample_metadata`` store.
+
+        Returns:
+            List of unique sample names.
+        """
+        res = qtd.get_a_column(self._sample_metadata_tdb, "cellarr_sample")
+        return res["cellarr_sample"].tolist()
+
     ####
     ## Subset methods for the `matrix` TileDB file.
     ####
@@ -464,6 +538,9 @@ def __getitem__(
                 the rows (or cells) to retain, while the second entry specifies the
                 columns (or features/genes) to retain, based on their names or indices.
 
+        Note:
+            Slices are inclusive of the upper bounds. This is the default TileDB behavior.
+
         Raises:
             ValueError:
                 If too many or too few slices provided.
@@ -573,3 +650,15 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.__del__()
+
+    ####
+    ## Iterators
+    ####
+
+    def itersamples(self) -> CellArrSampleIterator:
+        """Iterator over samples."""
+        return CellArrSampleIterator(self)
+
+    def itercells(self) -> CellArrCellIterator:
+        """Iterator over cells."""
+        return CellArrCellIterator(self)
diff --git a/tests/test_query.py b/tests/test_query.py
@@ -107,3 +107,16 @@ def test_query_cellarrdataset():
 
     assert cd.get_cells_for_sample(0).to_anndata().shape == (adata1.shape[0], 1000)
     assert cd.get_cells_for_sample(1).to_anndata().shape == (adata2.shape[0], 1000)
+
+    sample_count = 0
+    obs = [adata1, adata2]
+    for sample, sample_chunk in cd.itersamples():
+        assert len(sample_chunk.cell_metadata) == obs[sample_count].shape[0]
+        sample_count += 1
+
+    assert sample_count == 2
+
+    cell_count = 0
+    for cell, cell_chunk in cd.itercells():
+        cell_count += 1
+    assert cell_count == 1100