Skip to content

Commit

Permalink
Merge branch 'master' into slurm-runs
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche committed Dec 12, 2024
2 parents 5d6302e + d679fa5 commit 908e763
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 15 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pypi-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ jobs:

steps:
- uses: actions/checkout@v4
- name: Set up Python 3.9
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: 3.11
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pypi-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ]
python-version: [ '3.9', '3.10', '3.11', '3.12' ]

name: Python ${{ matrix.python-version }}
steps:
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ exclude: '^docs/conf.py'

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
Expand Down Expand Up @@ -33,7 +33,7 @@ repos:

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.6.8
rev: v0.8.2
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
Expand Down
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Changelog

## Version 0.3.0
## Version 0.4.0

- chore: Remove Python 3.8 (EOL).
- precommit: Replace docformatter with ruff's formatter.

## Version 0.3.2

- Functionality to iterate over samples and cells.
- Explicitly mention that slicing defaults to TileDB's behavior, inclusive of upper bounds.

## Version 0.3.0 - 0.3.1

This version introduces major improvements to matrix handling, storage, and performance, including support for multiple matrices in H5AD/AnnData workflows and optimizations for ingestion and querying.

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ package_dir =
=src

# Require a min/specific Python version (comma-separated conditions)
python_requires = >=3.8
python_requires = >=3.9

# Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
# Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
Expand Down
105 changes: 97 additions & 8 deletions src/cellarr/CellArrDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
print(result1)
"""

import os
from functools import lru_cache
from typing import List, Sequence, Union

Expand All @@ -42,6 +41,58 @@
__license__ = "MIT"


class CellArrSampleIterator:
    """Sample iterator to a :py:class:`~cellarr.CellArrDataset` object.

    Every step produces a ``(sample, cells)`` tuple: ``sample`` is the entry
    of the dataset's sample metadata index at the current position, and
    ``cells`` is the result of ``get_cells_for_sample`` for that position.
    """

    def __init__(self, obj: "CellArrDataset") -> None:
        """Initialize the iterator, positioned at the first sample.

        Args:
            obj:
                Source object to iterate.
        """
        self._obj = obj
        self._current_index = 0

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def __next__(self):
        """Return the next ``(sample, cells)`` tuple.

        Raises:
            StopIteration:
                Once the position reaches the number of samples.
        """
        position = self._current_index
        if position >= self._obj.get_number_of_samples():
            raise StopIteration

        sample_label = self._obj.get_sample_metadata_index()[position]
        sample_cells = self._obj.get_cells_for_sample(position)

        # Advance only after both lookups succeeded.
        self._current_index = position + 1
        return (sample_label, sample_cells)


class CellArrCellIterator:
    """Cell iterator to a :py:class:`~cellarr.CellArrDataset` object.

    Every step produces a ``(cell_index, cell_slice)`` tuple, where
    ``cell_slice`` is ``obj[cell_index, :]``.
    """

    def __init__(self, obj: "CellArrDataset") -> None:
        """Initialize the iterator, positioned at the first cell.

        Args:
            obj:
                Source object to iterate.
        """
        self._obj = obj
        self._current_index = 0

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def __next__(self):
        """Return the next ``(cell_index, cell_slice)`` tuple.

        Raises:
            StopIteration:
                Once the position reaches the number of cells.
        """
        cell_index = self._current_index
        if cell_index >= self._obj.get_number_of_cells():
            raise StopIteration

        cell_slice = self._obj[cell_index, :]
        self._current_index = cell_index + 1

        # Report the index that was actually sliced, not the already
        # incremented position; otherwise the first tuple would be
        # ``(1, obj[0, :])``.
        return (cell_index, cell_slice)


class CellArrDataset:
"""A class that represent a collection of cells and their associated metadata in a TileDB backed store."""

Expand All @@ -63,17 +114,22 @@ def __init__(
Usually the ``output_path`` from the
:py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
assay_group:
You may provide any tiledb compatible base path (e.g. local
directory, S3, minio etc.).
assay_tiledb_group:
TileDB group containing the assay matrices.
If the provided build process was used, the matrices are stored
in the "assay" TileDB group.
May be an empty string to specify no group.
May be an empty string or `None` to specify no group. This is
mostly for backwards compatibility of cellarr builds for versions
before 0.3.
assay_uri:
Relative path to matrix store.
Must be in tiledb group specified by ``assay_group``.
Must be in tiledb group specified by ``assay_tiledb_group``.
gene_annotation_uri:
Relative path to gene annotation store.
Expand All @@ -87,10 +143,6 @@ def __init__(
config:
Custom TileDB configuration. If None, defaults will be used.
"""

if not os.path.isdir(dataset_path):
raise ValueError("'dataset_path' is not a directory.")

if config is None:
config = tiledb.Config()

Expand Down Expand Up @@ -192,6 +244,10 @@ def get_cell_subset(self, subset: Union[slice, tiledb.QueryCondition], columns=N
self._cell_metadata_tdb, subset=subset, columns=columns, primary_key_column_name="cellarr_sample"
)

def get_number_of_cells(self) -> int:
"""Get number of cells."""
return self._cell_metadata_tdb.nonempty_domain()[0][1] + 1

####
## Subset methods for the `gene_annotation` TileDB file.
####
Expand Down Expand Up @@ -275,6 +331,10 @@ def get_gene_subset(self, subset: Union[slice, List[str], tiledb.QueryCondition]
self._gene_annotation_tdb, subset=subset, columns=columns, primary_key_column_name="cellarr_gene_index"
)

def get_number_of_features(self) -> int:
"""Get number of features."""
return self._gene_annotation_tdb.nonempty_domain()[0][1] + 1

####
## Subset methods for the `sample_metadata` TileDB file.
####
Expand Down Expand Up @@ -337,6 +397,20 @@ def get_sample_subset(self, subset: Union[slice, tiledb.QueryCondition], columns
self._sample_metadata_tdb, subset=subset, columns=columns, primary_key_column_name="cellarr_sample"
)

def get_number_of_samples(self) -> int:
"""Get number of samples."""
return self._sample_metadata_tdb.nonempty_domain()[0][1] + 1

@lru_cache(maxsize=128)
def get_sample_metadata_index(self) -> List[str]:
    """Get index of the ``sample_metadata`` store.

    Reads the ``cellarr_sample`` column of the sample metadata array and
    converts it to a list. The result is memoized by ``lru_cache``.

    Returns:
        List of unique sample names.
    """
    # NOTE(review): ``lru_cache`` on a method keys on ``self`` and keeps a
    # reference to the instance for as long as the entry is cached.
    column_name = "cellarr_sample"
    column_frame = qtd.get_a_column(self._sample_metadata_tdb, column_name)
    return column_frame[column_name].tolist()

####
## Subset methods for the `matrix` TileDB file.
####
Expand Down Expand Up @@ -464,6 +538,9 @@ def __getitem__(
the rows (or cells) to retain, while the second entry specifies the
columns (or features/genes) to retain, based on their names or indices.
Note:
Slices are inclusive of the upper bounds. This is the default TileDB behavior.
Raises:
ValueError:
If too many or too few slices provided.
Expand Down Expand Up @@ -573,3 +650,15 @@ def __enter__(self):

def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the ``with`` block by delegating cleanup to ``__del__``.

    The exception arguments are not inspected, and nothing is returned, so
    an exception raised inside the ``with`` block is not suppressed.
    """
    self.__del__()

####
## Iterators
####

def itersamples(self) -> CellArrSampleIterator:
    """Iterator over samples.

    Returns:
        A new :py:class:`CellArrSampleIterator` bound to this dataset.
    """
    sample_iterator = CellArrSampleIterator(self)
    return sample_iterator

def itercells(self) -> CellArrCellIterator:
    """Iterator over cells.

    Returns:
        A new :py:class:`CellArrCellIterator` bound to this dataset.
    """
    return CellArrCellIterator(self)
13 changes: 13 additions & 0 deletions tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,16 @@ def test_query_cellarrdataset():

assert cd.get_cells_for_sample(0).to_anndata().shape == (adata1.shape[0], 1000)
assert cd.get_cells_for_sample(1).to_anndata().shape == (adata2.shape[0], 1000)

sample_count = 0
obs = [adata1, adata2]
for sample, sample_chunk in cd.itersamples():
assert len(sample_chunk.cell_metadata) == obs[sample_count].shape[0]
sample_count += 1

assert sample_count == 2

cell_count = 0
for cell, cell_chunk in cd.itercells():
cell_count += 1
assert cell_count == 1100

0 comments on commit 908e763

Please sign in to comment.