Skip to content

Commit b379dcd

Browse files
committed
Create staggered hierarchy for CellArr dataset for flexibility for power users
This will allow to operate on tiledb arrays directly
1 parent 36ae287 commit b379dcd

File tree

1 file changed

+192
-80
lines changed

1 file changed

+192
-80
lines changed

src/cellarr/CellArrDataset.py

Lines changed: 192 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
print(result1)
2828
"""
2929

30+
from collections.abc import Mapping
3031
from functools import lru_cache
3132
from typing import List, Optional, Sequence, Union
3233

@@ -93,99 +94,56 @@ def __next__(self):
9394
raise StopIteration
9495

9596

96-
class CellArrDataset:
97-
"""A class that represent a collection of cells and their associated metadata in a TileDB backed store."""
97+
98+
class _CellArrDatasetBase:
99+
"""
100+
Base class for CellArr dataset. This does not manage tiledb arrays and will not close them on __del__.
101+
Can we even abstract away tiledb.Array here so we can support np.ndarray directly? I expect that
102+
we use too much of the tiledb API for that but we may want to explore that option.
103+
104+
This is a nice-to-have for creating CellArr datasets that combine data that are not within the same prefix,
105+
e.g. when running a pipeline that modifies the data matrices but the metadata never changes. This even allows for
106+
having metadata in memory and cell data streamed from disk. Note that this is a power user interface and we should
107+
not provide support, i.e. power users must know what they are doing and operate at their own risk (hence the leading underscore).
108+
"""
98109

99110
def __init__(
    self,
    assays: Union[tiledb.Array, Sequence[tiledb.Array], dict[str, tiledb.Array]],
    gene_annotations: tiledb.Array,
    cell_metadata: tiledb.Array,
    sample_metadata: tiledb.Array,
):
    """Initialize the dataset from already opened TileDB arrays.

    The arrays are stored as given; this class neither opens nor closes them.

    Args:
        assays:
            The assay matrices. Either a single array, a sequence of
            arrays, or a mapping of assay name to array.

            For a single array or a sequence, each assay is named after
            the last path component of its ``uri``.

        gene_annotations:
            Array holding the gene annotations.

        cell_metadata:
            Array holding the cell metadata.

        sample_metadata:
            Array holding the sample metadata.

    Raises:
        RuntimeError:
            If an assay matrix does not match the metadata extents
            (raised by ``_validate``).
    """
    if isinstance(assays, tiledb.Array):
        # Wrap the array that was passed in, not the ``tiledb.Array`` class itself.
        assays = [assays]

    if isinstance(assays, Mapping):
        self._matrix_tdb = dict(assays)
    else:
        # Strip trailing slashes first so "path/to/counts/" is named "counts" and not "".
        self._matrix_tdb = {assay.uri.rstrip("/").rsplit("/", 1)[-1]: assay for assay in assays}

    self._gene_annotation_tdb = gene_annotations
    self._cell_metadata_tdb = cell_metadata
    self._sample_metadata_tdb = sample_metadata

    self._validate()
173128

174129
def _validate(self):
    """Check that every array is read-only and every assay matrix matches the metadata.

    The upper bound of the first dimension of the cell metadata and of the
    gene annotation arrays is compared against the upper bounds of the first
    and second dimension of each assay matrix.

    Raises:
        RuntimeError:
            If an assay matrix has different upper bounds than the metadata.
    """
    num_cells = self._cell_metadata_tdb.nonempty_domain()[0][1]
    num_rows = self._gene_annotation_tdb.nonempty_domain()[0][1]

    for mname, marray in self._matrix_tdb.items():
        self._validate_read_only(marray, mname)
        dom = marray.nonempty_domain()
        if dom[0][1] != num_cells or dom[1][1] != num_rows:
            raise RuntimeError(f"Matrix {mname} has incorrect dimensions")

    # Pass a name so that a failure message identifies the offending array.
    self._validate_read_only(self._gene_annotation_tdb, "gene_annotation")
    self._validate_read_only(self._cell_metadata_tdb, "cell_metadata")
    self._validate_read_only(self._sample_metadata_tdb, "sample_metadata")
142+
143+
@staticmethod
def _validate_read_only(array: tiledb.Array, name: Optional[str] = ""):
    """Ensure that ``array`` was not opened in a writable mode.

    Args:
        array:
            The array to check; only its ``iswritable`` attribute is read.

        name:
            Label used in the error message to identify the array.

    Raises:
        AssertionError:
            If ``array.iswritable`` is truthy.
    """
    # Raise explicitly instead of using ``assert`` so the check is not stripped
    # under ``python -O``; the exception type is kept for existing handlers.
    if array.iswritable:
        raise AssertionError(f"Arrays must be read-only but found writable array {name}: {array}")
146+
189147

190148
####
191149
## Subset methods for the `cell_metadata` TileDB file.
@@ -595,7 +553,6 @@ def __repr__(self) -> str:
595553
"""
596554
output = f"{type(self).__name__}(number_of_rows={self.shape[0]}"
597555
output += f", number_of_columns={self.shape[1]}"
598-
output += ", at path=" + self._dataset_path
599556

600557
output += ")"
601558
return output
@@ -609,7 +566,6 @@ def __str__(self) -> str:
609566

610567
output += f"number_of_rows: {self.shape[0]}\n"
611568
output += f"number_of_columns: {self.shape[1]}\n"
612-
output += f"path: '{self._dataset_path}'\n"
613569

614570
return output
615571

@@ -668,3 +624,159 @@ def itersamples(self) -> CellArrSampleIterator:
668624
def itercells(self) -> CellArrCellIterator:
669625
"""Iterator over cells."""
670626
return CellArrCellIterator(self)
627+
628+
629+
class _CellArrDatasetUri(_CellArrDatasetBase):
    """
    An extension of _CellArrDatasetBase that manages the underlying tiledb arrays. Unlike the base class,
    this accepts only uris, not tiledb.Array objects, and will manage the tiledb.Array objects. That means,
    it will open them (in read-only mode) upon creation and close them upon deletion via override of __del__.

    This is a nice-to-have for creating CellArr datasets that combine data that are not within the same prefix,
    e.g. when running a pipeline that modifies the data matrices but the metadata never changes.
    Note that this is a power user interface and we should not provide support, i.e. power users must know what
    they are doing and operate at their own risk (hence the leading underscore).
    """

    def __init__(
        self,
        assay_uris: Union[str, Sequence[str], Mapping[str, str]],
        gene_annotation_uri: str,
        cell_metadata_uri: str,
        sample_metadata_uri: str,
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
    ):
        """Open all arrays in read-only mode and initialize the dataset.

        Args:
            assay_uris:
                Uri(s) of the assay matrices. Either a single uri, a
                sequence of uris, or a mapping of assay name to uri.

                For a single uri or a sequence, each assay is named after
                the last path component of its uri.

            gene_annotation_uri:
                Uri of the gene annotation store.

            cell_metadata_uri:
                Uri of the cell metadata store.

            sample_metadata_uri:
                Uri of the sample metadata store.

            config_or_context:
                Custom TileDB configuration or context.
                If None, default TileDB Config will be used.

        Raises:
            TypeError:
                If ``config_or_context`` is neither a TileDB config nor a context.
        """
        # Created before anything can fail so ``__del__`` is safe on a
        # partially initialised instance.
        self._managed_arrays = []

        if config_or_context is None:
            config_or_context = tiledb.Config()

        if isinstance(config_or_context, tiledb.Config):
            self._ctx = tiledb.Ctx(config_or_context)
        elif isinstance(config_or_context, tiledb.Ctx):
            self._ctx = config_or_context
        else:
            # TypeError is a subclass of Exception, so existing handlers keep working.
            raise TypeError("'config_or_context' must be either TileDB config or a context object.")

        if isinstance(assay_uris, str):
            assay_uris = [assay_uris]
        if not isinstance(assay_uris, Mapping):
            # Strip trailing slashes first so "path/to/counts/" is named "counts" and not "".
            assay_uris = {uri.rstrip("/").rsplit("/", 1)[-1]: uri for uri in assay_uris}

        def _open(uri):
            array = tiledb.open(uri=uri, mode="r", ctx=self._ctx)
            # Remember every array we opened so it can be closed again.
            self._managed_arrays.append(array)
            return array

        try:
            assays = {name: _open(uri) for name, uri in assay_uris.items()}

            super().__init__(
                assays=assays,
                gene_annotations=_open(gene_annotation_uri),
                cell_metadata=_open(cell_metadata_uri),
                sample_metadata=_open(sample_metadata_uri),
            )
        except Exception:
            # Do not leave arrays open when a later open or the validation fails.
            self._close_managed_arrays()
            raise

    def _close_managed_arrays(self):
        """Close, exactly once, every array that this instance opened."""
        # ``getattr`` covers instances on which ``__init__`` never ran.
        managed = getattr(self, "_managed_arrays", None)
        while managed:
            managed.pop().close()

    def __del__(self):
        self._close_managed_arrays()
681+
682+
683+
class CellArrDataset(_CellArrDatasetUri):
    """A class that represent a collection of cells and their associated metadata in a TileDB backed store."""

    def __init__(
        self,
        dataset_path: str,
        assay_tiledb_group: Optional[str] = "assays",
        assay_uri: Union[str, List[str]] = "counts",
        gene_annotation_uri: str = "gene_annotation",
        cell_metadata_uri: str = "cell_metadata",
        sample_metadata_uri: str = "sample_metadata",
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
    ):
        """Initialize a ``CellArrDataset``.

        Args:
            dataset_path:
                Path to the directory containing the TileDB stores.
                Usually the ``output_path`` from the
                :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.

                You may provide any tiledb compatible base path (e.g. local
                directory, S3, minio etc.).

            assay_tiledb_group:
                TileDB group containing the assay matrices.

                If the provided build process was used, the matrices are stored
                in the "assays" TileDB group.

                May be an empty string or `None` to specify no group. This is
                mostly for backwards compatibility of cellarr builds for versions
                before 0.3.

            assay_uri:
                Relative path to matrix store.
                Must be in tiledb group specified by ``assay_tiledb_group``.

            gene_annotation_uri:
                Relative path to gene annotation store.

            cell_metadata_uri:
                Relative path to cell metadata store.

            sample_metadata_uri:
                Relative path to sample metadata store.

            config_or_context:
                Custom TileDB configuration or context.
                If None, default TileDB Config will be used.
        """
        self._dataset_path = dataset_path

        if isinstance(assay_uri, str):
            assay_uri = [assay_uri]

        # Only add the group segment when a group is given. An empty segment
        # would produce "<dataset_path>//<name>", which is not the same
        # location on stores that treat paths as literal keys.
        if assay_tiledb_group:
            assay_root = f"{dataset_path}/{assay_tiledb_group}"
        else:
            assay_root = dataset_path

        super().__init__(
            assay_uris={name: f"{assay_root}/{name}" for name in assay_uri},
            gene_annotation_uri=f"{dataset_path}/{gene_annotation_uri}",
            cell_metadata_uri=f"{dataset_path}/{cell_metadata_uri}",
            sample_metadata_uri=f"{dataset_path}/{sample_metadata_uri}",
            config_or_context=config_or_context,
        )

    ####
    ## Printing.
    ####

    def __repr__(self) -> str:
        """
        Returns:
            A string representation.
        """
        output = f"{type(self).__name__}(number_of_rows={self.shape[0]}"
        output += f", number_of_columns={self.shape[1]}"
        output += f", at path={self._dataset_path}"

        output += ")"
        return output

    def __str__(self) -> str:
        """
        Returns:
            A pretty-printed string containing the contents of this object.
        """
        output = f"class: {type(self).__name__}\n"

        output += f"number_of_rows: {self.shape[0]}\n"
        output += f"number_of_columns: {self.shape[1]}\n"
        output += f"path: '{self._dataset_path}'\n"

        return output
782+

0 commit comments

Comments
 (0)