27
27
print(result1)
28
28
"""
29
29
30
+ from collections .abc import Mapping
30
31
from functools import lru_cache
31
32
from typing import List , Optional , Sequence , Union
32
33
@@ -93,99 +94,56 @@ def __next__(self):
93
94
raise StopIteration
94
95
95
96
96
- class CellArrDataset :
97
- """A class that represent a collection of cells and their associated metadata in a TileDB backed store."""
97
+
98
+ class _CellArrDatasetBase :
99
+ """
100
+ Base class for CellArr dataset. This does not manage tiledb arrays and will not close them on __del__.
101
+ Can we even abstract away tiledb.Array here so we can support np.ndarray directly? I expect that
102
+ we use too much of the tiledb API for that but we may want to explore that option.
103
+
104
+ This is a nice-to-have for creating CellArr datasets that combine data that are not within the same prefix,
105
+ e.g. when running a pipeline that modifies the data matrices but the metadata never changes. This even allows for
106
+ having metadata in memory and cell data streamed from disk. Note that this is a power user interface and we should
107
+ not provide support, i.e. power users must know what they are doing and operate at their own risk (hence the leading underscore).
108
+ """
98
109
99
110
def __init__(
    self,
    assays: Union[tiledb.Array, Sequence[tiledb.Array], dict[str, tiledb.Array]],
    gene_annotations: tiledb.Array,
    cell_metadata: tiledb.Array,
    sample_metadata: tiledb.Array,
):
    """Initialize the dataset from already opened TileDB arrays.

    The arrays are stored as given; this initializer does not open them.

    Args:
        assays:
            The assay matrices. Either a single array, a sequence of
            arrays, or a mapping from assay name to array.

            When no mapping is given, each assay is named after the last
            path component of its ``uri``.

        gene_annotations:
            Array holding the gene annotations.

        cell_metadata:
            Array holding the cell metadata.

        sample_metadata:
            Array holding the sample metadata.
    """
    if isinstance(assays, tiledb.Array):
        # Wrap the instance that was passed in, not the ``tiledb.Array`` class.
        assays = [assays]

    if isinstance(assays, Mapping):
        # Copy so later changes to the caller's mapping do not affect us.
        self._matrix_tdb = dict(assays)
    else:
        # Strip trailing slashes first, otherwise "path/counts/" is named "".
        self._matrix_tdb = {assay.uri.rstrip("/").split("/")[-1]: assay for assay in assays}

    self._gene_annotation_tdb = gene_annotations
    self._cell_metadata_tdb = cell_metadata
    self._sample_metadata_tdb = sample_metadata

    self._validate()
173
128
174
129
def _validate(self):
    """Check that the stored arrays are consistent with each other.

    Every assay matrix must be read-only and the upper bounds of its
    non-empty domain must equal those of the cell metadata (first
    dimension) and the gene annotations (second dimension). The three
    metadata arrays must be read-only as well.

    Raises:
        RuntimeError:
            If an assay matrix does not match the metadata bounds.
    """
    cell_upper = self._cell_metadata_tdb.nonempty_domain()[0][1]
    gene_upper = self._gene_annotation_tdb.nonempty_domain()[0][1]

    for label, matrix in self._matrix_tdb.items():
        self._validate_read_only(matrix, label)
        extent = matrix.nonempty_domain()
        if not (extent[0][1] == cell_upper and extent[1][1] == gene_upper):
            raise RuntimeError(f"Matrix {label} has incorrect dimensions")

    # Metadata arrays are checked after the matrices, in this fixed order.
    for metadata in (
        self._gene_annotation_tdb,
        self._cell_metadata_tdb,
        self._sample_metadata_tdb,
    ):
        self._validate_read_only(metadata)
142
+
143
@staticmethod
def _validate_read_only(array: "tiledb.Array", name: Optional[str] = ""):
    """Ensure that ``array`` is not writable.

    Args:
        array:
            The array to check; only its ``iswritable`` attribute is read.

        name:
            Label for the array, used in the error message.

    Raises:
        AssertionError:
            If the array is writable.
    """
    # Raise explicitly instead of using ``assert`` so the check still runs
    # under ``python -O``; the exception type is kept for existing callers.
    if array.iswritable:
        raise AssertionError(f"Arrays must be read-only but found writable array {name}: {array}")
146
+
189
147
190
148
####
191
149
## Subset methods for the `cell_metadata` TileDB file.
@@ -595,7 +553,6 @@ def __repr__(self) -> str:
595
553
"""
596
554
output = f"{ type (self ).__name__ } (number_of_rows={ self .shape [0 ]} "
597
555
output += f", number_of_columns={ self .shape [1 ]} "
598
- output += ", at path=" + self ._dataset_path
599
556
600
557
output += ")"
601
558
return output
@@ -609,7 +566,6 @@ def __str__(self) -> str:
609
566
610
567
output += f"number_of_rows: { self .shape [0 ]} \n "
611
568
output += f"number_of_columns: { self .shape [1 ]} \n "
612
- output += f"path: '{ self ._dataset_path } '\n "
613
569
614
570
return output
615
571
@@ -668,3 +624,159 @@ def itersamples(self) -> CellArrSampleIterator:
668
624
def itercells(self) -> CellArrCellIterator:
    """Iterator over cells.

    Returns:
        A ``CellArrCellIterator`` constructed from this dataset.
    """
    return CellArrCellIterator(self)
627
+
628
+
629
class _CellArrDatasetUri(_CellArrDatasetBase):
    """
    An extension of _CellArrDatasetBase that manages the underlying tiledb arrays. Unlike the base class,
    this accepts only uris, not tiledb.Array objects, and will manage the tiledb.Array objects. That means,
    it will open them (in read-only mode) upon creation and close them upon deletion via override of __del__.

    This is a nice-to-have for creating CellArr datasets that combine data that are not within the same prefix,
    e.g. when running a pipeline that modifies the data matrices but the metadata never changes.
    Note that this is a power user interface and we should not provide support, i.e. power users must know what
    they are doing and operate at their own risk (hence the leading underscore).
    """

    def __init__(
        self,
        assay_uris: Union[str, Sequence[str], Mapping[str, str]],
        gene_annotation_uri: str,
        cell_metadata_uri: str,
        sample_metadata_uri: str,
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
    ):
        """Open all arrays in read mode and initialize the base class with them.

        Args:
            assay_uris:
                URI(s) of the assay matrices. Either a single URI, a
                sequence of URIs, or a mapping from assay name to URI.

                When no mapping is given, each assay is named after the
                last path component of its URI.

            gene_annotation_uri:
                URI of the gene annotation store.

            cell_metadata_uri:
                URI of the cell metadata store.

            sample_metadata_uri:
                URI of the sample metadata store.

            config_or_context:
                Custom TileDB configuration or context.
                If None, default TileDB Config will be used.

        Raises:
            TypeError:
                If ``config_or_context`` is neither a TileDB config nor a
                context object.
        """
        if config_or_context is None:
            config_or_context = tiledb.Config()

        if isinstance(config_or_context, tiledb.Config):
            self._ctx = tiledb.Ctx(config_or_context)
        elif isinstance(config_or_context, tiledb.Ctx):
            self._ctx = config_or_context
        else:
            # TypeError is still an Exception, so handlers for the previous
            # generic ``Exception`` keep working.
            raise TypeError("'config_or_context' must be either TileDB config or a context object.")

        if isinstance(assay_uris, str):
            assay_uris = [assay_uris]
        if not isinstance(assay_uris, Mapping):
            # Strip trailing slashes first, otherwise "path/counts/" is named "".
            assay_uris = {uri.rstrip("/").split("/")[-1]: uri for uri in assay_uris}

        def _open(uri: str) -> tiledb.Array:
            return tiledb.open(uri=uri, mode="r", ctx=self._ctx)

        assays = {name: _open(uri) for name, uri in assay_uris.items()}

        super().__init__(
            assays=assays,
            gene_annotations=_open(gene_annotation_uri),
            cell_metadata=_open(cell_metadata_uri),
            sample_metadata=_open(sample_metadata_uri),
        )

    def __del__(self):
        """Close every array this instance holds.

        ``__del__`` also runs when ``__init__`` raised before the array
        attributes were assigned, so each attribute is looked up defensively.
        """
        for attribute in ("_gene_annotation_tdb", "_cell_metadata_tdb", "_sample_metadata_tdb"):
            array = getattr(self, attribute, None)
            if array is not None:
                array.close()

        for array in getattr(self, "_matrix_tdb", {}).values():
            array.close()
681
+
682
+
683
class CellArrDataset(_CellArrDatasetUri):
    """A class that represents a collection of cells and their associated metadata in a TileDB backed store."""

    def __init__(
        self,
        dataset_path: str,
        assay_tiledb_group: Optional[str] = "assays",
        assay_uri: Union[str, List[str]] = "counts",
        gene_annotation_uri: str = "gene_annotation",
        cell_metadata_uri: str = "cell_metadata",
        sample_metadata_uri: str = "sample_metadata",
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
    ):
        """Initialize a ``CellArrDataset``.

        Args:
            dataset_path:
                Path to the directory containing the TileDB stores.
                Usually the ``output_path`` from the
                :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.

                You may provide any tiledb compatible base path (e.g. local
                directory, S3, minio etc.).

            assay_tiledb_group:
                TileDB group containing the assay matrices.

                If the provided build process was used, the matrices are stored
                in the "assays" TileDB group.

                May be an empty string or `None` to specify no group. This is
                mostly for backwards compatibility of cellarr builds for versions
                before 0.3.

            assay_uri:
                Relative path to matrix store.
                Must be in tiledb group specified by ``assay_tiledb_group``.

            gene_annotation_uri:
                Relative path to gene annotation store.

            cell_metadata_uri:
                Relative path to cell metadata store.

            sample_metadata_uri:
                Relative path to sample metadata store.

            config_or_context:
                Custom TileDB configuration or context.
                If None, default TileDB Config will be used.
        """
        self._dataset_path = dataset_path

        if isinstance(assay_uri, str):
            assay_uri = [assay_uri]

        def _join(*parts):
            # Skip empty or None segments so that a missing group does not
            # produce a double slash such as "<dataset_path>//counts".
            return "/".join([dataset_path, *(part for part in parts if part)])

        assay_uris = {name: _join(assay_tiledb_group, name) for name in assay_uri}
        super().__init__(
            assay_uris=assay_uris,
            gene_annotation_uri=_join(gene_annotation_uri),
            cell_metadata_uri=_join(cell_metadata_uri),
            sample_metadata_uri=_join(sample_metadata_uri),
            config_or_context=config_or_context,
        )

    ####
    ## Printing.
    ####

    def __repr__(self) -> str:
        """
        Returns:
            A string representation.
        """
        output = f"{type(self).__name__}(number_of_rows={self.shape[0]}"
        output += f", number_of_columns={self.shape[1]}"
        output += ", at path=" + self._dataset_path

        output += ")"
        return output

    def __str__(self) -> str:
        """
        Returns:
            A pretty-printed string containing the contents of this object.
        """
        output = f"class: {type(self).__name__}\n"

        output += f"number_of_rows: {self.shape[0]}\n"
        output += f"number_of_columns: {self.shape[1]}\n"
        output += f"path: '{self._dataset_path}'\n"

        return output
782
+
0 commit comments