17
17
from pathlib import Path
18
18
from typing import TYPE_CHECKING , overload
19
19
20
+ from sqlalchemy .engine import Connection , Engine
20
21
from sqlalchemy .orm .session import Session
21
22
from sqlalchemy .sql import func
22
23
from sqlalchemy .sql .expression import delete , select , text , update
@@ -119,8 +120,13 @@ def __init__(self, folder: str | Path) -> None:
119
120
:param folder: the path to a folder that will host this object-store container.
120
121
"""
121
122
self ._folder = Path (folder ).resolve ()
122
- # Will be populated by the _get_session function
123
- self ._session : Session | None = None
123
+ # This session is used to send read/write operations to the
124
+ # database. It can be reused but also closed anytime the operation has
125
+ # finished.
126
+ self ._operation_session : Session | None = None
127
+ # This session is alive after initialisation and will be only closed
128
+ # when the container is closed.
129
+ self ._container_session : Session | None = None
124
130
125
131
# These act as caches and will be populated by the corresponding properties
126
132
# IMPORTANT! IF YOU ADD MORE, REMEMBER TO CLEAR THEM IN `init_container()`!
@@ -133,9 +139,25 @@ def get_folder(self) -> Path:
133
139
134
140
def close (self ) -> None :
135
141
"""Close open files (in particular, the connection to the SQLite DB)."""
136
- if self ._session is not None :
137
- self ._session .close ()
138
- self ._session = None
142
+ if self ._operation_session is not None :
143
+ binding = self ._operation_session .bind
144
+ self ._operation_session .close ()
145
+ if isinstance (binding , Engine ):
146
+ binding .dispose ()
147
+ elif isinstance (binding , Connection ):
148
+ binding .invalidate ()
149
+ binding .close ()
150
+ self ._operation_session = None
151
+
152
+ if self ._container_session is not None :
153
+ binding = self ._container_session .bind
154
+ self ._container_session .close ()
155
+ if isinstance (binding , Engine ):
156
+ binding .dispose ()
157
+ elif isinstance (binding , Connection ):
158
+ binding .invalidate ()
159
+ binding .close ()
160
+ self ._container_session = None
139
161
140
162
def __enter__ (self ) -> Container :
141
163
"""Return a context manager that will close the session when exiting the context."""
@@ -180,42 +202,29 @@ def _get_config_file(self) -> Path:
180
202
"""Return the path to the container config file."""
181
203
return self ._folder / "config.json"
182
204
183
- @overload
184
- def _get_session (
185
- self , create : bool = False , raise_if_missing : Literal [True ] = True
186
- ) -> Session :
187
- ...
205
+ def _get_container_session (self ) -> Session :
206
+ """Return the container session to connect to the pack-index SQLite DB.
188
207
189
- @overload
190
- def _get_session (
191
- self , create : bool = False , raise_if_missing : Literal [False ] = False
192
- ) -> Session | None :
193
- ...
208
+ This session should not be closed until the container has been closed.
209
+ """
210
+ if self ._container_session is None :
211
+ self ._container_session = get_session (
212
+ self ._get_pack_index_path (),
213
+ create = True ,
214
+ )
215
+ return self ._container_session
194
216
195
- def _get_session (
196
- self , create : bool = False , raise_if_missing : bool = False
197
- ) -> Session | None :
198
- """Return a new session to connect to the pack-index SQLite DB.
217
+ def _get_operation_session (self ) -> Session :
218
+ """Return an operation session to access the SQLite file.
199
219
200
- :param create: if True, creates the sqlite file and schema.
201
- :param raise_if_missing: ignored if create==True. If create==False, and the index file
202
- is missing, either raise an exception (FileNotFoundError) if this flag is True, or return None
220
+ This session can be reused if not closed.
203
221
"""
204
- return get_session (
205
- self ._get_pack_index_path (),
206
- create = create ,
207
- raise_if_missing = raise_if_missing ,
208
- )
209
-
210
- def _get_cached_session (self ) -> Session :
211
- """Return the SQLAlchemy session to access the SQLite file,
212
- reusing the same one."""
213
- # We want to catch both if it's missing, and if it's None
214
- # the latter means that in the previous run the pack file was missing
215
- # but maybe by now it has been created!
216
- if self ._session is None :
217
- self ._session = self ._get_session (create = False , raise_if_missing = True )
218
- return self ._session
222
+ if self ._operation_session is None :
223
+ self ._operation_session = get_session (
224
+ self ._get_pack_index_path (),
225
+ create = False ,
226
+ )
227
+ return self ._operation_session
219
228
220
229
def _get_loose_path_from_hashkey (self , hashkey : str ) -> Path :
221
230
"""Return the path of a loose object on disk containing the data of a given hash key.
@@ -332,6 +341,7 @@ def init_container(
332
341
raise ValueError (f'Unknown hash type "{ hash_type } "' )
333
342
334
343
if clear :
344
+ self .close ()
335
345
if self ._folder .exists ():
336
346
shutil .rmtree (self ._folder )
337
347
@@ -391,7 +401,7 @@ def init_container(
391
401
]:
392
402
os .makedirs (folder )
393
403
394
- self ._get_session ( create = True )
404
+ self ._get_container_session ( )
395
405
396
406
def _get_repository_config (self ) -> dict [str , int | str ]:
397
407
"""Return the repository config."""
@@ -584,7 +594,7 @@ def _get_objects_stream_meta_generator( # pylint: disable=too-many-branches,too
584
594
# Currently ordering in the DB (it's ordered across all packs, but this should not be
585
595
# a problem as we then split them by pack). To be checked, performance-wise, if it's better
586
596
# to order in python instead
587
- session = self ._get_cached_session ()
597
+ session = self ._get_operation_session ()
588
598
589
599
obj_reader : StreamReadBytesType
590
600
@@ -729,18 +739,18 @@ def _get_objects_stream_meta_generator( # pylint: disable=too-many-branches,too
729
739
# If they are not, the object does not exist.
730
740
if loose_not_found :
731
741
# IMPORTANT. I need to close the session (and flush the
732
- # self._session cache) to refresh the DB, otherwise since I am
742
+ # self._operation_session cache) to refresh the DB, otherwise since I am
733
743
# reading in WAL mode, I will keep reading from the "old"
734
744
# state of the DB.
735
745
# Note that this is an expensive operation!
736
746
# This means that asking for non-existing objects will be
737
747
# slow.
738
- if self ._session is not None :
739
- self ._session .close ()
740
- self ._session = None
748
+ if self ._operation_session is not None :
749
+ self ._operation_session .close ()
750
+ self ._operation_session = None
741
751
742
752
packs = defaultdict (list )
743
- session = self ._get_cached_session ()
753
+ session = self ._get_operation_session ()
744
754
if len (loose_not_found ) <= self ._MAX_CHUNK_ITERATE_LENGTH :
745
755
for chunk in chunk_iterator (
746
756
loose_not_found , size = self ._IN_SQL_MAX_LENGTH
@@ -1069,7 +1079,7 @@ def count_objects(self) -> ObjectCount:
1069
1079
In particular, it returns the number of loose objects,
1070
1080
of packed objects, and the number of pack files."""
1071
1081
1072
- number_packed = self ._get_cached_session ().scalar (
1082
+ number_packed = self ._get_operation_session ().scalar (
1073
1083
select (func .count ()).select_from (Obj )
1074
1084
)
1075
1085
return ObjectCount (
@@ -1122,7 +1132,7 @@ def get_total_size(self) -> TotalSize:
1122
1132
"""
1123
1133
retval = {}
1124
1134
1125
- session = self ._get_cached_session ()
1135
+ session = self ._get_operation_session ()
1126
1136
# COALESCE is used to return 0 if there are no results, rather than None
1127
1137
# SQL's COALESCE returns the first non-null result
1128
1138
retval ["total_size_packed" ] = session .scalar (
@@ -1227,7 +1237,7 @@ def list_all_objects(self) -> Iterator[str]:
1227
1237
loose_objects = set (self ._list_loose ())
1228
1238
1229
1239
# Let us initialise a session
1230
- session = self ._get_cached_session ()
1240
+ session = self ._get_operation_session ()
1231
1241
1232
1242
# This variable stores the last PK that we saw. We are assuming that PKs are always positive integers.
1233
1243
# NOTE: We don't use limit+offset, but a filter on the last PK being > than the last PK seen.
@@ -1368,7 +1378,7 @@ def pack_all_loose( # pylint: disable=too-many-locals,too-many-branches,too-man
1368
1378
1369
1379
loose_objects = set (self ._list_loose ())
1370
1380
pack_int_id = self ._get_pack_id_to_write_to ()
1371
- session = self ._get_cached_session ()
1381
+ session = self ._get_operation_session ()
1372
1382
1373
1383
# I first skip all loose hashkeys that already exist in the pack.
1374
1384
# Packing should be performed by a single process at a given time as a
@@ -1640,7 +1650,7 @@ def add_streamed_objects_to_pack( # pylint: disable=too-many-locals, too-many-b
1640
1650
# without affecting the original list, and it's from the end so it's fast
1641
1651
working_stream_list = list (stream_list [::- 1 ])
1642
1652
pack_int_id = self ._get_pack_id_to_write_to ()
1643
- session = self ._get_cached_session ()
1653
+ session = self ._get_operation_session ()
1644
1654
1645
1655
if no_holes :
1646
1656
if callback :
@@ -1916,6 +1926,10 @@ def add_objects_to_pack( # pylint: disable=too-many-arguments
1916
1926
1917
1927
:return: a list of object hash keys
1918
1928
"""
1929
+ if not self .is_initialised :
1930
+ raise ValueError (
1931
+ "Invalid use of function, please first initialise the container."
1932
+ )
1919
1933
stream_list : list [StreamSeekBytesType ] = [
1920
1934
io .BytesIO (content ) for content in content_list
1921
1935
]
@@ -1981,7 +1995,7 @@ def _vacuum(self) -> None:
1981
1995
"""
1982
1996
# VACUUM cannot be performed from within a transaction
1983
1997
# see: https://github.com/sqlalchemy/sqlalchemy/discussions/6959
1984
- session = self ._get_cached_session ()
1998
+ session = self ._get_operation_session ()
1985
1999
session .execute (text ("COMMIT" ))
1986
2000
session .execute (text ("VACUUM" ))
1987
2001
# ensure sqlalchemy knows to open a new transaction for the next execution
@@ -2074,7 +2088,7 @@ def clean_storage( # pylint: disable=too-many-branches,too-many-locals
2074
2088
# Force reload of the session to get the most up-to-date packed objects
2075
2089
self .close ()
2076
2090
2077
- session = self ._get_cached_session ()
2091
+ session = self ._get_operation_session ()
2078
2092
# I search now for all loose hash keys that exist also in the packs
2079
2093
existing_packed_hashkeys = []
2080
2094
if len (loose_objects ) <= self ._MAX_CHUNK_ITERATE_LENGTH :
@@ -2169,7 +2183,7 @@ def import_objects( # pylint: disable=too-many-locals,too-many-statements,too-m
2169
2183
# see issue #94.
2170
2184
# NOTE: I need to wrap in the `yield_first_element` iterator since it returns a list of lists
2171
2185
sorted_packed = yield_first_element (
2172
- self ._get_cached_session ().execute (
2186
+ self ._get_operation_session ().execute (
2173
2187
text ("SELECT hashkey FROM db_object ORDER BY hashkey" )
2174
2188
)
2175
2189
)
@@ -2331,7 +2345,7 @@ def import_objects( # pylint: disable=too-many-locals,too-many-statements,too-m
2331
2345
2332
2346
# Since I called the `add_objects_to_pack` without committing (gives a boost for performance),
2333
2347
# I need now to commit to save what I've been doing.
2334
- self ._get_cached_session ().commit ()
2348
+ self ._get_operation_session ().commit ()
2335
2349
2336
2350
return old_new_obj_hashkey_mapping
2337
2351
@@ -2406,7 +2420,7 @@ def callback(self, action, value):
2406
2420
invalid_sizes = []
2407
2421
overlapping = []
2408
2422
2409
- session = self ._get_cached_session ()
2423
+ session = self ._get_operation_session ()
2410
2424
2411
2425
if callback :
2412
2426
# If we have a callback, compute the total count of objects in this pack
@@ -2511,7 +2525,7 @@ def validate(self, callback: Callable | None = None) -> ValidationIssues:
2511
2525
if callback :
2512
2526
callback (action = "close" , value = None )
2513
2527
2514
- session = self ._get_cached_session ()
2528
+ session = self ._get_operation_session ()
2515
2529
2516
2530
all_pack_ids = sorted (
2517
2531
{res [0 ] for res in session .execute (select (Obj .pack_id ).distinct ())}
@@ -2585,7 +2599,7 @@ def delete_objects(self, hashkeys: list[str]) -> list[str | Any]:
2585
2599
# No loose object: it's OK
2586
2600
pass
2587
2601
2588
- session = self ._get_cached_session ()
2602
+ session = self ._get_operation_session ()
2589
2603
2590
2604
# Operate in chunks, due to the SQLite limits
2591
2605
# (see comment above the definition of self._IN_SQL_MAX_LENGTH)
@@ -2676,7 +2690,7 @@ def repack_pack( # pylint: disable=too-many-branches,too-many-statements,too-ma
2676
2690
self ._REPACK_PACK_ID , allow_repack_pack = True
2677
2691
).exists (), f"The repack pack '{ self ._REPACK_PACK_ID } ' already exists, probably a previous repacking aborted?"
2678
2692
2679
- session = self ._get_cached_session ()
2693
+ session = self ._get_operation_session ()
2680
2694
2681
2695
one_object_in_pack = session .execute (
2682
2696
select (Obj .id ).where (Obj .pack_id == pack_id ).limit (1 )
0 commit comments