Skip to content

Commit 2ecde9c

Browse files
committed
Close sessions properly after usage
The session created during initialisation of the container was never properly closed. Up to py3.12, this unclosed session was garbage collected because it was unreferenced. With py3.13, however, such sessions are no longer garbage collected and thus remain open, resulting in an open file descriptor on `pack.idx` for each initialisation of the container. This commit fixes the problem by keeping track of the session that initialises the container in `_container_session`. We also rename `_session` to `_operation_session` to make the distinction between the two session types clearer.
1 parent 73f14e3 commit 2ecde9c

File tree

5 files changed

+89
-80
lines changed

5 files changed

+89
-80
lines changed

disk_objectstore/container.py

Lines changed: 71 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pathlib import Path
1818
from typing import TYPE_CHECKING, overload
1919

20+
from sqlalchemy.engine import Connection, Engine
2021
from sqlalchemy.orm.session import Session
2122
from sqlalchemy.sql import func
2223
from sqlalchemy.sql.expression import delete, select, text, update
@@ -119,8 +120,13 @@ def __init__(self, folder: str | Path) -> None:
119120
:param folder: the path to a folder that will host this object-store container.
120121
"""
121122
self._folder = Path(folder).resolve()
122-
# Will be populated by the _get_session function
123-
self._session: Session | None = None
123+
# This session is used to send read/write operations to the
124+
# database. It can be reused but also closed anytime the operation has
125+
# finished.
126+
self._operation_session: Session | None = None
127+
# This session is alive after initialisation and will be only closed
128+
# when the container is closed.
129+
self._container_session: Session | None = None
124130

125131
# These act as caches and will be populated by the corresponding properties
126132
# IMPORTANT! IF YOU ADD MORE, REMEMBER TO CLEAR THEM IN `init_container()`!
@@ -133,9 +139,25 @@ def get_folder(self) -> Path:
133139

134140
def close(self) -> None:
135141
"""Close open files (in particular, the connection to the SQLite DB)."""
136-
if self._session is not None:
137-
self._session.close()
138-
self._session = None
142+
if self._operation_session is not None:
143+
binding = self._operation_session.bind
144+
self._operation_session.close()
145+
if isinstance(binding, Engine):
146+
binding.dispose()
147+
elif isinstance(binding, Connection):
148+
binding.invalidate()
149+
binding.close()
150+
self._operation_session = None
151+
152+
if self._container_session is not None:
153+
binding = self._container_session.bind
154+
self._container_session.close()
155+
if isinstance(binding, Engine):
156+
binding.dispose()
157+
elif isinstance(binding, Connection):
158+
binding.invalidate()
159+
binding.close()
160+
self._container_session = None
139161

140162
def __enter__(self) -> Container:
141163
"""Return a context manager that will close the session when exiting the context."""
@@ -180,42 +202,29 @@ def _get_config_file(self) -> Path:
180202
"""Return the path to the container config file."""
181203
return self._folder / "config.json"
182204

183-
@overload
184-
def _get_session(
185-
self, create: bool = False, raise_if_missing: Literal[True] = True
186-
) -> Session:
187-
...
205+
def _get_container_session(self) -> Session:
206+
"""Return the container session to connect to the pack-index SQLite DB.
188207
189-
@overload
190-
def _get_session(
191-
self, create: bool = False, raise_if_missing: Literal[False] = False
192-
) -> Session | None:
193-
...
208+
This session should not be closed until the container has been closed.
209+
"""
210+
if self._container_session is None:
211+
self._container_session = get_session(
212+
self._get_pack_index_path(),
213+
create=True,
214+
)
215+
return self._container_session
194216

195-
def _get_session(
196-
self, create: bool = False, raise_if_missing: bool = False
197-
) -> Session | None:
198-
"""Return a new session to connect to the pack-index SQLite DB.
217+
def _get_operation_session(self) -> Session:
218+
"""Return an operation session to access the SQLite file.
199219
200-
:param create: if True, creates the sqlite file and schema.
201-
:param raise_if_missing: ignored if create==True. If create==False, and the index file
202-
is missing, either raise an exception (FileNotFoundError) if this flag is True, or return None
220+
This session can be reused if not closed.
203221
"""
204-
return get_session(
205-
self._get_pack_index_path(),
206-
create=create,
207-
raise_if_missing=raise_if_missing,
208-
)
209-
210-
def _get_cached_session(self) -> Session:
211-
"""Return the SQLAlchemy session to access the SQLite file,
212-
reusing the same one."""
213-
# We want to catch both if it's missing, and if it's None
214-
# the latter means that in the previous run the pack file was missing
215-
# but maybe by now it has been created!
216-
if self._session is None:
217-
self._session = self._get_session(create=False, raise_if_missing=True)
218-
return self._session
222+
if self._operation_session is None:
223+
self._operation_session = get_session(
224+
self._get_pack_index_path(),
225+
create=False,
226+
)
227+
return self._operation_session
219228

220229
def _get_loose_path_from_hashkey(self, hashkey: str) -> Path:
221230
"""Return the path of a loose object on disk containing the data of a given hash key.
@@ -332,6 +341,7 @@ def init_container(
332341
raise ValueError(f'Unknown hash type "{hash_type}"')
333342

334343
if clear:
344+
self.close()
335345
if self._folder.exists():
336346
shutil.rmtree(self._folder)
337347

@@ -391,7 +401,7 @@ def init_container(
391401
]:
392402
os.makedirs(folder)
393403

394-
self._get_session(create=True)
404+
self._get_container_session()
395405

396406
def _get_repository_config(self) -> dict[str, int | str]:
397407
"""Return the repository config."""
@@ -584,7 +594,7 @@ def _get_objects_stream_meta_generator( # pylint: disable=too-many-branches,too
584594
# Currently ordering in the DB (it's ordered across all packs, but this should not be
585595
# a problem as we then split them by pack). To be checked, performance-wise, if it's better
586596
# to order in python instead
587-
session = self._get_cached_session()
597+
session = self._get_operation_session()
588598

589599
obj_reader: StreamReadBytesType
590600

@@ -729,18 +739,18 @@ def _get_objects_stream_meta_generator( # pylint: disable=too-many-branches,too
729739
# If they are not, the object does not exist.
730740
if loose_not_found:
731741
# IMPORTANT. I need to close the session (and flush the
732-
# self._session cache) to refresh the DB, otherwise since I am
742+
# self._operation_session cache) to refresh the DB, otherwise since I am
733743
# reading in WAL mode, I will be keeping to read from the "old"
734744
# state of the DB.
735745
# Note that this is an expensive operation!
736746
# This means that asking for non-existing objects will be
737747
# slow.
738-
if self._session is not None:
739-
self._session.close()
740-
self._session = None
748+
if self._operation_session is not None:
749+
self._operation_session.close()
750+
self._operation_session = None
741751

742752
packs = defaultdict(list)
743-
session = self._get_cached_session()
753+
session = self._get_operation_session()
744754
if len(loose_not_found) <= self._MAX_CHUNK_ITERATE_LENGTH:
745755
for chunk in chunk_iterator(
746756
loose_not_found, size=self._IN_SQL_MAX_LENGTH
@@ -1069,7 +1079,7 @@ def count_objects(self) -> ObjectCount:
10691079
In particular, it returns the number of loose objects,
10701080
of packed objects, and the number of pack files."""
10711081

1072-
number_packed = self._get_cached_session().scalar(
1082+
number_packed = self._get_operation_session().scalar(
10731083
select(func.count()).select_from(Obj)
10741084
)
10751085
return ObjectCount(
@@ -1122,7 +1132,7 @@ def get_total_size(self) -> TotalSize:
11221132
"""
11231133
retval = {}
11241134

1125-
session = self._get_cached_session()
1135+
session = self._get_operation_session()
11261136
# COALESCE is used to return 0 if there are no results, rather than None
11271137
# SQL's COALESCE returns the first non-null result
11281138
retval["total_size_packed"] = session.scalar(
@@ -1227,7 +1237,7 @@ def list_all_objects(self) -> Iterator[str]:
12271237
loose_objects = set(self._list_loose())
12281238

12291239
# Let us initialise a session
1230-
session = self._get_cached_session()
1240+
session = self._get_operation_session()
12311241

12321242
# This variable stored the last PK that we saw. We are assuming that PKs are always positive integers.
12331243
# NOTE: We don't use limit+offset, but a filter on the last PK being > than the last PK seen.
@@ -1368,7 +1378,7 @@ def pack_all_loose( # pylint: disable=too-many-locals,too-many-branches,too-man
13681378

13691379
loose_objects = set(self._list_loose())
13701380
pack_int_id = self._get_pack_id_to_write_to()
1371-
session = self._get_cached_session()
1381+
session = self._get_operation_session()
13721382

13731383
# I first skip all loose hashkeys that already exist in the pack.
13741384
# Packing should be performed by a single process at a given time as a
@@ -1640,7 +1650,7 @@ def add_streamed_objects_to_pack( # pylint: disable=too-many-locals, too-many-b
16401650
# without affecting the original list, and it's from the end so it's fast
16411651
working_stream_list = list(stream_list[::-1])
16421652
pack_int_id = self._get_pack_id_to_write_to()
1643-
session = self._get_cached_session()
1653+
session = self._get_operation_session()
16441654

16451655
if no_holes:
16461656
if callback:
@@ -1916,6 +1926,10 @@ def add_objects_to_pack( # pylint: disable=too-many-arguments
19161926
19171927
:return: a list of object hash keys
19181928
"""
1929+
if not self.is_initialised:
1930+
raise ValueError(
1931+
"Invalid use of function, please first initialise the container."
1932+
)
19191933
stream_list: list[StreamSeekBytesType] = [
19201934
io.BytesIO(content) for content in content_list
19211935
]
@@ -1981,7 +1995,7 @@ def _vacuum(self) -> None:
19811995
"""
19821996
# VACUUM cannot be performed from within a transaction
19831997
# see: https://github.com/sqlalchemy/sqlalchemy/discussions/6959
1984-
session = self._get_cached_session()
1998+
session = self._get_operation_session()
19851999
session.execute(text("COMMIT"))
19862000
session.execute(text("VACUUM"))
19872001
# ensure sqlalchemy knows to open a new transaction for the next execution
@@ -2074,7 +2088,7 @@ def clean_storage( # pylint: disable=too-many-branches,too-many-locals
20742088
# Force reload of the session to get the most up-to-date packed objects
20752089
self.close()
20762090

2077-
session = self._get_cached_session()
2091+
session = self._get_operation_session()
20782092
# I search now for all loose hash keys that exist also in the packs
20792093
existing_packed_hashkeys = []
20802094
if len(loose_objects) <= self._MAX_CHUNK_ITERATE_LENGTH:
@@ -2169,7 +2183,7 @@ def import_objects( # pylint: disable=too-many-locals,too-many-statements,too-m
21692183
# see issue #94.
21702184
# NOTE: I need to wrap in the `yield_first_element` iterator since it returns a list of lists
21712185
sorted_packed = yield_first_element(
2172-
self._get_cached_session().execute(
2186+
self._get_operation_session().execute(
21732187
text("SELECT hashkey FROM db_object ORDER BY hashkey")
21742188
)
21752189
)
@@ -2331,7 +2345,7 @@ def import_objects( # pylint: disable=too-many-locals,too-many-statements,too-m
23312345

23322346
# Since I called the `add_objects_to_pack` without committing (gives a boost for performance),
23332347
# I need now to commit to save what I've been doing.
2334-
self._get_cached_session().commit()
2348+
self._get_operation_session().commit()
23352349

23362350
return old_new_obj_hashkey_mapping
23372351

@@ -2406,7 +2420,7 @@ def callback(self, action, value):
24062420
invalid_sizes = []
24072421
overlapping = []
24082422

2409-
session = self._get_cached_session()
2423+
session = self._get_operation_session()
24102424

24112425
if callback:
24122426
# If we have a callback, compute the total count of objects in this pack
@@ -2511,7 +2525,7 @@ def validate(self, callback: Callable | None = None) -> ValidationIssues:
25112525
if callback:
25122526
callback(action="close", value=None)
25132527

2514-
session = self._get_cached_session()
2528+
session = self._get_operation_session()
25152529

25162530
all_pack_ids = sorted(
25172531
{res[0] for res in session.execute(select(Obj.pack_id).distinct())}
@@ -2585,7 +2599,7 @@ def delete_objects(self, hashkeys: list[str]) -> list[str | Any]:
25852599
# No loose object: it's OK
25862600
pass
25872601

2588-
session = self._get_cached_session()
2602+
session = self._get_operation_session()
25892603

25902604
# Operate in chunks, due to the SQLite limits
25912605
# (see comment above the definition of self._IN_SQL_MAX_LENGTH)
@@ -2676,7 +2690,7 @@ def repack_pack( # pylint: disable=too-many-branches,too-many-statements,too-ma
26762690
self._REPACK_PACK_ID, allow_repack_pack=True
26772691
).exists(), f"The repack pack '{self._REPACK_PACK_ID}' already exists, probably a previous repacking aborted?"
26782692

2679-
session = self._get_cached_session()
2693+
session = self._get_operation_session()
26802694

26812695
one_object_in_pack = session.execute(
26822696
select(Obj.id).where(Obj.pack_id == pack_id).limit(1)

disk_objectstore/database.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Models for the container index file (SQLite DB)."""
22
from pathlib import Path
3-
from typing import Optional
43

54
from sqlalchemy import Boolean, Column, Integer, String, create_engine, event
65
from sqlalchemy.orm import declarative_base, sessionmaker
@@ -31,19 +30,13 @@ class Obj(Base): # pylint: disable=too-few-public-methods
3130
) # integer ID of the pack in which this entry is stored
3231

3332

34-
def get_session(
35-
path: Path, create: bool = False, raise_if_missing: bool = False
36-
) -> Optional[Session]:
33+
def get_session(path: Path, create: bool = False) -> Session:
3734
"""Return a new session to connect to the pack-index SQLite DB.
3835
3936
:param create: if True, creates the sqlite file and schema.
40-
:param raise_if_missing: ignored if create==True. If create==False, and the index file
41-
is missing, either raise an exception (FileNotFoundError) if this flag is True, or return None
4237
"""
4338
if not create and not path.exists():
44-
if raise_if_missing:
45-
raise FileNotFoundError("Pack index does not exist")
46-
return None
39+
raise FileNotFoundError("Pack index does not exist")
4740

4841
engine = create_engine(f"sqlite:///{path}", future=True)
4942

tests/concurrent_tests/periodic_worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def main(
316316
)
317317
print(f"Exists Loose: {loose_path.exists()}")
318318
session = (
319-
container._get_cached_session() # pylint: disable=protected-access
319+
container._get_operation_session() # pylint: disable=protected-access
320320
)
321321
stmt = select(
322322
Obj.pack_id,

tests/test_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def test_backup(temp_container, temp_dir, remote, verbosity):
223223
if verbosity:
224224
args += [f"--verbosity={verbosity}"]
225225

226-
result = CliRunner().invoke(cli.backup, args, obj=obj)
226+
result = CliRunner().invoke(cli.backup, args, obj=obj, catch_exceptions=False)
227227

228228
assert result.exit_code == 0
229229
assert path.exists()

0 commit comments

Comments
 (0)