Skip to content

Commit a85d116

Browse files
committed
ADD validator - RM from dataset
1 parent 2e160ab commit a85d116

File tree

11 files changed

+1351
-791
lines changed

11 files changed

+1351
-791
lines changed

Diff for: server/src/scimodom/api/management.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
from scimodom.config import get_config
1010
from scimodom.services.assembly import LiftOverError
1111

12-
from scimodom.services.dataset import (
13-
get_dataset_service,
12+
from scimodom.services.dataset import get_dataset_service
13+
from scimodom.services.validator import (
1414
SelectionNotFoundError,
1515
DatasetImportError,
1616
DatasetHeaderError,

Diff for: server/src/scimodom/cli/assembly.py

+10-17
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from scimodom.services.assembly import (
44
AssemblyVersionError,
5+
AssemblyAbortedError,
56
get_assembly_service,
67
)
78

@@ -28,20 +29,20 @@ def add_assembly(**kwargs) -> None:
2829
return
2930
try:
3031
assembly_service.prepare_assembly_for_version(assembly_id)
31-
except AssemblyVersionError:
32+
except FileExistsError:
3233
click.secho(
33-
"Cannot create assembly for this database version... Aborting!",
34+
"Assembly directory already exists... Aborting!",
3435
fg="red",
3536
)
3637
return
37-
except FileExistsError:
38+
except AssemblyVersionError as exc:
3839
click.secho(
39-
"Assembly directory already exists... Aborting!",
40+
f"Cannot create assembly for this database version: {exc}",
4041
fg="red",
4142
)
4243
return
43-
except Exception as exc:
44-
click.secho(f"Failed to prepare assembly: {exc}. Aborting!", fg="red")
44+
except AssemblyAbortedError as exc:
45+
click.secho(f"Failed to prepare assembly: {exc}", fg="red")
4546
return
4647
click.secho("... done!", fg="green")
4748
else:
@@ -56,16 +57,8 @@ def add_assembly(**kwargs) -> None:
5657
if c not in ["y", "Y"]:
5758
return
5859
try:
59-
assembly_id = assembly_service.add_assembly(taxa_id, assembly_name)
60-
except FileExistsError:
61-
click.secho(
62-
"Directory exists, but not assembly... check for data corruption. Aborting!",
63-
fg="red",
64-
)
65-
return
66-
except Exception as exc:
67-
click.secho(
68-
f"Failed to add alternative assembly: {exc}. Aborting!", fg="red"
69-
)
60+
assembly_id = assembly_service.get_assembly_by_name(taxa_id, assembly_name)
61+
except AssemblyAbortedError as exc:
62+
click.secho(f"Failed to add alternative assembly: {exc}", fg="red")
7063
return
7164
click.secho(f"... done! Assembly ID is {assembly_id}.", fg="green")

Diff for: server/src/scimodom/cli/utilities.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ def validate_dataset_title(ctx, param, value):
5151
def add_assembly_to_template_if_none(organism, assembly_service):
5252
click.secho("Checking if assembly ID is defined...", fg="green")
5353
if organism.assembly_id is None:
54-
assembly_id = assembly_service.add_assembly(
55-
organism.taxa_id, organism.assembly_name
54+
assembly_id = assembly_service.get_assembly_by_name(
55+
organism.taxa_id, organism.assembly_name, fail_safe=False
5656
)
5757
click.secho(
5858
f"Updating project metadata template with assembly ID '{assembly_id}'... ",

Diff for: server/src/scimodom/services/assembly.py

+88-62
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
from functools import cache
44
from posixpath import join as urljoin
5-
from typing import Any, Sequence
5+
from typing import Any, Sequence, TextIO
66

77
from sqlalchemy import select, func
88
from sqlalchemy.exc import NoResultFound
@@ -25,13 +25,20 @@
2525

2626

2727
class AssemblyNotFoundError(Exception):
28-
"""Exception handling for a non-existing Assembly."""
28+
"""Exception for handling a non-existing Assembly."""
2929

3030
pass
3131

3232

3333
class AssemblyVersionError(Exception):
34-
"""Exception handling for Assembly version mismatch."""
34+
"""Exception for handling an Assembly version mismatch."""
35+
36+
pass
37+
38+
39+
class AssemblyAbortedError(Exception):
40+
"""Exception raised when preparing or adding an assembly is aborted
41+
because of a general error, e.g. a failure while streaming a request."""
3542

3643
pass
3744

@@ -86,6 +93,36 @@ def get_assembly_by_id(self, assembly_id: int) -> Assembly:
8693
except NoResultFound:
8794
raise AssemblyNotFoundError(f"No such assembly with ID: {assembly_id}.")
8895

96+
def get_assembly_by_name(
97+
self, taxa_id: int, assembly_name: str, fail_safe: bool = True
98+
) -> Assembly:
99+
"""Retrieve an assembly by name for a given organism. If it is not found,
100+
add it to the database, unless fail_safe is False, in which case
AssemblyNotFoundError is raised instead.
101+
NOTE: the return value is an Assembly instance, not an assembly ID.
102+
:param taxa_id: Taxonomy ID
103+
:type taxa_id: int
104+
:param assembly_name: Assembly name
105+
:type assembly_name: str
106+
:param fail_safe: If True (default), add the assembly when it is not found.
If False, raise AssemblyNotFoundError when it is not found.
107+
:type fail_safe: bool
108+
:returns: Newly created or existing assembly
109+
:rtype: Assembly
110+
111+
:raises: AssemblyNotFoundError if the assembly is not found and fail_safe
is False. AssemblyAbortedError may propagate from _add_assembly when
fail_safe is True.
112+
"""
113+
# NOTE(review): scalar_one() is also documented to raise when more than one
# row matches; that case is not handled here.
try:
114+
assembly = self._session.execute(
115+
select(Assembly).filter_by(taxa_id=taxa_id, name=assembly_name)
116+
).scalar_one()
117+
return assembly
118+
except NoResultFound:
119+
# No matching row: either register the assembly or report it as missing.
if fail_safe:
120+
return self._add_assembly(taxa_id, assembly_name)
121+
else:
122+
raise AssemblyNotFoundError(
123+
f"No such assembly '{assembly_name}' for organism '{taxa_id}'."
124+
)
125+
89126
def get_assemblies_by_taxa(self, taxa_id: int) -> Sequence[Assembly]:
90127
"""Retrieve all assemblies for a given organism.
91128
@@ -158,13 +195,13 @@ def _yield_chroms():
158195

159196
return list(_yield_chroms())
160197

161-
def liftover(
198+
def create_lifted_file(
162199
self,
163200
assembly: Assembly,
164201
raw_file: str,
165202
unmapped_file: str | None = None,
166203
threshold: float = ImportLimits.LIFTOVER.max,
167-
) -> str:
204+
) -> TextIO:
168205
"""Liftover records to current assembly.
169206
170207
:param assembly: Assembly instance
@@ -175,8 +212,8 @@ def liftover(
175212
:type unmapped_file: str | None
176213
:param threshold: Threshold for raising LiftOverError
177214
:type threshold: float
178-
:returns: Files pointing to the liftedOver features
179-
:rtype: str
215+
:returns: File handle pointing to the liftedOver features
216+
:rtype: TextIO
180217
"""
181218
if self.is_latest_assembly(assembly):
182219
raise AssemblyVersionError("Cannot liftover for latest assembly.")
@@ -207,60 +244,7 @@ def liftover(
207244
f"{unmapped_lines} records could not be mapped... "
208245
"Contact the system administrator if you have questions."
209246
)
210-
return lifted_file
211-
212-
def add_assembly(self, taxa_id: int, assembly_name: str) -> int:
213-
"""Add an alternative assembly to the database.
214-
215-
If assembly exists, nothing is done.
216-
217-
:param taxa_id: Taxonomy ID
218-
:type taxa_id: int
219-
:param assembly_name: Assembly name
220-
:type assembly_name: str
221-
:returns: Newly created or existing assembly ID
222-
:rtype: int
223-
"""
224-
try:
225-
assembly = self._session.execute(
226-
select(Assembly).filter_by(taxa_id=taxa_id, name=assembly_name)
227-
).scalar_one()
228-
return assembly.id
229-
except NoResultFound:
230-
pass
231-
232-
if self._file_service.check_if_assembly_exists(taxa_id, assembly_name):
233-
raise FileExistsError(
234-
f"Directory exists, but assembly '{assembly_name}' does not exist!"
235-
)
236-
237-
chain_file_name = self._get_chain_file_name(
238-
assembly_name, self.get_name_for_version(taxa_id)
239-
)
240-
url = self._get_ensembl_chain_file_url(taxa_id, chain_file_name)
241-
242-
logger.info(f"Setting up a new assembly for {assembly_name}...")
243-
try:
244-
with self._file_service.create_chain_file(
245-
taxa_id, chain_file_name, assembly_name
246-
) as fh:
247-
self._web_service.stream_request_to_file(url, fh)
248-
version_nums = (
249-
self._session.execute(select(func.distinct(Assembly.version)))
250-
.scalars()
251-
.all()
252-
)
253-
version_num = gen_short_uuid(Identifiers.ASSEMBLY.length, version_nums)
254-
assembly = Assembly(
255-
name=assembly_name, taxa_id=taxa_id, version=version_num
256-
)
257-
self._session.add(assembly)
258-
self._session.commit()
259-
return assembly.id
260-
except Exception:
261-
self._session.rollback()
262-
self._file_service.delete_assembly(taxa_id, assembly_name)
263-
raise
247+
return self._file_service.open_file_for_reading(lifted_file)
264248

265249
def prepare_assembly_for_version(self, assembly_id: int) -> None:
266250
"""Prepare directories and files for the latest version.
@@ -288,14 +272,56 @@ def prepare_assembly_for_version(self, assembly_id: int) -> None:
288272
try:
289273
self._handle_gene_build(assembly)
290274
self._handle_release(assembly)
291-
except Exception:
275+
except AssemblyVersionError:
292276
self._file_service.delete_assembly(assembly.taxa_id, assembly.name)
293277
raise
278+
except Exception as exc:
279+
self._file_service.delete_assembly(assembly.taxa_id, assembly.name)
280+
raise AssemblyAbortedError(
281+
f"Adding assembly for ID '{assembly_id}' aborted."
282+
) from exc
294283

295284
@staticmethod
296285
def _get_chain_file_name(source_assembly_name, target_assembly_name):
297286
return f"{source_assembly_name}_to_{target_assembly_name}.chain.gz"
298287

288+
def _add_assembly(self, taxa_id: int, assembly_name: str) -> Assembly:
"""Add an alternative assembly: fetch its chain file and create the
database record.

:param taxa_id: Taxonomy ID
:type taxa_id: int
:param assembly_name: Assembly name
:type assembly_name: str
:returns: Newly created assembly
:rtype: Assembly

:raises: AssemblyAbortedError if files already exist for this assembly,
or if any step of the download/insert fails (original error is chained).
"""
289+
# Files on disk without a database record: refuse to continue.
if self._file_service.check_if_assembly_exists(taxa_id, assembly_name):
290+
raise AssemblyAbortedError(
291+
f"Suspected incomplete or inconsistent data: files were found on the "
292+
f"system for '{assembly_name}', but assembly does not exist in the database."
293+
)
294+
295+
# Chain file maps this assembly to the one returned by get_name_for_version
# (presumably the assembly of the current database version; confirm).
chain_file_name = self._get_chain_file_name(
296+
assembly_name, self.get_name_for_version(taxa_id)
297+
)
298+
url = self._get_ensembl_chain_file_url(taxa_id, chain_file_name)
299+
300+
logger.info(f"Setting up a new assembly for {assembly_name}...")
301+
try:
302+
# Stream the remote chain file into the newly created local file.
with self._file_service.create_chain_file(
303+
taxa_id, chain_file_name, assembly_name
304+
) as fh:
305+
self._web_service.stream_request_to_file(url, fh)
306+
# Existing version identifiers are handed to gen_short_uuid, presumably so
# the new identifier does not collide with them (confirm).
version_nums = (
307+
self._session.execute(select(func.distinct(Assembly.version)))
308+
.scalars()
309+
.all()
310+
)
311+
version_num = gen_short_uuid(Identifiers.ASSEMBLY.length, version_nums)
312+
assembly = Assembly(
313+
name=assembly_name, taxa_id=taxa_id, version=version_num
314+
)
315+
self._session.add(assembly)
316+
self._session.commit()
317+
return assembly
318+
# On any failure: roll back the session, remove the assembly files, and
# re-raise as AssemblyAbortedError with the original exception chained.
except Exception as exc:
319+
self._session.rollback()
320+
self._file_service.delete_assembly(taxa_id, assembly_name)
321+
raise AssemblyAbortedError(
322+
f"Adding assembly for '{assembly_name}' aborted."
323+
) from exc
324+
299325
def _get_ensembl_chain_file_url(self, taxa_id: int, chain_file_name):
300326
return urljoin(
301327
Ensembl.FTP.value,

Diff for: server/src/scimodom/services/data.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class NoDataRecords(Exception):
1717

1818
class DataService:
1919
def __init__(self, session: Session):
20-
self._db_session = session
20+
self._session = session
2121

2222
def get_by_dataset(
2323
self, datasets: Union[str, Dataset, List[Union[str, Dataset]]]
@@ -30,7 +30,7 @@ def get_by_dataset(
3030
.where(Data.dataset_id.in_(dataset_ids))
3131
)
3232
count = 0
33-
for record in self._db_session.execute(query).all():
33+
for record in self._session.execute(query).all():
3434
count += 1
3535
yield record[0]
3636
if count == 0:

0 commit comments

Comments
 (0)