|
| 1 | +from collections.abc import Sequence |
| 2 | +from logging import getLogger |
| 3 | +from pathlib import Path |
| 4 | +from typing import Any, Optional, Literal, get_args |
| 5 | + |
| 6 | +from sqlalchemy import select |
| 7 | +from sqlalchemy.orm import Session |
| 8 | + |
| 9 | +from scimodom.database.database import get_session |
| 10 | +from scimodom.database.models import Data, Association |
| 11 | +from scimodom.utils.operations import to_bedtool, _remove_extra_feature |
| 12 | +from scimodom.services.importer import get_bed_importer |
| 13 | +import scimodom.utils.utils as utils |
| 14 | + |
| 15 | +logger = getLogger(__name__) |
| 16 | + |
| 17 | + |
| 18 | +class FailedUploadError(Exception): |
| 19 | + pass |
| 20 | + |
| 21 | + |
| 22 | +class NoRecordsFoundError(Exception): |
| 23 | + pass |
| 24 | + |
| 25 | + |
| 26 | +class ComparisonService: |
| 27 | + """Utility class to handle dataset comparison. |
| 28 | + The EUFID(s) are assumed to be well-defined, |
| 29 | + avoiding repeated validation queries to the |
| 30 | + database. |
| 31 | +
|
| 32 | + :param session: SQLAlchemy ORM session |
| 33 | + :type session: Session |
| 34 | + :param reference_ids: Reference EUFID(s) |
| 35 | + :type reference_ids: list of str |
| 36 | + :param comparison_ids: EUFID(s) to compare |
| 37 | + :type comparison_ids: list of str |
| 38 | + :param query_operation: Comparison to perform: |
| 39 | + intersect, closest, or subtract (difference) |
| 40 | + :type query_operation: str |
| 41 | + :param is_strand: Perform strand-aware query |
| 42 | + :type is_strand: bool |
| 43 | + """ |
| 44 | + |
| 45 | + OPERATIONS = Literal["intersect", "closest", "subtract"] |
| 46 | + |
| 47 | + def __init__( |
| 48 | + self, |
| 49 | + session: Session, |
| 50 | + reference_ids: list[str], |
| 51 | + comparison_ids: list[str], |
| 52 | + query_operation: OPERATIONS, |
| 53 | + is_strand: bool, |
| 54 | + ) -> None: |
| 55 | + """Initializer method.""" |
| 56 | + self._session = session |
| 57 | + self._reference_ids = reference_ids |
| 58 | + self._comparison_ids = comparison_ids |
| 59 | + self._query_operation = query_operation |
| 60 | + self._is_strand = is_strand |
| 61 | + |
| 62 | + self._reference_records: Sequence[Any] |
| 63 | + self._comparison_records: Sequence[Any] |
| 64 | + |
| 65 | + operations = get_args(self.OPERATIONS) |
| 66 | + assert ( |
| 67 | + query_operation in operations |
| 68 | + ), f"Undefined '{query_operation}'. Allowed values are {operations}." |
| 69 | + |
| 70 | + @classmethod |
| 71 | + def from_upload( |
| 72 | + cls, |
| 73 | + session: Session, |
| 74 | + reference_ids: list[str], |
| 75 | + query_operation: str, |
| 76 | + is_strand: bool, |
| 77 | + upload: str | Path, |
| 78 | + ): |
| 79 | + """Provides ComparisonService factory to |
| 80 | + instantiate class upload instead of EUFID(s). |
| 81 | +
|
| 82 | + :param session: SQLAlchemy ORM session |
| 83 | + :type session: Session |
| 84 | + :param reference_ids: Reference EUFID(s) |
| 85 | + :type reference_ids: list of str |
| 86 | + :param upload: Uploaded dataset to compare |
| 87 | + :type upload: str | Path |
| 88 | + :param query_operation: Comparison to perform: |
| 89 | + intersect, closest, or subtract (difference) |
| 90 | + :type query_operation: str |
| 91 | + :param is_strand: Perform strand-aware query |
| 92 | + :type is_strand: bool |
| 93 | + """ |
| 94 | + upload_path = Path(upload) |
| 95 | + if not upload_path.is_file(): |
| 96 | + msg = f"No such file or directory: {upload_path.as_posix()}" |
| 97 | + raise FileNotFoundError(msg) |
| 98 | + try: |
| 99 | + db_records = ComparisonService.upload_file() |
| 100 | + except Exception as exc: |
| 101 | + # upload itself is "fail-safe", catch eveything else... |
| 102 | + msg = f"Failed to upload {upload_path.as_posix()}. " |
| 103 | + raise FailedUploadError(msg) |
| 104 | + records = [tuple([val for key, val in record.items()]) for record in db_records] |
| 105 | + # ... but records might be "skipped" for various reasons |
| 106 | + if not records: |
| 107 | + raise NoRecordsFoundError |
| 108 | + service = cls(session, reference_ids, [], query_operation, is_strand) |
| 109 | + service._comparison_records = records |
| 110 | + return service |
| 111 | + |
| 112 | + @staticmethod |
| 113 | + def upload_file(file_path: Path) -> list[dict[str, Any]]: |
| 114 | + """Upload bed-like file. |
| 115 | +
|
| 116 | + :param file_path: Path to file |
| 117 | + :type file_path: Path |
| 118 | + :return: Uploaded records |
| 119 | + :rtype: list of dict of {str: Any} |
| 120 | + """ |
| 121 | + importer = get_bed_importer(file_path) |
| 122 | + importer.parse_records() |
| 123 | + importer.close() |
| 124 | + return importer.get_buffer() |
| 125 | + |
| 126 | + def get_records(self) -> None: |
| 127 | + """Query the database for selected records.""" |
| 128 | + |
| 129 | + def _construct_query(idx): |
| 130 | + query = ( |
| 131 | + select( |
| 132 | + Data.chrom, |
| 133 | + Data.start, |
| 134 | + Data.end, |
| 135 | + Data.name, |
| 136 | + Data.score, |
| 137 | + Data.strand, |
| 138 | + Association.dataset_id, |
| 139 | + Data.coverage, |
| 140 | + Data.frequency, |
| 141 | + ) |
| 142 | + .join_from(Data, Association, Data.inst_association) |
| 143 | + .where(Association.dataset_id.in_(idx)) |
| 144 | + ) |
| 145 | + return query |
| 146 | + |
| 147 | + query = _construct_query(self._reference_ids) |
| 148 | + self._reference_records = self._session.execute(query).all() |
| 149 | + |
| 150 | + if self._comparison_ids: |
| 151 | + self._comparison_records = [] |
| 152 | + for idx in self._comparison_ids: |
| 153 | + query = _construct_query([idx]) |
| 154 | + self._comparison_records.append(self._session.execute(query).all()) |
| 155 | + |
| 156 | + # check for empty records? |
| 157 | + |
| 158 | + def compare_dataset(self): # -> |
| 159 | + """Execute comparison. |
| 160 | +
|
| 161 | + :return: Result form comparison |
| 162 | + :rtype: |
| 163 | + """ |
| 164 | + func = eval(f"_get_{self._query_operation}") |
| 165 | + records = func(self._reference_records, self._comparison_records) |
| 166 | + |
| 167 | + # TODO |
| 168 | + # from scimodom.utils.models import records_factory |
| 169 | + # op, strand = query_operation.split("S") |
| 170 | + # c_records = get_op(op)(a_records, b_records, s=eval(strand)) |
| 171 | + # records = [records_factory(op.capitalize(), r)._asdict() for r in c_records] |
| 172 | + |
| 173 | + return records |
| 174 | + |
| 175 | + def _get_intersect( |
| 176 | + a_records: Sequence[Any], |
| 177 | + b_records: Sequence[Any], |
| 178 | + s: bool = True, |
| 179 | + sorted: bool = True, |
| 180 | + ) -> list[Any]: |
| 181 | + """Wrapper for pybedtools.bedtool.BedTool.intersect |
| 182 | +
|
| 183 | + Relies on the behaviour of bedtools -wa -wb option: the first |
| 184 | + column after the complete -a record lists the file number |
| 185 | + from which the overlap came. |
| 186 | +
|
| 187 | + :param a_records: DB records (A features) |
| 188 | + :type a_records: Sequence (list of tuples) |
| 189 | + :param b_records: DB records (B features) |
| 190 | + :type b_records: Sequence (list of tuples) |
| 191 | + :param s: Force strandedness |
| 192 | + :type s: bool |
| 193 | + :param sorted: Invoked sweeping algorithm |
| 194 | + :type sorted: bool |
| 195 | + :returns: records |
| 196 | + :rtype: list of tuples |
| 197 | + """ |
| 198 | + |
| 199 | + # required options |
| 200 | + # write the original entry in A for each overlap |
| 201 | + wa: bool = True |
| 202 | + # write the original entry in B for each overlap |
| 203 | + wb: bool = True |
| 204 | + |
| 205 | + a_bedtool, b_bedtool = to_bedtool(a_records), to_bedtool( |
| 206 | + b_records, as_list=True |
| 207 | + ) |
| 208 | + bedtool = a_bedtool.intersect( |
| 209 | + b=[b.fn for b in b_bedtool], wa=wa, wb=wb, s=s, sorted=sorted |
| 210 | + ) |
| 211 | + stream = bedtool.each(_remove_extra_feature) |
| 212 | + records = [tuple(s.fields) for s in stream] |
| 213 | + return records |
| 214 | + |
| 215 | + def _get_closest( |
| 216 | + a_records: Sequence[Any], |
| 217 | + b_records: Sequence[Any], |
| 218 | + s: bool = True, |
| 219 | + sorted: bool = True, |
| 220 | + ) -> list[Any]: |
| 221 | + """Wrapper for pybedtools.bedtool.BedTool.closest |
| 222 | +
|
| 223 | + Relies on the behaviour of bedtools -io -t -mdb -D options: the first |
| 224 | + column after the complete -a record lists the file number |
| 225 | + from which the closest interval came. |
| 226 | +
|
| 227 | + :param a_records: DB records (A features) |
| 228 | + :type a_records: Sequence (list of tuples) |
| 229 | + :param b_records: DB records (B features) |
| 230 | + :type b_records: Sequence (list of tuples) |
| 231 | + :param s: Force strandedness |
| 232 | + :type s: bool |
| 233 | + :param sorted: Invoked sweeping algorithm |
| 234 | + :type sorted: bool |
| 235 | + :returns: records |
| 236 | + :rtype: list of tuples |
| 237 | + """ |
| 238 | + |
| 239 | + # required options |
| 240 | + # Ignore features in B that overlap A |
| 241 | + io: bool = True |
| 242 | + # Report all ties |
| 243 | + t: str = "all" |
| 244 | + # Report closest records among all databases |
| 245 | + mdb: str = "all" |
| 246 | + # Report distance with respect to A |
| 247 | + D: str = "a" |
| 248 | + |
| 249 | + a_bedtool, b_bedtool = to_bedtool(a_records), to_bedtool( |
| 250 | + b_records, as_list=True |
| 251 | + ) |
| 252 | + bedtool = a_bedtool.closest( |
| 253 | + b=[b.fn for b in b_bedtool], io=io, t=t, mdb=mdb, D=D, s=s, sorted=sorted |
| 254 | + ) |
| 255 | + stream = bedtool.each(_remove_extra_feature) |
| 256 | + records = [tuple(s.fields) for s in stream] |
| 257 | + |
| 258 | + # TODO |
| 259 | + # Reports “none” for chrom (?) and “-1” for all other fields (?) when a feature |
| 260 | + # is not found in B on the same chromosome as the feature in A. |
| 261 | + # Note that "start" (fields) is a string! |
| 262 | + # 9 + 0/1 + 1 |
| 263 | + # c_bedtool = c_bedtool.filter(lambda c: c.fields[(offset + filnum_idx + 1)] != "-1") |
| 264 | + |
| 265 | + return records |
| 266 | + |
| 267 | + def _get_subtract( |
| 268 | + a_records: Sequence[Any], |
| 269 | + b_records: Sequence[Any], |
| 270 | + s: bool = True, |
| 271 | + sorted: bool = True, |
| 272 | + ) -> list[Any]: |
| 273 | + """Wrapper for pybedtools.bedtool.BedTool.subtract |
| 274 | +
|
| 275 | + :param a_records: DB records (A features) |
| 276 | + :type a_records: Sequence (list of tuples) |
| 277 | + :param b_records: DB records (B features) |
| 278 | + :type b_records: Sequence (list of tuples) |
| 279 | + :param s: Force strandedness |
| 280 | + :type s: bool |
| 281 | + :param sorted: Invoked sweeping algorithm |
| 282 | + :type sorted: bool |
| 283 | + :param n_fields: Number of other fields attribute in addition to BED6 |
| 284 | + :type n_fields: int |
| 285 | + :returns: records |
| 286 | + :rtype: list of tuples |
| 287 | + """ |
| 288 | + |
| 289 | + a_bedtool, b_bedtool = to_bedtool(a_records), to_bedtool( |
| 290 | + utils.flatten_list(b_records) |
| 291 | + ) |
| 292 | + c_bedtool = a_bedtool.subtract(b_bedtool, s=s, sorted=sorted) |
| 293 | + # records = [(i.fields[:offset]) for i in c_bedtool] |
| 294 | + # TODO ??? offset = 6 + 3 |
| 295 | + # records = [tuple(i.fields[:offset]) for i in c_bedtool] |
| 296 | + # return records |
| 297 | + |
| 298 | + |
| 299 | +_cached_service: Optional[ComparisonService] = None |
| 300 | + |
| 301 | + |
| 302 | +def get_comparison_service() -> ComparisonService: |
| 303 | + global _cached_service |
| 304 | + if _cached_service is None: |
| 305 | + # TODO |
| 306 | + # _cached_service = ComparisonService(session=get_session()) |
| 307 | + pass |
| 308 | + return _cached_service |
0 commit comments