diff --git a/cubi_tk/snappy/itransfer_common.py b/cubi_tk/snappy/itransfer_common.py
index f224b01f..52686dfc 100644
--- a/cubi_tk/snappy/itransfer_common.py
+++ b/cubi_tk/snappy/itransfer_common.py
@@ -171,7 +171,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             "--assay", dest="assay", default=None, help="UUID of assay to download data for."
         )
         parser.add_argument(
-            "destination", help="UUID from Landing Zone or Project - where files will be moved to."
+            "destination", help="Landing zone path, or UUID of landing zone or project."
         )
 
     @classmethod
@@ -190,15 +190,21 @@ def check_args(self, args):
         toml_config = load_toml_config(args)
         if not args.sodar_url:
             if toml_config:
                 args.sodar_url = toml_config.get("global", {}).get("sodar_server_url")
-            else:
+            if not args.sodar_url:
                 logger.error("SODAR URL not found in config files. Please specify on command line.")
                 res = 1
         if not args.sodar_api_token:
             if toml_config:
                 args.sodar_api_token = toml_config.get("global", {}).get("sodar_api_token")
-            else:
+            if not args.sodar_api_token:
                 logger.error(
                     "SODAR API token not found in config files. Please specify on command line."
                 )
                 res = 1
@@ -379,11 +385,32 @@ def get_sodar_info(self):
             msg = "Not possible to continue the process without a landing zone path. Breaking..."
             logger.info(msg)
             raise UserCanceledException(msg)
+        # Check if `in_destination` is a landing zone path.
+        elif in_destination.startswith("/"):
+            # We expect to find exactly one UUID in the LZ path; this will be the project UUID.
+            # Note: it might be better to split on irods.path_sep, if that can be determined.
+            uuids = [p for p in in_destination.split("/") if is_uuid(p)]
+            if len(uuids) == 1:
+                sodar_uuid = uuids[0]
+                lz_irods_path = in_destination
+                # Get the UUID of the LZ matching lz_irods_path; this also validates
+                # that the LZ exists and that the user has access.
+                try:
+                    lz_uuid = self.get_landing_zone_uuid_by_path(
+                        lz_irods_path, sodar_uuid, self.args.assay
+                    )
+                except requests.exceptions.HTTPError as e:
+                    exception_str = str(e)
+                    logger.error(
+                        "Unable to identify UUID of given LZ %s. HTTP error %s"
+                        % (in_destination, exception_str)
+                    )
+                    raise
 
         # Not able to process - raise exception.
-        # UUID provided is not associated with project nor lz.
+        # UUID provided is not associated with a project or LZ, or no UUID could be extracted from the LZ path.
         if lz_irods_path is None:
-            msg = "Data provided by user is not a valid UUID. Please review input: {0}".format(
+            msg = "Data provided by user is not a valid UUID or LZ path. Please review input: {0}".format(
                 in_destination
             )
             logger.error(msg)
@@ -431,6 +458,50 @@ def get_landing_zone_by_uuid(self, lz_uuid):
         )
         return lz.irods_path
 
+    def get_landing_zone_uuid_by_path(self, lz_irods_path, project_uuid, assay_uuid=None):
+        """
+        :param lz_irods_path: Landing zone path.
+        :type lz_irods_path: str
+
+        :param project_uuid: Project UUID.
+        :type project_uuid: str
+
+        :param assay_uuid: Assay UUID (optional).
+        :type assay_uuid: str
+
+        :return: Returns LZ UUID.
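+        :rtype: str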
+        """
+        from sodar_cli.api import landingzone
+
+        # List existing LZs, most recently modified first
+        existing_lzs = sorted(
+            landingzone.list_(
+                sodar_url=self.args.sodar_url,
+                sodar_api_token=self.args.sodar_api_token,
+                project_uuid=project_uuid,
+            ),
+            key=lambda x: x.date_modified,
+            reverse=True,
+        )
+
+        # Filter for assay
+        if assay_uuid:
+            existing_lzs = list(filter(lambda x: x.assay == assay_uuid, existing_lzs))
+
+        matching_lzs = list(filter(lambda x: x.irods_path == lz_irods_path, existing_lzs))
+        if matching_lzs and matching_lzs[0].status in ("ACTIVE", "FAILED"):
+            lz_uuid = matching_lzs[0].sodar_uuid
+        else:
+            msg = (
+                "Could not find an active or failed LZ with the given path. Please review input: {0}".format(
+                    lz_irods_path
+                )
+            )
+            logger.error(msg)
+            raise ParameterException(msg)
+
+        return lz_uuid
+
     def create_landing_zone(self, project_uuid, assay_uuid=None):
         """
         :param project_uuid: Project UUID.
diff --git a/cubi_tk/sodar/ingest_fastq.py b/cubi_tk/sodar/ingest_fastq.py
index 91bb3bb3..2d1cc761 100644
--- a/cubi_tk/sodar/ingest_fastq.py
+++ b/cubi_tk/sodar/ingest_fastq.py
@@ -10,15 +10,15 @@
 import pathlib
 import re
 from subprocess import SubprocessError, check_output
+import sys
 import typing
 
 from logzero import logger
 from sodar_cli import api
 import tqdm
 
-#: Default value for --src-regex.
-from ..common import sizeof_fmt
-from ..exceptions import MissingFileException
+from ..common import check_irods_icommands, load_toml_config, sizeof_fmt
+from ..exceptions import MissingFileException, ParameterException, UserCanceledException
 from ..snappy.itransfer_common import (
     SnappyItransferCommandBase,
     TransferJob,
@@ -27,6 +27,7 @@
 DEFAULT_SRC_REGEX = (
     r"(.*/)?(?P<sample>.+?)"
+    r"(?:_S[0-9]+)?"
     r"(?:_(?P<lane>L[0-9]+?))?"
     r"(?:_(?P<mate>R[0-9]+?))?"
     r"(?:_(?P<batch>[0-9]+?))?"
@@ -34,7 +35,7 @@
 )
 
 #: Default value for --dest-pattern
-DEFAULT_DEST_PATTERN = r"{sample}/{date}/{filename}"
+DEFAULT_DEST_PATTERN = r"{collection_name}/raw_data/{date}/{filename}"
 
 #: Default number of parallel transfers.
 DEFAULT_NUM_TRANSFERS = 8
@@ -81,60 +82,245 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             help="Assume the answer to all prompts is 'yes'",
         )
         parser.add_argument(
-            "--base-path",
-            default=os.getcwd(),
-            required=False,
-            help="Base path of project (contains 'ngs_mapping/' etc.), defaults to current path.",
-        )
-        parser.add_argument(
-            "--remote-dir-date",
-            default=datetime.date.today().strftime("%Y-%m-%d"),
-            help="Date to use in remote directory, defaults to YYYY-MM-DD of today.",
+            "--validate-and-move",
+            default=False,
+            action="store_true",
+            help="After files are transferred to SODAR, proceed with validation and move of the landing zone.",
         )
         parser.add_argument(
             "--src-regex",
             default=DEFAULT_SRC_REGEX,
-            help=f"Regular expression to use for matching input fastq files, default: {DEFAULT_SRC_REGEX}",
+            help=f"Regular expression to use for matching input fastq files, default: {DEFAULT_SRC_REGEX}. "
+            "All capture groups can be used for --remote-dir-pattern, but only 'sample' is used by default. "
+            "Only this regex controls which files are ingested, so files other than fastq.gz can be ingested too.",
         )
         parser.add_argument(
             "--remote-dir-pattern",
             default=DEFAULT_DEST_PATTERN,
-            help=f"Pattern to use for constructing remote pattern, default: {DEFAULT_DEST_PATTERN}",
+            help=f"Pattern to use for constructing remote pattern, default: {DEFAULT_DEST_PATTERN}. "
+            "'collection_name' is the target iRODS collection and will be filled with the (-m regex modified) "
+            "'sample' capture group, unless --match-column is used to fill it from the assay table. Any capture "
+            "group of the src-regex ('sample', 'lane', ...) can be used along with 'date' and 'filename'.",
         )
         parser.add_argument(
-            "--add-suffix",
-            default="",
-            help="Suffix to add to all file names (e.g. '-N1-DNA1-WES1').",
+            "--match-column",
+            default=None,
+            help="Alternative assay column against which the {sample} from the src-regex should be matched, "
+            "in order to determine collections based on the assay table (e.g. last material or collection-column). "
+            "If not set it is assumed that {sample} matches the iRODS collections directly. If it matches "
+            "multiple columns, the last one may be used after confirmation.",
         )
         parser.add_argument(
             "-m",
-            "--remote-dir-mapping",
+            "--sample-collection-mapping",
             nargs=2,
             action="append",
             metavar=("MATCH", "REPL"),
             default=[],
             type=str,
-            help="Substitutions applied to the filled remote dir paths. "
-            "Can for example be used to modify sample names. "
+            help="Substitutions applied to the extracted sample name, "
+            "which is used to determine iRODS collections. "
+            "Can be used to change the extracted string to correct collection names "
+            "or to match the values of '--match-column'. "
             "Use pythons regex syntax of 're.sub' package. "
             "This argument can be used multiple times "
             "(i.e. '-m <match1> <repl1> -m <match2> <repl2>' ...).",
         )
+        parser.add_argument(
+            "--remote-dir-date",
+            default=datetime.date.today().strftime("%Y-%m-%d"),
+            help="Date to use in remote directory, defaults to YYYY-MM-DD of today.",
+        )
+        parser.add_argument(
+            "--collection-column",
+            default=None,
+            help="Assay column that matches iRODS collection names. "
+            "If not set, the last material column will be used. If it matches "
+            "multiple columns, the last one may be used after confirmation.",
+        )
         parser.add_argument(
             "--tmp",
             default="temp/",
             help="Folder to save files from WebDAV temporarily, if set as source.",
         )
+        parser.add_argument("--assay", dest="assay", default=None, help="UUID of assay to use.")
+
         parser.add_argument("sources", help="paths to fastq folders", nargs="+")
-        parser.add_argument("destination", help="UUID or iRods path of landing zone to move to.")
+        parser.add_argument(
+            "destination", help="Landing zone path, or UUID of landing zone or project."
+        )
+
+    def check_args(self, args):
+        """Called for checking arguments, override to change behaviour."""
+        # Check presence of icommands when not testing.
+        if "pytest" not in sys.modules:  # pragma: nocover
+            check_irods_icommands(warn_only=False)
+        res = 0
+
+        toml_config = load_toml_config(args)
+        if not args.sodar_url:
+            if toml_config:
+                args.sodar_url = toml_config.get("global", {}).get("sodar_server_url")
+            if not args.sodar_url:
+                logger.error("SODAR URL not found in config files. Please specify on command line.")
+                res = 1
+        if not args.sodar_api_token:
+            if toml_config:
+                args.sodar_api_token = toml_config.get("global", {}).get("sodar_api_token")
+            if not args.sodar_api_token:
+                logger.error(
+                    "SODAR API token not found in config files. Please specify on command line."
+                )
+                res = 1
+
+        return res
+
+    def get_project_uuid(self, lz_uuid: str):
+        """Get project UUID from landing zone UUID.
+
+        :param lz_uuid: Landing zone UUID.
+        :type lz_uuid: str
+
+        :return: Returns SODAR UUID of corresponding project.
+        """
+        from sodar_cli.api import landingzone
+
+        lz = landingzone.retrieve(
+            sodar_url=self.args.sodar_url,
+            sodar_api_token=self.args.sodar_api_token,
+            landingzone_uuid=lz_uuid,
+        )
+        return lz.project
 
     def build_base_dir_glob_pattern(self, library_name: str) -> typing.Tuple[str, str]:
         raise NotImplementedError(
             "build_base_dir_glob_pattern() not implemented in SodarIngestFastq!"
         )
 
+    def get_match_to_collection_mapping(
+        self, project_uuid: str, in_column: str, out_column: typing.Optional[str] = None
+    ) -> typing.Dict[str, str]:
+        """Return a dict that maps all values from a specific `in_column` of the assay table
+        to a corresponding `out_column` (default if not defined: last material column)."""
+
+        isa_dict = api.samplesheet.export(
+            sodar_url=self.args.sodar_url,
+            sodar_api_token=self.args.sodar_api_token,
+            project_uuid=project_uuid,
+        )
+        if len(isa_dict["assays"]) > 1:
+            if not self.args.assay:
+                msg = "Multiple assays found in investigation, please specify which one to use with --assay."
+                logger.error(msg)
+                raise ParameterException(msg)
+
+            investigation = api.samplesheet.retrieve(
+                sodar_url=self.args.sodar_url,
+                sodar_api_token=self.args.sodar_api_token,
+                project_uuid=project_uuid,
+            )
+            for study in investigation.studies.values():
+                for assay_uuid in study.assays.keys():
+                    if self.args.assay == assay_uuid:
+                        assay_file_name = study.assays[assay_uuid].file_name
+                        break
+                # The break above only exits the inner loop; use for/else to break the outer loop too.
+                else:
+                    continue
+                break
+            else:
+                msg = f"Assay with UUID {self.args.assay} not found in investigation."
+                logger.error(msg)
+                raise ParameterException(msg)
+        else:
+            assay_file_name = list(isa_dict["assays"].keys())[0]
+
+        assay_tsv = isa_dict["assays"][assay_file_name]["tsv"]
+        assay_header, *assay_lines = assay_tsv.rstrip("\n").split("\n")
+        assay_header = assay_header.split("\t")
+        assay_lines = map(lambda x: x.split("\t"), assay_lines)
+
+        def check_col_index(column_index):
+            if not column_index:
+                msg = "Could not identify any column in the assay sheet matching the provided data. Please review input: --match-column={0}".format(
+                    in_column
+                )
+                logger.error(msg)
+                raise ParameterException(msg)
+            elif len(column_index) > 1:
+                column_index = max(column_index)
+                if self.args.yes:
+                    logger.info(
+                        "Multiple columns in the assay sheet match the provided column name ({}), using the last one.".format(
+                            assay_header[column_index]
+                        )
+                    )
+                elif (
+                    input(
+                        "Multiple columns in the assay sheet match the provided column name ({}), use the last one? [yN] ".format(
+                            assay_header[column_index]
+                        )
+                    )
+                    .lower()
+                    .startswith("y")
+                ):
+                    pass
+                else:
+                    msg = "Not possible to continue the process without a defined match-column. Breaking..."
+ logger.info(msg) + raise UserCanceledException(msg) + else: + column_index = column_index[0] + return column_index + + # Never match these (hidden) assay columns + ignore_cols = ( + "Performer", + "Date", + "Protocol REF", + "Unit", + "Term Source REF", + "Term Accession Number", + ) + + in_column_index = [ + i + for i, head in enumerate(assay_header) + if head not in ignore_cols + and in_column.lower() + in re.sub("(Parameter Value|Comment|Characteristics)", "", head).lower() + ] + in_column_index = check_col_index(in_column_index) + + if out_column is None: + # Get index of last material column that is not 'Raw Data File' + materials = ( + "Extract Name", + "Labeled Extract Name", + "Library Name", + "Sample Name", + "Source Name", + ) + out_column_index = max([i for i, head in enumerate(assay_header) if head in materials]) + else: + out_column_index = [ + i + for i, head in enumerate(assay_header) + if head not in ignore_cols + and out_column.lower() + in re.sub("(Parameter Value|Comment|Characteristics)", "", head).lower() + ] + out_column_index = check_col_index(out_column_index) + + return {line[in_column_index]: line[out_column_index] for line in assay_lines} + def download_webdav(self, sources): download_jobs = [] folders = [] @@ -148,60 +334,52 @@ def download_webdav(self, sources): else: folders.append(src) - logger.info("Planning to download folders...") - for job in download_jobs: - logger.info(" %s => %s", job.path_src, job.path_dest) - if not self.args.yes and not input("Is this OK? [yN] ").lower().startswith("y"): - logger.error("OK, breaking at your request") - return [] - - counter = Value(c_ulonglong, 0) - total_bytes = sum([job.bytes for job in download_jobs]) - with tqdm.tqdm(total=total_bytes) as t: - if self.args.num_parallel_transfers == 0: # pragma: nocover - for job in download_jobs: - download_folder(job, counter, t) - else: - pool = ThreadPool(processes=self.args.num_parallel_transfers) - for job in download_jobs: - pool.apply_async(download_folder, args=(job, counter, t)) - pool.close() - pool.join() + if download_jobs: + logger.info("Planning to download folders...") + for job in download_jobs: + logger.info(" %s => %s", job.path_src, job.path_dest) + if not self.args.yes and not input("Is this OK? 
[yN] ").lower().startswith("y"):
+                logger.error("OK, breaking at your request")
+                return []
+
+            counter = Value(c_ulonglong, 0)
+            total_bytes = sum([job.bytes for job in download_jobs])
+            with tqdm.tqdm(total=total_bytes) as t:
+                if self.args.num_parallel_transfers == 0:  # pragma: nocover
+                    for job in download_jobs:
+                        download_folder(job, counter, t)
+                else:
+                    pool = ThreadPool(processes=self.args.num_parallel_transfers)
+                    for job in download_jobs:
+                        pool.apply_async(download_folder, args=(job, counter, t))
+                    pool.close()
+                    pool.join()
 
         return folders
 
-    def build_jobs(self, library_names=None) -> typing.Tuple[TransferJob, ...]:
+    def build_jobs(self, library_names=None) -> typing.Tuple[str, typing.Tuple[TransferJob, ...]]:
         """Build file transfer jobs."""
         if library_names:
             logger.warning(
-                "will ignore parameter 'library_names' = %s in build_jobs()", str(library_names)
+                "will ignore parameter 'library_names' = %s in ingest_fastq.build_jobs()",
+                str(library_names),
             )
 
-        if "/" in self.args.destination:
-            lz_irods_path = self.args.destination
+        lz_uuid, lz_irods_path = self.get_sodar_info()
+        project_uuid = self.get_project_uuid(lz_uuid)
+        if self.args.match_column is not None:
+            column_match = self.get_match_to_collection_mapping(
+                project_uuid, self.args.match_column, self.args.collection_column
+            )
         else:
-            lz_irods_path = api.landingzone.retrieve(
-                sodar_url=self.args.sodar_url,
-                sodar_api_token=self.args.sodar_api_token,
-                landingzone_uuid=self.args.destination,
-            ).irods_path
-        logger.info("Target iRods path: %s", lz_irods_path)
-
-        transfer_jobs = []
+            column_match = None
 
         folders = self.download_webdav(self.args.sources)
+        transfer_jobs = []
 
         for folder in folders:
-            logger.info("Searching for fastq files in folder: %s", folder)
-
-            # assuming folder is local directory
-            if not pathlib.Path(folder).is_dir():
-                logger.error("Problem when processing input paths")
-                raise MissingFileException("Missing folder %s" % folder)
-
             for path in glob.iglob(f"{folder}/**/*", recursive=True):
                 real_path = os.path.realpath(path)
-
                 if not os.path.isfile(real_path):
                     continue  # skip if did not resolve to file
                 if real_path.endswith(".md5"):
@@ -224,14 +402,37 @@ def build_jobs(self, library_names=None) -> typing.Tuple[TransferJob, ...]:
                     for item in m.groupdict(default="").items()
                     if item[0] in self.dest_pattern_fields
                 )
-                remote_file = pathlib.Path(lz_irods_path) / self.args.remote_dir_pattern.format(
-                    filename=pathlib.Path(path).name + self.args.add_suffix,
-                    date=self.args.remote_dir_date,
-                    **match_wildcards,
-                )
-                remote_file = str(remote_file)
-                for m_pat, r_pat in self.args.remote_dir_mapping:
-                    remote_file = re.sub(m_pat, r_pat, remote_file)
+
+                # The `-m` regex substitutions are now applied only to the extracted sample name.
+                sample_name = m.groupdict(default="")["sample"]
+                for m_pat, r_pat in self.args.sample_collection_mapping:
+                    sample_name = re.sub(m_pat, r_pat, sample_name)
+
+                try:
+                    remote_file = pathlib.Path(
+                        lz_irods_path
+                    ) / self.args.remote_dir_pattern.format(
+                        # The old `+ self.args.add_suffix` was removed here, since adding
+                        # anything after the file extension is a bad idea.
+                        filename=pathlib.Path(path).name,
+                        date=self.args.remote_dir_date,
+                        collection_name=column_match[sample_name]
+                        if column_match
+                        else sample_name,
+                        **match_wildcards,
+                    )
+                except KeyError:
+                    msg = (
+                        f"Could not match extracted sample value '{sample_name}' to any value in the "
+                        f"--match-column {self.args.match_column}. Please review the assay table, src-regex and sample-collection-mapping args."
+                    )
+                    logger.error(msg)
+                    raise ParameterException(msg)
+
+                # This was the original code, but there is no need to change the remote file names
+                # once they are mapped to the correct collections:
+                # remote_file = str(remote_file)
+                # for m_pat, r_pat in self.args.remote_dir_mapping:
+                #     remote_file = re.sub(m_pat, r_pat, remote_file)
 
                 for ext in ("", ".md5"):
                     try:
@@ -240,10 +441,13 @@ def build_jobs(self, library_names=None) -> typing.Tuple[TransferJob, ...]:
                         size = 0
                     transfer_jobs.append(
                         TransferJob(
-                            path_src=real_path + ext, path_dest=remote_file + ext, bytes=size
+                            path_src=real_path + ext,
+                            path_dest=str(remote_file) + ext,
+                            bytes=size,
                        )
                    )
-        return tuple(sorted(transfer_jobs))
+
+        return lz_uuid, tuple(sorted(transfer_jobs))
 
     def execute(self) -> typing.Optional[int]:
         """Execute the transfer."""
@@ -254,36 +458,47 @@ def execute(self) -> typing.Optional[int]:
         logger.info("Starting cubi-tk sodar %s", self.command_name)
         logger.info("  args: %s", self.args)
 
-        jobs = self.build_jobs()
-        logger.debug("Transfer jobs:\n%s", "\n".join(map(lambda x: x.to_oneline(), jobs)))
+        lz_uuid, transfer_jobs = self.build_jobs()
+        logger.debug("Transfer jobs:\n%s", "\n".join(map(lambda x: x.to_oneline(), transfer_jobs)))
 
         if self.fix_md5_files:
-            jobs = self._execute_md5_files_fix(jobs)
+            transfer_jobs = self._execute_md5_files_fix(transfer_jobs)
 
         logger.info("Planning to transfer the files as follows...")
-        for job in jobs:
+        for job in transfer_jobs:
             logger.info("  %s => %s", job.path_src, job.path_dest)
         if not self.args.yes and not input("Is this OK? [yN] ").lower().startswith("y"):
             logger.error("OK, breaking at your request")
             return 1
 
-        total_bytes = sum([job.bytes for job in jobs])
+        total_bytes = sum([job.bytes for job in transfer_jobs])
         logger.info(
-            "Transferring %d files with a total size of %s", len(jobs), sizeof_fmt(total_bytes)
+            "Transferring %d files with a total size of %s",
+            len(transfer_jobs),
+            sizeof_fmt(total_bytes),
         )
 
         counter = Value(c_ulonglong, 0)
         with tqdm.tqdm(total=total_bytes, unit="B", unit_scale=True) as t:
             if self.args.num_parallel_transfers == 0:  # pragma: nocover
-                for job in jobs:
+                for job in transfer_jobs:
                     irsync_transfer(job, counter, t)
             else:
                 pool = ThreadPool(processes=self.args.num_parallel_transfers)
-                for job in jobs:
+                for job in transfer_jobs:
                     pool.apply_async(irsync_transfer, args=(job, counter, t))
                 pool.close()
                 pool.join()
 
+        # Validate and move transferred files.
+        # Behaviour: if the flag is set and the LZ UUID is known (it can be unset on some
+        # code paths), ask SODAR to validate and move the transferred files.
+        if lz_uuid and self.args.validate_and_move:
+            self.move_landing_zone(lz_uuid=lz_uuid)
+        else:
+            logger.info("Transferred files will \033[1mnot\033[0m be automatically moved in SODAR.")
+
         logger.info("All done")
         return None
diff --git a/docs_manual/man_ingest_fastq.rst b/docs_manual/man_ingest_fastq.rst
index d61d197a..b64ce4d3 100644
--- a/docs_manual/man_ingest_fastq.rst
+++ b/docs_manual/man_ingest_fastq.rst
@@ -13,7 +13,8 @@ The basic usage is:
 
 .. code-block:: bash
 
     $ cubi-tk sodar ingest-fastq SOURCE [SOURCE ...] DESTINATION
 
-where each ``SOURCE`` is a path to a folder containing relevant files and ``DESTINATION`` is either an iRODS path to a *landing zone* in SODAR or the UUID of that *landing zone*.
+where each ``SOURCE`` is a path to a folder containing relevant files (this can also be a WebDAV URL, see below) and
+``DESTINATION`` is either an iRODS path to a *landing zone* in SODAR, the UUID of that *landing zone*, or a SODAR project UUID.
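+
+For example, with a purely illustrative landing zone UUID and iRODS path (yours will differ), any of the
+following destinations could be used:
+
+.. code-block:: bash
+
+    # UUID of an existing landing zone
+    $ cubi-tk sodar ingest-fastq ./fastq_folder/ 99999999-aaaa-bbbb-cccc-999999999999
+    # iRODS path of an existing landing zone (the UUID inside it is the project UUID)
+    $ cubi-tk sodar ingest-fastq ./fastq_folder/ /sodarZone/projects/46/466ab946-ce6a-4c78-9981-19b79e7bbe86/landing_zones/user/assay_1/20230101_123456
+    # UUID of the project (an existing landing zone is then looked up, or a new one can be created)
+    $ cubi-tk sodar ingest-fastq ./fastq_folder/ 466ab946-ce6a-4c78-9981-19b79e7bbe86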
 
 ----------------
 Other file types
 ----------------
@@ -37,20 +38,25 @@ The default ``--remote-dir-pattern`` is
 
 .. code-block:: bash
 
-    {sample}/{date}/{filename}
+    {collection_name}/raw_data/{date}/{filename}
 
-It contains the wildcard ``{sample}``, which will be filled with the captured content of group ``(?P<sample>...)``.
-In addition, the wildcards ``{date}`` and ``{filename}`` can always be used and will be filled with the current date and full filename (the basename of a matched file), respectively.
+It contains the wildcard ``{collection_name}``, which represents an iRODS collection and will be filled with the captured
+content of group ``(?P<sample>...)``, potentially modified by a regex (see 'Mapping of file names' below).
+Alternatively, the iRODS collection name can be derived by mapping the extracted (and modified) ``(?P<sample>...)``
+group to any column of the assay table associated with the LZ or project. In this case ``{collection_name}`` will be
+filled with the content of the last material column of the assay table (or ``--collection-column`` if defined).
+In addition, the wildcards ``{date}`` and ``{filename}`` can always be used in ``--remote-dir-pattern`` and will be
+filled with the current date (or ``--remote-dir-date``) and the full filename (the basename of a matched file), respectively.
 
 ---------------------
 Mapping of file names
 ---------------------
 
-In some cases additional mapping of filenames is required (for example the samples should be renamed).
-This can be done via the parameter ``--remote-dir-mapping`` or short ``-m``.
+In some cases additional mapping of the extracted sample names is required (for example to fully match the iRODS collection names).
+This can be done via the parameter ``--sample-collection-mapping`` or short ``-m``.
 It can be supplied several times, each time for another mapping.
 With each ``-m MATCH REPL`` a pair of a regular expression and a replacement string are specified.
-Internally, pythons ``re.sub`` command is executed on the ``--remote-dir-pattern`` after wildcards have been filled.
+Internally, Python's ``re.sub`` function is executed on the extracted ``(?P<sample>...)`` capture group.
 Therefore, you can refer to the documentation of the `re package <https://docs.python.org/3/library/re.html>`_ for syntax questions.
 
 ----------------------
@@ -70,8 +76,7 @@ To use this command, which internally executes iRODS icommands, you need to authenticate yourself with:
 
 .. code-block:: bash
 
     $ iinit
 
-To be able to access the SODAR API (which is only required, if you specify a landing zone UUID instead of an iRODS path), you also need an API token.
-For token management for SODAR, the following docs can be used:
+To be able to access the SODAR API you also need an API token.
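+
+The SODAR server URL and API token can also be stored in a cubi-tk configuration file instead of being passed
+on the command line with ``--sodar-url`` and ``--sodar-api-token``. A minimal sketch, assuming the default
+``~/.cubitkrc.toml`` location is used:
+
+.. code-block:: toml
+
+    [global]
+
+    sodar_server_url = "https://sodar.bihealth.org/"
+    sodar_api_token = "<your API token here>"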
+For token management in SODAR, the following documentation can be used:
 
 - https://sodar.bihealth.org/manual/ui_user_menu.html
 - https://sodar.bihealth.org/manual/ui_api_tokens.html
diff --git a/tests/conftest.py b/tests/conftest.py
index 307bee45..487ea4b5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -74,3 +74,26 @@ def my_exists(self):
 def my_get_sodar_info(_self):
     """Method is used to patch cubi_tk.snappy.itransfer_common.SnappyItransferCommandBase.get_sodar_info"""
     return "466ab946-ce6a-4c78-9981-19b79e7bbe86", "/irods/dest"
+
+
+def my_sodar_api_export(n_assays=1):
+    """Return contents for api.samplesheet.export"""
+    assay = textwrap.dedent(
+        """
+        Sample Name\tProtocol REF\tParameter Value[Concentration measurement]\tPerformer\tDate\tExtract Name\tCharacteristics[Concentration]\tUnit\tTerm Source REF\tTerm Accession Number\tProtocol REF\tParameter Value[Provider name]\tParameter Value[Provider contact]\tParameter Value[Provider project ID]\tParameter Value[Provider sample ID]\tParameter Value[Provider QC status]\tParameter Value[Requestor contact]\tParameter Value[Requestor project]\tParameter Value[Requestor sample ID]\tParameter Value[Concentration measurement]\tParameter Value[Library source]\tParameter Value[Library strategy]\tParameter Value[Library selection]\tParameter Value[Library layout]\tParameter Value[Library kit]\tComment[Library kit catalogue ID]\tParameter Value[Target insert size]\tParameter Value[Wet-lab insert size]\tParameter Value[Barcode kit]\tParameter Value[Barcode kit catalogue ID]\tParameter Value[Barcode name]\tParameter Value[Barcode sequence]\tPerformer\tDate\tLibrary Name\tCharacteristics[Folder name]\tCharacteristics[Concentration]\tUnit\tTerm Source REF\tTerm Accession Number\tProtocol REF\tParameter Value[Platform]\tParameter Value[Instrument model]\tParameter Value[Base quality encoding]\tParameter Value[Center name]\tParameter Value[Center contact]\tPerformer\tDate\tRaw Data File
+        Sample1-N1\tNucleic acid extraction WES\t\t\t\tSample1-N1-DNA1\t\t\t\t\tLibrary construction WES\t\t\t\t\t\t\t\t\t\tGENOMIC\tWXS\tHybrid Selection\tPAIRED\tAgilent SureSelect Human All Exon V7\t\t\t\t\t\t\t\t\t\tSample1-N1-DNA1-WES1\tFolder1\t\t\t\t\tNucleic acid sequencing WES\tILLUMINA\tIllumina NovaSeq 6000\tPhred+33
+        Sample2-N1\tNucleic acid extraction WES\t\t\t\tSample2-N1-DNA1\t\t\t\t\tLibrary construction WES\t\t\t\t\t\t\t\t\t\tGENOMIC\tWXS\tHybrid Selection\tPAIRED\tAgilent SureSelect Human All Exon V7\t\t\t\t\t\t\t\t\t\tSample2-N1-DNA1-WES1\tFolder2\t\t\t\t\tNucleic acid sequencing WES\tILLUMINA\tIllumina NovaSeq 6000\tPhred+33
+        Sample3-N1\tNucleic acid extraction WES\t\t\t\tSample3-N1-DNA1\t\t\t\t\tLibrary construction WES\t\t\t\t\t\t\t\t\t\tGENOMIC\tWXS\tHybrid Selection\tPAIRED\tAgilent SureSelect Human All Exon V7\t\t\t\t\t\t\t\t\t\tSample3-N1-DNA1-WES1\tFolder3\t\t\t\t\tNucleic acid sequencing WES\tILLUMINA\tIllumina NovaSeq 6000\tPhred+33
+        """
+    ).lstrip()
+
+    isa_dict = {
+        "investigation": {"path": "i_Investigation.txt", "tsv": None},
+        "studies": {"s_Study_0.txt": {"tsv": None}},
+        "assays": {"a_name_0": {"tsv": assay}},
+    }
+    if n_assays > 1:
+        for i in range(1, n_assays):
+            isa_dict["assays"]["a_name_%d" % i] = {"tsv": assay}
+
+    return isa_dict
diff --git a/tests/test_sodar_ingest_fastq.py b/tests/test_sodar_ingest_fastq.py
index 6a6e7a18..608d1e8c 100644
--- a/tests/test_sodar_ingest_fastq.py
+++ b/tests/test_sodar_ingest_fastq.py
@@ -5,12 +5,20 @@
 
 import json
 import os
+import re
 from unittest import mock
+from unittest.mock import patch
 
 from pyfakefs import fake_filesystem, fake_pathlib
 import pytest
 
 from cubi_tk.__main__ import main, setup_argparse
+from cubi_tk.exceptions import ParameterException
+from cubi_tk.sodar.ingest_fastq import SodarIngestFastq
+
+from .conftest import my_get_sodar_info, my_sodar_api_export
+from .factories import InvestigationFactory
 
 
 def test_run_sodar_ingest_fastq_help(capsys):
@@ -38,10 +46,88 @@ def test_run_sodar_ingest_fastq_nothing(capsys):
     assert res.err
 
 
+def test_run_sodar_ingest_fastq_src_regex():
+    from cubi_tk.sodar.ingest_fastq import DEFAULT_SRC_REGEX
+
+    # Collection of example filenames and the expected {sample} value the regex should capture
+    test_filenames = {
+        "Sample1-N1-RNA1-RNA_seq1.fastq.gz": "Sample1-N1-RNA1-RNA_seq1",
+        "P1234_Samplename_S14_L006_R2_001.fastq.gz": "P1234_Samplename",
+        "P1234_Samplename2_R1.fastq.gz": "P1234_Samplename2",
+    }
+
+    for test_filename, expected_sample in test_filenames.items():
+        res = re.match(DEFAULT_SRC_REGEX, test_filename)
+        assert res is not None
+        assert res.groupdict()["sample"] == expected_sample
+
+
+@patch("cubi_tk.sodar.ingest_fastq.api.samplesheet.retrieve")
+@patch("cubi_tk.sodar.ingest_fastq.api.samplesheet.export")
+def test_run_sodar_ingest_fastq_get_match_to_collection_mapping(mock_api_export, mock_api_retrieve):
+    # Patched SODAR API call
+    mock_api_export.return_value = my_sodar_api_export()
+
+    # Instantiate SodarIngestFastq (the constructor requires parsed args)
+    landing_zone_uuid = "466ab946-ce6a-4c78-9981-19b79e7bbe86"
+    project_uuid = "466ab946-ce6a-4c78-9981-19b79e7bbe86"
+    fake_base_path = "/base/path"
+    argv = [
+        "--verbose",
+        "sodar",
+        "ingest-fastq",
+        "--num-parallel-transfers",
+        "0",
+        "--sodar-api-token",
+        "XXXX",
+        "--yes",
+        fake_base_path,
+        landing_zone_uuid,
+    ]
+
+    parser, _subparsers = setup_argparse()
+    args = parser.parse_args(argv)
+    ingestfastq = SodarIngestFastq(args)
+
+    # Test that the expected mapping dict is returned
+    expected = {
+        "Folder1": "Sample1-N1-DNA1-WES1",
+        "Folder2": "Sample2-N1-DNA1-WES1",
+        "Folder3": "Sample3-N1-DNA1-WES1",
+    }
+
+    assert expected == ingestfastq.get_match_to_collection_mapping(project_uuid, "Folder name")
+    assert expected == ingestfastq.get_match_to_collection_mapping(
+        project_uuid, "Folder name", "Library Name"
+    )
+
+    # Test for alternative collection column
+    expected2 = {
+        "Folder1": "Sample1-N1-DNA1",
+        "Folder2": "Sample2-N1-DNA1",
+        "Folder3": "Sample3-N1-DNA1",
+    }
+    assert expected2 == ingestfastq.get_match_to_collection_mapping(
+        project_uuid, "Folder name", "Extract Name"
+    )
+
+    # Test for missing column
+    with pytest.raises(ParameterException):
+        ingestfastq.get_match_to_collection_mapping(project_uuid, "Typo-Column")
+
+    # Test with additional assay
+    mock_api_export.return_value = my_sodar_api_export(2)
+    mock_api_retrieve.return_value = InvestigationFactory()
+    assay_uuid = list(mock_api_retrieve.return_value.studies["s_Study_0"].assays.keys())[0]
+    ingestfastq.args.assay = assay_uuid
+
+    assert expected == ingestfastq.get_match_to_collection_mapping(project_uuid, "Folder name")
+
+
 def test_run_sodar_ingest_fastq_smoke_test(mocker, requests_mock):
     # --- setup arguments
     irods_path = "/irods/dest"
-    landing_zone_uuid = "landing_zone_uuid"
+    landing_zone_uuid = "466ab946-ce6a-4c78-9981-19b79e7bbe86"
     dest_path = "target/folder/generic_file.fq.gz"
     fake_base_path = "/base/path"
     argv = [
@@ -86,9 +172,11 @@ def test_run_sodar_ingest_fastq_smoke_test(mocker, requests_mock):
 
     # --- mock modules
     mocker.patch("glob.os", fake_os)
- mocker.patch("cubi_tk.sea_snap.itransfer_results.pathlib", fake_pl) - mocker.patch("cubi_tk.sea_snap.itransfer_results.os", fake_os) mocker.patch("cubi_tk.snappy.itransfer_common.os", fake_os) + mocker.patch( + "cubi_tk.snappy.itransfer_common.SnappyItransferCommandBase.get_sodar_info", + my_get_sodar_info, + ) mock_check_output = mock.MagicMock(return_value=0) mocker.patch("cubi_tk.snappy.itransfer_common.check_output", mock_check_output)