ENH: add kmer_query_reads_card and kmer_query_mags_card (#52)

Co-authored-by: Michal Ziemski <[email protected]>
bokulich-lab · Jul 3, 2024 · b327b22 · b327b22
1 parent 8a8b2d7
commit b327b22
Show file tree

Hide file tree

Showing 6 changed files with 839 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -61,12 +61,14 @@ sequencing reads and MAGs with antimicrobial resistance genes. Currently, the [C
 the implementation and usage, please refer to the [rgi](https://github.com/arpcard/rgi) documentation). Below you will
 find an overview of actions available in the plugin.
 
-| Action                     | Description                                                                          | Underlying tool                       | Used function                        |
-|----------------------------|--------------------------------------------------------------------------------------|---------------------------------------|--------------------------------------|
-| fetch-card-db              | Download and preprocess CARD and WildCARD data.                                      | [rgi](https://github.com/arpcard/rgi) | card_annotation, wildcard_annotation |
-| annotate-mags-card         | Annotate MAGs with antimicrobial resistance gene information from CARD.              | [rgi](https://github.com/arpcard/rgi) | main, load                           |
-| annotate-reads-card        | Annotate metagenomic reads with antimicrobial resistance gene information from CARD. | [rgi](https://github.com/arpcard/rgi) | bwt, load                            |
-| heatmap                    | Create a heatmap from annotate-mags-card output files.                               | [rgi](https://github.com/arpcard/rgi) | heatmap                              |
+| Action                | Description                                                                          | Underlying tool                       | Used function                        |
+|-----------------------|--------------------------------------------------------------------------------------|---------------------------------------|--------------------------------------|
+| fetch-card-db         | Download and preprocess CARD and WildCARD data.                                      | [rgi](https://github.com/arpcard/rgi) | card_annotation, wildcard_annotation |
+| annotate-mags-card    | Annotate MAGs with antimicrobial resistance gene information from CARD.              | [rgi](https://github.com/arpcard/rgi) | main, load                           |
+| annotate-reads-card   | Annotate metagenomic reads with antimicrobial resistance gene information from CARD. | [rgi](https://github.com/arpcard/rgi) | bwt, load                            |
+| heatmap               | Create a heatmap from annotate-mags-card output files.                               | [rgi](https://github.com/arpcard/rgi) | heatmap                              |
+| kmer-query-mags-card  | Pathogen-of-origin prediction for ARGs in MAGs.                                      | [rgi](https://github.com/arpcard/rgi) | kmer-query, load                     |
+| kmer-query-reads-card | Pathogen-of-origin prediction for ARGs in reads.                                     | [rgi](https://github.com/arpcard/rgi) | kmer-query, load                     |
 
 ## Dev environment
 This repository follows the _black_ code style. To make the development slightly easier

diff --git a/q2_amr/card/kmer.py b/q2_amr/card/kmer.py
@@ -0,0 +1,220 @@
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+import warnings
+from pathlib import Path
+
+from q2_amr.card.utils import load_card_db, run_command
+from q2_amr.types import (
+    CARDAlleleAnnotationDirectoryFormat,
+    CARDAnnotationDirectoryFormat,
+    CARDDatabaseDirectoryFormat,
+    CARDKmerDatabaseDirectoryFormat,
+    CARDMAGsKmerAnalysisDirectoryFormat,
+    CARDReadsAlleleKmerAnalysisDirectoryFormat,
+    CARDReadsGeneKmerAnalysisDirectoryFormat,
+)
+
+
+def kmer_query_mags_card(
+    ctx, amr_annotations, kmer_db, card_db, minimum=10, threads=1, num_partitions=None
+):
+    # Define all actions used by the pipeline
+    partition_method = ctx.get_action("amr", "partition_mags_annotations")
+    collate_method = ctx.get_action("amr", "collate_mags_kmer_analyses")
+    kmer_query = ctx.get_action("amr", "_kmer_query_mags")
+
+    # Partition the annotations
+    (partitioned_annotations,) = partition_method(amr_annotations, num_partitions)
+
+    kmer_analyses = []
+
+    # Run _kmer_query_mags for every partition
+    for partition in partitioned_annotations.values():
+        (kmer_analysis,) = kmer_query(
+            card_db=card_db,
+            kmer_db=kmer_db,
+            amr_annotations=partition,
+            minimum=minimum,
+            threads=threads,
+        )
+
+        # Append resulting kmer analysis artifacts to lists
+        kmer_analyses.append(kmer_analysis)
+
+    # Collate kmer analysis artifacts
+    (collated_kmer_analyses,) = collate_method(kmer_analyses)
+
+    return collated_kmer_analyses
+
+
+def kmer_query_reads_card(
+    ctx, amr_annotations, card_db, kmer_db, minimum=10, threads=1, num_partitions=None
+):
+    # Define all actions used by the pipeline
+    partition_method = ctx.get_action("amr", "partition_reads_allele_annotations")
+    collate_method_allele = ctx.get_action("amr", "collate_reads_allele_kmer_analyses")
+    collate_method_gene = ctx.get_action("amr", "collate_reads_gene_kmer_analyses")
+    kmer_query = ctx.get_action("amr", "_kmer_query_reads")
+
+    # Partition the annotations
+    (partitioned_annotations,) = partition_method(amr_annotations, num_partitions)
+
+    kmer_analyses_allele = []
+    kmer_analyses_gene = []
+
+    # Run _kmer_query_reads for every partition
+    for part in partitioned_annotations.values():
+        (kmer_analysis_allele, kmer_analysis_gene) = kmer_query(
+            card_db=card_db,
+            kmer_db=kmer_db,
+            amr_annotations=part,
+            minimum=minimum,
+            threads=threads,
+        )
+        # Append resulting kmer analysis artifacts to lists
+        kmer_analyses_allele.append(kmer_analysis_allele)
+        kmer_analyses_gene.append(kmer_analysis_gene)
+
+    # Collate kmer analysis artifacts
+    (collated_kmer_analysis_allele,) = collate_method_allele(kmer_analyses_allele)
+    (collated_kmer_analysis_gene,) = collate_method_gene(kmer_analyses_gene)
+
+    return collated_kmer_analysis_allele, collated_kmer_analysis_gene
+
+
+def _kmer_query_mags(
+    card_db: CARDDatabaseDirectoryFormat,
+    kmer_db: CARDKmerDatabaseDirectoryFormat,
+    amr_annotations: CARDAnnotationDirectoryFormat,
+    minimum: int = 10,
+    threads: int = 1,
+) -> CARDMAGsKmerAnalysisDirectoryFormat:
+    return _kmer_query_helper(card_db, kmer_db, amr_annotations, minimum, threads)
+
+
+def _kmer_query_reads(
+    card_db: CARDDatabaseDirectoryFormat,
+    kmer_db: CARDKmerDatabaseDirectoryFormat,
+    amr_annotations: CARDAlleleAnnotationDirectoryFormat,
+    minimum: int = 10,
+    threads: int = 1,
+) -> (
+    CARDReadsAlleleKmerAnalysisDirectoryFormat,
+    CARDReadsGeneKmerAnalysisDirectoryFormat,
+):
+    return _kmer_query_helper(card_db, kmer_db, amr_annotations, minimum, threads)
+
+
+def _kmer_query_helper(card_db, kmer_db, amr_annotations, minimum, threads):
+    if type(amr_annotations) is CARDAlleleAnnotationDirectoryFormat:
+        annotation_file = "sorted.length_100.bam"
+        input_type = "bwt"
+
+        kmer_analysis = (
+            CARDReadsAlleleKmerAnalysisDirectoryFormat(),
+            CARDReadsGeneKmerAnalysisDirectoryFormat(),
+        )
+
+    else:
+        annotation_file = "amr_annotation.json"
+        input_type = "rgi"
+
+        kmer_analysis = CARDMAGsKmerAnalysisDirectoryFormat()
+
+    with tempfile.TemporaryDirectory() as tmp:
+        # Load all necessary database files and retrieve Kmer size
+        kmer_size = load_card_db(tmp=tmp, card_db=card_db, kmer_db=kmer_db, kmer=True)
+
+        # Run once per annotation file
+        for root, dirs, files in os.walk(str(amr_annotations)):
+            if annotation_file in files:
+                input_path = os.path.join(root, annotation_file)
+
+                # Run kmer_query
+                _run_rgi_kmer_query(
+                    tmp=tmp,
+                    input_file=input_path,
+                    input_type=input_type,
+                    kmer_size=kmer_size,
+                    minimum=minimum,
+                    threads=threads,
+                )
+
+                # Define path to JSON kmer analysis file and split it into components
+                path_json = os.path.join(tmp, f"output_{kmer_size}mer_analysis.json")
+                path_split = input_path.split(os.path.sep)
+
+                # Load dictionary from result JSON file to check if its empty and print
+                # warning
+                with open(path_json) as json_file:
+                    json_dict = json.load(json_file)
+
+                if not json_dict:
+                    warnings.warn(
+                        f"No taxonomic prediction could be made for "
+                        f"{path_split[-2]}.",
+                        UserWarning,
+                    )
+
+                # Define filenames and paths to destination directories for reads or
+                # MAGs analysis
+                if type(amr_annotations) is CARDAlleleAnnotationDirectoryFormat:
+                    files = (
+                        f"output_{kmer_size}mer_analysis.allele.txt",
+                        f"output_{kmer_size}mer_analysis.json",
+                        f"output_{kmer_size}mer_analysis.gene.txt",
+                    )
+                    des_dirs = [
+                        kmer_analysis[0].path / path_split[-2],
+                        kmer_analysis[0].path / path_split[-2],
+                        kmer_analysis[1].path / path_split[-2],
+                    ]
+
+                else:
+                    files = (
+                        f"output_{kmer_size}mer_analysis_rgi_summary.txt",
+                        f"output_{kmer_size}mer_analysis.json",
+                    )
+                    des_dirs = [
+                        kmer_analysis.path / path_split[-3] / path_split[-2],
+                        kmer_analysis.path / path_split[-3] / path_split[-2],
+                    ]
+
+                # Copy Kmer analysis files into the destination directories and remove
+                # "output_" prefix from filenames
+                for file, des_dir in zip(files, des_dirs):
+                    os.makedirs(des_dir, exist_ok=True)
+                    shutil.move(Path(tmp) / file, des_dir / file[7:])
+
+    return kmer_analysis
+
+
+def _run_rgi_kmer_query(tmp, input_file, input_type, kmer_size, minimum, threads):
+    cmd = [
+        "rgi",
+        "kmer_query",
+        "--input",
+        input_file,
+        f"--{input_type}",
+        "--kmer_size",
+        kmer_size,
+        "--minimum",
+        str(minimum),
+        "--threads",
+        str(threads),
+        "--output",
+        "output",
+        "--local",
+    ]
+
+    try:
+        run_command(cmd, tmp, verbose=True)
+    except subprocess.CalledProcessError as e:
+        raise Exception(
+            "An error was encountered while running rgi, "
+            f"(return code {e.returncode}), please inspect "
+            "stdout and stderr to learn more."
+        )