Skip to content

Commit

Permalink
Merge pull request #110 from Pathogen-Genomics-Cymru/not_top_hit
Browse files Browse the repository at this point in the history
Not top hit
  • Loading branch information
WhalleyT authored Oct 23, 2024
2 parents f9640f4 + 291de0a commit 1ef8bcc
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 22 deletions.
1 change: 0 additions & 1 deletion .github/workflows/build-push-quay.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ on:
push:
branches:
- main
- bcg
paths:
- '**/Dockerfile*'
- "bin/"
Expand Down
31 changes: 25 additions & 6 deletions bin/parse_kraken_report2.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def read_kraken_report(input, pct_threshold, num_threshold):
return S, G, G1, F, non_human_species_detected

# define output function
def parse_kraken_report(S, G, G1, F, non_human_species_detected, pct_threshold, num_threshold):
def parse_kraken_report(S, G, G1, F, non_human_species_detected, pct_threshold, num_threshold, permissive):
# arguments are the output from read_kraken_report function
# define warnings lists
warnings = []
Expand Down Expand Up @@ -155,7 +155,8 @@ def parse_kraken_report(S, G, G1, F, non_human_species_detected, pct_threshold,
# IF THE TOP FAMILY IS MYCOBACTERIACEAE (WHICH CAN ONLY BE THE CASE IF MINIMUM COVERAGE THRESHOLDS ARE MET), WE WILL ALSO REPORT THE KRAKEN 'G1' CLASSIFICATIONS. THESE MAY INDICATE WHETHER THIS IS A MIXED MYCOBACTERIAL SAMPLE.
if top_family == "Mycobacteriaceae":
if no_of_reads_assigned_to_top_family < 100000:
if "Errors" not in out: out['Errors'] = []
if "Errors" not in out:
out['Errors'] = []
out['Errors'].append("error: there are < 100k reads classified as Mycobacteriaceae")
out['afanc'] = 'false'
else:
Expand All @@ -179,9 +180,14 @@ def parse_kraken_report(S, G, G1, F, non_human_species_detected, pct_threshold,
if len(sorted_G1) > 1:
warnings.append("warning: sample contains multiple mycobacterial species complexes (for superior classification of mixed mycobacteria, defer to afanc report)")
else:
if "Errors" not in out: out['Errors'] = []
out['Errors'].append("error: top family is not Mycobacteriaceae")
out['afanc'] = 'false'
if "Errors" not in out:
out['Errors'] = []
if permissive:
warnings.append("Warning: Mycobacteriaceae is not the top family, but permissive flag has been invoked")
out['afanc'] = 'true'
else:
out['Errors'].append("error: top family is not Mycobacteriaceae")
out['afanc'] = 'false'

if len(warnings) == 0:
warnings.append('')
Expand All @@ -196,6 +202,7 @@ def process_requirements(args):
out_file = args[2]
pct_threshold = float(args[3])
num_threshold = int(args[4])
permissive = args[5]

# check if input file exists
if not os.path.exists(in_file):
Expand All @@ -219,6 +226,9 @@ def process_requirements(args):
if pct_threshold > 100:
sys.exit('ERROR: %f is a %% and cannot be > 100' %(pct_threshold))

if ((permissive != 'yes') & (permissive != 'no')):
sys.exit('ERROR: \'permissive\' should be either \'yes\' or \'no\'')

return

# call main function
Expand All @@ -234,6 +244,8 @@ def process_requirements(args):
parser.add_argument('out_file', metavar='out_file', type=str, help='Path to output file; must end .json')
parser.add_argument('pct_threshold', metavar='pct_threshold', type=float, help='Min. coverage, as %%')
parser.add_argument('num_threshold', metavar='num_threshold', type=int, help='Min. coverage, as no. of reads. Should be a positive integer.')
parser.add_argument('permissive', metavar='permissive', type=str, help="Boolean. if True then permissive error handling will be applied. \
This means samples will proceed even if Mycobacteriaceae is not the top hit" )

args = parser.parse_args()

Expand All @@ -243,6 +255,13 @@ def process_requirements(args):
out_file = sys.argv[2]
pct_threshold = float(sys.argv[3])
num_threshold = int(sys.argv[4])
permissive = sys.argv[5]

# coerce the 'permissive' string into a bool: "yes" -> True, anything else -> False
if permissive == "yes":
permissive = True
else:
permissive = False

# read kraken report
S = []
Expand All @@ -253,7 +272,7 @@ def process_requirements(args):
S, G, G1, F, non_human_species_detected = read_kraken_report(in_file, pct_threshold, num_threshold)

# parse kraken report and generate output
out = parse_kraken_report(S, G, G1, F, non_human_species_detected, pct_threshold, num_threshold)
out = parse_kraken_report(S, G, G1, F, non_human_species_detected, pct_threshold, num_threshold, permissive)

# CREATE OUTPUT FILE
with open(out_file, 'w') as f:
Expand Down
2 changes: 1 addition & 1 deletion config/containers.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
process {

withLabel:getversion {
container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.9"
container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.9r1"
}

withLabel:preprocessing {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
FROM ubuntu:focal


LABEL maintainer="[email protected]" \
about.summary="container for the preprocessing workflow"

Expand Down Expand Up @@ -80,18 +81,21 @@ RUN curl -fsSL https://github.com/OpenGene/fastp/archive/v${fastp_version}.tar.g
&& cd .. \
&& rm -r fastp-${fastp_version}

#fastqc
RUN wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v${fastqc_version}.zip \
&& unzip fastqc_v${fastqc_version}.zip \
&& chmod +x FastQC/fastqc \
&& mv FastQC/* /usr/local/bin \
&& rm fastqc_v${fastqc_version}.zip \
&& rm -r FastQC

#Kraken needed for afanc and standalone
RUN curl -fsSL https://github.com/DerrickWood/kraken2/archive/v${kraken2_version}.tar.gz | tar -xz \
&& cd kraken2-${kraken2_version} \
&& ./install_kraken2.sh /usr/local/bin \
&& cd ..

# afanc (plus mash and fastANI)
RUN curl -fsSL https://github.com/ArthurVM/Afanc/archive/refs/tags/v${afanc_version}-alpha.tar.gz | tar -xz \
&& cd Afanc-${afanc_version}-alpha \
&& pip3 install ./ \
Expand All @@ -103,22 +107,12 @@ RUN curl -fsSL https://github.com/ArthurVM/Afanc/archive/refs/tags/v${afanc_vers
&& unzip fastANI-Linux64-v${fastani_version}.zip \
&& mv fastANI /usr/local/bin

#edirect needed for afanc
RUN sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)" \
&& mkdir -p /opt/edirect \
&& mv /root/edirect/* /opt/edirect

RUN git clone --recursive -b geno_kmer_count https://github.com/phelimb/mccortex \
&& make -C mccortex \
&& mv mccortex /usr/local/bin \
&& curl -fsSL mykrobe-${mykrobe_version}.tar.gz https://github.com/Mykrobe-tools/mykrobe/archive/v${mykrobe_version}.tar.gz | tar -xz \
&& cd mykrobe-${mykrobe_version} \
&& pip3 install requests \
&& pip3 install . \
&& ln -s /usr/local/bin/mccortex/bin/mccortex31 /usr/local/lib/python3.8/dist-packages/mykrobe/cortex/mccortex31 \
&& mykrobe panels update_metadata \
&& mykrobe panels update_species all \
&& cd ..

# install BWA
RUN curl -fsSL https://github.com/lh3/bwa/archive/v${bwa_version}.tar.gz | tar -C /usr/local/bin -xz \
&& make -C /usr/local/bin/bwa-${bwa_version} \
&& chmod +x /usr/local/bin/bwa-${bwa_version}/bwa
Expand Down
2 changes: 1 addition & 1 deletion modules/decontaminationModules.nf
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ process reKraken {
"""
kraken2 --threads ${task.cpus} --db . --output ${kraken2_read_classification} --report ${kraken2_report} --paired $fq1 $fq2
parse_kraken_report2.py ${kraken2_report} ${kraken2_json} ${params.percent_threshold} ${params.n_reads_threshold}
parse_kraken_report2.py ${kraken2_report} ${kraken2_json} ${params.percent_threshold} ${params.n_reads_threshold} ${params.permissive}
rm -rf ${sample_name}_read_classifications.txt
"""

Expand Down
2 changes: 1 addition & 1 deletion modules/preprocessingModules.nf
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ process kraken2 {
"""
kraken2 --threads ${task.cpus} --db . --output ${kraken2_read_classification} --report ${kraken2_report} --paired $fq1 $fq2
parse_kraken_report2.py ${kraken2_report} ${kraken2_json} ${params.percent_threshold} ${params.n_reads_threshold}
parse_kraken_report2.py ${kraken2_report} ${kraken2_json} ${params.percent_threshold} ${params.n_reads_threshold} ${params.permissive}
extract_kraken_reads.py -k ${kraken2_read_classification} -r ${kraken2_report} -s $fq1 -s2 $fq2 -o ${nonBac_depleted_reads_1} -o2 ${nonBac_depleted_reads_2} --taxid 2 --include-children --fastq-output >/dev/null
Expand Down
File renamed without changes.

0 comments on commit 1ef8bcc

Please sign in to comment.