From d82b5e93dff66bf91e6fb7857c46c4fd32c51dfb Mon Sep 17 00:00:00 2001 From: Tom Whalley Date: Thu, 22 Aug 2024 12:11:29 +0000 Subject: [PATCH] log passes --- bin/identify_tophit_and_contaminants2.py | 4 +++- docker/Dockerfile.preprocessing-0.9.8.1 | 1 + modules/decontaminationModules.nf | 22 +++++++++++----------- modules/preprocessingModules.nf | 3 ++- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/bin/identify_tophit_and_contaminants2.py b/bin/identify_tophit_and_contaminants2.py index ffe12e9..93f8547 100755 --- a/bin/identify_tophit_and_contaminants2.py +++ b/bin/identify_tophit_and_contaminants2.py @@ -483,6 +483,7 @@ def process_reports(afanc_json_path, kraken_json_path, supposed_species, unmix_m parser.add_argument('myco_dir', metavar='myco_dir', type=str, help='Path to myco directory') parser.add_argument('prev_species_json', metavar='prev_species_json', type=str, help='Path to previous species json file. Can be set to \'null\'') parser.add_argument('permissive', metavar='permissive', type=str, help="Is either \'yes\' or \'no\', given in response to the question: do you want to carry on to Clockwork regardless of errors?") + parser.add_argument('pass_number', metavar='pass_number', type=int, help="Pass number. Refers to what pass of decontamination the pipeline is on") args = parser.parse_args() # REQUIREMENTS @@ -495,6 +496,7 @@ def process_reports(afanc_json_path, kraken_json_path, supposed_species, unmix_m myco_dir = sys.argv[6] prev_species_json = sys.argv[7] permissive = sys.argv[8] + pass_number = sys.argv[9] # read assembly summary urls, tax_ids = read_assembly_summary(assembly_file) @@ -509,6 +511,6 @@ def process_reports(afanc_json_path, kraken_json_path, supposed_species, unmix_m f.write(url + "\n") # print final file - out_file2 = sample_id + '_species_in_sample.json' + out_file2 = sample_id + '_species_in_sample_pass_' + str(pass_number) + '.json' with open(out_file2, 'w') as f: json.dump(out, f, indent = 4) diff --git a/docker/Dockerfile.preprocessing-0.9.8.1 b/docker/Dockerfile.preprocessing-0.9.8.1 index 0c4da95..ab92979 100644 --- a/docker/Dockerfile.preprocessing-0.9.8.1 +++ b/docker/Dockerfile.preprocessing-0.9.8.1 @@ -1,5 +1,6 @@ FROM ubuntu:focal + LABEL maintainer="pricea35@cardiff.ac.uk" \ about.summary="container for the preprocessing workflow" diff --git a/modules/decontaminationModules.nf b/modules/decontaminationModules.nf index 6cfb91c..76a15b7 100644 --- a/modules/decontaminationModules.nf +++ b/modules/decontaminationModules.nf @@ -34,7 +34,7 @@ process identifyBacterialContaminants { report_json = "${sample_name}_report.json" """ - identify_tophit_and_contaminants2.py ${afanc_json} ${kraken_json} ${refseq} ${params.species} ${params.unmix_myco} ${resources} null ${params.permissive} + identify_tophit_and_contaminants2.py ${afanc_json} ${kraken_json} ${refseq} ${params.species} ${params.unmix_myco} ${resources} null ${params.permissive} ${pass} contam_to_remove=\$(jq -r '.summary_questions.are_there_contaminants' ${sample_name}_species_in_sample.json) acceptable_species=\$(jq -r '.summary_questions.is_the_top_species_appropriate' ${sample_name}_species_in_sample.json) @@ -340,32 +340,32 @@ process summarise { tuple val(sample_name), path("${sample_name}_species_in_sample.json"), stdout, emit: summary_json stdout emit: do_we_break path "${sample_name}_err.json", emit: summary_log optional true - path "${sample_name}_report.json", emit: summary_report optional true + path "${sample_name}_pass_${pass}_report.json", emit: summary_report optional true val(pass), emit: pass_number script: error_log = "${sample_name}_err.json" - report_json = "${sample_name}_report.json" - + report_json = "${sample_name}_pass_${pass}_report.json" + species_in_sample = "${sample_name}_species_in_sample_pass_${pass}.json" """ - identify_tophit_and_contaminants2.py ${afanc_json} ${kraken_json} ${refseq} ${params.species} ${params.unmix_myco} ${resources} ${prev_species_json} ${params.permissive} + identify_tophit_and_contaminants2.py ${afanc_json} ${kraken_json} ${refseq} ${params.species} ${params.unmix_myco} ${resources} ${prev_species_json} ${params.permissive} ${pass} - contam_to_remove=\$(jq -r '.summary_questions.are_there_contaminants' ${sample_name}_species_in_sample.json) - acceptable_species=\$(jq -r '.summary_questions.is_the_top_species_appropriate' ${sample_name}_species_in_sample.json) - top_hit=\$(jq -r '.top_hit.name' ${sample_name}_species_in_sample.json) + contam_to_remove=\$(jq -r '.summary_questions.are_there_contaminants' ${species_in_sample}) + acceptable_species=\$(jq -r '.summary_questions.is_the_top_species_appropriate' ${species_in_sample}) + top_hit=\$(jq -r '.top_hit.name' ${species_in_sample}) if [ \$contam_to_remove == 'yes' ]; then if [ "${params.permissive}" == "no" ]; then printf "${sample_name}" - echo '{"error":"sample remains contaminated, even after attempting to resolve this"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${sample_name}_species_in_sample.json > ${report_json} + echo '{"error":"sample remains contaminated, even after attempting to resolve this"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${species_in_sample} > ${report_json} else if [ "${pass}" == 2 ]; then printf "NOW_ALIGN_TO_REF_${sample_name}" else printf "${sample_name}" fi - echo '{"warning":"sample remains contaminated, even after attempting to resolve this"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${sample_name}_species_in_sample.json > ${report_json} + echo '{"warning":"sample remains contaminated, even after attempting to resolve this"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${species_in_sample} > ${report_json} fi fi @@ -373,7 +373,7 @@ process summarise { printf "NOW_ALIGN_TO_REF_${sample_name}" elif [ \$contam_to_remove == 'no' ] && [ \$acceptable_species == 'no' ]; then jq -n --arg key "\$top_hit" '{"error": ("top hit " + \$key + " does not have a reference genome. Sample will not proceed beyond preprocessing workflow.")}' > ${error_log} && \ - jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${sample_name}_species_in_sample.json > ${report_json} + jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${species_in_sample} > ${report_json} printf "DO_NOT_PROCEED_${sample_name}" fi """ diff --git a/modules/preprocessingModules.nf b/modules/preprocessingModules.nf index 504ef8b..8504450 100644 --- a/modules/preprocessingModules.nf +++ b/modules/preprocessingModules.nf @@ -374,7 +374,8 @@ process afanc { cp ${sample_name}/${sample_name}.json ${sample_name}_afanc_original.json reformat_afanc_json.py ${sample_name}/${sample_name}.json - identify_tophit_and_contaminants2.py ${afanc_report} ${kraken_json} $refseq_path ${params.species} ${params.unmix_myco} $resource_dir null ${params.permissive} + identify_tophit_and_contaminants2.py ${afanc_report} ${kraken_json} $refseq_path ${params.species} ${params.unmix_myco} $resource_dir null ${params.permissive} ${pass} + mv "${sample_name}"_species_in_sample_pass_0.json "${sample_name}"_species_in_sample.json echo '{"error":"Kraken's top family hit either wasn't Mycobacteriaceae, or there were < 100k Mycobacteriaceae reads. Sample will not proceed further than afanc."}' | jq '.' > ${error_log} && printf "no" && jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${sample_name}_species_in_sample.json > ${report_json}