Skip to content

Commit

Permalink
remove boto/path checking in python
Browse files Browse the repository at this point in the history
  • Loading branch information
t-whalley committed Nov 29, 2023
1 parent 8327e56 commit 6dbe554
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 102 deletions.
93 changes: 10 additions & 83 deletions bin/identify_tophit_and_contaminants2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,78 +11,6 @@
import configparser
import pathlib

from botocore.exceptions import ClientError
from collections import namedtuple

def get_credentials(profile, cred_path):
    """Resolve CLIMB S3 credentials for *profile*.

    Reads the INI-style credentials file at *cred_path* (``~`` is expanded);
    the environment variables AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    override any values found in the file.  Exits the process with status 1
    if no key pair can be resolved.

    Returns a namedtuple with fields
    (access_key, secret_key, endpoint, region, profile_name).
    """
    __s3_creds = namedtuple(
        "s3_credentials",
        ["access_key", "secret_key", "endpoint", "region", "profile_name"],
    )

    # Initialise to None so the "credentials not found" check below works even
    # when neither the file nor the environment supplies them.  (Previously
    # these names were unbound in that case, raising NameError instead of
    # printing the friendly error message.)
    access_key = None
    secret_key = None

    # Expand '~' before the isfile() test: testing the unexpanded path meant a
    # '~/...' credentials file was silently ignored.
    cred_path = os.path.expanduser(cred_path)

    credential_file = configparser.ConfigParser()

    if os.path.isfile(cred_path):
        # Use a context manager so the file handle is always closed.
        with open(cred_path, "rt") as handle:
            credential_file.read_file(handle)
    else:
        credential_file = False

    profile = "climb" if not profile else profile

    endpoint = "https://s3.climb.ac.uk"

    region = "s3"

    if credential_file:
        access_key = credential_file[profile]["aws_access_key_id"]
        secret_key = credential_file[profile]["aws_secret_access_key"]

    # Environment variables take precedence over the credentials file.
    if os.getenv("AWS_ACCESS_KEY_ID"):
        access_key = os.getenv("AWS_ACCESS_KEY_ID")

    if os.getenv("AWS_SECRET_ACCESS_KEY"):
        secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    if not access_key or not secret_key:
        error = """CLIMB S3 credentials could not be found, please provide valid credentials in one of the following ways:
- In a correctly formatted config file (~/.aws/credentials)
- As environmental variables 'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY'
- As a command line argument, see --help for more details
"""
        print(error, file=sys.stderr)
        sys.exit(1)

    s3_credentials = __s3_creds(
        access_key=access_key,
        secret_key=secret_key,
        endpoint=endpoint,
        region=region,
        profile_name=profile,
    )

    return s3_credentials

def create_client(creds):
    """Build a boto3 S3 client from a credentials namedtuple.

    *creds* must provide ``endpoint``, ``access_key`` and ``secret_key``
    attributes (as returned by get_credentials).
    """
    return boto3.client(
        "s3",
        endpoint_url=creds.endpoint,
        aws_access_key_id=creds.access_key,
        aws_secret_access_key=creds.secret_key,
    )

def _is_file_in_s3(client, folder, file):
    """Return True if object *file* exists in bucket *folder*.

    Probes the object with GetObject; any botocore ClientError (missing key,
    missing bucket, access denied, ...) is reported as "not present".
    """
    try:
        client.get_object(Bucket=folder, Key=file)
    except ClientError:
        return False
    return True


def is_file_in_s3(bucket, file, config="~/.aws/credentials", profile="climb"):
    """Return True if *file* exists in S3 *bucket*.

    Convenience wrapper: resolves credentials for *profile* from *config*,
    builds a client, and delegates the existence check to _is_file_in_s3.
    """
    client = create_client(get_credentials(profile, config))
    return _is_file_in_s3(client, bucket, file)


# define process requirements function
def process_requirements(args):
# REQUIREMENTS
Expand All @@ -93,11 +21,10 @@ def process_requirements(args):
unmix_myco = args[5]
myco_dir = args[6]
prev_species_json = args[7]
credential_file = args[8]

if credential_file == "null":
credential_file = "~/.aws/config"

credential_file = "~/.aws/config"

"""
# check if input files exist and not empty
if not os.path.exists(afanc_json):
sys.exit('ERROR: cannot find %s' %(afanc_json))
Expand All @@ -114,21 +41,22 @@ def process_requirements(args):
if os.stat(assembly_file).st_size == 0:
sys.exit('ERROR: %s is empty' %(assembly_file))
#if not os.path.exists(myco_dir) and not bucket_exists(myco_dir):
# sys.exit('ERROR: cannot find %s' %(myco_dir))
if not os.path.exists(myco_dir) and not bucket_exists(myco_dir):
sys.exit('ERROR: cannot find %s' %(myco_dir))
if (prev_species_json != 'null'):
if not os.path.exists(prev_species_json):
sys.exit('ERROR: cannot find %s' %(prev_species_json))
if os.stat(prev_species_json).st_size == 0:
sys.exit('ERROR: %s is empty' %(prev_species_json))

"""

species = ['abscessus', 'africanum', 'avium', 'bovis', 'chelonae', 'chimaera', 'fortuitum', 'intracellulare', 'kansasii', 'tuberculosis']
for spec in species:
spec_fasta_path = os.path.join(myco_dir, spec + '.fasta')
spec_mmi_path = os.path.join(myco_dir, spec + '.mmi')


"""
if myco_dir.startswith("s3://"):
s3_myco_dir = myco_dir.replace("s3://", "")
spec_fasta = s3_myco_dir.split("/", 1)[-1] + "/" + spec + ".fasta"
Expand All @@ -150,7 +78,8 @@ def process_requirements(args):
else:
if not os.path.exists(spec_fasta_path):
sys.exit('ERROR: cannot find %s' %(spec_mmi_path))

"""

if ((supposed_species != 'null') & (supposed_species not in species)):
sys.exit('ERROR: if you provide a species ID, it must be one of either: abscessus|africanum|avium|bovis|chelonae|chimaera|fortuitum|intracellulare|kansasii|tuberculosis')

Expand Down Expand Up @@ -559,7 +488,6 @@ def process_reports(afanc_json_path, kraken_json_path, supposed_species, unmix_m
parser.add_argument('unmix_myco', metavar='unmix_myco', type=str, help='Is either \'yes\' or \'no\', given in response to the question: do you want to disambiguate mixed-mycobacterial samples by read alignment?\nIf \'no\', any contaminating mycobacteria will be recorded but NOT acted upon')
parser.add_argument('myco_dir', metavar='myco_dir', type=str, help='Path to myco directory')
parser.add_argument('prev_species_json', metavar='prev_species_json', type=str, help='Path to previous species json file. Can be set to \'null\'')
parser.add_argument('credential_file', metavar='credential_file', type=str, help='Path to AWS config file. Can be set to \'null\'')
args = parser.parse_args()

# REQUIREMENTS
Expand All @@ -571,7 +499,6 @@ def process_reports(afanc_json_path, kraken_json_path, supposed_species, unmix_m
unmix_myco = sys.argv[5]
myco_dir = sys.argv[6]
prev_species_json = sys.argv[7]
credential_file = sys.argv[8]

# read assembly summary
urls, tax_ids = read_assembly_summary(assembly_file)
Expand Down
131 changes: 131 additions & 0 deletions docker/Dockerfile.preprocessing-0.9.7r9
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Container for the preprocessing workflow: QC, speciation and alignment
# tooling (samtools/bcftools, fastp, fastQC, kraken2, afanc, mykrobe, bwa, ...)
# built from pinned source releases on Ubuntu 20.04.
FROM ubuntu:focal

LABEL maintainer="[email protected]" \
      about.summary="container for the preprocessing workflow"

# Pinned tool versions; bump here to upgrade a tool.
ENV samtools_version=1.12 \
    bcftools_version=1.12 \
    htslib_version=1.12 \
    bedtools_version=2.29.2 \
    bowtie2_version=2.4.2 \
    fastp_version=0.20.1 \
    fastqc_version=0.11.9 \
    fqtools_version=2.3 \
    kraken2_version=2.1.1 \
    afanc_version=0.10.2 \
    mykrobe_version=0.12.1 \
    bwa_version=0.7.17 \
    mash_version=2.3 \
    fastani_version=1.33

ENV PACKAGES="procps curl git wget build-essential zlib1g-dev libncurses-dev libz-dev libbz2-dev liblzma-dev libcurl4-openssl-dev libgsl-dev rsync unzip ncbi-blast+ pigz jq libtbb-dev openjdk-11-jre-headless autoconf r-base-core locales locales-all" \
    PYTHON="python3 python3-pip python3-dev" \
    PYTHON_PACKAGES="biopython awscli boto3"

ENV PATH=${PATH}:/usr/local/bin/mccortex/bin:/usr/local/bin/bwa-${bwa_version}:/opt/edirect \
    LD_LIBRARY_PATH=/usr/local/lib

# NOTE(review): an `export` in its own RUN layer does not persist to later
# layers; apt-get below sets DEBIAN_FRONTEND inline, which is what actually
# takes effect. Kept for parity with the original.
RUN export DEBIAN_FRONTEND="noninteractive"

# Workflow helper scripts.
COPY bin/ /opt/bin/
ENV PATH=/opt/bin:$PATH

# System packages and Python toolchain.
RUN apt-get update \
    && DEBIAN_FRONTEND="noninteractive" apt-get install -y $PACKAGES $PYTHON \
    && pip3 install --upgrade pip \
    && pip3 install $PYTHON_PACKAGES \
    && ln -s /usr/bin/python3 /usr/bin/python

# samtools + bcftools, both built against the same htslib source tree.
RUN curl -fsSL https://github.com/samtools/samtools/archive/${samtools_version}.tar.gz | tar -xz \
    && curl -fsSL https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 | tar -xj \
    && make -C samtools-${samtools_version} -j HTSDIR=../htslib-${htslib_version} \
    && make -C samtools-${samtools_version} -j HTSDIR=../htslib-${htslib_version} prefix=/usr/local install \
    && rm -r samtools-${samtools_version} \
    && curl -fsSL https://github.com/samtools/bcftools/archive/refs/tags/${bcftools_version}.tar.gz | tar -xz \
    && make -C bcftools-${bcftools_version} -j HTSDIR=../htslib-${htslib_version} \
    && make -C bcftools-${bcftools_version} -j HTSDIR=../htslib-${htslib_version} prefix=/usr/local install \
    && rm -r bcftools-${bcftools_version}

# fqtools, built with its own in-tree htslib (reuses the tree unpacked above).
RUN curl -fsSL https://github.com/alastair-droop/fqtools/archive/v${fqtools_version}.tar.gz | tar -xz \
    && mv htslib-${htslib_version} fqtools-${fqtools_version} \
    && cd fqtools-${fqtools_version} \
    && mv htslib-${htslib_version} htslib \
    && cd htslib \
    && autoreconf -i \
    && ./configure \
    && make \
    && make install \
    && cd .. \
    && make \
    && mv bin/* /usr/local/bin \
    && chmod +x /usr/local/bin/fqtools \
    && cd .. \
    && rm -r fqtools-${fqtools_version}

# bedtools
RUN curl -fsSL https://github.com/arq5x/bedtools2/releases/download/v${bedtools_version}/bedtools-${bedtools_version}.tar.gz | tar -xz \
    && make -C bedtools2 \
    && mv bedtools2/bin/* /usr/local/bin \
    && rm -r bedtools2

# bowtie2 (built from source zip)
RUN curl -fsSL https://sourceforge.net/projects/bowtie-bio/files/bowtie2/${bowtie2_version}/bowtie2-${bowtie2_version}-source.zip -o bowtie2-${bowtie2_version}-source.zip \
    && unzip bowtie2-${bowtie2_version}-source.zip \
    && make -C bowtie2-${bowtie2_version} prefix=/usr/local install \
    && rm -r bowtie2-${bowtie2_version} \
    && rm bowtie2-${bowtie2_version}-source.zip

# fastp
RUN curl -fsSL https://github.com/OpenGene/fastp/archive/v${fastp_version}.tar.gz | tar -xz \
    && cd fastp-${fastp_version} \
    && make \
    && make install \
    && cd .. \
    && rm -r fastp-${fastp_version}

# FastQC (prebuilt zip)
RUN wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v${fastqc_version}.zip \
    && unzip fastqc_v${fastqc_version}.zip \
    && chmod +x FastQC/fastqc \
    && mv FastQC/* /usr/local/bin \
    && rm fastqc_v${fastqc_version}.zip \
    && rm -r FastQC

# kraken2
RUN curl -fsSL https://github.com/DerrickWood/kraken2/archive/v${kraken2_version}.tar.gz | tar -xz \
    && cd kraken2-${kraken2_version} \
    && ./install_kraken2.sh /usr/local/bin \
    && cd ..

# afanc + its runtime dependencies mash and fastANI.
RUN curl -fsSL https://github.com/ArthurVM/Afanc/archive/refs/tags/v${afanc_version}-alpha.tar.gz | tar -xz \
    && cd Afanc-${afanc_version}-alpha \
    && pip3 install ./ \
    && cd .. \
    && curl -fsSL "https://github.com/marbl/Mash/releases/download/v${mash_version}/mash-Linux64-v${mash_version}.tar" | tar -x \
    && mv mash-Linux64-v${mash_version}/mash /usr/local/bin \
    && rm -r mash-Linux* \
    && wget https://github.com/ParBLiSS/FastANI/releases/download/v${fastani_version}/fastANI-Linux64-v${fastani_version}.zip \
    && unzip fastANI-Linux64-v${fastani_version}.zip \
    && mv fastANI /usr/local/bin

# NCBI Entrez Direct utilities.
RUN sh -c "$(curl -fsSL ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)" \
    && mkdir -p /opt/edirect \
    && mv /root/edirect/* /opt/edirect

# mccortex + mykrobe.
# FIX: the original passed a stray first argument
# ("mykrobe-${mykrobe_version}.tar.gz") to curl, which curl treats as an
# additional URL and fails on; only the real release URL belongs here.
RUN git clone --recursive -b geno_kmer_count https://github.com/phelimb/mccortex \
    && make -C mccortex \
    && mv mccortex /usr/local/bin \
    && curl -fsSL https://github.com/Mykrobe-tools/mykrobe/archive/v${mykrobe_version}.tar.gz | tar -xz \
    && cd mykrobe-${mykrobe_version} \
    && pip3 install requests \
    && pip3 install . \
    && ln -s /usr/local/bin/mccortex/bin/mccortex31 /usr/local/lib/python3.8/dist-packages/mykrobe/cortex/mccortex31 \
    && mykrobe panels update_metadata \
    && mykrobe panels update_species all \
    && cd ..

# bwa
RUN curl -fsSL https://github.com/lh3/bwa/archive/v${bwa_version}.tar.gz | tar -C /usr/local/bin -xz \
    && make -C /usr/local/bin/bwa-${bwa_version} \
    && chmod +x /usr/local/bin/bwa-${bwa_version}/bwa

# NOTE(review): no-op for the same reason as the earlier export; kept for parity.
RUN unset DEBIAN_FRONTEND

# FIX: the original used the legacy space-separated ENV form across line
# continuations, which set LC_ALL to the single literal value
# "en_US.UTF-8 LANG en_US.UTF-8 LANGUAGE en_US.UTF-8" and left LANG/LANGUAGE
# unset. The '=' form sets the three variables as intended.
ENV LC_ALL=en_US.UTF-8 \
    LANG=en_US.UTF-8 \
    LANGUAGE=en_US.UTF-8

2 changes: 1 addition & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ workflow {

input_files_vjson = input_files.combine(getversion.out.getversion_json)

preprocessing(input_files_vjson, krakenDB, bowtie_dir, params.afanc_myco_db, params.resource_dir, params.refseq, params.aws_config)
preprocessing(input_files_vjson, krakenDB, bowtie_dir, params.afanc_myco_db, params.resource_dir, params.refseq)

// CLOCKWORK SUB-WORKFLOW

Expand Down
16 changes: 5 additions & 11 deletions modules/preprocessingModules.nf
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,7 @@ process afanc_screen {
path(afanc_myco_db)

output:
tuple val(sample_name), path("${sample_name}/${sample_name}.json"), stdout, emit: afanc_json

script:
afanc_report = "${sample_name}_afanc_report.json"
error_log = "${sample_name}_err.json"
report_json = "${sample_name}_report.json"
path "${sample_name}/${sample_name}.json", emit: afanc_json

"""
if [[ ${run_afanc} =~ /${sample_name}/ ]]
Expand All @@ -425,10 +420,8 @@ process afanc_parse {
*/

tag { sample_name }
label 'preprocessing'
label 'normal_cpu'
label 'medium_memory'
label 'retry_afanc'
label 'low_cpu'
label 'low_memory'
label 'afanc_parse'

publishDir "${params.output_dir}/$sample_name/speciation_reports_for_reads_postFastP", mode: 'copy', pattern: '*_afanc_report.json'
Expand Down Expand Up @@ -458,7 +451,8 @@ process afanc_parse {
printf ${sample_name}
else
reformat_afanc_json.py ${unchanged_afanc_report}
identify_tophit_and_contaminants2.py ${afanc_report} ${kraken_json} $refseq_path ${params.species} ${params.unmix_myco} $resource_dir null
echo "Afanc JSON reformatted"
identify_tophit_and_contaminants2.py ${afanc_report} ${kraken_json} $refseq_path ${params.species} ${params.unmix_myco} $resource_dir null
echo '{"error":"Kraken's top family hit either wasn't Mycobacteriaceae, or there were < 100k Mycobacteriaceae reads. Sample will not proceed further than afanc."}' | jq '.' > ${error_log} && printf "no" && jq -s ".[0] * .[1] * .[2]" ${software_json} ${error_log} ${sample_name}_species_in_sample.json > ${report_json}
fi
Expand Down
10 changes: 3 additions & 7 deletions workflows/preprocessing.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ include {countReads} from '../modules/preprocessingModules.nf' params(params)
include {fastp} from '../modules/preprocessingModules.nf' params(params)
include {fastQC} from '../modules/preprocessingModules.nf' params(params)
include {kraken2} from '../modules/preprocessingModules.nf' params(params)
include {afanc_screen} from '../modules/preprocessingModules.nf' params(params)
include {afanc_parse} from '../modules/preprocessingModules.nf' params(params)
include {afanc} from '../modules/preprocessingModules.nf' params(params)
include {mykrobe} from '../modules/preprocessingModules.nf' params(params)
include {bowtie2} from '../modules/preprocessingModules.nf' params(params)
include {identifyBacterialContaminants} from '../modules/preprocessingModules.nf' params(params)
Expand Down Expand Up @@ -58,14 +57,11 @@ workflow preprocessing {
kraken2(fastp.out.fastp_fqs, krakenDB.toList())

mykrobe(kraken2.out.kraken2_fqs)

afanc_screen(kraken2.out.kraken2_fqs.join(kraken2.out.kraken2_json, by: 0), afanc_myco_db)

first_afanc_json = afanc_screen.out.afanc_json
afanc_parse(kraken2.out.kraken2_fqs.join(kraken2.out.kraken2_json, by: 0), first_afanc_json, afanc_myco_db, resource_dir, refseq_path)
afanc(kraken2.out.kraken2_fqs.join(kraken2.out.kraken2_json, by: 0), afanc_myco_db, resource_dir, refseq_path)

// set speciation report
speciation_report = afanc_parse.out.afanc_json
speciation_report = afanc.out.afanc_json

bowtie2(kraken2.out.kraken2_fqs, bowtie_dir.toList())

Expand Down

0 comments on commit 6dbe554

Please sign in to comment.