diff --git a/.editorconfig b/.editorconfig index d898f5a49..244512988 100644 --- a/.editorconfig +++ b/.editorconfig @@ -34,3 +34,6 @@ indent_size = 2 [*.yaml] indent_style = space indent_size = 2 + +[*.nf] +indent_size = 2 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..4ddcc6c3b --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +workflows/references/submit/*.txt filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 000000000..7c649ad26 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,75 @@ +# This workflow will build and push the import pipeline container. +# The plan later will be to include unit tests as well + + +name: Building Pipeline Containers + +on: + push: + branches: + 'dev' +jobs: + + starting-notification: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Initial notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_MESSAGE: 'Creating new pipeline image in docker hub' + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + MSG_MINIMAL: true + + create-docker-image: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: docker login + env: + DOCKER_USER: ${{ secrets.DOCKER_USER }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD + + - name: docker build + run: docker build -f Dockerfile -t rnacentral/rnacentral-import-pipeline . + + - name: docker push + run: docker push rnacentral/rnacentral-import-pipeline + + finished-notification: + needs: + - create-docker-image + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Finished notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_MESSAGE: 'New pipeline image pushed to docker hub' + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + MSG_MINIMAL: true + + singularity-conversion: + needs: + - create-docker-image + uses: rnacentral/rnacentral-import-pipeline/.github/workflows/singularity.yaml@dev + secrets: inherit + + + finished-singularity: + needs: + - singularity-conversion + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Finished notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_MESSAGE: 'New singularity image pushed to ghcr' + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + MSG_MINIMAL: true diff --git a/.github/workflows/singularity.yaml b/.github/workflows/singularity.yaml new file mode 100644 index 000000000..e5630c91d --- /dev/null +++ b/.github/workflows/singularity.yaml @@ -0,0 +1,25 @@ +# This workflow runs the conversion to singularity and stores the result in the +# ghcr so we can pull it more easily + +name: Singularity Build +on: workflow_call + + +jobs: + run_conversion: + name: "Pull docker image and convert" + runs-on: ubuntu-latest + + container: + image: quay.io/singularity/singularity:v3.8.1 + options: --privileged + + steps: + - name: "Pull image" + run: | + singularity pull --name rnacentral-rnacentral-import-pipeline-latest.sif docker://rnacentral/rnacentral-import-pipeline:latest + + - name: "Push to ghcr" + run: | + echo ${{ secrets.GITHUB_TOKEN }} | singularity remote login -u ${{ secrets.GHCR_USERNAME }} --password-stdin oras://ghcr.io + singularity push rnacentral-rnacentral-import-pipeline-latest.sif oras://ghcr.io/${GITHUB_REPOSITORY}:latest diff --git a/.gitignore b/.gitignore index e8ae80863..0dbec5bae 100644 --- a/.gitignore +++ b/.gitignore @@ -101,3 +101,8 @@ stubs .envrc workflows/references/results
workflows/references/metadata +workflows/references/backup +workflows/references/submit/previous-release +workflows/references/manually_annotated/from* +workflows/references/manually_annotated/results +singularity/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c17f67002..4e648edb7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,28 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - repo: https://github.com/psf/black - rev: 19.3b0 + rev: 22.6.0 hooks: - id: black +- repo: https://github.com/pycqa/isort + rev: 5.10.1 + hooks: + - id: isort + args: ["--profile", "black", "--filter-files"] + name: isort (python) +# - repo: https://github.com/doublify/pre-commit-rust +# rev: v1.0 +# hooks: +# - id: fmt +# - id: cargo-check +# - id: clippy +- repo: https://github.com/python-poetry/poetry + rev: '1.2.0rc1' + hooks: + - id: poetry-check + # - id: poetry-lock diff --git a/Dockerfile b/Dockerfile index 942e3e149..d223fba72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7-buster +FROM python:3.8-buster ENV RNA /rna @@ -46,6 +46,7 @@ RUN apt-get install -y \ unzip \ wget + # Install Infernal RUN \ cd $RNA/ && \ @@ -94,6 +95,7 @@ RUN pip3 install -r $RNACENTRAL_IMPORT_PIPELINE/requirements.txt RUN python3 -m textblob.download_corpora + WORKDIR / COPY openssl/openssl.cnf /etc/ssl/ diff --git a/Makefile b/Makefile index 48a766ea7..9900316cd 100644 --- a/Makefile +++ b/Makefile @@ -13,13 +13,25 @@ requirements-dev.txt: requirements-dev.in rust: cargo build --release - cp target/release/json2fasta bin - cp target/release/split-ena bin - cp target/release/expand-urs bin - cp target/release/precompute bin - cp target/release/search-export bin - cp target/release/ftp-export bin - cp target/release/json2dfasta bin + mv -f target/release/json2fasta bin + mv -f target/release/split-ena bin + mv -f target/release/expand-urs bin + mv -f target/release/precompute bin + mv -f target/release/search-export bin + mv -f target/release/ftp-export bin + mv -f target/release/json2dfasta bin + mv -f target/release/expression-parse bin + +clean: + rm bin/json2fasta + rm bin/split-ena + rm bin/expand-urs + rm bin/precompute + rm bin/search-export + rm bin/ftp-export + rm bin/json2dfasta + rm bin/expression-parse + cargo clean docker: Dockerfile requirements.txt .dockerignore docker build -t "$(docker)" . 
diff --git a/analyze.nf b/analyze.nf index 526698378..66b777ace 100755 --- a/analyze.nf +++ b/analyze.nf @@ -7,13 +7,29 @@ include { genome_mapping } from './workflows/genome-mapping' include { r2dt } from './workflows/r2dt' include { rfam_scan } from './workflows/rfam-scan' +include { slack_closure } from './workflows/utils/slack' +include { slack_message } from './workflows/utils/slack' + workflow analyze { take: ready emit: done main: + Channel.of("Starting analyze pipeline") | slack_message ready | (genome_mapping & rfam_scan & r2dt & cpat) | mix | collect | set { done } } workflow { analyze(Channel.of('ready')) } + + +workflow.onComplete { + slack_closure("Analyze workflow completed") + +} + +workflow.onError { + + slack_closure("Analyze workflow hit an error and crashed") + +} diff --git a/bin/check_ids.py b/bin/check_ids.py index 6aba3b8e2..7d47b0d71 100755 --- a/bin/check_ids.py +++ b/bin/check_ids.py @@ -26,10 +26,11 @@ words.update(ignore_ids) special_char = re.compile('[@!#$%^&()<>?/\[\]\'}{~:]') nts = re.compile('^[acgu]+$') +numbers_and_dash = re.compile('^\d+[\-]\d+$') # do not use ids like 6-1, 260-1, etc def check_id(item): - if item.isnumeric() or item.lower() in words: + if item.isnumeric() or item.lower() in words or numbers_and_dash.search(item): result = None elif len(item) > 2 and not special_char.search(item) and not nts.search(item.lower()) and "\\" not in item: result = item @@ -47,55 +48,72 @@ def main(database, filename, output): """ Check ids and create file that will be used by RNAcentral-references. """ - remove_dot = ["ensembl_gene", "ensembl_gencode_gene", "ensembl_metazoa_gene"] - split_on_comma = ["flybase_gene_synonym", "pombase_gene_synonym", "refseq_gene_synonym", "hgnc_gene_synonym"] + remove_dot = ["ensembl", "ensembl_gencode", "ensembl_metazoa"] + split_on_comma = ["flybase", "hgnc", "pombase", "refseq"] + rfam_ignore = [ + "30_255", "30_292", "5S_rRNA", "5_8S_rRNA", "6A", "6S", "7SK", "C4", "CRISPR-DR10", "CRISPR-DR11", + "CRISPR-DR12", "CRISPR-DR13", "CRISPR-DR14", "CRISPR-DR15", "CRISPR-DR16", "CRISPR-DR17", "CRISPR-DR18", + "CRISPR-DR19", "CRISPR-DR2", "CRISPR-DR20", "CRISPR-DR21", "CRISPR-DR22", "CRISPR-DR23", "CRISPR-DR24", + "CRISPR-DR25", "CRISPR-DR26", "CRISPR-DR27", "CRISPR-DR28", "CRISPR-DR29", "CRISPR-DR3", "CRISPR-DR30", + "CRISPR-DR31", "CRISPR-DR32", "CRISPR-DR33", "CRISPR-DR34", "CRISPR-DR35", "CRISPR-DR36", "CRISPR-DR37", + "CRISPR-DR38", "CRISPR-DR39", "CRISPR-DR4", "CRISPR-DR40", "CRISPR-DR41", "CRISPR-DR42", "CRISPR-DR43", + "CRISPR-DR44", "CRISPR-DR45", "CRISPR-DR46", "CRISPR-DR47", "CRISPR-DR48", "CRISPR-DR49", "CRISPR-DR5", + "CRISPR-DR50", "CRISPR-DR51", "CRISPR-DR52", "CRISPR-DR53", "CRISPR-DR54", "CRISPR-DR55", "CRISPR-DR56", + "CRISPR-DR57", "CRISPR-DR58", "CRISPR-DR6", "CRISPR-DR60", "CRISPR-DR61", "CRISPR-DR62", "CRISPR-DR63", + "CRISPR-DR64", "CRISPR-DR65", "CRISPR-DR66", "CRISPR-DR7", "CRISPR-DR8", "CRISPR-DR9", "F6", "Hairpin", + "Hairpin-meta1", "Hairpin-meta2", "Hatchet", "P1", "P10", "P11", "P13", "P14", "P15", "P17", "P18", "P2", "P24", + "P26", "P27", "P31", "P33", "P34", "P35", "P36", "P37", "P4", "P5", "P6", "P8", "P9", "ROSE", "S35", "S414", + "S774", "S808", "SAM", "SL1", "SL2", "U1", "U11", "U12", "U1_yeast", "U2", "U3", "U4", "U4atac", "U5", "U54", + "U6", "U6atac", "U7", "U8", "VA", "csRNA", "drum", "g2", "pRNA", "sar", "sul1", "t44", "tRNA", "tRNA-Sec", + "tmRNA", "tp2", "tracrRNA" + ] with open(filename, 'r') as input_file: with open(output, 'w') as output_file: while line := input_file.readline(): line = 
line.rstrip() line = line.split('|') - - if len(line) == 4: - get_gene = line[0] - get_primary_id = line[1] - urs = line[2] - taxid = line[3] - - # remove "." - if database in remove_dot and "." in get_gene: - get_gene = get_gene.split('.')[0] - - # split on "," - gene_results = [] - if database in split_on_comma: - gene_list = get_gene.split(',') - for item in gene_list: - item = check_id(item) - if item: - gene_results.append(item) - - if gene_results: - primary_id = check_id(get_primary_id) - for gene in gene_results: - if gene and primary_id and gene != primary_id: - output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n') - else: - gene = check_id(get_gene) - primary_id = check_id(get_primary_id) - if gene and primary_id and gene != primary_id: - output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n') - - else: - get_primary_id = line[0] - urs = line[1] - taxid = line[2] - - # check if it is a valid id - primary_id = check_id(get_primary_id) - - if primary_id: - output_file.write(primary_id + '|' + urs + '_' + taxid + '\n') + urs = line[0] + taxid = line[1] + primary_id = check_id(line[2]) + if primary_id and database in remove_dot and "." in primary_id: + primary_id = primary_id.split('.')[0] + + if primary_id and line[3:]: + for item in line[3:]: + if item: + get_id = item + else: + continue + + # ignore some optional_id from Rfam + if database == "rfam" and get_id in rfam_ignore: + output_file.write('|' + primary_id + '|' + urs + '_' + taxid + '\n') + continue + + # remove "." + if database in remove_dot and "." in get_id: + get_id = get_id.split('.')[0] + + # split on "," + results = [] + if database in split_on_comma: + list_of_ids = get_id.split(',') + for elem in list_of_ids: + elem = check_id(elem) + if elem: + results.append(elem) + + if results: + for db_id in results: + if db_id != primary_id: + output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n') + else: + db_id = check_id(get_id) + if db_id and db_id != primary_id: + output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n') + elif primary_id: + output_file.write(primary_id + '|' + urs + '_' + taxid + '\n') if __name__ == '__main__': diff --git a/bin/create_xml_metadata.py b/bin/create_xml_metadata.py new file mode 100755 index 000000000..e9c1f592b --- /dev/null +++ b/bin/create_xml_metadata.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright [2009-present] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import click +import gzip +import random +import string +import uuid +import xml.etree.ElementTree as ET + + +def create_xml_file(results, metadata): + """ + Creates the XML that will be used by the search index + :param results: list of results + :param metadata: file to be created + :return: None + """ + # start to create a XML file + database = ET.Element("database") + ET.SubElement(database, "name").text = "RNAcentral" + entries = ET.SubElement(database, "entries") + + for item in results: + entry = ET.SubElement(entries, "entry", id="metadata" + "_" + str(uuid.uuid4())) + additional_fields = ET.SubElement(entry, "additional_fields") + ET.SubElement(additional_fields, "field", name="entry_type").text = "Metadata" + ET.SubElement(additional_fields, "field", name="job_id").text = item["job_id"] + ET.SubElement(additional_fields, "field", name="database").text = item["db"] + ET.SubElement(additional_fields, "field", name="primary_id").text = item["primary_id"] + + ET.SubElement(database, "entry_count").text = str(len(results)) + + # save the file + tree = ET.ElementTree(database) + ET.indent(tree, space="\t", level=0) + random_string = ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)) + with gzip.open(metadata.split("*")[0] + random_string + ".xml.gz", "wb") as file: + tree.write(file) + + +@click.command() +@click.argument('filename') +@click.argument('output') +def main(filename, output): + """ + This function takes the ids and creates a temporary list to store the metadata. + :param filename: file containing ids + :param output: file to be created + :return: None + """ + with open(filename, "r") as input_file: + temp_results = [] + + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + job_id = line[0] + database = line[1] + + if len(line) < 3: + temp_results.append({"job_id": job_id, "db": database, "primary_id": ""}) + else: + primary_id = line[2] + temp_results.append({"job_id": job_id, "db": database, "primary_id": primary_id}) + + if len(temp_results) >= 500000: + create_xml_file(temp_results, output) + temp_results = [] + + create_xml_file(temp_results, output) + + +if __name__ == "__main__": + main() diff --git a/bin/get_unique_ids.sh b/bin/get_unique_ids.sh new file mode 100755 index 000000000..a75d11f02 --- /dev/null +++ b/bin/get_unique_ids.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# set parameters +file=$1 +database=$2 + +# read file line by line +while IFS= read -r line; do + IFS=$"|" + tmp=($line) + if [[ ${#tmp[*]} = 2 ]]; then + job_id="${tmp[0]}" + urs="${tmp[1]}" + else + job_id="${tmp[0]}" + primary_id="${tmp[1]}" + urs="${tmp[2]}" + fi + + if [[ -n "${job_id}" ]]; then + echo ${job_id} >> ${database}_all_ids.txt + fi + + if [[ -n "${primary_id}" ]]; then + echo ${primary_id} >> ${database}_all_ids.txt + fi + + if [[ -n "${urs}" ]]; then + echo ${urs} >> ${database}_all_ids.txt + fi +done < ${file} + +# create file with unique ids +cat ${database}_all_ids.txt | sort | uniq > ${database}_ids.txt diff --git a/bin/metadata-rnacentral.py b/bin/metadata-rnacentral.py new file mode 100755 index 000000000..98cd365d9 --- /dev/null +++ b/bin/metadata-rnacentral.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright [2009-present] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import click + + +@click.command() +@click.argument('filename') +@click.argument('output') +def main(filename, output): + """ + This function creates a file to store URS and a file with job_ids|URS. + These files can be used to create the metadata for the RNAcentral website. + :param filename: file containing ids + :param output: file to be created + :return: None + """ + type_id = output.split('_')[0] + + with open(filename, "r") as input_file: + with open(output, 'w') as output_file: + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + + if type_id == 'urs': + urs = line[-1] + output_file.write(urs + '\n') + elif type_id == 'job' and len(line) == 2: + job = line[0] + urs = line[1] + if job and urs: + output_file.write(job + '|' + urs + '\n') + elif type_id == 'job' and len(line) == 3: + job = line[0] + primary = line[1] + urs = line[2] + if job and urs: + output_file.write(job + '|' + urs + '\n') + if primary and urs: + output_file.write(primary + '|' + urs + '\n') + + +if __name__ == "__main__": + main() diff --git a/bin/metadata.py b/bin/metadata.py index 3e8f9a7be..bab8bddb0 100755 --- a/bin/metadata.py +++ b/bin/metadata.py @@ -14,38 +14,6 @@ limitations under the License. """ import click -import gzip -import uuid -import xml.etree.ElementTree as ET - - -def create_xml_file(results, metadata): - """ - Creates the XML that will be used by the search index - :param results: list of results - :param metadata: file to be created - :return: None - """ - # start to create a XML file - database = ET.Element("database") - ET.SubElement(database, "name").text = "RNAcentral" - entries = ET.SubElement(database, "entries") - - for item in results: - entry = ET.SubElement(entries, "entry", id="metadata" + "_" + str(uuid.uuid4())) - additional_fields = ET.SubElement(entry, "additional_fields") - ET.SubElement(additional_fields, "field", name="entry_type").text = "Metadata" - ET.SubElement(additional_fields, "field", name="job_id").text = item["job_id"] - ET.SubElement(additional_fields, "field", name="database").text = item["db"] - ET.SubElement(additional_fields, "field", name="primary_id").text = item["primary_id"] - - ET.SubElement(database, "entry_count").text = str(len(results)) - - # save the file - tree = ET.ElementTree(database) - ET.indent(tree, space="\t", level=0) - with gzip.open(metadata, "wb") as file: - tree.write(file) @click.command() @@ -53,38 +21,37 @@ def create_xml_file(results, metadata): @click.argument('output') def main(filename, output): """ - This function takes the ids and creates a temporary list to store the metadata. + This function creates a file with the metadata of a given database. 
:param filename: file containing ids :param output: file to be created :return: None """ with open(filename, "r") as input_file: - temp_results = [] database = filename.split(".")[0] - no_primary_id = ["genecards", "gtrnadb", "mirgenedb", "pdbe", "sgd"] - while line := input_file.readline(): - line = line.rstrip() - line = line.split('|') - - if database in no_primary_id: - job_id = line[0] - urs = line[1] - - temp_results.append({"job_id": urs, "db": "rnacentral", "primary_id": ""}) - temp_results.append({"job_id": job_id, "db": "rnacentral", "primary_id": urs}) - temp_results.append({"job_id": job_id, "db": database, "primary_id": ""}) - else: - job_id = line[0] - primary_id = line[1] - urs = line[2] - - temp_results.append({"job_id": urs, "db": "rnacentral", "primary_id": ""}) - temp_results.append({"job_id": primary_id, "db": database, "primary_id": ""}) - temp_results.append({"job_id": primary_id, "db": "rnacentral", "primary_id": urs}) - temp_results.append({"job_id": job_id, "db": "rnacentral", "primary_id": urs}) - temp_results.append({"job_id": job_id, "db": database, "primary_id": primary_id}) - create_xml_file(temp_results, output) + with open(output, "w") as output_file: + + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + + if len(line) < 3: + job_id = line[0].lower() + urs = line[1] + + output_file.write(urs + "|" + "rnacentral" + "\n") + output_file.write(job_id + "|" + "rnacentral" + "|" + urs + "\n") + output_file.write(job_id + "|" + database + "\n") + else: + job_id = line[0].lower() + primary_id = line[1].lower() + urs = line[2] + + output_file.write(urs + "|" + "rnacentral" + "\n") + output_file.write(primary_id + "|" + database + "\n") + output_file.write(primary_id + "|" + "rnacentral" + "|" + urs + "\n") + output_file.write(job_id + "|" + "rnacentral" + "|" + urs + "\n") + output_file.write(job_id + "|" + database + "|" + primary_id + "\n") if __name__ == "__main__": diff --git a/bin/references-manually-annotated.py b/bin/references-manually-annotated.py new file mode 100755 index 000000000..0295e612c --- /dev/null +++ b/bin/references-manually-annotated.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright [2009-present] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import click + + +@click.command() +@click.argument('filename') +@click.argument('output') +def main(filename, output): + """ + This function creates a file for each database containing the manually annotated references + :param filename: file containing ids + :param output: file to be created + :return: None + """ + name = output.split("*")[0] + + with open(filename, "r") as input_file: + with open(name + "hgnc", 'w') as hgnc, open(name + "pombase", 'w') as pombase, open(name + "sgd", 'w') as sgd, \ + open(name + "tair", 'w') as tair, open(name + "zfin", 'w') as zfin: + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + urs = line[0] + database = line[1] + pmid = line[2] + doi = line[3] + pmcid = line[4] + + if database.lower() == "hgnc": + hgnc.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "pombase": + pombase.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "sgd": + sgd.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "tair": + tair.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "zfin": + zfin.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + + +if __name__ == "__main__": + main() diff --git a/bin/upload_ids.sh b/bin/upload_ids.sh index 47ea731b1..a9edd1296 100755 --- a/bin/upload_ids.sh +++ b/bin/upload_ids.sh @@ -1,114 +1,27 @@ #!/bin/bash # Script to submit ids to RNAcentral-reference -# -# Usage: ./upload.sh [file] [database] -# -# The file can contain job_id, primary_id and urs_taxid. -# Each line in the file must have at least a job_id or a primary_id. -# Example: -# 5_8S_rRNA|RF00002|URS000019A91D_7230 -# Y_RNA|RF00019| -# Ysr224|| -# ZMP-ZTP||URS0001BC94F0_256318 -# |RF01750|URS0001BC834A_408172 -# |RF02770| - -# set parameters +# set parameter file=$1 -database=$2 -primary=$3 -upi=$4 - -# set database -if [ $database == "ensembl_gencode_gene" ] || [ $database == "ensembl_gencode_locus_tag" ]; then - database="ensembl_gencode" -elif [ $database == "ensembl_gene" ] || [ $database == "ensembl_locus_tag" ]; then - database="ensembl" -elif [ $database == "ensembl_metazoa_gene" ] || [ $database == "ensembl_metazoa_locus_tag" ]; then - database="ensembl_metazoa" -elif [ $database == "ensembl_plants_gene" ] || [ $database == "ensembl_plants_locus_tag" ]; then - database="ensembl_plants" -elif [ $database == "ensembl_protists_gene" ] || [ $database == "ensembl_protists_locus_tag" ]; then - database="ensembl_protists" -elif [ $database == "flybase_gene_synonym" ] || [ $database == "flybase_locus_tag" ]; then - database="flybase" -elif [ $database == "hgnc_gene_synonym" ] || [ $database == "hgnc_accession" ]; then - database="hgnc" -elif [ $database == "pombase_gene_synonym" ] || [ $database == "pombase_gene" ]; then - database="pombase" -elif [ $database == "refseq_gene" ] || [ $database == "refseq_gene_synonym" ] || [ $database == "refseq_optional_id" ]; then - database="refseq" -fi # create folder [ ! 
-d submitted ] && mkdir submitted function submitJob { - line=$1 - IFS=$'|' - tmp=($line) - - if [ -z ${primary} ] && [ -z ${upi} ]; then - # set job_id, primary_id and urs - job_id="${tmp[0]}" - primary_id="${tmp[1]}" - urs="${tmp[2]}" - elif [ -z ${primary} ]; then - # set job_id and primary_id - job_id="${tmp[0]}" - primary_id="${tmp[1]}" - else - # set job_id - job_id="${tmp[1]}" - fi - - # submit search according to the parameters used - if [ -z ${primary_id} ] && [ -z ${urs} ]; then - # submit job (id and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"database\": \"${database}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${job_id} ] && [ -z ${urs} ]; then - # submit job (primary_id and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${primary_id}\", \"database\": \"${database}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${urs} ]; then - # submit job (id, primary_id and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"primary_id\": \"${primary_id}\", \"database\": \"${database}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${primary_id} ]; then - # submit job (id, urs and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"database\": \"${database}\", \"urs\": \"${urs}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${job_id} ]; then - # submit job (primary_id, urs and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${primary_id}\", \"database\": \"${database}\", \"urs\": \"${urs}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - else - # submit job (id, database, primary_id, urs) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"database\": \"${database}\", \"primary_id\": \"${primary_id}\", \"urs\": \"${urs}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - fi - - sleep 0.05 + # set parameter + job_id=$1 + + # submit job + curl -X POST \ + -H "Content-Type:application/json" \ + -d "{\"id\": \"${job_id}\"}" \ + http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; } # loop through the file -while IFS="" read -r p || [ -n "$p" ] +while IFS="" read -r line || [ -n "$line" ] do - submitJob "$p" + submitJob "$line" done < "$file" diff --git a/config/cluster.config b/config/cluster.config index 47608d7dd..d4599264b 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -10,6 +10,7 @@ process { executor { $lsf { queueSize = 10000 + submitRateLimit = '1sec' } } diff --git a/config/databases.config b/config/databases.config index 1c159945d..1cc88238e 100644 --- a/config/databases.config +++ b/config/databases.config @@ -17,7 +17,7 @@ params { } ena { - remote = '/nfs/ftp/pub/databases/ena/non-coding/snapshot_latest' + remote = '/nfs/ftp/public/databases/ena/non-coding/snapshot_latest' max_sequences = 50000 } @@ -152,7 +152,7 @@ params { silva { needs_taxonomy = true - remote = 'ftp://ftp.arb-silva.de/current/Exports/rnac/SILVA_*Parc.rnac.gz' + remote = 'http://ftp.arb-silva.de/current/Exports/rnac/' } snodb { @@ -187,7 +187,7 @@ params { } sgd { - remote = 
'https://sgd-prod-upload.s3.amazonaws.com/latest/RNAcentral.json' + remote = "https://downloads.yeastgenome.org/latest/RNAcentral.json" } tarbase { diff --git a/config/main.config b/config/main.config index 0be1444ff..0fa78f4e8 100644 --- a/config/main.config +++ b/config/main.config @@ -7,7 +7,7 @@ params { connections = slurper.parse(new File(connection_file)) import_data { - chunk_size = 1024 * 1000 * 1000 + chunk_size = 256 * 1000 * 1000 } } diff --git a/config/precompute.config b/config/precompute.config index 8b7574452..eb0f2c804 100644 --- a/config/precompute.config +++ b/config/precompute.config @@ -1,11 +1,10 @@ params { precompute { run = true - max_entries = 500000 + max_entries = 250000 load_size = 1024 * 1000 * 1000 maxForks = 5 method = 'release' - range.memory = 8.GB } } diff --git a/containers/cpat/Dockerfile b/containers/cpat/Dockerfile index dc4dbd064..46a9775b3 100644 --- a/containers/cpat/Dockerfile +++ b/containers/cpat/Dockerfile @@ -1,6 +1,6 @@ -From r-base:3.4.1 +From r-base:3.6.3 -RUN apt-get update && apt-get install -y python3-pip +RUN apt-get update && apt-get install -y python3-pip procps RUN cp /usr/bin/python3 /usr/bin/python RUN pip3 install numpy diff --git a/files/import-data/expressionatlas/lookup-dump-query.sql b/files/import-data/expressionatlas/lookup-dump-query.sql new file mode 100644 index 000000000..d7966fd12 --- /dev/null +++ b/files/import-data/expressionatlas/lookup-dump-query.sql @@ -0,0 +1,27 @@ +COPY( + SELECT urs_taxid, + xref.taxid as taxid, + gene || '|' || external_id || '|' || gene_synonym || '|' || optional_id as external_id, + description, + seq_version, + assembly_id, + region_start, + region_stop, + rsr.chromosome, + strand, + rna_type, + COALESCE(seq_short, seq_long) as seq + FROM rnc_accessions + JOIN xref + ON xref.ac = rnc_accessions.accession + + JOIN rna + ON xref.upi = rna.upi + + JOIN rnc_accession_sequence_region rasr + ON rasr.accession = xref.ac + + JOIN rnc_sequence_regions rsr + ON rsr.id = region_id + + ) TO STDOUT CSV HEADER diff --git a/files/import-data/load/long-sequences.ctl b/files/import-data/load/long-sequences.ctl index 2d8942628..82d98bfef 100644 --- a/files/import-data/load/long-sequences.ctl +++ b/files/import-data/load/long-sequences.ctl @@ -27,14 +27,15 @@ TARGET COLUMNS ( WITH drop indexes, batch rows = 25000, - batch size = 512MB, - workers = 10, + batch size = 256MB, + prefetch rows = 50000, + workers = 5, concurrency = 2, skip header = 0, fields escaped by double-quote, fields terminated by ',' SET - work_mem to '256 MB', + work_mem to '512 MB', maintenance_work_mem to '1 GB' ; diff --git a/files/import-data/pre-release/000__assemblies.sql b/files/import-data/pre-release/000__assemblies.sql index 8da15840b..8773e420e 100644 --- a/files/import-data/pre-release/000__assemblies.sql +++ b/files/import-data/pre-release/000__assemblies.sql @@ -1,12 +1,5 @@ BEGIN; -DELETE FROM ensembl_assembly ensembl -USING load_assemblies load -WHERE - load.taxid = ensembl.taxid - and load.assembly_id != ensembl.assembly_id -; - INSERT INTO ensembl_assembly ( assembly_id, assembly_full_name, diff --git a/files/precompute/fetch-xref-info.sql b/files/precompute/fetch-xref-info.sql index 9d7696400..15a5ca5d9 100644 --- a/files/precompute/fetch-xref-info.sql +++ b/files/precompute/fetch-xref-info.sql @@ -1,22 +1,11 @@ -CREATE TEMP TABLE xref_releases AS -SELECT - rna.id as rna_id, - xref.upi, - xref.last -FROM xref -JOIN rna -ON - rna.upi = xref.upi -; +COPY( + SELECT + rna.id, + xref.upi, + xref.last + FROM xref + JOIN rna + 
ON + rna.upi = xref.upi -CREATE INDEX ix_xref_releases_upi ON xref_releases(upi); - -COPY ( -SELECT - rna_id, - upi, - max(last) -from xref_releases -group by rna_id, upi -order by rna_id ASC ) TO STDOUT (FORMAT CSV) diff --git a/files/precompute/methods/weekly.sql b/files/precompute/methods/weekly.sql new file mode 100644 index 000000000..2674087f5 --- /dev/null +++ b/files/precompute/methods/weekly.sql @@ -0,0 +1,7 @@ +COPY( +select upi from xref + +where deleted = 'N' +and EXTRACT (DAY FROM (CURRENT_TIMESTAMP - timestamp)) < 7 + +) TO STDOUT (FORMAT CSV) diff --git a/files/search-export/parts/accessions.sql b/files/search-export/parts/accessions.sql index 98b4c0d8f..c28a61ef1 100644 --- a/files/search-export/parts/accessions.sql +++ b/files/search-export/parts/accessions.sql @@ -4,7 +4,7 @@ COPY ( 'id', todo.search_export_id, 'urs_taxid', todo.urs_taxid, 'accession', todo.accession, - 'common_name', COALESCE(tax.common_name, todo.common_name), + 'common_name', tax.common_name, 'database', todo.database, 'external_id', todo.external_id, 'function', todo.function, diff --git a/files/search-export/parts/text-mining.sql b/files/search-export/parts/text-mining.sql index 804d3cc91..a3b4ad232 100644 --- a/files/search-export/parts/text-mining.sql +++ b/files/search-export/parts/text-mining.sql @@ -8,6 +8,6 @@ COPY ( FROM search_export_urs todo JOIN search_export_publication_counts counts ON - todo.urs = counts.urs + todo.urs_taxid = counts.urs ORDER by todo.id ) TO STDOUT diff --git a/files/search-export/setup.sql b/files/search-export/setup.sql index c066faab5..fb0d97083 100644 --- a/files/search-export/setup.sql +++ b/files/search-export/setup.sql @@ -1,7 +1,7 @@ BEGIN TRANSACTION; DROP TABLE IF EXISTS search_export_publication_counts; -CREATE TEMP TABLE search_export_publication_counts ( +CREATE TABLE search_export_publication_counts ( urs text primary key, publication_count int not null ); diff --git a/import-data.nf b/import-data.nf index 6a4df3936..eff12365d 100644 --- a/import-data.nf +++ b/import-data.nf @@ -7,10 +7,14 @@ include { batch_lookup_ontology_information } from './workflows/lookup-ontology- include { parse_databases } from './workflows/parse-databases' include { parse_metadata } from './workflows/parse-metadata' include { load_data } from './workflows/load-data' +include { slack_message } from './workflows/utils/slack' +include { slack_closure } from './workflows/utils/slack' workflow import_data { emit: post_release main: + Channel.of("Starting data import pipeline") | slack_message + Channel.empty() \ | mix( parse_databases(), @@ -30,8 +34,19 @@ workflow import_data { | mix(term_info, references) \ | load_data \ | set { post_release } + + + } workflow { import_data() } + +workflow.onError { + slack_closure("Import pipeline encountered an error and failed") +} + +workflow.onComplete { + slack_closure("Workflow completed ${workflow.success ? 'Ok' : 'with errors'}") +} diff --git a/nextflow.config b/nextflow.config index 4ec6d7fd5..c5f85a07e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,7 +8,7 @@ includeConfig "config/export.config" includeConfig "config/crs.config" process { - container = 'rnacentral/rnacentral-import-pipeline:latest' + container = 'oras://ghcr.io/rnacentral/rnacentral-import-pipeline:latest' } // local.config must should contain something like the following.
I use profiles @@ -20,7 +20,7 @@ includeConfig "local.config" params.should_release = false params.needs_publications = false -params.needs_taxonomy = false +params.needs_taxonomy = false params.databases.ensembl._any.run = false // Infer the needs_publications and should_release parameters. These are diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 000000000..4f4568476 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,1231 @@ +[[package]] +name = "argcomplete" +version = "2.0.0" +description = "Bash tab completion for argparse" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +test = ["coverage", "flake8", "pexpect", "wheel"] + +[[package]] +name = "argh" +version = "0.26.2" +description = "An unobtrusive argparse wrapper with natural syntax" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "atomicwrites" +version = "1.4.1" +description = "Atomic file writes." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.4.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope-interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope-interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope-interface"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] + +[[package]] +name = "beautifulsoup4" +version = "4.11.1" +description = "Screen-scraping library" +category = "main" +optional = false +python-versions = ">=3.6.0" + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "biopython" +version = "1.79" +description = "Freely available tools for computational molecular biology." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +numpy = "*" + +[[package]] +name = "certifi" +version = "2022.9.24" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "charset-normalizer" +version = "2.1.1" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.6.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "click-aliases" +version = "1.0.1" +description = "Enable aliases for Click" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +click = "*" + +[package.extras] +dev = ["coveralls", "flake8", "flake8-import-order", "pytest", "pytest-cov", "tox-travis", "wheel"] + +[[package]] +name = "colorama" +version = "0.4.5" +description = "Cross-platform colored terminal text." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "furl" +version = "2.1.3" +description = "URL manipulation made simple." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +orderedmultidict = ">=1.0.1" +six = ">=1.8.0" + +[[package]] +name = "gffutils" +version = "0.10.1" +description = "Work with GFF and GTF files in a flexible database framework" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +argcomplete = ">=1.9.4" +argh = ">=0.26.2" +pyfaidx = ">=0.5.5.2" +simplejson = "*" +six = ">=1.12.0" + +[[package]] +name = "humanfriendly" +version = "10.0" +description = "Human friendly output for text interfaces using Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "ijson" +version = "3.1.4" +description = "Iterative JSON parser with standard Python iterator interfaces" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "importlib-resources" +version = "5.10.0" +description = "Read resources from Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "intervaltree" +version = "3.1.0" +description = "Editable interval tree data structure for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +sortedcontainers = ">=2.0,<3.0" + +[[package]] +name = "joblib" +version = "1.2.0" +description = "Lightweight pipelining with Python functions" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "jsonschema" +version = "4.16.0" +description = "An implementation of JSON Schema validation for Python" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +attrs = ">=17.4.0" +importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} +pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + 
+[[package]] +name = "lxml" +version = "4.9.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.7)"] + +[[package]] +name = "more-itertools" +version = "8.14.0" +description = "More routines for operating on iterables, beyond itertools" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "networkx" +version = "2.8.7" +description = "Python package for creating and manipulating graphs and networks" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=0.981)", "pre-commit (>=2.20)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.4)", "pillow (>=9.1)", "pydata-sphinx-theme (>=0.9)", "sphinx (>=5)", "sphinx-gallery (>=0.10)", "texext (>=0.6.6)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] + +[[package]] +name = "numpy" +version = "1.23.4" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.8" + +[[package]] +name = "obonet" +version = "0.3.0" +description = "Parse OBO formatted ontologies into networkx" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +networkx = "*" + +[package.extras] +dev = ["pre-commit", "pytest"] + +[[package]] +name = "orderedmultidict" +version = "1.0.1" +description = "Ordered Multivalue Dictionary" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +six = ">=1.8.0" + +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + +[[package]] +name = "pandas" +version = "1.5.1" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = [ + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, +] +python-dateutil = ">=2.8.1" +pytz = ">=2020.1" + +[package.extras] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] + +[[package]] +name = "pkgutil-resolve-name" +version = "1.3.10" +description = "Resolve a name to an object." 
+category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "psycopg2" +version = "2.9.3" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pyfaidx" +version = "0.7.1" +description = "pyfaidx: efficient pythonic random access to fasta subsequences" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +setuptools = ">=0.7" +six = "*" + +[[package]] +name = "pymysql" +version = "1.0.2" +description = "Pure Python MySQL Driver" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +ed25519 = ["PyNaCl (>=1.4.0)"] +rsa = ["cryptography"] + +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "pypika" +version = "0.48.9" +description = "A SQL query builder API for Python" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "pyreadline3" +version = "3.4.1" +description = "A python implementation of GNU readline." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "pyrsistent" +version = "0.18.1" +description = "Persistent/Functional/Immutable data structures" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "pytest" +version = "6.2.5" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2022.5" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "ratelimiter" +version = "1.2.0.post0" +description = "Simple python rate limiting object" +category = "main" +optional = false +python-versions = "*" + +[package.extras] +test = ["pytest (>=3.0)", "pytest-asyncio"] + +[[package]] +name = "requests" +version = "2.28.1" +description = "Python HTTP for Humans." 
+category = "main" +optional = false +python-versions = ">=3.7, <4" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<3" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "retry" +version = "0.9.2" +description = "Easy to use retry decorator." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +decorator = ">=3.4.2" +py = ">=1.4.26,<2.0.0" + +[[package]] +name = "scikit-learn" +version = "1.1.2" +description = "A set of python modules for machine learning and data mining" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +joblib = ">=1.0.0" +numpy = ">=1.17.3" +scipy = ">=1.3.2" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] +tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.2)", "mypy (>=0.961)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pyamg (>=4.0.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "scikit-image (>=0.16.2)"] + +[[package]] +name = "scipy" +version = "1.9.2" +description = "Fundamental algorithms for scientific computing in Python" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = ">=1.18.5,<1.26.0" + +[package.extras] +dev = ["flake8", "mypy", "pycodestyle", "typing-extensions"] +doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-panels (>=0.5.2)", "sphinx-tabs"] +test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "semver" +version = "2.13.0" +description = "Python helper for Semantic Versioning (http://semver.org/)" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "setuptools" +version = "65.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "simplejson" +version = "3.17.6" 
+description = "Simple, fast, extensible JSON encoder/decoder for Python" +category = "main" +optional = false +python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "soupsieve" +version = "2.3.2.post1" +description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "sqlitedict" +version = "1.7.0" +description = "Persistent dict in Python, backed up by sqlite3 and pickle, multithread-safe." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "tatsu" +version = "4.4.0" +description = "TatSu takes a grammar in a variation of EBNF as input, and outputs a memoizing PEG/Packrat parser in Python." +category = "main" +optional = false +python-versions = "*" + +[package.extras] +future-regex = ["regex"] + +[[package]] +name = "threadpoolctl" +version = "3.1.0" +description = "threadpoolctl" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "urllib3" +version = "1.26.12" +description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "zipp" +version = "3.9.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +testing = ["flake8 (<5)", "func-timeout", "jaraco-functools", "jaraco-itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.8" +content-hash = "7fa3b8b76be48a244d3c3b6f237bea96793095d5a375ff2d240ef56299bd1f4a" + +[metadata.files] +argcomplete = [ + {file = "argcomplete-2.0.0-py2.py3-none-any.whl", hash = "sha256:cffa11ea77999bb0dd27bb25ff6dc142a6796142f68d45b1a26b11f58724561e"}, + {file = "argcomplete-2.0.0.tar.gz", hash = "sha256:6372ad78c89d662035101418ae253668445b391755cfe94ea52f1b9d22425b20"}, +] +argh = [ + {file = "argh-0.26.2-py2.py3-none-any.whl", hash = "sha256:a9b3aaa1904eeb78e32394cd46c6f37ac0fb4af6dc488daa58971bdc7d7fcaf3"}, + {file = "argh-0.26.2.tar.gz", hash = "sha256:e9535b8c84dc9571a48999094fda7f33e63c3f1b74f3e5f3ac0105a58405bb65"}, +] +atomicwrites = [ + {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, +] 
+attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] +beautifulsoup4 = [ + {file = "beautifulsoup4-4.11.1-py3-none-any.whl", hash = "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30"}, + {file = "beautifulsoup4-4.11.1.tar.gz", hash = "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"}, +] +biopython = [ + {file = "biopython-1.79-cp310-cp310-win_amd64.whl", hash = "sha256:9eadfd4300f534cd4fa39613eeee786d2c3d6b981d373c5c46616fa1a97cad10"}, + {file = "biopython-1.79-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:72a1477cf1701964c7224e506a54fd65d1cc5228da200b634a17992230aa1cbd"}, + {file = "biopython-1.79-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:365569543ea58dd07ef205ec351c23b6c1a3200d5d321eb28ceaecd55eb5955e"}, + {file = "biopython-1.79-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4be31815226052d86d4c2f6a103c40504e34bba3e25cc1b1d687a3203c42fb6e"}, + {file = "biopython-1.79-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ceab668be9cbdcddef55ad459f87acd0316ae4a00d32251fea4cf665f5062fda"}, + {file = "biopython-1.79-cp36-cp36m-win32.whl", hash = "sha256:83bfea8a19f9352c47b13965c4b73853e7aeef3c5aed8489895b0679e32c621b"}, + {file = "biopython-1.79-cp36-cp36m-win_amd64.whl", hash = "sha256:98deacc30b8654cfcdcf707d93fa4e3c8717bbda07c3f9f828cf84753d4a1e4d"}, + {file = "biopython-1.79-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:884a2b99ac7820cb84f70089769a512e3238ee60438b8c934ed519613dc570ce"}, + {file = "biopython-1.79-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51eb467a60c38820ad1e6c3a7d4cb10535606f559646e824cc65c96091d91ff7"}, + {file = "biopython-1.79-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03ee5c72b3cc3f0675a8c22ce1c45fe99a32a60db18df059df479ae6cf619708"}, + {file = "biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:9580978803b582e0612b71673cab289e6bf261a865009cfb9501d65bc726a76e"}, + {file = "biopython-1.79-cp37-cp37m-win32.whl", hash = "sha256:5ae69c5e09769390643aa0f8064517665df6fb99c37433821d6664584d0ecb8c"}, + {file = "biopython-1.79-cp37-cp37m-win_amd64.whl", hash = "sha256:f0a7e1d94a318f74974345fd0987ec389b16988ec484e67218e900b116b932a8"}, + {file = "biopython-1.79-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aa23a83a220486af6193760d079b36543fe00afcfbd18280ca2fd0b2c1c8dd6d"}, + {file = "biopython-1.79-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3d4eec2e348c3d97a7fde80ee0f2b8ebeed849d2bd64a616833a9be03b93c8"}, + {file = "biopython-1.79-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:947b793e804c59ea45ae46945a57612ad1789ca87af4af0d6a62dcecf3a6246a"}, + {file = "biopython-1.79-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:d9f6ce961e0c380e2a5435f64c96421dbcebeab6a1b41506bd81251feb733c08"}, + {file = "biopython-1.79-cp38-cp38-win32.whl", hash = "sha256:155c5b95857bca7ebd607210cb9d8ea459bb0b86b3ca37ea44ec47c26ede7e9a"}, + {file = "biopython-1.79-cp38-cp38-win_amd64.whl", hash = "sha256:2dbb4388c75b5dfca8ce729e791f465c9c878dbd7ba2ab9a1f9854609d2b5426"}, + {file = "biopython-1.79-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:76988ed3d7383d566db1d7fc69c9cf136c6275813fb749fc6753c340f81f1a8f"}, + {file = "biopython-1.79-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e921571b51514a6d35944242d6fef6427c3998acf58940fe1f209ac8a92a6e87"}, + {file = "biopython-1.79-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bf634a56f449a4123e48e538d661948e5ac29fb452acd2962b8cb834b472a9d7"}, + {file = "biopython-1.79-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ab93d5749b375be3682866b3a606aa2ebd3e6d868079793925bf4fbb0987cf1f"}, + {file = "biopython-1.79-cp39-cp39-win32.whl", hash = "sha256:8f33dafd3c7254fff5e1684b965e45a7c08d9b8e1bf51562b0a521ff9a6f5ea0"}, + {file = "biopython-1.79-cp39-cp39-win_amd64.whl", hash = "sha256:b3ab26f26a1956ef26303386510d84e917e31fcbbc94918c336da0163ef628df"}, + {file = "biopython-1.79.tar.gz", hash = "sha256:edb07eac99d3b8abd7ba56ff4bedec9263f76dfc3c3f450e7d2e2bcdecf8559b"}, +] +certifi = [ + {file = "certifi-2022.9.24-py3-none-any.whl", hash = "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"}, + {file = "certifi-2022.9.24.tar.gz", hash = "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, + {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, +] +click = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] +click-aliases = [ + {file = "click-aliases-1.0.1.tar.gz", hash = "sha256:f48012077e0788eb02f4f8ee458fef3601873fec6c998e9ea8b4554394e705a3"}, + {file = "click_aliases-1.0.1-py2.py3-none-any.whl", hash = "sha256:229ecab12a97d1d5ce3f1fd7ce16da0e4333a24ebe3b34d8b7a6d0a1d2cfab90"}, +] +colorama = [ + {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, + {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, +] +decorator = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] +furl = [ + {file = "furl-2.1.3-py2.py3-none-any.whl", hash = "sha256:9ab425062c4217f9802508e45feb4a83e54324273ac4b202f1850363309666c0"}, + {file = "furl-2.1.3.tar.gz", hash = "sha256:5a6188fe2666c484a12159c18be97a1977a71d632ef5bb867ef15f54af39cc4e"}, +] +gffutils = [ + {file = "gffutils-0.10.1.tar.gz", hash = "sha256:a8fc39006d7aa353147238160640e2210b168f7849cb99896be3fc9441e351cb"}, +] +humanfriendly = [ + {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, + {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, +] +idna = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] +ijson = [ + {file = 
"ijson-3.1.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:6c1a777096be5f75ffebb335c6d2ebc0e489b231496b7f2ca903aa061fe7d381"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:475fc25c3d2a86230b85777cae9580398b42eed422506bf0b6aacfa936f7bfcd"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:f587699b5a759e30accf733e37950cc06c4118b72e3e146edcea77dded467426"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:339b2b4c7bbd64849dd69ef94ee21e29dcd92c831f47a281fdd48122bb2a715a"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:446ef8980504da0af8d20d3cb6452c4dc3d8aa5fd788098985e899b913191fe6"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:3997a2fdb28bc04b9ab0555db5f3b33ed28d91e9d42a3bf2c1842d4990beb158"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:fa10a1d88473303ec97aae23169d77c5b92657b7fb189f9c584974c00a79f383"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:9a5bf5b9d8f2ceaca131ee21fc7875d0f34b95762f4f32e4d65109ca46472147"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:81cc8cee590c8a70cca3c9aefae06dd7cb8e9f75f3a7dc12b340c2e332d33a2a"}, + {file = "ijson-3.1.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4ea5fc50ba158f72943d5174fbc29ebefe72a2adac051c814c87438dc475cf78"}, + {file = "ijson-3.1.4-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3b98861a4280cf09d267986cefa46c3bd80af887eae02aba07488d80eb798afa"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:068c692efba9692406b86736dcc6803e4a0b6280d7f0b7534bff3faec677ff38"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:86884ac06ac69cea6d89ab7b84683b3b4159c4013e4a20276d3fc630fe9b7588"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:41e5886ff6fade26f10b87edad723d2db14dcbb1178717790993fcbbb8ccd333"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:24b58933bf777d03dc1caa3006112ec7f9e6f6db6ffe1f5f5bd233cb1281f719"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:13f80aad0b84d100fb6a88ced24bade21dc6ddeaf2bba3294b58728463194f50"}, + {file = "ijson-3.1.4-cp35-cp35m-win32.whl", hash = "sha256:fa9a25d0bd32f9515e18a3611690f1de12cb7d1320bd93e9da835936b41ad3ff"}, + {file = "ijson-3.1.4-cp35-cp35m-win_amd64.whl", hash = "sha256:c4c1bf98aaab4c8f60d238edf9bcd07c896cfcc51c2ca84d03da22aad88957c5"}, + {file = "ijson-3.1.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f0f2a87c423e8767368aa055310024fa28727f4454463714fef22230c9717f64"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:15507de59d74d21501b2a076d9c49abf927eb58a51a01b8f28a0a0565db0a99f"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:2e6bd6ad95ab40c858592b905e2bbb4fe79bbff415b69a4923dafe841ffadcb4"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:68e295bb12610d086990cedc89fb8b59b7c85740d66e9515aed062649605d0bf"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3bb461352c0f0f2ec460a4b19400a665b8a5a3a2da663a32093df1699642ee3f"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:f91c75edd6cf1a66f02425bafc59a22ec29bc0adcbc06f4bfd694d92f424ceb3"}, + {file = "ijson-3.1.4-cp36-cp36m-win32.whl", hash = "sha256:4c53cc72f79a4c32d5fc22efb85aa22f248e8f4f992707a84bdc896cc0b1ecf9"}, + {file = 
"ijson-3.1.4-cp36-cp36m-win_amd64.whl", hash = "sha256:ac9098470c1ff6e5c23ec0946818bc102bfeeeea474554c8d081dc934be20988"}, + {file = "ijson-3.1.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dcd6f04df44b1945b859318010234651317db2c4232f75e3933f8bb41c4fa055"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:5a2f40c053c837591636dc1afb79d85e90b9a9d65f3d9963aae31d1eb11bfed2"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f50337e3b8e72ec68441b573c2848f108a8976a57465c859b227ebd2a2342901"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:454918f908abbed3c50a0a05c14b20658ab711b155e4f890900e6f60746dd7cc"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:387c2ec434cc1bc7dc9bd33ec0b70d95d443cc1e5934005f26addc2284a437ab"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:179ed6fd42e121d252b43a18833df2de08378fac7bce380974ef6f5e522afefa"}, + {file = "ijson-3.1.4-cp37-cp37m-win32.whl", hash = "sha256:26a6a550b270df04e3f442e2bf0870c9362db4912f0e7bdfd300f30ea43115a2"}, + {file = "ijson-3.1.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ff8cf7507d9d8939264068c2cff0a23f99703fa2f31eb3cb45a9a52798843586"}, + {file = "ijson-3.1.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:09c9d7913c88a6059cd054ff854958f34d757402b639cf212ffbec201a705a0d"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:702ba9a732116d659a5e950ee176be6a2e075998ef1bcde11cbf79a77ed0f717"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:667841591521158770adc90793c2bdbb47c94fe28888cb802104b8bbd61f3d51"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:df641dd07b38c63eecd4f454db7b27aa5201193df160f06b48111ba97ab62504"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:9348e7d507eb40b52b12eecff3d50934fcc3d2a15a2f54ec1127a36063b9ba8f"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:93455902fdc33ba9485c7fae63ac95d96e0ab8942224a357113174bbeaff92e9"}, + {file = "ijson-3.1.4-cp38-cp38-win32.whl", hash = "sha256:5b725f2e984ce70d464b195f206fa44bebbd744da24139b61fec72de77c03a16"}, + {file = "ijson-3.1.4-cp38-cp38-win_amd64.whl", hash = "sha256:a5965c315fbb2dc9769dfdf046eb07daf48ae20b637da95ec8d62b629be09df4"}, + {file = "ijson-3.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b8ee7dbb07cec9ba29d60cfe4954b3cc70adb5f85bba1f72225364b59c1cf82b"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d9e01c55d501e9c3d686b6ee3af351c9c0c8c3e45c5576bd5601bee3e1300b09"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:297f26f27a04cd0d0a2f865d154090c48ea11b239cabe0a17a6c65f0314bd1ca"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:9239973100338a4138d09d7a4602bd289861e553d597cd67390c33bfc452253e"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:2a64c66a08f56ed45a805691c2fd2e1caef00edd6ccf4c4e5eff02cd94ad8364"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:d17fd199f0d0a4ab6e0d541b4eec1b68b5bd5bb5d8104521e22243015b51049b"}, + {file = "ijson-3.1.4-cp39-cp39-win32.whl", hash = "sha256:70ee3c8fa0eba18c80c5911639c01a8de4089a4361bad2862a9949e25ec9b1c8"}, + {file = "ijson-3.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:6bf2b64304321705d03fa5e403ec3f36fa5bb27bf661849ad62e0a3a49bc23e3"}, + {file = "ijson-3.1.4-pp27-pypy_73-macosx_10_9_x86_64.whl", hash = 
"sha256:5d7e3fcc3b6de76a9dba1e9fc6ca23dad18f0fa6b4e6499415e16b684b2e9af1"}, + {file = "ijson-3.1.4-pp27-pypy_73-manylinux1_x86_64.whl", hash = "sha256:a72eb0359ebff94754f7a2f00a6efe4c57716f860fc040c606dedcb40f49f233"}, + {file = "ijson-3.1.4-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:28fc168f5faf5759fdfa2a63f85f1f7a148bbae98f34404a6ba19f3d08e89e87"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2844d4a38d27583897ed73f7946e205b16926b4cab2525d1ce17e8b08064c706"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-manylinux1_x86_64.whl", hash = "sha256:252defd1f139b5fb8c764d78d5e3a6df81543d9878c58992a89b261369ea97a7"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:15d5356b4d090c699f382c8eb6a2bcd5992a8c8e8b88c88bc6e54f686018328a"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-win32.whl", hash = "sha256:6774ec0a39647eea70d35fb76accabe3d71002a8701c0545b9120230c182b75b"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f11da15ec04cc83ff0f817a65a3392e169be8d111ba81f24d6e09236597bb28c"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-manylinux1_x86_64.whl", hash = "sha256:ee13ceeed9b6cf81b3b8197ef15595fc43fd54276842ed63840ddd49db0603da"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:97e4df67235fae40d6195711223520d2c5bf1f7f5087c2963fcde44d72ebf448"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-win32.whl", hash = "sha256:3d10eee52428f43f7da28763bb79f3d90bbbeea1accb15de01e40a00885b6e89"}, + {file = "ijson-3.1.4.tar.gz", hash = "sha256:1d1003ae3c6115ec9b587d29dd136860a81a23c7626b682e2b5b12c9fd30e4ea"}, +] +importlib-resources = [ + {file = "importlib_resources-5.10.0-py3-none-any.whl", hash = "sha256:ee17ec648f85480d523596ce49eae8ead87d5631ae1551f913c0100b5edd3437"}, + {file = "importlib_resources-5.10.0.tar.gz", hash = "sha256:c01b1b94210d9849f286b86bb51bcea7cd56dde0600d8db721d7b81330711668"}, +] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +intervaltree = [ + {file = "intervaltree-3.1.0.tar.gz", hash = "sha256:902b1b88936918f9b2a19e0e5eb7ccb430ae45cde4f39ea4b36932920d33952d"}, +] +joblib = [ + {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"}, + {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"}, +] +jsonschema = [ + {file = "jsonschema-4.16.0-py3-none-any.whl", hash = "sha256:9e74b8f9738d6a946d70705dc692b74b5429cd0960d58e79ffecfc43b2221eb9"}, + {file = "jsonschema-4.16.0.tar.gz", hash = "sha256:165059f076eff6971bae5b742fc029a7b4ef3f9bcf04c14e4776a7605de14b23"}, +] +lxml = [ + {file = "lxml-4.9.1-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed"}, + {file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc"}, + {file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc"}, + {file = "lxml-4.9.1-cp27-cp27m-win32.whl", hash = "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3"}, + {file = "lxml-4.9.1-cp27-cp27m-win_amd64.whl", hash = 
"sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627"}, + {file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84"}, + {file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8"}, + {file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8"}, + {file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d"}, + {file = "lxml-4.9.1-cp310-cp310-win32.whl", hash = "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7"}, + {file = "lxml-4.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b"}, + {file = "lxml-4.9.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d"}, + {file = "lxml-4.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3"}, + {file = "lxml-4.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29"}, + {file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d"}, + {file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318"}, + {file = "lxml-4.9.1-cp35-cp35m-win32.whl", hash = "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7"}, + {file = "lxml-4.9.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4"}, + {file = "lxml-4.9.1-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf"}, + {file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3"}, + {file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391"}, + {file = "lxml-4.9.1-cp36-cp36m-win32.whl", hash = "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e"}, + {file = "lxml-4.9.1-cp36-cp36m-win_amd64.whl", hash = "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7"}, + {file = "lxml-4.9.1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca"}, + {file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785"}, + {file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785"}, + {file = "lxml-4.9.1-cp37-cp37m-win32.whl", hash = "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a"}, + {file = "lxml-4.9.1-cp37-cp37m-win_amd64.whl", hash = "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e"}, + {file = "lxml-4.9.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715"}, + {file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036"}, + {file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387"}, + {file = "lxml-4.9.1-cp38-cp38-win32.whl", hash = 
"sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94"}, + {file = "lxml-4.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345"}, + {file = "lxml-4.9.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000"}, + {file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25"}, + {file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd"}, + {file = "lxml-4.9.1-cp39-cp39-win32.whl", hash = "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb"}, + {file = "lxml-4.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73"}, + {file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c"}, + {file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9"}, + {file = "lxml-4.9.1.tar.gz", hash = "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"}, +] +more-itertools = [ + {file = "more-itertools-8.14.0.tar.gz", hash = "sha256:c09443cd3d5438b8dafccd867a6bc1cb0894389e90cb53d227456b0b0bccb750"}, + {file = "more_itertools-8.14.0-py3-none-any.whl", hash = 
"sha256:1bc4f91ee5b1b31ac7ceacc17c09befe6a40a503907baf9c839c229b5095cfd2"}, +] +networkx = [ + {file = "networkx-2.8.7-py3-none-any.whl", hash = "sha256:15cdf7f7c157637107ea690cabbc488018f8256fa28242aed0fb24c93c03a06d"}, + {file = "networkx-2.8.7.tar.gz", hash = "sha256:815383fd52ece0a7024b5fd8408cc13a389ea350cd912178b82eed8b96f82cd3"}, +] +numpy = [ + {file = "numpy-1.23.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:95d79ada05005f6f4f337d3bb9de8a7774f259341c70bc88047a1f7b96a4bcb2"}, + {file = "numpy-1.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:926db372bc4ac1edf81cfb6c59e2a881606b409ddc0d0920b988174b2e2a767f"}, + {file = "numpy-1.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c237129f0e732885c9a6076a537e974160482eab8f10db6292e92154d4c67d71"}, + {file = "numpy-1.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8365b942f9c1a7d0f0dc974747d99dd0a0cdfc5949a33119caf05cb314682d3"}, + {file = "numpy-1.23.4-cp310-cp310-win32.whl", hash = "sha256:2341f4ab6dba0834b685cce16dad5f9b6606ea8a00e6da154f5dbded70fdc4dd"}, + {file = "numpy-1.23.4-cp310-cp310-win_amd64.whl", hash = "sha256:d331afac87c92373826af83d2b2b435f57b17a5c74e6268b79355b970626e329"}, + {file = "numpy-1.23.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:488a66cb667359534bc70028d653ba1cf307bae88eab5929cd707c761ff037db"}, + {file = "numpy-1.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce03305dd694c4873b9429274fd41fc7eb4e0e4dea07e0af97a933b079a5814f"}, + {file = "numpy-1.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8981d9b5619569899666170c7c9748920f4a5005bf79c72c07d08c8a035757b0"}, + {file = "numpy-1.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a70a7d3ce4c0e9284e92285cba91a4a3f5214d87ee0e95928f3614a256a1488"}, + {file = "numpy-1.23.4-cp311-cp311-win32.whl", hash = "sha256:5e13030f8793e9ee42f9c7d5777465a560eb78fa7e11b1c053427f2ccab90c79"}, + {file = "numpy-1.23.4-cp311-cp311-win_amd64.whl", hash = "sha256:7607b598217745cc40f751da38ffd03512d33ec06f3523fb0b5f82e09f6f676d"}, + {file = "numpy-1.23.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7ab46e4e7ec63c8a5e6dbf5c1b9e1c92ba23a7ebecc86c336cb7bf3bd2fb10e5"}, + {file = "numpy-1.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8aae2fb3180940011b4862b2dd3756616841c53db9734b27bb93813cd79fce6"}, + {file = "numpy-1.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c053d7557a8f022ec823196d242464b6955a7e7e5015b719e76003f63f82d0f"}, + {file = "numpy-1.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0882323e0ca4245eb0a3d0a74f88ce581cc33aedcfa396e415e5bba7bf05f68"}, + {file = "numpy-1.23.4-cp38-cp38-win32.whl", hash = "sha256:dada341ebb79619fe00a291185bba370c9803b1e1d7051610e01ed809ef3a4ba"}, + {file = "numpy-1.23.4-cp38-cp38-win_amd64.whl", hash = "sha256:0fe563fc8ed9dc4474cbf70742673fc4391d70f4363f917599a7fa99f042d5a8"}, + {file = "numpy-1.23.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c67b833dbccefe97cdd3f52798d430b9d3430396af7cdb2a0c32954c3ef73894"}, + {file = "numpy-1.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f76025acc8e2114bb664294a07ede0727aa75d63a06d2fae96bf29a81747e4a7"}, + {file = "numpy-1.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:12ac457b63ec8ded85d85c1e17d85efd3c2b0967ca39560b307a35a6703a4735"}, + {file = "numpy-1.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:95de7dc7dc47a312f6feddd3da2500826defdccbc41608d0031276a24181a2c0"}, + {file = "numpy-1.23.4-cp39-cp39-win32.whl", hash = "sha256:f2f390aa4da44454db40a1f0201401f9036e8d578a25f01a6e237cea238337ef"}, + {file = "numpy-1.23.4-cp39-cp39-win_amd64.whl", hash = "sha256:f260da502d7441a45695199b4e7fd8ca87db659ba1c78f2bbf31f934fe76ae0e"}, + {file = "numpy-1.23.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:61be02e3bf810b60ab74e81d6d0d36246dbfb644a462458bb53b595791251911"}, + {file = "numpy-1.23.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:296d17aed51161dbad3c67ed6d164e51fcd18dbcd5dd4f9d0a9c6055dce30810"}, + {file = "numpy-1.23.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4d52914c88b4930dafb6c48ba5115a96cbab40f45740239d9f4159c4ba779962"}, + {file = "numpy-1.23.4.tar.gz", hash = "sha256:ed2cc92af0efad20198638c69bb0fc2870a58dabfba6eb722c933b48556c686c"}, +] +obonet = [ + {file = "obonet-0.3.0-py3-none-any.whl", hash = "sha256:d436eb4f57afa6f1a48992c3a4132126da9793e1439f667ab23cc74d8e957aee"}, + {file = "obonet-0.3.0.tar.gz", hash = "sha256:fd801166cd28a2ef86126f22c8e3da30f5c3b6a3adfc62536abea1aa9956a2b4"}, +] +orderedmultidict = [ + {file = "orderedmultidict-1.0.1-py2.py3-none-any.whl", hash = "sha256:43c839a17ee3cdd62234c47deca1a8508a3f2ca1d0678a3bf791c87cf84adbf3"}, + {file = "orderedmultidict-1.0.1.tar.gz", hash = "sha256:04070bbb5e87291cc9bfa51df413677faf2141c73c61d2a5f7b26bea3cd882ad"}, +] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] +pandas = [ + {file = "pandas-1.5.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0a78e05ec09731c5b3bd7a9805927ea631fe6f6cb06f0e7c63191a9a778d52b4"}, + {file = "pandas-1.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5b0c970e2215572197b42f1cff58a908d734503ea54b326412c70d4692256391"}, + {file = "pandas-1.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f340331a3f411910adfb4bbe46c2ed5872d9e473a783d7f14ecf49bc0869c594"}, + {file = "pandas-1.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8c709f4700573deb2036d240d140934df7e852520f4a584b2a8d5443b71f54d"}, + {file = "pandas-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32e3d9f65606b3f6e76555bfd1d0b68d94aff0929d82010b791b6254bf5a4b96"}, + {file = "pandas-1.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:a52419d9ba5906db516109660b114faf791136c94c1a636ed6b29cbfff9187ee"}, + {file = "pandas-1.5.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:66a1ad667b56e679e06ba73bb88c7309b3f48a4c279bd3afea29f65a766e9036"}, + {file = "pandas-1.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:36aa1f8f680d7584e9b572c3203b20d22d697c31b71189322f16811d4ecfecd3"}, + {file = "pandas-1.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bcf1a82b770b8f8c1e495b19a20d8296f875a796c4fe6e91da5ef107f18c5ecb"}, + {file = "pandas-1.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c25e5c16ee5c0feb6cf9d982b869eec94a22ddfda9aa2fbed00842cbb697624"}, + {file = "pandas-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:932d2d7d3cab44cfa275601c982f30c2d874722ef6396bb539e41e4dc4618ed4"}, + {file = "pandas-1.5.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:eb7e8cf2cf11a2580088009b43de84cabbf6f5dae94ceb489f28dba01a17cb77"}, + {file = "pandas-1.5.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:cb2a9cf1150302d69bb99861c5cddc9c25aceacb0a4ef5299785d0f5389a3209"}, + {file = "pandas-1.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:81f0674fa50b38b6793cd84fae5d67f58f74c2d974d2cb4e476d26eee33343d0"}, + {file = "pandas-1.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:17da7035d9e6f9ea9cdc3a513161f8739b8f8489d31dc932bc5a29a27243f93d"}, + {file = "pandas-1.5.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:669c8605dba6c798c1863157aefde959c1796671ffb342b80fcb80a4c0bc4c26"}, + {file = "pandas-1.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:683779e5728ac9138406c59a11e09cd98c7d2c12f0a5fc2b9c5eecdbb4a00075"}, + {file = "pandas-1.5.1-cp38-cp38-win32.whl", hash = "sha256:ddf46b940ef815af4e542697eaf071f0531449407a7607dd731bf23d156e20a7"}, + {file = "pandas-1.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:db45b94885000981522fb92349e6b76f5aee0924cc5315881239c7859883117d"}, + {file = "pandas-1.5.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:927e59c694e039c75d7023465d311277a1fc29ed7236b5746e9dddf180393113"}, + {file = "pandas-1.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e675f8fe9aa6c418dc8d3aac0087b5294c1a4527f1eacf9fe5ea671685285454"}, + {file = "pandas-1.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:04e51b01d5192499390c0015630975f57836cc95c7411415b499b599b05c0c96"}, + {file = "pandas-1.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cee0c74e93ed4f9d39007e439debcaadc519d7ea5c0afc3d590a3a7b2edf060"}, + {file = "pandas-1.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b156a971bc451c68c9e1f97567c94fd44155f073e3bceb1b0d195fd98ed12048"}, + {file = "pandas-1.5.1-cp39-cp39-win32.whl", hash = "sha256:05c527c64ee02a47a24031c880ee0ded05af0623163494173204c5b72ddce658"}, + {file = "pandas-1.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:6bb391659a747cf4f181a227c3e64b6d197100d53da98dcd766cc158bdd9ec68"}, + {file = "pandas-1.5.1.tar.gz", hash = "sha256:249cec5f2a5b22096440bd85c33106b6102e0672204abd2d5c014106459804ee"}, +] +pkgutil-resolve-name = [ + {file = "pkgutil_resolve_name-1.3.10-py3-none-any.whl", hash = "sha256:ca27cc078d25c5ad71a9de0a7a330146c4e014c2462d9af19c6b828280649c5e"}, + {file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +psycopg2 = [ + {file = "psycopg2-2.9.3-cp310-cp310-win32.whl", hash = "sha256:083707a696e5e1c330af2508d8fab36f9700b26621ccbcb538abe22e15485362"}, + {file = "psycopg2-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:d3ca6421b942f60c008f81a3541e8faf6865a28d5a9b48544b0ee4f40cac7fca"}, + {file = "psycopg2-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:9572e08b50aed176ef6d66f15a21d823bb6f6d23152d35e8451d7d2d18fdac56"}, + {file = "psycopg2-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:a81e3866f99382dfe8c15a151f1ca5fde5815fde879348fe5a9884a7c092a305"}, + {file = "psycopg2-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:cb10d44e6694d763fa1078a26f7f6137d69f555a78ec85dc2ef716c37447e4b2"}, + {file = "psycopg2-2.9.3-cp37-cp37m-win_amd64.whl", hash = 
"sha256:4295093a6ae3434d33ec6baab4ca5512a5082cc43c0505293087b8a46d108461"}, + {file = "psycopg2-2.9.3-cp38-cp38-win32.whl", hash = "sha256:34b33e0162cfcaad151f249c2649fd1030010c16f4bbc40a604c1cb77173dcf7"}, + {file = "psycopg2-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:0762c27d018edbcb2d34d51596e4346c983bd27c330218c56c4dc25ef7e819bf"}, + {file = "psycopg2-2.9.3-cp39-cp39-win32.whl", hash = "sha256:8cf3878353cc04b053822896bc4922b194792df9df2f1ad8da01fb3043602126"}, + {file = "psycopg2-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:06f32425949bd5fe8f625c49f17ebb9784e1e4fe928b7cce72edc36fb68e4c0c"}, + {file = "psycopg2-2.9.3.tar.gz", hash = "sha256:8e841d1bf3434da985cc5ef13e6f75c8981ced601fd70cc6bf33351b91562981"}, +] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] +pyfaidx = [ + {file = "pyfaidx-0.7.1.tar.gz", hash = "sha256:3977632b7fd29049f8b11035d7e9dea0e2c5da9c235f982b4c3fae06ff1fa23f"}, +] +pymysql = [ + {file = "PyMySQL-1.0.2-py3-none-any.whl", hash = "sha256:41fc3a0c5013d5f039639442321185532e3e2c8924687abe6537de157d403641"}, + {file = "PyMySQL-1.0.2.tar.gz", hash = "sha256:816927a350f38d56072aeca5dfb10221fe1dc653745853d30a216637f5d7ad36"}, +] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] +pypika = [ + {file = "PyPika-0.48.9.tar.gz", hash = "sha256:838836a61747e7c8380cd1b7ff638694b7a7335345d0f559b04b2cd832ad5378"}, +] +pyreadline3 = [ + {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"}, + {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"}, +] +pyrsistent = [ + {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, + {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, + {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ed6784ceac462a7d6fcb7e9b663e93b9a6fb373b7f43594f9ff68875788e01e"}, + {file = "pyrsistent-0.18.1-cp310-cp310-win32.whl", hash = "sha256:e4f3149fd5eb9b285d6bfb54d2e5173f6a116fe19172686797c056672689daf6"}, + {file = "pyrsistent-0.18.1-cp310-cp310-win_amd64.whl", hash = "sha256:636ce2dc235046ccd3d8c56a7ad54e99d5c1cd0ef07d9ae847306c91d11b5fec"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e92a52c166426efbe0d1ec1332ee9119b6d32fc1f0bbfd55d5c1088070e7fc1b"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7a096646eab884bf8bed965bad63ea327e0d0c38989fc83c5ea7b8a87037bfc"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cdfd2c361b8a8e5d9499b9082b501c452ade8bbf42aef97ea04854f4a3f43b22"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-win32.whl", hash = "sha256:7ec335fc998faa4febe75cc5268a9eac0478b3f681602c1f27befaf2a1abe1d8"}, + {file = 
"pyrsistent-0.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:6455fc599df93d1f60e1c5c4fe471499f08d190d57eca040c0ea182301321286"}, + {file = "pyrsistent-0.18.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fd8da6d0124efa2f67d86fa70c851022f87c98e205f0594e1fae044e7119a5a6"}, + {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bfe2388663fd18bd8ce7db2c91c7400bf3e1a9e8bd7d63bf7e77d39051b85ec"}, + {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e3e1fcc45199df76053026a51cc59ab2ea3fc7c094c6627e93b7b44cdae2c8c"}, + {file = "pyrsistent-0.18.1-cp38-cp38-win32.whl", hash = "sha256:b568f35ad53a7b07ed9b1b2bae09eb15cdd671a5ba5d2c66caee40dbf91c68ca"}, + {file = "pyrsistent-0.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1b96547410f76078eaf66d282ddca2e4baae8964364abb4f4dcdde855cd123a"}, + {file = "pyrsistent-0.18.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f87cc2863ef33c709e237d4b5f4502a62a00fab450c9e020892e8e2ede5847f5"}, + {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc66318fb7ee012071b2792024564973ecc80e9522842eb4e17743604b5e045"}, + {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:914474c9f1d93080338ace89cb2acee74f4f666fb0424896fcfb8d86058bf17c"}, + {file = "pyrsistent-0.18.1-cp39-cp39-win32.whl", hash = "sha256:1b34eedd6812bf4d33814fca1b66005805d3640ce53140ab8bbb1e2651b0d9bc"}, + {file = "pyrsistent-0.18.1-cp39-cp39-win_amd64.whl", hash = "sha256:e24a828f57e0c337c8d8bb9f6b12f09dfdf0273da25fda9e314f0b684b415a07"}, + {file = "pyrsistent-0.18.1.tar.gz", hash = "sha256:d4d61f8b993a7255ba714df3aca52700f8125289f84f704cf80916517c46eb96"}, +] +pytest = [ + {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, + {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [ + {file = "pytz-2022.5-py2.py3-none-any.whl", hash = "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22"}, + {file = "pytz-2022.5.tar.gz", hash = "sha256:c4d88f472f54d615e9cd582a5004d1e5f624854a6a27a6211591c251f22a6914"}, +] +ratelimiter = [ + {file = "ratelimiter-1.2.0.post0-py3-none-any.whl", hash = "sha256:a52be07bc0bb0b3674b4b304550f10c769bbb00fead3072e035904474259809f"}, + {file = "ratelimiter-1.2.0.post0.tar.gz", hash = "sha256:5c395dcabdbbde2e5178ef3f89b568a3066454a6ddc223b76473dac22f89b4f7"}, +] +requests = [ + {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, + {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, +] +retry = [ + {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"}, + {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"}, +] +scikit-learn = [ + {file = "scikit-learn-1.1.2.tar.gz", hash = 
"sha256:7c22d1305b16f08d57751a4ea36071e2215efb4c09cb79183faa4e8e82a3dbf8"}, + {file = "scikit_learn-1.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6c840f662b5d3377c4ccb8be1fc21bb52cb5d8b8790f8d6bf021739f84e543cf"}, + {file = "scikit_learn-1.1.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2b8db962360c93554cab7bb3c096c4a24695da394dd4b3c3f13409f409b425bc"}, + {file = "scikit_learn-1.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e7d1fc817867a350133f937aaebcafbc06192517cbdf0cf7e5774ad4d1adb9f"}, + {file = "scikit_learn-1.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec3ea40d467966821843210c02117d82b097b54276fdcfb50f4dfb5c60dbe39"}, + {file = "scikit_learn-1.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:bbef6ea1c012ff9f3e6f6e9ca006b8772d8383e177b898091e68fbd9b3f840f9"}, + {file = "scikit_learn-1.1.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a90ca42fe8242fd6ff56cda2fecc5fca586a88a24ab602d275d2d0dcc0b928fb"}, + {file = "scikit_learn-1.1.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a682ec0f82b6f30fb07486daed1c8001b6683cc66b51877644dfc532bece6a18"}, + {file = "scikit_learn-1.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c33e16e9a165af6012f5be530ccfbb672e2bc5f9b840238a05eb7f6694304e3f"}, + {file = "scikit_learn-1.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f94c0146bad51daef919c402a3da8c1c6162619653e1c00c92baa168fda292f2"}, + {file = "scikit_learn-1.1.2-cp38-cp38-win32.whl", hash = "sha256:2f46c6e3ff1054a5ec701646dcfd61d43b8ecac4d416014daed8843cf4c33d4d"}, + {file = "scikit_learn-1.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:b1e706deca9b2ad87ae27dafd5ac4e8eff01b6db492ed5c12cef4735ec5f21ea"}, + {file = "scikit_learn-1.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:567417dbbe6a6278399c3e6daf1654414a5a1a4d818d28f251fa7fc28730a1bf"}, + {file = "scikit_learn-1.1.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:d6f232779023c3b060b80b5c82e5823723bc424dcac1d1a148aa2492c54d245d"}, + {file = "scikit_learn-1.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:589d46f28460469f444b898223b13d99db9463e1038dc581ba698111f612264b"}, + {file = "scikit_learn-1.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76800652fb6d6bf527bce36ecc2cc25738b28fe1a17bd294a218fff8e8bd6d50"}, + {file = "scikit_learn-1.1.2-cp39-cp39-win32.whl", hash = "sha256:1c8fecb7c9984d9ec2ea48898229f98aad681a0873e0935f2b7f724fbce4a047"}, + {file = "scikit_learn-1.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:407e9a1cb9e6ba458a539986a9bd25546a757088095b3aab91d465b79a760d37"}, +] +scipy = [ + {file = "scipy-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ee4ceed204f269da19f67f0115a85d3a2cd8547185037ad99a4025f9c61d02e9"}, + {file = "scipy-1.9.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:17be1a7c68ec4c49d8cd4eb1655d55d14a54ab63012296bdd5921c92dc485acd"}, + {file = "scipy-1.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72297eb9702576bd8f626bb488fd32bb35349d3120fc4a5e733db137f06c9a6"}, + {file = "scipy-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa270cc6080c987929335c4cb94e8054fee9a6058cecff22276fa5dbab9856fc"}, + {file = "scipy-1.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:22380e076a162e81b659d53d75b02e9c75ad14ea2d53d9c645a12543414e2150"}, + {file = "scipy-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:bbed414fc25d64bd6d1613dc0286fbf91902219b8be63ad254525162235b67e9"}, + {file = "scipy-1.9.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:885b7ac56d7460544b2ef89ab9feafa30f4264c9825d975ef690608d07e6cc55"}, + {file = "scipy-1.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5994a8232cc6510a8e85899661df2d11198bf362f0ffe6fbd5c0aca17ab46ce3"}, + {file = "scipy-1.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c83dccac06f3b9aa02df69577f239758d5d0d0c069673fb0b47ecb971983d"}, + {file = "scipy-1.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:92c5e627a0635ca02e6494bbbdb74f98d93ac8730416209d61de3b70c8a821be"}, + {file = "scipy-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b6194da32e0ce9200b2eda4eb4edb89c5cb8b83d6deaf7c35f8ad3d5d7627d5c"}, + {file = "scipy-1.9.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:148cb6f53d9d10dafde848e9aeb1226bf2809d16dc3221b2fa568130b6f2e586"}, + {file = "scipy-1.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:658fd31c6ad4eb9fa3fd460fcac779f70a6bc7480288a211b7658a25891cf01d"}, + {file = "scipy-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4012dbe540732311b8f4388b7e1482eb43a7cc0435bbf2b9916b3d6c38fb8d01"}, + {file = "scipy-1.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:d6cb1f92ded3fc48f7dbe94d20d7b9887e13b874e79043907de541c841563b4c"}, + {file = "scipy-1.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1e3b23a82867018cd26255dc951789a7c567921622073e1113755866f1eae928"}, + {file = "scipy-1.9.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:82e8bfb352aa9dce9a0ffe81f4c369a2c87c85533519441686f59f21d8c09697"}, + {file = "scipy-1.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61b95283529712101bfb7c87faf94cb86ed9e64de079509edfe107e5cfa55733"}, + {file = "scipy-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c8c29703202c39d699b0d6b164bde5501c212005f20abf46ae322b9307c8a41"}, + {file = "scipy-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7b2608b3141c257d01ae772e23b3de9e04d27344e6b68a890883795229cb7191"}, + {file = "scipy-1.9.2.tar.gz", hash = "sha256:99e7720caefb8bca6ebf05c7d96078ed202881f61e0c68bd9e0f3e8097d6f794"}, +] +semver = [ + {file = "semver-2.13.0-py2.py3-none-any.whl", hash = "sha256:ced8b23dceb22134307c1b8abfa523da14198793d9787ac838e70e29e77458d4"}, + {file = "semver-2.13.0.tar.gz", hash = "sha256:fa0fe2722ee1c3f57eac478820c3a5ae2f624af8264cbdf9000c980ff7f75e3f"}, +] +setuptools = [ + {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, + {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, +] +simplejson = [ + {file = "simplejson-3.17.6-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a89acae02b2975b1f8e4974cb8cdf9bf9f6c91162fb8dec50c259ce700f2770a"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:82ff356ff91be0ab2293fc6d8d262451eb6ac4fd999244c4b5f863e049ba219c"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:0de783e9c2b87bdd75b57efa2b6260c24b94605b5c9843517577d40ee0c3cc8a"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:d24a9e61df7a7787b338a58abfba975414937b609eb6b18973e25f573bc0eeeb"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux2010_x86_64.whl", hash = 
"sha256:e8603e691580487f11306ecb066c76f1f4a8b54fb3bdb23fa40643a059509366"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:9b01e7b00654115965a206e3015f0166674ec1e575198a62a977355597c0bef5"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:37bc0cf0e5599f36072077e56e248f3336917ded1d33d2688624d8ed3cefd7d2"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:cf6e7d5fe2aeb54898df18db1baf479863eae581cce05410f61f6b4188c8ada1"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:bdfc54b4468ed4cd7415928cbe782f4d782722a81aeb0f81e2ddca9932632211"}, + {file = "simplejson-3.17.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd16302d39c4d6f4afde80edd0c97d4db643327d355a312762ccd9bd2ca515ed"}, + {file = "simplejson-3.17.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:deac4bdafa19bbb89edfb73b19f7f69a52d0b5bd3bb0c4ad404c1bbfd7b4b7fd"}, + {file = "simplejson-3.17.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8bbdb166e2fb816e43ab034c865147edafe28e1b19c72433147789ac83e2dda"}, + {file = "simplejson-3.17.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7854326920d41c3b5d468154318fe6ba4390cb2410480976787c640707e0180"}, + {file = "simplejson-3.17.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:04e31fa6ac8e326480703fb6ded1488bfa6f1d3f760d32e29dbf66d0838982ce"}, + {file = "simplejson-3.17.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f63600ec06982cdf480899026f4fda622776f5fabed9a869fdb32d72bc17e99a"}, + {file = "simplejson-3.17.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e03c3b8cc7883a54c3f34a6a135c4a17bc9088a33f36796acdb47162791b02f6"}, + {file = "simplejson-3.17.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a2d30d6c1652140181dc6861f564449ad71a45e4f165a6868c27d36745b65d40"}, + {file = "simplejson-3.17.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a1aa6e4cae8e3b8d5321be4f51c5ce77188faf7baa9fe1e78611f93a8eed2882"}, + {file = "simplejson-3.17.6-cp310-cp310-win32.whl", hash = "sha256:97202f939c3ff341fc3fa84d15db86156b1edc669424ba20b0a1fcd4a796a045"}, + {file = "simplejson-3.17.6-cp310-cp310-win_amd64.whl", hash = "sha256:80d3bc9944be1d73e5b1726c3bbfd2628d3d7fe2880711b1eb90b617b9b8ac70"}, + {file = "simplejson-3.17.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9fa621b3c0c05d965882c920347b6593751b7ab20d8fa81e426f1735ca1a9fc7"}, + {file = "simplejson-3.17.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd2fb11922f58df8528adfca123f6a84748ad17d066007e7ac977720063556bd"}, + {file = "simplejson-3.17.6-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:724c1fe135aa437d5126138d977004d165a3b5e2ee98fc4eb3e7c0ef645e7e27"}, + {file = "simplejson-3.17.6-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4ff4ac6ff3aa8f814ac0f50bf218a2e1a434a17aafad4f0400a57a8cc62ef17f"}, + {file = "simplejson-3.17.6-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:67093a526e42981fdd954868062e56c9b67fdd7e712616cc3265ad0c210ecb51"}, + {file = "simplejson-3.17.6-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:5d6b4af7ad7e4ac515bc6e602e7b79e2204e25dbd10ab3aa2beef3c5a9cad2c7"}, + {file = "simplejson-3.17.6-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = 
"sha256:1c9b1ed7ed282b36571638297525f8ef80f34b3e2d600a56f962c6044f24200d"}, + {file = "simplejson-3.17.6-cp36-cp36m-win32.whl", hash = "sha256:632ecbbd2228575e6860c9e49ea3cc5423764d5aa70b92acc4e74096fb434044"}, + {file = "simplejson-3.17.6-cp36-cp36m-win_amd64.whl", hash = "sha256:4c09868ddb86bf79b1feb4e3e7e4a35cd6e61ddb3452b54e20cf296313622566"}, + {file = "simplejson-3.17.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4b6bd8144f15a491c662f06814bd8eaa54b17f26095bb775411f39bacaf66837"}, + {file = "simplejson-3.17.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5decdc78849617917c206b01e9fc1d694fd58caa961be816cb37d3150d613d9a"}, + {file = "simplejson-3.17.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:521877c7bd060470806eb6335926e27453d740ac1958eaf0d8c00911bc5e1802"}, + {file = "simplejson-3.17.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:65b998193bd7b0c7ecdfffbc825d808eac66279313cb67d8892bb259c9d91494"}, + {file = "simplejson-3.17.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ac786f6cb7aa10d44e9641c7a7d16d7f6e095b138795cd43503769d4154e0dc2"}, + {file = "simplejson-3.17.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3ff5b3464e1ce86a8de8c88e61d4836927d5595c2162cab22e96ff551b916e81"}, + {file = "simplejson-3.17.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:69bd56b1d257a91e763256d63606937ae4eb890b18a789b66951c00062afec33"}, + {file = "simplejson-3.17.6-cp37-cp37m-win32.whl", hash = "sha256:b81076552d34c27e5149a40187a8f7e2abb2d3185576a317aaf14aeeedad862a"}, + {file = "simplejson-3.17.6-cp37-cp37m-win_amd64.whl", hash = "sha256:07ecaafc1b1501f275bf5acdee34a4ad33c7c24ede287183ea77a02dc071e0c0"}, + {file = "simplejson-3.17.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:068670af975247acbb9fc3d5393293368cda17026db467bf7a51548ee8f17ee1"}, + {file = "simplejson-3.17.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4d1c135af0c72cb28dd259cf7ba218338f4dc027061262e46fe058b4e6a4c6a3"}, + {file = "simplejson-3.17.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:23fe704da910ff45e72543cbba152821685a889cf00fc58d5c8ee96a9bad5f94"}, + {file = "simplejson-3.17.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f444762fed1bc1fd75187ef14a20ed900c1fbb245d45be9e834b822a0223bc81"}, + {file = "simplejson-3.17.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:681eb4d37c9a9a6eb9b3245a5e89d7f7b2b9895590bb08a20aa598c1eb0a1d9d"}, + {file = "simplejson-3.17.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8e8607d8f6b4f9d46fee11447e334d6ab50e993dd4dbfb22f674616ce20907ab"}, + {file = "simplejson-3.17.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b10556817f09d46d420edd982dd0653940b90151d0576f09143a8e773459f6fe"}, + {file = "simplejson-3.17.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e1ec8a9ee0987d4524ffd6299e778c16cc35fef6d1a2764e609f90962f0b293a"}, + {file = "simplejson-3.17.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0b4126cac7d69ac06ff22efd3e0b3328a4a70624fcd6bca4fc1b4e6d9e2e12bf"}, + {file = "simplejson-3.17.6-cp38-cp38-win32.whl", hash = "sha256:35a49ebef25f1ebdef54262e54ae80904d8692367a9f208cdfbc38dbf649e00a"}, + {file = "simplejson-3.17.6-cp38-cp38-win_amd64.whl", hash = 
"sha256:743cd768affaa508a21499f4858c5b824ffa2e1394ed94eb85caf47ac0732198"}, + {file = "simplejson-3.17.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb62d517a516128bacf08cb6a86ecd39fb06d08e7c4980251f5d5601d29989ba"}, + {file = "simplejson-3.17.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:12133863178a8080a3dccbf5cb2edfab0001bc41e5d6d2446af2a1131105adfe"}, + {file = "simplejson-3.17.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5540fba2d437edaf4aa4fbb80f43f42a8334206ad1ad3b27aef577fd989f20d9"}, + {file = "simplejson-3.17.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d74ee72b5071818a1a5dab47338e87f08a738cb938a3b0653b9e4d959ddd1fd9"}, + {file = "simplejson-3.17.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:28221620f4dcabdeac310846629b976e599a13f59abb21616356a85231ebd6ad"}, + {file = "simplejson-3.17.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b09bc62e5193e31d7f9876220fb429ec13a6a181a24d897b9edfbbdbcd678851"}, + {file = "simplejson-3.17.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7255a37ff50593c9b2f1afa8fafd6ef5763213c1ed5a9e2c6f5b9cc925ab979f"}, + {file = "simplejson-3.17.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:401d40969cee3df7bda211e57b903a534561b77a7ade0dd622a8d1a31eaa8ba7"}, + {file = "simplejson-3.17.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a649d0f66029c7eb67042b15374bd93a26aae202591d9afd71e111dd0006b198"}, + {file = "simplejson-3.17.6-cp39-cp39-win32.whl", hash = "sha256:522fad7be85de57430d6d287c4b635813932946ebf41b913fe7e880d154ade2e"}, + {file = "simplejson-3.17.6-cp39-cp39-win_amd64.whl", hash = "sha256:3fe87570168b2ae018391e2b43fbf66e8593a86feccb4b0500d134c998983ccc"}, + {file = "simplejson-3.17.6.tar.gz", hash = "sha256:cf98038d2abf63a1ada5730e91e84c642ba6c225b0198c3684151b1f80c5f8a6"}, +] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +sortedcontainers = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] +soupsieve = [ + {file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"}, + {file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"}, +] +sqlitedict = [ + {file = "sqlitedict-1.7.0.tar.gz", hash = "sha256:2affcc301aacd4da7511692601ecbde392294205af418498f7d6d3ec0dbcad56"}, +] +tatsu = [ + {file = "TatSu-4.4.0-py2.py3-none-any.whl", hash = "sha256:c9211eeee9a2d4c90f69879ec0b518b1aa0d9450249cb0dd181f5f5b18be0a92"}, + {file = "TatSu-4.4.0.zip", hash = "sha256:80713413473a009f2081148d0f494884cabaf9d6866b71f2a68a92b6442f343d"}, +] +threadpoolctl = [ + {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, + {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = 
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] +urllib3 = [ + {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, + {file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, +] +zipp = [ + {file = "zipp-3.9.0-py3-none-any.whl", hash = "sha256:972cfa31bc2fedd3fa838a51e9bc7e64b7fb725a8c00e7431554311f180e9980"}, + {file = "zipp-3.9.0.tar.gz", hash = "sha256:3a7af91c3db40ec72dd9d154ae18e008c69efe8ca88dde4f9a731bb82fe2f9eb"}, +] diff --git a/precompute.nf b/precompute.nf index 3d3458f88..48fd0dc84 100644 --- a/precompute.nf +++ b/precompute.nf @@ -13,6 +13,9 @@ include { query as prev_query} from './workflows/precompute/utils' include { query as basic_query} from './workflows/precompute/utils' include { query as orf_query} from './workflows/precompute/utils' +include { slack_closure } from './workflows/utils/slack' +include { slack_message } from './workflows/utils/slack' + process build_precompute_context { input: path('species-repeats*') @@ -118,8 +121,6 @@ process process_range { } process load_data { - beforeScript 'slack db-work loading-precompute || true' - afterScript 'slack db-done loading-precompute || true' input: path('precompute*.csv') @@ -139,6 +140,9 @@ process load_data { workflow precompute { take: _flag main: + + Channel.of("Starting precompute pipeline") | slack_message + Channel.fromPath('files/precompute/get-accessions/query.sql') | set { accession_query } Channel.fromPath('files/precompute/load.ctl') | set { data_ctl } Channel.fromPath('files/precompute/qa.ctl') | set { qa_ctl } @@ -194,3 +198,13 @@ workflow precompute { workflow { precompute(Channel.of(true)) } + +workflow.onComplete { + + slack_closure("Precompute workflow completed. Data import complete") +} + +workflow.onError { + + slack_closure("Precompute workflow encountered an error and crashed") +} diff --git a/prepare-environment.nf b/prepare-environment.nf new file mode 100644 index 000000000..90d654a24 --- /dev/null +++ b/prepare-environment.nf @@ -0,0 +1,49 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +include { slack_closure } from './workflows/utils/slack' +include { slack_message } from './workflows/utils/slack' + +/* Get some data downloaded and in the right place */ + +/* On the cluster this is much much faster than wget */ +process get_r2dt_data { + queue 'datamover' + executor 'lsf' + container '' + + input: + val(data_dir) + + script: + """ + echo "$data_dir" + if [ ! -d $data_dir ] + then + mkdir -p $data_dir + fi + + cd $data_dir + + cp /nfs/ftp/public/databases/RNAcentral/r2dt/1.3/cms.tar.gz . 
+ + tar -xf cms.tar.gz --strip-components=1 -C ./cms + """ +} + +workflow prepare_environment { + main: + Channel.of("Starting environment preparation") | slack_message + + Channel.of("$params.r2dt.cms_path/../")| get_r2dt_data +} + +workflow { + Channel.of("Starting...") | slack_message + prepare_environment() +} + +workflow.onComplete { + slack_closure("Environment preparation completed") +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..40e730ff0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,48 @@ +[tool.poetry] +name = "rnacentral_pipeline" +version = "0.1.0" +description = "The pipeline that imports all RNAcentral data" +authors = ["Blake Sweeney "] + +[tool.poetry.dependencies] +python = "^3.8" +PyMySQL = "^1.0.2" +attrs = "^21.4.0" +beautifulsoup4 = "^4.10.0" +biopython = "^1.79" +click = "^8.0.3" +click-aliases = "^1.0.1" +furl = "^2.1.3" +gffutils = "^0.10.1" +humanfriendly = "^10.0" +ijson = "^3.1.4" +intervaltree = "^3.1.0" +jsonschema = "^4.3.3" +lxml = "^4.7.1" +more-itertools = "^8.12.0" +obonet = "^0.3.0" +pandas = "^1.3.5" +PyPika = "^0.48.8" +ratelimiter = "^1.2.0" +requests = "^2.27.1" +retry = "^0.9.2" +scikit-learn = "^1.0.2" +semver = "^2.13.0" +sqlitedict = "^1.7.0" +TatSu = "4.4.0" +psycopg2 = "2.9.3" + +[tool.poetry.dev-dependencies] +pytest = "^6.2.5" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +xfail_strict = true +filterwarnings = "ignore::DeprecationWarning" +markers = [ + "slow: Tests that take a long time", + "db: Test that require access to our database", +] diff --git a/references-manually-annotated.nf b/references-manually-annotated.nf new file mode 100644 index 000000000..c42f2a961 --- /dev/null +++ b/references-manually-annotated.nf @@ -0,0 +1,35 @@ +nextflow.enable.dsl=2 + +process get_ids { + publishDir "$baseDir/workflows/references/manually_annotated/", mode: 'copy' + + input: + path(query) + + output: + path('results') + + script: + """ + psql -t -A -f $query "$PGDATABASE" > results + """ +} + +process split_by_db { + publishDir "$baseDir/workflows/references/manually_annotated/", mode: 'copy' + + input: + file(results) + + output: + path('from_*') + + script: + """ + references-manually-annotated.py $results from_* + """ +} + +workflow { + Channel.fromPath('workflows/references/manually_annotated/query.sql') | get_ids | split_by_db +} diff --git a/references-metadata-rnacentral.nf b/references-metadata-rnacentral.nf new file mode 100644 index 000000000..fcf78fcf0 --- /dev/null +++ b/references-metadata-rnacentral.nf @@ -0,0 +1,38 @@ +nextflow.enable.dsl=2 + +process get_urs { + publishDir "$baseDir/workflows/references/metadata/rnacentral", mode: 'copy' + + input: + path(database) + + output: + path("urs_${database.baseName}") + + script: + """ + metadata-rnacentral.py $database urs_${database.baseName} + """ +} + +process get_job { + publishDir "$baseDir/workflows/references/metadata/rnacentral", mode: 'copy' + + input: + path(database) + + output: + path("job_${database.baseName}") + + script: + """ + metadata-rnacentral.py $database job_${database.baseName} + """ +} + + + +workflow { + Channel.fromPath('workflows/references/results/*.txt') | get_urs + Channel.fromPath('workflows/references/results/*.txt') | get_job +} diff --git a/references-metadata.nf b/references-metadata.nf index a243ca34c..1a5e4c9bb 100644 --- a/references-metadata.nf +++ b/references-metadata.nf @@ -1,20 +1,46 @@ nextflow.enable.dsl=2 +process create_metadata 
{ + input: + path(database) + + output: + path("metadata_${database.baseName}") + + script: + """ + metadata.py $database metadata_${database.baseName} + """ +} + +process merge_metadata { + input: + file(results) + + output: + path("merged_metadata") + + script: + """ + cat $results | sort -fb | uniq -i > merged_metadata + """ +} + process create_xml { publishDir "$baseDir/workflows/references/metadata/", mode: 'copy' input: - path(database) + file(merged_metadata) output: - path("metadata_${database.baseName}.xml.gz") + path("metadata_*") script: """ - metadata.py $database metadata_${database.baseName}.xml.gz + create_xml_metadata.py $merged_metadata metadata_* """ } workflow { - Channel.fromPath('workflows/references/results/*.txt') | create_xml + Channel.fromPath('workflows/references/results/*.txt') | create_metadata | collect | merge_metadata | create_xml } diff --git a/references.nf b/references.nf index 17f643014..cdad8ba9a 100644 --- a/references.nf +++ b/references.nf @@ -37,21 +37,41 @@ process sort_ids { script: """ - cat $output | sort | uniq > ${database}.txt + cat $output | sort -fb | uniq -i > ${database}.txt + """ +} + +process prepare_to_submit { + publishDir "$baseDir/workflows/references/submit/", mode: 'copy' + + input: + tuple val(database), path("${database}.txt") + + output: + tuple val(database), path("${database}_ids.txt") + + script: + """ + # make a copy of the old version before creating the new file + rm -f $baseDir/workflows/references/submit/previous-release/${database}_ids.txt + mv $baseDir/workflows/references/submit/${database}_ids.txt $baseDir/workflows/references/submit/previous-release + get_unique_ids.sh ${database}.txt $database """ } process submit_ids { input: - tuple val(database), file("${database}.txt") + tuple val(database), file("${database}_ids.txt") script: """ - upload_ids.sh ${database}.txt $database + # submit new ids only + comm -13 $baseDir/workflows/references/submit/previous-release/${database}_ids.txt $baseDir/workflows/references/submit/${database}_ids.txt > new_${database}_ids.txt + upload_ids.sh new_${database}_ids.txt """ } workflow { - Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids - // Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids | submit_ids + Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids | prepare_to_submit + // Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids | prepare_to_submit | submit_ids } diff --git a/report.nf b/report.nf new file mode 100644 index 000000000..99dfe0c81 --- /dev/null +++ b/report.nf @@ -0,0 +1,11 @@ +process send_completion_report { + executor 'local' + + """ + rnac notify report + """ +} + +workflow { + send_completion_report() +} diff --git a/requirements.in b/requirements.in index e43e25a23..af8426f2e 100644 --- a/requirements.in +++ b/requirements.in @@ -21,6 +21,7 @@ requests retry scikit-learn semver +slack_sdk sqlitedict tatsu textblob diff --git a/requirements.txt b/requirements.txt index 2d7883e9d..784c59e7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,63 +1,132 @@ # -# This file is autogenerated by pip-compile +# This file is autogenerated by pip-compile with python 3.8 # To update, run: # # pip-compile --output-file=requirements.txt requirements.in # -argcomplete==1.12.3 # via gffutils -argh==0.26.2 # via gffutils -attrs==20.3.0 # via -r requirements.in, jsonschema -beautifulsoup4==4.9.3 # via -r requirements.in 
-biopython==1.78 # via -r requirements.in -certifi==2020.12.5 # via requests -chardet==4.0.0 # via requests -click-aliases==1.0.1 # via -r requirements.in -click==7.1.2 # via -r requirements.in, click-aliases, nltk -decorator==4.4.2 # via networkx, retry -furl==2.1.2 # via -r requirements.in -gffutils==0.10.1 # via -r requirements.in -humanfriendly==9.1 # via -r requirements.in -idna==2.10 # via requests -ijson==3.1.4 # via -r requirements.in -importlib-metadata==4.0.1 # via argcomplete, jsonschema -intervaltree==3.1.0 # via -r requirements.in -joblib==1.0.1 # via nltk, scikit-learn -jsonschema==3.2.0 # via -r requirements.in -lxml==4.6.3 # via -r requirements.in -more-itertools==8.7.0 # via -r requirements.in -networkx==2.5.1 # via obonet -nltk==3.6.2 # via textblob -numpy==1.20.2 # via biopython, pandas, scikit-learn, scipy -obonet==0.3.0 # via -r requirements.in -orderedmultidict==1.0.1 # via furl -pandas==1.2.4 # via -r requirements.in -psycopg2==2.8.6 # via -r requirements.in -py==1.10.0 # via retry -pyfaidx==0.5.9.5 # via gffutils -pymysql==1.0.2 # via -r requirements.in -pypika==0.48.1 # via -r requirements.in -pyrsistent==0.17.3 # via jsonschema -python-dateutil==2.8.1 # via pandas -pytz==2021.1 # via pandas -ratelimiter==1.2.0.post0 # via -r requirements.in -regex==2021.4.4 # via nltk -requests==2.25.1 # via -r requirements.in -retry==0.9.2 # via -r requirements.in -scikit-learn==0.24.1 # via -r requirements.in -scipy==1.6.2 # via scikit-learn -semver==2.13.0 # via -r requirements.in -simplejson==3.17.2 # via gffutils -six==1.15.0 # via furl, gffutils, jsonschema, orderedmultidict, pyfaidx, python-dateutil -sortedcontainers==2.3.0 # via intervaltree -soupsieve==2.2.1 # via beautifulsoup4 -sqlitedict==1.7.0 # via -r requirements.in -tatsu==4.4.0 # via -r requirements.in -textblob==0.15.3 # via -r requirements.in -threadpoolctl==2.1.0 # via scikit-learn -tqdm==4.60.0 # via nltk -typing-extensions==3.7.4.3 # via importlib-metadata -urllib3==1.26.4 # via requests -zipp==3.4.1 # via importlib-metadata +argcomplete==1.12.3 + # via gffutils +argh==0.26.2 + # via gffutils +attrs==20.3.0 + # via + # -r requirements.in + # jsonschema +beautifulsoup4==4.9.3 + # via -r requirements.in +biopython==1.78 + # via -r requirements.in +certifi==2022.6.15 + # via requests +chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -r requirements.in + # click-aliases + # nltk +click-aliases==1.0.1 + # via -r requirements.in +decorator==4.4.2 + # via + # networkx + # retry +furl==2.1.2 + # via -r requirements.in +gffutils==0.10.1 + # via -r requirements.in +humanfriendly==9.1 + # via -r requirements.in +idna==2.10 + # via requests +ijson==3.1.4 + # via -r requirements.in +intervaltree==3.1.0 + # via -r requirements.in +joblib==1.0.1 + # via + # nltk + # scikit-learn +jsonschema==3.2.0 + # via -r requirements.in +lxml==4.6.3 + # via -r requirements.in +more-itertools==8.7.0 + # via -r requirements.in +networkx==2.5.1 + # via obonet +nltk==3.6.2 + # via textblob +numpy==1.23.0 + # via + # biopython + # pandas + # scikit-learn + # scipy +obonet==0.3.0 + # via -r requirements.in +orderedmultidict==1.0.1 + # via furl +pandas==1.4.3 + # via -r requirements.in +psycopg2==2.8.6 + # via -r requirements.in +py==1.10.0 + # via retry +pyfaidx==0.5.9.5 + # via gffutils +pymysql==1.0.2 + # via -r requirements.in +pypika==0.48.1 + # via -r requirements.in +pyrsistent==0.17.3 + # via jsonschema +python-dateutil==2.8.1 + # via pandas +pytz==2021.1 + # via pandas +ratelimiter==1.2.0.post0 + # via -r requirements.in 
+regex==2021.4.4 + # via nltk +requests==2.25.1 + # via -r requirements.in +retry==0.9.2 + # via -r requirements.in +scikit-learn==1.1.1 + # via -r requirements.in +scipy==1.8.1 + # via scikit-learn +semver==2.13.0 + # via -r requirements.in +simplejson==3.17.2 + # via gffutils +six==1.15.0 + # via + # furl + # gffutils + # jsonschema + # orderedmultidict + # pyfaidx + # python-dateutil +slack-sdk==3.18.1 + # via -r requirements.in +sortedcontainers==2.3.0 + # via intervaltree +soupsieve==2.2.1 + # via beautifulsoup4 +sqlitedict==1.7.0 + # via -r requirements.in +tatsu==4.4.0 + # via -r requirements.in +textblob==0.15.3 + # via -r requirements.in +threadpoolctl==2.1.0 + # via scikit-learn +tqdm==4.60.0 + # via nltk +urllib3==1.26.4 + # via requests # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/rnacentral_pipeline/cli/__init__.py b/rnacentral_pipeline/cli/__init__.py index 7890ea211..9a754a443 100644 --- a/rnacentral_pipeline/cli/__init__.py +++ b/rnacentral_pipeline/cli/__init__.py @@ -23,6 +23,7 @@ crw, ena, ensembl, + expressionatlas, europepmc, five_s_rrnadb, flybase, @@ -40,9 +41,11 @@ mirgenedb, misc, ncbi, + notify, ols, pdb, pirbase, + plncdb, pombase, psicquic, precompute, @@ -55,6 +58,7 @@ rfam, ribovision, search_export, + scan_imports, sgd, silva, snodb, @@ -91,6 +95,7 @@ def cli(log_level): cli.add_command(ena.cli) cli.add_command(ensembl.cli) cli.add_command(europepmc.cli) +cli.add_command(expressionatlas.cli) cli.add_command(five_s_rrnadb.cli) cli.add_command(flybase.cli) cli.add_command(ftp_export.cli) @@ -109,9 +114,11 @@ def cli(log_level): cli.add_command(misc.find_upi_ranges) cli.add_command(misc.validate_pgloader) cli.add_command(ncbi.cli) +cli.add_command(notify.cli) cli.add_command(ols.cli) cli.add_command(pdb.cli) cli.add_command(pirbase.cli) +cli.add_command(plncdb.cli) cli.add_command(pombase.cli) cli.add_command(psicquic.cli) cli.add_command(precompute.cli) @@ -123,6 +130,7 @@ def cli(log_level): cli.add_command(repeats.cli) cli.add_command(rfam.cli) cli.add_command(ribovision.cli) +cli.add_command(scan_imports.cli) cli.add_command(search_export.cli) cli.add_command(sgd.cli) cli.add_command(silva.cli) diff --git a/rnacentral_pipeline/cli/crw.py b/rnacentral_pipeline/cli/crw.py index cc6d7f794..57b35d77c 100644 --- a/rnacentral_pipeline/cli/crw.py +++ b/rnacentral_pipeline/cli/crw.py @@ -19,7 +19,7 @@ from Bio import SeqIO -from rnacentral_pipeline.databases.crw import parser +from rnacentral_pipeline.databases.crw import parser, helpers from rnacentral_pipeline.writers import entry_writer @@ -53,5 +53,5 @@ def process_crw(metadata_file, sequence_directory, output): @click.argument("directory", type=click.Path()) @click.argument("output", type=click.File("w")) def generate_r2dt_fasta(directory, output): - entries = parser.fasta_entries(Path(directory)) + entries = helpers.fasta_entries(Path(directory)) SeqIO.write(entries, output, "fasta") diff --git a/rnacentral_pipeline/cli/ena.py b/rnacentral_pipeline/cli/ena.py index 2a09656d8..70c87d3ed 100644 --- a/rnacentral_pipeline/cli/ena.py +++ b/rnacentral_pipeline/cli/ena.py @@ -13,11 +13,13 @@ limitations under the License. 
""" +import os from pathlib import Path import click from rnacentral_pipeline.databases.ena import context, parser +from rnacentral_pipeline.rnacentral.notify.slack import send_notification from rnacentral_pipeline.writers import entry_writer @@ -58,6 +60,26 @@ def process_ena( builder.with_dr(ena_file) ctx = builder.context() entries = parser.parse_with_context(ctx, ena_file) - with entry_writer(Path(output)) as writer: - writer.write(entries) + try: + with entry_writer(Path(output)) as writer: + writer.write(entries) + except ValueError: + print("No entries could be written for one of the parsed ENA files.") + print("Sending warning to slack, but carrying on") + + # Dump this again to attach to the report + ctx.dump_counts(Path(counts)) + + message = f"No entries could be written for ENA file {ena_file}\n" + message += "This may be correct, but you should check\n" + message += f"Working directory: {os.getcwd()}\n" + message += "Ribotyper log:\n" + message += open( + Path(ribovore_path) / "ribotyper-results.ribotyper.log", "r" + ).read() + message += "\n\nContext counts:\n" + message += open(Path(counts), "r").read() + + send_notification("ENA parsing error", message) + ctx.dump_counts(Path(counts)) diff --git a/rnacentral_pipeline/cli/ensembl.py b/rnacentral_pipeline/cli/ensembl.py index db6a082a6..9c20ca59a 100644 --- a/rnacentral_pipeline/cli/ensembl.py +++ b/rnacentral_pipeline/cli/ensembl.py @@ -14,21 +14,22 @@ """ import csv -from pathlib import Path import itertools as it import operator as op +from pathlib import Path import click -from rnacentral_pipeline.databases.ensembl.metadata import assemblies -from rnacentral_pipeline.databases.ensembl.metadata import compara -from rnacentral_pipeline.databases.ensembl.metadata import coordinate_systems -from rnacentral_pipeline.databases.ensembl.metadata import karyotypes -from rnacentral_pipeline.databases.ensembl.metadata import proteins +from rnacentral_pipeline.databases.ensembl import parser, pseudogenes, urls from rnacentral_pipeline.databases.ensembl.data import Division -from rnacentral_pipeline.databases.ensembl import parser -from rnacentral_pipeline.databases.ensembl import pseudogenes -from rnacentral_pipeline.databases.ensembl import urls +from rnacentral_pipeline.databases.ensembl.metadata import ( + assemblies, + compara, + coordinate_systems, + karyotypes, + proteins, +) +from rnacentral_pipeline.rnacentral.notify import slack from rnacentral_pipeline.writers import entry_writer @@ -81,8 +82,20 @@ def parse_data(division, embl_file, gff_file, output, family_file=None): if family_file: family_file = Path(family_file) entries = parser.parse(division, embl_file, gff_file, family_file=family_file) - with entry_writer(Path(output)) as writer: - writer.write(entries) + ## Send warning to slack with details about empty parse + try: + with entry_writer(Path(output)) as writer: + writer.write(entries) + except ValueError: + print("Empty entries, implies no ncRNAs. You should check that") + message = f"No ncRNA entries found for {embl_file.name}, or {gff_file.name}. 
Empty data supplied for now, but you should check the legitimacy of this result.\n" + message += "For reference, the other parameters to the parser were:\n" + message += f"division: {division}\n" + message += f"embl_file: {embl_file.name}\n" + message += f"gff_file: {gff_file.name}\n" + message += f"family_file: {family_file.name}\n" + + slack.send_notification("Ensembl parser error", message) @cli.command("assemblies") diff --git a/rnacentral_pipeline/cli/expressionatlas.py b/rnacentral_pipeline/cli/expressionatlas.py new file mode 100644 index 000000000..5625a7bd9 --- /dev/null +++ b/rnacentral_pipeline/cli/expressionatlas.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2021] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pathlib import Path + +import click + +from rnacentral_pipeline.databases.expressionatlas import parser +from rnacentral_pipeline.writers import entry_writer + + +@click.group("expressionatlas") +def cli(): + """ + Commands for parsing expression atlas data + """ + + +@cli.command("parse") +@click.option("--db-url", envvar="PGDATABASE") +@click.argument("csv_file", type=click.File("r")) +@click.argument( + "output", + default=".", + type=click.Path(writable=True, dir_okay=True, file_okay=False), +) +def process_csv(csv_file, output, db_url): + """ + Process the csv generated by linking EA data to rnc data + """ + entries = parser.parse(csv_file, db_url) + with entry_writer(Path(output)) as writer: + writer.write(entries) diff --git a/rnacentral_pipeline/cli/notify.py b/rnacentral_pipeline/cli/notify.py new file mode 100644 index 000000000..28afdac56 --- /dev/null +++ b/rnacentral_pipeline/cli/notify.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2017] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import click +import os + +from rnacentral_pipeline.rnacentral.notify.slack import send_notification, pipeline_report +from rnacentral_pipeline import db + + + +@click.group("notify") +def cli(): + """ + This group of commands deals with sending notifications + """ + +@cli.command("step") +@click.argument("title", type=click.STRING) +@click.argument("message", type=click.STRING) +def notify_step(title, message): + """ + Send a simple message, maybe when a step finishes + """ + send_notification(title, message) + + +@cli.command("query") +@click.argument("title", type=click.STRING) +@click.argument("query", type=click.STRING) +def notify_query(title, query): + """ + Run a query against the database, then format the result into markdown and + send as a notification in slack. + """ + + # parse query - try to figure out what table headings to give + # There is probably a better way to do this + args = [ + arg.strip() for arg in + query.upper().removeprefix("SELECT ").split("FROM")[0].split(',') + ] + + headerline = f"{' : '.join(args)} \n" + + # get PGDATABASE url from environment + PGDATABASE = os.getenv("PGDATABASE") + + # run query, convert output to list of tuples + result = list(db.run_query(PGDATABASE, query, commit_on_leave=False)) + + markdown_string = "" + markdown_string += f"Result of query: {query}\n\n" + markdown_string += headerline + + # add the results to the message... + for res in result: + markdown_string += f"- {' : '.join([str(r) for r in res])} \n" + + markdown_string += '\n' + + send_notification(title, markdown_string) + + + +@cli.command("file") +@click.argument("path", type=click.File()) +def notify_file(path): + """ + Read a mrkdwn formatted file and send as a message in slack. + """ + send_notification("", path.read()) + +@cli.command("report") +def notify_report(): + """ + Generate a run report and send it as mrkdwn + """ + pipeline_report() diff --git a/rnacentral_pipeline/cli/pdb.py b/rnacentral_pipeline/cli/pdb.py index 7b6dad455..e650581dc 100644 --- a/rnacentral_pipeline/cli/pdb.py +++ b/rnacentral_pipeline/cli/pdb.py @@ -13,14 +13,15 @@ limitations under the License. """ +import collections as coll +import csv import logging from pathlib import Path import click -from rnacentral_pipeline.databases.pdb import fetch -from rnacentral_pipeline.databases.pdb import parser from rnacentral_pipeline import writers +from rnacentral_pipeline.databases.pdb import fetch, helpers, parser LOGGER = logging.getLogger(__name__) @@ -44,12 +45,24 @@ def cli(): file_okay=False, ), ) -def process_pdb(output, skip_references=False): +@click.option( + "--override-chains", + default=None, + type=click.File("r"), +) +def process_pdb(output, skip_references=False, override_chains=None): """ This will fetch and parse all sequence data from PDBe to produce the csv files we import. 
""" - chain_info = fetch.rna_chains() + pdb_ids = set() + overrides = set() + if override_chains: + LOGGER.info("Loading chain overrides") + overrides = helpers.load_overrides(override_chains) + LOGGER.info("Loaded %i chain overrides", len(pdb_ids)) + chain_info = fetch.rna_chains(overrides) + LOGGER.info("Loaded %i chains", len(chain_info)) references = {} try: if not skip_references: @@ -57,6 +70,6 @@ def process_pdb(output, skip_references=False): except Exception: LOGGER.info("Failed to get extra references") - entries = parser.parse(chain_info, references) + entries = parser.parse(chain_info, references, overrides) with writers.entry_writer(Path(output)) as writer: writer.write(entries) diff --git a/rnacentral_pipeline/cli/plncdb.py b/rnacentral_pipeline/cli/plncdb.py new file mode 100644 index 000000000..1af85c627 --- /dev/null +++ b/rnacentral_pipeline/cli/plncdb.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2020] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pathlib import Path + +import click +from furl import furl + +import requests + +from rnacentral_pipeline.databases.plncdb import parser +from rnacentral_pipeline.writers import entry_writer +from rnacentral_pipeline.rnacentral.notify.slack import send_notification + +@click.group("plncdb") +def cli(): + """ + A group of commands dealing with PLncDB data. 
+ """ + pass + + +@cli.command("parse") +@click.argument("data", type=click.Path(dir_okay=True, readable=True, file_okay=False)) +@click.argument( + "output", + default=".", + type=click.Path(writable=True, dir_okay=True, file_okay=False), +) +def parse(data, output): + entries = parser.parse(Path(data)) + with entry_writer(Path(output)) as writer: + try: + writer.write(entries) + except ValueError as e: + print(e) + + +@cli.command("fetch-data") +@click.argument("urls", type=click.Path(writable=False, file_okay=True, dir_okay=False)) +@click.argument("destination", type=click.Path(writable=True, file_okay=False, dir_okay=True), default='.') +def fetch_data(urls, destination): + url_dict = {} + with open(urls, 'r') as url_file: + for url_line in url_file: + url_dict[url_line.split(',')[0]] = url_line.split(',')[1:] + + + for dir_name in url_dict.keys(): + print(f"Getting data for {dir_name}") + + send_notification("PLncDB Download", f"Getting data for {dir_name}") + + target_path = Path(destination) / dir_name + target_path.mkdir(exist_ok=True, parents=True) + for url in url_dict[dir_name]: + download_file(url, target_path) + print(f"All data for {dir_name} is downloaded") + + +def download_file(url, destination=Path('.')): + local_filename = url.split('/')[-1] + if (destination / local_filename).exists(): + return local_filename + # NOTE the stream=True parameter below + with requests.get(url.strip(), stream=True) as r: + r.raise_for_status() + with open(destination / local_filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + # If you have chunk encoded response uncomment if + # and set chunk_size parameter to None. + f.write(chunk) + return local_filename diff --git a/rnacentral_pipeline/cli/scan_imports.py b/rnacentral_pipeline/cli/scan_imports.py new file mode 100644 index 000000000..883d7ef0a --- /dev/null +++ b/rnacentral_pipeline/cli/scan_imports.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2018] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import click +import psycopg2 +import psycopg2.extras +import pandas as pd + + +@click.group("scan-imports") +def cli(): + """ + A group of commands to scan imports and decide what to run + """ + pass + +""" +This is the process I think + +manual selection -> csv file +csv file: db_name, remote -> nf runs process(check_db_md5) -> list of db_name: md5 +list of db_name: md5 -> nf runs process(select_for_import) -> db_selection.config + +main pipeline includes selection.config to switch on/off the right dbs + +md5 creation can be done in shell for now, could do something with stripping the +metadata of json files to compare only the actual data md5 (date would change the overall sum) + +""" + + + + +@cli.command("select-for-import") +@click.option("--db-url", envvar="PGDATABASE") +@click.argument("db_md5_map") +@click.argument("output", default="db_selection.config") +def select_db_to_import(db_md5_map, output, db_url=None, type=click.Path(writable=True,dir_okay=False,file_okay=True)): + """ + Takes the map of db name to md5 sum and queries our DB to select those DBs that can usefully be imported + + Outputs a config file that the weekly import includes to switch on/off the relevant DBs + """ + + selection_template = """params {{ + databases {{ + {0} + {1} + }} + }}""" + + latest_checksums = pd.read_csv(db_md5_map, names=["db_name", "checksum"]) + + conn = psycopg2.connect(db_url) + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + cur.execute("SELECT * FROM rnc_import_tracker;") + + prev_checksums = pd.DataFrame(cur.fetchall()) + + cur.close() + conn.close() + + selection = latest_checksums.join(prev_checksums.set_index("db_name"), on="db_name").query("checksum != file_md5")['db_name'].values + deselection = latest_checksums.join(prev_checksums.set_index("db_name"), on="db_name").query("checksum == file_md5")['db_name'].values + + selection = [f"{s}.run = true" for s in selection] + deselection = [f"{s}.run = false" for s in deselection] + + activation_string = "\n\t\t".join(selection) + deactivation_string = "\n\t\t".join(deselection) + + with open(output, 'w') as selection_config: + selection_config.write(selection_template.format(activation_string, deactivation_string)) + +@cli.command("update-tracker") +@click.argument("latest_md5s") +@click.option("--db-url", envvar="PGDATABASE") +def update_tracker(latest_md5s, db_url): + latest_checksums = pd.read_csv(latest_md5s, names=["db_name", "checksum"]) + + conn = psycopg2.connect(db_url) + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + cur.execute("SELECT * FROM rnc_import_tracker;") + + prev_checksums = pd.DataFrame(cur.fetchall()) + + selection = latest_checksums.join(prev_checksums.set_index("db_name"), on="db_name").query("checksum != file_md5") + selection['db_name'] = selection['db_name'].apply(lambda x: x.upper()) + + print(cur.execute("SELECT * FROM rnc_database WHERE descr = ANY(%s);", (list(selection['db_name'].values),) ) ) + all_dbs = pd.DataFrame(cur.fetchall()) + + insert_data = selection.join(all_dbs.set_index('descr'), on='db_name', rsuffix='_r').filter(items=["db_name", "id_r", "checksum"]) + + print(insert_data) + + for idx, row in insert_data.iterrows(): + print(row) + db_name = row[0].lower() + db_id = row[1] + checksum = row[2] + + cur.execute("TRUNCATE TABLE rnc_import_tracker") + cur.execute("INSERT INTO rnc_import_tracker(db_name, db_id, last_import_date, file_md5) VALUES (%s, %s, CURRENT_TIMESTAMP, %s) ", (db_name, db_id, checksum,)) + + conn.commit() + cur.close() + 
conn.close() diff --git a/rnacentral_pipeline/databases/crw/helpers.py b/rnacentral_pipeline/databases/crw/helpers.py index daf5395ae..bcfd1c7f9 100644 --- a/rnacentral_pipeline/databases/crw/helpers.py +++ b/rnacentral_pipeline/databases/crw/helpers.py @@ -55,7 +55,7 @@ def lineage(row: ty.Dict[str, ty.Any]) -> str: def sequence(row: ty.Dict[str, ty.Any], sequences: ty.Dict[str, SeqRecord]) -> str: - return str(sequences[row["model_name"]].seq) + return str(sequences[row["model_name"]].seq).upper().replace("U", "T") def description(row: ty.Dict[str, ty.Any]) -> str: @@ -68,8 +68,11 @@ def description(row: ty.Dict[str, ty.Any]) -> str: def organelle(row: ty.Dict[str, ty.Any]) -> ty.Optional[str]: - return ORGANELLE_MAPPING.get(row["cellular_location"], None) - + cellular_location = row.get("cellular_location", None) + if cellular_location is not None: + return ORGANELLE_MAPPING.get(row["cellular_location"], None) + else: + return None def as_entry(row: ty.Dict[str, ty.Any], sequences) -> ty.Optional[data.Entry]: try: @@ -99,11 +102,11 @@ def as_entry(row: ty.Dict[str, ty.Any], sequences) -> ty.Optional[data.Entry]: def fasta_entries(directory: Path) -> ty.Iterable[SeqRecord]: - model_pattern = re.compile("crw-bpseq/(.+).bpseq") + model_pattern = re.compile("crw-bpseq/(.+).bpseq ") for fasta_file in directory.glob("*.fasta"): with fasta_file.open("r") as raw: header, sequence, _ = raw.readlines() matches = re.search(model_pattern, header) if matches is None: raise ValueError(f"Could not get model id from {header}") - yield SeqRecord(Seq(sequence), id=matches.group(1)) + yield SeqRecord(Seq(sequence.strip()), id=matches.group(1)) diff --git a/rnacentral_pipeline/databases/data/databases.py b/rnacentral_pipeline/databases/data/databases.py index 94fb6cc38..9eefd19d0 100644 --- a/rnacentral_pipeline/databases/data/databases.py +++ b/rnacentral_pipeline/databases/data/databases.py @@ -39,6 +39,7 @@ class Database(enum.Enum): ensembl_metazoa = DatabaseValue(5, "Ensembl Metazoa") ensembl_plants = DatabaseValue(6, "Ensembl Plants") ensembl_protists = DatabaseValue(7, "Ensembl Protists") + expression_atlas = DatabaseValue(51, "Expression Atlas") five_srrnadb = DatabaseValue(8, "5SrRNAdb") flybase = DatabaseValue(9, "FlyBase") gencode = DatabaseValue(10, "Ensembl/GENCODE") @@ -59,6 +60,7 @@ class Database(enum.Enum): noncode = DatabaseValue(25, "NONCODE") pdbe = DatabaseValue(26, "PDBe") pirbase = DatabaseValue(27, "PirBase") + plncdb = DatabaseValue(50, "PLncDB") pombase = DatabaseValue(28, "PomBase") psicquic = DatabaseValue(48, "PSICQUIC") rdp = DatabaseValue(29, "RDP") diff --git a/rnacentral_pipeline/databases/data/entry.py b/rnacentral_pipeline/databases/data/entry.py index 4ba206fb0..4e87519cd 100644 --- a/rnacentral_pipeline/databases/data/entry.py +++ b/rnacentral_pipeline/databases/data/entry.py @@ -28,10 +28,10 @@ from . import utils from .features import SequenceFeature +from .go_annotations import GoTermAnnotation from .references import IdReference, Reference -from .secondary_structure import SecondaryStructure from .regions import Exon, SequenceRegion -from .go_annotations import GoTermAnnotation +from .secondary_structure import SecondaryStructure LOGGER = logging.getLogger(__name__) @@ -177,7 +177,10 @@ def gene_synonym(self) -> str: """ Returns a comma separated list of gene synonyms. 
""" - return ",".join(self.gene_synonyms) + if self.gene_synonyms: + return ",".join(self.gene_synonyms) + else: + return "" @property def feature_location_start(self): diff --git a/rnacentral_pipeline/databases/data/utils.py b/rnacentral_pipeline/databases/data/utils.py index 63a09f824..7d8388275 100644 --- a/rnacentral_pipeline/databases/data/utils.py +++ b/rnacentral_pipeline/databases/data/utils.py @@ -28,6 +28,7 @@ "Y_RNA": "SO:0000405", "antisense_RNA": "SO:0000644", "autocatalytically_spliced_intron": "SO:0000588", + "circRNA": "SO:0002291", "guide_RNA": "SO:0000602", "hammerhead_ribozyme": "SO:0000380", "lncRNA": "SO:0001877", @@ -41,6 +42,7 @@ "ribozyme": "SO:0000374", "scRNA": "SO:0000013", "scaRNA": "SO:0002095", + "sgRNA": "SO:0001998", "siRNA": "SO:0000646", "snRNA": "SO:0000274", "snoRNA": "SO:0000275", diff --git a/rnacentral_pipeline/databases/ensembl/genomes/parser.py b/rnacentral_pipeline/databases/ensembl/genomes/parser.py index a5a8d27cb..4715bb73a 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/parser.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/parser.py @@ -13,20 +13,19 @@ limitations under the License. """ -import operator as op import itertools as it +import operator as op import typing as ty from Bio import SeqIO from rnacentral_pipeline.databases import data -from rnacentral_pipeline.databases.helpers import embl -from rnacentral_pipeline.databases.ensembl.vertebrates import helpers as ensembl - from rnacentral_pipeline.databases.ensembl import helpers as common +from rnacentral_pipeline.databases.ensembl.data import Pseudogene from rnacentral_pipeline.databases.ensembl.genomes import helpers from rnacentral_pipeline.databases.ensembl.genomes.data import Context -from rnacentral_pipeline.databases.ensembl.data import Pseudogene +from rnacentral_pipeline.databases.ensembl.vertebrates import helpers as ensembl +from rnacentral_pipeline.databases.helpers import embl def ncrnas(context: Context, handle) -> ty.Iterable[data.Entry]: @@ -57,20 +56,29 @@ def parse(context: Context, handle) -> ty.Iterable[data.Entry]: def pseudogenes(handle: ty.IO) -> ty.Iterable[Pseudogene]: - for record in SeqIO.parse(handle, "embl"): - current_gene = None - for feature in record.features: - if feature.type == "source": - continue - - if embl.is_gene(feature) and help: - current_gene = feature - - if helpers.is_pseudogene(current_gene, feature): - gene = embl.gene(feature) - if not gene: + try: + for record in SeqIO.parse(handle, "embl"): + current_gene = None + for feature in record.features: + if feature.type == "source": continue - yield Pseudogene( - gene=embl.gene(feature), - region=common.regions(record, feature)[0], - ) + + if embl.is_gene(feature) and help: + current_gene = feature + + if helpers.is_pseudogene(current_gene, feature): + gene = embl.gene(feature) + if not gene: + continue + yield Pseudogene( + gene=embl.gene(feature), + region=common.regions(record, feature)[0], + ) + except UnicodeDecodeError: + import os + + print(f"UTF-8 error in file {handle.name}. Abort parsing.") + print(f"The working directory is {os.getcwd()}") + message = f"UFT-8 error in file {handle.name} during pseudogenes parsing. 
Aborting parse\n" + message += f"Working directory: {os.getcwd()}" + slack.send_notification("Ensembl parser error", message) diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index 70da8a196..5ae922997 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -14,11 +14,11 @@ """ import json -import tempfile -from ftplib import FTP import logging +import tempfile import typing as ty from contextlib import contextmanager +from ftplib import FTP from rnacentral_pipeline.databases.ensembl.data import Division, FtpInfo @@ -71,16 +71,17 @@ def generate_paths( gff_path = f"{base}/{release}/gff3/{name}/{organism_name}.gff3.gz" data_files = f"{base}/{release}/embl/{name}/{organism_name}.*.dat.gz" - try: - size = ftp.size(gff_path) - if size is None: - LOGGER.warn("GFF file %s is empty, skip %s", gff_path, assembly) - continue - except: - LOGGER.warn( - "Could not get data for %s, skipping %s", gff_path, assembly - ) - continue + # try: + # size = ftp.size(gff_path) + # if size is None: + # LOGGER.warn("GFF file %s is empty, skip %s", gff_path, assembly) + # continue + # except e: + # LOGGER.warn( + # "Could not get data for %s, skipping %s", gff_path, assembly + # ) + # print(e) + # continue yield FtpInfo( division=division, diff --git a/rnacentral_pipeline/databases/europepmc/xml.py b/rnacentral_pipeline/databases/europepmc/xml.py index fb30d5bfb..53d279f5f 100644 --- a/rnacentral_pipeline/databases/europepmc/xml.py +++ b/rnacentral_pipeline/databases/europepmc/xml.py @@ -203,7 +203,7 @@ def node_to_reference(node): def parse(xml_file): - for _, node in ET.iterparse(xml_file, events=("end",), tag="PMC_ARTICLE"): + for _, node in ET.iterparse(xml_file, recover=True, events=("end",), tag="PMC_ARTICLE"): ref = node_to_reference(node) if not ref: continue diff --git a/rnacentral_pipeline/databases/expressionatlas/__init__.py b/rnacentral_pipeline/databases/expressionatlas/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnacentral_pipeline/databases/expressionatlas/helpers.py b/rnacentral_pipeline/databases/expressionatlas/helpers.py new file mode 100644 index 000000000..4329e3fc6 --- /dev/null +++ b/rnacentral_pipeline/databases/expressionatlas/helpers.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-current] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from rnacentral_pipeline.databases.data import Entry, Exon, SequenceRegion +from rnacentral_pipeline.databases.helpers import phylogeny as phy +from rnacentral_pipeline.databases.helpers import publications as pubs + + +def accession(info): + return "EXPRESSIONATLAS:" + info["GeneID"] + + +def primary_id(info): + return "EXPRESSIONATLAS:" + info["GeneID"] + + +def taxid(info): + taxid = info["taxid"][0] + return int(taxid) + + +def species(info): + return phy.species(info["taxid"][0]) + + +def lineage(info): + return phy.lineage(info["taxid"][0]) + + +def common_name(info): + return phy.common_name(info["taxid"][0]) + + +def url(experiment): + return "https://www.ebi.ac.uk/gxa/experiments/" + experiment + + +def region_builder(info): + print(info["region_start"], info["region_stop"], info["strand"], info["urs_taxid"]) + return [ + SequenceRegion( + chromosome=info["chromosome"][0], + strand=info["strand"][0], + exons=[ + Exon(start=start, stop=stop) + for start, stop in zip(info["region_start"], info["region_stop"]) + ], + assembly_id=info["assembly_id"][0], + coordinate_system="1-start, fully-closed", + ) + ] + + +def references(interactions): + refs = set() + for interaction in interactions: + refs.update(interaction.publications) + refs.add(pubs.reference(24234451)) + return list(refs) + + +def as_entry(info, experiment): + synonyms = list( + filter(None, [""] if info["Gene Name"] == [None] else info["Gene Name"]) + ) + return Entry( + primary_id=primary_id(info), + accession=accession(info), + ncbi_tax_id=taxid(info), + database="EXPRESSIONATLAS", + sequence=info["seq"][0], + regions=region_builder(info), + rna_type=info["rna_type"][0], + url=url(experiment), + seq_version="1", + description=info["description"][0], + species=species(info), + common_name=common_name(info), + lineage=lineage(info), + gene=info["GeneID"][0], + gene_synonyms=synonyms, + ) diff --git a/rnacentral_pipeline/databases/expressionatlas/lookup.py b/rnacentral_pipeline/databases/expressionatlas/lookup.py new file mode 100644 index 000000000..14a7cc160 --- /dev/null +++ b/rnacentral_pipeline/databases/expressionatlas/lookup.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-current] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import operator as op +from rnacentral_pipeline.rnacentral import lookup + +QUERY = """ +select + pre.id as id, + pre.rna_type, + COALESCE(rna.seq_short, rna.seq_long) as sequence, + pre.description + +from rnc_rna_precomputed pre +join rna on rna.upi = pre.upi +where + pre.id in %s +""" + +def ids(interactions): + getter = op.attrgetter("urs_taxid") + return {getter(r) for r in interactions} + + +def mapping(db_url, data): + """ + lookup URS as a mapping, gets just enough information to create a valid + entry object + + This is fairly unpleasant, but data is noq a load of tuples, so we have to + extract the URS from it to use here. 
+ + The other element of the tupe is the Gene ID, used for constructing the + URL later + """ + _mapping = lookup.as_mapping(db_url, map(op.itemgetter(0), data), QUERY) + for idx, value in enumerate(_mapping.values()): + value["sequence"] = value["sequence"].replace("U", "T") + return _mapping diff --git a/rnacentral_pipeline/databases/expressionatlas/parser.py b/rnacentral_pipeline/databases/expressionatlas/parser.py new file mode 100644 index 000000000..3a5e3ea20 --- /dev/null +++ b/rnacentral_pipeline/databases/expressionatlas/parser.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-current] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import json +import operator as op +import typing as ty + +from rnacentral_pipeline.databases import data + +from . import helpers + + +def as_expression(mapping): + + pass + + +def parse(handle, db_url): + """ + Process the jsonlines output from Rust into entries. + + The jsonlines output is already grouped by geneID and urs taxid so this + should give us the transcript level linkage we're after without any further + processing. + """ + for line in handle: + hit = json.loads(line) + for experiment in hit["experiment"]: + print(hit) + yield helpers.as_entry(hit, experiment) diff --git a/rnacentral_pipeline/databases/genecards_suite/core/parser.py b/rnacentral_pipeline/databases/genecards_suite/core/parser.py index 49d75c6bd..2ae12eace 100644 --- a/rnacentral_pipeline/databases/genecards_suite/core/parser.py +++ b/rnacentral_pipeline/databases/genecards_suite/core/parser.py @@ -30,7 +30,7 @@ def as_entry(context: Context, row, matching: KnownSequence) -> data.Entry: accession=helpers.accession(context, row), ncbi_tax_id=helpers.taxid(context, row), database=context.database, - sequence=matching.sequence, + sequence=matching.sequence.upper().replace('U', 'T'), regions=[], rna_type=matching.rna_type, url=context.url(row), diff --git a/rnacentral_pipeline/databases/helpers/phylogeny.py b/rnacentral_pipeline/databases/helpers/phylogeny.py index a4eb6b400..c3d125fcb 100644 --- a/rnacentral_pipeline/databases/helpers/phylogeny.py +++ b/rnacentral_pipeline/databases/helpers/phylogeny.py @@ -23,6 +23,8 @@ TAX_URL = "https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/tax-id/{taxon_id}" +SPECIES_URL = "https://www.ebi.ac.uk/ena/taxonomy/rest/any-name/{species}" + LOGGER = logging.getLogger(__name__) @@ -117,3 +119,36 @@ def division(taxon_id: int) -> str: data = phylogeny(taxon_id) return data["division"] + +@lru_cache +def taxid(species: str) -> int: + """ + Get the taxid for a given species + Re-use request logic from phylogeny, but this uses a different endpoint, so + can't directly reuse + """ + + for count in range(10): + response = requests.get(SPECIES_URL.format(species=species)) + try: + response.raise_for_status() + data = response.json() + break + except simplejson.errors.JSONDecodeError: + sleep(0.15 * (count + 1) ** 2) + continue + except requests.HTTPError as err: + if response.status_code == 500: + sleep(0.15 * (count 
+ 1) ** 2) + continue + elif response.status_code == 404: + raise UnknownTaxonId(species) + else: + LOGGER.exception(err) + raise FailedTaxonId("Unknown error") + else: + raise FailedTaxonId("Could not get taxon id for %s" % species) + + if not data: + raise FailedTaxonId("Somehow got no data") + return int(data[0]["taxId"]) diff --git a/rnacentral_pipeline/databases/lncbook/parser.py b/rnacentral_pipeline/databases/lncbook/parser.py index 97997db2d..688af5d55 100644 --- a/rnacentral_pipeline/databases/lncbook/parser.py +++ b/rnacentral_pipeline/databases/lncbook/parser.py @@ -15,6 +15,7 @@ import json from io import StringIO +import attr from rnacentral_pipeline.databases.generic import parser as generic @@ -23,12 +24,13 @@ def parse(handle): raw = json.load(handle) data = [] for ncrna in raw["data"]: - regions = ncrna["genomeLocactions"] + regions = ncrna["genomeLocations"] regions = filter(lambda r: r["assembly"] == "GRCh38", regions) regions = list(regions) if not regions: continue - ncrna["genomeLocactions"] = regions + ncrna["genomeLocations"] = regions + ncrna["sequence"] = ncrna["sequence"].upper() data.append(ncrna) if not data: raise ValueError("All ncRNA are not from GRCh38, failing") diff --git a/rnacentral_pipeline/databases/pdb/data.py b/rnacentral_pipeline/databases/pdb/data.py index 73bbd3280..62b9407f0 100644 --- a/rnacentral_pipeline/databases/pdb/data.py +++ b/rnacentral_pipeline/databases/pdb/data.py @@ -38,18 +38,18 @@ def first_or_none(value): @attr.s() class ChainInfo: - pdb_id = attr.ib(validator=is_a(str)) - chain_id = attr.ib(validator=is_a(str)) - release_date = attr.ib(validator=is_a(dt.datetime)) - experimental_method = attr.ib(validator=optional(is_a(str))) - entity_id = attr.ib(validator=is_a(int)) + pdb_id: str = attr.ib(validator=is_a(str)) + chain_id: str = attr.ib(validator=is_a(str)) + release_date: dt.datetime = attr.ib(validator=is_a(dt.datetime)) + experimental_method: ty.Optional[str] = attr.ib(validator=optional(is_a(str))) + entity_id: int = attr.ib(validator=is_a(int)) taxids: ty.List[int] = attr.ib(validator=is_a(list)) - resolution = attr.ib(validator=optional(is_a(float))) - title = attr.ib(validator=is_a(str)) - sequence = attr.ib(validator=is_a(str)) + resolution: ty.Optional[float] = attr.ib(validator=optional(is_a(float))) + title: str = attr.ib(validator=is_a(str)) + sequence: str = attr.ib(validator=is_a(str)) molecule_names: ty.List[str] = attr.ib(validator=is_a(list)) - molecule_type = attr.ib(validator=optional(is_a(str))) - organism_scientific_name = attr.ib(validator=optional(is_a(str))) + molecule_type: ty.Optional[str] = attr.ib(validator=optional(is_a(str))) + organism_scientific_name: ty.Optional[str] = attr.ib(validator=optional(is_a(str))) @classmethod def build(cls, chain_index, raw) -> ChainInfo: @@ -64,13 +64,16 @@ def build(cls, chain_index, raw) -> ChainInfo: resolution=raw.get("resolution"), title=raw["title"], sequence=raw["molecule_sequence"], - molecule_names=raw.get("molecule_name", []), + molecule_names=raw.get("molecule_name", raw.get("rfam_id", [])), molecule_type=raw.get("molecule_type", None), organism_scientific_name=first_or_none( raw.get("organism_scientific_name", []) ), ) + def override_key(self) -> ty.Tuple[str, str]: + return (self.pdb_id.lower(), self.chain_id) + def accession(self) -> str: return f"{self.pdb_id.upper()}_{self.chain_id}_{self.entity_id}" diff --git a/rnacentral_pipeline/databases/pdb/fetch.py b/rnacentral_pipeline/databases/pdb/fetch.py index aca60bdd3..8f8e5a467 100644 ---
a/rnacentral_pipeline/databases/pdb/fetch.py
+++ b/rnacentral_pipeline/databases/pdb/fetch.py
@@ -19,15 +19,14 @@
 import logging
 import typing as ty
 
-from furl import furl
 import requests
-from retry import retry
+from furl import furl
 from more_itertools import chunked
 from ratelimiter import RateLimiter
+from retry import retry
 
-from rnacentral_pipeline.databases.pdb.data import ChainInfo
-from rnacentral_pipeline.databases.pdb.data import ReferenceMapping
 from rnacentral_pipeline.databases.pdb import helpers
+from rnacentral_pipeline.databases.pdb.data import ChainInfo, ReferenceMapping
 
 LOGGER = logging.getLogger(__name__)
 
@@ -45,6 +44,7 @@
     "molecule_name",
     "molecule_type",
     "organism_scientific_name",
+    "rfam_id",
 }
 
 PDBE_SEARCH_URL = "https://www.ebi.ac.uk/pdbe/search/pdb/select"
@@ -83,34 +83,88 @@ def fetch_range(query: str, start: int, rows: int) -> ty.Iterator[ChainInfo]:
         raise MissingPdbs(f"Missing for '{query}', {start}")
     for raw in data["response"]["docs"]:
         for index in range(len(raw["chain_id"])):
-            info = ChainInfo.build(index, raw)
-            if info.molecule_type and "RNA" in info.molecule_type:
-                yield info
+            yield ChainInfo.build(index, raw)
+
+
+@retry((requests.HTTPError, MissingPdbs), tries=5, delay=1)
+def all_chains_in_pdbs(
+    pdb_ids: ty.List[str], query_size=1000
+) -> ty.Iterable[ChainInfo]:
+    """
+    Get all chains from all given PDB ids. This does no filtering to chains that
+    may be RNA or not and simply fetches everything.
+    """
+
+    LOGGER.info("Fetching all chains in requested structures")
+    query = " OR ".join([f"pdb_id:{p.lower()}" for p in pdb_ids])
+
+    total = get_pdbe_count(query)
+    limiter = RateLimiter(max_calls=10, period=1)
+    for start in range(0, total, query_size):
+        with limiter:
+            for chain in fetch_range(query, start, query_size):
+                yield chain
+
+
+@retry((requests.HTTPError, MissingPdbs), tries=5, delay=1)
+def chains(required: ty.Set[ty.Tuple[str, str]], query_size=1000) -> ty.List[ChainInfo]:
+    """
+    Fetch exactly the requested set of (pdb_id, chain_id) pairs, pulling every
+    chain in the parent structures and keeping only the requested ones.
+    """
+
+    LOGGER.info("Fetching requested chains")
+
+    seen = set()
+    chains = []
+    pdb_ids = [r[0] for r in required]
+    for chain in all_chains_in_pdbs(pdb_ids):
+        key = chain.override_key()
+        if key not in required:
+            continue
+        seen.add(key)
+        chains.append(chain)
+
+    if seen != required:
+        missed = required - seen
+        raise ValueError("Did not find all requested ids: %s" % missed)
+    return chains
 
 
 @retry((requests.HTTPError, MissingPdbs), tries=5, delay=1)
 def rna_chains(
-    pdb_ids: ty.Optional[ty.List[str]] = None, query_size=1000
+    required: ty.Set[ty.Tuple[str, str]], query_size=1000
 ) -> ty.List[ChainInfo]:
     """
     Get PDB ids of all RNA-containing 3D structures using the RCSB PDB REST API.
     """
+    LOGGER.info("Fetching all RNA containing chains")
 
     query = "number_of_RNA_chains:[1 TO *]"
-    if pdb_ids:
-        id_query = " OR ".join([f"pdb_id:{p.lower()}" for p in pdb_ids])
-        query = f"{query} AND ({id_query})"
-
     rna_chains: ty.List[ChainInfo] = []
     total = get_pdbe_count(query)
+    seen = set()
    limiter = RateLimiter(max_calls=10, period=1)
     for start in range(0, total, query_size):
         with limiter:
-            rna_chains.extend(fetch_range(query, start, query_size))
+            for chain in fetch_range(query, start, query_size):
+                key = chain.override_key()
+                if (
+                    chain.molecule_type and "RNA" in chain.molecule_type
+                ) or key in required:
+                    rna_chains.append(chain)
+                    seen.add(key)
+
+    # Required chains may be missed here if the PDB does not label any of its chains
+    # as RNA. Rfam does match some DNA chains so we allow them into RNAcentral.
+    missed = required - seen
+    if missed:
+        LOGGER.info("Missed some chains, we'll fetch them manually")
+        rna_chains.extend(chains(missed))
 
-    # Must be >= as sometimes more than one chain is in a single document
     assert rna_chains, "Found no RNA chains"
+    LOGGER.info("Found %i RNA containing chains", len(rna_chains))
     return rna_chains
diff --git a/rnacentral_pipeline/databases/pdb/helpers.py b/rnacentral_pipeline/databases/pdb/helpers.py
index d17d8a39e..a7434ab03 100644
--- a/rnacentral_pipeline/databases/pdb/helpers.py
+++ b/rnacentral_pipeline/databases/pdb/helpers.py
@@ -13,16 +13,15 @@
 limitations under the License.
 """
 
+import csv
+import logging
 import re
 import typing as ty
-import logging
 
+from rnacentral_pipeline.databases.data import AnyReference, Reference
 from rnacentral_pipeline.databases.helpers import phylogeny as phy
 from rnacentral_pipeline.databases.helpers import publications as pubs
-from rnacentral_pipeline.databases.data import Reference
-from rnacentral_pipeline.databases.data import AnyReference
-from rnacentral_pipeline.databases.pdb.data import ChainInfo
-from rnacentral_pipeline.databases.pdb.data import ReferenceMapping
+from rnacentral_pipeline.databases.pdb.data import ChainInfo, ReferenceMapping
 
 RIBOSOMES = set(
     [
@@ -36,6 +35,7 @@
         "40S",
         "60S",
         "80S",
+        "LSU",
     ]
 )
 
@@ -58,6 +58,14 @@ class MissingProduct(Exception):
     pass
 
 
+class MissingTypeInfo(Exception):
+    """
+    Raised when the chain molecule_names field is empty and we can't infer type
+    """
+
+    pass
+
+
 def is_mrna(chain: ChainInfo) -> bool:
     mrna_names = [
         "mRNA",
@@ -182,7 +190,10 @@ def compound_rna_type(compound: str) -> str:
 
 def rna_type(info: ChainInfo) -> str:
     if not info.molecule_names:
-        raise ValueError(f"Cannot find RNA type for {info}")
+        # molecule_names is empty, so there is nothing to infer an RNA type from
+        raise MissingTypeInfo(
+            f"Cannot find RNA type for {info}"
+        )
     return compound_rna_type(info.molecule_names[0])
 
 
@@ -235,3 +246,15 @@ def lineage(info: ChainInfo) -> str:
 
 def species(info: ChainInfo) -> str:
     return phy.species(taxid(info))
+
+
+def load_overrides(handle) -> ty.Set[ty.Tuple[str, str]]:
+    """
+    Parse a TSV file of pdb_id and chain into a set of (pdb_id, chain). The PDB
+    id will be lowercased. This is used to ensure all sequences with an Rfam
+    match are loaded into the pipeline.
+    """
+    overrides = set()
+    for row in csv.reader(handle, delimiter="\t"):
+        overrides.add((row[0].lower(), row[1]))
+    return overrides
diff --git a/rnacentral_pipeline/databases/pdb/parser.py b/rnacentral_pipeline/databases/pdb/parser.py
index f45917a5d..3f5bffe5f 100644
--- a/rnacentral_pipeline/databases/pdb/parser.py
+++ b/rnacentral_pipeline/databases/pdb/parser.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 """
-Copyright [2009-2017] EMBL-European Bioinformatics Institute
+Copyright [2009-2022] EMBL-European Bioinformatics Institute
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
You may obtain a copy of the License at @@ -17,10 +17,8 @@ import typing as ty from rnacentral_pipeline.databases import data - -from rnacentral_pipeline.databases.pdb.data import ChainInfo -from rnacentral_pipeline.databases.pdb.data import ReferenceMapping from rnacentral_pipeline.databases.pdb import helpers +from rnacentral_pipeline.databases.pdb.data import ChainInfo, ReferenceMapping LOGGER = logging.getLogger(__name__) @@ -50,22 +48,36 @@ def as_entry(info: ChainInfo, reference_mapping: ReferenceMapping): def parse( rna_chains: ty.List[ChainInfo], reference_mapping: ReferenceMapping, + override_list: ty.Set[ty.Tuple[str, str]], ) -> ty.Iterator[data.Entry]: disqualified = {"mRNA": 0, "other": 0} + seen: ty.Set[ty.Tuple[str, str]] = set() for chain in rna_chains: - if helpers.is_mrna(chain): - LOGGER.debug("Disqualifing %s", chain) - disqualified["mRNA"] += 1 - continue + override_key = chain.override_key() + if override_key in override_list: + LOGGER.debug("Overriding %s, %s", chain.pdb_id, chain.chain_id) + seen.add(override_key) + else: + if helpers.is_mrna(chain): + LOGGER.debug("Disqualifing %s", chain) + disqualified["mRNA"] += 1 + continue - if not helpers.is_ncrna(chain): - LOGGER.debug("Skipping %s", chain) - disqualified["other"] += 1 - continue + if not helpers.is_ncrna(chain): + LOGGER.debug("Skipping %s", chain) + disqualified["other"] += 1 + continue try: yield as_entry(chain, reference_mapping) except helpers.InvalidSequence: LOGGER.warn(f"Invalid sequence for {chain}") + except helpers.MissingTypeInfo: + LOGGER.warn(f"Missing type info for {chain}") + + missing = override_list - seen LOGGER.info("Disqualified %i mRNA chains", disqualified["mRNA"]) LOGGER.info("Disqualified %i non ncRNA chains", disqualified["other"]) + LOGGER.info("Did not load %s overrided chains", missing) + if missing: + raise ValueError("Missed some required ids %s" % missing) diff --git a/rnacentral_pipeline/databases/plncdb/__init__.py b/rnacentral_pipeline/databases/plncdb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnacentral_pipeline/databases/plncdb/parser.py b/rnacentral_pipeline/databases/plncdb/parser.py new file mode 100644 index 000000000..7be36f964 --- /dev/null +++ b/rnacentral_pipeline/databases/plncdb/parser.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2020] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from rnacentral_pipeline.databases.data import Entry, Exon, SequenceRegion +from rnacentral_pipeline.databases.helpers import phylogeny as phy + +import gffutils +from Bio import SeqIO +from Bio.SeqFeature import SeqFeature, FeatureLocation +import pathlib +import typing as ty +import os + +import pandas as pd + +from tqdm import tqdm + +def _find_gff_file(search_path: pathlib.Path) -> pathlib.Path: + """ + Find the right gff3 file for use with the fasta in parsing alongside the + info file + """ + for file in search_path.iterdir(): + if file.suffix.strip() == ".gff3" and "PLncDB" in file.stem: + return file + +def _find_fasta_file(search_path: pathlib.Path) -> pathlib.Path: + """ + Find the fasta file that corresponds with the gff file and info file + """ + for file in search_path.iterdir(): + if file.suffix.strip() == ".fa" and "chromosome" in file.stem: + return file + +def _find_info_file(search_path: pathlib.Path) -> pathlib.Path: + """ + Find the corresponding info filefor this fasta and gff + """ + for file in search_path.iterdir(): + if file.suffix.strip() == ".txt" and "lncRNA" in file.stem: + return file + +def _generate_description(taxid: int, gene_name: str) -> str: + info = phy.phylogeny(taxid) + scientificName = info['scientificName'] + shortName = scientificName[0] + f". {scientificName.split()[1]}" + description = ( + f"{info['scientificName']} ({info.get('commonName', shortName)}) " + f"long non-coding RNA ({gene_name})" + ) + return description + +def parse(data:pathlib.Path) -> ty.Iterable[Entry]: + """ + Parse a directory of data from PLncDB into entries for import. Expects the + directory to contain one directory per species which is derived from the + FTP download that has been decompressed. + + We read the gff3, fasta, and associated info file to construct the entry + """ + + + ## Set some things which will be common for all entries + rna_type = "SO:0001877" + database = "PLNCDB" + + url = "https://www.tobaccodb.org/plncdb/nunMir?plncdb_id={}" + + + ## loop on all directories in the data directory + gff_file = _find_gff_file(data) + fasta_file = _find_fasta_file(data) + info_file = _find_info_file(data) + + # Load the GFF file into the gffutils database for working with + gff_db = gffutils.create_db(str(gff_file), ":memory:") + + ## Load the FASTA file as well + fasta_db = SeqIO.to_dict(SeqIO.parse(str(fasta_file), 'fasta')) + + ## Finally, load the info file using pandas + species_info = pd.read_csv(info_file, delimiter='\t') + species_info["Species"] = species_info["Species"].apply(lambda x: x.replace("_", " ")) + species_info["taxid"] = species_info["Species"].apply(phy.taxid) + + + total_entries = len(gff_db.execute("select DISTINCT(id) from features where featuretype = 'transcript' ").fetchall()) + entries = [] + for gene_id_q in tqdm(gff_db.execute("select id from features"), total=total_entries): + primary_id = gene_id_q["id"] + + gene_info = species_info[species_info["lncRNA_ID"] == primary_id] + if len(gene_info) == 0: + break + + + + taxid = gene_info["taxid"].values[0] + + chromosome = fasta_db[gff_db[primary_id].seqid] ##Hopefully gets the right chromosome? 
+ + features = list(gff_db.children(primary_id)) + ##TODO: check coordinate system + exons = [Exon(start=e.start, stop=e.stop) for e in features] + seq_start = min([e.start for e in features]) + seq_end = max([e.end for e in features]) + whole_feature = SeqFeature(FeatureLocation(seq_start, seq_end)) + + sequence = whole_feature.extract(chromosome) + + region = SequenceRegion( + chromosome = features[0].chrom, + strand = features[0].strand, + exons = exons, + assembly_id = gene_info['Ref_Genome_Vers'], + coordinate_system = "1-start, fully-closed" + ) + + entries.append( + Entry( + primary_id=primary_id, + accession=primary_id, + ncbi_tax_id=int(taxid), + species=species_info["Species"][0], + database=database, + sequence=sequence.seq.upper(), + regions=[region], + rna_type=rna_type, + url=url.format(primary_id), + seq_version="1", + # optional_id=optional_id(record, context), + description=_generate_description(int(taxid), gene_info["Gene_ID"].values[0]), + # note_data=note_data(record), + # xref_data=xrefs(record), + # related_sequences=related_sequences(record), + # secondary_structure=secondary_structure(record), + # references=references(record), + # organelle=record.get("localization", None), + # product=record.get("product", None), + # anticodon=anticodon(record), + gene=gene_info["Gene_ID"].values[0], + # gene_synonyms=gene_synonyms(record), + # locus_tag=locus_tag(record), + # features=features(record), + ) + ) + + + + return entries diff --git a/rnacentral_pipeline/databases/rfam/helpers.py b/rnacentral_pipeline/databases/rfam/helpers.py index ac23053bc..6f6f790b1 100644 --- a/rnacentral_pipeline/databases/rfam/helpers.py +++ b/rnacentral_pipeline/databases/rfam/helpers.py @@ -14,13 +14,12 @@ """ import collections as coll +import logging import re import typing as ty -import logging - -from rnacentral_pipeline.databases.helpers.publications import reference from rnacentral_pipeline.databases.data import IdReference +from rnacentral_pipeline.databases.helpers.publications import reference LOGGER = logging.getLogger(__name__) @@ -54,10 +53,10 @@ def seq_version(data: ty.Dict[str, str]) -> str: def rna_type(family: ty.Dict[str, str]) -> str: so_terms = family["so_terms"] - if ',' in so_terms: - so_terms = so_terms.split(',')[0] - if ',' in so_terms: - so_terms = so_terms.split(',')[0] + if "," in so_terms: + so_terms = so_terms.split(",")[0] + if "," in so_terms: + so_terms = so_terms.split(",")[0] assert re.match(r"^SO:\d+$", so_terms) return so_terms @@ -90,7 +89,7 @@ def note(data: ty.Dict[str, str]): result = coll.defaultdict(list) result["Alignment"] = data["sequence_type"] for xref in data["dbxrefs"].split(","): - db, _ = xref.split(":") + db, _ = xref.split(":", 1) result[db].append(xref) return result diff --git a/rnacentral_pipeline/databases/zfin/fetch.py b/rnacentral_pipeline/databases/zfin/fetch.py index 797f44bc6..42f605639 100644 --- a/rnacentral_pipeline/databases/zfin/fetch.py +++ b/rnacentral_pipeline/databases/zfin/fetch.py @@ -20,8 +20,7 @@ def fetch(url): - with closing(request.urlopen(url)) as compressed: - with gzip.GzipFile(None, "rb", 9, compressed) as raw: + with closing(request.urlopen(url)) as raw: data = json.load(raw) # Fix weird PMID formatting diff --git a/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py b/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py index 6d4bd9786..20436bb3a 100644 --- a/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py +++ b/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py @@ -13,15 +13,14 @@ limitations under 
the License. """ -import re import json import operator as op +import re from jsonschema import validate from rnacentral_pipeline import psql - MOD_URL = "http://modomics.genesilico.pl/sequences/list/{id}" @@ -37,6 +36,13 @@ ] ) +DISALLOWED_TYPES = set( + [ + "circRNA", + "sgRNA", + ] +) + SEQUENCE_PATTERN = re.compile("^[ACGTYRWSKMDVHBNXFI]+$") @@ -50,6 +56,9 @@ def external_id(data): def is_high_quality(data): name = data["database"].lower() + ## Do not send some RNAs to ensembl + if data["rna_type"] in DISALLOWED_TYPES: + return False if name in TRUSTED_DB: return True if name == "rfam": diff --git a/rnacentral_pipeline/rnacentral/notify/__init__.py b/rnacentral_pipeline/rnacentral/notify/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnacentral_pipeline/rnacentral/notify/slack.py b/rnacentral_pipeline/rnacentral/notify/slack.py new file mode 100644 index 000000000..52891b05d --- /dev/null +++ b/rnacentral_pipeline/rnacentral/notify/slack.py @@ -0,0 +1,110 @@ +""" +Send a notification to slack. + +NB: The webhook should be configured in the nextflow profile + +""" + +import os + +import psycopg2 +import requests +from slack_sdk import WebClient +from slack_sdk.errors import SlackApiError + +REPORT_QUERY = """ +SELECT display_name, count(taxid) FROM xref +JOIN rnc_database db ON xref.dbid = db.id +WHERE xref.deleted = 'N' +AND EXTRACT (DAY FROM (CURRENT_TIMESTAMP - xref.timestamp)) < 7 +GROUP BY display_name +ORDER BY display_name +""" + + +def send_notification(title, message): + """ + Send a notification to the configured slack webhook. + """ + SLACK_WEBHOOK = os.getenv("SLACK_CLIENT_TOKEN") + if SLACK_WEBHOOK is None: + raise SystemExit("SLACK_CLIENT_TOKEN environment variable not defined") + + client_token = os.getenv("SLACK_CLIENT_TOKEN") + channel = os.getenv("SLACK_CHANNEL") + + client = WebClient(token=client_token) + + blocks = [ + { + "type": "section", + "text": {"type": "mrkdwn", "text": message}, + }, + ] + try: + response = client.chat_postMessage(channel=channel, text=title, blocks=blocks) + + print(response) + except SlackApiError as e: + assert e.response["error"] + + +def pipeline_report(): + """ + Generates a nicely formatted report of the number of sequences imported from + each DB. This uses the slack_sdk, rather than a webhook, and uses the + blockkit to format the message nicely. + + TODO: What else should go in this? Maybe parsing the log file to get the + run duration? 
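+
+    This relies on a few environment variables (the names are simply those used
+    in the os.getenv calls below, listed here for convenience):
+      PGDATABASE          connection string used for the per-database xref counts
+      SLACK_CLIENT_TOKEN  bot token handed to slack_sdk's WebClient
+      SLACK_CHANNEL       channel the completion report is posted to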
+ """ + db_url = os.getenv("PGDATABASE") + client_token = os.getenv("SLACK_CLIENT_TOKEN") + channel = os.getenv("SLACK_CHANNEL") + + client = WebClient(token=client_token) + + lock_text_template = "New sequences from *{0}* {1:,}" + + summary_blocks = [ + { + "type": "header", + "text": {"type": "plain_text", "text": "Workflow Completion report"}, + }, + {"type": "divider"}, + ] + running_total = 0 + with psycopg2.connect(db_url) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + cur.execute(REPORT_QUERY) + res = cur.fetchall() + for r in res: + running_total += r[1] + summary_blocks.append( + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": block_text_template.format(r[0].ljust(30), r[1]), + }, + } + ) + summary_blocks.append({"type": "divider"}) + summary_blocks.append( + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"Total sequences imported: *{running_total:,}*", + }, + } + ) + + try: + response = client.chat_postMessage( + channel=channel, text="Workflow completion report", blocks=summary_blocks + ) + + print(response) + except SlackApiError as e: + assert e.response["error"] diff --git a/rnacentral_pipeline/rnacentral/precompute/data/sequence.py b/rnacentral_pipeline/rnacentral/precompute/data/sequence.py index 427635b09..a53b50149 100644 --- a/rnacentral_pipeline/rnacentral/precompute/data/sequence.py +++ b/rnacentral_pipeline/rnacentral/precompute/data/sequence.py @@ -175,9 +175,7 @@ def species(self) -> ty.Set[str]: for accession in self.accessions: if not accession.species: continue - for species in accession.species: - if species: - all_species.add(species) + all_species.add(accession.species) return all_species def domains(self) -> ty.Set[str]: diff --git a/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py b/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py index 4feae62c9..f47221732 100644 --- a/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py +++ b/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py @@ -23,13 +23,12 @@ from rnacentral_pipeline.databases.data import Database, RnaType from rnacentral_pipeline.databases.sequence_ontology import tree +from rnacentral_pipeline.rnacentral.precompute import utils from rnacentral_pipeline.rnacentral.precompute.data import context from rnacentral_pipeline.rnacentral.precompute.data import sequence as seq from rnacentral_pipeline.rnacentral.precompute.data.accession import Accession from rnacentral_pipeline.rnacentral.precompute.qa import contamination as cont -from rnacentral_pipeline.rnacentral.precompute import utils - LOGGER = logging.getLogger(__name__) @@ -48,6 +47,7 @@ Database.rgd, Database.zfin, Database.mirgenedb, + Database.plncdb, Database.lncipedia, Database.lncrnadb, Database.lncbook, @@ -63,6 +63,7 @@ Database.genecards, Database.malacards, Database.intact, + Database.expression_atlas, Database.rfam, Database.tarbase, Database.lncbase, diff --git a/rnacentral_pipeline/rnacentral/release/database_stats.py b/rnacentral_pipeline/rnacentral/release/database_stats.py index 146d2d72d..eda378495 100644 --- a/rnacentral_pipeline/rnacentral/release/database_stats.py +++ b/rnacentral_pipeline/rnacentral/release/database_stats.py @@ -159,7 +159,9 @@ def lengths(conn, db_id: int) -> ty.Dict[str, ty.Any]: .on(xref.upi == rna.upi) ) cursor.execute(str(query)) - return dict(cursor.fetchone()) + r = {k:v if v is not None else 0 for k,v in dict(cursor.fetchone()).items()} + + 
return r def count_sequences(conn, db_id: int) -> int: diff --git a/select_databases.nf b/select_databases.nf new file mode 100644 index 000000000..baa06bff7 --- /dev/null +++ b/select_databases.nf @@ -0,0 +1,12 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +include { select } from './workflows/databases/select.nf' + + +workflow { + + select() + +} diff --git a/tests/databases/pdb/fetch_test.py b/tests/databases/pdb/fetch_test.py index b3cadcbf8..c96b2cab0 100644 --- a/tests/databases/pdb/fetch_test.py +++ b/tests/databases/pdb/fetch_test.py @@ -20,34 +20,11 @@ from rnacentral_pipeline.databases.pdb import fetch -@pytest.fixture(scope="module") -def chain_info(): - return fetch.rna_chains() - - -@pytest.fixture(scope="module") -def chain_map(chain_info): - info = {} - for chain in chain_info: - info[(chain.pdb_id, chain.chain_id)] = chain - return info - - -@pytest.mark.skip() -def test_can_get_all_pdbs(chain_info): - assert len(fetch.rna_chains()) >= 14686 - - -@pytest.mark.skip() -def test_contains_no_duplicate_chains(chain_info, chain_map): - assert len(chain_info) == len(chain_map) - - @pytest.mark.skip() def test_produces_correct_data(): - chains = fetch.rna_chains(pdb_ids=["1s72"]) - chain = next(c for c in chains if c.chain_id == "9") - assert chain == fetch.ChainInfo( + chains = fetch.chains({("1S72", "9")}) + assert len(chains) == 1 + assert chains[0] == fetch.ChainInfo( pdb_id="1s72", chain_id="9", release_date=dt.datetime(2004, 6, 15, hour=1), @@ -109,5 +86,5 @@ def test_produces_correct_data(): ], ) def test_fetches_all_rna_chains_even_mrna(pdb_id, chains): - entries = fetch.rna_chains(pdb_ids=[pdb_id]) - assert set(d.chain_id for d in entries) == chains + entries = fetch.all_chains_in_pdbs([pdb_id]) + assert set(d.chain_id for d in entries) & chains diff --git a/tests/databases/pdb/helpers_test.py b/tests/databases/pdb/helpers_test.py index 54f9fc558..6570d22de 100644 --- a/tests/databases/pdb/helpers_test.py +++ b/tests/databases/pdb/helpers_test.py @@ -15,14 +15,14 @@ import pytest +from rnacentral_pipeline.databases.pdb import fetch, helpers from rnacentral_pipeline.databases.pdb.data import ChainInfo -from rnacentral_pipeline.databases.pdb import helpers -from rnacentral_pipeline.databases.pdb import fetch def load(pdb_id: str, chain_id: str) -> ChainInfo: - chains = fetch.rna_chains(pdb_ids=[pdb_id.lower()]) - return next(c for c in chains if c.chain_id == chain_id) + chains = fetch.chains({(pdb_id, chain_id)}) + assert len(chains) == 1 + return chains[0] @pytest.mark.parametrize( @@ -66,18 +66,18 @@ def test_can_compute_correct_rna_types(product: str, expected): [ ("7mky", "A", True), ("7lyj", "A", True), - ("5U3G", "B", True), - ("2L1V", "A", True), - ("6VAR", "A", True), - ("4Y1I", "A", True), - ("4Y1I", "B", True), - ("4Y1J", "A", True), - ("4Y1J", "B", True), - ("4Y1M", "A", True), - ("4Y1M", "B", True), - ("7MKY", "A", True), - ("7LYJ", "A", True), - ("7MLW", "F", True), + ("5u3g", "B", True), + ("2l1v", "A", True), + ("6var", "A", True), + ("4y1i", "A", True), + ("4y1i", "B", True), + ("4y1j", "A", True), + ("4y1j", "B", True), + ("4y1m", "A", True), + ("4y1m", "B", True), + ("7mky", "A", True), + ("7lyj", "A", True), + ("7mlw", "F", True), ], ) def test_can_detect_if_is_ncrna(pdb, chain, expected): diff --git a/tests/databases/pdb/parser_test.py b/tests/databases/pdb/parser_test.py index 8bae184ea..98e35b2ca 100644 --- a/tests/databases/pdb/parser_test.py +++ b/tests/databases/pdb/parser_test.py @@ -17,13 +17,12 @@ import pytest from 
rnacentral_pipeline.databases import data -from rnacentral_pipeline.databases.pdb import parser -from rnacentral_pipeline.databases.pdb import fetch from rnacentral_pipeline.databases.helpers import publications as pubs +from rnacentral_pipeline.databases.pdb import fetch, parser def load(pdb_id: str, chain_id: str) -> data.Entry: - chains = fetch.rna_chains(pdb_ids=[pdb_id.lower()]) + chains = fetch.chains({(pdb_id.lower(), chain_id)}) chain_info = next(c for c in chains if c.chain_id == chain_id) references = fetch.references([chain_info]) return parser.as_entry(chain_info, references) @@ -103,33 +102,59 @@ def test_can_build_correct_entry_for_srp_rna(): ) +@pytest.mark.skip("Needs to be reworked") @pytest.mark.parametrize( "pdb_id,expected", [ ("157d", [32630, 32630]), ("1a1t", [32630]), - ("1j5e", [274]), ], ) def test_can_get_given_taxid(pdb_id, expected): chains = fetch.rna_chains(pdb_ids=[pdb_id]) - taxids = [entry.ncbi_tax_id for entry in parser.parse(chains, {})] + taxids = [entry.ncbi_tax_id for entry in parser.parse(chains, {}, set())] assert taxids == expected @pytest.mark.parametrize( - "pdb_id,missing", + "requested,missing", [ - ("5wnt", "5WNT_U_21"), - ("5wnp", "5WNP_U_21"), + (("5wnt", "A"), ("5WNT", "U")), + (("5wnp", "A"), ("5WNP", "U")), ], ) -def test_will_not_fetch_mislabeled_chains(pdb_id, missing): - chains = fetch.rna_chains(pdb_ids=[pdb_id]) - entries = {e.primary_id for e in parser.parse(chains, {})} +def test_will_not_fetch_mislabeled_chains(requested, missing): + chains = fetch.chains({requested}) + entries = {(e.primary_id, e.optional_id) for e in parser.parse(chains, {}, set())} assert missing not in entries +@pytest.mark.parametrize( + "overrides,expected", + [ + ( + { + ("7umc", "A"), + }, + ("7UMC", "A"), + ), + ( + { + ("7umc", "A"), + }, + ("7UMC", "A"), + ), + ({("7mib", "H")}, ("7MIB", "H")), + ], +) +def test_will_respect_the_override_list(overrides, expected): + chains = fetch.chains(overrides) + entries = { + (e.primary_id, e.optional_id) for e in parser.parse(chains, {}, overrides) + } + assert expected in entries + + @pytest.mark.parametrize( "pdb_id,chains", [ @@ -173,8 +198,8 @@ def test_will_not_fetch_mislabeled_chains(pdb_id, missing): ], ) def test_extracts_expected_chains(pdb_id, chains): - fetched = fetch.rna_chains(pdb_ids=[pdb_id.lower()]) - entries = parser.parse(fetched, {}) + fetched = fetch.all_chains_in_pdbs([pdb_id]) + entries = parser.parse(list(fetched), {}, set()) assert set(d.optional_id for d in entries) == chains diff --git a/tests/rnacentral/genes/build_test.py b/tests/rnacentral/genes/build_test.py index a500eb802..bec1ebb01 100644 --- a/tests/rnacentral/genes/build_test.py +++ b/tests/rnacentral/genes/build_test.py @@ -19,6 +19,7 @@ import pytest import psycopg2 import yaml +from yaml import Loader from rnacentral_pipeline.rnacentral.genes import build @@ -92,7 +93,7 @@ def load_overlapping_regions(region_name): def load_examples(): with open("data/genes/examples.yaml", "r") as raw: - return yaml.load(raw) + return yaml.load(raw, Loader=Loader) @pytest.mark.parametrize("expected", load_examples()) diff --git a/tests/rnacentral/precompute/number_of_species.py b/tests/rnacentral/precompute/number_of_species.py new file mode 100644 index 000000000..ddea90f7d --- /dev/null +++ b/tests/rnacentral/precompute/number_of_species.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2022] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file 
except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from functools import lru_cache + +import attr +import pytest + +from rnacentral_pipeline.rnacentral.precompute.data.update import SequenceUpdate +from rnacentral_pipeline.rnacentral.precompute.data.context import Context +from rnacentral_pipeline.rnacentral.precompute import process + +from . import helpers + + +def load_data(upi): + context, sequence = helpers.load_data(upi) + return SequenceUpdate.from_sequence(context, sequence) + + + +@pytest.mark.parametrize( + "rna_id,number", + [ # pylint: disable=no-member + ( + "URS000001E7BA_559292", + 1, + ), + ] +) +def test_gets_correct_number_of_species(rna_id, number): + spec_set = load_data(rna_id).sequence.species() + assert number == len(spec_set) diff --git a/utils/expression-atlas/Cargo.toml b/utils/expression-atlas/Cargo.toml new file mode 100644 index 000000000..156e976db --- /dev/null +++ b/utils/expression-atlas/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "expression-parse" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +regex = "1.0" +anyhow = "1.0" +log = "0.4" +env_logger = "0.9.0" +multimap = "0.8.3" +clap = { version = "3.1.18", features = ["derive"] } +polars = { version = "0.21.1", features = ["lazy", "csv-file", "rows", "abs", "is_in", "strings", "concat_str", "list", "json"] } +quick-xml = { version = "0.22.0", features = ["serialize"] } +serde = { version = "1.0", features = [ "derive" ] } diff --git a/utils/expression-atlas/src/augment.rs b/utils/expression-atlas/src/augment.rs new file mode 100644 index 000000000..a4732c4d1 --- /dev/null +++ b/utils/expression-atlas/src/augment.rs @@ -0,0 +1,326 @@ +use crate::configuration::*; +/// This module will combine the config and dataframes to create a df we can extract all the +/// necessary information from. +use anyhow::Result; +use multimap::MultiMap; +use polars::frame::DataFrame; +use polars::prelude::*; + +use log::{info, warn}; + +/// This function will 'augment' the experiment dataframe with information from the config +/// What that means is adding location and factor information as series to the right of the +/// dataframe. I do this based on the assay group id (the gN in the expression data) which I +/// map to the assay name. The assay name is then mapped to the location and factor data in the +/// sdrf file, where I try to separate factors and locations. 
This also allows grabbing the +/// taxid for the experiment +pub fn augment_differential_df( + df: &mut DataFrame, + config: &Config, + sdrf: &DataFrame, +) -> Result { + // preprocess the config into a hash map for later convenience + info!("Parsing config into lookup MultiMap"); + let mut contrast_lookup: MultiMap = MultiMap::new(); + for analysis in &config.analytics { + for cont in &analysis.contrasts.as_ref().unwrap().contrast { + let test_group = cont.test_group.clone(); + + let mut assay_names = analysis.assay_groups.assay_group.clone(); + assay_names.retain(|ag| ag.id == test_group); + + contrast_lookup.insert( + test_group, + (format!("{}.log2foldchange", cont.id), format!("{}.p-value", cont.id)), + ); + } + } + + // Set up some dataframes for the things we want + let mut taxid_df = DataFrame::default(); + let mut localisation_df = DataFrame::default(); + let mut disease_df = DataFrame::default(); + let mut cell_type_df = DataFrame::default(); + + let mut df_result = DataFrame::default(); + + for analysis in &config.analytics { + for ass_group in &analysis.assay_groups.assay_group { + // Build the series of assay names for matching later + let mut assay_names = + Utf8ChunkedBuilder::new("assay_names", ass_group.assays.len(), 128); + for ass_nm in &ass_group.assays { + assay_names.append_value(ass_nm); + } + // Select out just the bits for this assay group from sdrf + let assay_df = sdrf + .clone() + .lazy() + .filter(col("assay_name").is_in(lit(assay_names.finish().into_series()))) + .with_column(lit(ass_group.id.as_str()).alias("group_id")); + + let mut df_inter = df.clone(); + + df_inter = df_inter.lazy().with_column(lit(NULL).alias("group_id")).collect()?; + + // If there is localisation data, this will select it + if localisation_df.height() == 0 { + localisation_df = get_localisation_data(assay_df.clone())?; + } else { + localisation_df.vstack_mut(&get_localisation_data(assay_df.clone())?)?; + } + + // If there is disease data, this will select it + if disease_df.height() == 0 { + disease_df = get_disease_data(assay_df.clone())?; + } else { + disease_df.vstack_mut(&get_disease_data(assay_df.clone())?)?; + } + + // If there is cell type data, this will select it + if cell_type_df.height() == 0 { + cell_type_df = get_cell_type_data(assay_df.clone())?; + } else { + cell_type_df.vstack_mut(&get_cell_type_data(assay_df.clone())?)?; + } + + // Get the taxonomy ontology reference. 
T + if taxid_df.height() == 0 { + taxid_df = get_taxonomy_data(assay_df.clone())?; + } else { + taxid_df.vstack_mut(&get_taxonomy_data(assay_df.clone())?)?; + } + + if contrast_lookup.get(&ass_group.id) == None { + continue; + } + + let check_cols_vec = contrast_lookup.get_vec(&ass_group.id).unwrap(); + + for check_cols in check_cols_vec.iter() { + // Check columns are actually present + if !&df_inter.get_column_names().contains(&check_cols.0.as_str()) + || !&df_inter.get_column_names().contains(&check_cols.1.as_str()) + { + warn!("Couldn't find either {} or {} in the DataFrame columns, skipping this contrast", check_cols.0, check_cols.1); + continue; + } + + df_inter = df_inter + .lazy() + .with_column( + when(col(&check_cols.0).and(col(&check_cols.1))) + .then(lit(ass_group.id.as_ref())) + .otherwise(col("group_id")) + .alias("group_id"), + ) + .filter(col(&check_cols.0).and(col(&check_cols.1))) + .collect()?; + + if df_result.height() == 0 { + df_result = df_inter.clone(); + } else { + df_result.vstack_mut(&df_inter)?; + } + } + } + } // closes loop on analyses + + // Use a df to join on selectively + df_result = + join_augmentations(&df_result, &taxid_df, &localisation_df, &disease_df, &cell_type_df)?; + + Ok(df_result) +} + +/// Augmentation function for baseline experiments +pub fn augment_baseline_df( + df: &mut DataFrame, + config: &Config, + sdrf: &DataFrame, +) -> Result { + // Set up some dataframes for the things we want + let mut taxid_df = DataFrame::default(); + let mut localisation_df = DataFrame::default(); + let mut disease_df = DataFrame::default(); + let mut cell_type_df = DataFrame::default(); + + let mut df_inter = df.clone(); + df_inter = df_inter.lazy().with_column(lit(NULL).alias("group_id")).collect()?; + + let mut df_result = DataFrame::default(); + + for analysis in &config.analytics { + for assay_group in &analysis.assay_groups.assay_group { + let assay_names = assay_group.assays.clone(); + let ass_group = assay_group.id.clone(); + + let mut assay_names_series = + Utf8ChunkedBuilder::new("assay_names", assay_names.len(), 128); + for ass_nm in &assay_names { + assay_names_series.append_value(ass_nm); + } + + let assay_df = sdrf + .clone() + .lazy() + .filter(col("assay_name").is_in(lit(assay_names_series.finish().into_series()))) + .with_column(lit(ass_group.as_str()).alias("group_id")); + + if localisation_df.height() == 0 { + localisation_df = get_localisation_data(assay_df.clone())?; + } else { + localisation_df.vstack_mut(&get_localisation_data(assay_df.clone())?)?; + } + + if disease_df.height() == 0 { + disease_df = get_disease_data(assay_df.clone())?; + } else { + disease_df.vstack_mut(&get_disease_data(assay_df.clone())?)?; + } + + if cell_type_df.height() == 0 { + cell_type_df = get_cell_type_data(assay_df.clone())?; + } else { + cell_type_df.vstack_mut(&get_cell_type_data(assay_df.clone())?)?; + } + + if taxid_df.height() == 0 { + taxid_df = get_taxonomy_data(assay_df.clone())?; + } else { + taxid_df.vstack_mut(&get_taxonomy_data(assay_df.clone())?)?; + } + + // Filter the experiment data according to how expresison atlas does it + df_inter = df_inter + .lazy() + .with_column( + when(col(ass_group.as_str())) + .then(lit(ass_group.as_ref())) + .otherwise(col("group_id")) + .alias("group_id"), + ) + .filter(col(ass_group.as_str())) + .collect()?; + + if df_result.height() == 0 { + df_result = df_inter.clone(); + } else { + df_result.vstack_mut(&df_inter)?; + } + } // close loop on assay groups + } //close loop on analyses + + df_result = + 
join_augmentations(&df_result, &taxid_df, &localisation_df, &disease_df, &cell_type_df)?; + + Ok(df_result) +} + +fn get_localisation_data(assay_df: LazyFrame) -> Result { + let localisation = assay_df + .filter( + ((col("feat_class").eq(lit("factor"))).or(col("feat_class").eq(lit("characteristic")))) + .and(col("feat_type").eq(lit("organism part"))), + ) + .select([col("group_id"), col("ontology").alias("location")]) + .first() + .collect()?; + + Ok(localisation) +} + +fn get_disease_data(assay_df: LazyFrame) -> Result { + let disease = assay_df + .filter( + ((col("feat_class").eq(lit("factor"))).or(col("feat_class").eq(lit("characteristic")))) + .and(col("feat_type").eq(lit("disease"))), + ) + .select([col("group_id"), col("ontology").alias("disease")]) + .first() + .collect()?; + + Ok(disease) +} + +fn get_cell_type_data(assay_df: LazyFrame) -> Result { + let cell_type = assay_df + .filter( + col("feat_class").eq(lit("characteristic")).and(col("feat_type").eq(lit("cell type"))), + ) + .select([col("group_id"), col("ontology").alias("cell_type")]) + .first() + .collect()?; + + Ok(cell_type) +} + +fn get_taxonomy_data(assay_df: LazyFrame) -> Result { + let tax_data = assay_df + .filter(col("feat_type").eq(lit("organism"))) + .select([col("group_id"), col("ontology").alias("taxonomy")]) + .first() + .collect()?; + + Ok(tax_data) +} + +fn join_augmentations( + df_result_bare: &DataFrame, + taxid_df: &DataFrame, + localisation_df: &DataFrame, + disease_df: &DataFrame, + cell_type_df: &DataFrame, +) -> Result { + // Use a df to join on selectively + let mut df_result = df_result_bare.clone(); + if taxid_df.height() > 0 { + df_result = df_result.join(taxid_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("taxonomy")) + .collect()?; + } + + if localisation_df.height() > 0 { + df_result = + df_result.join(localisation_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("location")) + .collect()?; + } + + if disease_df.height() > 0 { + df_result = + df_result.join(disease_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("disease")) + .collect()?; + } + + if cell_type_df.height() > 0 { + df_result = + df_result.join(cell_type_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("cell_type")) + .collect()?; + } + + df_result = df_result.select([ + "GeneID", + "Gene Name", + "experiment", + "taxonomy", + "location", + "disease", + "cell_type", + ])?; + Ok(df_result) +} diff --git a/utils/expression-atlas/src/configuration.rs b/utils/expression-atlas/src/configuration.rs new file mode 100644 index 000000000..b40b0f65a --- /dev/null +++ b/utils/expression-atlas/src/configuration.rs @@ -0,0 +1,54 @@ +/// This module handles the parsing of the configuration file +use quick_xml::de::from_str; +use quick_xml::DeError; +use serde::Deserialize; +use std::fs; + +use std::path::PathBuf; + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Config { + #[serde(rename = "experimentType")] + pub exp_type: String, + #[serde(rename = "analytics")] + pub analytics: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Analytics { + pub assay_groups: AssayGroups, + pub array_design: Option, + 
pub contrasts: Option, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct AssayGroups { + pub assay_group: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq, Clone)] +pub struct AssayGroup { + pub id: String, + pub label: Option, // This contains the factors in a ; separated list + #[serde(rename = "assay", default)] + pub assays: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Contrasts { + pub contrast: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Contrast { + pub id: String, + pub name: String, + #[serde(alias = "reference_assay_group")] + pub ref_group: String, + #[serde(alias = "test_assay_group")] + pub test_group: String, +} + +pub fn parse_config(file: &PathBuf) -> Result { + from_str::(&fs::read_to_string(file).unwrap()) +} diff --git a/utils/expression-atlas/src/filtering.rs b/utils/expression-atlas/src/filtering.rs new file mode 100644 index 000000000..3038543d6 --- /dev/null +++ b/utils/expression-atlas/src/filtering.rs @@ -0,0 +1,103 @@ +/// This module implements the filtering that EA uses in their webapp +/// NB: This is all reverse engineered +use anyhow::Result; +use polars::frame::DataFrame; +use polars::prelude::*; +use polars::series::Series; +use regex::Regex; + +fn lowercase_fn(ent: &Option<&str>) -> Option { + ent.as_ref().map(|ent| ent.to_lowercase()) + // match ent { + // None => None, + // Some(ent) => Some(ent.to_lowercase()), + // } +} + +fn fix_bad_infinities(str_val: &Series) -> Series { + let lowercased = str_val + .utf8() + .unwrap() + .into_iter() + .map(|x| lowercase_fn(&x)) + .collect::>>() + .into_iter() + .map(|x| x.map(|x| x.parse::().unwrap())) + .collect::>>(); + + Series::from_iter(lowercased) +} + +fn baseline_get_median_gt_zero(str_val: &Series) -> Series { + let lists = str_val.utf8().unwrap().into_iter().map(|x| { + x.unwrap().split(',').into_iter().map(|y| y.parse::().unwrap()).collect::>() + }); + + let medians: Vec = + lists.into_iter().map(|x| Series::from_iter(x).median().unwrap() > 0.0).collect(); + Series::from_iter(medians) +} + +/// This function will filter the differential results based on: +/// - non-null p value +/// - absolute log2 fold change greater than 1 +// find the p value and log fold columns +pub fn filter_differential(input: &DataFrame) -> Result { + let pv_regex = Regex::new(r".*p-value.*").unwrap(); + let log_fold_regex = Regex::new(r".*log2.*").unwrap(); + + let mut inter = input.clone(); + + for column in input.get_column_names_owned() { + // Check for badly parsed infinities + if pv_regex.is_match(&column) { + if !inter.column(&column)?.dtype().is_numeric() { + inter.apply(&column, fix_bad_infinities)?; + } + inter = inter + .lazy() + .with_column( + when(col(&column).lt(lit(0.05f64))) + .then(lit(true).alias(&column)) + .otherwise(lit(false).alias(&column)), + ) + .collect()?; + } else if log_fold_regex.is_match(&column) { + if !inter.column(&column)?.dtype().is_numeric() { + inter.apply(&column, fix_bad_infinities)?; + } + inter = inter + .lazy() + .with_column( + when(col(&column).abs().gt_eq(lit(1.0f64))) + .then(lit(true).alias(&column)) + .otherwise(lit(false).alias(&column)), + ) + .collect()?; + } + } + Ok(inter) +} + +pub fn filter_baseline(input: &mut DataFrame) -> DataFrame { + // This is a baseline experiment. 
+ // Find columns starting with lower case g, then apply the function to convert to + // medain and select greater than zero + let mut meas = Vec::::new(); + + for column in input.get_column_names_owned() { + if column.starts_with('g') { + input.apply(&column, baseline_get_median_gt_zero).unwrap(); + meas.push(col(&column)); + } + } + // Selection should now have all the gN column names in it + input + .clone() + .lazy() + .filter( + any_exprs(&meas[0..meas.len() / 2]).or(any_exprs(&meas[meas.len() / 2..meas.len()])), + ) // If we try to do the whole thing at once, we get a stack overflow + .collect() + .unwrap() +} diff --git a/utils/expression-atlas/src/main.rs b/utils/expression-atlas/src/main.rs new file mode 100644 index 000000000..bb5c055bf --- /dev/null +++ b/utils/expression-atlas/src/main.rs @@ -0,0 +1,343 @@ +use anyhow::Result; +use clap::{Parser, Subcommand}; +use regex::Regex; +use std::collections::HashMap; +use std::fs; +use std::path::PathBuf; + +use polars::frame::DataFrame; +use polars::prelude::*; + +use log::{info, warn}; + +pub mod augment; +pub mod configuration; +pub mod filtering; +pub mod sdrf; + +#[derive(Parser, Debug)] +#[clap(author = "Andrew Green", version, about)] +struct Args { + #[clap(subcommand)] + cmd: Command, +} + +#[derive(Subcommand, Debug)] +enum Command { + /// Parse the Expression Atlas data into the unique genes per experiment jsonlines + Parse { + /// Path where input has been copied. Must contain the config files + #[clap(short, long, multiple_values(true))] + input: PathBuf, + + /// An output file + #[clap(short, long)] + output: String, + }, + /// Lookup the gene names we found, using a dump from the database + Lookup { + /// File containing the genes from all experiments + #[clap(short, long)] + genes: PathBuf, + + /// Dump from the database containing all URS -> Gene names. See the query file for references + #[clap(short, long)] + lookup: PathBuf, + + /// An output file. Will contain the data from the experiments file along with URS data and some other useful stuff + #[clap(short, long)] + output: String, + }, +} + +fn load_df_add_experiment(path: &PathBuf) -> Result { + info!("Loading experiment data from {:?}", path); + let exp_name = path.file_name().unwrap().to_str().unwrap().split('-').collect::>() + [0..=2] + .join("-") + .replace("_A", ""); + + let mut exp_df: DataFrame = CsvReader::from_path(&path)? 
+ .has_header(true) + .with_delimiter(b'\t') + .with_null_values(Some(NullValues::AllColumns("NA".to_string()))) + .infer_schema(None) + .finish() + .unwrap_or_else(|x| panic!("Failed on {:?} with error {:?}", path, x)); + + // hstack the experiment name (derived from the filename) into the DataFrame + let mut exp_col_arr = Utf8ChunkedBuilder::new("experiment", exp_df.height(), 128); + for _i in 0..exp_df.height() { + exp_col_arr.append_value(&exp_name); + } + // let iter_exp = std::iter::repeat([&exp_name].into_iter()).take(exp_df.height()); + let exp_col: Series = exp_col_arr.finish().into_series(); + exp_df.hstack_mut(&[exp_col]).unwrap(); + + if !&exp_df.get_column_names().contains(&"GeneID") { + // normalise column names + info!("Standard column heading not found, normalising column names"); + info!("Column names were {:?}", &exp_df.get_column_names()); + if exp_df.get_column_names().contains(&"Gene ID") { + exp_df.rename("Gene ID", "GeneID")?; + } else if exp_df.get_column_names().contains(&"Gene.ID") { + exp_df.rename("Gene.ID", "GeneID")?; + } + info!("Column names are now {:?}", &exp_df.get_column_names()); + } + + Ok(exp_df) +} + +fn run_parse(input: &PathBuf, output: &String) -> Result<()> { + let config_re = Regex::new(r"configuration.xml").unwrap(); + let mut config_lookup: HashMap = HashMap::new(); + for file in fs::read_dir(&input)? { + let file = file?; + let path = file.path(); + if config_re.is_match(path.to_str().unwrap()) { + let exp_name = + path.file_name().unwrap().to_str().unwrap().split('-').collect::>() + [0..=2] + .join("-") + .replace("configuration", ""); // yeah... + let config = configuration::parse_config(&path)?; + config_lookup.insert(exp_name, config); + } + } + + // Now have a hashmap with exp_name:config. We can loop over it and + // - Check config for experiment type + // - Construct appropriate filenames + // - Dispatch filenames for loading, appropriate error handling if they don't exist + // - Construct new df to merge with big one + + let differential_re = Regex::new(r".*diff.*").unwrap(); + + let mut big_df = DataFrame::default(); + // String::new(); + // data_path = PathBuf::from(&args.input); + // let mut sdrf_path = PathBuf::from(&args.input); + + let mut gene_count: usize = 0; + + for (exp_name, config) in &config_lookup { + let mut exp_df = DataFrame::default(); + let mut data_path = PathBuf::from(&input); + let mut sdrf_path = PathBuf::from(&input); + if differential_re.is_match(&config.exp_type) { + for analysis in &config.analytics { + let array_design = analysis.array_design.as_deref().unwrap_or(""); + if !array_design.is_empty() { + let data_filename: String = + format!("{}_{}-analytics.tsv", exp_name, array_design); + data_path.push(&data_filename); + + if !data_path.exists() { + warn!( + "File {} does not exist, skipping this experiment", + data_path.to_str().unwrap() + ); + data_path.pop(); + continue; + } + + // Load the data + if exp_df.height() == 0 { + exp_df = load_df_add_experiment(&data_path)?; + data_path.pop(); + } else { + exp_df = exp_df + .lazy() + .join( + load_df_add_experiment(&data_path)?.lazy(), + [col("GeneID")], + [col("GeneID")], + JoinType::Inner, + ) + .select(&[col("*").exclude([ + "Gene Name_right", + "experiment_right", + "Design Element_right", + ])]) + .collect()?; + data_path.pop(); + } + } else { + let data_filename: String = format!("{}-analytics.tsv", exp_name); + data_path.push(&data_filename); + + if !data_path.exists() { + warn!( + "File {} does not exist, skipping this experiment", + 
data_path.to_str().unwrap() + ); + data_path.pop(); + continue; + } + + exp_df = load_df_add_experiment(&data_path)?; + data_path.pop(); + } + } + } else { + let data_filename: String = format!("{}-tpms.tsv", exp_name); + data_path.push(&data_filename); + + if !data_path.exists() { + warn!( + "File {} does not exist, skipping this experiment", + data_path.to_str().unwrap() + ); + data_path.pop(); + continue; + } + + exp_df = load_df_add_experiment(&data_path)?; + data_path.pop(); + } + + let sdrf_filename: String = format!("{}.condensed-sdrf.tsv", exp_name); + sdrf_path.push(&sdrf_filename); + + if !sdrf_path.exists() { + warn!("File {} does not exist, skipping this experiment", sdrf_path.to_str().unwrap()); + sdrf_path.pop(); + data_path.pop(); + continue; + } + + // Now load the sdrf + let sdrf_df = sdrf::parse_condensed_sdrf(&sdrf_path)?; + // println!("{:?}", sdrf_df); + // filter based on differential or baseline + if differential_re.is_match(&config.exp_type) { + info!("Filtering experiment dataset with differential filters"); + exp_df = filtering::filter_differential(&exp_df)?; + exp_df = augment::augment_differential_df(&mut exp_df, config, &sdrf_df)?; + } else { + info!("Filtering with baseline filters"); + exp_df = filtering::filter_baseline(&mut exp_df); + exp_df = augment::augment_baseline_df(&mut exp_df, config, &sdrf_df)?; + } + + data_path.pop(); + sdrf_path.pop(); + + info!("dataframe remaining: {}", exp_df.height()); + gene_count += exp_df.height(); + + // Add the newly parsed data to the big df ready for export + if big_df.height() == 0 { + big_df = exp_df.clone(); + } else { + big_df.vstack_mut(&exp_df)?; + } + } + + info!( + "Parsed a total of {} lines, from which {} were selected ({}%)", + gene_count, + big_df.height(), + 100.0 * (big_df.height() as f64) / (gene_count as f64) + ); + println!("{:?}", big_df.height()); + + info!("All files parsed, preparing to write import csvs"); + + let mut output_file = fs::File::create(&output)?; + CsvWriter::new(&mut output_file).has_header(true).finish(&mut big_df)?; + + Ok(()) +} + +fn run_lookup(genes: &PathBuf, lookup: &PathBuf, output: &String) -> Result<()> { + let mut gene_df: DataFrame = CsvReader::from_path(&genes)? + .has_header(true) + .finish() + .unwrap_or_else(|_x| panic!("Failed to load gene output")); + // You need to get the taxid from the taxonomy URL. Should be able to split on _ and take last element if you can figure it out + gene_df = gene_df + .lazy() + .with_column( + col("taxonomy").str().extract("([0-9]+)$", 1).cast(DataType::Int64).alias("taxid"), + ) + .collect() + .unwrap(); + // gene_df = gene_df.lazy().with_column(col("taxid").cast(DataType::UInt32)).collect().unwrap(); + println!("{:?}", gene_df); + + let mut lookup_df: DataFrame = CsvReader::from_path(&lookup)? 
+ .has_header(true) + .finish() + .unwrap_or_else(|_x| panic!("Failed to load lookup data!")); + + // lookup_df.rename("column_1", "upi"); + // lookup_df.rename("column_2", "taxid"); + // lookup_df.rename("column_3", "possible_ids"); + // lookup_df.rename("column_4", "start"); + // lookup_df.rename("column_5", "end"); + // lookup_df.rename("column_6", "rna_type"); + + // The database dump has a column where possible IDs are separated by a | character, so we need to split on that + // The plan then is to use explode on the df with the external IDs column to get a mega big dataframe which we join onto + // the gene one + println!("{:?}", &lookup_df); + + // For now, I only have the external ID from the database, if this doesn't match many, I can tweak the lookupo query and re-add this + lookup_df = lookup_df + .lazy() + .with_column(col("external_id").str().split("|").alias("external_id")) + .explode([col("external_id")]) + .with_column(col("external_id").str().split(",").alias("external_id")) + .explode([col("external_id")]) + .filter(col("external_id").neq(lit(""))) + .filter(col("external_id").neq(lit("null"))) + .filter(col("external_id").is_not_null()) + .collect() + .unwrap(); + + println!("Got to the end in one piece!"); + println!("{:?}", &lookup_df); + + // println!("{:?}", &gene_df); + + let mut matched_df = + gene_df.join(&lookup_df, ["GeneID"], ["external_id"], JoinType::Inner, None).unwrap(); + println!("{:?}", matched_df.get_column_names()); + + matched_df = matched_df.lazy().filter(col("taxid").eq(col("taxid_right"))).collect().unwrap(); + + let mut grouped_df = + matched_df.lazy().groupby([col("GeneID"), col("urs_taxid")]).agg([col("*").list().unique()]).collect()?; + + println!("{:?}", grouped_df); + + let mut output_file = fs::File::create(&output)?; + JsonWriter::new(&mut output_file) + .with_json_format(JsonFormat::JsonLines) + .finish(&mut grouped_df)?; + + Ok(()) +} + +fn main() -> Result<()> { + env_logger::init(); + info!("Starting Expression Atlas parser"); + let args = Args::parse(); + + match args.cmd { + Command::Parse { + input, + output, + } => run_parse(&input, &output), + Command::Lookup { + genes, + lookup, + output, + } => run_lookup(&genes, &lookup, &output), + } + + // Parse the config files first + // set up the regex +} diff --git a/utils/expression-atlas/src/sdrf.rs b/utils/expression-atlas/src/sdrf.rs new file mode 100644 index 000000000..cff79deb5 --- /dev/null +++ b/utils/expression-atlas/src/sdrf.rs @@ -0,0 +1,80 @@ +use anyhow::Result; +use polars::chunked_array::builder::Utf8ChunkedBuilder; +use polars::frame::DataFrame; +use polars::prelude::IntoSeries; +use std::fs; +use std::io::Read; +use std::path::PathBuf; + +use log::{info, warn}; + +pub fn parse_condensed_sdrf(path: &PathBuf) -> Result { + /* + A condensed sdrf file has 7 columns, but the last is often not delimited correctly meaning it + is tricky to read with the polars default csv reader. + + Therefore, we will be manually parsing the file into 6 series objects (one column seems to + always be null) and constructing a dataframe from them + + We use a chunked array builder for Utf8 strings. 
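+
+    For orientation, a 7 column row is laid out as (tab separated; the values here
+    are made up, only the column order is taken from the parsing below):
+        E-MTAB-1234  <empty>  run_1  characteristic  organism  Homo sapiens  http://purl.obolibrary.org/obo/NCBITaxon_9606
+    i.e. experiment, blank, assay name, feature class, feature type, feature value, ontology term.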
+ */ + + info!("Loading sdrf data from {:?}", path); + + let mut file = fs::File::open(path).unwrap(); + let mut s = String::new(); + file.read_to_string(&mut s)?; + + let part_parsed: Vec> = s.lines().map(|line| line.split('\t').collect()).collect(); + let bytes_per_string: usize = 128; + let mut exp_name = Utf8ChunkedBuilder::new("exp_name", part_parsed.len(), bytes_per_string); + let mut assay_name = Utf8ChunkedBuilder::new("assay_name", part_parsed.len(), bytes_per_string); + let mut feat_class = Utf8ChunkedBuilder::new("feat_class", part_parsed.len(), bytes_per_string); + let mut feat_type = Utf8ChunkedBuilder::new("feat_type", part_parsed.len(), bytes_per_string); + let mut feat_value = Utf8ChunkedBuilder::new("feat_value", part_parsed.len(), bytes_per_string); + let mut ontology = Utf8ChunkedBuilder::new("ontology", part_parsed.len(), bytes_per_string); + + // There is one experiment file that does not have the empty column in line[1] + if part_parsed.iter().map(|x| x.len()).max().unwrap() == 7 { + for line in part_parsed.iter() { + exp_name.append_value(line[0]); + assay_name.append_value(line[2]); // remember line[1] will be empty + feat_class.append_value(line[3]); + feat_type.append_value(line[4]); + feat_value.append_value(line[5]); + if line.len() == 7 { + ontology.append_value(line[6]); + } else { + ontology.append_null(); + } + } + } else { + warn!( + "Unusual sdrf parsing with {} columns, not 7 for experiment {}", + part_parsed.iter().map(|x| x.len()).max().unwrap(), + part_parsed[0][0] + ); + for line in part_parsed.iter() { + exp_name.append_value(line[0]); + assay_name.append_value(line[1]); + feat_class.append_value(line[2]); + feat_type.append_value(line[3]); + feat_value.append_value(line[4]); + if line.len() == 6 { + ontology.append_value(line[5]); + } else { + ontology.append_null(); + } + } + } + + // Now have all the lines parsed with the same lengths. Try to construct a dataframe... 
+ DataFrame::new(vec![ + exp_name.finish().into_series(), + assay_name.finish().into_series(), + feat_class.finish().into_series(), + feat_type.finish().into_series(), + feat_value.finish().into_series(), + ontology.finish().into_series(), + ]) +} diff --git a/utils/precompute/Cargo.toml b/utils/precompute/Cargo.toml index 40dc3516a..761212348 100644 --- a/utils/precompute/Cargo.toml +++ b/utils/precompute/Cargo.toml @@ -20,3 +20,4 @@ sorted-iter = "0.1.7" structopt = "0.3" strum = "0.21" strum_macros = "0.21" +polars = "0.21.1" diff --git a/utils/precompute/src/releases.rs b/utils/precompute/src/releases.rs index cca9879b5..69d05ae54 100644 --- a/utils/precompute/src/releases.rs +++ b/utils/precompute/src/releases.rs @@ -25,6 +25,8 @@ use anyhow::{ Result, }; +use polars::prelude::*; + #[derive(Serialize, Deserialize, Debug)] pub struct UrsEntry { id: usize, @@ -61,28 +63,34 @@ pub fn write_max(filename: &Path, output: &Path) -> Result<()> { } pub fn select_new(xrefs: &Path, known: &Path, output: &Path) -> Result<()> { - let xref_records = entries(xrefs)?.map(|e: UrsEntry| (e.id, e)).assume_sorted_by_key(); - let known_records = entries(known)?.map(|e: UrsEntry| (e.id, e)).assume_sorted_by_key(); - - let mut writer = csv::Writer::from_writer(File::create(output)?); - let pairs = xref_records.outer_join(known_records); - for (_key, (xref, pre)) in pairs { - match (xref, pre) { - (Some(x), Some(p)) => match x.release.cmp(&p.release) { - Less => Err(anyhow!( - "This should never happen, too small release for {:?} vs {:?}", - &x, - &p - ))?, - Equal => (), - Greater => writer.write_record(&[x.urs])?, - }, - (Some(x), None) => writer.write_record(&[x.urs])?, - (None, Some(_)) => (), - (None, None) => (), - } - } - writer.flush()?; + + let mut xref_records : DataFrame = CsvReader::from_path(xrefs)?.has_header(false).finish().unwrap(); + xref_records.rename("column_1", "id").ok(); + xref_records.rename("column_2", "upi").ok(); + xref_records.rename("column_3", "last").ok(); + let mut known_records : DataFrame = CsvReader::from_path(known)?.has_header(false).finish().unwrap(); + known_records.rename("column_1", "id").ok(); + known_records.rename("column_2", "upi").ok(); + known_records.rename("column_3", "last").ok(); + // Run groupby, sort and max on xref (because the DB doesn't have the memory to do it) + xref_records = xref_records.groupby(["id", "upi"])? + .select(["last"]) + .max()? + .sort(["id"], false) + .unwrap(); + + // Join the frames on id, then filter to select those where xref > known (?) + let mut selection = xref_records.join(&known_records, ["id", "upi"], ["id", "upi"], JoinType::Outer, None)?; + let mask = selection.column("last_max")?.gt(selection.column("last")?)?; + let mut selected_upis = selection.filter(&mask).unwrap() + .select(["upi"])? 
+ .unique(None, UniqueKeepStrategy::First)?; + + + let out_stream : File = File::create(output).unwrap(); + CsvWriter::new(out_stream) + .has_header(false) + .finish(&mut selected_upis); Ok(()) } diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs index 3d2c11b4f..78bc78a99 100644 --- a/utils/search-export/src/sequences/normalized.rs +++ b/utils/search-export/src/sequences/normalized.rs @@ -65,6 +65,7 @@ pub struct Normalized { interacting_proteins: Vec, interacting_rnas: Vec, so_rna_type_tree: so_tree::SoTree, + publication_count: usize, #[serde(flatten)] orfs: OrfVec, @@ -108,6 +109,7 @@ impl Normalized { short_urs: parsed.short(), deleted: String::from("N"), so_rna_type_tree: raw.so_tree().to_owned(), + publication_count: raw.publication_count(), pre_summary: raw.precompute().into(), basic: base, qa_status: raw.qa_status().to_owned(), diff --git a/utils/search-export/src/sequences/publication_counts.rs b/utils/search-export/src/sequences/publication_counts.rs index 271f3055c..561749a57 100644 --- a/utils/search-export/src/sequences/publication_counts.rs +++ b/utils/search-export/src/sequences/publication_counts.rs @@ -22,3 +22,9 @@ impl grouper::HasIndex for PublicationCount { pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> { grouper::group::(grouper::Criteria::ZeroOrOne, &path, 1, max, &output) } + +impl PublicationCount { + pub fn publication_count(&self) -> usize { + self.publication_count + } +} diff --git a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs index 420eca0db..3e67649fd 100644 --- a/utils/search-export/src/sequences/raw.rs +++ b/utils/search-export/src/sequences/raw.rs @@ -133,4 +133,9 @@ impl Raw { pub fn base(&self) -> &Basic { &self.base } + + /// Get this raw's publication count. + pub fn publication_count(&self) -> usize { + self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0) + } } diff --git a/weekly-update/crontab.txt b/weekly-update/crontab.txt index b4f9b89df..512e60848 100644 --- a/weekly-update/crontab.txt +++ b/weekly-update/crontab.txt @@ -1,3 +1,4 @@ -MAILTO=bsweeney@ebi.ac.uk +MAILTO=agreen@ebi.ac.uk +SHELL=/bin/bash -0 17 * * 5 source /etc/bashrc; cd /hps/nobackup/production/xfam/bsweeney/automated && bsub ./weekly-update/run.sh +0 17 * * 4 . ~/.bashrc && cd /hps/nobackup/agb/rnacentral/weekly-run && bsub -o weekly_run.out -e weekly_run.err -M 20480 ./weekly-update/run.sh diff --git a/weekly-update/run.sh b/weekly-update/run.sh index dba811bdd..d86f0075b 100755 --- a/weekly-update/run.sh +++ b/weekly-update/run.sh @@ -1,30 +1,59 @@ #!/usr/bin/env bash +#BSUB -oo weekly_run.out +#BSUB -eo weekly_run.err +#BSUB -M 4096 +#BSUB -cwd /hps/nobackup/agb/rnacentral/weekly-run +#BSUB -J "PDBe weekly import" set -euo pipefail IFS=$'\n\t' export NXF_OPTS='-Dnxf.pool.type=sync -Dnxf.pool.maxThreads=10000' -export PATH="/nfs/software/singularity/3.5.0/bin:$HOME/.cargo/bin:$PATH" [ -d work/tmp ] || mkdir -p work/tmp [ ! -e local.config ] || rm local.config when=$(date +'%Y-%m-%d') -if [[ -d singularity/bind/r2dt/ ]]; then - rm -r singularity/bind/r2dt/ + +ln -s weekly-update/update.config local.config + +make rust + +# Download latest version of nextflow +curl --max-time 10 -s https://get.nextflow.io | bash +res=$? +# Load module as fallback +if test "$res" != "0"; then + echo "Using module nextflow..." + module load nextflow-21.10.6-gcc-9.3.0-tkuemwd + NF="nextflow" +else + echo "Using downloaded nextflow..." 
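To make the intent of the `select_new` rewrite in `utils/precompute/src/releases.rs` above explicit: the removed match-based loop selected a UPI when it had no precomputed entry yet, or when its newest xref release exceeded the precomputed one, and the polars group-by/join/filter chain is meant to reproduce that rule. A plain-Rust sketch of the rule (the function name and `HashMap` shapes are illustrative, not the pipeline's actual types):

```rust
use std::collections::HashMap;

/// Sketch only: the selection rule from the removed implementation that the
/// polars pipeline above replaces. `xref_max` holds the highest release seen
/// per id in the xref dump (the group-by + max step); `known` holds the
/// release already recorded in the precompute table.
fn needs_precompute(
    xref_max: &HashMap<String, i64>,
    known: &HashMap<String, i64>,
) -> Vec<String> {
    xref_max
        .iter()
        .filter(|&(id, &last)| match known.get(id) {
            Some(&prev) => last > prev, // xref release moved past the precomputed one
            None => true,               // never precomputed before
        })
        .map(|(id, _)| id.clone())
        .collect()
}
```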
+ NF="./nextflow" fi -mkdir -p singularity/bind/r2dt/data -pushd singularity/bind/r2dt/data -wget -O cms.tar.gz https://www.dropbox.com/s/3ie8kzb8ol658s0/cms.tar.gz?dl=1 -tar xf cms.tar.gz -popd +# Clean up previous run by nextflow +$NF clean -f -ln -s weekly-update/update.config local.config +rm .nextflow.log + +## Run new DB selection workflow - selects DBs based on file changes from remotes +rm -f db_selection.config && touch db_selection.config + +$NF -quiet run -profile pg11prod select_databases.nf --import_selection_remotes=weekly-update/weekly_db_remotes + +$NF -quiet run -with-report "$when-setup.html" -profile pg11prod --use_datamover prepare-environment.nf +$NF -quiet run -with-report "$when-import.html" -profile pg11prod import-data.nf +$NF -quiet run -with-report "$when-analyze.html" -profile pg11prod analyze.nf +$NF -quiet run -with-report "$when-precompute.html" -profile pg11prod precompute.nf + +$NF -quiet run -profile pg11prod report.nf + +$NF -quiet run -with-report "$when-search.html" -profile pg11prod export.nf -make rust -./nextflow -quiet run -with-report "$when-import.html" -profile prod import-data.nf -./nextflow -quiet run -with-report "$when-precompute.html" -profile prod precompute.nf -./nextflow -quiet run -with-report "$when-search.html" -profile prod search-export.nf +# Zip up reports and email them to me +tar -cjf reports.tar.bz2 *.html +rm *.html +mail -a reports.tar.bz2 -s "Weekly workflow completion reports" agreen@ebi.ac.uk < .nextflow.log diff --git a/weekly-update/update.config b/weekly-update/update.config index c6778e29b..7cb2c4b79 100644 --- a/weekly-update/update.config +++ b/weekly-update/update.config @@ -15,10 +15,15 @@ params { pfam.run = false } + rfam + { + memory = 2.GB + } + precompute { run = true maxForks = 4 - range.memory = '5GB' + range.memory = '8GB' } r2dt.run = false @@ -30,31 +35,32 @@ params { memory = '15 GB' publish { host = '' - path = "/nfs/production/xfam/rnacentral/search_dumps/dev-nightly/" + path = "/nfs/production/agb/rnacentral/search-export/dev-nightly/" } } sequence_search { run = false } + + use_datamover = true } singularity { enabled = true cacheDir = "$baseDir/singularity" - runOptions = '--bind /nfs/ftp/pub/databases/ena --bind /ebi/ftp --bind /nfs/ftp --bind /nfs/ensemblftp --bind /nfs/ensemblgenomes/ftp' } notification { enabled = true - to = 'bsweeney@ebi.ac.uk' + to = 'agreen@ebi.ac.uk' } -includeConfig '../private.config' +includeConfig '../profiles.config' includeConfig 'config/cluster.config' +includeConfig 'db_selection.config' process { - time = '5h' errorStrategy = { task.exitStatus == 130 ? 
'retry' : 'terminate' } maxRetries = 2 } diff --git a/weekly-update/weekly_db_remotes b/weekly-update/weekly_db_remotes new file mode 100644 index 000000000..047c59633 --- /dev/null +++ b/weekly-update/weekly_db_remotes @@ -0,0 +1,6 @@ +sgd,https://downloads.yeastgenome.org/latest/RNAcentral.json +pombase,ftp://ftp.pombase.org/nightly_update/misc/rnacentral.json +zfin,https://zfin.org/downloads/rnaCentral.json +intact,https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip +hgnc,https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/locus_groups/non-coding_RNA.json +flybase,ftp://ftp.flybase.net/releases/current/precomputed_files/genes/ncRNA*.json.gz diff --git a/workflows/databases/crw.nf b/workflows/databases/crw.nf index 5c769fbde..1c505beb9 100644 --- a/workflows/databases/crw.nf +++ b/workflows/databases/crw.nf @@ -11,7 +11,7 @@ process fetch_and_process { psql -f "$metadata_query" "$PGDATABASE" > metadata.json git clone "$params.databases.crw.r2dt_repo" r2dt rnac crw r2dt-to-fasta r2dt/data/crw-fasta sequences.fasta - rnac crw parse metadata.json $sequences + rnac crw parse metadata.json sequences.fasta """ } diff --git a/workflows/databases/ena.nf b/workflows/databases/ena.nf index 7f1a6a2af..aa9ebb3d7 100644 --- a/workflows/databases/ena.nf +++ b/workflows/databases/ena.nf @@ -2,6 +2,8 @@ process fetch_directory { tag { "$name" } when { params.databases.ena.run } clusterOptions '-sp 100' + queue 'datamover' + containerOptions '--bind /nfs:/nfs' input: tuple val(name), val(remote) diff --git a/workflows/databases/ensembl.nf b/workflows/databases/ensembl.nf index bcecf048e..e8185cd0c 100644 --- a/workflows/databases/ensembl.nf +++ b/workflows/databases/ensembl.nf @@ -28,15 +28,16 @@ process find_urls { output: path('species.txt') - """ - rnac ensembl urls-for $division ${params.databases.ensembl[division].ftp_host} species.txt - """ + script: + """ + rnac ensembl urls-for $division ${params.databases.ensembl[division].ftp_host} species.txt + """ } process fetch_species_data { tag { "$species" } clusterOptions '-sp 90' - errorStrategy 'retry' + errorStrategy { task.exitStatis == 8 ? 'retry' : 'ignore' } maxRetries 10 maxForks 10 @@ -70,7 +71,7 @@ process parse_data { """ rnac ensembl parse $division --family-file $rfam $embl $gff . - rnac ensembl pseudogenes $division $embl ensembl-pseudogenes.csv +# rnac ensembl pseudogenes $division $embl ensembl-pseudogenes.csv """ } diff --git a/workflows/databases/expressionatlas.nf b/workflows/databases/expressionatlas.nf new file mode 100644 index 000000000..267e97520 --- /dev/null +++ b/workflows/databases/expressionatlas.nf @@ -0,0 +1,91 @@ +process fetch_data { + queue 'datamover' + container '' + errorStrategy 'ignore' + + input: + path("base_dir") + + output: + path('tsv_files') + + """ + mkdir tsv_files + find $base_dir -type f .. | xargs -I {} -P 10 cp {} tsv_files + """ +} + +process fetch_lookup { + queue 'short' + + input: + path (query) + + output: + path("lookup_dump.csv") + + """ + psql -f $query $PGDATABASE > lookup_dump.csv + """ +} + + +process parse_tsvs { + memory 24.GB + + input: + path(tsvs) + + output: + path('chunk_*') + + """ + expression-parse parse -i $tsvs -o all_genes.csv + split -n l/10 all_genes.csv chunk_ + """ + +} + +process lookup_genes { + + input: + path(lookup) + path(genes) + + output: + path('*.csv') + + """ + expression-parse lookup -g $genes -l $lookup -o exp_parse_stage2.json + rnac expressionatlas parse exp_parse_stage2.json . 
+ """ +} + + +workflow expressionatlas { + + emit: data + main: + + if( params.databases.expressionatlas.run ) { + Channel.fromPath('files/import-data/expressionatlas/lookup-dump-query.sql') | set { lookup_sql } + Channel.fromPath($params.databases.expressionatlas.remote) | set { tsv_path } + lookup_sql | fetch_lookup | set { lookup } + tsv_path \ + | fetch_data \ + | filter { tsv_name -> + !params.databases.expressionatlas.exclude.any {p -> tsv_name.baseName =~ p} + } \ + | parse_tsvs \ + | set { genes } + + lookup_genes(genes, lookup) \ + | collectFile() {csvfile -> [csvfile.name, csvfile.text]} \ + | set { data } + + } + else { + Channel.empty() | set { data } + } + +} diff --git a/workflows/databases/genecards_suite.nf b/workflows/databases/genecards_suite.nf index f9a1a87f9..34ee18f4e 100644 --- a/workflows/databases/genecards_suite.nf +++ b/workflows/databases/genecards_suite.nf @@ -1,5 +1,8 @@ process fetch { tag { "$name" } + queue 'datamover' + container '' + input: tuple val(name), path(data), val(column_name) diff --git a/workflows/databases/lncbook.nf b/workflows/databases/lncbook.nf index 697b939ba..ca1e92bc8 100644 --- a/workflows/databases/lncbook.nf +++ b/workflows/databases/lncbook.nf @@ -5,7 +5,8 @@ process lncbook { path('*.csv') """ - wget -O lncbook.json ${params.databases.lncbook.remote} + wget -O lncbook.json.gz ${params.databases.lncbook.remote} + gzip -d lncbook.json.gz rnac lncbook parse lncbook.json . """ } diff --git a/workflows/databases/lncipedia.nf b/workflows/databases/lncipedia.nf index 20dd8427e..b073050c3 100644 --- a/workflows/databases/lncipedia.nf +++ b/workflows/databases/lncipedia.nf @@ -8,7 +8,7 @@ process lncipedia { path('*.csv') """ - curl ${params.databases.lncipedia.remote} > lncipedia.json + wget -O lncipedia.json ${params.databases.lncipedia.remote} rnac lncipedia parse lncipedia.json . """ } diff --git a/workflows/databases/pdbe.nf b/workflows/databases/pdbe.nf index 7d5715387..8b7382dab 100644 --- a/workflows/databases/pdbe.nf +++ b/workflows/databases/pdbe.nf @@ -5,6 +5,9 @@ process pdbe { path('*.csv') """ - rnac pdb generate . + wget --read-timeout=30 -t 1 -O pdb_full_region.txt.gz http://ftp.ebi.ac.uk/pub/databases/Rfam/.preview/pdb_full_region.txt.gz + gzip -d pdb_full_region.txt.gz + awk 'BEGIN {OFS = FS = "\t" } \$11 == 1 { print \$2, \$3} ' pdb_full_region.txt | sort -u > rfam_hit_ids + rnac pdb generate --override-chains=rfam_hit_ids . 
""" } diff --git a/workflows/databases/plncdb.nf b/workflows/databases/plncdb.nf new file mode 100644 index 000000000..14cba07c7 --- /dev/null +++ b/workflows/databases/plncdb.nf @@ -0,0 +1,53 @@ +nextflow.enable.dsl = 2 + +process fetch_data { + when { !params.databases.plncdb.prefetch and params.databases.plncdb.run } + + containerOptions "--contain --bind $baseDir" + + output: + path("data") + + """ + rnac plncdb fetch-data $params.databases.plncdb.urls data + """ +} + +process parse_data { + when { params.databases.plncdb.run } + + queue 'short' + memory { 8.GB * task.attempt } + + errorStrategy 'retry' + maxRetries 16 + + input: + path data + + output: + path('*.csv') + + """ + # rnac notify step "Data parsing for PLncDB" $params.databases.plncdb.data_path$data + rnac plncdb parse $params.databases.plncdb.data_path$data + """ +} + +workflow plncdb { + emit: data_files + + main: + if( params.databases.plncdb.run ) { + Channel.fromPath("$params.databases.plncdb.data_path/*", type:'dir') \ + | parse_data \ + | flatten + | collectFile() {csvfile -> [csvfile.name, csvfile.text]} \ + | set { data_files } + } + else { + Channel.empty() | set { data_files } + } + + +} diff --git a/workflows/databases/quickgo.nf b/workflows/databases/quickgo.nf index cbba55bbf..de41f5c4f 100644 --- a/workflows/databases/quickgo.nf +++ b/workflows/databases/quickgo.nf @@ -1,13 +1,45 @@ -process quickgo { - when { params.databases.quickgo.run } +process quickgo_get { + queue 'datamover' + container '' + + output: + path('data.gpa') + + """ + scp $params.databases.quickgo.remote data.gpa.gz + gzip -d data.gpa.gz + """ +} + + + +process quickgo_parse { memory { params.databases.quickgo.memory } + input: + path(data) + output: path('*.csv') """ - scp $params.databases.quickgo.remote data.gpa.gz - gzip -d data.gpa - rnac quickgo parse data.gpa . + rnac quickgo parse $data . 
""" } + + + +workflow quickgo { + + emit: data + + main: + if ( params.databases.quickgo.run ) { + quickgo_get | quickgo_parse | set { data } + } + else { + Channel.empty() | set { data } + } + + +} diff --git a/workflows/databases/rfam.nf b/workflows/databases/rfam.nf index ba2137f5d..4c052634d 100644 --- a/workflows/databases/rfam.nf +++ b/workflows/databases/rfam.nf @@ -78,7 +78,7 @@ workflow rfam { emit: data main: Channel.fromPath('files/import-data/rfam/select-families.sql') | set { family_sql } - Channel.fromPath('files/import-data/rfam/select-families.sql') | set { family_sql } + Channel.fromPath('files/import-data/rfam/families.sql') | set { info_sql } Channel.fromPath('files/import-data/rfam/sequences.sql') | set { sequence_sql } info_sql | fetch_families_info | set { info } diff --git a/workflows/databases/select.nf b/workflows/databases/select.nf new file mode 100644 index 000000000..b9a36fa84 --- /dev/null +++ b/workflows/databases/select.nf @@ -0,0 +1,63 @@ +nextflow.enable.dsl=2 + + + +process check_db_md5 { + container '' + + input: + tuple val(db_name), val(remote) + + output: + path("*.csv") + + + """ + wget -O target_file $remote + echo -n "$db_name," >> latest_md5s.csv && md5sum target_file | awk 'BEGIN {fs="[ ]"}; {print \$1}' >> latest_md5s.csv + """ +} + + +process make_selection { + publishDir "$projectDir" + + input: + path latest_md5s + + output: + path ("*.config") + path ("$latest_md5s") + + """ + rnac scan-imports select-for-import $latest_md5s + """ + +} + + +process update_tracker_table { + input: + path latest_md5s + + """ + rnac scan-imports update-tracker $latest_md5s + """ +} + + +workflow select { + + Channel.fromPath(params.import_selection_remotes) \ + | splitCsv + | map { row -> tuple(row[0], row[1])} + | check_db_md5 + | collectFile + | ( make_selection & update_tracker_table ) + +} + + +workflow { + select() +} diff --git a/workflows/databases/silva.nf b/workflows/databases/silva.nf index 7407631c2..91e2b258c 100644 --- a/workflows/databases/silva.nf +++ b/workflows/databases/silva.nf @@ -5,7 +5,7 @@ process fetch { path('*.rnac') """ - wget $params.databases.silva.remote + wget -e robots=off -nH -r --cut-dirs 3 --no-parent -A "SILVA_*Parc.rnac.gz" $params.databases.silva.remote gzip -d *.gz """ } diff --git a/workflows/databases/zwd.nf b/workflows/databases/zwd.nf index fcd3f8bfd..a6ab0b1e7 100644 --- a/workflows/databases/zwd.nf +++ b/workflows/databases/zwd.nf @@ -8,7 +8,7 @@ process zwd { path('*.csv') """ - cp $params.databases.zwd.remote zwd.json + wget -O zwd.json $params.databases.zwd.remote rnac zwd parse $context zwd.json . """ } diff --git a/workflows/export/sequence-search.nf b/workflows/export/sequence-search.nf index 1ada4cdee..90fda0e2a 100755 --- a/workflows/export/sequence-search.nf +++ b/workflows/export/sequence-search.nf @@ -23,7 +23,7 @@ process query_database { maxForks params.export.sequence_search.max_forks input: - tuple val(name), path(query), val(partition) + tuple val(name), path(query), val(partition) output: tuple val(name), path('raw.json') @@ -64,6 +64,7 @@ process create_fasta { process atomic_publish { stageInMode 'copy' + queue 'datamover' input: path(fasta) diff --git a/workflows/export/text-search.nf b/workflows/export/text-search.nf index 991c87ede..7f6e5c3f8 100755 --- a/workflows/export/text-search.nf +++ b/workflows/export/text-search.nf @@ -23,7 +23,7 @@ process create_release_note { // At this point we should be able to safely move data into the final location. 
// This deletes the old data and then moves the new data in place. process atomic_publish { - container '' + queue 'datamover' input: path('release_note.txt') diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index 451cd8b09..9c308cb11 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -90,9 +90,9 @@ process build_ranges { process fetch_accession { tag { "$min-$max" } maxForks 3 - time '10m' errorStrategy 'retry' maxRetries 5 + container '' input: tuple val(min), val(max), path(sql), val(_flag) @@ -114,15 +114,16 @@ process text_mining_query { input: val(max_count) path(script) + container '' output: - path("text-mining.json") + path("publication-count.json") """ curl "$params.export.search.text_mining" > counts.csv psql -v ON_ERROR_STOP=1 -c "\\copy search_export_publication_counts from 'counts.csv'" "$PGDATABASE" psql -v ON_ERROR_STOP=1 -f "$script" "$PGDATABASE" > raw.json - search-export group text-mining raw.json ${max_count} text-mining.json + search-export group publication-count raw.json ${max_count} publication-count.json """ } diff --git a/workflows/load-data.nf b/workflows/load-data.nf index 298e0a159..874be7c1a 100644 --- a/workflows/load-data.nf +++ b/workflows/load-data.nf @@ -14,7 +14,7 @@ process create_load_tables { process merge_and_import { tag { name } - memory 3.GB + memory 9.GB maxForks 2 containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" diff --git a/workflows/lookup-references.nf b/workflows/lookup-references.nf index cbd970355..010ae1cfa 100644 --- a/workflows/lookup-references.nf +++ b/workflows/lookup-references.nf @@ -16,12 +16,15 @@ process merge_and_split_all_publications { process fetch_publications { when { params.needs_publications } + queue 'datamover' + executor 'lsf' + container '' output: path('out') """ - curl -L http://europepmc.org/ftp/pmclitemetadata/PMCLiteMetadata.tgz > PMCLiteMetadata.tgz + cp /nfs/ftp/public/databases/pmc/PMCLiteMetadata/PMCLiteMetadata.tgz . 
tar xvf PMCLiteMetadata.tgz """ } diff --git a/workflows/metadata/taxonomy.nf b/workflows/metadata/taxonomy.nf index d0583923e..ca432235c 100644 --- a/workflows/metadata/taxonomy.nf +++ b/workflows/metadata/taxonomy.nf @@ -1,11 +1,14 @@ process taxonomy { memory '2GB' + errorStrategy 'retry' output: path('*.csv') """ - wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz + wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz + wget https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5 + md5sum -c new_taxdump.tar.gz.md5 tar xvf new_taxdump.tar.gz mkdir taxdump mv *.dmp taxdump diff --git a/workflows/parse-databases.nf b/workflows/parse-databases.nf index cd02d2c9e..daf7d709a 100644 --- a/workflows/parse-databases.nf +++ b/workflows/parse-databases.nf @@ -1,6 +1,7 @@ include { crw } from './databases/crw' include { ena } from './databases/ena' include { ensembl } from './databases/ensembl' +include { expressionatlas } from './databases/expressionatlas' include { five_s_rrnadb } from './databases/5srrnadb' include { flybase } from './databases/flybase' include { genecards_suite } from './databases/genecards_suite' @@ -14,6 +15,7 @@ include { mirbase } from './databases/mirbase' include { mirgenedb } from './databases/mirgenedb' include { pdbe } from './databases/pdbe' include { pirbase } from './databases/pirbase' +include { plncdb } from './databases/plncdb' include { pombase } from './databases/pombase' include { psicquic } from './databases/psicquic' include { quickgo } from './databases/quickgo' @@ -57,6 +59,7 @@ workflow parse_databases { five_s_rrnadb(), ena(), ensembl(), + expressionatlas(), flybase(), genecards_suite(), gtrnadb(context), @@ -69,6 +72,7 @@ workflow parse_databases { mirgenedb(), pdbe(), pirbase(), + plncdb(), pombase(), psicquic(), quickgo(), diff --git a/workflows/precompute/build_urs_table.nf b/workflows/precompute/build_urs_table.nf index 807490c2c..18cda328f 100644 --- a/workflows/precompute/build_urs_table.nf +++ b/workflows/precompute/build_urs_table.nf @@ -29,6 +29,8 @@ process fetch_all_urs_taxid { process select_outdated { containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" + memory '24 GB' + cpus 4 input: path('xref.csv') @@ -96,7 +98,8 @@ process sort_ids { process xref_releases { input: - tuple val(_flag), file(query) + tuple val(_flag) + file(query) output: path('data.csv') @@ -106,9 +109,10 @@ process xref_releases { """ } -process fetch_release_info { +process precompute_releases { input: - tuple val(_flag), file(query) + val(_flag) + file(query) output: path('data.csv') @@ -158,7 +162,7 @@ workflow using_ids { flag \ | map { _flag -> file(params.precompute.select.id_file) } \ - | set { id_files } + | set { id_files } sort_ids(flag, id_files) | set { selected } } diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index da8f3082d..3be666626 100644 --- a/workflows/r2dt.nf +++ b/workflows/r2dt.nf @@ -58,7 +58,7 @@ process layout_sequences { tag { "${sequences}" } memory params.r2dt.layout.memory container params.r2dt.container - containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms" + containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms" errorStrategy { task.exitStatus = 130 ? 'ignore' : 'terminate' } input: @@ -75,8 +75,9 @@ process layout_sequences { process publish_layout { maxForks 50 - errorStrategy { task.attempt < 5 ? "retry" : "finish" } + errorStrategy { task.attempt < 5 ? 
"retry" : "ignore" } maxRetries 5 + queue 'datamover' input: tuple path(sequences), path(output), path(mapping) @@ -94,6 +95,7 @@ process publish_layout { process parse_layout { input: tuple path(sequences), path(to_parse), path(mapping) + errorStrategy "ignore" output: path "data.csv", emit: data @@ -142,10 +144,10 @@ workflow common { workflow for_database { take: sequences - emit: + emit: parsed layouts - main: + main: common | set { model_mapping } sequences \ @@ -183,7 +185,7 @@ workflow r2dt { | set { data } data | publish_layout - data | parse_layout + data | parse_layout parse_layout.out.data | collect | set { data } parse_layout.out.attempted | collect | set { attempted } diff --git a/workflows/references/manually_annotated/query.sql b/workflows/references/manually_annotated/query.sql new file mode 100644 index 000000000..fdde38275 --- /dev/null +++ b/workflows/references/manually_annotated/query.sql @@ -0,0 +1,17 @@ +select + xref.upi || '_' || xref.taxid, + -- acc.accession, + acc."database", + refs.pmid, + refs.doi, + refs.pmcid + -- refs.epmcid +from rnc_accessions acc +join xref +on xref.ac = acc.accession +join rnc_reference_map rmap on rmap.accession = acc.accession +join rnc_references refs on refs.id = rmap.reference_id +where + xref.dbid in (24, 20, 14, 16, 18, 23, 27, 44, 48) + and xref.deleted = 'N' +; diff --git a/workflows/references/queries/ensembl_gene.sql b/workflows/references/queries/ensembl.sql similarity index 82% rename from workflows/references/queries/ensembl_gene.sql rename to workflows/references/queries/ensembl.sql index 20592eee3..a9c194b30 100644 --- a/workflows/references/queries/ensembl_gene.sql +++ b/workflows/references/queries/ensembl.sql @@ -1,9 +1,10 @@ -- ENSEMBL select - gene, -- Also search for everything up to the first '.' - external_id, upi, - taxid + taxid, + external_id, + gene, -- Also search for everything up to the first '.' + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_gencode_gene.sql b/workflows/references/queries/ensembl_gencode.sql similarity index 90% rename from workflows/references/queries/ensembl_gencode_gene.sql rename to workflows/references/queries/ensembl_gencode.sql index ed7432395..b4c63180f 100644 --- a/workflows/references/queries/ensembl_gencode_gene.sql +++ b/workflows/references/queries/ensembl_gencode.sql @@ -1,9 +1,10 @@ -- ENSEMBL_GENCODE select - gene, -- Also search for everything up to the first '.' - external_id, upi, - taxid + taxid, + external_id, + gene, -- Also search for everything up to the first '.' 
+ locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_gencode_locus_tag.sql b/workflows/references/queries/ensembl_gencode_locus_tag.sql deleted file mode 100644 index 746306912..000000000 --- a/workflows/references/queries/ensembl_gencode_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL_GENCODE -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_GENCODE') -; diff --git a/workflows/references/queries/ensembl_locus_tag.sql b/workflows/references/queries/ensembl_locus_tag.sql deleted file mode 100644 index 7ad75b2ae..000000000 --- a/workflows/references/queries/ensembl_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL') -; diff --git a/workflows/references/queries/ensembl_metazoa_gene.sql b/workflows/references/queries/ensembl_metazoa.sql similarity index 90% rename from workflows/references/queries/ensembl_metazoa_gene.sql rename to workflows/references/queries/ensembl_metazoa.sql index 16d9619e3..c9fa76c9a 100644 --- a/workflows/references/queries/ensembl_metazoa_gene.sql +++ b/workflows/references/queries/ensembl_metazoa.sql @@ -1,9 +1,10 @@ -- ENSEMBL METAZOA select - gene, -- Also search for everything up to the first '.' - external_id, upi, - taxid + taxid, + external_id, + gene, -- Also search for everything up to the first '.' + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_metazoa_locus_tag.sql b/workflows/references/queries/ensembl_metazoa_locus_tag.sql deleted file mode 100644 index ee1f7286e..000000000 --- a/workflows/references/queries/ensembl_metazoa_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL METAZOA -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_METAZOA') -; diff --git a/workflows/references/queries/ensembl_plants_gene.sql b/workflows/references/queries/ensembl_plants.sql similarity index 88% rename from workflows/references/queries/ensembl_plants_gene.sql rename to workflows/references/queries/ensembl_plants.sql index 2ed18276e..8078e5216 100644 --- a/workflows/references/queries/ensembl_plants_gene.sql +++ b/workflows/references/queries/ensembl_plants.sql @@ -1,9 +1,10 @@ -- ENSEMBL PLANTS select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_plants_locus_tag.sql b/workflows/references/queries/ensembl_plants_locus_tag.sql deleted file mode 100644 index ea2c1f3b7..000000000 --- a/workflows/references/queries/ensembl_plants_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL PLANTS -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_PLANTS') -; diff --git a/workflows/references/queries/ensembl_protists_gene.sql b/workflows/references/queries/ensembl_protists.sql similarity index 88% rename from workflows/references/queries/ensembl_protists_gene.sql rename to workflows/references/queries/ensembl_protists.sql index c0ec32cc1..332e924a8 100644 --- a/workflows/references/queries/ensembl_protists_gene.sql 
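The renames above fold each database's separate `*_gene` and `*_locus_tag` queries into one file that returns both columns. Reconstructed for illustration from the hunks above (this is not a new file in the patch), the consolidated ENSEMBL query has this shape:

```sql
-- ENSEMBL (illustrative reconstruction of the renamed ensembl.sql)
select
  upi,
  taxid,
  external_id,
  gene,      -- Also search for everything up to the first '.'
  locus_tag
from xref x
join rnc_accessions ra
on
  ra.accession = x.ac
where
  x.deleted = 'N'
  and ra."database" in ('ENSEMBL')
;
```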
+++ b/workflows/references/queries/ensembl_protists.sql @@ -1,9 +1,10 @@ -- ENSEMBL PROTISTS select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_protists_locus_tag.sql b/workflows/references/queries/ensembl_protists_locus_tag.sql deleted file mode 100644 index 1b12b1e55..000000000 --- a/workflows/references/queries/ensembl_protists_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL PROTISTS -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_PROTISTS') -; diff --git a/workflows/references/queries/flybase_gene_synonym.sql b/workflows/references/queries/flybase.sql similarity index 88% rename from workflows/references/queries/flybase_gene_synonym.sql rename to workflows/references/queries/flybase.sql index 41823a045..735010c39 100644 --- a/workflows/references/queries/flybase_gene_synonym.sql +++ b/workflows/references/queries/flybase.sql @@ -1,9 +1,10 @@ -- Flybase select - gene_synonym, -- Split on , - external_id, upi, - taxid + taxid, + external_id, + gene_synonym, -- Split on , + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/flybase_locus_tag.sql b/workflows/references/queries/flybase_locus_tag.sql deleted file mode 100644 index 261af9de8..000000000 --- a/workflows/references/queries/flybase_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- Flybase -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" = 'FLYBASE' -; diff --git a/workflows/references/queries/genecards.sql b/workflows/references/queries/genecards.sql index 62c1d15e4..765e01045 100644 --- a/workflows/references/queries/genecards.sql +++ b/workflows/references/queries/genecards.sql @@ -1,8 +1,8 @@ -- GENECARDS select - gene, upi, - taxid + taxid, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/gtrnadb.sql b/workflows/references/queries/gtrnadb.sql index dfc535087..eb0ab6800 100644 --- a/workflows/references/queries/gtrnadb.sql +++ b/workflows/references/queries/gtrnadb.sql @@ -1,8 +1,8 @@ -- GTRNADB select - gene, upi, - taxid + taxid, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/hgnc_accession.sql b/workflows/references/queries/hgnc.sql similarity index 80% rename from workflows/references/queries/hgnc_accession.sql rename to workflows/references/queries/hgnc.sql index a8eba7ccc..9a3fdf0c6 100644 --- a/workflows/references/queries/hgnc_accession.sql +++ b/workflows/references/queries/hgnc.sql @@ -1,9 +1,10 @@ -- HGNC select - accession, - gene, upi, - taxid + taxid, + gene, + accession, + gene_synonym from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/hgnc_gene_synonym.sql b/workflows/references/queries/hgnc_gene_synonym.sql deleted file mode 100644 index 142752c31..000000000 --- a/workflows/references/queries/hgnc_gene_synonym.sql +++ /dev/null @@ -1,14 +0,0 @@ --- HGNC -select - gene_synonym, - gene, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" = 'HGNC' -; diff --git a/workflows/references/queries/mirbase.sql b/workflows/references/queries/mirbase.sql index 7530b38f8..0d804c00e 100644 --- a/workflows/references/queries/mirbase.sql +++ 
b/workflows/references/queries/mirbase.sql @@ -1,9 +1,9 @@ -- MIRBASE -select - optional_id, - external_id, +select upi, - taxid + taxid, + external_id, + optional_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/mirgenedb.sql b/workflows/references/queries/mirgenedb.sql index 98d372191..f558be54a 100644 --- a/workflows/references/queries/mirgenedb.sql +++ b/workflows/references/queries/mirgenedb.sql @@ -1,8 +1,8 @@ -- MIRGENEDB select - external_id, upi, - taxid + taxid, + external_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/pdbe.sql b/workflows/references/queries/pdbe.sql index e53d315ec..eca4b6644 100644 --- a/workflows/references/queries/pdbe.sql +++ b/workflows/references/queries/pdbe.sql @@ -1,8 +1,8 @@ -- PDBE select - external_id, upi, - taxid + taxid, + external_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/pombase_gene.sql b/workflows/references/queries/pombase.sql similarity index 79% rename from workflows/references/queries/pombase_gene.sql rename to workflows/references/queries/pombase.sql index 211c45232..5e868e181 100644 --- a/workflows/references/queries/pombase_gene.sql +++ b/workflows/references/queries/pombase.sql @@ -1,9 +1,10 @@ -- POMBASE select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + gene_synonym -- SPlit on ',' from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/pombase_gene_synonym.sql b/workflows/references/queries/pombase_gene_synonym.sql deleted file mode 100644 index ed9562602..000000000 --- a/workflows/references/queries/pombase_gene_synonym.sql +++ /dev/null @@ -1,14 +0,0 @@ --- POMBASE -select - gene_synonym, -- SPlit on ',' - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('POMBASE') -; diff --git a/workflows/references/queries/refseq_gene.sql b/workflows/references/queries/refseq.sql similarity index 66% rename from workflows/references/queries/refseq_gene.sql rename to workflows/references/queries/refseq.sql index 8dbf153ac..8dc4cc9d0 100644 --- a/workflows/references/queries/refseq_gene.sql +++ b/workflows/references/queries/refseq.sql @@ -1,9 +1,11 @@ -- REFSEQ select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + gene_synonym, -- Split on ',' + optional_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/refseq_gene_synonym.sql b/workflows/references/queries/refseq_gene_synonym.sql deleted file mode 100644 index a60ac7e4a..000000000 --- a/workflows/references/queries/refseq_gene_synonym.sql +++ /dev/null @@ -1,14 +0,0 @@ --- REFSEQ -select - gene_synonym, -- Split on ',' - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('REFSEQ') -; diff --git a/workflows/references/queries/refseq_optional_id.sql b/workflows/references/queries/refseq_optional_id.sql deleted file mode 100644 index 84287782c..000000000 --- a/workflows/references/queries/refseq_optional_id.sql +++ /dev/null @@ -1,14 +0,0 @@ --- REFSEQ -select - optional_id, -- Do not split - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('REFSEQ') -; diff --git a/workflows/references/queries/rfam.sql b/workflows/references/queries/rfam.sql index f1f2b319c..83ba1406f 100644 --- a/workflows/references/queries/rfam.sql 
+++ b/workflows/references/queries/rfam.sql @@ -1,9 +1,9 @@ -- Rfam select - optional_id, - external_id, upi, - taxid + taxid, + external_id, + optional_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/sgd.sql b/workflows/references/queries/sgd.sql index 5a2674ae1..5034ff5f0 100644 --- a/workflows/references/queries/sgd.sql +++ b/workflows/references/queries/sgd.sql @@ -1,8 +1,8 @@ -- SGD select - external_id, upi, - taxid + taxid, + external_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/tair.sql b/workflows/references/queries/tair.sql index de6676337..a21438632 100644 --- a/workflows/references/queries/tair.sql +++ b/workflows/references/queries/tair.sql @@ -1,9 +1,9 @@ -- TAIR select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/wormbase.sql b/workflows/references/queries/wormbase.sql index 790ee75b3..275337074 100644 --- a/workflows/references/queries/wormbase.sql +++ b/workflows/references/queries/wormbase.sql @@ -1,9 +1,10 @@ -- WORMBASE select - optional_id, - external_id, upi, - taxid + taxid, + external_id, + optional_id, + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/wormbase_locus_tag.sql b/workflows/references/queries/wormbase_locus_tag.sql deleted file mode 100644 index cd6d69dd7..000000000 --- a/workflows/references/queries/wormbase_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- WORMBASE -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('WORMBASE') -; diff --git a/workflows/references/queries/zfin.sql b/workflows/references/queries/zfin.sql index 3cfdd6e51..3d03623db 100644 --- a/workflows/references/queries/zfin.sql +++ b/workflows/references/queries/zfin.sql @@ -1,9 +1,9 @@ -- ZFIN select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/submit/ensembl_gencode_ids.txt b/workflows/references/submit/ensembl_gencode_ids.txt new file mode 100644 index 000000000..d7809e737 --- /dev/null +++ b/workflows/references/submit/ensembl_gencode_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e54218795a93e36c7bb522f430235c1e0e80561712c4db4d612c782919e4c6 +size 2954775 diff --git a/workflows/references/submit/ensembl_ids.txt b/workflows/references/submit/ensembl_ids.txt new file mode 100644 index 000000000..81b662657 --- /dev/null +++ b/workflows/references/submit/ensembl_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd35397a3c3036fe909568f273c0d22c5073141501ac3a5871e9085a996fc46 +size 77758708 diff --git a/workflows/references/submit/ensembl_metazoa_ids.txt b/workflows/references/submit/ensembl_metazoa_ids.txt new file mode 100644 index 000000000..0163a44ae --- /dev/null +++ b/workflows/references/submit/ensembl_metazoa_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec48f51e3650751a4735e169eabd213bfce86fef1be8cb132b2e18937ff0ba8 +size 5163520 diff --git a/workflows/references/submit/ensembl_plants_ids.txt b/workflows/references/submit/ensembl_plants_ids.txt new file mode 100644 index 000000000..fd8ebc36b --- /dev/null +++ b/workflows/references/submit/ensembl_plants_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7fd63f1099cdf76fa152329273fed50a19b7b5e1e04f58a500d51501660dd7f5 +size 5432399 diff --git a/workflows/references/submit/ensembl_protists_ids.txt b/workflows/references/submit/ensembl_protists_ids.txt new file mode 100644 index 000000000..6b8d3f4e3 --- /dev/null +++ b/workflows/references/submit/ensembl_protists_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aadc96282b91ecb4a5d6acd0dbbc9429f455fe67fa6a0d732dff5f3b2070a9d +size 544884 diff --git a/workflows/references/submit/flybase_ids.txt b/workflows/references/submit/flybase_ids.txt new file mode 100644 index 000000000..eb17f7f0f --- /dev/null +++ b/workflows/references/submit/flybase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d59a992feb5fe136d35a5aa257607fbcbf54f2f1091e885158a1292cb3ff488 +size 253169 diff --git a/workflows/references/submit/genecards_ids.txt b/workflows/references/submit/genecards_ids.txt new file mode 100644 index 000000000..982241102 --- /dev/null +++ b/workflows/references/submit/genecards_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce18cd50752c9630b709b4682e0981a6eb84ff02083ab66823db3e99a885c55 +size 12648409 diff --git a/workflows/references/submit/gtrnadb_ids.txt b/workflows/references/submit/gtrnadb_ids.txt new file mode 100644 index 000000000..9db72c482 --- /dev/null +++ b/workflows/references/submit/gtrnadb_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58bd8991d16966046140cc57f884da3050286b0510f1abff4b2acca783ab6bc7 +size 5291180 diff --git a/workflows/references/submit/hgnc_ids.txt b/workflows/references/submit/hgnc_ids.txt new file mode 100644 index 000000000..1582c2b80 --- /dev/null +++ b/workflows/references/submit/hgnc_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6decd826f276b1b43d19c23341fb1a30f35ba86a19db05692b0d3195e6b478 +size 79668 diff --git a/workflows/references/submit/mirbase_ids.txt b/workflows/references/submit/mirbase_ids.txt new file mode 100644 index 000000000..bb2816ec1 --- /dev/null +++ b/workflows/references/submit/mirbase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384472d7ce2e65e9e693572341b6d7bd8258b6f1cd626f4799835ee4276ee008 +size 3829708 diff --git a/workflows/references/submit/mirgenedb_ids.txt b/workflows/references/submit/mirgenedb_ids.txt new file mode 100644 index 000000000..d1bdada0c --- /dev/null +++ b/workflows/references/submit/mirgenedb_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da77cb18cdbb27b64b9a174f004436391d446921b01c0b5d188f97b2c5d8dd97 +size 1209980 diff --git a/workflows/references/submit/pdbe_ids.txt b/workflows/references/submit/pdbe_ids.txt new file mode 100644 index 000000000..99950072f --- /dev/null +++ b/workflows/references/submit/pdbe_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8625ddfbdeccf32a95629d71f6fa4057e932b0062b1baa9589c7aaf971febc10 +size 98027 diff --git a/workflows/references/submit/pombase_ids.txt b/workflows/references/submit/pombase_ids.txt new file mode 100644 index 000000000..5b54cd100 --- /dev/null +++ b/workflows/references/submit/pombase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfae65f806baeb9cd5e45aff4c10c3949909e40289d41ed403a37b3aef2ee4bc +size 373012 diff --git a/workflows/references/submit/refseq_ids.txt b/workflows/references/submit/refseq_ids.txt new file mode 100644 index 000000000..536cfae9a --- 
/dev/null +++ b/workflows/references/submit/refseq_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d874ce9eb8fbc20bd251a54f344f3a590cd3fedc0d6f4a5bd801da3074d6374d +size 2902887 diff --git a/workflows/references/submit/rfam_ids.txt b/workflows/references/submit/rfam_ids.txt new file mode 100644 index 000000000..e84a2fe53 --- /dev/null +++ b/workflows/references/submit/rfam_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a9a091da295c6f9ed877e6990473b6878d9307ecdef840b4f947cc028b051f +size 43048703 diff --git a/workflows/references/submit/sgd_ids.txt b/workflows/references/submit/sgd_ids.txt new file mode 100644 index 000000000..5d38566a7 --- /dev/null +++ b/workflows/references/submit/sgd_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6886370d74b311a808323eb08eaedbd674c19d801b9e77aaf421fe229a4342e +size 9182 diff --git a/workflows/references/submit/tair_ids.txt b/workflows/references/submit/tair_ids.txt new file mode 100644 index 000000000..782ea8393 --- /dev/null +++ b/workflows/references/submit/tair_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b7561d9883f16bb8fea61d4652e3a22b946b7a9411ece4ccce312c41242a354 +size 190481 diff --git a/workflows/references/submit/wormbase_ids.txt b/workflows/references/submit/wormbase_ids.txt new file mode 100644 index 000000000..998d179e5 --- /dev/null +++ b/workflows/references/submit/wormbase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a22355e705541c86481b4efe7865ea0942a39722ca9f389d1f5215c22d00240 +size 1537664 diff --git a/workflows/references/submit/zfin_ids.txt b/workflows/references/submit/zfin_ids.txt new file mode 100644 index 000000000..d3aada3da --- /dev/null +++ b/workflows/references/submit/zfin_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a0f93922a5d5005c7c9889a2d03096ec51f389e200a809e1a3ac66dad8f7a4 +size 65577 diff --git a/workflows/rfam-scan.nf b/workflows/rfam-scan.nf index 25e775221..ddd392b9f 100644 --- a/workflows/rfam-scan.nf +++ b/workflows/rfam-scan.nf @@ -24,9 +24,10 @@ process generate_files { } process sequences { - memory '20GB' + memory '4GB' + queue short containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" - clusterOptions '-R "rusage[scratch=4000]"' + // clusterOptions '-R "rusage[scratch=4000]"' input: tuple path(version), path(active_xrefs), path(computed), path(compute_missing) @@ -50,6 +51,7 @@ process scan { memory { params.rfam.memory * params.rfam.cpus } errorStrategy 'ignore' containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" + queue 'short' input: tuple path(version), path('sequences.fasta'), path(cm_files) diff --git a/workflows/utils/slack.nf b/workflows/utils/slack.nf new file mode 100644 index 000000000..06a15ef5e --- /dev/null +++ b/workflows/utils/slack.nf @@ -0,0 +1,47 @@ +process slack_message { + + input: + val(message) + + """ + rnac notify step "Import Workflow" "$message" + """ + +} + + +process slack_file { + + input: + path(message) + + """ + rnac notify file "$message" + """ + +} + + +import groovy.json.JsonSlurper + +// A groovy function for use in closures - uses groovy's own URL class to make the request +def slack_closure(msg) { + def configFile = new File("secrets.json"); + def config = new JsonSlurper().parseFile(configFile, 'UTF-8'); + + def post = new URL(config.SLACK_WEBHOOK).openConnection(); + post.setRequestMethod("POST") + 
post.setDoOutput(true); + post.setRequestProperty("Content-Type", "application/json"); + + def payload = "{\"text\" : \"$msg\" }" + + + post.getOutputStream().write(payload.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (postRC != 200) { + println("Something went wrong calling slack webhook!"); + // getInputStream() throws for HTTP error responses; read the error stream instead + println(post.getErrorStream()?.getText()); + } + +}
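`slack_closure` is written to be called from Groovy closures rather than from a process. An illustrative hook-up, assuming `slack.nf` is included where the handler is defined and `secrets.json` sits in the launch directory:

```groovy
// Sketch only: wiring slack_closure into Nextflow's workflow event handlers.
workflow.onComplete {
    def status = workflow.success ? 'finished OK' : 'failed'
    slack_closure("Weekly import ${status} after ${workflow.duration}")
}

workflow.onError {
    slack_closure("Weekly import errored: ${workflow.errorMessage}")
}
```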