diff --git a/.editorconfig b/.editorconfig index d898f5a49..244512988 100644 --- a/.editorconfig +++ b/.editorconfig @@ -34,3 +34,6 @@ indent_size = 2 [*.yaml] indent_style = space indent_size = 2 + +[*.nf] +indent_size = 2 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..4ddcc6c3b --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +workflows/references/submit/*.txt filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 000000000..7c649ad26 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,75 @@ +# This workflow will build and push the import pipeline container. +# The plan later will be to include unit tests as well + + +name: Building Pipeline Containers + +on: + push: + branches: + 'dev' +jobs: + + starting-notification: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Initial notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_MESSAGE: 'Creating new pipeline image in docker hub' + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + MSG_MINIMAL: true + + create-docker-image: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: docker login + env: + DOCKER_USER: ${{ secrets.DOCKER_USER }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD + + - name: docker build + run: docker build -f Dockerfile -t rnacentral/rnacentral-import-pipeline . + + - name: docker push + run: docker push rnacentral/rnacentral-import-pipeline + + finished-notification: + needs: + - create-docker-image + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Finished notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_MESSAGE: 'New pipeline image pushed to docker hub' + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + MSG_MINIMAL: true + + singularity-conversion: + needs: + - create-docker-image + uses: rnacentral/rnacentral-import-pipeline/.github/workflows/singularity.yaml@dev + secrets: inherit + + + finished-singularity: + needs: + - singularity-conversion + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Finished notification + uses: rtCamp/action-slack-notify@v2 + env: + SLACK_MESSAGE: 'New singularity image pushed to ghcr' + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + MSG_MINIMAL: true diff --git a/.github/workflows/singularity.yaml b/.github/workflows/singularity.yaml new file mode 100644 index 000000000..e5630c91d --- /dev/null +++ b/.github/workflows/singularity.yaml @@ -0,0 +1,25 @@ +# This workflow runs the conversion to singularity and stores the result in the +# ghcr so we can pull it more easily + +name: Singularity Build +on: workflow_call + + +jobs: + run_conversion: + name: "Pull docker image and convert" + runs-on: ubuntu-latest + + container: + image: quay.io/singularity/singularity:v3.8.1 + options: --privileged + + steps: + - name: "Pull image" + run: | + singularity pull --name rnacentral-rnacentral-import-pipeline-latest.sif docker://rnacentral/rnacentral-import-pipeline:latest + + - name: "Push to ghcr" + run: | + echo ${{ secrets.GITHUB_TOKEN }} | singularity remote login -u ${{ secrets.GHCR_USERNAME }} --password-stdin oras://ghcr.io + singularity push rnacentral-rnacentral-import-pipeline-latest.sif oras://ghcr.io/${GITHUB_REPOSITORY}:latest diff --git a/.gitignore b/.gitignore index e8ae80863..0dbec5bae 100644 --- a/.gitignore +++ b/.gitignore @@ -101,3 +101,8 @@ stubs .envrc workflows/references/results
workflows/references/metadata +workflows/references/backup +workflows/references/submit/previous-release +workflows/references/manually_annotated/from* +workflows/references/manually_annotated/results +singularity/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c17f67002..4e648edb7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,28 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - repo: https://github.com/psf/black - rev: 19.3b0 + rev: 22.6.0 hooks: - id: black +- repo: https://github.com/pycqa/isort + rev: 5.10.1 + hooks: + - id: isort + args: ["--profile", "black", "--filter-files"] + name: isort (python) +# - repo: https://github.com/doublify/pre-commit-rust +# rev: v1.0 +# hooks: +# - id: fmt +# - id: cargo-check +# - id: clippy +- repo: https://github.com/python-poetry/poetry + rev: '1.2.0rc1' + hooks: + - id: poetry-check + # - id: poetry-lock diff --git a/Dockerfile b/Dockerfile index 942e3e149..d223fba72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7-buster +FROM python:3.8-buster ENV RNA /rna @@ -46,6 +46,7 @@ RUN apt-get install -y \ unzip \ wget + # Install Infernal RUN \ cd $RNA/ && \ @@ -94,6 +95,7 @@ RUN pip3 install -r $RNACENTRAL_IMPORT_PIPELINE/requirements.txt RUN python3 -m textblob.download_corpora + WORKDIR / COPY openssl/openssl.cnf /etc/ssl/ diff --git a/Makefile b/Makefile index 48a766ea7..9900316cd 100644 --- a/Makefile +++ b/Makefile @@ -13,13 +13,25 @@ requirements-dev.txt: requirements-dev.in rust: cargo build --release - cp target/release/json2fasta bin - cp target/release/split-ena bin - cp target/release/expand-urs bin - cp target/release/precompute bin - cp target/release/search-export bin - cp target/release/ftp-export bin - cp target/release/json2dfasta bin + mv -f target/release/json2fasta bin + mv -f target/release/split-ena bin + mv -f target/release/expand-urs bin + mv -f target/release/precompute bin + mv -f target/release/search-export bin + mv -f target/release/ftp-export bin + mv -f target/release/json2dfasta bin + mv -f target/release/expression-parse bin + +clean: + rm bin/json2fasta + rm bin/split-ena + rm bin/expand-urs + rm bin/precompute + rm bin/search-export + rm bin/ftp-export + rm bin/json2dfasta + rm bin/expression-parse + cargo clean docker: Dockerfile requirements.txt .dockerignore docker build -t "$(docker)" . 
diff --git a/analyze.nf b/analyze.nf index 526698378..66b777ace 100755 --- a/analyze.nf +++ b/analyze.nf @@ -7,13 +7,29 @@ include { genome_mapping } from './workflows/genome-mapping' include { r2dt } from './workflows/r2dt' include { rfam_scan } from './workflows/rfam-scan' +include { slack_closure } from './workflows/utils/slack' +include { slack_message } from './workflows/utils/slack' + workflow analyze { take: ready emit: done main: + Channel.of("Starting analyze pipeline") | slack_message ready | (genome_mapping & rfam_scan & r2dt & cpat) | mix | collect | set { done } } workflow { analyze(Channel.of('ready')) } + + +workflow.onComplete { + slack_closure("Analyze workflow completed") + +} + +workflow.onError { + + slack_closure("Analyze workflow hit an error and crashed") + +} diff --git a/bin/check_ids.py b/bin/check_ids.py index 6aba3b8e2..7d47b0d71 100755 --- a/bin/check_ids.py +++ b/bin/check_ids.py @@ -26,10 +26,11 @@ words.update(ignore_ids) special_char = re.compile('[@!#$%^&()<>?/\[\]\'}{~:]') nts = re.compile('^[acgu]+$') +numbers_and_dash = re.compile('^\d+[\-]\d+$') # do not use ids like 6-1, 260-1, etc def check_id(item): - if item.isnumeric() or item.lower() in words: + if item.isnumeric() or item.lower() in words or numbers_and_dash.search(item): result = None elif len(item) > 2 and not special_char.search(item) and not nts.search(item.lower()) and "\\" not in item: result = item @@ -47,55 +48,72 @@ def main(database, filename, output): """ Check ids and create file that will be used by RNAcentral-references. """ - remove_dot = ["ensembl_gene", "ensembl_gencode_gene", "ensembl_metazoa_gene"] - split_on_comma = ["flybase_gene_synonym", "pombase_gene_synonym", "refseq_gene_synonym", "hgnc_gene_synonym"] + remove_dot = ["ensembl", "ensembl_gencode", "ensembl_metazoa"] + split_on_comma = ["flybase", "hgnc", "pombase", "refseq"] + rfam_ignore = [ + "30_255", "30_292", "5S_rRNA", "5_8S_rRNA", "6A", "6S", "7SK", "C4", "CRISPR-DR10", "CRISPR-DR11", + "CRISPR-DR12", "CRISPR-DR13", "CRISPR-DR14", "CRISPR-DR15", "CRISPR-DR16", "CRISPR-DR17", "CRISPR-DR18", + "CRISPR-DR19", "CRISPR-DR2", "CRISPR-DR20", "CRISPR-DR21", "CRISPR-DR22", "CRISPR-DR23", "CRISPR-DR24", + "CRISPR-DR25", "CRISPR-DR26", "CRISPR-DR27", "CRISPR-DR28", "CRISPR-DR29", "CRISPR-DR3", "CRISPR-DR30", + "CRISPR-DR31", "CRISPR-DR32", "CRISPR-DR33", "CRISPR-DR34", "CRISPR-DR35", "CRISPR-DR36", "CRISPR-DR37", + "CRISPR-DR38", "CRISPR-DR39", "CRISPR-DR4", "CRISPR-DR40", "CRISPR-DR41", "CRISPR-DR42", "CRISPR-DR43", + "CRISPR-DR44", "CRISPR-DR45", "CRISPR-DR46", "CRISPR-DR47", "CRISPR-DR48", "CRISPR-DR49", "CRISPR-DR5", + "CRISPR-DR50", "CRISPR-DR51", "CRISPR-DR52", "CRISPR-DR53", "CRISPR-DR54", "CRISPR-DR55", "CRISPR-DR56", + "CRISPR-DR57", "CRISPR-DR58", "CRISPR-DR6", "CRISPR-DR60", "CRISPR-DR61", "CRISPR-DR62", "CRISPR-DR63", + "CRISPR-DR64", "CRISPR-DR65", "CRISPR-DR66", "CRISPR-DR7", "CRISPR-DR8", "CRISPR-DR9", "F6", "Hairpin", + "Hairpin-meta1", "Hairpin-meta2", "Hatchet", "P1", "P10", "P11", "P13", "P14", "P15", "P17", "P18", "P2", "P24", + "P26", "P27", "P31", "P33", "P34", "P35", "P36", "P37", "P4", "P5", "P6", "P8", "P9", "ROSE", "S35", "S414", + "S774", "S808", "SAM", "SL1", "SL2", "U1", "U11", "U12", "U1_yeast", "U2", "U3", "U4", "U4atac", "U5", "U54", + "U6", "U6atac", "U7", "U8", "VA", "csRNA", "drum", "g2", "pRNA", "sar", "sul1", "t44", "tRNA", "tRNA-Sec", + "tmRNA", "tp2", "tracrRNA" + ] with open(filename, 'r') as input_file: with open(output, 'w') as output_file: while line := input_file.readline(): line = 
line.rstrip() line = line.split('|') - - if len(line) == 4: - get_gene = line[0] - get_primary_id = line[1] - urs = line[2] - taxid = line[3] - - # remove "." - if database in remove_dot and "." in get_gene: - get_gene = get_gene.split('.')[0] - - # split on "," - gene_results = [] - if database in split_on_comma: - gene_list = get_gene.split(',') - for item in gene_list: - item = check_id(item) - if item: - gene_results.append(item) - - if gene_results: - primary_id = check_id(get_primary_id) - for gene in gene_results: - if gene and primary_id and gene != primary_id: - output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n') - else: - gene = check_id(get_gene) - primary_id = check_id(get_primary_id) - if gene and primary_id and gene != primary_id: - output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n') - - else: - get_primary_id = line[0] - urs = line[1] - taxid = line[2] - - # check if it is a valid id - primary_id = check_id(get_primary_id) - - if primary_id: - output_file.write(primary_id + '|' + urs + '_' + taxid + '\n') + urs = line[0] + taxid = line[1] + primary_id = check_id(line[2]) + if primary_id and database in remove_dot and "." in primary_id: + primary_id = primary_id.split('.')[0] + + if primary_id and line[3:]: + for item in line[3:]: + if item: + get_id = item + else: + continue + + # ignore some optional_id from Rfam + if database == "rfam" and get_id in rfam_ignore: + output_file.write('|' + primary_id + '|' + urs + '_' + taxid + '\n') + continue + + # remove "." + if database in remove_dot and "." in get_id: + get_id = get_id.split('.')[0] + + # split on "," + results = [] + if database in split_on_comma: + list_of_ids = get_id.split(',') + for elem in list_of_ids: + elem = check_id(elem) + if elem: + results.append(elem) + + if results: + for db_id in results: + if db_id != primary_id: + output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n') + else: + db_id = check_id(get_id) + if db_id and db_id != primary_id: + output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n') + elif primary_id: + output_file.write(primary_id + '|' + urs + '_' + taxid + '\n') if __name__ == '__main__': diff --git a/bin/create_xml_metadata.py b/bin/create_xml_metadata.py new file mode 100755 index 000000000..e9c1f592b --- /dev/null +++ b/bin/create_xml_metadata.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright [2009-present] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import click +import gzip +import random +import string +import uuid +import xml.etree.ElementTree as ET + + +def create_xml_file(results, metadata): + """ + Creates the XML that will be used by the search index + :param results: list of results + :param metadata: file to be created + :return: None + """ + # start to create a XML file + database = ET.Element("database") + ET.SubElement(database, "name").text = "RNAcentral" + entries = ET.SubElement(database, "entries") + + for item in results: + entry = ET.SubElement(entries, "entry", id="metadata" + "_" + str(uuid.uuid4())) + additional_fields = ET.SubElement(entry, "additional_fields") + ET.SubElement(additional_fields, "field", name="entry_type").text = "Metadata" + ET.SubElement(additional_fields, "field", name="job_id").text = item["job_id"] + ET.SubElement(additional_fields, "field", name="database").text = item["db"] + ET.SubElement(additional_fields, "field", name="primary_id").text = item["primary_id"] + + ET.SubElement(database, "entry_count").text = str(len(results)) + + # save the file + tree = ET.ElementTree(database) + ET.indent(tree, space="\t", level=0) + random_string = ''.join(random.choices(string.ascii_uppercase + string.digits, k=8)) + with gzip.open(metadata.split("*")[0] + random_string + ".xml.gz", "wb") as file: + tree.write(file) + + +@click.command() +@click.argument('filename') +@click.argument('output') +def main(filename, output): + """ + This function takes the ids and creates a temporary list to store the metadata. + :param filename: file containing ids + :param output: file to be created + :return: None + """ + with open(filename, "r") as input_file: + temp_results = [] + + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + job_id = line[0] + database = line[1] + + if len(line) < 3: + temp_results.append({"job_id": job_id, "db": database, "primary_id": ""}) + else: + primary_id = line[2] + temp_results.append({"job_id": job_id, "db": database, "primary_id": primary_id}) + + if len(temp_results) >= 500000: + create_xml_file(temp_results, output) + temp_results = [] + + create_xml_file(temp_results, output) + + +if __name__ == "__main__": + main() diff --git a/bin/get_unique_ids.sh b/bin/get_unique_ids.sh new file mode 100755 index 000000000..a75d11f02 --- /dev/null +++ b/bin/get_unique_ids.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# set parameters +file=$1 +database=$2 + +# read file line by line +while IFS= read -r line; do + IFS=$"|" + tmp=($line) + if [[ ${#tmp[*]} = 2 ]]; then + job_id="${tmp[0]}" + urs="${tmp[1]}" + else + job_id="${tmp[0]}" + primary_id="${tmp[1]}" + urs="${tmp[2]}" + fi + + if [[ -n "${job_id}" ]]; then + echo ${job_id} >> ${database}_all_ids.txt + fi + + if [[ -n "${primary_id}" ]]; then + echo ${primary_id} >> ${database}_all_ids.txt + fi + + if [[ -n "${urs}" ]]; then + echo ${urs} >> ${database}_all_ids.txt + fi +done < ${file} + +# create file with unique ids +cat ${database}_all_ids.txt | sort | uniq > ${database}_ids.txt diff --git a/bin/metadata-rnacentral.py b/bin/metadata-rnacentral.py new file mode 100755 index 000000000..98cd365d9 --- /dev/null +++ b/bin/metadata-rnacentral.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright [2009-present] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import click + + +@click.command() +@click.argument('filename') +@click.argument('output') +def main(filename, output): + """ + This function creates a file to store URS and a file with job_ids|URS. + These files can be used to create the metadata for the RNAcentral website. + :param filename: file containing ids + :param output: file to be created + :return: None + """ + type_id = output.split('_')[0] + + with open(filename, "r") as input_file: + with open(output, 'w') as output_file: + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + + if type_id == 'urs': + urs = line[-1] + output_file.write(urs + '\n') + elif type_id == 'job' and len(line) == 2: + job = line[0] + urs = line[1] + if job and urs: + output_file.write(job + '|' + urs + '\n') + elif type_id == 'job' and len(line) == 3: + job = line[0] + primary = line[1] + urs = line[2] + if job and urs: + output_file.write(job + '|' + urs + '\n') + if primary and urs: + output_file.write(primary + '|' + urs + '\n') + + +if __name__ == "__main__": + main() diff --git a/bin/metadata.py b/bin/metadata.py index 3e8f9a7be..bab8bddb0 100755 --- a/bin/metadata.py +++ b/bin/metadata.py @@ -14,38 +14,6 @@ limitations under the License. """ import click -import gzip -import uuid -import xml.etree.ElementTree as ET - - -def create_xml_file(results, metadata): - """ - Creates the XML that will be used by the search index - :param results: list of results - :param metadata: file to be created - :return: None - """ - # start to create a XML file - database = ET.Element("database") - ET.SubElement(database, "name").text = "RNAcentral" - entries = ET.SubElement(database, "entries") - - for item in results: - entry = ET.SubElement(entries, "entry", id="metadata" + "_" + str(uuid.uuid4())) - additional_fields = ET.SubElement(entry, "additional_fields") - ET.SubElement(additional_fields, "field", name="entry_type").text = "Metadata" - ET.SubElement(additional_fields, "field", name="job_id").text = item["job_id"] - ET.SubElement(additional_fields, "field", name="database").text = item["db"] - ET.SubElement(additional_fields, "field", name="primary_id").text = item["primary_id"] - - ET.SubElement(database, "entry_count").text = str(len(results)) - - # save the file - tree = ET.ElementTree(database) - ET.indent(tree, space="\t", level=0) - with gzip.open(metadata, "wb") as file: - tree.write(file) @click.command() @@ -53,38 +21,37 @@ def create_xml_file(results, metadata): @click.argument('output') def main(filename, output): """ - This function takes the ids and creates a temporary list to store the metadata. + This function creates a file with the metadata of a given database. 
:param filename: file containing ids :param output: file to be created :return: None """ with open(filename, "r") as input_file: - temp_results = [] database = filename.split(".")[0] - no_primary_id = ["genecards", "gtrnadb", "mirgenedb", "pdbe", "sgd"] - while line := input_file.readline(): - line = line.rstrip() - line = line.split('|') - - if database in no_primary_id: - job_id = line[0] - urs = line[1] - - temp_results.append({"job_id": urs, "db": "rnacentral", "primary_id": ""}) - temp_results.append({"job_id": job_id, "db": "rnacentral", "primary_id": urs}) - temp_results.append({"job_id": job_id, "db": database, "primary_id": ""}) - else: - job_id = line[0] - primary_id = line[1] - urs = line[2] - - temp_results.append({"job_id": urs, "db": "rnacentral", "primary_id": ""}) - temp_results.append({"job_id": primary_id, "db": database, "primary_id": ""}) - temp_results.append({"job_id": primary_id, "db": "rnacentral", "primary_id": urs}) - temp_results.append({"job_id": job_id, "db": "rnacentral", "primary_id": urs}) - temp_results.append({"job_id": job_id, "db": database, "primary_id": primary_id}) - create_xml_file(temp_results, output) + with open(output, "w") as output_file: + + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + + if len(line) < 3: + job_id = line[0].lower() + urs = line[1] + + output_file.write(urs + "|" + "rnacentral" + "\n") + output_file.write(job_id + "|" + "rnacentral" + "|" + urs + "\n") + output_file.write(job_id + "|" + database + "\n") + else: + job_id = line[0].lower() + primary_id = line[1].lower() + urs = line[2] + + output_file.write(urs + "|" + "rnacentral" + "\n") + output_file.write(primary_id + "|" + database + "\n") + output_file.write(primary_id + "|" + "rnacentral" + "|" + urs + "\n") + output_file.write(job_id + "|" + "rnacentral" + "|" + urs + "\n") + output_file.write(job_id + "|" + database + "|" + primary_id + "\n") if __name__ == "__main__": diff --git a/bin/references-manually-annotated.py b/bin/references-manually-annotated.py new file mode 100755 index 000000000..0295e612c --- /dev/null +++ b/bin/references-manually-annotated.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright [2009-present] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import click + + +@click.command() +@click.argument('filename') +@click.argument('output') +def main(filename, output): + """ + This function creates a file for each database containing the manually annotated references + :param filename: file containing ids + :param output: file to be created + :return: None + """ + name = output.split("*")[0] + + with open(filename, "r") as input_file: + with open(name + "hgnc", 'w') as hgnc, open(name + "pombase", 'w') as pombase, open(name + "sgd", 'w') as sgd, \ + open(name + "tair", 'w') as tair, open(name + "zfin", 'w') as zfin: + while line := input_file.readline(): + line = line.rstrip() + line = line.split('|') + urs = line[0] + database = line[1] + pmid = line[2] + doi = line[3] + pmcid = line[4] + + if database.lower() == "hgnc": + hgnc.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "pombase": + pombase.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "sgd": + sgd.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "tair": + tair.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + elif database.lower() == "zfin": + zfin.write(urs + '|' + pmid + '|' + doi + '|' + pmcid + '\n') + + +if __name__ == "__main__": + main() diff --git a/bin/upload_ids.sh b/bin/upload_ids.sh index 47ea731b1..a9edd1296 100755 --- a/bin/upload_ids.sh +++ b/bin/upload_ids.sh @@ -1,114 +1,27 @@ #!/bin/bash # Script to submit ids to RNAcentral-reference -# -# Usage: ./upload.sh [file] [database] -# -# The file can contain job_id, primary_id and urs_taxid. -# Each line in the file must have at least a job_id or a primary_id. -# Example: -# 5_8S_rRNA|RF00002|URS000019A91D_7230 -# Y_RNA|RF00019| -# Ysr224|| -# ZMP-ZTP||URS0001BC94F0_256318 -# |RF01750|URS0001BC834A_408172 -# |RF02770| - -# set parameters +# set parameter file=$1 -database=$2 -primary=$3 -upi=$4 - -# set database -if [ $database == "ensembl_gencode_gene" ] || [ $database == "ensembl_gencode_locus_tag" ]; then - database="ensembl_gencode" -elif [ $database == "ensembl_gene" ] || [ $database == "ensembl_locus_tag" ]; then - database="ensembl" -elif [ $database == "ensembl_metazoa_gene" ] || [ $database == "ensembl_metazoa_locus_tag" ]; then - database="ensembl_metazoa" -elif [ $database == "ensembl_plants_gene" ] || [ $database == "ensembl_plants_locus_tag" ]; then - database="ensembl_plants" -elif [ $database == "ensembl_protists_gene" ] || [ $database == "ensembl_protists_locus_tag" ]; then - database="ensembl_protists" -elif [ $database == "flybase_gene_synonym" ] || [ $database == "flybase_locus_tag" ]; then - database="flybase" -elif [ $database == "hgnc_gene_synonym" ] || [ $database == "hgnc_accession" ]; then - database="hgnc" -elif [ $database == "pombase_gene_synonym" ] || [ $database == "pombase_gene" ]; then - database="pombase" -elif [ $database == "refseq_gene" ] || [ $database == "refseq_gene_synonym" ] || [ $database == "refseq_optional_id" ]; then - database="refseq" -fi # create folder [ ! 
-d submitted ] && mkdir submitted function submitJob { - line=$1 - IFS=$'|' - tmp=($line) - - if [ -z ${primary} ] && [ -z ${upi} ]; then - # set job_id, primary_id and urs - job_id="${tmp[0]}" - primary_id="${tmp[1]}" - urs="${tmp[2]}" - elif [ -z ${primary} ]; then - # set job_id and primary_id - job_id="${tmp[0]}" - primary_id="${tmp[1]}" - else - # set job_id - job_id="${tmp[1]}" - fi - - # submit search according to the parameters used - if [ -z ${primary_id} ] && [ -z ${urs} ]; then - # submit job (id and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"database\": \"${database}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${job_id} ] && [ -z ${urs} ]; then - # submit job (primary_id and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${primary_id}\", \"database\": \"${database}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${urs} ]; then - # submit job (id, primary_id and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"primary_id\": \"${primary_id}\", \"database\": \"${database}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${primary_id} ]; then - # submit job (id, urs and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"database\": \"${database}\", \"urs\": \"${urs}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - elif [ -z ${job_id} ]; then - # submit job (primary_id, urs and database) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${primary_id}\", \"database\": \"${database}\", \"urs\": \"${urs}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - else - # submit job (id, database, primary_id, urs) - curl -X POST \ - -H "Content-Type:application/json" \ - -d "{\"id\": \"${job_id}\", \"database\": \"${database}\", \"primary_id\": \"${primary_id}\", \"urs\": \"${urs}\"}" \ - http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; - fi - - sleep 0.05 + # set parameter + job_id=$1 + + # submit job + curl -X POST \ + -H "Content-Type:application/json" \ + -d "{\"id\": \"${job_id}\"}" \ + http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file}; } # loop through the file -while IFS="" read -r p || [ -n "$p" ] +while IFS="" read -r line || [ -n "$line" ] do - submitJob "$p" + submitJob "$line" done < "$file" diff --git a/config/cluster.config b/config/cluster.config index 47608d7dd..d4599264b 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -10,6 +10,7 @@ process { executor { $lsf { queueSize = 10000 + submitRateLimit = '1sec' } } diff --git a/config/databases.config b/config/databases.config index 1c159945d..1cc88238e 100644 --- a/config/databases.config +++ b/config/databases.config @@ -17,7 +17,7 @@ params { } ena { - remote = '/nfs/ftp/pub/databases/ena/non-coding/snapshot_latest' + remote = '/nfs/ftp/public/databases/ena/non-coding/snapshot_latest' max_sequences = 50000 } @@ -152,7 +152,7 @@ params { silva { needs_taxonomy = true - remote = 'ftp://ftp.arb-silva.de/current/Exports/rnac/SILVA_*Parc.rnac.gz' + remote = 'http://ftp.arb-silva.de/current/Exports/rnac/' } snodb { @@ -187,7 +187,7 @@ params { } sgd { - remote = 
'https://sgd-prod-upload.s3.amazonaws.com/latest/RNAcentral.json' + remote = "https://downloads.yeastgenome.org/latest/RNAcentral.json" } tarbase { diff --git a/config/main.config b/config/main.config index 0be1444ff..0fa78f4e8 100644 --- a/config/main.config +++ b/config/main.config @@ -7,7 +7,7 @@ params { connections = slurper.parse(new File(connection_file)) import_data { - chunk_size = 1024 * 1000 * 1000 + chunk_size = 256 * 1000 * 1000 } } diff --git a/config/precompute.config b/config/precompute.config index 8b7574452..eb0f2c804 100644 --- a/config/precompute.config +++ b/config/precompute.config @@ -1,11 +1,10 @@ params { precompute { run = true - max_entries = 500000 + max_entries = 250000 load_size = 1024 * 1000 * 1000 maxForks = 5 method = 'release' - range.memory = 8.GB } } diff --git a/containers/cpat/Dockerfile b/containers/cpat/Dockerfile index dc4dbd064..46a9775b3 100644 --- a/containers/cpat/Dockerfile +++ b/containers/cpat/Dockerfile @@ -1,6 +1,6 @@ -From r-base:3.4.1 +From r-base:3.6.3 -RUN apt-get update && apt-get install -y python3-pip +RUN apt-get update && apt-get install -y python3-pip procps RUN cp /usr/bin/python3 /usr/bin/python RUN pip3 install numpy diff --git a/files/import-data/expressionatlas/lookup-dump-query.sql b/files/import-data/expressionatlas/lookup-dump-query.sql new file mode 100644 index 000000000..d7966fd12 --- /dev/null +++ b/files/import-data/expressionatlas/lookup-dump-query.sql @@ -0,0 +1,27 @@ +COPY( + SELECT urs_taxid, + xref.taxid as taxid, + gene || '|' || external_id || '|' || gene_synonym || '|' || optional_id as external_id, + description, + seq_version, + assembly_id, + region_start, + region_stop, + rsr.chromosome, + strand, + rna_type, + COALESCE(seq_short, seq_long) as seq + FROM rnc_accessions + JOIN xref + ON xref.ac = rnc_accessions.accession + + JOIN rna + ON xref.upi = rna.upi + + JOIN rnc_accession_sequence_region rasr + ON rasr.accession = xref.ac + + JOIN rnc_sequence_regions rsr + ON rsr.id = region_id + + ) TO STDOUT CSV HEADER diff --git a/files/import-data/load/long-sequences.ctl b/files/import-data/load/long-sequences.ctl index 2d8942628..82d98bfef 100644 --- a/files/import-data/load/long-sequences.ctl +++ b/files/import-data/load/long-sequences.ctl @@ -27,14 +27,15 @@ TARGET COLUMNS ( WITH drop indexes, batch rows = 25000, - batch size = 512MB, - workers = 10, + batch size = 256MB, + prefetch rows = 50000, + workers = 5, concurrency = 2, skip header = 0, fields escaped by double-quote, fields terminated by ',' SET - work_mem to '256 MB', + work_mem to '512 MB', maintenance_work_mem to '1 GB' ; diff --git a/files/import-data/pre-release/000__assemblies.sql b/files/import-data/pre-release/000__assemblies.sql index 8da15840b..8773e420e 100644 --- a/files/import-data/pre-release/000__assemblies.sql +++ b/files/import-data/pre-release/000__assemblies.sql @@ -1,12 +1,5 @@ BEGIN; -DELETE FROM ensembl_assembly ensembl -USING load_assemblies load -WHERE - load.taxid = ensembl.taxid - and load.assembly_id != ensembl.assembly_id -; - INSERT INTO ensembl_assembly ( assembly_id, assembly_full_name, diff --git a/files/precompute/fetch-xref-info.sql b/files/precompute/fetch-xref-info.sql index 9d7696400..15a5ca5d9 100644 --- a/files/precompute/fetch-xref-info.sql +++ b/files/precompute/fetch-xref-info.sql @@ -1,22 +1,11 @@ -CREATE TEMP TABLE xref_releases AS -SELECT - rna.id as rna_id, - xref.upi, - xref.last -FROM xref -JOIN rna -ON - rna.upi = xref.upi -; +COPY( + SELECT + rna.id, + xref.upi, + xref.last + FROM xref + JOIN rna + 
ON + rna.upi = xref.upi -CREATE INDEX ix_xref_releases_upi ON xref_releases(upi); - -COPY ( -SELECT - rna_id, - upi, - max(last) -from xref_releases -group by rna_id, upi -order by rna_id ASC ) TO STDOUT (FORMAT CSV) diff --git a/files/precompute/methods/weekly.sql b/files/precompute/methods/weekly.sql new file mode 100644 index 000000000..2674087f5 --- /dev/null +++ b/files/precompute/methods/weekly.sql @@ -0,0 +1,7 @@ +COPY( +select upi from xref + +where deleted = 'N' +and EXTRACT (DAY FROM (CURRENT_TIMESTAMP - timestamp)) < 7 + +) TO STDOUT (FORMAT CSV) diff --git a/files/search-export/parts/accessions.sql b/files/search-export/parts/accessions.sql index 98b4c0d8f..c28a61ef1 100644 --- a/files/search-export/parts/accessions.sql +++ b/files/search-export/parts/accessions.sql @@ -4,7 +4,7 @@ COPY ( 'id', todo.search_export_id, 'urs_taxid', todo.urs_taxid, 'accession', todo.accession, - 'common_name', COALESCE(tax.common_name, todo.common_name), + 'common_name', tax.common_name, 'database', todo.database, 'external_id', todo.external_id, 'function', todo.function, diff --git a/files/search-export/parts/text-mining.sql b/files/search-export/parts/text-mining.sql index 804d3cc91..a3b4ad232 100644 --- a/files/search-export/parts/text-mining.sql +++ b/files/search-export/parts/text-mining.sql @@ -8,6 +8,6 @@ COPY ( FROM search_export_urs todo JOIN search_export_publication_counts counts ON - todo.urs = counts.urs + todo.urs_taxid = counts.urs ORDER by todo.id ) TO STDOUT diff --git a/files/search-export/setup.sql b/files/search-export/setup.sql index c066faab5..fb0d97083 100644 --- a/files/search-export/setup.sql +++ b/files/search-export/setup.sql @@ -1,7 +1,7 @@ BEGIN TRANSACTION; DROP TABLE IF EXISTS search_export_publication_counts; -CREATE TEMP TABLE search_export_publication_counts ( +CREATE TABLE search_export_publication_counts ( urs text primary key, publication_count int not null ); diff --git a/import-data.nf b/import-data.nf index 6a4df3936..eff12365d 100644 --- a/import-data.nf +++ b/import-data.nf @@ -7,10 +7,14 @@ include { batch_lookup_ontology_information } from './workflows/lookup-ontology- include { parse_databases } from './workflows/parse-databases' include { parse_metadata } from './workflows/parse-metadata' include { load_data } from './workflows/load-data' +include { slack_message } from './workflows/utils/slack' +include { slack_closure } from './workflows/utils/slack' workflow import_data { emit: post_release main: + Channel.of("Starting data import pipeline") | slack_message + Channel.empty() \ | mix( parse_databases(), @@ -30,8 +34,19 @@ workflow import_data { | mix(term_info, references) \ | load_data \ | set { post_release } + + + } workflow { import_data() } + +workflow.onError { + slack_closure("Import pipeline encountered an error and failed") +} + +workflow.onComplete { + slack_closure("Workflow completed ${workflow.success ? 'Ok' : 'with errors'}") +} diff --git a/nextflow.config b/nextflow.config index 4ec6d7fd5..c5f85a07e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,7 +8,7 @@ includeConfig "config/export.config" includeConfig "config/crs.config" process { - container = 'rnacentral/rnacentral-import-pipeline:latest' + container = 'oras://ghcr.io/rnacentral/rnacentral-import-pipeline:latest' } // local.config must should contain something like the following.
I use profiles @@ -20,7 +20,7 @@ includeConfig "local.config" params.should_release = false params.needs_publications = false -params.needs_taxonomy = false +params.needs_taxonomy = false params.databases.ensembl._any.run = false // Infer the needs_publications and should_release parameters. These are diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 000000000..4f4568476 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,1231 @@ +[[package]] +name = "argcomplete" +version = "2.0.0" +description = "Bash tab completion for argparse" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +test = ["coverage", "flake8", "pexpect", "wheel"] + +[[package]] +name = "argh" +version = "0.26.2" +description = "An unobtrusive argparse wrapper with natural syntax" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "atomicwrites" +version = "1.4.1" +description = "Atomic file writes." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.4.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope-interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope-interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope-interface"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] + +[[package]] +name = "beautifulsoup4" +version = "4.11.1" +description = "Screen-scraping library" +category = "main" +optional = false +python-versions = ">=3.6.0" + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "biopython" +version = "1.79" +description = "Freely available tools for computational molecular biology." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +numpy = "*" + +[[package]] +name = "certifi" +version = "2022.9.24" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "charset-normalizer" +version = "2.1.1" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.6.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "click-aliases" +version = "1.0.1" +description = "Enable aliases for Click" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +click = "*" + +[package.extras] +dev = ["coveralls", "flake8", "flake8-import-order", "pytest", "pytest-cov", "tox-travis", "wheel"] + +[[package]] +name = "colorama" +version = "0.4.5" +description = "Cross-platform colored terminal text." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "furl" +version = "2.1.3" +description = "URL manipulation made simple." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +orderedmultidict = ">=1.0.1" +six = ">=1.8.0" + +[[package]] +name = "gffutils" +version = "0.10.1" +description = "Work with GFF and GTF files in a flexible database framework" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +argcomplete = ">=1.9.4" +argh = ">=0.26.2" +pyfaidx = ">=0.5.5.2" +simplejson = "*" +six = ">=1.12.0" + +[[package]] +name = "humanfriendly" +version = "10.0" +description = "Human friendly output for text interfaces using Python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "ijson" +version = "3.1.4" +description = "Iterative JSON parser with standard Python iterator interfaces" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "importlib-resources" +version = "5.10.0" +description = "Read resources from Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "intervaltree" +version = "3.1.0" +description = "Editable interval tree data structure for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +sortedcontainers = ">=2.0,<3.0" + +[[package]] +name = "joblib" +version = "1.2.0" +description = "Lightweight pipelining with Python functions" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "jsonschema" +version = "4.16.0" +description = "An implementation of JSON Schema validation for Python" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +attrs = ">=17.4.0" +importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} +pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + 
+[[package]] +name = "lxml" +version = "4.9.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.7)"] + +[[package]] +name = "more-itertools" +version = "8.14.0" +description = "More routines for operating on iterables, beyond itertools" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "networkx" +version = "2.8.7" +description = "Python package for creating and manipulating graphs and networks" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=0.981)", "pre-commit (>=2.20)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.4)", "pillow (>=9.1)", "pydata-sphinx-theme (>=0.9)", "sphinx (>=5)", "sphinx-gallery (>=0.10)", "texext (>=0.6.6)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] + +[[package]] +name = "numpy" +version = "1.23.4" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.8" + +[[package]] +name = "obonet" +version = "0.3.0" +description = "Parse OBO formatted ontologies into networkx" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +networkx = "*" + +[package.extras] +dev = ["pre-commit", "pytest"] + +[[package]] +name = "orderedmultidict" +version = "1.0.1" +description = "Ordered Multivalue Dictionary" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +six = ">=1.8.0" + +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + +[[package]] +name = "pandas" +version = "1.5.1" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = [ + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, +] +python-dateutil = ">=2.8.1" +pytz = ">=2020.1" + +[package.extras] +test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] + +[[package]] +name = "pkgutil-resolve-name" +version = "1.3.10" +description = "Resolve a name to an object." 
+category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "psycopg2" +version = "2.9.3" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pyfaidx" +version = "0.7.1" +description = "pyfaidx: efficient pythonic random access to fasta subsequences" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +setuptools = ">=0.7" +six = "*" + +[[package]] +name = "pymysql" +version = "1.0.2" +description = "Pure Python MySQL Driver" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +ed25519 = ["PyNaCl (>=1.4.0)"] +rsa = ["cryptography"] + +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "pypika" +version = "0.48.9" +description = "A SQL query builder API for Python" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "pyreadline3" +version = "3.4.1" +description = "A python implementation of GNU readline." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "pyrsistent" +version = "0.18.1" +description = "Persistent/Functional/Immutable data structures" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "pytest" +version = "6.2.5" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2022.5" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "ratelimiter" +version = "1.2.0.post0" +description = "Simple python rate limiting object" +category = "main" +optional = false +python-versions = "*" + +[package.extras] +test = ["pytest (>=3.0)", "pytest-asyncio"] + +[[package]] +name = "requests" +version = "2.28.1" +description = "Python HTTP for Humans." 
+category = "main" +optional = false +python-versions = ">=3.7, <4" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<3" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "retry" +version = "0.9.2" +description = "Easy to use retry decorator." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +decorator = ">=3.4.2" +py = ">=1.4.26,<2.0.0" + +[[package]] +name = "scikit-learn" +version = "1.1.2" +description = "A set of python modules for machine learning and data mining" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +joblib = ">=1.0.0" +numpy = ">=1.17.3" +scipy = ">=1.3.2" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] +tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.2)", "mypy (>=0.961)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pyamg (>=4.0.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "scikit-image (>=0.16.2)"] + +[[package]] +name = "scipy" +version = "1.9.2" +description = "Fundamental algorithms for scientific computing in Python" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = ">=1.18.5,<1.26.0" + +[package.extras] +dev = ["flake8", "mypy", "pycodestyle", "typing-extensions"] +doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-panels (>=0.5.2)", "sphinx-tabs"] +test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "semver" +version = "2.13.0" +description = "Python helper for Semantic Versioning (http://semver.org/)" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "setuptools" +version = "65.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "simplejson" +version = "3.17.6" 
+description = "Simple, fast, extensible JSON encoder/decoder for Python" +category = "main" +optional = false +python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "soupsieve" +version = "2.3.2.post1" +description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "sqlitedict" +version = "1.7.0" +description = "Persistent dict in Python, backed up by sqlite3 and pickle, multithread-safe." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "tatsu" +version = "4.4.0" +description = "TatSu takes a grammar in a variation of EBNF as input, and outputs a memoizing PEG/Packrat parser in Python." +category = "main" +optional = false +python-versions = "*" + +[package.extras] +future-regex = ["regex"] + +[[package]] +name = "threadpoolctl" +version = "3.1.0" +description = "threadpoolctl" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "urllib3" +version = "1.26.12" +description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "zipp" +version = "3.9.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +testing = ["flake8 (<5)", "func-timeout", "jaraco-functools", "jaraco-itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.8" +content-hash = "7fa3b8b76be48a244d3c3b6f237bea96793095d5a375ff2d240ef56299bd1f4a" + +[metadata.files] +argcomplete = [ + {file = "argcomplete-2.0.0-py2.py3-none-any.whl", hash = "sha256:cffa11ea77999bb0dd27bb25ff6dc142a6796142f68d45b1a26b11f58724561e"}, + {file = "argcomplete-2.0.0.tar.gz", hash = "sha256:6372ad78c89d662035101418ae253668445b391755cfe94ea52f1b9d22425b20"}, +] +argh = [ + {file = "argh-0.26.2-py2.py3-none-any.whl", hash = "sha256:a9b3aaa1904eeb78e32394cd46c6f37ac0fb4af6dc488daa58971bdc7d7fcaf3"}, + {file = "argh-0.26.2.tar.gz", hash = "sha256:e9535b8c84dc9571a48999094fda7f33e63c3f1b74f3e5f3ac0105a58405bb65"}, +] +atomicwrites = [ + {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, +] 
+attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] +beautifulsoup4 = [ + {file = "beautifulsoup4-4.11.1-py3-none-any.whl", hash = "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30"}, + {file = "beautifulsoup4-4.11.1.tar.gz", hash = "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"}, +] +biopython = [ + {file = "biopython-1.79-cp310-cp310-win_amd64.whl", hash = "sha256:9eadfd4300f534cd4fa39613eeee786d2c3d6b981d373c5c46616fa1a97cad10"}, + {file = "biopython-1.79-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:72a1477cf1701964c7224e506a54fd65d1cc5228da200b634a17992230aa1cbd"}, + {file = "biopython-1.79-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:365569543ea58dd07ef205ec351c23b6c1a3200d5d321eb28ceaecd55eb5955e"}, + {file = "biopython-1.79-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4be31815226052d86d4c2f6a103c40504e34bba3e25cc1b1d687a3203c42fb6e"}, + {file = "biopython-1.79-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ceab668be9cbdcddef55ad459f87acd0316ae4a00d32251fea4cf665f5062fda"}, + {file = "biopython-1.79-cp36-cp36m-win32.whl", hash = "sha256:83bfea8a19f9352c47b13965c4b73853e7aeef3c5aed8489895b0679e32c621b"}, + {file = "biopython-1.79-cp36-cp36m-win_amd64.whl", hash = "sha256:98deacc30b8654cfcdcf707d93fa4e3c8717bbda07c3f9f828cf84753d4a1e4d"}, + {file = "biopython-1.79-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:884a2b99ac7820cb84f70089769a512e3238ee60438b8c934ed519613dc570ce"}, + {file = "biopython-1.79-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51eb467a60c38820ad1e6c3a7d4cb10535606f559646e824cc65c96091d91ff7"}, + {file = "biopython-1.79-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03ee5c72b3cc3f0675a8c22ce1c45fe99a32a60db18df059df479ae6cf619708"}, + {file = "biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:9580978803b582e0612b71673cab289e6bf261a865009cfb9501d65bc726a76e"}, + {file = "biopython-1.79-cp37-cp37m-win32.whl", hash = "sha256:5ae69c5e09769390643aa0f8064517665df6fb99c37433821d6664584d0ecb8c"}, + {file = "biopython-1.79-cp37-cp37m-win_amd64.whl", hash = "sha256:f0a7e1d94a318f74974345fd0987ec389b16988ec484e67218e900b116b932a8"}, + {file = "biopython-1.79-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aa23a83a220486af6193760d079b36543fe00afcfbd18280ca2fd0b2c1c8dd6d"}, + {file = "biopython-1.79-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3d4eec2e348c3d97a7fde80ee0f2b8ebeed849d2bd64a616833a9be03b93c8"}, + {file = "biopython-1.79-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:947b793e804c59ea45ae46945a57612ad1789ca87af4af0d6a62dcecf3a6246a"}, + {file = "biopython-1.79-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:d9f6ce961e0c380e2a5435f64c96421dbcebeab6a1b41506bd81251feb733c08"}, + {file = "biopython-1.79-cp38-cp38-win32.whl", hash = "sha256:155c5b95857bca7ebd607210cb9d8ea459bb0b86b3ca37ea44ec47c26ede7e9a"}, + {file = "biopython-1.79-cp38-cp38-win_amd64.whl", hash = "sha256:2dbb4388c75b5dfca8ce729e791f465c9c878dbd7ba2ab9a1f9854609d2b5426"}, + {file = "biopython-1.79-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:76988ed3d7383d566db1d7fc69c9cf136c6275813fb749fc6753c340f81f1a8f"}, + {file = "biopython-1.79-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e921571b51514a6d35944242d6fef6427c3998acf58940fe1f209ac8a92a6e87"}, + {file = "biopython-1.79-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bf634a56f449a4123e48e538d661948e5ac29fb452acd2962b8cb834b472a9d7"}, + {file = "biopython-1.79-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ab93d5749b375be3682866b3a606aa2ebd3e6d868079793925bf4fbb0987cf1f"}, + {file = "biopython-1.79-cp39-cp39-win32.whl", hash = "sha256:8f33dafd3c7254fff5e1684b965e45a7c08d9b8e1bf51562b0a521ff9a6f5ea0"}, + {file = "biopython-1.79-cp39-cp39-win_amd64.whl", hash = "sha256:b3ab26f26a1956ef26303386510d84e917e31fcbbc94918c336da0163ef628df"}, + {file = "biopython-1.79.tar.gz", hash = "sha256:edb07eac99d3b8abd7ba56ff4bedec9263f76dfc3c3f450e7d2e2bcdecf8559b"}, +] +certifi = [ + {file = "certifi-2022.9.24-py3-none-any.whl", hash = "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"}, + {file = "certifi-2022.9.24.tar.gz", hash = "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, + {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, +] +click = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] +click-aliases = [ + {file = "click-aliases-1.0.1.tar.gz", hash = "sha256:f48012077e0788eb02f4f8ee458fef3601873fec6c998e9ea8b4554394e705a3"}, + {file = "click_aliases-1.0.1-py2.py3-none-any.whl", hash = "sha256:229ecab12a97d1d5ce3f1fd7ce16da0e4333a24ebe3b34d8b7a6d0a1d2cfab90"}, +] +colorama = [ + {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, + {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, +] +decorator = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] +furl = [ + {file = "furl-2.1.3-py2.py3-none-any.whl", hash = "sha256:9ab425062c4217f9802508e45feb4a83e54324273ac4b202f1850363309666c0"}, + {file = "furl-2.1.3.tar.gz", hash = "sha256:5a6188fe2666c484a12159c18be97a1977a71d632ef5bb867ef15f54af39cc4e"}, +] +gffutils = [ + {file = "gffutils-0.10.1.tar.gz", hash = "sha256:a8fc39006d7aa353147238160640e2210b168f7849cb99896be3fc9441e351cb"}, +] +humanfriendly = [ + {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, + {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, +] +idna = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] +ijson = [ + {file = 
"ijson-3.1.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:6c1a777096be5f75ffebb335c6d2ebc0e489b231496b7f2ca903aa061fe7d381"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:475fc25c3d2a86230b85777cae9580398b42eed422506bf0b6aacfa936f7bfcd"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:f587699b5a759e30accf733e37950cc06c4118b72e3e146edcea77dded467426"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:339b2b4c7bbd64849dd69ef94ee21e29dcd92c831f47a281fdd48122bb2a715a"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:446ef8980504da0af8d20d3cb6452c4dc3d8aa5fd788098985e899b913191fe6"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:3997a2fdb28bc04b9ab0555db5f3b33ed28d91e9d42a3bf2c1842d4990beb158"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:fa10a1d88473303ec97aae23169d77c5b92657b7fb189f9c584974c00a79f383"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:9a5bf5b9d8f2ceaca131ee21fc7875d0f34b95762f4f32e4d65109ca46472147"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:81cc8cee590c8a70cca3c9aefae06dd7cb8e9f75f3a7dc12b340c2e332d33a2a"}, + {file = "ijson-3.1.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4ea5fc50ba158f72943d5174fbc29ebefe72a2adac051c814c87438dc475cf78"}, + {file = "ijson-3.1.4-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3b98861a4280cf09d267986cefa46c3bd80af887eae02aba07488d80eb798afa"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:068c692efba9692406b86736dcc6803e4a0b6280d7f0b7534bff3faec677ff38"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:86884ac06ac69cea6d89ab7b84683b3b4159c4013e4a20276d3fc630fe9b7588"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:41e5886ff6fade26f10b87edad723d2db14dcbb1178717790993fcbbb8ccd333"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:24b58933bf777d03dc1caa3006112ec7f9e6f6db6ffe1f5f5bd233cb1281f719"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:13f80aad0b84d100fb6a88ced24bade21dc6ddeaf2bba3294b58728463194f50"}, + {file = "ijson-3.1.4-cp35-cp35m-win32.whl", hash = "sha256:fa9a25d0bd32f9515e18a3611690f1de12cb7d1320bd93e9da835936b41ad3ff"}, + {file = "ijson-3.1.4-cp35-cp35m-win_amd64.whl", hash = "sha256:c4c1bf98aaab4c8f60d238edf9bcd07c896cfcc51c2ca84d03da22aad88957c5"}, + {file = "ijson-3.1.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f0f2a87c423e8767368aa055310024fa28727f4454463714fef22230c9717f64"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:15507de59d74d21501b2a076d9c49abf927eb58a51a01b8f28a0a0565db0a99f"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:2e6bd6ad95ab40c858592b905e2bbb4fe79bbff415b69a4923dafe841ffadcb4"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:68e295bb12610d086990cedc89fb8b59b7c85740d66e9515aed062649605d0bf"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3bb461352c0f0f2ec460a4b19400a665b8a5a3a2da663a32093df1699642ee3f"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:f91c75edd6cf1a66f02425bafc59a22ec29bc0adcbc06f4bfd694d92f424ceb3"}, + {file = "ijson-3.1.4-cp36-cp36m-win32.whl", hash = "sha256:4c53cc72f79a4c32d5fc22efb85aa22f248e8f4f992707a84bdc896cc0b1ecf9"}, + {file = 
"ijson-3.1.4-cp36-cp36m-win_amd64.whl", hash = "sha256:ac9098470c1ff6e5c23ec0946818bc102bfeeeea474554c8d081dc934be20988"}, + {file = "ijson-3.1.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dcd6f04df44b1945b859318010234651317db2c4232f75e3933f8bb41c4fa055"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:5a2f40c053c837591636dc1afb79d85e90b9a9d65f3d9963aae31d1eb11bfed2"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f50337e3b8e72ec68441b573c2848f108a8976a57465c859b227ebd2a2342901"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:454918f908abbed3c50a0a05c14b20658ab711b155e4f890900e6f60746dd7cc"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:387c2ec434cc1bc7dc9bd33ec0b70d95d443cc1e5934005f26addc2284a437ab"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:179ed6fd42e121d252b43a18833df2de08378fac7bce380974ef6f5e522afefa"}, + {file = "ijson-3.1.4-cp37-cp37m-win32.whl", hash = "sha256:26a6a550b270df04e3f442e2bf0870c9362db4912f0e7bdfd300f30ea43115a2"}, + {file = "ijson-3.1.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ff8cf7507d9d8939264068c2cff0a23f99703fa2f31eb3cb45a9a52798843586"}, + {file = "ijson-3.1.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:09c9d7913c88a6059cd054ff854958f34d757402b639cf212ffbec201a705a0d"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:702ba9a732116d659a5e950ee176be6a2e075998ef1bcde11cbf79a77ed0f717"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:667841591521158770adc90793c2bdbb47c94fe28888cb802104b8bbd61f3d51"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:df641dd07b38c63eecd4f454db7b27aa5201193df160f06b48111ba97ab62504"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:9348e7d507eb40b52b12eecff3d50934fcc3d2a15a2f54ec1127a36063b9ba8f"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:93455902fdc33ba9485c7fae63ac95d96e0ab8942224a357113174bbeaff92e9"}, + {file = "ijson-3.1.4-cp38-cp38-win32.whl", hash = "sha256:5b725f2e984ce70d464b195f206fa44bebbd744da24139b61fec72de77c03a16"}, + {file = "ijson-3.1.4-cp38-cp38-win_amd64.whl", hash = "sha256:a5965c315fbb2dc9769dfdf046eb07daf48ae20b637da95ec8d62b629be09df4"}, + {file = "ijson-3.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b8ee7dbb07cec9ba29d60cfe4954b3cc70adb5f85bba1f72225364b59c1cf82b"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d9e01c55d501e9c3d686b6ee3af351c9c0c8c3e45c5576bd5601bee3e1300b09"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:297f26f27a04cd0d0a2f865d154090c48ea11b239cabe0a17a6c65f0314bd1ca"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:9239973100338a4138d09d7a4602bd289861e553d597cd67390c33bfc452253e"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:2a64c66a08f56ed45a805691c2fd2e1caef00edd6ccf4c4e5eff02cd94ad8364"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:d17fd199f0d0a4ab6e0d541b4eec1b68b5bd5bb5d8104521e22243015b51049b"}, + {file = "ijson-3.1.4-cp39-cp39-win32.whl", hash = "sha256:70ee3c8fa0eba18c80c5911639c01a8de4089a4361bad2862a9949e25ec9b1c8"}, + {file = "ijson-3.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:6bf2b64304321705d03fa5e403ec3f36fa5bb27bf661849ad62e0a3a49bc23e3"}, + {file = "ijson-3.1.4-pp27-pypy_73-macosx_10_9_x86_64.whl", hash = 
"sha256:5d7e3fcc3b6de76a9dba1e9fc6ca23dad18f0fa6b4e6499415e16b684b2e9af1"}, + {file = "ijson-3.1.4-pp27-pypy_73-manylinux1_x86_64.whl", hash = "sha256:a72eb0359ebff94754f7a2f00a6efe4c57716f860fc040c606dedcb40f49f233"}, + {file = "ijson-3.1.4-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:28fc168f5faf5759fdfa2a63f85f1f7a148bbae98f34404a6ba19f3d08e89e87"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2844d4a38d27583897ed73f7946e205b16926b4cab2525d1ce17e8b08064c706"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-manylinux1_x86_64.whl", hash = "sha256:252defd1f139b5fb8c764d78d5e3a6df81543d9878c58992a89b261369ea97a7"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:15d5356b4d090c699f382c8eb6a2bcd5992a8c8e8b88c88bc6e54f686018328a"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-win32.whl", hash = "sha256:6774ec0a39647eea70d35fb76accabe3d71002a8701c0545b9120230c182b75b"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f11da15ec04cc83ff0f817a65a3392e169be8d111ba81f24d6e09236597bb28c"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-manylinux1_x86_64.whl", hash = "sha256:ee13ceeed9b6cf81b3b8197ef15595fc43fd54276842ed63840ddd49db0603da"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:97e4df67235fae40d6195711223520d2c5bf1f7f5087c2963fcde44d72ebf448"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-win32.whl", hash = "sha256:3d10eee52428f43f7da28763bb79f3d90bbbeea1accb15de01e40a00885b6e89"}, + {file = "ijson-3.1.4.tar.gz", hash = "sha256:1d1003ae3c6115ec9b587d29dd136860a81a23c7626b682e2b5b12c9fd30e4ea"}, +] +importlib-resources = [ + {file = "importlib_resources-5.10.0-py3-none-any.whl", hash = "sha256:ee17ec648f85480d523596ce49eae8ead87d5631ae1551f913c0100b5edd3437"}, + {file = "importlib_resources-5.10.0.tar.gz", hash = "sha256:c01b1b94210d9849f286b86bb51bcea7cd56dde0600d8db721d7b81330711668"}, +] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +intervaltree = [ + {file = "intervaltree-3.1.0.tar.gz", hash = "sha256:902b1b88936918f9b2a19e0e5eb7ccb430ae45cde4f39ea4b36932920d33952d"}, +] +joblib = [ + {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"}, + {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"}, +] +jsonschema = [ + {file = "jsonschema-4.16.0-py3-none-any.whl", hash = "sha256:9e74b8f9738d6a946d70705dc692b74b5429cd0960d58e79ffecfc43b2221eb9"}, + {file = "jsonschema-4.16.0.tar.gz", hash = "sha256:165059f076eff6971bae5b742fc029a7b4ef3f9bcf04c14e4776a7605de14b23"}, +] +lxml = [ + {file = "lxml-4.9.1-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed"}, + {file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc"}, + {file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc"}, + {file = "lxml-4.9.1-cp27-cp27m-win32.whl", hash = "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3"}, + {file = "lxml-4.9.1-cp27-cp27m-win_amd64.whl", hash = 
"sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627"}, + {file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84"}, + {file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8"}, + {file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8"}, + {file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d"}, + {file = "lxml-4.9.1-cp310-cp310-win32.whl", hash = "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7"}, + {file = "lxml-4.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b"}, + {file = "lxml-4.9.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d"}, + {file = "lxml-4.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3"}, + {file = "lxml-4.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29"}, + {file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d"}, + {file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318"}, + {file = "lxml-4.9.1-cp35-cp35m-win32.whl", hash = "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7"}, + {file = "lxml-4.9.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4"}, + {file = "lxml-4.9.1-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf"}, + {file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3"}, + {file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391"}, + {file = "lxml-4.9.1-cp36-cp36m-win32.whl", hash = "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e"}, + {file = "lxml-4.9.1-cp36-cp36m-win_amd64.whl", hash = "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7"}, + {file = "lxml-4.9.1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca"}, + {file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785"}, + {file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785"}, + {file = "lxml-4.9.1-cp37-cp37m-win32.whl", hash = "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a"}, + {file = "lxml-4.9.1-cp37-cp37m-win_amd64.whl", hash = "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e"}, + {file = "lxml-4.9.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715"}, + {file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036"}, + {file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387"}, + {file = "lxml-4.9.1-cp38-cp38-win32.whl", hash = 
"sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94"}, + {file = "lxml-4.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345"}, + {file = "lxml-4.9.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000"}, + {file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25"}, + {file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd"}, + {file = "lxml-4.9.1-cp39-cp39-win32.whl", hash = "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb"}, + {file = "lxml-4.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73"}, + {file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c"}, + {file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9"}, + {file = "lxml-4.9.1.tar.gz", hash = "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"}, +] +more-itertools = [ + {file = "more-itertools-8.14.0.tar.gz", hash = "sha256:c09443cd3d5438b8dafccd867a6bc1cb0894389e90cb53d227456b0b0bccb750"}, + {file = "more_itertools-8.14.0-py3-none-any.whl", hash = 
"sha256:1bc4f91ee5b1b31ac7ceacc17c09befe6a40a503907baf9c839c229b5095cfd2"}, +] +networkx = [ + {file = "networkx-2.8.7-py3-none-any.whl", hash = "sha256:15cdf7f7c157637107ea690cabbc488018f8256fa28242aed0fb24c93c03a06d"}, + {file = "networkx-2.8.7.tar.gz", hash = "sha256:815383fd52ece0a7024b5fd8408cc13a389ea350cd912178b82eed8b96f82cd3"}, +] +numpy = [ + {file = "numpy-1.23.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:95d79ada05005f6f4f337d3bb9de8a7774f259341c70bc88047a1f7b96a4bcb2"}, + {file = "numpy-1.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:926db372bc4ac1edf81cfb6c59e2a881606b409ddc0d0920b988174b2e2a767f"}, + {file = "numpy-1.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c237129f0e732885c9a6076a537e974160482eab8f10db6292e92154d4c67d71"}, + {file = "numpy-1.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8365b942f9c1a7d0f0dc974747d99dd0a0cdfc5949a33119caf05cb314682d3"}, + {file = "numpy-1.23.4-cp310-cp310-win32.whl", hash = "sha256:2341f4ab6dba0834b685cce16dad5f9b6606ea8a00e6da154f5dbded70fdc4dd"}, + {file = "numpy-1.23.4-cp310-cp310-win_amd64.whl", hash = "sha256:d331afac87c92373826af83d2b2b435f57b17a5c74e6268b79355b970626e329"}, + {file = "numpy-1.23.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:488a66cb667359534bc70028d653ba1cf307bae88eab5929cd707c761ff037db"}, + {file = "numpy-1.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce03305dd694c4873b9429274fd41fc7eb4e0e4dea07e0af97a933b079a5814f"}, + {file = "numpy-1.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8981d9b5619569899666170c7c9748920f4a5005bf79c72c07d08c8a035757b0"}, + {file = "numpy-1.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a70a7d3ce4c0e9284e92285cba91a4a3f5214d87ee0e95928f3614a256a1488"}, + {file = "numpy-1.23.4-cp311-cp311-win32.whl", hash = "sha256:5e13030f8793e9ee42f9c7d5777465a560eb78fa7e11b1c053427f2ccab90c79"}, + {file = "numpy-1.23.4-cp311-cp311-win_amd64.whl", hash = "sha256:7607b598217745cc40f751da38ffd03512d33ec06f3523fb0b5f82e09f6f676d"}, + {file = "numpy-1.23.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7ab46e4e7ec63c8a5e6dbf5c1b9e1c92ba23a7ebecc86c336cb7bf3bd2fb10e5"}, + {file = "numpy-1.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8aae2fb3180940011b4862b2dd3756616841c53db9734b27bb93813cd79fce6"}, + {file = "numpy-1.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c053d7557a8f022ec823196d242464b6955a7e7e5015b719e76003f63f82d0f"}, + {file = "numpy-1.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0882323e0ca4245eb0a3d0a74f88ce581cc33aedcfa396e415e5bba7bf05f68"}, + {file = "numpy-1.23.4-cp38-cp38-win32.whl", hash = "sha256:dada341ebb79619fe00a291185bba370c9803b1e1d7051610e01ed809ef3a4ba"}, + {file = "numpy-1.23.4-cp38-cp38-win_amd64.whl", hash = "sha256:0fe563fc8ed9dc4474cbf70742673fc4391d70f4363f917599a7fa99f042d5a8"}, + {file = "numpy-1.23.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c67b833dbccefe97cdd3f52798d430b9d3430396af7cdb2a0c32954c3ef73894"}, + {file = "numpy-1.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f76025acc8e2114bb664294a07ede0727aa75d63a06d2fae96bf29a81747e4a7"}, + {file = "numpy-1.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:12ac457b63ec8ded85d85c1e17d85efd3c2b0967ca39560b307a35a6703a4735"}, + {file = "numpy-1.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:95de7dc7dc47a312f6feddd3da2500826defdccbc41608d0031276a24181a2c0"}, + {file = "numpy-1.23.4-cp39-cp39-win32.whl", hash = "sha256:f2f390aa4da44454db40a1f0201401f9036e8d578a25f01a6e237cea238337ef"}, + {file = "numpy-1.23.4-cp39-cp39-win_amd64.whl", hash = "sha256:f260da502d7441a45695199b4e7fd8ca87db659ba1c78f2bbf31f934fe76ae0e"}, + {file = "numpy-1.23.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:61be02e3bf810b60ab74e81d6d0d36246dbfb644a462458bb53b595791251911"}, + {file = "numpy-1.23.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:296d17aed51161dbad3c67ed6d164e51fcd18dbcd5dd4f9d0a9c6055dce30810"}, + {file = "numpy-1.23.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4d52914c88b4930dafb6c48ba5115a96cbab40f45740239d9f4159c4ba779962"}, + {file = "numpy-1.23.4.tar.gz", hash = "sha256:ed2cc92af0efad20198638c69bb0fc2870a58dabfba6eb722c933b48556c686c"}, +] +obonet = [ + {file = "obonet-0.3.0-py3-none-any.whl", hash = "sha256:d436eb4f57afa6f1a48992c3a4132126da9793e1439f667ab23cc74d8e957aee"}, + {file = "obonet-0.3.0.tar.gz", hash = "sha256:fd801166cd28a2ef86126f22c8e3da30f5c3b6a3adfc62536abea1aa9956a2b4"}, +] +orderedmultidict = [ + {file = "orderedmultidict-1.0.1-py2.py3-none-any.whl", hash = "sha256:43c839a17ee3cdd62234c47deca1a8508a3f2ca1d0678a3bf791c87cf84adbf3"}, + {file = "orderedmultidict-1.0.1.tar.gz", hash = "sha256:04070bbb5e87291cc9bfa51df413677faf2141c73c61d2a5f7b26bea3cd882ad"}, +] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] +pandas = [ + {file = "pandas-1.5.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0a78e05ec09731c5b3bd7a9805927ea631fe6f6cb06f0e7c63191a9a778d52b4"}, + {file = "pandas-1.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5b0c970e2215572197b42f1cff58a908d734503ea54b326412c70d4692256391"}, + {file = "pandas-1.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f340331a3f411910adfb4bbe46c2ed5872d9e473a783d7f14ecf49bc0869c594"}, + {file = "pandas-1.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8c709f4700573deb2036d240d140934df7e852520f4a584b2a8d5443b71f54d"}, + {file = "pandas-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32e3d9f65606b3f6e76555bfd1d0b68d94aff0929d82010b791b6254bf5a4b96"}, + {file = "pandas-1.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:a52419d9ba5906db516109660b114faf791136c94c1a636ed6b29cbfff9187ee"}, + {file = "pandas-1.5.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:66a1ad667b56e679e06ba73bb88c7309b3f48a4c279bd3afea29f65a766e9036"}, + {file = "pandas-1.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:36aa1f8f680d7584e9b572c3203b20d22d697c31b71189322f16811d4ecfecd3"}, + {file = "pandas-1.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bcf1a82b770b8f8c1e495b19a20d8296f875a796c4fe6e91da5ef107f18c5ecb"}, + {file = "pandas-1.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c25e5c16ee5c0feb6cf9d982b869eec94a22ddfda9aa2fbed00842cbb697624"}, + {file = "pandas-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:932d2d7d3cab44cfa275601c982f30c2d874722ef6396bb539e41e4dc4618ed4"}, + {file = "pandas-1.5.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:eb7e8cf2cf11a2580088009b43de84cabbf6f5dae94ceb489f28dba01a17cb77"}, + {file = "pandas-1.5.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:cb2a9cf1150302d69bb99861c5cddc9c25aceacb0a4ef5299785d0f5389a3209"}, + {file = "pandas-1.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:81f0674fa50b38b6793cd84fae5d67f58f74c2d974d2cb4e476d26eee33343d0"}, + {file = "pandas-1.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:17da7035d9e6f9ea9cdc3a513161f8739b8f8489d31dc932bc5a29a27243f93d"}, + {file = "pandas-1.5.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:669c8605dba6c798c1863157aefde959c1796671ffb342b80fcb80a4c0bc4c26"}, + {file = "pandas-1.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:683779e5728ac9138406c59a11e09cd98c7d2c12f0a5fc2b9c5eecdbb4a00075"}, + {file = "pandas-1.5.1-cp38-cp38-win32.whl", hash = "sha256:ddf46b940ef815af4e542697eaf071f0531449407a7607dd731bf23d156e20a7"}, + {file = "pandas-1.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:db45b94885000981522fb92349e6b76f5aee0924cc5315881239c7859883117d"}, + {file = "pandas-1.5.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:927e59c694e039c75d7023465d311277a1fc29ed7236b5746e9dddf180393113"}, + {file = "pandas-1.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e675f8fe9aa6c418dc8d3aac0087b5294c1a4527f1eacf9fe5ea671685285454"}, + {file = "pandas-1.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:04e51b01d5192499390c0015630975f57836cc95c7411415b499b599b05c0c96"}, + {file = "pandas-1.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cee0c74e93ed4f9d39007e439debcaadc519d7ea5c0afc3d590a3a7b2edf060"}, + {file = "pandas-1.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b156a971bc451c68c9e1f97567c94fd44155f073e3bceb1b0d195fd98ed12048"}, + {file = "pandas-1.5.1-cp39-cp39-win32.whl", hash = "sha256:05c527c64ee02a47a24031c880ee0ded05af0623163494173204c5b72ddce658"}, + {file = "pandas-1.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:6bb391659a747cf4f181a227c3e64b6d197100d53da98dcd766cc158bdd9ec68"}, + {file = "pandas-1.5.1.tar.gz", hash = "sha256:249cec5f2a5b22096440bd85c33106b6102e0672204abd2d5c014106459804ee"}, +] +pkgutil-resolve-name = [ + {file = "pkgutil_resolve_name-1.3.10-py3-none-any.whl", hash = "sha256:ca27cc078d25c5ad71a9de0a7a330146c4e014c2462d9af19c6b828280649c5e"}, + {file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +psycopg2 = [ + {file = "psycopg2-2.9.3-cp310-cp310-win32.whl", hash = "sha256:083707a696e5e1c330af2508d8fab36f9700b26621ccbcb538abe22e15485362"}, + {file = "psycopg2-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:d3ca6421b942f60c008f81a3541e8faf6865a28d5a9b48544b0ee4f40cac7fca"}, + {file = "psycopg2-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:9572e08b50aed176ef6d66f15a21d823bb6f6d23152d35e8451d7d2d18fdac56"}, + {file = "psycopg2-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:a81e3866f99382dfe8c15a151f1ca5fde5815fde879348fe5a9884a7c092a305"}, + {file = "psycopg2-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:cb10d44e6694d763fa1078a26f7f6137d69f555a78ec85dc2ef716c37447e4b2"}, + {file = "psycopg2-2.9.3-cp37-cp37m-win_amd64.whl", hash = 
"sha256:4295093a6ae3434d33ec6baab4ca5512a5082cc43c0505293087b8a46d108461"}, + {file = "psycopg2-2.9.3-cp38-cp38-win32.whl", hash = "sha256:34b33e0162cfcaad151f249c2649fd1030010c16f4bbc40a604c1cb77173dcf7"}, + {file = "psycopg2-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:0762c27d018edbcb2d34d51596e4346c983bd27c330218c56c4dc25ef7e819bf"}, + {file = "psycopg2-2.9.3-cp39-cp39-win32.whl", hash = "sha256:8cf3878353cc04b053822896bc4922b194792df9df2f1ad8da01fb3043602126"}, + {file = "psycopg2-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:06f32425949bd5fe8f625c49f17ebb9784e1e4fe928b7cce72edc36fb68e4c0c"}, + {file = "psycopg2-2.9.3.tar.gz", hash = "sha256:8e841d1bf3434da985cc5ef13e6f75c8981ced601fd70cc6bf33351b91562981"}, +] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] +pyfaidx = [ + {file = "pyfaidx-0.7.1.tar.gz", hash = "sha256:3977632b7fd29049f8b11035d7e9dea0e2c5da9c235f982b4c3fae06ff1fa23f"}, +] +pymysql = [ + {file = "PyMySQL-1.0.2-py3-none-any.whl", hash = "sha256:41fc3a0c5013d5f039639442321185532e3e2c8924687abe6537de157d403641"}, + {file = "PyMySQL-1.0.2.tar.gz", hash = "sha256:816927a350f38d56072aeca5dfb10221fe1dc653745853d30a216637f5d7ad36"}, +] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] +pypika = [ + {file = "PyPika-0.48.9.tar.gz", hash = "sha256:838836a61747e7c8380cd1b7ff638694b7a7335345d0f559b04b2cd832ad5378"}, +] +pyreadline3 = [ + {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"}, + {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"}, +] +pyrsistent = [ + {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, + {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, + {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ed6784ceac462a7d6fcb7e9b663e93b9a6fb373b7f43594f9ff68875788e01e"}, + {file = "pyrsistent-0.18.1-cp310-cp310-win32.whl", hash = "sha256:e4f3149fd5eb9b285d6bfb54d2e5173f6a116fe19172686797c056672689daf6"}, + {file = "pyrsistent-0.18.1-cp310-cp310-win_amd64.whl", hash = "sha256:636ce2dc235046ccd3d8c56a7ad54e99d5c1cd0ef07d9ae847306c91d11b5fec"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e92a52c166426efbe0d1ec1332ee9119b6d32fc1f0bbfd55d5c1088070e7fc1b"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7a096646eab884bf8bed965bad63ea327e0d0c38989fc83c5ea7b8a87037bfc"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cdfd2c361b8a8e5d9499b9082b501c452ade8bbf42aef97ea04854f4a3f43b22"}, + {file = "pyrsistent-0.18.1-cp37-cp37m-win32.whl", hash = "sha256:7ec335fc998faa4febe75cc5268a9eac0478b3f681602c1f27befaf2a1abe1d8"}, + {file = 
"pyrsistent-0.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:6455fc599df93d1f60e1c5c4fe471499f08d190d57eca040c0ea182301321286"}, + {file = "pyrsistent-0.18.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fd8da6d0124efa2f67d86fa70c851022f87c98e205f0594e1fae044e7119a5a6"}, + {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bfe2388663fd18bd8ce7db2c91c7400bf3e1a9e8bd7d63bf7e77d39051b85ec"}, + {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e3e1fcc45199df76053026a51cc59ab2ea3fc7c094c6627e93b7b44cdae2c8c"}, + {file = "pyrsistent-0.18.1-cp38-cp38-win32.whl", hash = "sha256:b568f35ad53a7b07ed9b1b2bae09eb15cdd671a5ba5d2c66caee40dbf91c68ca"}, + {file = "pyrsistent-0.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1b96547410f76078eaf66d282ddca2e4baae8964364abb4f4dcdde855cd123a"}, + {file = "pyrsistent-0.18.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f87cc2863ef33c709e237d4b5f4502a62a00fab450c9e020892e8e2ede5847f5"}, + {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc66318fb7ee012071b2792024564973ecc80e9522842eb4e17743604b5e045"}, + {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:914474c9f1d93080338ace89cb2acee74f4f666fb0424896fcfb8d86058bf17c"}, + {file = "pyrsistent-0.18.1-cp39-cp39-win32.whl", hash = "sha256:1b34eedd6812bf4d33814fca1b66005805d3640ce53140ab8bbb1e2651b0d9bc"}, + {file = "pyrsistent-0.18.1-cp39-cp39-win_amd64.whl", hash = "sha256:e24a828f57e0c337c8d8bb9f6b12f09dfdf0273da25fda9e314f0b684b415a07"}, + {file = "pyrsistent-0.18.1.tar.gz", hash = "sha256:d4d61f8b993a7255ba714df3aca52700f8125289f84f704cf80916517c46eb96"}, +] +pytest = [ + {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, + {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [ + {file = "pytz-2022.5-py2.py3-none-any.whl", hash = "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22"}, + {file = "pytz-2022.5.tar.gz", hash = "sha256:c4d88f472f54d615e9cd582a5004d1e5f624854a6a27a6211591c251f22a6914"}, +] +ratelimiter = [ + {file = "ratelimiter-1.2.0.post0-py3-none-any.whl", hash = "sha256:a52be07bc0bb0b3674b4b304550f10c769bbb00fead3072e035904474259809f"}, + {file = "ratelimiter-1.2.0.post0.tar.gz", hash = "sha256:5c395dcabdbbde2e5178ef3f89b568a3066454a6ddc223b76473dac22f89b4f7"}, +] +requests = [ + {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, + {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, +] +retry = [ + {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"}, + {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"}, +] +scikit-learn = [ + {file = "scikit-learn-1.1.2.tar.gz", hash = 
"sha256:7c22d1305b16f08d57751a4ea36071e2215efb4c09cb79183faa4e8e82a3dbf8"}, + {file = "scikit_learn-1.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6c840f662b5d3377c4ccb8be1fc21bb52cb5d8b8790f8d6bf021739f84e543cf"}, + {file = "scikit_learn-1.1.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2b8db962360c93554cab7bb3c096c4a24695da394dd4b3c3f13409f409b425bc"}, + {file = "scikit_learn-1.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e7d1fc817867a350133f937aaebcafbc06192517cbdf0cf7e5774ad4d1adb9f"}, + {file = "scikit_learn-1.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec3ea40d467966821843210c02117d82b097b54276fdcfb50f4dfb5c60dbe39"}, + {file = "scikit_learn-1.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:bbef6ea1c012ff9f3e6f6e9ca006b8772d8383e177b898091e68fbd9b3f840f9"}, + {file = "scikit_learn-1.1.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a90ca42fe8242fd6ff56cda2fecc5fca586a88a24ab602d275d2d0dcc0b928fb"}, + {file = "scikit_learn-1.1.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a682ec0f82b6f30fb07486daed1c8001b6683cc66b51877644dfc532bece6a18"}, + {file = "scikit_learn-1.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c33e16e9a165af6012f5be530ccfbb672e2bc5f9b840238a05eb7f6694304e3f"}, + {file = "scikit_learn-1.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f94c0146bad51daef919c402a3da8c1c6162619653e1c00c92baa168fda292f2"}, + {file = "scikit_learn-1.1.2-cp38-cp38-win32.whl", hash = "sha256:2f46c6e3ff1054a5ec701646dcfd61d43b8ecac4d416014daed8843cf4c33d4d"}, + {file = "scikit_learn-1.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:b1e706deca9b2ad87ae27dafd5ac4e8eff01b6db492ed5c12cef4735ec5f21ea"}, + {file = "scikit_learn-1.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:567417dbbe6a6278399c3e6daf1654414a5a1a4d818d28f251fa7fc28730a1bf"}, + {file = "scikit_learn-1.1.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:d6f232779023c3b060b80b5c82e5823723bc424dcac1d1a148aa2492c54d245d"}, + {file = "scikit_learn-1.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:589d46f28460469f444b898223b13d99db9463e1038dc581ba698111f612264b"}, + {file = "scikit_learn-1.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76800652fb6d6bf527bce36ecc2cc25738b28fe1a17bd294a218fff8e8bd6d50"}, + {file = "scikit_learn-1.1.2-cp39-cp39-win32.whl", hash = "sha256:1c8fecb7c9984d9ec2ea48898229f98aad681a0873e0935f2b7f724fbce4a047"}, + {file = "scikit_learn-1.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:407e9a1cb9e6ba458a539986a9bd25546a757088095b3aab91d465b79a760d37"}, +] +scipy = [ + {file = "scipy-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ee4ceed204f269da19f67f0115a85d3a2cd8547185037ad99a4025f9c61d02e9"}, + {file = "scipy-1.9.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:17be1a7c68ec4c49d8cd4eb1655d55d14a54ab63012296bdd5921c92dc485acd"}, + {file = "scipy-1.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72297eb9702576bd8f626bb488fd32bb35349d3120fc4a5e733db137f06c9a6"}, + {file = "scipy-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa270cc6080c987929335c4cb94e8054fee9a6058cecff22276fa5dbab9856fc"}, + {file = "scipy-1.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:22380e076a162e81b659d53d75b02e9c75ad14ea2d53d9c645a12543414e2150"}, + {file = "scipy-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:bbed414fc25d64bd6d1613dc0286fbf91902219b8be63ad254525162235b67e9"}, + {file = "scipy-1.9.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:885b7ac56d7460544b2ef89ab9feafa30f4264c9825d975ef690608d07e6cc55"}, + {file = "scipy-1.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5994a8232cc6510a8e85899661df2d11198bf362f0ffe6fbd5c0aca17ab46ce3"}, + {file = "scipy-1.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c83dccac06f3b9aa02df69577f239758d5d0d0c069673fb0b47ecb971983d"}, + {file = "scipy-1.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:92c5e627a0635ca02e6494bbbdb74f98d93ac8730416209d61de3b70c8a821be"}, + {file = "scipy-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b6194da32e0ce9200b2eda4eb4edb89c5cb8b83d6deaf7c35f8ad3d5d7627d5c"}, + {file = "scipy-1.9.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:148cb6f53d9d10dafde848e9aeb1226bf2809d16dc3221b2fa568130b6f2e586"}, + {file = "scipy-1.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:658fd31c6ad4eb9fa3fd460fcac779f70a6bc7480288a211b7658a25891cf01d"}, + {file = "scipy-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4012dbe540732311b8f4388b7e1482eb43a7cc0435bbf2b9916b3d6c38fb8d01"}, + {file = "scipy-1.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:d6cb1f92ded3fc48f7dbe94d20d7b9887e13b874e79043907de541c841563b4c"}, + {file = "scipy-1.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1e3b23a82867018cd26255dc951789a7c567921622073e1113755866f1eae928"}, + {file = "scipy-1.9.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:82e8bfb352aa9dce9a0ffe81f4c369a2c87c85533519441686f59f21d8c09697"}, + {file = "scipy-1.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61b95283529712101bfb7c87faf94cb86ed9e64de079509edfe107e5cfa55733"}, + {file = "scipy-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c8c29703202c39d699b0d6b164bde5501c212005f20abf46ae322b9307c8a41"}, + {file = "scipy-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7b2608b3141c257d01ae772e23b3de9e04d27344e6b68a890883795229cb7191"}, + {file = "scipy-1.9.2.tar.gz", hash = "sha256:99e7720caefb8bca6ebf05c7d96078ed202881f61e0c68bd9e0f3e8097d6f794"}, +] +semver = [ + {file = "semver-2.13.0-py2.py3-none-any.whl", hash = "sha256:ced8b23dceb22134307c1b8abfa523da14198793d9787ac838e70e29e77458d4"}, + {file = "semver-2.13.0.tar.gz", hash = "sha256:fa0fe2722ee1c3f57eac478820c3a5ae2f624af8264cbdf9000c980ff7f75e3f"}, +] +setuptools = [ + {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, + {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, +] +simplejson = [ + {file = "simplejson-3.17.6-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a89acae02b2975b1f8e4974cb8cdf9bf9f6c91162fb8dec50c259ce700f2770a"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:82ff356ff91be0ab2293fc6d8d262451eb6ac4fd999244c4b5f863e049ba219c"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:0de783e9c2b87bdd75b57efa2b6260c24b94605b5c9843517577d40ee0c3cc8a"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:d24a9e61df7a7787b338a58abfba975414937b609eb6b18973e25f573bc0eeeb"}, + {file = "simplejson-3.17.6-cp27-cp27m-manylinux2010_x86_64.whl", hash = 
"sha256:e8603e691580487f11306ecb066c76f1f4a8b54fb3bdb23fa40643a059509366"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:9b01e7b00654115965a206e3015f0166674ec1e575198a62a977355597c0bef5"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:37bc0cf0e5599f36072077e56e248f3336917ded1d33d2688624d8ed3cefd7d2"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:cf6e7d5fe2aeb54898df18db1baf479863eae581cce05410f61f6b4188c8ada1"}, + {file = "simplejson-3.17.6-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:bdfc54b4468ed4cd7415928cbe782f4d782722a81aeb0f81e2ddca9932632211"}, + {file = "simplejson-3.17.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd16302d39c4d6f4afde80edd0c97d4db643327d355a312762ccd9bd2ca515ed"}, + {file = "simplejson-3.17.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:deac4bdafa19bbb89edfb73b19f7f69a52d0b5bd3bb0c4ad404c1bbfd7b4b7fd"}, + {file = "simplejson-3.17.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8bbdb166e2fb816e43ab034c865147edafe28e1b19c72433147789ac83e2dda"}, + {file = "simplejson-3.17.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7854326920d41c3b5d468154318fe6ba4390cb2410480976787c640707e0180"}, + {file = "simplejson-3.17.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:04e31fa6ac8e326480703fb6ded1488bfa6f1d3f760d32e29dbf66d0838982ce"}, + {file = "simplejson-3.17.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f63600ec06982cdf480899026f4fda622776f5fabed9a869fdb32d72bc17e99a"}, + {file = "simplejson-3.17.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e03c3b8cc7883a54c3f34a6a135c4a17bc9088a33f36796acdb47162791b02f6"}, + {file = "simplejson-3.17.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a2d30d6c1652140181dc6861f564449ad71a45e4f165a6868c27d36745b65d40"}, + {file = "simplejson-3.17.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a1aa6e4cae8e3b8d5321be4f51c5ce77188faf7baa9fe1e78611f93a8eed2882"}, + {file = "simplejson-3.17.6-cp310-cp310-win32.whl", hash = "sha256:97202f939c3ff341fc3fa84d15db86156b1edc669424ba20b0a1fcd4a796a045"}, + {file = "simplejson-3.17.6-cp310-cp310-win_amd64.whl", hash = "sha256:80d3bc9944be1d73e5b1726c3bbfd2628d3d7fe2880711b1eb90b617b9b8ac70"}, + {file = "simplejson-3.17.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9fa621b3c0c05d965882c920347b6593751b7ab20d8fa81e426f1735ca1a9fc7"}, + {file = "simplejson-3.17.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd2fb11922f58df8528adfca123f6a84748ad17d066007e7ac977720063556bd"}, + {file = "simplejson-3.17.6-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:724c1fe135aa437d5126138d977004d165a3b5e2ee98fc4eb3e7c0ef645e7e27"}, + {file = "simplejson-3.17.6-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4ff4ac6ff3aa8f814ac0f50bf218a2e1a434a17aafad4f0400a57a8cc62ef17f"}, + {file = "simplejson-3.17.6-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:67093a526e42981fdd954868062e56c9b67fdd7e712616cc3265ad0c210ecb51"}, + {file = "simplejson-3.17.6-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:5d6b4af7ad7e4ac515bc6e602e7b79e2204e25dbd10ab3aa2beef3c5a9cad2c7"}, + {file = "simplejson-3.17.6-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = 
"sha256:1c9b1ed7ed282b36571638297525f8ef80f34b3e2d600a56f962c6044f24200d"}, + {file = "simplejson-3.17.6-cp36-cp36m-win32.whl", hash = "sha256:632ecbbd2228575e6860c9e49ea3cc5423764d5aa70b92acc4e74096fb434044"}, + {file = "simplejson-3.17.6-cp36-cp36m-win_amd64.whl", hash = "sha256:4c09868ddb86bf79b1feb4e3e7e4a35cd6e61ddb3452b54e20cf296313622566"}, + {file = "simplejson-3.17.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4b6bd8144f15a491c662f06814bd8eaa54b17f26095bb775411f39bacaf66837"}, + {file = "simplejson-3.17.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5decdc78849617917c206b01e9fc1d694fd58caa961be816cb37d3150d613d9a"}, + {file = "simplejson-3.17.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:521877c7bd060470806eb6335926e27453d740ac1958eaf0d8c00911bc5e1802"}, + {file = "simplejson-3.17.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:65b998193bd7b0c7ecdfffbc825d808eac66279313cb67d8892bb259c9d91494"}, + {file = "simplejson-3.17.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ac786f6cb7aa10d44e9641c7a7d16d7f6e095b138795cd43503769d4154e0dc2"}, + {file = "simplejson-3.17.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3ff5b3464e1ce86a8de8c88e61d4836927d5595c2162cab22e96ff551b916e81"}, + {file = "simplejson-3.17.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:69bd56b1d257a91e763256d63606937ae4eb890b18a789b66951c00062afec33"}, + {file = "simplejson-3.17.6-cp37-cp37m-win32.whl", hash = "sha256:b81076552d34c27e5149a40187a8f7e2abb2d3185576a317aaf14aeeedad862a"}, + {file = "simplejson-3.17.6-cp37-cp37m-win_amd64.whl", hash = "sha256:07ecaafc1b1501f275bf5acdee34a4ad33c7c24ede287183ea77a02dc071e0c0"}, + {file = "simplejson-3.17.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:068670af975247acbb9fc3d5393293368cda17026db467bf7a51548ee8f17ee1"}, + {file = "simplejson-3.17.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4d1c135af0c72cb28dd259cf7ba218338f4dc027061262e46fe058b4e6a4c6a3"}, + {file = "simplejson-3.17.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:23fe704da910ff45e72543cbba152821685a889cf00fc58d5c8ee96a9bad5f94"}, + {file = "simplejson-3.17.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f444762fed1bc1fd75187ef14a20ed900c1fbb245d45be9e834b822a0223bc81"}, + {file = "simplejson-3.17.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:681eb4d37c9a9a6eb9b3245a5e89d7f7b2b9895590bb08a20aa598c1eb0a1d9d"}, + {file = "simplejson-3.17.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8e8607d8f6b4f9d46fee11447e334d6ab50e993dd4dbfb22f674616ce20907ab"}, + {file = "simplejson-3.17.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b10556817f09d46d420edd982dd0653940b90151d0576f09143a8e773459f6fe"}, + {file = "simplejson-3.17.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e1ec8a9ee0987d4524ffd6299e778c16cc35fef6d1a2764e609f90962f0b293a"}, + {file = "simplejson-3.17.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0b4126cac7d69ac06ff22efd3e0b3328a4a70624fcd6bca4fc1b4e6d9e2e12bf"}, + {file = "simplejson-3.17.6-cp38-cp38-win32.whl", hash = "sha256:35a49ebef25f1ebdef54262e54ae80904d8692367a9f208cdfbc38dbf649e00a"}, + {file = "simplejson-3.17.6-cp38-cp38-win_amd64.whl", hash = 
"sha256:743cd768affaa508a21499f4858c5b824ffa2e1394ed94eb85caf47ac0732198"}, + {file = "simplejson-3.17.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb62d517a516128bacf08cb6a86ecd39fb06d08e7c4980251f5d5601d29989ba"}, + {file = "simplejson-3.17.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:12133863178a8080a3dccbf5cb2edfab0001bc41e5d6d2446af2a1131105adfe"}, + {file = "simplejson-3.17.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5540fba2d437edaf4aa4fbb80f43f42a8334206ad1ad3b27aef577fd989f20d9"}, + {file = "simplejson-3.17.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d74ee72b5071818a1a5dab47338e87f08a738cb938a3b0653b9e4d959ddd1fd9"}, + {file = "simplejson-3.17.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:28221620f4dcabdeac310846629b976e599a13f59abb21616356a85231ebd6ad"}, + {file = "simplejson-3.17.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b09bc62e5193e31d7f9876220fb429ec13a6a181a24d897b9edfbbdbcd678851"}, + {file = "simplejson-3.17.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7255a37ff50593c9b2f1afa8fafd6ef5763213c1ed5a9e2c6f5b9cc925ab979f"}, + {file = "simplejson-3.17.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:401d40969cee3df7bda211e57b903a534561b77a7ade0dd622a8d1a31eaa8ba7"}, + {file = "simplejson-3.17.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a649d0f66029c7eb67042b15374bd93a26aae202591d9afd71e111dd0006b198"}, + {file = "simplejson-3.17.6-cp39-cp39-win32.whl", hash = "sha256:522fad7be85de57430d6d287c4b635813932946ebf41b913fe7e880d154ade2e"}, + {file = "simplejson-3.17.6-cp39-cp39-win_amd64.whl", hash = "sha256:3fe87570168b2ae018391e2b43fbf66e8593a86feccb4b0500d134c998983ccc"}, + {file = "simplejson-3.17.6.tar.gz", hash = "sha256:cf98038d2abf63a1ada5730e91e84c642ba6c225b0198c3684151b1f80c5f8a6"}, +] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +sortedcontainers = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] +soupsieve = [ + {file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"}, + {file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"}, +] +sqlitedict = [ + {file = "sqlitedict-1.7.0.tar.gz", hash = "sha256:2affcc301aacd4da7511692601ecbde392294205af418498f7d6d3ec0dbcad56"}, +] +tatsu = [ + {file = "TatSu-4.4.0-py2.py3-none-any.whl", hash = "sha256:c9211eeee9a2d4c90f69879ec0b518b1aa0d9450249cb0dd181f5f5b18be0a92"}, + {file = "TatSu-4.4.0.zip", hash = "sha256:80713413473a009f2081148d0f494884cabaf9d6866b71f2a68a92b6442f343d"}, +] +threadpoolctl = [ + {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, + {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = 
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] +urllib3 = [ + {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, + {file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, +] +zipp = [ + {file = "zipp-3.9.0-py3-none-any.whl", hash = "sha256:972cfa31bc2fedd3fa838a51e9bc7e64b7fb725a8c00e7431554311f180e9980"}, + {file = "zipp-3.9.0.tar.gz", hash = "sha256:3a7af91c3db40ec72dd9d154ae18e008c69efe8ca88dde4f9a731bb82fe2f9eb"}, +] diff --git a/precompute.nf b/precompute.nf index 3d3458f88..48fd0dc84 100644 --- a/precompute.nf +++ b/precompute.nf @@ -13,6 +13,9 @@ include { query as prev_query} from './workflows/precompute/utils' include { query as basic_query} from './workflows/precompute/utils' include { query as orf_query} from './workflows/precompute/utils' +include { slack_closure } from './workflows/utils/slack' +include { slack_message } from './workflows/utils/slack' + process build_precompute_context { input: path('species-repeats*') @@ -118,8 +121,6 @@ process process_range { } process load_data { - beforeScript 'slack db-work loading-precompute || true' - afterScript 'slack db-done loading-precompute || true' input: path('precompute*.csv') @@ -139,6 +140,9 @@ process load_data { workflow precompute { take: _flag main: + + Channel.of("Starting precompute pipeline") | slack_message + Channel.fromPath('files/precompute/get-accessions/query.sql') | set { accession_query } Channel.fromPath('files/precompute/load.ctl') | set { data_ctl } Channel.fromPath('files/precompute/qa.ctl') | set { qa_ctl } @@ -194,3 +198,13 @@ workflow precompute { workflow { precompute(Channel.of(true)) } + +workflow.onComplete { + + slack_closure("Precompute workflow completed. Data import complete") +} + +workflow.onError { + + slack_closure("Precompute workflow encountered an error and crashed") +} diff --git a/prepare-environment.nf b/prepare-environment.nf new file mode 100644 index 000000000..90d654a24 --- /dev/null +++ b/prepare-environment.nf @@ -0,0 +1,49 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +include { slack_closure } from './workflows/utils/slack' +include { slack_message } from './workflows/utils/slack' + +/* Get some data downloaded and in the right place */ + +/* On the cluster this is much much faster than wget */ +process get_r2dt_data { + queue 'datamover' + executor 'lsf' + container '' + + input: + val(data_dir) + + script: + """ + echo "$data_dir" + if [ ! -d $data_dir ] + then + mkdir -p $data_dir + fi + + cd $data_dir + + cp /nfs/ftp/public/databases/RNAcentral/r2dt/1.3/cms.tar.gz . 
+ + tar -xf cms.tar.gz --strip-components=1 -C ./cms + """ +} + +workflow prepare_environment { + main: + Channel.of("Starting environment preparation") | slack_message + + Channel.of("$params.r2dt.cms_path/../")| get_r2dt_data +} + +workflow { + Channel.of("Starting...") | slack_message + prepare_environment() +} + +workflow.onComplete { + slack_closure("Environment preparation completed") +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..40e730ff0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,48 @@ +[tool.poetry] +name = "rnacentral_pipeline" +version = "0.1.0" +description = "The pipeline that imports all RNAcentral data" +authors = ["Blake Sweeney "] + +[tool.poetry.dependencies] +python = "^3.8" +PyMySQL = "^1.0.2" +attrs = "^21.4.0" +beautifulsoup4 = "^4.10.0" +biopython = "^1.79" +click = "^8.0.3" +click-aliases = "^1.0.1" +furl = "^2.1.3" +gffutils = "^0.10.1" +humanfriendly = "^10.0" +ijson = "^3.1.4" +intervaltree = "^3.1.0" +jsonschema = "^4.3.3" +lxml = "^4.7.1" +more-itertools = "^8.12.0" +obonet = "^0.3.0" +pandas = "^1.3.5" +PyPika = "^0.48.8" +ratelimiter = "^1.2.0" +requests = "^2.27.1" +retry = "^0.9.2" +scikit-learn = "^1.0.2" +semver = "^2.13.0" +sqlitedict = "^1.7.0" +TatSu = "4.4.0" +psycopg2 = "2.9.3" + +[tool.poetry.dev-dependencies] +pytest = "^6.2.5" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +xfail_strict = true +filterwarnings = "ignore::DeprecationWarning" +markers = [ + "slow: Tests that take a long time", + "db: Test that require access to our database", +] diff --git a/references-manually-annotated.nf b/references-manually-annotated.nf new file mode 100644 index 000000000..c42f2a961 --- /dev/null +++ b/references-manually-annotated.nf @@ -0,0 +1,35 @@ +nextflow.enable.dsl=2 + +process get_ids { + publishDir "$baseDir/workflows/references/manually_annotated/", mode: 'copy' + + input: + path(query) + + output: + path('results') + + script: + """ + psql -t -A -f $query "$PGDATABASE" > results + """ +} + +process split_by_db { + publishDir "$baseDir/workflows/references/manually_annotated/", mode: 'copy' + + input: + file(results) + + output: + path('from_*') + + script: + """ + references-manually-annotated.py $results from_* + """ +} + +workflow { + Channel.fromPath('workflows/references/manually_annotated/query.sql') | get_ids | split_by_db +} diff --git a/references-metadata-rnacentral.nf b/references-metadata-rnacentral.nf new file mode 100644 index 000000000..fcf78fcf0 --- /dev/null +++ b/references-metadata-rnacentral.nf @@ -0,0 +1,38 @@ +nextflow.enable.dsl=2 + +process get_urs { + publishDir "$baseDir/workflows/references/metadata/rnacentral", mode: 'copy' + + input: + path(database) + + output: + path("urs_${database.baseName}") + + script: + """ + metadata-rnacentral.py $database urs_${database.baseName} + """ +} + +process get_job { + publishDir "$baseDir/workflows/references/metadata/rnacentral", mode: 'copy' + + input: + path(database) + + output: + path("job_${database.baseName}") + + script: + """ + metadata-rnacentral.py $database job_${database.baseName} + """ +} + + + +workflow { + Channel.fromPath('workflows/references/results/*.txt') | get_urs + Channel.fromPath('workflows/references/results/*.txt') | get_job +} diff --git a/references-metadata.nf b/references-metadata.nf index a243ca34c..1a5e4c9bb 100644 --- a/references-metadata.nf +++ b/references-metadata.nf @@ -1,20 +1,46 @@ nextflow.enable.dsl=2 +process create_metadata 
{ + input: + path(database) + + output: + path("metadata_${database.baseName}") + + script: + """ + metadata.py $database metadata_${database.baseName} + """ +} + +process merge_metadata { + input: + file(results) + + output: + path("merged_metadata") + + script: + """ + cat $results | sort -fb | uniq -i > merged_metadata + """ +} + process create_xml { publishDir "$baseDir/workflows/references/metadata/", mode: 'copy' input: - path(database) + file(merged_metadata) output: - path("metadata_${database.baseName}.xml.gz") + path("metadata_*") script: """ - metadata.py $database metadata_${database.baseName}.xml.gz + create_xml_metadata.py $merged_metadata metadata_* """ } workflow { - Channel.fromPath('workflows/references/results/*.txt') | create_xml + Channel.fromPath('workflows/references/results/*.txt') | create_metadata | collect | merge_metadata | create_xml } diff --git a/references.nf b/references.nf index 17f643014..cdad8ba9a 100644 --- a/references.nf +++ b/references.nf @@ -37,21 +37,41 @@ process sort_ids { script: """ - cat $output | sort | uniq > ${database}.txt + cat $output | sort -fb | uniq -i > ${database}.txt + """ +} + +process prepare_to_submit { + publishDir "$baseDir/workflows/references/submit/", mode: 'copy' + + input: + tuple val(database), path("${database}.txt") + + output: + tuple val(database), path("${database}_ids.txt") + + script: + """ + # make a copy of the old version before creating the new file + rm -f $baseDir/workflows/references/submit/previous-release/${database}_ids.txt + mv $baseDir/workflows/references/submit/${database}_ids.txt $baseDir/workflows/references/submit/previous-release + get_unique_ids.sh ${database}.txt $database """ } process submit_ids { input: - tuple val(database), file("${database}.txt") + tuple val(database), file("${database}_ids.txt") script: """ - upload_ids.sh ${database}.txt $database + # submit new ids only + comm -13 $baseDir/workflows/references/submit/previous-release/${database}_ids.txt $baseDir/workflows/references/submit/${database}_ids.txt > new_${database}_ids.txt + upload_ids.sh new_${database}_ids.txt """ } workflow { - Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids - // Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids | submit_ids + Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids | prepare_to_submit + // Channel.fromPath('workflows/references/queries/*.sql') | get_ids | check_ids | sort_ids | prepare_to_submit | submit_ids } diff --git a/report.nf b/report.nf new file mode 100644 index 000000000..99dfe0c81 --- /dev/null +++ b/report.nf @@ -0,0 +1,11 @@ +process send_completion_report { + executor 'local' + + """ + rnac notify report + """ +} + +workflow { + send_completion_report() +} diff --git a/requirements.in b/requirements.in index e43e25a23..af8426f2e 100644 --- a/requirements.in +++ b/requirements.in @@ -21,6 +21,7 @@ requests retry scikit-learn semver +slack_sdk sqlitedict tatsu textblob diff --git a/requirements.txt b/requirements.txt index 2d7883e9d..784c59e7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,63 +1,132 @@ # -# This file is autogenerated by pip-compile +# This file is autogenerated by pip-compile with python 3.8 # To update, run: # # pip-compile --output-file=requirements.txt requirements.in # -argcomplete==1.12.3 # via gffutils -argh==0.26.2 # via gffutils -attrs==20.3.0 # via -r requirements.in, jsonschema -beautifulsoup4==4.9.3 # via -r requirements.in 
-biopython==1.78 # via -r requirements.in -certifi==2020.12.5 # via requests -chardet==4.0.0 # via requests -click-aliases==1.0.1 # via -r requirements.in -click==7.1.2 # via -r requirements.in, click-aliases, nltk -decorator==4.4.2 # via networkx, retry -furl==2.1.2 # via -r requirements.in -gffutils==0.10.1 # via -r requirements.in -humanfriendly==9.1 # via -r requirements.in -idna==2.10 # via requests -ijson==3.1.4 # via -r requirements.in -importlib-metadata==4.0.1 # via argcomplete, jsonschema -intervaltree==3.1.0 # via -r requirements.in -joblib==1.0.1 # via nltk, scikit-learn -jsonschema==3.2.0 # via -r requirements.in -lxml==4.6.3 # via -r requirements.in -more-itertools==8.7.0 # via -r requirements.in -networkx==2.5.1 # via obonet -nltk==3.6.2 # via textblob -numpy==1.20.2 # via biopython, pandas, scikit-learn, scipy -obonet==0.3.0 # via -r requirements.in -orderedmultidict==1.0.1 # via furl -pandas==1.2.4 # via -r requirements.in -psycopg2==2.8.6 # via -r requirements.in -py==1.10.0 # via retry -pyfaidx==0.5.9.5 # via gffutils -pymysql==1.0.2 # via -r requirements.in -pypika==0.48.1 # via -r requirements.in -pyrsistent==0.17.3 # via jsonschema -python-dateutil==2.8.1 # via pandas -pytz==2021.1 # via pandas -ratelimiter==1.2.0.post0 # via -r requirements.in -regex==2021.4.4 # via nltk -requests==2.25.1 # via -r requirements.in -retry==0.9.2 # via -r requirements.in -scikit-learn==0.24.1 # via -r requirements.in -scipy==1.6.2 # via scikit-learn -semver==2.13.0 # via -r requirements.in -simplejson==3.17.2 # via gffutils -six==1.15.0 # via furl, gffutils, jsonschema, orderedmultidict, pyfaidx, python-dateutil -sortedcontainers==2.3.0 # via intervaltree -soupsieve==2.2.1 # via beautifulsoup4 -sqlitedict==1.7.0 # via -r requirements.in -tatsu==4.4.0 # via -r requirements.in -textblob==0.15.3 # via -r requirements.in -threadpoolctl==2.1.0 # via scikit-learn -tqdm==4.60.0 # via nltk -typing-extensions==3.7.4.3 # via importlib-metadata -urllib3==1.26.4 # via requests -zipp==3.4.1 # via importlib-metadata +argcomplete==1.12.3 + # via gffutils +argh==0.26.2 + # via gffutils +attrs==20.3.0 + # via + # -r requirements.in + # jsonschema +beautifulsoup4==4.9.3 + # via -r requirements.in +biopython==1.78 + # via -r requirements.in +certifi==2022.6.15 + # via requests +chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -r requirements.in + # click-aliases + # nltk +click-aliases==1.0.1 + # via -r requirements.in +decorator==4.4.2 + # via + # networkx + # retry +furl==2.1.2 + # via -r requirements.in +gffutils==0.10.1 + # via -r requirements.in +humanfriendly==9.1 + # via -r requirements.in +idna==2.10 + # via requests +ijson==3.1.4 + # via -r requirements.in +intervaltree==3.1.0 + # via -r requirements.in +joblib==1.0.1 + # via + # nltk + # scikit-learn +jsonschema==3.2.0 + # via -r requirements.in +lxml==4.6.3 + # via -r requirements.in +more-itertools==8.7.0 + # via -r requirements.in +networkx==2.5.1 + # via obonet +nltk==3.6.2 + # via textblob +numpy==1.23.0 + # via + # biopython + # pandas + # scikit-learn + # scipy +obonet==0.3.0 + # via -r requirements.in +orderedmultidict==1.0.1 + # via furl +pandas==1.4.3 + # via -r requirements.in +psycopg2==2.8.6 + # via -r requirements.in +py==1.10.0 + # via retry +pyfaidx==0.5.9.5 + # via gffutils +pymysql==1.0.2 + # via -r requirements.in +pypika==0.48.1 + # via -r requirements.in +pyrsistent==0.17.3 + # via jsonschema +python-dateutil==2.8.1 + # via pandas +pytz==2021.1 + # via pandas +ratelimiter==1.2.0.post0 + # via -r requirements.in 
+regex==2021.4.4 + # via nltk +requests==2.25.1 + # via -r requirements.in +retry==0.9.2 + # via -r requirements.in +scikit-learn==1.1.1 + # via -r requirements.in +scipy==1.8.1 + # via scikit-learn +semver==2.13.0 + # via -r requirements.in +simplejson==3.17.2 + # via gffutils +six==1.15.0 + # via + # furl + # gffutils + # jsonschema + # orderedmultidict + # pyfaidx + # python-dateutil +slack-sdk==3.18.1 + # via -r requirements.in +sortedcontainers==2.3.0 + # via intervaltree +soupsieve==2.2.1 + # via beautifulsoup4 +sqlitedict==1.7.0 + # via -r requirements.in +tatsu==4.4.0 + # via -r requirements.in +textblob==0.15.3 + # via -r requirements.in +threadpoolctl==2.1.0 + # via scikit-learn +tqdm==4.60.0 + # via nltk +urllib3==1.26.4 + # via requests # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/rnacentral_pipeline/cli/__init__.py b/rnacentral_pipeline/cli/__init__.py index 7890ea211..9a754a443 100644 --- a/rnacentral_pipeline/cli/__init__.py +++ b/rnacentral_pipeline/cli/__init__.py @@ -23,6 +23,7 @@ crw, ena, ensembl, + expressionatlas, europepmc, five_s_rrnadb, flybase, @@ -40,9 +41,11 @@ mirgenedb, misc, ncbi, + notify, ols, pdb, pirbase, + plncdb, pombase, psicquic, precompute, @@ -55,6 +58,7 @@ rfam, ribovision, search_export, + scan_imports, sgd, silva, snodb, @@ -91,6 +95,7 @@ def cli(log_level): cli.add_command(ena.cli) cli.add_command(ensembl.cli) cli.add_command(europepmc.cli) +cli.add_command(expressionatlas.cli) cli.add_command(five_s_rrnadb.cli) cli.add_command(flybase.cli) cli.add_command(ftp_export.cli) @@ -109,9 +114,11 @@ def cli(log_level): cli.add_command(misc.find_upi_ranges) cli.add_command(misc.validate_pgloader) cli.add_command(ncbi.cli) +cli.add_command(notify.cli) cli.add_command(ols.cli) cli.add_command(pdb.cli) cli.add_command(pirbase.cli) +cli.add_command(plncdb.cli) cli.add_command(pombase.cli) cli.add_command(psicquic.cli) cli.add_command(precompute.cli) @@ -123,6 +130,7 @@ def cli(log_level): cli.add_command(repeats.cli) cli.add_command(rfam.cli) cli.add_command(ribovision.cli) +cli.add_command(scan_imports.cli) cli.add_command(search_export.cli) cli.add_command(sgd.cli) cli.add_command(silva.cli) diff --git a/rnacentral_pipeline/cli/crw.py b/rnacentral_pipeline/cli/crw.py index cc6d7f794..57b35d77c 100644 --- a/rnacentral_pipeline/cli/crw.py +++ b/rnacentral_pipeline/cli/crw.py @@ -19,7 +19,7 @@ from Bio import SeqIO -from rnacentral_pipeline.databases.crw import parser +from rnacentral_pipeline.databases.crw import parser, helpers from rnacentral_pipeline.writers import entry_writer @@ -53,5 +53,5 @@ def process_crw(metadata_file, sequence_directory, output): @click.argument("directory", type=click.Path()) @click.argument("output", type=click.File("w")) def generate_r2dt_fasta(directory, output): - entries = parser.fasta_entries(Path(directory)) + entries = helpers.fasta_entries(Path(directory)) SeqIO.write(entries, output, "fasta") diff --git a/rnacentral_pipeline/cli/ena.py b/rnacentral_pipeline/cli/ena.py index 2a09656d8..70c87d3ed 100644 --- a/rnacentral_pipeline/cli/ena.py +++ b/rnacentral_pipeline/cli/ena.py @@ -13,11 +13,13 @@ limitations under the License. 
""" +import os from pathlib import Path import click from rnacentral_pipeline.databases.ena import context, parser +from rnacentral_pipeline.rnacentral.notify.slack import send_notification from rnacentral_pipeline.writers import entry_writer @@ -58,6 +60,26 @@ def process_ena( builder.with_dr(ena_file) ctx = builder.context() entries = parser.parse_with_context(ctx, ena_file) - with entry_writer(Path(output)) as writer: - writer.write(entries) + try: + with entry_writer(Path(output)) as writer: + writer.write(entries) + except ValueError: + print("No entries could be written for one of the parsed ENA files.") + print("Sending warning to slack, but carrying on") + + # Dump this again to attach to the report + ctx.dump_counts(Path(counts)) + + message = f"No entries could be written for ENA file {ena_file}\n" + message += "This may be correct, but you should check\n" + message += f"Working directory: {os.getcwd()}\n" + message += "Ribotyper log:\n" + message += open( + Path(ribovore_path) / "ribotyper-results.ribotyper.log", "r" + ).read() + message += "\n\nContext counts:\n" + message += open(Path(counts), "r").read() + + send_notification("ENA parsing error", message) + ctx.dump_counts(Path(counts)) diff --git a/rnacentral_pipeline/cli/ensembl.py b/rnacentral_pipeline/cli/ensembl.py index db6a082a6..9c20ca59a 100644 --- a/rnacentral_pipeline/cli/ensembl.py +++ b/rnacentral_pipeline/cli/ensembl.py @@ -14,21 +14,22 @@ """ import csv -from pathlib import Path import itertools as it import operator as op +from pathlib import Path import click -from rnacentral_pipeline.databases.ensembl.metadata import assemblies -from rnacentral_pipeline.databases.ensembl.metadata import compara -from rnacentral_pipeline.databases.ensembl.metadata import coordinate_systems -from rnacentral_pipeline.databases.ensembl.metadata import karyotypes -from rnacentral_pipeline.databases.ensembl.metadata import proteins +from rnacentral_pipeline.databases.ensembl import parser, pseudogenes, urls from rnacentral_pipeline.databases.ensembl.data import Division -from rnacentral_pipeline.databases.ensembl import parser -from rnacentral_pipeline.databases.ensembl import pseudogenes -from rnacentral_pipeline.databases.ensembl import urls +from rnacentral_pipeline.databases.ensembl.metadata import ( + assemblies, + compara, + coordinate_systems, + karyotypes, + proteins, +) +from rnacentral_pipeline.rnacentral.notify import slack from rnacentral_pipeline.writers import entry_writer @@ -81,8 +82,20 @@ def parse_data(division, embl_file, gff_file, output, family_file=None): if family_file: family_file = Path(family_file) entries = parser.parse(division, embl_file, gff_file, family_file=family_file) - with entry_writer(Path(output)) as writer: - writer.write(entries) + ## Send warning to slack with details about empty parse + try: + with entry_writer(Path(output)) as writer: + writer.write(entries) + except ValueError: + print("Empty entries, implies no ncRNAs. You should check that") + message = f"No ncRNA entries found for {embl_file.name}, or {gff_file.name}. 
Empty data supplied for now, but you should check the legitimacy of this result.\n" + message += "For reference, the other parameters to the parser were:\n" + message += f"division: {division}\n" + message += f"embl_file: {embl_file.name}\n" + message += f"gff_file: {gff_file.name}\n" + message += f"family_file: {family_file.name}\n" + + slack.send_notification("Ensembl parser error", message) @cli.command("assemblies") diff --git a/rnacentral_pipeline/cli/expressionatlas.py b/rnacentral_pipeline/cli/expressionatlas.py new file mode 100644 index 000000000..5625a7bd9 --- /dev/null +++ b/rnacentral_pipeline/cli/expressionatlas.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2021] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pathlib import Path + +import click + +from rnacentral_pipeline.databases.expressionatlas import parser +from rnacentral_pipeline.writers import entry_writer + + +@click.group("expressionatlas") +def cli(): + """ + Commands for parsing expression atlas data + """ + + +@cli.command("parse") +@click.option("--db-url", envvar="PGDATABASE") +@click.argument("csv_file", type=click.File("r")) +@click.argument( + "output", + default=".", + type=click.Path(writable=True, dir_okay=True, file_okay=False), +) +def process_csv(csv_file, output, db_url): + """ + Process the csv generated by linking EA data to rnc data + """ + entries = parser.parse(csv_file, db_url) + with entry_writer(Path(output)) as writer: + writer.write(entries) diff --git a/rnacentral_pipeline/cli/notify.py b/rnacentral_pipeline/cli/notify.py new file mode 100644 index 000000000..28afdac56 --- /dev/null +++ b/rnacentral_pipeline/cli/notify.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2017] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import click +import os + +from rnacentral_pipeline.rnacentral.notify.slack import send_notification, pipeline_report +from rnacentral_pipeline import db + + + +@click.group("notify") +def cli(): + """ + This group of commands deals with sending notifications + """ + +@cli.command("step") +@click.argument("title", type=click.STRING) +@click.argument("message", type=click.STRING) +def notify_step(title, message): + """ + Send a simple message, maybe when a step finishes + """ + send_notification(title, message) + + +@cli.command("query") +@click.argument("title", type=click.STRING) +@click.argument("query", type=click.STRING) +def notify_query(title, query): + """ + Run a query against the database, then format the result into markdown and + send as a notification in slack. + """ + + # parse query - try to figure out what table headings to give + # There is probably a better way to do this + args = [ + arg.strip() for arg in + query.upper().removeprefix("SELECT ").split("FROM")[0].split(',') + ] + + headerline = f"{' : '.join(args)} \n" + + # get PGDATABASE url from environment + PGDATABASE = os.getenv("PGDATABASE") + + # run query, convert output to list of tuples + result = list(db.run_query(PGDATABASE, query, commit_on_leave=False)) + + markdown_string = "" + markdown_string += f"Result of query: {query}\n\n" + markdown_string += headerline + + # add the results to the message... + for res in result: + markdown_string += f"- {' : '.join([str(r) for r in res])} \n" + + markdown_string += '\n' + + send_notification(title, markdown_string) + + + +@cli.command("file") +@click.argument("path", type=click.File()) +def notify_file(path): + """ + Read a mrkdwn formatted file and send as a message in slack. + """ + send_notification("", path.read()) + +@cli.command("report") +def notify_report(): + """ + Generate a run report and send it as mrkdwn + """ + pipeline_report() diff --git a/rnacentral_pipeline/cli/pdb.py b/rnacentral_pipeline/cli/pdb.py index 7b6dad455..e650581dc 100644 --- a/rnacentral_pipeline/cli/pdb.py +++ b/rnacentral_pipeline/cli/pdb.py @@ -13,14 +13,15 @@ limitations under the License. """ +import collections as coll +import csv import logging from pathlib import Path import click -from rnacentral_pipeline.databases.pdb import fetch -from rnacentral_pipeline.databases.pdb import parser from rnacentral_pipeline import writers +from rnacentral_pipeline.databases.pdb import fetch, helpers, parser LOGGER = logging.getLogger(__name__) @@ -44,12 +45,24 @@ def cli(): file_okay=False, ), ) -def process_pdb(output, skip_references=False): +@click.option( + "--override-chains", + default=None, + type=click.File("r"), +) +def process_pdb(output, skip_references=False, override_chains=None): """ This will fetch and parse all sequence data from PDBe to produce the csv files we import. 
""" - chain_info = fetch.rna_chains() + pdb_ids = set() + overrides = set() + if override_chains: + LOGGER.info("Loading chain overrides") + overrides = helpers.load_overrides(override_chains) + LOGGER.info("Loaded %i chain overrides", len(pdb_ids)) + chain_info = fetch.rna_chains(overrides) + LOGGER.info("Loaded %i chains", len(chain_info)) references = {} try: if not skip_references: @@ -57,6 +70,6 @@ def process_pdb(output, skip_references=False): except Exception: LOGGER.info("Failed to get extra references") - entries = parser.parse(chain_info, references) + entries = parser.parse(chain_info, references, overrides) with writers.entry_writer(Path(output)) as writer: writer.write(entries) diff --git a/rnacentral_pipeline/cli/plncdb.py b/rnacentral_pipeline/cli/plncdb.py new file mode 100644 index 000000000..1af85c627 --- /dev/null +++ b/rnacentral_pipeline/cli/plncdb.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2020] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from pathlib import Path + +import click +from furl import furl + +import requests + +from rnacentral_pipeline.databases.plncdb import parser +from rnacentral_pipeline.writers import entry_writer +from rnacentral_pipeline.rnacentral.notify.slack import send_notification + +@click.group("plncdb") +def cli(): + """ + A group of commands dealing with PLncDB data. 
+ """ + pass + + +@cli.command("parse") +@click.argument("data", type=click.Path(dir_okay=True, readable=True, file_okay=False)) +@click.argument( + "output", + default=".", + type=click.Path(writable=True, dir_okay=True, file_okay=False), +) +def parse(data, output): + entries = parser.parse(Path(data)) + with entry_writer(Path(output)) as writer: + try: + writer.write(entries) + except ValueError as e: + print(e) + + +@cli.command("fetch-data") +@click.argument("urls", type=click.Path(writable=False, file_okay=True, dir_okay=False)) +@click.argument("destination", type=click.Path(writable=True, file_okay=False, dir_okay=True), default='.') +def fetch_data(urls, destination): + url_dict = {} + with open(urls, 'r') as url_file: + for url_line in url_file: + url_dict[url_line.split(',')[0]] = url_line.split(',')[1:] + + + for dir_name in url_dict.keys(): + print(f"Getting data for {dir_name}") + + send_notification("PLncDB Download", f"Getting data for {dir_name}") + + target_path = Path(destination) / dir_name + target_path.mkdir(exist_ok=True, parents=True) + for url in url_dict[dir_name]: + download_file(url, target_path) + print(f"All data for {dir_name} is downloaded") + + +def download_file(url, destination=Path('.')): + local_filename = url.split('/')[-1] + if (destination / local_filename).exists(): + return local_filename + # NOTE the stream=True parameter below + with requests.get(url.strip(), stream=True) as r: + r.raise_for_status() + with open(destination / local_filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + # If you have chunk encoded response uncomment if + # and set chunk_size parameter to None. + f.write(chunk) + return local_filename diff --git a/rnacentral_pipeline/cli/scan_imports.py b/rnacentral_pipeline/cli/scan_imports.py new file mode 100644 index 000000000..883d7ef0a --- /dev/null +++ b/rnacentral_pipeline/cli/scan_imports.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2018] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import click +import psycopg2 +import psycopg2.extras +import pandas as pd + + +@click.group("scan-imports") +def cli(): + """ + A group of commands to scan imports and decide what to run + """ + pass + +""" +This is the process I think + +manual selection -> csv file +csv file: db_name, remote -> nf runs process(check_db_md5) -> list of db_name: md5 +list of db_name: md5 -> nf runs process(select_for_import) -> db_selection.config + +main pipeline includes selection.config to switch on/off the right dbs + +md5 creation can be done in shell for now, could do something with stripping the +metadata of json files to compare only the actual data md5 (date would change the overall sum) + +""" + + + + +@cli.command("select-for-import") +@click.option("--db-url", envvar="PGDATABASE") +@click.argument("db_md5_map") +@click.argument("output", default="db_selection.config") +def select_db_to_import(db_md5_map, output, db_url=None, type=click.Path(writable=True,dir_okay=False,file_okay=True)): + """ + Takes the map of db name to md5 sum and queries our DB to select those DBs that can usefully be imported + + Outputs a config file that the weekly import includes to switch on/off the relevant DBs + """ + + selection_template = """params {{ + databases {{ + {0} + {1} + }} + }}""" + + latest_checksums = pd.read_csv(db_md5_map, names=["db_name", "checksum"]) + + conn = psycopg2.connect(db_url) + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + cur.execute("SELECT * FROM rnc_import_tracker;") + + prev_checksums = pd.DataFrame(cur.fetchall()) + + cur.close() + conn.close() + + selection = latest_checksums.join(prev_checksums.set_index("db_name"), on="db_name").query("checksum != file_md5")['db_name'].values + deselection = latest_checksums.join(prev_checksums.set_index("db_name"), on="db_name").query("checksum == file_md5")['db_name'].values + + selection = [f"{s}.run = true" for s in selection] + deselection = [f"{s}.run = false" for s in deselection] + + activation_string = "\n\t\t".join(selection) + deactivation_string = "\n\t\t".join(deselection) + + with open(output, 'w') as selection_config: + selection_config.write(selection_template.format(activation_string, deactivation_string)) + +@cli.command("update-tracker") +@click.argument("latest_md5s") +@click.option("--db-url", envvar="PGDATABASE") +def update_tracker(latest_md5s, db_url): + latest_checksums = pd.read_csv(latest_md5s, names=["db_name", "checksum"]) + + conn = psycopg2.connect(db_url) + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + cur.execute("SELECT * FROM rnc_import_tracker;") + + prev_checksums = pd.DataFrame(cur.fetchall()) + + selection = latest_checksums.join(prev_checksums.set_index("db_name"), on="db_name").query("checksum != file_md5") + selection['db_name'] = selection['db_name'].apply(lambda x: x.upper()) + + print(cur.execute("SELECT * FROM rnc_database WHERE descr = ANY(%s);", (list(selection['db_name'].values),) ) ) + all_dbs = pd.DataFrame(cur.fetchall()) + + insert_data = selection.join(all_dbs.set_index('descr'), on='db_name', rsuffix='_r').filter(items=["db_name", "id_r", "checksum"]) + + print(insert_data) + + for idx, row in insert_data.iterrows(): + print(row) + db_name = row[0].lower() + db_id = row[1] + checksum = row[2] + + cur.execute("TRUNCATE TABLE rnc_import_tracker") + cur.execute("INSERT INTO rnc_import_tracker(db_name, db_id, last_import_date, file_md5) VALUES (%s, %s, CURRENT_TIMESTAMP, %s) ", (db_name, db_id, checksum,)) + + conn.commit() + cur.close() + 
conn.close() diff --git a/rnacentral_pipeline/databases/crw/helpers.py b/rnacentral_pipeline/databases/crw/helpers.py index daf5395ae..bcfd1c7f9 100644 --- a/rnacentral_pipeline/databases/crw/helpers.py +++ b/rnacentral_pipeline/databases/crw/helpers.py @@ -55,7 +55,7 @@ def lineage(row: ty.Dict[str, ty.Any]) -> str: def sequence(row: ty.Dict[str, ty.Any], sequences: ty.Dict[str, SeqRecord]) -> str: - return str(sequences[row["model_name"]].seq) + return str(sequences[row["model_name"]].seq).upper().replace("U", "T") def description(row: ty.Dict[str, ty.Any]) -> str: @@ -68,8 +68,11 @@ def description(row: ty.Dict[str, ty.Any]) -> str: def organelle(row: ty.Dict[str, ty.Any]) -> ty.Optional[str]: - return ORGANELLE_MAPPING.get(row["cellular_location"], None) - + cellular_location = row.get("cellular_location", None) + if cellular_location is not None: + return ORGANELLE_MAPPING.get(row["cellular_location"], None) + else: + return None def as_entry(row: ty.Dict[str, ty.Any], sequences) -> ty.Optional[data.Entry]: try: @@ -99,11 +102,11 @@ def as_entry(row: ty.Dict[str, ty.Any], sequences) -> ty.Optional[data.Entry]: def fasta_entries(directory: Path) -> ty.Iterable[SeqRecord]: - model_pattern = re.compile("crw-bpseq/(.+).bpseq") + model_pattern = re.compile("crw-bpseq/(.+).bpseq ") for fasta_file in directory.glob("*.fasta"): with fasta_file.open("r") as raw: header, sequence, _ = raw.readlines() matches = re.search(model_pattern, header) if matches is None: raise ValueError(f"Could not get model id from {header}") - yield SeqRecord(Seq(sequence), id=matches.group(1)) + yield SeqRecord(Seq(sequence.strip()), id=matches.group(1)) diff --git a/rnacentral_pipeline/databases/data/databases.py b/rnacentral_pipeline/databases/data/databases.py index 94fb6cc38..9eefd19d0 100644 --- a/rnacentral_pipeline/databases/data/databases.py +++ b/rnacentral_pipeline/databases/data/databases.py @@ -39,6 +39,7 @@ class Database(enum.Enum): ensembl_metazoa = DatabaseValue(5, "Ensembl Metazoa") ensembl_plants = DatabaseValue(6, "Ensembl Plants") ensembl_protists = DatabaseValue(7, "Ensembl Protists") + expression_atlas = DatabaseValue(51, "Expression Atlas") five_srrnadb = DatabaseValue(8, "5SrRNAdb") flybase = DatabaseValue(9, "FlyBase") gencode = DatabaseValue(10, "Ensembl/GENCODE") @@ -59,6 +60,7 @@ class Database(enum.Enum): noncode = DatabaseValue(25, "NONCODE") pdbe = DatabaseValue(26, "PDBe") pirbase = DatabaseValue(27, "PirBase") + plncdb = DatabaseValue(50, "PLncDB") pombase = DatabaseValue(28, "PomBase") psicquic = DatabaseValue(48, "PSICQUIC") rdp = DatabaseValue(29, "RDP") diff --git a/rnacentral_pipeline/databases/data/entry.py b/rnacentral_pipeline/databases/data/entry.py index 4ba206fb0..4e87519cd 100644 --- a/rnacentral_pipeline/databases/data/entry.py +++ b/rnacentral_pipeline/databases/data/entry.py @@ -28,10 +28,10 @@ from . import utils from .features import SequenceFeature +from .go_annotations import GoTermAnnotation from .references import IdReference, Reference -from .secondary_structure import SecondaryStructure from .regions import Exon, SequenceRegion -from .go_annotations import GoTermAnnotation +from .secondary_structure import SecondaryStructure LOGGER = logging.getLogger(__name__) @@ -177,7 +177,10 @@ def gene_synonym(self) -> str: """ Returns a comma separated list of gene synonyms. 
""" - return ",".join(self.gene_synonyms) + if self.gene_synonyms: + return ",".join(self.gene_synonyms) + else: + return "" @property def feature_location_start(self): diff --git a/rnacentral_pipeline/databases/data/utils.py b/rnacentral_pipeline/databases/data/utils.py index 63a09f824..7d8388275 100644 --- a/rnacentral_pipeline/databases/data/utils.py +++ b/rnacentral_pipeline/databases/data/utils.py @@ -28,6 +28,7 @@ "Y_RNA": "SO:0000405", "antisense_RNA": "SO:0000644", "autocatalytically_spliced_intron": "SO:0000588", + "circRNA": "SO:0002291", "guide_RNA": "SO:0000602", "hammerhead_ribozyme": "SO:0000380", "lncRNA": "SO:0001877", @@ -41,6 +42,7 @@ "ribozyme": "SO:0000374", "scRNA": "SO:0000013", "scaRNA": "SO:0002095", + "sgRNA": "SO:0001998", "siRNA": "SO:0000646", "snRNA": "SO:0000274", "snoRNA": "SO:0000275", diff --git a/rnacentral_pipeline/databases/ensembl/genomes/parser.py b/rnacentral_pipeline/databases/ensembl/genomes/parser.py index a5a8d27cb..4715bb73a 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/parser.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/parser.py @@ -13,20 +13,19 @@ limitations under the License. """ -import operator as op import itertools as it +import operator as op import typing as ty from Bio import SeqIO from rnacentral_pipeline.databases import data -from rnacentral_pipeline.databases.helpers import embl -from rnacentral_pipeline.databases.ensembl.vertebrates import helpers as ensembl - from rnacentral_pipeline.databases.ensembl import helpers as common +from rnacentral_pipeline.databases.ensembl.data import Pseudogene from rnacentral_pipeline.databases.ensembl.genomes import helpers from rnacentral_pipeline.databases.ensembl.genomes.data import Context -from rnacentral_pipeline.databases.ensembl.data import Pseudogene +from rnacentral_pipeline.databases.ensembl.vertebrates import helpers as ensembl +from rnacentral_pipeline.databases.helpers import embl def ncrnas(context: Context, handle) -> ty.Iterable[data.Entry]: @@ -57,20 +56,29 @@ def parse(context: Context, handle) -> ty.Iterable[data.Entry]: def pseudogenes(handle: ty.IO) -> ty.Iterable[Pseudogene]: - for record in SeqIO.parse(handle, "embl"): - current_gene = None - for feature in record.features: - if feature.type == "source": - continue - - if embl.is_gene(feature) and help: - current_gene = feature - - if helpers.is_pseudogene(current_gene, feature): - gene = embl.gene(feature) - if not gene: + try: + for record in SeqIO.parse(handle, "embl"): + current_gene = None + for feature in record.features: + if feature.type == "source": continue - yield Pseudogene( - gene=embl.gene(feature), - region=common.regions(record, feature)[0], - ) + + if embl.is_gene(feature) and help: + current_gene = feature + + if helpers.is_pseudogene(current_gene, feature): + gene = embl.gene(feature) + if not gene: + continue + yield Pseudogene( + gene=embl.gene(feature), + region=common.regions(record, feature)[0], + ) + except UnicodeDecodeError: + import os + + print(f"UTF-8 error in file {handle.name}. Abort parsing.") + print(f"The working directory is {os.getcwd()}") + message = f"UFT-8 error in file {handle.name} during pseudogenes parsing. 
Aborting parse\n" + message += f"Working directory: {os.getcwd()}" + slack.send_notification("Ensembl parser error", message) diff --git a/rnacentral_pipeline/databases/ensembl/genomes/urls.py b/rnacentral_pipeline/databases/ensembl/genomes/urls.py index 70da8a196..5ae922997 100644 --- a/rnacentral_pipeline/databases/ensembl/genomes/urls.py +++ b/rnacentral_pipeline/databases/ensembl/genomes/urls.py @@ -14,11 +14,11 @@ """ import json -import tempfile -from ftplib import FTP import logging +import tempfile import typing as ty from contextlib import contextmanager +from ftplib import FTP from rnacentral_pipeline.databases.ensembl.data import Division, FtpInfo @@ -71,16 +71,17 @@ def generate_paths( gff_path = f"{base}/{release}/gff3/{name}/{organism_name}.gff3.gz" data_files = f"{base}/{release}/embl/{name}/{organism_name}.*.dat.gz" - try: - size = ftp.size(gff_path) - if size is None: - LOGGER.warn("GFF file %s is empty, skip %s", gff_path, assembly) - continue - except: - LOGGER.warn( - "Could not get data for %s, skipping %s", gff_path, assembly - ) - continue + # try: + # size = ftp.size(gff_path) + # if size is None: + # LOGGER.warn("GFF file %s is empty, skip %s", gff_path, assembly) + # continue + # except e: + # LOGGER.warn( + # "Could not get data for %s, skipping %s", gff_path, assembly + # ) + # print(e) + # continue yield FtpInfo( division=division, diff --git a/rnacentral_pipeline/databases/europepmc/xml.py b/rnacentral_pipeline/databases/europepmc/xml.py index fb30d5bfb..53d279f5f 100644 --- a/rnacentral_pipeline/databases/europepmc/xml.py +++ b/rnacentral_pipeline/databases/europepmc/xml.py @@ -203,7 +203,7 @@ def node_to_reference(node): def parse(xml_file): - for _, node in ET.iterparse(xml_file, events=("end",), tag="PMC_ARTICLE"): + for _, node in ET.iterparse(xml_file, recover=True, events=("end",), tag="PMC_ARTICLE"): ref = node_to_reference(node) if not ref: continue diff --git a/rnacentral_pipeline/databases/expressionatlas/__init__.py b/rnacentral_pipeline/databases/expressionatlas/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnacentral_pipeline/databases/expressionatlas/helpers.py b/rnacentral_pipeline/databases/expressionatlas/helpers.py new file mode 100644 index 000000000..4329e3fc6 --- /dev/null +++ b/rnacentral_pipeline/databases/expressionatlas/helpers.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-current] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from rnacentral_pipeline.databases.data import Entry, Exon, SequenceRegion +from rnacentral_pipeline.databases.helpers import phylogeny as phy +from rnacentral_pipeline.databases.helpers import publications as pubs + + +def accession(info): + return "EXPRESSIONATLAS:" + info["GeneID"] + + +def primary_id(info): + return "EXPRESSIONATLAS:" + info["GeneID"] + + +def taxid(info): + taxid = info["taxid"][0] + return int(taxid) + + +def species(info): + return phy.species(info["taxid"][0]) + + +def lineage(info): + return phy.lineage(info["taxid"][0]) + + +def common_name(info): + return phy.common_name(info["taxid"][0]) + + +def url(experiment): + return "https://www.ebi.ac.uk/gxa/experiments/" + experiment + + +def region_builder(info): + print(info["region_start"], info["region_stop"], info["strand"], info["urs_taxid"]) + return [ + SequenceRegion( + chromosome=info["chromosome"][0], + strand=info["strand"][0], + exons=[ + Exon(start=start, stop=stop) + for start, stop in zip(info["region_start"], info["region_stop"]) + ], + assembly_id=info["assembly_id"][0], + coordinate_system="1-start, fully-closed", + ) + ] + + +def references(interactions): + refs = set() + for interaction in interactions: + refs.update(interaction.publications) + refs.add(pubs.reference(24234451)) + return list(refs) + + +def as_entry(info, experiment): + synonyms = list( + filter(None, [""] if info["Gene Name"] == [None] else info["Gene Name"]) + ) + return Entry( + primary_id=primary_id(info), + accession=accession(info), + ncbi_tax_id=taxid(info), + database="EXPRESSIONATLAS", + sequence=info["seq"][0], + regions=region_builder(info), + rna_type=info["rna_type"][0], + url=url(experiment), + seq_version="1", + description=info["description"][0], + species=species(info), + common_name=common_name(info), + lineage=lineage(info), + gene=info["GeneID"][0], + gene_synonyms=synonyms, + ) diff --git a/rnacentral_pipeline/databases/expressionatlas/lookup.py b/rnacentral_pipeline/databases/expressionatlas/lookup.py new file mode 100644 index 000000000..14a7cc160 --- /dev/null +++ b/rnacentral_pipeline/databases/expressionatlas/lookup.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-current] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import operator as op +from rnacentral_pipeline.rnacentral import lookup + +QUERY = """ +select + pre.id as id, + pre.rna_type, + COALESCE(rna.seq_short, rna.seq_long) as sequence, + pre.description + +from rnc_rna_precomputed pre +join rna on rna.upi = pre.upi +where + pre.id in %s +""" + +def ids(interactions): + getter = op.attrgetter("urs_taxid") + return {getter(r) for r in interactions} + + +def mapping(db_url, data): + """ + lookup URS as a mapping, gets just enough information to create a valid + entry object + + This is fairly unpleasant, but data is noq a load of tuples, so we have to + extract the URS from it to use here. 
+ + The other element of the tupe is the Gene ID, used for constructing the + URL later + """ + _mapping = lookup.as_mapping(db_url, map(op.itemgetter(0), data), QUERY) + for idx, value in enumerate(_mapping.values()): + value["sequence"] = value["sequence"].replace("U", "T") + return _mapping diff --git a/rnacentral_pipeline/databases/expressionatlas/parser.py b/rnacentral_pipeline/databases/expressionatlas/parser.py new file mode 100644 index 000000000..3a5e3ea20 --- /dev/null +++ b/rnacentral_pipeline/databases/expressionatlas/parser.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-current] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import json +import operator as op +import typing as ty + +from rnacentral_pipeline.databases import data + +from . import helpers + + +def as_expression(mapping): + + pass + + +def parse(handle, db_url): + """ + Process the jsonlines output from Rust into entries. + + The jsonlines output is already grouped by geneID and urs taxid so this + should give us the transcript level linkage we're after without any further + processing. + """ + for line in handle: + hit = json.loads(line) + for experiment in hit["experiment"]: + print(hit) + yield helpers.as_entry(hit, experiment) diff --git a/rnacentral_pipeline/databases/genecards_suite/core/parser.py b/rnacentral_pipeline/databases/genecards_suite/core/parser.py index 49d75c6bd..2ae12eace 100644 --- a/rnacentral_pipeline/databases/genecards_suite/core/parser.py +++ b/rnacentral_pipeline/databases/genecards_suite/core/parser.py @@ -30,7 +30,7 @@ def as_entry(context: Context, row, matching: KnownSequence) -> data.Entry: accession=helpers.accession(context, row), ncbi_tax_id=helpers.taxid(context, row), database=context.database, - sequence=matching.sequence, + sequence=matching.sequence.upper().replace('U', 'T'), regions=[], rna_type=matching.rna_type, url=context.url(row), diff --git a/rnacentral_pipeline/databases/helpers/phylogeny.py b/rnacentral_pipeline/databases/helpers/phylogeny.py index a4eb6b400..c3d125fcb 100644 --- a/rnacentral_pipeline/databases/helpers/phylogeny.py +++ b/rnacentral_pipeline/databases/helpers/phylogeny.py @@ -23,6 +23,8 @@ TAX_URL = "https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/tax-id/{taxon_id}" +SPECIES_URL = "https://www.ebi.ac.uk/ena/taxonomy/rest/any-name/{species}" + LOGGER = logging.getLogger(__name__) @@ -117,3 +119,36 @@ def division(taxon_id: int) -> str: data = phylogeny(taxon_id) return data["division"] + +@lru_cache +def taxid(species: str) -> int: + """ + Get the taxid for a given species + Re-use request logic from phylogeny, but this uses a different endpoint, so + can't directly reuse + """ + + for count in range(10): + response = requests.get(SPECIES_URL.format(species=species)) + try: + response.raise_for_status() + data = response.json() + break + except simplejson.errors.JSONDecodeError: + sleep(0.15 * (count + 1) ** 2) + continue + except requests.HTTPError as err: + if response.status_code == 500: + sleep(0.15 * (count 
+ 1) ** 2) + continue + elif response.status_code == 404: + raise UnknownTaxonId(species) + else: + LOGGER.exception(err) + raise FailedTaxonId("Unknown error") + else: + raise FailedTaxonId("Could not get taxon id for %s" % species) + + if not data: + raise FailedTaxonId("Somehow got no data") + return int(data[0]["taxId"]) diff --git a/rnacentral_pipeline/databases/lncbook/parser.py b/rnacentral_pipeline/databases/lncbook/parser.py index 97997db2d..688af5d55 100644 --- a/rnacentral_pipeline/databases/lncbook/parser.py +++ b/rnacentral_pipeline/databases/lncbook/parser.py @@ -15,6 +15,7 @@ import json from io import StringIO +import attr from rnacentral_pipeline.databases.generic import parser as generic @@ -23,12 +24,13 @@ def parse(handle): raw = json.load(handle) data = [] for ncrna in raw["data"]: - regions = ncrna["genomeLocactions"] + regions = ncrna["genomeLocations"] regions = filter(lambda r: r["assembly"] == "GRCh38", regions) regions = list(regions) if not regions: continue - ncrna["genomeLocactions"] = regions + ncrna["genomeLocations"] = regions + ncrna["sequence"] = ncrna["sequence"].upper() data.append(ncrna) if not data: raise ValueError("All ncRNA are not from GRCh38, failing") diff --git a/rnacentral_pipeline/databases/pdb/data.py b/rnacentral_pipeline/databases/pdb/data.py index 73bbd3280..62b9407f0 100644 --- a/rnacentral_pipeline/databases/pdb/data.py +++ b/rnacentral_pipeline/databases/pdb/data.py @@ -38,18 +38,18 @@ def first_or_none(value): @attr.s() class ChainInfo: - pdb_id = attr.ib(validator=is_a(str)) - chain_id = attr.ib(validator=is_a(str)) - release_date = attr.ib(validator=is_a(dt.datetime)) - experimental_method = attr.ib(validator=optional(is_a(str))) - entity_id = attr.ib(validator=is_a(int)) + pdb_id: str = attr.ib(validator=is_a(str)) + chain_id: str = attr.ib(validator=is_a(str)) + release_date: dt.datetime = attr.ib(validator=is_a(dt.datetime)) + experimental_method: ty.Optional[str] = attr.ib(validator=optional(is_a(str))) + entity_id: int = attr.ib(validator=is_a(int)) taxids: ty.List[int] = attr.ib(validator=is_a(list)) - resolution = attr.ib(validator=optional(is_a(float))) - title = attr.ib(validator=is_a(str)) - sequence = attr.ib(validator=is_a(str)) + resolution: ty.Optional[float] = attr.ib(validator=optional(is_a(float))) + title: str = attr.ib(validator=is_a(str)) + sequence: str = attr.ib(validator=is_a(str)) molecule_names: ty.List[str] = attr.ib(validator=is_a(list)) - molecule_type = attr.ib(validator=optional(is_a(str))) - organism_scientific_name = attr.ib(validator=optional(is_a(str))) + molecule_type: ty.Optional[str] = attr.ib(validator=optional(is_a(str))) + organism_scientific_name: ty.Optional[str] = attr.ib(validator=optional(is_a(str))) @classmethod def build(cls, chain_index, raw) -> ChainInfo: @@ -64,13 +64,16 @@ def build(cls, chain_index, raw) -> ChainInfo: resolution=raw.get("resolution"), title=raw["title"], sequence=raw["molecule_sequence"], - molecule_names=raw.get("molecule_name", []), + molecule_names=raw.get("molecule_name", raw.get("rfam_id", [])), molecule_type=raw.get("molecule_type", None), organism_scientific_name=first_or_none( raw.get("organism_scientific_name", []) ), ) + def override_key(self) -> ty.Tuple[str, str]: + return (self.pdb_id.lower(), self.chain_id) + def accession(self) -> str: return f"{self.pdb_id.upper()}_{self.chain_id}_{self.entity_id}" diff --git a/rnacentral_pipeline/databases/pdb/fetch.py b/rnacentral_pipeline/databases/pdb/fetch.py index aca60bdd3..8f8e5a467 100644 ---
a/rnacentral_pipeline/databases/pdb/fetch.py
+++ b/rnacentral_pipeline/databases/pdb/fetch.py
@@ -19,15 +19,14 @@
 import logging
 import typing as ty
 
-from furl import furl
 import requests
-from retry import retry
+from furl import furl
 from more_itertools import chunked
 from ratelimiter import RateLimiter
+from retry import retry
 
-from rnacentral_pipeline.databases.pdb.data import ChainInfo
-from rnacentral_pipeline.databases.pdb.data import ReferenceMapping
 from rnacentral_pipeline.databases.pdb import helpers
+from rnacentral_pipeline.databases.pdb.data import ChainInfo, ReferenceMapping
 
 LOGGER = logging.getLogger(__name__)
 
@@ -45,6 +44,7 @@
     "molecule_name",
     "molecule_type",
     "organism_scientific_name",
+    "rfam_id",
 }
 
 PDBE_SEARCH_URL = "https://www.ebi.ac.uk/pdbe/search/pdb/select"
@@ -83,34 +83,88 @@ def fetch_range(query: str, start: int, rows: int) -> ty.Iterator[ChainInfo]:
         raise MissingPdbs(f"Missing for '{query}', {start}")
     for raw in data["response"]["docs"]:
         for index in range(len(raw["chain_id"])):
-            info = ChainInfo.build(index, raw)
-            if info.molecule_type and "RNA" in info.molecule_type:
-                yield info
+            yield ChainInfo.build(index, raw)
+
+
+@retry((requests.HTTPError, MissingPdbs), tries=5, delay=1)
+def all_chains_in_pdbs(
+    pdb_ids: ty.List[str], query_size=1000
+) -> ty.Iterable[ChainInfo]:
+    """
+    Get all chains from all given PDB ids. This does no filtering to chains that
+    may be RNA or not and simply fetches everything.
+    """
+
+    LOGGER.info("Fetching all chains in requested structures")
+    query = " OR ".join([f"pdb_id:{p.lower()}" for p in pdb_ids])
+
+    total = get_pdbe_count(query)
+    limiter = RateLimiter(max_calls=10, period=1)
+    for start in range(0, total, query_size):
+        with limiter:
+            for chain in fetch_range(query, start, query_size):
+                yield chain
+
+
+@retry((requests.HTTPError, MissingPdbs), tries=5, delay=1)
+def chains(required: ty.Set[ty.Tuple[str, str]], query_size=1000) -> ty.List[ChainInfo]:
+    """
+    Fetch exactly the requested set of (pdb_id, chain_id) pairs, pulling every
+    chain in the parent structures and keeping only the requested ones.
+    """
+
+    LOGGER.info("Fetching requested chains")
+
+    seen = set()
+    chains = []
+    pdb_ids = [r[0] for r in required]
+    for chain in all_chains_in_pdbs(pdb_ids):
+        key = chain.override_key()
+        if key not in required:
+            continue
+        seen.add(key)
+        chains.append(chain)
+
+    if seen != required:
+        missed = required - seen
+        raise ValueError("Did not find all requested ids: %s" % missed)
+    return chains
 
 
 @retry((requests.HTTPError, MissingPdbs), tries=5, delay=1)
 def rna_chains(
-    pdb_ids: ty.Optional[ty.List[str]] = None, query_size=1000
+    required: ty.Set[ty.Tuple[str, str]], query_size=1000
 ) -> ty.List[ChainInfo]:
     """
     Get PDB ids of all RNA-containing 3D structures using the RCSB PDB REST API.
     """
+    LOGGER.info("Fetching all RNA containing chains")
 
     query = "number_of_RNA_chains:[1 TO *]"
-    if pdb_ids:
-        id_query = " OR ".join([f"pdb_id:{p.lower()}" for p in pdb_ids])
-        query = f"{query} AND ({id_query})"
-
     rna_chains: ty.List[ChainInfo] = []
     total = get_pdbe_count(query)
+    seen = set()
    limiter = RateLimiter(max_calls=10, period=1)
     for start in range(0, total, query_size):
         with limiter:
-            rna_chains.extend(fetch_range(query, start, query_size))
+            for chain in fetch_range(query, start, query_size):
+                key = chain.override_key()
+                if (
+                    chain.molecule_type and "RNA" in chain.molecule_type
+                ) or key in required:
+                    rna_chains.append(chain)
+                    seen.add(key)
+
+    # Required chains may be missed here if the PDB does not label any of its chains
+    # as RNA. Rfam does match some DNA chains so we allow them into RNAcentral.
+    missed = required - seen
+    if missed:
+        LOGGER.info("Missed some chains, we'll fetch them manually")
+        rna_chains.extend(chains(missed))
 
-    # Must be >= as sometimes more than one chain is in a single document
     assert rna_chains, "Found no RNA chains"
+    LOGGER.info("Found %i RNA containing chains", len(rna_chains))
     return rna_chains
diff --git a/rnacentral_pipeline/databases/pdb/helpers.py b/rnacentral_pipeline/databases/pdb/helpers.py
index d17d8a39e..a7434ab03 100644
--- a/rnacentral_pipeline/databases/pdb/helpers.py
+++ b/rnacentral_pipeline/databases/pdb/helpers.py
@@ -13,16 +13,15 @@
 limitations under the License.
 """
 
+import csv
+import logging
 import re
 import typing as ty
-import logging
 
+from rnacentral_pipeline.databases.data import AnyReference, Reference
 from rnacentral_pipeline.databases.helpers import phylogeny as phy
 from rnacentral_pipeline.databases.helpers import publications as pubs
-from rnacentral_pipeline.databases.data import Reference
-from rnacentral_pipeline.databases.data import AnyReference
-from rnacentral_pipeline.databases.pdb.data import ChainInfo
-from rnacentral_pipeline.databases.pdb.data import ReferenceMapping
+from rnacentral_pipeline.databases.pdb.data import ChainInfo, ReferenceMapping
 
 RIBOSOMES = set(
     [
@@ -36,6 +35,7 @@
         "40S",
         "60S",
         "80S",
+        "LSU",
     ]
 )
 
@@ -58,6 +58,14 @@ class MissingProduct(Exception):
     pass
 
 
+class MissingTypeInfo(Exception):
+    """
+    Raised when the chain molecule_names field is empty and we can't infer type
+    """
+
+    pass
+
+
 def is_mrna(chain: ChainInfo) -> bool:
     mrna_names = [
         "mRNA",
@@ -182,7 +190,10 @@ def compound_rna_type(compound: str) -> str:
 
 def rna_type(info: ChainInfo) -> str:
     if not info.molecule_names:
-        raise ValueError(f"Cannot find RNA type for {info}")
+        # molecule_names is empty, so there is nothing to infer an RNA type from
+        raise MissingTypeInfo(
+            f"Cannot find RNA type for {info}"
+        )
     return compound_rna_type(info.molecule_names[0])
 
 
@@ -235,3 +246,15 @@ def lineage(info: ChainInfo) -> str:
 
 def species(info: ChainInfo) -> str:
     return phy.species(taxid(info))
+
+
+def load_overrides(handle) -> ty.Set[ty.Tuple[str, str]]:
+    """
+    Parse a TSV file of pdb_id and chain into a set of (pdb_id, chain). The PDB
+    id will be lowercased. This is used to ensure all sequences with an Rfam
+    match are loaded into the pipeline.
+    """
+    overrides = set()
+    for row in csv.reader(handle, delimiter="\t"):
+        overrides.add((row[0].lower(), row[1]))
+    return overrides
diff --git a/rnacentral_pipeline/databases/pdb/parser.py b/rnacentral_pipeline/databases/pdb/parser.py
index f45917a5d..3f5bffe5f 100644
--- a/rnacentral_pipeline/databases/pdb/parser.py
+++ b/rnacentral_pipeline/databases/pdb/parser.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 """
-Copyright [2009-2017] EMBL-European Bioinformatics Institute
+Copyright [2009-2022] EMBL-European Bioinformatics Institute
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
You may obtain a copy of the License at @@ -17,10 +17,8 @@ import typing as ty from rnacentral_pipeline.databases import data - -from rnacentral_pipeline.databases.pdb.data import ChainInfo -from rnacentral_pipeline.databases.pdb.data import ReferenceMapping from rnacentral_pipeline.databases.pdb import helpers +from rnacentral_pipeline.databases.pdb.data import ChainInfo, ReferenceMapping LOGGER = logging.getLogger(__name__) @@ -50,22 +48,36 @@ def as_entry(info: ChainInfo, reference_mapping: ReferenceMapping): def parse( rna_chains: ty.List[ChainInfo], reference_mapping: ReferenceMapping, + override_list: ty.Set[ty.Tuple[str, str]], ) -> ty.Iterator[data.Entry]: disqualified = {"mRNA": 0, "other": 0} + seen: ty.Set[ty.Tuple[str, str]] = set() for chain in rna_chains: - if helpers.is_mrna(chain): - LOGGER.debug("Disqualifing %s", chain) - disqualified["mRNA"] += 1 - continue + override_key = chain.override_key() + if override_key in override_list: + LOGGER.debug("Overriding %s, %s", chain.pdb_id, chain.chain_id) + seen.add(override_key) + else: + if helpers.is_mrna(chain): + LOGGER.debug("Disqualifing %s", chain) + disqualified["mRNA"] += 1 + continue - if not helpers.is_ncrna(chain): - LOGGER.debug("Skipping %s", chain) - disqualified["other"] += 1 - continue + if not helpers.is_ncrna(chain): + LOGGER.debug("Skipping %s", chain) + disqualified["other"] += 1 + continue try: yield as_entry(chain, reference_mapping) except helpers.InvalidSequence: LOGGER.warn(f"Invalid sequence for {chain}") + except helpers.MissingTypeInfo: + LOGGER.warn(f"Missing type info for {chain}") + + missing = override_list - seen LOGGER.info("Disqualified %i mRNA chains", disqualified["mRNA"]) LOGGER.info("Disqualified %i non ncRNA chains", disqualified["other"]) + LOGGER.info("Did not load %s overrided chains", missing) + if missing: + raise ValueError("Missed some required ids %s" % missing) diff --git a/rnacentral_pipeline/databases/plncdb/__init__.py b/rnacentral_pipeline/databases/plncdb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnacentral_pipeline/databases/plncdb/parser.py b/rnacentral_pipeline/databases/plncdb/parser.py new file mode 100644 index 000000000..7be36f964 --- /dev/null +++ b/rnacentral_pipeline/databases/plncdb/parser.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2020] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from rnacentral_pipeline.databases.data import Entry, Exon, SequenceRegion +from rnacentral_pipeline.databases.helpers import phylogeny as phy + +import gffutils +from Bio import SeqIO +from Bio.SeqFeature import SeqFeature, FeatureLocation +import pathlib +import typing as ty +import os + +import pandas as pd + +from tqdm import tqdm + +def _find_gff_file(search_path: pathlib.Path) -> pathlib.Path: + """ + Find the right gff3 file for use with the fasta in parsing alongside the + info file + """ + for file in search_path.iterdir(): + if file.suffix.strip() == ".gff3" and "PLncDB" in file.stem: + return file + +def _find_fasta_file(search_path: pathlib.Path) -> pathlib.Path: + """ + Find the fasta file that corresponds with the gff file and info file + """ + for file in search_path.iterdir(): + if file.suffix.strip() == ".fa" and "chromosome" in file.stem: + return file + +def _find_info_file(search_path: pathlib.Path) -> pathlib.Path: + """ + Find the corresponding info filefor this fasta and gff + """ + for file in search_path.iterdir(): + if file.suffix.strip() == ".txt" and "lncRNA" in file.stem: + return file + +def _generate_description(taxid: int, gene_name: str) -> str: + info = phy.phylogeny(taxid) + scientificName = info['scientificName'] + shortName = scientificName[0] + f". {scientificName.split()[1]}" + description = ( + f"{info['scientificName']} ({info.get('commonName', shortName)}) " + f"long non-coding RNA ({gene_name})" + ) + return description + +def parse(data:pathlib.Path) -> ty.Iterable[Entry]: + """ + Parse a directory of data from PLncDB into entries for import. Expects the + directory to contain one directory per species which is derived from the + FTP download that has been decompressed. + + We read the gff3, fasta, and associated info file to construct the entry + """ + + + ## Set some things which will be common for all entries + rna_type = "SO:0001877" + database = "PLNCDB" + + url = "https://www.tobaccodb.org/plncdb/nunMir?plncdb_id={}" + + + ## loop on all directories in the data directory + gff_file = _find_gff_file(data) + fasta_file = _find_fasta_file(data) + info_file = _find_info_file(data) + + # Load the GFF file into the gffutils database for working with + gff_db = gffutils.create_db(str(gff_file), ":memory:") + + ## Load the FASTA file as well + fasta_db = SeqIO.to_dict(SeqIO.parse(str(fasta_file), 'fasta')) + + ## Finally, load the info file using pandas + species_info = pd.read_csv(info_file, delimiter='\t') + species_info["Species"] = species_info["Species"].apply(lambda x: x.replace("_", " ")) + species_info["taxid"] = species_info["Species"].apply(phy.taxid) + + + total_entries = len(gff_db.execute("select DISTINCT(id) from features where featuretype = 'transcript' ").fetchall()) + entries = [] + for gene_id_q in tqdm(gff_db.execute("select id from features"), total=total_entries): + primary_id = gene_id_q["id"] + + gene_info = species_info[species_info["lncRNA_ID"] == primary_id] + if len(gene_info) == 0: + break + + + + taxid = gene_info["taxid"].values[0] + + chromosome = fasta_db[gff_db[primary_id].seqid] ##Hopefully gets the right chromosome? 
+ + features = list(gff_db.children(primary_id)) + ##TODO: check coordinate system + exons = [Exon(start=e.start, stop=e.stop) for e in features] + seq_start = min([e.start for e in features]) + seq_end = max([e.end for e in features]) + whole_feature = SeqFeature(FeatureLocation(seq_start, seq_end)) + + sequence = whole_feature.extract(chromosome) + + region = SequenceRegion( + chromosome = features[0].chrom, + strand = features[0].strand, + exons = exons, + assembly_id = gene_info['Ref_Genome_Vers'], + coordinate_system = "1-start, fully-closed" + ) + + entries.append( + Entry( + primary_id=primary_id, + accession=primary_id, + ncbi_tax_id=int(taxid), + species=species_info["Species"][0], + database=database, + sequence=sequence.seq.upper(), + regions=[region], + rna_type=rna_type, + url=url.format(primary_id), + seq_version="1", + # optional_id=optional_id(record, context), + description=_generate_description(int(taxid), gene_info["Gene_ID"].values[0]), + # note_data=note_data(record), + # xref_data=xrefs(record), + # related_sequences=related_sequences(record), + # secondary_structure=secondary_structure(record), + # references=references(record), + # organelle=record.get("localization", None), + # product=record.get("product", None), + # anticodon=anticodon(record), + gene=gene_info["Gene_ID"].values[0], + # gene_synonyms=gene_synonyms(record), + # locus_tag=locus_tag(record), + # features=features(record), + ) + ) + + + + return entries diff --git a/rnacentral_pipeline/databases/rfam/helpers.py b/rnacentral_pipeline/databases/rfam/helpers.py index ac23053bc..6f6f790b1 100644 --- a/rnacentral_pipeline/databases/rfam/helpers.py +++ b/rnacentral_pipeline/databases/rfam/helpers.py @@ -14,13 +14,12 @@ """ import collections as coll +import logging import re import typing as ty -import logging - -from rnacentral_pipeline.databases.helpers.publications import reference from rnacentral_pipeline.databases.data import IdReference +from rnacentral_pipeline.databases.helpers.publications import reference LOGGER = logging.getLogger(__name__) @@ -54,10 +53,10 @@ def seq_version(data: ty.Dict[str, str]) -> str: def rna_type(family: ty.Dict[str, str]) -> str: so_terms = family["so_terms"] - if ',' in so_terms: - so_terms = so_terms.split(',')[0] - if ',' in so_terms: - so_terms = so_terms.split(',')[0] + if "," in so_terms: + so_terms = so_terms.split(",")[0] + if "," in so_terms: + so_terms = so_terms.split(",")[0] assert re.match(r"^SO:\d+$", so_terms) return so_terms @@ -90,7 +89,7 @@ def note(data: ty.Dict[str, str]): result = coll.defaultdict(list) result["Alignment"] = data["sequence_type"] for xref in data["dbxrefs"].split(","): - db, _ = xref.split(":") + db, _ = xref.split(":", 1) result[db].append(xref) return result diff --git a/rnacentral_pipeline/databases/zfin/fetch.py b/rnacentral_pipeline/databases/zfin/fetch.py index 797f44bc6..42f605639 100644 --- a/rnacentral_pipeline/databases/zfin/fetch.py +++ b/rnacentral_pipeline/databases/zfin/fetch.py @@ -20,8 +20,7 @@ def fetch(url): - with closing(request.urlopen(url)) as compressed: - with gzip.GzipFile(None, "rb", 9, compressed) as raw: + with closing(request.urlopen(url)) as raw: data = json.load(raw) # Fix weird PMID formatting diff --git a/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py b/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py index 6d4bd9786..20436bb3a 100644 --- a/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py +++ b/rnacentral_pipeline/rnacentral/ftp_export/ensembl.py @@ -13,15 +13,14 @@ limitations under 
the License. """ -import re import json import operator as op +import re from jsonschema import validate from rnacentral_pipeline import psql - MOD_URL = "http://modomics.genesilico.pl/sequences/list/{id}" @@ -37,6 +36,13 @@ ] ) +DISALLOWED_TYPES = set( + [ + "circRNA", + "sgRNA", + ] +) + SEQUENCE_PATTERN = re.compile("^[ACGTYRWSKMDVHBNXFI]+$") @@ -50,6 +56,9 @@ def external_id(data): def is_high_quality(data): name = data["database"].lower() + ## Do not send some RNAs to ensembl + if data["rna_type"] in DISALLOWED_TYPES: + return False if name in TRUSTED_DB: return True if name == "rfam": diff --git a/rnacentral_pipeline/rnacentral/notify/__init__.py b/rnacentral_pipeline/rnacentral/notify/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rnacentral_pipeline/rnacentral/notify/slack.py b/rnacentral_pipeline/rnacentral/notify/slack.py new file mode 100644 index 000000000..52891b05d --- /dev/null +++ b/rnacentral_pipeline/rnacentral/notify/slack.py @@ -0,0 +1,110 @@ +""" +Send a notification to slack. + +NB: The webhook should be configured in the nextflow profile + +""" + +import os + +import psycopg2 +import requests +from slack_sdk import WebClient +from slack_sdk.errors import SlackApiError + +REPORT_QUERY = """ +SELECT display_name, count(taxid) FROM xref +JOIN rnc_database db ON xref.dbid = db.id +WHERE xref.deleted = 'N' +AND EXTRACT (DAY FROM (CURRENT_TIMESTAMP - xref.timestamp)) < 7 +GROUP BY display_name +ORDER BY display_name +""" + + +def send_notification(title, message): + """ + Send a notification to the configured slack webhook. + """ + SLACK_WEBHOOK = os.getenv("SLACK_CLIENT_TOKEN") + if SLACK_WEBHOOK is None: + raise SystemExit("SLACK_CLIENT_TOKEN environment variable not defined") + + client_token = os.getenv("SLACK_CLIENT_TOKEN") + channel = os.getenv("SLACK_CHANNEL") + + client = WebClient(token=client_token) + + blocks = [ + { + "type": "section", + "text": {"type": "mrkdwn", "text": message}, + }, + ] + try: + response = client.chat_postMessage(channel=channel, text=title, blocks=blocks) + + print(response) + except SlackApiError as e: + assert e.response["error"] + + +def pipeline_report(): + """ + Generates a nicely formatted report of the number of sequences imported from + each DB. This uses the slack_sdk, rather than a webhook, and uses the + blockkit to format the message nicely. + + TODO: What else should go in this? Maybe parsing the log file to get the + run duration? 
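+
+    This relies on a few environment variables (the names are simply those used
+    in the os.getenv calls below, listed here for convenience):
+      PGDATABASE          connection string used for the per-database xref counts
+      SLACK_CLIENT_TOKEN  bot token handed to slack_sdk's WebClient
+      SLACK_CHANNEL       channel the completion report is posted to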
+ """ + db_url = os.getenv("PGDATABASE") + client_token = os.getenv("SLACK_CLIENT_TOKEN") + channel = os.getenv("SLACK_CHANNEL") + + client = WebClient(token=client_token) + + lock_text_template = "New sequences from *{0}* {1:,}" + + summary_blocks = [ + { + "type": "header", + "text": {"type": "plain_text", "text": "Workflow Completion report"}, + }, + {"type": "divider"}, + ] + running_total = 0 + with psycopg2.connect(db_url) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + cur.execute(REPORT_QUERY) + res = cur.fetchall() + for r in res: + running_total += r[1] + summary_blocks.append( + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": block_text_template.format(r[0].ljust(30), r[1]), + }, + } + ) + summary_blocks.append({"type": "divider"}) + summary_blocks.append( + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"Total sequences imported: *{running_total:,}*", + }, + } + ) + + try: + response = client.chat_postMessage( + channel=channel, text="Workflow completion report", blocks=summary_blocks + ) + + print(response) + except SlackApiError as e: + assert e.response["error"] diff --git a/rnacentral_pipeline/rnacentral/precompute/data/sequence.py b/rnacentral_pipeline/rnacentral/precompute/data/sequence.py index 427635b09..a53b50149 100644 --- a/rnacentral_pipeline/rnacentral/precompute/data/sequence.py +++ b/rnacentral_pipeline/rnacentral/precompute/data/sequence.py @@ -175,9 +175,7 @@ def species(self) -> ty.Set[str]: for accession in self.accessions: if not accession.species: continue - for species in accession.species: - if species: - all_species.add(species) + all_species.add(accession.species) return all_species def domains(self) -> ty.Set[str]: diff --git a/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py b/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py index 4feae62c9..f47221732 100644 --- a/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py +++ b/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py @@ -23,13 +23,12 @@ from rnacentral_pipeline.databases.data import Database, RnaType from rnacentral_pipeline.databases.sequence_ontology import tree +from rnacentral_pipeline.rnacentral.precompute import utils from rnacentral_pipeline.rnacentral.precompute.data import context from rnacentral_pipeline.rnacentral.precompute.data import sequence as seq from rnacentral_pipeline.rnacentral.precompute.data.accession import Accession from rnacentral_pipeline.rnacentral.precompute.qa import contamination as cont -from rnacentral_pipeline.rnacentral.precompute import utils - LOGGER = logging.getLogger(__name__) @@ -48,6 +47,7 @@ Database.rgd, Database.zfin, Database.mirgenedb, + Database.plncdb, Database.lncipedia, Database.lncrnadb, Database.lncbook, @@ -63,6 +63,7 @@ Database.genecards, Database.malacards, Database.intact, + Database.expression_atlas, Database.rfam, Database.tarbase, Database.lncbase, diff --git a/rnacentral_pipeline/rnacentral/release/database_stats.py b/rnacentral_pipeline/rnacentral/release/database_stats.py index 146d2d72d..eda378495 100644 --- a/rnacentral_pipeline/rnacentral/release/database_stats.py +++ b/rnacentral_pipeline/rnacentral/release/database_stats.py @@ -159,7 +159,9 @@ def lengths(conn, db_id: int) -> ty.Dict[str, ty.Any]: .on(xref.upi == rna.upi) ) cursor.execute(str(query)) - return dict(cursor.fetchone()) + r = {k:v if v is not None else 0 for k,v in dict(cursor.fetchone()).items()} + + 
return r def count_sequences(conn, db_id: int) -> int: diff --git a/select_databases.nf b/select_databases.nf new file mode 100644 index 000000000..baa06bff7 --- /dev/null +++ b/select_databases.nf @@ -0,0 +1,12 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl=2 + +include { select } from './workflows/databases/select.nf' + + +workflow { + + select() + +} diff --git a/tests/databases/pdb/fetch_test.py b/tests/databases/pdb/fetch_test.py index b3cadcbf8..c96b2cab0 100644 --- a/tests/databases/pdb/fetch_test.py +++ b/tests/databases/pdb/fetch_test.py @@ -20,34 +20,11 @@ from rnacentral_pipeline.databases.pdb import fetch -@pytest.fixture(scope="module") -def chain_info(): - return fetch.rna_chains() - - -@pytest.fixture(scope="module") -def chain_map(chain_info): - info = {} - for chain in chain_info: - info[(chain.pdb_id, chain.chain_id)] = chain - return info - - -@pytest.mark.skip() -def test_can_get_all_pdbs(chain_info): - assert len(fetch.rna_chains()) >= 14686 - - -@pytest.mark.skip() -def test_contains_no_duplicate_chains(chain_info, chain_map): - assert len(chain_info) == len(chain_map) - - @pytest.mark.skip() def test_produces_correct_data(): - chains = fetch.rna_chains(pdb_ids=["1s72"]) - chain = next(c for c in chains if c.chain_id == "9") - assert chain == fetch.ChainInfo( + chains = fetch.chains({("1S72", "9")}) + assert len(chains) == 1 + assert chains[0] == fetch.ChainInfo( pdb_id="1s72", chain_id="9", release_date=dt.datetime(2004, 6, 15, hour=1), @@ -109,5 +86,5 @@ def test_produces_correct_data(): ], ) def test_fetches_all_rna_chains_even_mrna(pdb_id, chains): - entries = fetch.rna_chains(pdb_ids=[pdb_id]) - assert set(d.chain_id for d in entries) == chains + entries = fetch.all_chains_in_pdbs([pdb_id]) + assert set(d.chain_id for d in entries) & chains diff --git a/tests/databases/pdb/helpers_test.py b/tests/databases/pdb/helpers_test.py index 54f9fc558..6570d22de 100644 --- a/tests/databases/pdb/helpers_test.py +++ b/tests/databases/pdb/helpers_test.py @@ -15,14 +15,14 @@ import pytest +from rnacentral_pipeline.databases.pdb import fetch, helpers from rnacentral_pipeline.databases.pdb.data import ChainInfo -from rnacentral_pipeline.databases.pdb import helpers -from rnacentral_pipeline.databases.pdb import fetch def load(pdb_id: str, chain_id: str) -> ChainInfo: - chains = fetch.rna_chains(pdb_ids=[pdb_id.lower()]) - return next(c for c in chains if c.chain_id == chain_id) + chains = fetch.chains({(pdb_id, chain_id)}) + assert len(chains) == 1 + return chains[0] @pytest.mark.parametrize( @@ -66,18 +66,18 @@ def test_can_compute_correct_rna_types(product: str, expected): [ ("7mky", "A", True), ("7lyj", "A", True), - ("5U3G", "B", True), - ("2L1V", "A", True), - ("6VAR", "A", True), - ("4Y1I", "A", True), - ("4Y1I", "B", True), - ("4Y1J", "A", True), - ("4Y1J", "B", True), - ("4Y1M", "A", True), - ("4Y1M", "B", True), - ("7MKY", "A", True), - ("7LYJ", "A", True), - ("7MLW", "F", True), + ("5u3g", "B", True), + ("2l1v", "A", True), + ("6var", "A", True), + ("4y1i", "A", True), + ("4y1i", "B", True), + ("4y1j", "A", True), + ("4y1j", "B", True), + ("4y1m", "A", True), + ("4y1m", "B", True), + ("7mky", "A", True), + ("7lyj", "A", True), + ("7mlw", "F", True), ], ) def test_can_detect_if_is_ncrna(pdb, chain, expected): diff --git a/tests/databases/pdb/parser_test.py b/tests/databases/pdb/parser_test.py index 8bae184ea..98e35b2ca 100644 --- a/tests/databases/pdb/parser_test.py +++ b/tests/databases/pdb/parser_test.py @@ -17,13 +17,12 @@ import pytest from 
rnacentral_pipeline.databases import data -from rnacentral_pipeline.databases.pdb import parser -from rnacentral_pipeline.databases.pdb import fetch from rnacentral_pipeline.databases.helpers import publications as pubs +from rnacentral_pipeline.databases.pdb import fetch, parser def load(pdb_id: str, chain_id: str) -> data.Entry: - chains = fetch.rna_chains(pdb_ids=[pdb_id.lower()]) + chains = fetch.chains({(pdb_id.lower(), chain_id)}) chain_info = next(c for c in chains if c.chain_id == chain_id) references = fetch.references([chain_info]) return parser.as_entry(chain_info, references) @@ -103,33 +102,59 @@ def test_can_build_correct_entry_for_srp_rna(): ) +@pytest.mark.skip("Needs to be reworked") @pytest.mark.parametrize( "pdb_id,expected", [ ("157d", [32630, 32630]), ("1a1t", [32630]), - ("1j5e", [274]), ], ) def test_can_get_given_taxid(pdb_id, expected): chains = fetch.rna_chains(pdb_ids=[pdb_id]) - taxids = [entry.ncbi_tax_id for entry in parser.parse(chains, {})] + taxids = [entry.ncbi_tax_id for entry in parser.parse(chains, {}, set())] assert taxids == expected @pytest.mark.parametrize( - "pdb_id,missing", + "requested,missing", [ - ("5wnt", "5WNT_U_21"), - ("5wnp", "5WNP_U_21"), + (("5wnt", "A"), ("5WNT", "U")), + (("5wnp", "A"), ("5WNP", "U")), ], ) -def test_will_not_fetch_mislabeled_chains(pdb_id, missing): - chains = fetch.rna_chains(pdb_ids=[pdb_id]) - entries = {e.primary_id for e in parser.parse(chains, {})} +def test_will_not_fetch_mislabeled_chains(requested, missing): + chains = fetch.chains({requested}) + entries = {(e.primary_id, e.optional_id) for e in parser.parse(chains, {}, set())} assert missing not in entries +@pytest.mark.parametrize( + "overrides,expected", + [ + ( + { + ("7umc", "A"), + }, + ("7UMC", "A"), + ), + ( + { + ("7umc", "A"), + }, + ("7UMC", "A"), + ), + ({("7mib", "H")}, ("7MIB", "H")), + ], +) +def test_will_respect_the_override_list(overrides, expected): + chains = fetch.chains(overrides) + entries = { + (e.primary_id, e.optional_id) for e in parser.parse(chains, {}, overrides) + } + assert expected in entries + + @pytest.mark.parametrize( "pdb_id,chains", [ @@ -173,8 +198,8 @@ def test_will_not_fetch_mislabeled_chains(pdb_id, missing): ], ) def test_extracts_expected_chains(pdb_id, chains): - fetched = fetch.rna_chains(pdb_ids=[pdb_id.lower()]) - entries = parser.parse(fetched, {}) + fetched = fetch.all_chains_in_pdbs([pdb_id]) + entries = parser.parse(list(fetched), {}, set()) assert set(d.optional_id for d in entries) == chains diff --git a/tests/rnacentral/genes/build_test.py b/tests/rnacentral/genes/build_test.py index a500eb802..bec1ebb01 100644 --- a/tests/rnacentral/genes/build_test.py +++ b/tests/rnacentral/genes/build_test.py @@ -19,6 +19,7 @@ import pytest import psycopg2 import yaml +from yaml import Loader from rnacentral_pipeline.rnacentral.genes import build @@ -92,7 +93,7 @@ def load_overlapping_regions(region_name): def load_examples(): with open("data/genes/examples.yaml", "r") as raw: - return yaml.load(raw) + return yaml.load(raw, Loader=Loader) @pytest.mark.parametrize("expected", load_examples()) diff --git a/tests/rnacentral/precompute/number_of_species.py b/tests/rnacentral/precompute/number_of_species.py new file mode 100644 index 000000000..ddea90f7d --- /dev/null +++ b/tests/rnacentral/precompute/number_of_species.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +""" +Copyright [2009-2022] EMBL-European Bioinformatics Institute +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file 
except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from functools import lru_cache + +import attr +import pytest + +from rnacentral_pipeline.rnacentral.precompute.data.update import SequenceUpdate +from rnacentral_pipeline.rnacentral.precompute.data.context import Context +from rnacentral_pipeline.rnacentral.precompute import process + +from . import helpers + + +def load_data(upi): + context, sequence = helpers.load_data(upi) + return SequenceUpdate.from_sequence(context, sequence) + + + +@pytest.mark.parametrize( + "rna_id,number", + [ # pylint: disable=no-member + ( + "URS000001E7BA_559292", + 1, + ), + ] +) +def test_gets_correct_number_of_species(rna_id, number): + spec_set = load_data(rna_id).sequence.species() + assert number == len(spec_set) diff --git a/utils/expression-atlas/Cargo.toml b/utils/expression-atlas/Cargo.toml new file mode 100644 index 000000000..156e976db --- /dev/null +++ b/utils/expression-atlas/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "expression-parse" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +regex = "1.0" +anyhow = "1.0" +log = "0.4" +env_logger = "0.9.0" +multimap = "0.8.3" +clap = { version = "3.1.18", features = ["derive"] } +polars = { version = "0.21.1", features = ["lazy", "csv-file", "rows", "abs", "is_in", "strings", "concat_str", "list", "json"] } +quick-xml = { version = "0.22.0", features = ["serialize"] } +serde = { version = "1.0", features = [ "derive" ] } diff --git a/utils/expression-atlas/src/augment.rs b/utils/expression-atlas/src/augment.rs new file mode 100644 index 000000000..a4732c4d1 --- /dev/null +++ b/utils/expression-atlas/src/augment.rs @@ -0,0 +1,326 @@ +use crate::configuration::*; +/// This module will combine the config and dataframes to create a df we can extract all the +/// necessary information from. +use anyhow::Result; +use multimap::MultiMap; +use polars::frame::DataFrame; +use polars::prelude::*; + +use log::{info, warn}; + +/// This function will 'augment' the experiment dataframe with information from the config +/// What that means is adding location and factor information as series to the right of the +/// dataframe. I do this based on the assay group id (the gN in the expression data) which I +/// map to the assay name. The assay name is then mapped to the location and factor data in the +/// sdrf file, where I try to separate factors and locations. 
This also allows grabbing the +/// taxid for the experiment +pub fn augment_differential_df( + df: &mut DataFrame, + config: &Config, + sdrf: &DataFrame, +) -> Result { + // preprocess the config into a hash map for later convenience + info!("Parsing config into lookup MultiMap"); + let mut contrast_lookup: MultiMap = MultiMap::new(); + for analysis in &config.analytics { + for cont in &analysis.contrasts.as_ref().unwrap().contrast { + let test_group = cont.test_group.clone(); + + let mut assay_names = analysis.assay_groups.assay_group.clone(); + assay_names.retain(|ag| ag.id == test_group); + + contrast_lookup.insert( + test_group, + (format!("{}.log2foldchange", cont.id), format!("{}.p-value", cont.id)), + ); + } + } + + // Set up some dataframes for the things we want + let mut taxid_df = DataFrame::default(); + let mut localisation_df = DataFrame::default(); + let mut disease_df = DataFrame::default(); + let mut cell_type_df = DataFrame::default(); + + let mut df_result = DataFrame::default(); + + for analysis in &config.analytics { + for ass_group in &analysis.assay_groups.assay_group { + // Build the series of assay names for matching later + let mut assay_names = + Utf8ChunkedBuilder::new("assay_names", ass_group.assays.len(), 128); + for ass_nm in &ass_group.assays { + assay_names.append_value(ass_nm); + } + // Select out just the bits for this assay group from sdrf + let assay_df = sdrf + .clone() + .lazy() + .filter(col("assay_name").is_in(lit(assay_names.finish().into_series()))) + .with_column(lit(ass_group.id.as_str()).alias("group_id")); + + let mut df_inter = df.clone(); + + df_inter = df_inter.lazy().with_column(lit(NULL).alias("group_id")).collect()?; + + // If there is localisation data, this will select it + if localisation_df.height() == 0 { + localisation_df = get_localisation_data(assay_df.clone())?; + } else { + localisation_df.vstack_mut(&get_localisation_data(assay_df.clone())?)?; + } + + // If there is disease data, this will select it + if disease_df.height() == 0 { + disease_df = get_disease_data(assay_df.clone())?; + } else { + disease_df.vstack_mut(&get_disease_data(assay_df.clone())?)?; + } + + // If there is cell type data, this will select it + if cell_type_df.height() == 0 { + cell_type_df = get_cell_type_data(assay_df.clone())?; + } else { + cell_type_df.vstack_mut(&get_cell_type_data(assay_df.clone())?)?; + } + + // Get the taxonomy ontology reference. 
T + if taxid_df.height() == 0 { + taxid_df = get_taxonomy_data(assay_df.clone())?; + } else { + taxid_df.vstack_mut(&get_taxonomy_data(assay_df.clone())?)?; + } + + if contrast_lookup.get(&ass_group.id) == None { + continue; + } + + let check_cols_vec = contrast_lookup.get_vec(&ass_group.id).unwrap(); + + for check_cols in check_cols_vec.iter() { + // Check columns are actually present + if !&df_inter.get_column_names().contains(&check_cols.0.as_str()) + || !&df_inter.get_column_names().contains(&check_cols.1.as_str()) + { + warn!("Couldn't find either {} or {} in the DataFrame columns, skipping this contrast", check_cols.0, check_cols.1); + continue; + } + + df_inter = df_inter + .lazy() + .with_column( + when(col(&check_cols.0).and(col(&check_cols.1))) + .then(lit(ass_group.id.as_ref())) + .otherwise(col("group_id")) + .alias("group_id"), + ) + .filter(col(&check_cols.0).and(col(&check_cols.1))) + .collect()?; + + if df_result.height() == 0 { + df_result = df_inter.clone(); + } else { + df_result.vstack_mut(&df_inter)?; + } + } + } + } // closes loop on analyses + + // Use a df to join on selectively + df_result = + join_augmentations(&df_result, &taxid_df, &localisation_df, &disease_df, &cell_type_df)?; + + Ok(df_result) +} + +/// Augmentation function for baseline experiments +pub fn augment_baseline_df( + df: &mut DataFrame, + config: &Config, + sdrf: &DataFrame, +) -> Result { + // Set up some dataframes for the things we want + let mut taxid_df = DataFrame::default(); + let mut localisation_df = DataFrame::default(); + let mut disease_df = DataFrame::default(); + let mut cell_type_df = DataFrame::default(); + + let mut df_inter = df.clone(); + df_inter = df_inter.lazy().with_column(lit(NULL).alias("group_id")).collect()?; + + let mut df_result = DataFrame::default(); + + for analysis in &config.analytics { + for assay_group in &analysis.assay_groups.assay_group { + let assay_names = assay_group.assays.clone(); + let ass_group = assay_group.id.clone(); + + let mut assay_names_series = + Utf8ChunkedBuilder::new("assay_names", assay_names.len(), 128); + for ass_nm in &assay_names { + assay_names_series.append_value(ass_nm); + } + + let assay_df = sdrf + .clone() + .lazy() + .filter(col("assay_name").is_in(lit(assay_names_series.finish().into_series()))) + .with_column(lit(ass_group.as_str()).alias("group_id")); + + if localisation_df.height() == 0 { + localisation_df = get_localisation_data(assay_df.clone())?; + } else { + localisation_df.vstack_mut(&get_localisation_data(assay_df.clone())?)?; + } + + if disease_df.height() == 0 { + disease_df = get_disease_data(assay_df.clone())?; + } else { + disease_df.vstack_mut(&get_disease_data(assay_df.clone())?)?; + } + + if cell_type_df.height() == 0 { + cell_type_df = get_cell_type_data(assay_df.clone())?; + } else { + cell_type_df.vstack_mut(&get_cell_type_data(assay_df.clone())?)?; + } + + if taxid_df.height() == 0 { + taxid_df = get_taxonomy_data(assay_df.clone())?; + } else { + taxid_df.vstack_mut(&get_taxonomy_data(assay_df.clone())?)?; + } + + // Filter the experiment data according to how expresison atlas does it + df_inter = df_inter + .lazy() + .with_column( + when(col(ass_group.as_str())) + .then(lit(ass_group.as_ref())) + .otherwise(col("group_id")) + .alias("group_id"), + ) + .filter(col(ass_group.as_str())) + .collect()?; + + if df_result.height() == 0 { + df_result = df_inter.clone(); + } else { + df_result.vstack_mut(&df_inter)?; + } + } // close loop on assay groups + } //close loop on analyses + + df_result = + 
join_augmentations(&df_result, &taxid_df, &localisation_df, &disease_df, &cell_type_df)?; + + Ok(df_result) +} + +fn get_localisation_data(assay_df: LazyFrame) -> Result { + let localisation = assay_df + .filter( + ((col("feat_class").eq(lit("factor"))).or(col("feat_class").eq(lit("characteristic")))) + .and(col("feat_type").eq(lit("organism part"))), + ) + .select([col("group_id"), col("ontology").alias("location")]) + .first() + .collect()?; + + Ok(localisation) +} + +fn get_disease_data(assay_df: LazyFrame) -> Result { + let disease = assay_df + .filter( + ((col("feat_class").eq(lit("factor"))).or(col("feat_class").eq(lit("characteristic")))) + .and(col("feat_type").eq(lit("disease"))), + ) + .select([col("group_id"), col("ontology").alias("disease")]) + .first() + .collect()?; + + Ok(disease) +} + +fn get_cell_type_data(assay_df: LazyFrame) -> Result { + let cell_type = assay_df + .filter( + col("feat_class").eq(lit("characteristic")).and(col("feat_type").eq(lit("cell type"))), + ) + .select([col("group_id"), col("ontology").alias("cell_type")]) + .first() + .collect()?; + + Ok(cell_type) +} + +fn get_taxonomy_data(assay_df: LazyFrame) -> Result { + let tax_data = assay_df + .filter(col("feat_type").eq(lit("organism"))) + .select([col("group_id"), col("ontology").alias("taxonomy")]) + .first() + .collect()?; + + Ok(tax_data) +} + +fn join_augmentations( + df_result_bare: &DataFrame, + taxid_df: &DataFrame, + localisation_df: &DataFrame, + disease_df: &DataFrame, + cell_type_df: &DataFrame, +) -> Result { + // Use a df to join on selectively + let mut df_result = df_result_bare.clone(); + if taxid_df.height() > 0 { + df_result = df_result.join(taxid_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("taxonomy")) + .collect()?; + } + + if localisation_df.height() > 0 { + df_result = + df_result.join(localisation_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("location")) + .collect()?; + } + + if disease_df.height() > 0 { + df_result = + df_result.join(disease_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("disease")) + .collect()?; + } + + if cell_type_df.height() > 0 { + df_result = + df_result.join(cell_type_df, ["group_id"], ["group_id"], JoinType::Inner, None)?; + } else { + df_result = df_result + .lazy() + .with_column(lit(NULL).cast(DataType::Utf8).alias("cell_type")) + .collect()?; + } + + df_result = df_result.select([ + "GeneID", + "Gene Name", + "experiment", + "taxonomy", + "location", + "disease", + "cell_type", + ])?; + Ok(df_result) +} diff --git a/utils/expression-atlas/src/configuration.rs b/utils/expression-atlas/src/configuration.rs new file mode 100644 index 000000000..b40b0f65a --- /dev/null +++ b/utils/expression-atlas/src/configuration.rs @@ -0,0 +1,54 @@ +/// This module handles the parsing of the configuration file +use quick_xml::de::from_str; +use quick_xml::DeError; +use serde::Deserialize; +use std::fs; + +use std::path::PathBuf; + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Config { + #[serde(rename = "experimentType")] + pub exp_type: String, + #[serde(rename = "analytics")] + pub analytics: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Analytics { + pub assay_groups: AssayGroups, + pub array_design: Option, + 
pub contrasts: Option, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct AssayGroups { + pub assay_group: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq, Clone)] +pub struct AssayGroup { + pub id: String, + pub label: Option, // This contains the factors in a ; separated list + #[serde(rename = "assay", default)] + pub assays: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Contrasts { + pub contrast: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq)] +pub struct Contrast { + pub id: String, + pub name: String, + #[serde(alias = "reference_assay_group")] + pub ref_group: String, + #[serde(alias = "test_assay_group")] + pub test_group: String, +} + +pub fn parse_config(file: &PathBuf) -> Result { + from_str::(&fs::read_to_string(file).unwrap()) +} diff --git a/utils/expression-atlas/src/filtering.rs b/utils/expression-atlas/src/filtering.rs new file mode 100644 index 000000000..3038543d6 --- /dev/null +++ b/utils/expression-atlas/src/filtering.rs @@ -0,0 +1,103 @@ +/// This module implements the filtering that EA uses in their webapp +/// NB: This is all reverse engineered +use anyhow::Result; +use polars::frame::DataFrame; +use polars::prelude::*; +use polars::series::Series; +use regex::Regex; + +fn lowercase_fn(ent: &Option<&str>) -> Option { + ent.as_ref().map(|ent| ent.to_lowercase()) + // match ent { + // None => None, + // Some(ent) => Some(ent.to_lowercase()), + // } +} + +fn fix_bad_infinities(str_val: &Series) -> Series { + let lowercased = str_val + .utf8() + .unwrap() + .into_iter() + .map(|x| lowercase_fn(&x)) + .collect::>>() + .into_iter() + .map(|x| x.map(|x| x.parse::().unwrap())) + .collect::>>(); + + Series::from_iter(lowercased) +} + +fn baseline_get_median_gt_zero(str_val: &Series) -> Series { + let lists = str_val.utf8().unwrap().into_iter().map(|x| { + x.unwrap().split(',').into_iter().map(|y| y.parse::().unwrap()).collect::>() + }); + + let medians: Vec = + lists.into_iter().map(|x| Series::from_iter(x).median().unwrap() > 0.0).collect(); + Series::from_iter(medians) +} + +/// This function will filter the differential results based on: +/// - non-null p value +/// - absolute log2 fold change greater than 1 +// find the p value and log fold columns +pub fn filter_differential(input: &DataFrame) -> Result { + let pv_regex = Regex::new(r".*p-value.*").unwrap(); + let log_fold_regex = Regex::new(r".*log2.*").unwrap(); + + let mut inter = input.clone(); + + for column in input.get_column_names_owned() { + // Check for badly parsed infinities + if pv_regex.is_match(&column) { + if !inter.column(&column)?.dtype().is_numeric() { + inter.apply(&column, fix_bad_infinities)?; + } + inter = inter + .lazy() + .with_column( + when(col(&column).lt(lit(0.05f64))) + .then(lit(true).alias(&column)) + .otherwise(lit(false).alias(&column)), + ) + .collect()?; + } else if log_fold_regex.is_match(&column) { + if !inter.column(&column)?.dtype().is_numeric() { + inter.apply(&column, fix_bad_infinities)?; + } + inter = inter + .lazy() + .with_column( + when(col(&column).abs().gt_eq(lit(1.0f64))) + .then(lit(true).alias(&column)) + .otherwise(lit(false).alias(&column)), + ) + .collect()?; + } + } + Ok(inter) +} + +pub fn filter_baseline(input: &mut DataFrame) -> DataFrame { + // This is a baseline experiment. 
+ // Find columns starting with lower case g, then apply the function to convert to + // medain and select greater than zero + let mut meas = Vec::::new(); + + for column in input.get_column_names_owned() { + if column.starts_with('g') { + input.apply(&column, baseline_get_median_gt_zero).unwrap(); + meas.push(col(&column)); + } + } + // Selection should now have all the gN column names in it + input + .clone() + .lazy() + .filter( + any_exprs(&meas[0..meas.len() / 2]).or(any_exprs(&meas[meas.len() / 2..meas.len()])), + ) // If we try to do the whole thing at once, we get a stack overflow + .collect() + .unwrap() +} diff --git a/utils/expression-atlas/src/main.rs b/utils/expression-atlas/src/main.rs new file mode 100644 index 000000000..bb5c055bf --- /dev/null +++ b/utils/expression-atlas/src/main.rs @@ -0,0 +1,343 @@ +use anyhow::Result; +use clap::{Parser, Subcommand}; +use regex::Regex; +use std::collections::HashMap; +use std::fs; +use std::path::PathBuf; + +use polars::frame::DataFrame; +use polars::prelude::*; + +use log::{info, warn}; + +pub mod augment; +pub mod configuration; +pub mod filtering; +pub mod sdrf; + +#[derive(Parser, Debug)] +#[clap(author = "Andrew Green", version, about)] +struct Args { + #[clap(subcommand)] + cmd: Command, +} + +#[derive(Subcommand, Debug)] +enum Command { + /// Parse the Expression Atlas data into the unique genes per experiment jsonlines + Parse { + /// Path where input has been copied. Must contain the config files + #[clap(short, long, multiple_values(true))] + input: PathBuf, + + /// An output file + #[clap(short, long)] + output: String, + }, + /// Lookup the gene names we found, using a dump from the database + Lookup { + /// File containing the genes from all experiments + #[clap(short, long)] + genes: PathBuf, + + /// Dump from the database containing all URS -> Gene names. See the query file for references + #[clap(short, long)] + lookup: PathBuf, + + /// An output file. Will contain the data from the experiments file along with URS data and some other useful stuff + #[clap(short, long)] + output: String, + }, +} + +fn load_df_add_experiment(path: &PathBuf) -> Result { + info!("Loading experiment data from {:?}", path); + let exp_name = path.file_name().unwrap().to_str().unwrap().split('-').collect::>() + [0..=2] + .join("-") + .replace("_A", ""); + + let mut exp_df: DataFrame = CsvReader::from_path(&path)? 
+ .has_header(true) + .with_delimiter(b'\t') + .with_null_values(Some(NullValues::AllColumns("NA".to_string()))) + .infer_schema(None) + .finish() + .unwrap_or_else(|x| panic!("Failed on {:?} with error {:?}", path, x)); + + // hstack the experiment name (derived from the filename) into the DataFrame + let mut exp_col_arr = Utf8ChunkedBuilder::new("experiment", exp_df.height(), 128); + for _i in 0..exp_df.height() { + exp_col_arr.append_value(&exp_name); + } + // let iter_exp = std::iter::repeat([&exp_name].into_iter()).take(exp_df.height()); + let exp_col: Series = exp_col_arr.finish().into_series(); + exp_df.hstack_mut(&[exp_col]).unwrap(); + + if !&exp_df.get_column_names().contains(&"GeneID") { + // normalise column names + info!("Standard column heading not found, normalising column names"); + info!("Column names were {:?}", &exp_df.get_column_names()); + if exp_df.get_column_names().contains(&"Gene ID") { + exp_df.rename("Gene ID", "GeneID")?; + } else if exp_df.get_column_names().contains(&"Gene.ID") { + exp_df.rename("Gene.ID", "GeneID")?; + } + info!("Column names are now {:?}", &exp_df.get_column_names()); + } + + Ok(exp_df) +} + +fn run_parse(input: &PathBuf, output: &String) -> Result<()> { + let config_re = Regex::new(r"configuration.xml").unwrap(); + let mut config_lookup: HashMap = HashMap::new(); + for file in fs::read_dir(&input)? { + let file = file?; + let path = file.path(); + if config_re.is_match(path.to_str().unwrap()) { + let exp_name = + path.file_name().unwrap().to_str().unwrap().split('-').collect::>() + [0..=2] + .join("-") + .replace("configuration", ""); // yeah... + let config = configuration::parse_config(&path)?; + config_lookup.insert(exp_name, config); + } + } + + // Now have a hashmap with exp_name:config. We can loop over it and + // - Check config for experiment type + // - Construct appropriate filenames + // - Dispatch filenames for loading, appropriate error handling if they don't exist + // - Construct new df to merge with big one + + let differential_re = Regex::new(r".*diff.*").unwrap(); + + let mut big_df = DataFrame::default(); + // String::new(); + // data_path = PathBuf::from(&args.input); + // let mut sdrf_path = PathBuf::from(&args.input); + + let mut gene_count: usize = 0; + + for (exp_name, config) in &config_lookup { + let mut exp_df = DataFrame::default(); + let mut data_path = PathBuf::from(&input); + let mut sdrf_path = PathBuf::from(&input); + if differential_re.is_match(&config.exp_type) { + for analysis in &config.analytics { + let array_design = analysis.array_design.as_deref().unwrap_or(""); + if !array_design.is_empty() { + let data_filename: String = + format!("{}_{}-analytics.tsv", exp_name, array_design); + data_path.push(&data_filename); + + if !data_path.exists() { + warn!( + "File {} does not exist, skipping this experiment", + data_path.to_str().unwrap() + ); + data_path.pop(); + continue; + } + + // Load the data + if exp_df.height() == 0 { + exp_df = load_df_add_experiment(&data_path)?; + data_path.pop(); + } else { + exp_df = exp_df + .lazy() + .join( + load_df_add_experiment(&data_path)?.lazy(), + [col("GeneID")], + [col("GeneID")], + JoinType::Inner, + ) + .select(&[col("*").exclude([ + "Gene Name_right", + "experiment_right", + "Design Element_right", + ])]) + .collect()?; + data_path.pop(); + } + } else { + let data_filename: String = format!("{}-analytics.tsv", exp_name); + data_path.push(&data_filename); + + if !data_path.exists() { + warn!( + "File {} does not exist, skipping this experiment", + 
data_path.to_str().unwrap() + ); + data_path.pop(); + continue; + } + + exp_df = load_df_add_experiment(&data_path)?; + data_path.pop(); + } + } + } else { + let data_filename: String = format!("{}-tpms.tsv", exp_name); + data_path.push(&data_filename); + + if !data_path.exists() { + warn!( + "File {} does not exist, skipping this experiment", + data_path.to_str().unwrap() + ); + data_path.pop(); + continue; + } + + exp_df = load_df_add_experiment(&data_path)?; + data_path.pop(); + } + + let sdrf_filename: String = format!("{}.condensed-sdrf.tsv", exp_name); + sdrf_path.push(&sdrf_filename); + + if !sdrf_path.exists() { + warn!("File {} does not exist, skipping this experiment", sdrf_path.to_str().unwrap()); + sdrf_path.pop(); + data_path.pop(); + continue; + } + + // Now load the sdrf + let sdrf_df = sdrf::parse_condensed_sdrf(&sdrf_path)?; + // println!("{:?}", sdrf_df); + // filter based on differential or baseline + if differential_re.is_match(&config.exp_type) { + info!("Filtering experiment dataset with differential filters"); + exp_df = filtering::filter_differential(&exp_df)?; + exp_df = augment::augment_differential_df(&mut exp_df, config, &sdrf_df)?; + } else { + info!("Filtering with baseline filters"); + exp_df = filtering::filter_baseline(&mut exp_df); + exp_df = augment::augment_baseline_df(&mut exp_df, config, &sdrf_df)?; + } + + data_path.pop(); + sdrf_path.pop(); + + info!("dataframe remaining: {}", exp_df.height()); + gene_count += exp_df.height(); + + // Add the newly parsed data to the big df ready for export + if big_df.height() == 0 { + big_df = exp_df.clone(); + } else { + big_df.vstack_mut(&exp_df)?; + } + } + + info!( + "Parsed a total of {} lines, from which {} were selected ({}%)", + gene_count, + big_df.height(), + 100.0 * (big_df.height() as f64) / (gene_count as f64) + ); + println!("{:?}", big_df.height()); + + info!("All files parsed, preparing to write import csvs"); + + let mut output_file = fs::File::create(&output)?; + CsvWriter::new(&mut output_file).has_header(true).finish(&mut big_df)?; + + Ok(()) +} + +fn run_lookup(genes: &PathBuf, lookup: &PathBuf, output: &String) -> Result<()> { + let mut gene_df: DataFrame = CsvReader::from_path(&genes)? + .has_header(true) + .finish() + .unwrap_or_else(|_x| panic!("Failed to load gene output")); + // You need to get the taxid from the taxonomy URL. Should be able to split on _ and take last element if you can figure it out + gene_df = gene_df + .lazy() + .with_column( + col("taxonomy").str().extract("([0-9]+)$", 1).cast(DataType::Int64).alias("taxid"), + ) + .collect() + .unwrap(); + // gene_df = gene_df.lazy().with_column(col("taxid").cast(DataType::UInt32)).collect().unwrap(); + println!("{:?}", gene_df); + + let mut lookup_df: DataFrame = CsvReader::from_path(&lookup)? 
+ .has_header(true) + .finish() + .unwrap_or_else(|_x| panic!("Failed to load lookup data!")); + + // lookup_df.rename("column_1", "upi"); + // lookup_df.rename("column_2", "taxid"); + // lookup_df.rename("column_3", "possible_ids"); + // lookup_df.rename("column_4", "start"); + // lookup_df.rename("column_5", "end"); + // lookup_df.rename("column_6", "rna_type"); + + // The database dump has a column where possible IDs are separated by a | character, so we need to split on that + // The plan then is to use explode on the df with the external IDs column to get a mega big dataframe which we join onto + // the gene one + println!("{:?}", &lookup_df); + + // For now, I only have the external ID from the database, if this doesn't match many, I can tweak the lookupo query and re-add this + lookup_df = lookup_df + .lazy() + .with_column(col("external_id").str().split("|").alias("external_id")) + .explode([col("external_id")]) + .with_column(col("external_id").str().split(",").alias("external_id")) + .explode([col("external_id")]) + .filter(col("external_id").neq(lit(""))) + .filter(col("external_id").neq(lit("null"))) + .filter(col("external_id").is_not_null()) + .collect() + .unwrap(); + + println!("Got to the end in one piece!"); + println!("{:?}", &lookup_df); + + // println!("{:?}", &gene_df); + + let mut matched_df = + gene_df.join(&lookup_df, ["GeneID"], ["external_id"], JoinType::Inner, None).unwrap(); + println!("{:?}", matched_df.get_column_names()); + + matched_df = matched_df.lazy().filter(col("taxid").eq(col("taxid_right"))).collect().unwrap(); + + let mut grouped_df = + matched_df.lazy().groupby([col("GeneID"), col("urs_taxid")]).agg([col("*").list().unique()]).collect()?; + + println!("{:?}", grouped_df); + + let mut output_file = fs::File::create(&output)?; + JsonWriter::new(&mut output_file) + .with_json_format(JsonFormat::JsonLines) + .finish(&mut grouped_df)?; + + Ok(()) +} + +fn main() -> Result<()> { + env_logger::init(); + info!("Starting Expression Atlas parser"); + let args = Args::parse(); + + match args.cmd { + Command::Parse { + input, + output, + } => run_parse(&input, &output), + Command::Lookup { + genes, + lookup, + output, + } => run_lookup(&genes, &lookup, &output), + } + + // Parse the config files first + // set up the regex +} diff --git a/utils/expression-atlas/src/sdrf.rs b/utils/expression-atlas/src/sdrf.rs new file mode 100644 index 000000000..cff79deb5 --- /dev/null +++ b/utils/expression-atlas/src/sdrf.rs @@ -0,0 +1,80 @@ +use anyhow::Result; +use polars::chunked_array::builder::Utf8ChunkedBuilder; +use polars::frame::DataFrame; +use polars::prelude::IntoSeries; +use std::fs; +use std::io::Read; +use std::path::PathBuf; + +use log::{info, warn}; + +pub fn parse_condensed_sdrf(path: &PathBuf) -> Result { + /* + A condensed sdrf file has 7 columns, but the last is often not delimited correctly meaning it + is tricky to read with the polars default csv reader. + + Therefore, we will be manually parsing the file into 6 series objects (one column seems to + always be null) and constructing a dataframe from them + + We use a chunked array builder for Utf8 strings. 
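+
+    For orientation, a 7 column row is laid out as (tab separated; the values here
+    are made up, only the column order is taken from the parsing below):
+        E-MTAB-1234  <empty>  run_1  characteristic  organism  Homo sapiens  http://purl.obolibrary.org/obo/NCBITaxon_9606
+    i.e. experiment, blank, assay name, feature class, feature type, feature value, ontology term.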
+ */ + + info!("Loading sdrf data from {:?}", path); + + let mut file = fs::File::open(path).unwrap(); + let mut s = String::new(); + file.read_to_string(&mut s)?; + + let part_parsed: Vec> = s.lines().map(|line| line.split('\t').collect()).collect(); + let bytes_per_string: usize = 128; + let mut exp_name = Utf8ChunkedBuilder::new("exp_name", part_parsed.len(), bytes_per_string); + let mut assay_name = Utf8ChunkedBuilder::new("assay_name", part_parsed.len(), bytes_per_string); + let mut feat_class = Utf8ChunkedBuilder::new("feat_class", part_parsed.len(), bytes_per_string); + let mut feat_type = Utf8ChunkedBuilder::new("feat_type", part_parsed.len(), bytes_per_string); + let mut feat_value = Utf8ChunkedBuilder::new("feat_value", part_parsed.len(), bytes_per_string); + let mut ontology = Utf8ChunkedBuilder::new("ontology", part_parsed.len(), bytes_per_string); + + // There is one experiment file that does not have the empty column in line[1] + if part_parsed.iter().map(|x| x.len()).max().unwrap() == 7 { + for line in part_parsed.iter() { + exp_name.append_value(line[0]); + assay_name.append_value(line[2]); // remember line[1] will be empty + feat_class.append_value(line[3]); + feat_type.append_value(line[4]); + feat_value.append_value(line[5]); + if line.len() == 7 { + ontology.append_value(line[6]); + } else { + ontology.append_null(); + } + } + } else { + warn!( + "Unusual sdrf parsing with {} columns, not 7 for experiment {}", + part_parsed.iter().map(|x| x.len()).max().unwrap(), + part_parsed[0][0] + ); + for line in part_parsed.iter() { + exp_name.append_value(line[0]); + assay_name.append_value(line[1]); + feat_class.append_value(line[2]); + feat_type.append_value(line[3]); + feat_value.append_value(line[4]); + if line.len() == 6 { + ontology.append_value(line[5]); + } else { + ontology.append_null(); + } + } + } + + // Now have all the lines parsed with the same lengths. Try to construct a dataframe... 
+ DataFrame::new(vec![ + exp_name.finish().into_series(), + assay_name.finish().into_series(), + feat_class.finish().into_series(), + feat_type.finish().into_series(), + feat_value.finish().into_series(), + ontology.finish().into_series(), + ]) +} diff --git a/utils/precompute/Cargo.toml b/utils/precompute/Cargo.toml index 40dc3516a..761212348 100644 --- a/utils/precompute/Cargo.toml +++ b/utils/precompute/Cargo.toml @@ -20,3 +20,4 @@ sorted-iter = "0.1.7" structopt = "0.3" strum = "0.21" strum_macros = "0.21" +polars = "0.21.1" diff --git a/utils/precompute/src/releases.rs b/utils/precompute/src/releases.rs index cca9879b5..69d05ae54 100644 --- a/utils/precompute/src/releases.rs +++ b/utils/precompute/src/releases.rs @@ -25,6 +25,8 @@ use anyhow::{ Result, }; +use polars::prelude::*; + #[derive(Serialize, Deserialize, Debug)] pub struct UrsEntry { id: usize, @@ -61,28 +63,34 @@ pub fn write_max(filename: &Path, output: &Path) -> Result<()> { } pub fn select_new(xrefs: &Path, known: &Path, output: &Path) -> Result<()> { - let xref_records = entries(xrefs)?.map(|e: UrsEntry| (e.id, e)).assume_sorted_by_key(); - let known_records = entries(known)?.map(|e: UrsEntry| (e.id, e)).assume_sorted_by_key(); - - let mut writer = csv::Writer::from_writer(File::create(output)?); - let pairs = xref_records.outer_join(known_records); - for (_key, (xref, pre)) in pairs { - match (xref, pre) { - (Some(x), Some(p)) => match x.release.cmp(&p.release) { - Less => Err(anyhow!( - "This should never happen, too small release for {:?} vs {:?}", - &x, - &p - ))?, - Equal => (), - Greater => writer.write_record(&[x.urs])?, - }, - (Some(x), None) => writer.write_record(&[x.urs])?, - (None, Some(_)) => (), - (None, None) => (), - } - } - writer.flush()?; + + let mut xref_records : DataFrame = CsvReader::from_path(xrefs)?.has_header(false).finish().unwrap(); + xref_records.rename("column_1", "id").ok(); + xref_records.rename("column_2", "upi").ok(); + xref_records.rename("column_3", "last").ok(); + let mut known_records : DataFrame = CsvReader::from_path(known)?.has_header(false).finish().unwrap(); + known_records.rename("column_1", "id").ok(); + known_records.rename("column_2", "upi").ok(); + known_records.rename("column_3", "last").ok(); + // Run groupby, sort and max on xref (because the DB doesn't have the memory to do it) + xref_records = xref_records.groupby(["id", "upi"])? + .select(["last"]) + .max()? + .sort(["id"], false) + .unwrap(); + + // Join the frames on id, then filter to select those where xref > known (?) + let mut selection = xref_records.join(&known_records, ["id", "upi"], ["id", "upi"], JoinType::Outer, None)?; + let mask = selection.column("last_max")?.gt(selection.column("last")?)?; + let mut selected_upis = selection.filter(&mask).unwrap() + .select(["upi"])? 
+ .unique(None, UniqueKeepStrategy::First)?; + + + let out_stream : File = File::create(output).unwrap(); + CsvWriter::new(out_stream) + .has_header(false) + .finish(&mut selected_upis); Ok(()) } diff --git a/utils/search-export/src/sequences/normalized.rs b/utils/search-export/src/sequences/normalized.rs index 3d2c11b4f..78bc78a99 100644 --- a/utils/search-export/src/sequences/normalized.rs +++ b/utils/search-export/src/sequences/normalized.rs @@ -65,6 +65,7 @@ pub struct Normalized { interacting_proteins: Vec, interacting_rnas: Vec, so_rna_type_tree: so_tree::SoTree, + publication_count: usize, #[serde(flatten)] orfs: OrfVec, @@ -108,6 +109,7 @@ impl Normalized { short_urs: parsed.short(), deleted: String::from("N"), so_rna_type_tree: raw.so_tree().to_owned(), + publication_count: raw.publication_count(), pre_summary: raw.precompute().into(), basic: base, qa_status: raw.qa_status().to_owned(), diff --git a/utils/search-export/src/sequences/publication_counts.rs b/utils/search-export/src/sequences/publication_counts.rs index 271f3055c..561749a57 100644 --- a/utils/search-export/src/sequences/publication_counts.rs +++ b/utils/search-export/src/sequences/publication_counts.rs @@ -22,3 +22,9 @@ impl grouper::HasIndex for PublicationCount { pub fn group(path: &Path, max: usize, output: &Path) -> Result<()> { grouper::group::(grouper::Criteria::ZeroOrOne, &path, 1, max, &output) } + +impl PublicationCount { + pub fn publication_count(&self) -> usize { + self.publication_count + } +} diff --git a/utils/search-export/src/sequences/raw.rs b/utils/search-export/src/sequences/raw.rs index 420eca0db..3e67649fd 100644 --- a/utils/search-export/src/sequences/raw.rs +++ b/utils/search-export/src/sequences/raw.rs @@ -133,4 +133,9 @@ impl Raw { pub fn base(&self) -> &Basic { &self.base } + + /// Get this raw's publication count. + pub fn publication_count(&self) -> usize { + self.publication_counts.as_ref().map(|p| p.publication_count()).unwrap_or(0) + } } diff --git a/weekly-update/crontab.txt b/weekly-update/crontab.txt index b4f9b89df..512e60848 100644 --- a/weekly-update/crontab.txt +++ b/weekly-update/crontab.txt @@ -1,3 +1,4 @@ -MAILTO=bsweeney@ebi.ac.uk +MAILTO=agreen@ebi.ac.uk +SHELL=/bin/bash -0 17 * * 5 source /etc/bashrc; cd /hps/nobackup/production/xfam/bsweeney/automated && bsub ./weekly-update/run.sh +0 17 * * 4 . ~/.bashrc && cd /hps/nobackup/agb/rnacentral/weekly-run && bsub -o weekly_run.out -e weekly_run.err -M 20480 ./weekly-update/run.sh diff --git a/weekly-update/run.sh b/weekly-update/run.sh index dba811bdd..d86f0075b 100755 --- a/weekly-update/run.sh +++ b/weekly-update/run.sh @@ -1,30 +1,59 @@ #!/usr/bin/env bash +#BSUB -oo weekly_run.out +#BSUB -eo weekly_run.err +#BSUB -M 4096 +#BSUB -cwd /hps/nobackup/agb/rnacentral/weekly-run +#BSUB -J "PDBe weekly import" set -euo pipefail IFS=$'\n\t' export NXF_OPTS='-Dnxf.pool.type=sync -Dnxf.pool.maxThreads=10000' -export PATH="/nfs/software/singularity/3.5.0/bin:$HOME/.cargo/bin:$PATH" [ -d work/tmp ] || mkdir -p work/tmp [ ! -e local.config ] || rm local.config when=$(date +'%Y-%m-%d') -if [[ -d singularity/bind/r2dt/ ]]; then - rm -r singularity/bind/r2dt/ + +ln -s weekly-update/update.config local.config + +make rust + +# Download latest version of nextflow +curl --max-time 10 -s https://get.nextflow.io | bash +res=$? +# Load module as fallback +if test "$res" != "0"; then + echo "Using module nextflow..." + module load nextflow-21.10.6-gcc-9.3.0-tkuemwd + NF="nextflow" +else + echo "Using downloaded nextflow..." 
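To make the intent of the `select_new` rewrite in `utils/precompute/src/releases.rs` above explicit: the removed match-based loop selected a UPI when it had no precomputed entry yet, or when its newest xref release exceeded the precomputed one, and the polars group-by/join/filter chain is meant to reproduce that rule. A plain-Rust sketch of the rule (the function name and `HashMap` shapes are illustrative, not the pipeline's actual types):

```rust
use std::collections::HashMap;

/// Sketch only: the selection rule from the removed implementation that the
/// polars pipeline above replaces. `xref_max` holds the highest release seen
/// per id in the xref dump (the group-by + max step); `known` holds the
/// release already recorded in the precompute table.
fn needs_precompute(
    xref_max: &HashMap<String, i64>,
    known: &HashMap<String, i64>,
) -> Vec<String> {
    xref_max
        .iter()
        .filter(|&(id, &last)| match known.get(id) {
            Some(&prev) => last > prev, // xref release moved past the precomputed one
            None => true,               // never precomputed before
        })
        .map(|(id, _)| id.clone())
        .collect()
}
```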
+ NF="./nextflow" fi -mkdir -p singularity/bind/r2dt/data -pushd singularity/bind/r2dt/data -wget -O cms.tar.gz https://www.dropbox.com/s/3ie8kzb8ol658s0/cms.tar.gz?dl=1 -tar xf cms.tar.gz -popd +# Clean up previous run by nextflow +$NF clean -f -ln -s weekly-update/update.config local.config +rm .nextflow.log + +## Run new DB selection workflow - selects DBs based on file changes from remotes +rm -f db_selection.config && touch db_selection.config + +$NF -quiet run -profile pg11prod select_databases.nf --import_selection_remotes=weekly-update/weekly_db_remotes + +$NF -quiet run -with-report "$when-setup.html" -profile pg11prod --use_datamover prepare-environment.nf +$NF -quiet run -with-report "$when-import.html" -profile pg11prod import-data.nf +$NF -quiet run -with-report "$when-analyze.html" -profile pg11prod analyze.nf +$NF -quiet run -with-report "$when-precompute.html" -profile pg11prod precompute.nf + +$NF -quiet run -profile pg11prod report.nf + +$NF -quiet run -with-report "$when-search.html" -profile pg11prod export.nf -make rust -./nextflow -quiet run -with-report "$when-import.html" -profile prod import-data.nf -./nextflow -quiet run -with-report "$when-precompute.html" -profile prod precompute.nf -./nextflow -quiet run -with-report "$when-search.html" -profile prod search-export.nf +# Zip up reports and email them to me +tar -cjf reports.tar.bz2 *.html +rm *.html +mail -a reports.tar.bz2 -s "Weekly workflow completion reports" agreen@ebi.ac.uk < .nextflow.log diff --git a/weekly-update/update.config b/weekly-update/update.config index c6778e29b..7cb2c4b79 100644 --- a/weekly-update/update.config +++ b/weekly-update/update.config @@ -15,10 +15,15 @@ params { pfam.run = false } + rfam + { + memory = 2.GB + } + precompute { run = true maxForks = 4 - range.memory = '5GB' + range.memory = '8GB' } r2dt.run = false @@ -30,31 +35,32 @@ params { memory = '15 GB' publish { host = '' - path = "/nfs/production/xfam/rnacentral/search_dumps/dev-nightly/" + path = "/nfs/production/agb/rnacentral/search-export/dev-nightly/" } } sequence_search { run = false } + + use_datamover = true } singularity { enabled = true cacheDir = "$baseDir/singularity" - runOptions = '--bind /nfs/ftp/pub/databases/ena --bind /ebi/ftp --bind /nfs/ftp --bind /nfs/ensemblftp --bind /nfs/ensemblgenomes/ftp' } notification { enabled = true - to = 'bsweeney@ebi.ac.uk' + to = 'agreen@ebi.ac.uk' } -includeConfig '../private.config' +includeConfig '../profiles.config' includeConfig 'config/cluster.config' +includeConfig 'db_selection.config' process { - time = '5h' errorStrategy = { task.exitStatus == 130 ? 
'retry' : 'terminate' } maxRetries = 2 } diff --git a/weekly-update/weekly_db_remotes b/weekly-update/weekly_db_remotes new file mode 100644 index 000000000..047c59633 --- /dev/null +++ b/weekly-update/weekly_db_remotes @@ -0,0 +1,6 @@ +sgd,https://downloads.yeastgenome.org/latest/RNAcentral.json +pombase,ftp://ftp.pombase.org/nightly_update/misc/rnacentral.json +zfin,https://zfin.org/downloads/rnaCentral.json +intact,https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip +hgnc,https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/locus_groups/non-coding_RNA.json +flybase,ftp://ftp.flybase.net/releases/current/precomputed_files/genes/ncRNA*.json.gz diff --git a/workflows/databases/crw.nf b/workflows/databases/crw.nf index 5c769fbde..1c505beb9 100644 --- a/workflows/databases/crw.nf +++ b/workflows/databases/crw.nf @@ -11,7 +11,7 @@ process fetch_and_process { psql -f "$metadata_query" "$PGDATABASE" > metadata.json git clone "$params.databases.crw.r2dt_repo" r2dt rnac crw r2dt-to-fasta r2dt/data/crw-fasta sequences.fasta - rnac crw parse metadata.json $sequences + rnac crw parse metadata.json sequences.fasta """ } diff --git a/workflows/databases/ena.nf b/workflows/databases/ena.nf index 7f1a6a2af..aa9ebb3d7 100644 --- a/workflows/databases/ena.nf +++ b/workflows/databases/ena.nf @@ -2,6 +2,8 @@ process fetch_directory { tag { "$name" } when { params.databases.ena.run } clusterOptions '-sp 100' + queue 'datamover' + containerOptions '--bind /nfs:/nfs' input: tuple val(name), val(remote) diff --git a/workflows/databases/ensembl.nf b/workflows/databases/ensembl.nf index bcecf048e..e8185cd0c 100644 --- a/workflows/databases/ensembl.nf +++ b/workflows/databases/ensembl.nf @@ -28,15 +28,16 @@ process find_urls { output: path('species.txt') - """ - rnac ensembl urls-for $division ${params.databases.ensembl[division].ftp_host} species.txt - """ + script: + """ + rnac ensembl urls-for $division ${params.databases.ensembl[division].ftp_host} species.txt + """ } process fetch_species_data { tag { "$species" } clusterOptions '-sp 90' - errorStrategy 'retry' + errorStrategy { task.exitStatis == 8 ? 'retry' : 'ignore' } maxRetries 10 maxForks 10 @@ -70,7 +71,7 @@ process parse_data { """ rnac ensembl parse $division --family-file $rfam $embl $gff . - rnac ensembl pseudogenes $division $embl ensembl-pseudogenes.csv +# rnac ensembl pseudogenes $division $embl ensembl-pseudogenes.csv """ } diff --git a/workflows/databases/expressionatlas.nf b/workflows/databases/expressionatlas.nf new file mode 100644 index 000000000..267e97520 --- /dev/null +++ b/workflows/databases/expressionatlas.nf @@ -0,0 +1,91 @@ +process fetch_data { + queue 'datamover' + container '' + errorStrategy 'ignore' + + input: + path("base_dir") + + output: + path('tsv_files') + + """ + mkdir tsv_files + find $base_dir -type f .. | xargs -I {} -P 10 cp {} tsv_files + """ +} + +process fetch_lookup { + queue 'short' + + input: + path (query) + + output: + path("lookup_dump.csv") + + """ + psql -f $query $PGDATABASE > lookup_dump.csv + """ +} + + +process parse_tsvs { + memory 24.GB + + input: + path(tsvs) + + output: + path('chunk_*') + + """ + expression-parse parse -i $tsvs -o all_genes.csv + split -n l/10 all_genes.csv chunk_ + """ + +} + +process lookup_genes { + + input: + path(lookup) + path(genes) + + output: + path('*.csv') + + """ + expression-parse lookup -g $genes -l $lookup -o exp_parse_stage2.json + rnac expressionatlas parse exp_parse_stage2.json . 
+ """ +} + + +workflow expressionatlas { + + emit: data + main: + + if( params.databases.expressionatlas.run ) { + Channel.fromPath('files/import-data/expressionatlas/lookup-dump-query.sql') | set { lookup_sql } + Channel.fromPath($params.databases.expressionatlas.remote) | set { tsv_path } + lookup_sql | fetch_lookup | set { lookup } + tsv_path \ + | fetch_data \ + | filter { tsv_name -> + !params.databases.expressionatlas.exclude.any {p -> tsv_name.baseName =~ p} + } \ + | parse_tsvs \ + | set { genes } + + lookup_genes(genes, lookup) \ + | collectFile() {csvfile -> [csvfile.name, csvfile.text]} \ + | set { data } + + } + else { + Channel.empty() | set { data } + } + +} diff --git a/workflows/databases/genecards_suite.nf b/workflows/databases/genecards_suite.nf index f9a1a87f9..34ee18f4e 100644 --- a/workflows/databases/genecards_suite.nf +++ b/workflows/databases/genecards_suite.nf @@ -1,5 +1,8 @@ process fetch { tag { "$name" } + queue 'datamover' + container '' + input: tuple val(name), path(data), val(column_name) diff --git a/workflows/databases/lncbook.nf b/workflows/databases/lncbook.nf index 697b939ba..ca1e92bc8 100644 --- a/workflows/databases/lncbook.nf +++ b/workflows/databases/lncbook.nf @@ -5,7 +5,8 @@ process lncbook { path('*.csv') """ - wget -O lncbook.json ${params.databases.lncbook.remote} + wget -O lncbook.json.gz ${params.databases.lncbook.remote} + gzip -d lncbook.json.gz rnac lncbook parse lncbook.json . """ } diff --git a/workflows/databases/lncipedia.nf b/workflows/databases/lncipedia.nf index 20dd8427e..b073050c3 100644 --- a/workflows/databases/lncipedia.nf +++ b/workflows/databases/lncipedia.nf @@ -8,7 +8,7 @@ process lncipedia { path('*.csv') """ - curl ${params.databases.lncipedia.remote} > lncipedia.json + wget -O lncipedia.json ${params.databases.lncipedia.remote} rnac lncipedia parse lncipedia.json . """ } diff --git a/workflows/databases/pdbe.nf b/workflows/databases/pdbe.nf index 7d5715387..8b7382dab 100644 --- a/workflows/databases/pdbe.nf +++ b/workflows/databases/pdbe.nf @@ -5,6 +5,9 @@ process pdbe { path('*.csv') """ - rnac pdb generate . + wget --read-timeout=30 -t 1 -O pdb_full_region.txt.gz http://ftp.ebi.ac.uk/pub/databases/Rfam/.preview/pdb_full_region.txt.gz + gzip -d pdb_full_region.txt.gz + awk 'BEGIN {OFS = FS = "\t" } \$11 == 1 { print \$2, \$3} ' pdb_full_region.txt | sort -u > rfam_hit_ids + rnac pdb generate --override-chains=rfam_hit_ids . 
""" } diff --git a/workflows/databases/plncdb.nf b/workflows/databases/plncdb.nf new file mode 100644 index 000000000..14cba07c7 --- /dev/null +++ b/workflows/databases/plncdb.nf @@ -0,0 +1,53 @@ +nextflow.enable.dsl = 2 + +process fetch_data { + when { !params.databases.plncdb.prefetch and params.databases.plncdb.run } + + containerOptions "--contain --bind $baseDir" + + output: + path("data") + + """ + rnac plncdb fetch-data $params.databases.plncdb.urls data + """ +} + +process parse_data { + when { params.databases.plncdb.run } + + queue 'short' + memory { 8.GB * task.attempt } + + errorStrategy 'retry' + maxRetries 16 + + input: + path data + + output: + path('*.csv') + + """ + # rnac notify step "Data parsing for PLncDB" $params.databases.plncdb.data_path$data + rnac plncdb parse $params.databases.plncdb.data_path$data + """ +} + +workflow plncdb { + emit: data_files + + main: + if( params.databases.plncdb.run ) { + Channel.fromPath("$params.databases.plncdb.data_path/*", type:'dir') \ + | parse_data \ + | flatten + | collectFile() {csvfile -> [csvfile.name, csvfile.text]} \ + | set { data_files } + } + else { + Channel.empty() | set { data_files } + } + + +} diff --git a/workflows/databases/quickgo.nf b/workflows/databases/quickgo.nf index cbba55bbf..de41f5c4f 100644 --- a/workflows/databases/quickgo.nf +++ b/workflows/databases/quickgo.nf @@ -1,13 +1,45 @@ -process quickgo { - when { params.databases.quickgo.run } +process quickgo_get { + queue 'datamover' + container '' + + output: + path('data.gpa') + + """ + scp $params.databases.quickgo.remote data.gpa.gz + gzip -d data.gpa.gz + """ +} + + + +process quickgo_parse { memory { params.databases.quickgo.memory } + input: + path(data) + output: path('*.csv') """ - scp $params.databases.quickgo.remote data.gpa.gz - gzip -d data.gpa - rnac quickgo parse data.gpa . + rnac quickgo parse $data . 
""" } + + + +workflow quickgo { + + emit: data + + main: + if ( params.databases.quickgo.run ) { + quickgo_get | quickgo_parse | set { data } + } + else { + Channel.empty() | set { data } + } + + +} diff --git a/workflows/databases/rfam.nf b/workflows/databases/rfam.nf index ba2137f5d..4c052634d 100644 --- a/workflows/databases/rfam.nf +++ b/workflows/databases/rfam.nf @@ -78,7 +78,7 @@ workflow rfam { emit: data main: Channel.fromPath('files/import-data/rfam/select-families.sql') | set { family_sql } - Channel.fromPath('files/import-data/rfam/select-families.sql') | set { family_sql } + Channel.fromPath('files/import-data/rfam/families.sql') | set { info_sql } Channel.fromPath('files/import-data/rfam/sequences.sql') | set { sequence_sql } info_sql | fetch_families_info | set { info } diff --git a/workflows/databases/select.nf b/workflows/databases/select.nf new file mode 100644 index 000000000..b9a36fa84 --- /dev/null +++ b/workflows/databases/select.nf @@ -0,0 +1,63 @@ +nextflow.enable.dsl=2 + + + +process check_db_md5 { + container '' + + input: + tuple val(db_name), val(remote) + + output: + path("*.csv") + + + """ + wget -O target_file $remote + echo -n "$db_name," >> latest_md5s.csv && md5sum target_file | awk 'BEGIN {fs="[ ]"}; {print \$1}' >> latest_md5s.csv + """ +} + + +process make_selection { + publishDir "$projectDir" + + input: + path latest_md5s + + output: + path ("*.config") + path ("$latest_md5s") + + """ + rnac scan-imports select-for-import $latest_md5s + """ + +} + + +process update_tracker_table { + input: + path latest_md5s + + """ + rnac scan-imports update-tracker $latest_md5s + """ +} + + +workflow select { + + Channel.fromPath(params.import_selection_remotes) \ + | splitCsv + | map { row -> tuple(row[0], row[1])} + | check_db_md5 + | collectFile + | ( make_selection & update_tracker_table ) + +} + + +workflow { + select() +} diff --git a/workflows/databases/silva.nf b/workflows/databases/silva.nf index 7407631c2..91e2b258c 100644 --- a/workflows/databases/silva.nf +++ b/workflows/databases/silva.nf @@ -5,7 +5,7 @@ process fetch { path('*.rnac') """ - wget $params.databases.silva.remote + wget -e robots=off -nH -r --cut-dirs 3 --no-parent -A "SILVA_*Parc.rnac.gz" $params.databases.silva.remote gzip -d *.gz """ } diff --git a/workflows/databases/zwd.nf b/workflows/databases/zwd.nf index fcd3f8bfd..a6ab0b1e7 100644 --- a/workflows/databases/zwd.nf +++ b/workflows/databases/zwd.nf @@ -8,7 +8,7 @@ process zwd { path('*.csv') """ - cp $params.databases.zwd.remote zwd.json + wget -O zwd.json $params.databases.zwd.remote rnac zwd parse $context zwd.json . """ } diff --git a/workflows/export/sequence-search.nf b/workflows/export/sequence-search.nf index 1ada4cdee..90fda0e2a 100755 --- a/workflows/export/sequence-search.nf +++ b/workflows/export/sequence-search.nf @@ -23,7 +23,7 @@ process query_database { maxForks params.export.sequence_search.max_forks input: - tuple val(name), path(query), val(partition) + tuple val(name), path(query), val(partition) output: tuple val(name), path('raw.json') @@ -64,6 +64,7 @@ process create_fasta { process atomic_publish { stageInMode 'copy' + queue 'datamover' input: path(fasta) diff --git a/workflows/export/text-search.nf b/workflows/export/text-search.nf index 991c87ede..7f6e5c3f8 100755 --- a/workflows/export/text-search.nf +++ b/workflows/export/text-search.nf @@ -23,7 +23,7 @@ process create_release_note { // At this point we should be able to safely move data into the final location. 
// This deletes the old data and then moves the new data in place. process atomic_publish { - container '' + queue 'datamover' input: path('release_note.txt') diff --git a/workflows/export/text-search/sequences.nf b/workflows/export/text-search/sequences.nf index 451cd8b09..9c308cb11 100755 --- a/workflows/export/text-search/sequences.nf +++ b/workflows/export/text-search/sequences.nf @@ -90,9 +90,9 @@ process build_ranges { process fetch_accession { tag { "$min-$max" } maxForks 3 - time '10m' errorStrategy 'retry' maxRetries 5 + container '' input: tuple val(min), val(max), path(sql), val(_flag) @@ -114,15 +114,16 @@ process text_mining_query { input: val(max_count) path(script) + container '' output: - path("text-mining.json") + path("publication-count.json") """ curl "$params.export.search.text_mining" > counts.csv psql -v ON_ERROR_STOP=1 -c "\\copy search_export_publication_counts from 'counts.csv'" "$PGDATABASE" psql -v ON_ERROR_STOP=1 -f "$script" "$PGDATABASE" > raw.json - search-export group text-mining raw.json ${max_count} text-mining.json + search-export group publication-count raw.json ${max_count} publication-count.json """ } diff --git a/workflows/load-data.nf b/workflows/load-data.nf index 298e0a159..874be7c1a 100644 --- a/workflows/load-data.nf +++ b/workflows/load-data.nf @@ -14,7 +14,7 @@ process create_load_tables { process merge_and_import { tag { name } - memory 3.GB + memory 9.GB maxForks 2 containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" diff --git a/workflows/lookup-references.nf b/workflows/lookup-references.nf index cbd970355..010ae1cfa 100644 --- a/workflows/lookup-references.nf +++ b/workflows/lookup-references.nf @@ -16,12 +16,15 @@ process merge_and_split_all_publications { process fetch_publications { when { params.needs_publications } + queue 'datamover' + executor 'lsf' + container '' output: path('out') """ - curl -L http://europepmc.org/ftp/pmclitemetadata/PMCLiteMetadata.tgz > PMCLiteMetadata.tgz + cp /nfs/ftp/public/databases/pmc/PMCLiteMetadata/PMCLiteMetadata.tgz . 
tar xvf PMCLiteMetadata.tgz """ } diff --git a/workflows/metadata/taxonomy.nf b/workflows/metadata/taxonomy.nf index d0583923e..ca432235c 100644 --- a/workflows/metadata/taxonomy.nf +++ b/workflows/metadata/taxonomy.nf @@ -1,11 +1,14 @@ process taxonomy { memory '2GB' + errorStrategy 'retry' output: path('*.csv') """ - wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz + wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz + wget https://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5 + md5sum -c new_taxdump.tar.gz.md5 tar xvf new_taxdump.tar.gz mkdir taxdump mv *.dmp taxdump diff --git a/workflows/parse-databases.nf b/workflows/parse-databases.nf index cd02d2c9e..daf7d709a 100644 --- a/workflows/parse-databases.nf +++ b/workflows/parse-databases.nf @@ -1,6 +1,7 @@ include { crw } from './databases/crw' include { ena } from './databases/ena' include { ensembl } from './databases/ensembl' +include { expressionatlas } from './databases/expressionatlas' include { five_s_rrnadb } from './databases/5srrnadb' include { flybase } from './databases/flybase' include { genecards_suite } from './databases/genecards_suite' @@ -14,6 +15,7 @@ include { mirbase } from './databases/mirbase' include { mirgenedb } from './databases/mirgenedb' include { pdbe } from './databases/pdbe' include { pirbase } from './databases/pirbase' +include { plncdb } from './databases/plncdb' include { pombase } from './databases/pombase' include { psicquic } from './databases/psicquic' include { quickgo } from './databases/quickgo' @@ -57,6 +59,7 @@ workflow parse_databases { five_s_rrnadb(), ena(), ensembl(), + expressionatlas(), flybase(), genecards_suite(), gtrnadb(context), @@ -69,6 +72,7 @@ workflow parse_databases { mirgenedb(), pdbe(), pirbase(), + plncdb(), pombase(), psicquic(), quickgo(), diff --git a/workflows/precompute/build_urs_table.nf b/workflows/precompute/build_urs_table.nf index 807490c2c..18cda328f 100644 --- a/workflows/precompute/build_urs_table.nf +++ b/workflows/precompute/build_urs_table.nf @@ -29,6 +29,8 @@ process fetch_all_urs_taxid { process select_outdated { containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" + memory '24 GB' + cpus 4 input: path('xref.csv') @@ -96,7 +98,8 @@ process sort_ids { process xref_releases { input: - tuple val(_flag), file(query) + tuple val(_flag) + file(query) output: path('data.csv') @@ -106,9 +109,10 @@ process xref_releases { """ } -process fetch_release_info { +process precompute_releases { input: - tuple val(_flag), file(query) + val(_flag) + file(query) output: path('data.csv') @@ -158,7 +162,7 @@ workflow using_ids { flag \ | map { _flag -> file(params.precompute.select.id_file) } \ - | set { id_files } + | set { id_files } sort_ids(flag, id_files) | set { selected } } diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf index da8f3082d..3be666626 100644 --- a/workflows/r2dt.nf +++ b/workflows/r2dt.nf @@ -58,7 +58,7 @@ process layout_sequences { tag { "${sequences}" } memory params.r2dt.layout.memory container params.r2dt.container - containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms" + containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms" errorStrategy { task.exitStatus = 130 ? 'ignore' : 'terminate' } input: @@ -75,8 +75,9 @@ process layout_sequences { process publish_layout { maxForks 50 - errorStrategy { task.attempt < 5 ? "retry" : "finish" } + errorStrategy { task.attempt < 5 ? 
"retry" : "ignore" } maxRetries 5 + queue 'datamover' input: tuple path(sequences), path(output), path(mapping) @@ -94,6 +95,7 @@ process publish_layout { process parse_layout { input: tuple path(sequences), path(to_parse), path(mapping) + errorStrategy "ignore" output: path "data.csv", emit: data @@ -142,10 +144,10 @@ workflow common { workflow for_database { take: sequences - emit: + emit: parsed layouts - main: + main: common | set { model_mapping } sequences \ @@ -183,7 +185,7 @@ workflow r2dt { | set { data } data | publish_layout - data | parse_layout + data | parse_layout parse_layout.out.data | collect | set { data } parse_layout.out.attempted | collect | set { attempted } diff --git a/workflows/references/manually_annotated/query.sql b/workflows/references/manually_annotated/query.sql new file mode 100644 index 000000000..fdde38275 --- /dev/null +++ b/workflows/references/manually_annotated/query.sql @@ -0,0 +1,17 @@ +select + xref.upi || '_' || xref.taxid, + -- acc.accession, + acc."database", + refs.pmid, + refs.doi, + refs.pmcid + -- refs.epmcid +from rnc_accessions acc +join xref +on xref.ac = acc.accession +join rnc_reference_map rmap on rmap.accession = acc.accession +join rnc_references refs on refs.id = rmap.reference_id +where + xref.dbid in (24, 20, 14, 16, 18, 23, 27, 44, 48) + and xref.deleted = 'N' +; diff --git a/workflows/references/queries/ensembl_gene.sql b/workflows/references/queries/ensembl.sql similarity index 82% rename from workflows/references/queries/ensembl_gene.sql rename to workflows/references/queries/ensembl.sql index 20592eee3..a9c194b30 100644 --- a/workflows/references/queries/ensembl_gene.sql +++ b/workflows/references/queries/ensembl.sql @@ -1,9 +1,10 @@ -- ENSEMBL select - gene, -- Also search for everything up to the first '.' - external_id, upi, - taxid + taxid, + external_id, + gene, -- Also search for everything up to the first '.' + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_gencode_gene.sql b/workflows/references/queries/ensembl_gencode.sql similarity index 90% rename from workflows/references/queries/ensembl_gencode_gene.sql rename to workflows/references/queries/ensembl_gencode.sql index ed7432395..b4c63180f 100644 --- a/workflows/references/queries/ensembl_gencode_gene.sql +++ b/workflows/references/queries/ensembl_gencode.sql @@ -1,9 +1,10 @@ -- ENSEMBL_GENCODE select - gene, -- Also search for everything up to the first '.' - external_id, upi, - taxid + taxid, + external_id, + gene, -- Also search for everything up to the first '.' 
+ locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_gencode_locus_tag.sql b/workflows/references/queries/ensembl_gencode_locus_tag.sql deleted file mode 100644 index 746306912..000000000 --- a/workflows/references/queries/ensembl_gencode_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL_GENCODE -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_GENCODE') -; diff --git a/workflows/references/queries/ensembl_locus_tag.sql b/workflows/references/queries/ensembl_locus_tag.sql deleted file mode 100644 index 7ad75b2ae..000000000 --- a/workflows/references/queries/ensembl_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL') -; diff --git a/workflows/references/queries/ensembl_metazoa_gene.sql b/workflows/references/queries/ensembl_metazoa.sql similarity index 90% rename from workflows/references/queries/ensembl_metazoa_gene.sql rename to workflows/references/queries/ensembl_metazoa.sql index 16d9619e3..c9fa76c9a 100644 --- a/workflows/references/queries/ensembl_metazoa_gene.sql +++ b/workflows/references/queries/ensembl_metazoa.sql @@ -1,9 +1,10 @@ -- ENSEMBL METAZOA select - gene, -- Also search for everything up to the first '.' - external_id, upi, - taxid + taxid, + external_id, + gene, -- Also search for everything up to the first '.' + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_metazoa_locus_tag.sql b/workflows/references/queries/ensembl_metazoa_locus_tag.sql deleted file mode 100644 index ee1f7286e..000000000 --- a/workflows/references/queries/ensembl_metazoa_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL METAZOA -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_METAZOA') -; diff --git a/workflows/references/queries/ensembl_plants_gene.sql b/workflows/references/queries/ensembl_plants.sql similarity index 88% rename from workflows/references/queries/ensembl_plants_gene.sql rename to workflows/references/queries/ensembl_plants.sql index 2ed18276e..8078e5216 100644 --- a/workflows/references/queries/ensembl_plants_gene.sql +++ b/workflows/references/queries/ensembl_plants.sql @@ -1,9 +1,10 @@ -- ENSEMBL PLANTS select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_plants_locus_tag.sql b/workflows/references/queries/ensembl_plants_locus_tag.sql deleted file mode 100644 index ea2c1f3b7..000000000 --- a/workflows/references/queries/ensembl_plants_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL PLANTS -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_PLANTS') -; diff --git a/workflows/references/queries/ensembl_protists_gene.sql b/workflows/references/queries/ensembl_protists.sql similarity index 88% rename from workflows/references/queries/ensembl_protists_gene.sql rename to workflows/references/queries/ensembl_protists.sql index c0ec32cc1..332e924a8 100644 --- a/workflows/references/queries/ensembl_protists_gene.sql 
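The renames above fold each database's separate `*_gene` and `*_locus_tag` queries into one file that returns both columns. Reconstructed for illustration from the hunks above (this is not a new file in the patch), the consolidated ENSEMBL query has this shape:

```sql
-- ENSEMBL (illustrative reconstruction of the renamed ensembl.sql)
select
  upi,
  taxid,
  external_id,
  gene,      -- Also search for everything up to the first '.'
  locus_tag
from xref x
join rnc_accessions ra
on
  ra.accession = x.ac
where
  x.deleted = 'N'
  and ra."database" in ('ENSEMBL')
;
```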
+++ b/workflows/references/queries/ensembl_protists.sql @@ -1,9 +1,10 @@ -- ENSEMBL PROTISTS select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/ensembl_protists_locus_tag.sql b/workflows/references/queries/ensembl_protists_locus_tag.sql deleted file mode 100644 index 1b12b1e55..000000000 --- a/workflows/references/queries/ensembl_protists_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ENSEMBL PROTISTS -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('ENSEMBL_PROTISTS') -; diff --git a/workflows/references/queries/flybase_gene_synonym.sql b/workflows/references/queries/flybase.sql similarity index 88% rename from workflows/references/queries/flybase_gene_synonym.sql rename to workflows/references/queries/flybase.sql index 41823a045..735010c39 100644 --- a/workflows/references/queries/flybase_gene_synonym.sql +++ b/workflows/references/queries/flybase.sql @@ -1,9 +1,10 @@ -- Flybase select - gene_synonym, -- Split on , - external_id, upi, - taxid + taxid, + external_id, + gene_synonym, -- Split on , + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/flybase_locus_tag.sql b/workflows/references/queries/flybase_locus_tag.sql deleted file mode 100644 index 261af9de8..000000000 --- a/workflows/references/queries/flybase_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- Flybase -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" = 'FLYBASE' -; diff --git a/workflows/references/queries/genecards.sql b/workflows/references/queries/genecards.sql index 62c1d15e4..765e01045 100644 --- a/workflows/references/queries/genecards.sql +++ b/workflows/references/queries/genecards.sql @@ -1,8 +1,8 @@ -- GENECARDS select - gene, upi, - taxid + taxid, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/gtrnadb.sql b/workflows/references/queries/gtrnadb.sql index dfc535087..eb0ab6800 100644 --- a/workflows/references/queries/gtrnadb.sql +++ b/workflows/references/queries/gtrnadb.sql @@ -1,8 +1,8 @@ -- GTRNADB select - gene, upi, - taxid + taxid, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/hgnc_accession.sql b/workflows/references/queries/hgnc.sql similarity index 80% rename from workflows/references/queries/hgnc_accession.sql rename to workflows/references/queries/hgnc.sql index a8eba7ccc..9a3fdf0c6 100644 --- a/workflows/references/queries/hgnc_accession.sql +++ b/workflows/references/queries/hgnc.sql @@ -1,9 +1,10 @@ -- HGNC select - accession, - gene, upi, - taxid + taxid, + gene, + accession, + gene_synonym from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/hgnc_gene_synonym.sql b/workflows/references/queries/hgnc_gene_synonym.sql deleted file mode 100644 index 142752c31..000000000 --- a/workflows/references/queries/hgnc_gene_synonym.sql +++ /dev/null @@ -1,14 +0,0 @@ --- HGNC -select - gene_synonym, - gene, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" = 'HGNC' -; diff --git a/workflows/references/queries/mirbase.sql b/workflows/references/queries/mirbase.sql index 7530b38f8..0d804c00e 100644 --- a/workflows/references/queries/mirbase.sql +++ 
b/workflows/references/queries/mirbase.sql @@ -1,9 +1,9 @@ -- MIRBASE -select - optional_id, - external_id, +select upi, - taxid + taxid, + external_id, + optional_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/mirgenedb.sql b/workflows/references/queries/mirgenedb.sql index 98d372191..f558be54a 100644 --- a/workflows/references/queries/mirgenedb.sql +++ b/workflows/references/queries/mirgenedb.sql @@ -1,8 +1,8 @@ -- MIRGENEDB select - external_id, upi, - taxid + taxid, + external_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/pdbe.sql b/workflows/references/queries/pdbe.sql index e53d315ec..eca4b6644 100644 --- a/workflows/references/queries/pdbe.sql +++ b/workflows/references/queries/pdbe.sql @@ -1,8 +1,8 @@ -- PDBE select - external_id, upi, - taxid + taxid, + external_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/pombase_gene.sql b/workflows/references/queries/pombase.sql similarity index 79% rename from workflows/references/queries/pombase_gene.sql rename to workflows/references/queries/pombase.sql index 211c45232..5e868e181 100644 --- a/workflows/references/queries/pombase_gene.sql +++ b/workflows/references/queries/pombase.sql @@ -1,9 +1,10 @@ -- POMBASE select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + gene_synonym -- SPlit on ',' from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/pombase_gene_synonym.sql b/workflows/references/queries/pombase_gene_synonym.sql deleted file mode 100644 index ed9562602..000000000 --- a/workflows/references/queries/pombase_gene_synonym.sql +++ /dev/null @@ -1,14 +0,0 @@ --- POMBASE -select - gene_synonym, -- SPlit on ',' - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('POMBASE') -; diff --git a/workflows/references/queries/refseq_gene.sql b/workflows/references/queries/refseq.sql similarity index 66% rename from workflows/references/queries/refseq_gene.sql rename to workflows/references/queries/refseq.sql index 8dbf153ac..8dc4cc9d0 100644 --- a/workflows/references/queries/refseq_gene.sql +++ b/workflows/references/queries/refseq.sql @@ -1,9 +1,11 @@ -- REFSEQ select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene, + gene_synonym, -- Split on ',' + optional_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/refseq_gene_synonym.sql b/workflows/references/queries/refseq_gene_synonym.sql deleted file mode 100644 index a60ac7e4a..000000000 --- a/workflows/references/queries/refseq_gene_synonym.sql +++ /dev/null @@ -1,14 +0,0 @@ --- REFSEQ -select - gene_synonym, -- Split on ',' - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('REFSEQ') -; diff --git a/workflows/references/queries/refseq_optional_id.sql b/workflows/references/queries/refseq_optional_id.sql deleted file mode 100644 index 84287782c..000000000 --- a/workflows/references/queries/refseq_optional_id.sql +++ /dev/null @@ -1,14 +0,0 @@ --- REFSEQ -select - optional_id, -- Do not split - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('REFSEQ') -; diff --git a/workflows/references/queries/rfam.sql b/workflows/references/queries/rfam.sql index f1f2b319c..83ba1406f 100644 --- a/workflows/references/queries/rfam.sql 
+++ b/workflows/references/queries/rfam.sql @@ -1,9 +1,9 @@ -- Rfam select - optional_id, - external_id, upi, - taxid + taxid, + external_id, + optional_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/sgd.sql b/workflows/references/queries/sgd.sql index 5a2674ae1..5034ff5f0 100644 --- a/workflows/references/queries/sgd.sql +++ b/workflows/references/queries/sgd.sql @@ -1,8 +1,8 @@ -- SGD select - external_id, upi, - taxid + taxid, + external_id from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/tair.sql b/workflows/references/queries/tair.sql index de6676337..a21438632 100644 --- a/workflows/references/queries/tair.sql +++ b/workflows/references/queries/tair.sql @@ -1,9 +1,9 @@ -- TAIR select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/wormbase.sql b/workflows/references/queries/wormbase.sql index 790ee75b3..275337074 100644 --- a/workflows/references/queries/wormbase.sql +++ b/workflows/references/queries/wormbase.sql @@ -1,9 +1,10 @@ -- WORMBASE select - optional_id, - external_id, upi, - taxid + taxid, + external_id, + optional_id, + locus_tag from xref x join rnc_accessions ra on diff --git a/workflows/references/queries/wormbase_locus_tag.sql b/workflows/references/queries/wormbase_locus_tag.sql deleted file mode 100644 index cd6d69dd7..000000000 --- a/workflows/references/queries/wormbase_locus_tag.sql +++ /dev/null @@ -1,14 +0,0 @@ --- WORMBASE -select - locus_tag, - external_id, - upi, - taxid -from xref x -join rnc_accessions ra -on - ra.accession = x.ac -where - x.deleted = 'N' - and ra."database" in ('WORMBASE') -; diff --git a/workflows/references/queries/zfin.sql b/workflows/references/queries/zfin.sql index 3cfdd6e51..3d03623db 100644 --- a/workflows/references/queries/zfin.sql +++ b/workflows/references/queries/zfin.sql @@ -1,9 +1,9 @@ -- ZFIN select - gene, - external_id, upi, - taxid + taxid, + external_id, + gene from xref x join rnc_accessions ra on diff --git a/workflows/references/submit/ensembl_gencode_ids.txt b/workflows/references/submit/ensembl_gencode_ids.txt new file mode 100644 index 000000000..d7809e737 --- /dev/null +++ b/workflows/references/submit/ensembl_gencode_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e54218795a93e36c7bb522f430235c1e0e80561712c4db4d612c782919e4c6 +size 2954775 diff --git a/workflows/references/submit/ensembl_ids.txt b/workflows/references/submit/ensembl_ids.txt new file mode 100644 index 000000000..81b662657 --- /dev/null +++ b/workflows/references/submit/ensembl_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd35397a3c3036fe909568f273c0d22c5073141501ac3a5871e9085a996fc46 +size 77758708 diff --git a/workflows/references/submit/ensembl_metazoa_ids.txt b/workflows/references/submit/ensembl_metazoa_ids.txt new file mode 100644 index 000000000..0163a44ae --- /dev/null +++ b/workflows/references/submit/ensembl_metazoa_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ec48f51e3650751a4735e169eabd213bfce86fef1be8cb132b2e18937ff0ba8 +size 5163520 diff --git a/workflows/references/submit/ensembl_plants_ids.txt b/workflows/references/submit/ensembl_plants_ids.txt new file mode 100644 index 000000000..fd8ebc36b --- /dev/null +++ b/workflows/references/submit/ensembl_plants_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7fd63f1099cdf76fa152329273fed50a19b7b5e1e04f58a500d51501660dd7f5 +size 5432399 diff --git a/workflows/references/submit/ensembl_protists_ids.txt b/workflows/references/submit/ensembl_protists_ids.txt new file mode 100644 index 000000000..6b8d3f4e3 --- /dev/null +++ b/workflows/references/submit/ensembl_protists_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aadc96282b91ecb4a5d6acd0dbbc9429f455fe67fa6a0d732dff5f3b2070a9d +size 544884 diff --git a/workflows/references/submit/flybase_ids.txt b/workflows/references/submit/flybase_ids.txt new file mode 100644 index 000000000..eb17f7f0f --- /dev/null +++ b/workflows/references/submit/flybase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d59a992feb5fe136d35a5aa257607fbcbf54f2f1091e885158a1292cb3ff488 +size 253169 diff --git a/workflows/references/submit/genecards_ids.txt b/workflows/references/submit/genecards_ids.txt new file mode 100644 index 000000000..982241102 --- /dev/null +++ b/workflows/references/submit/genecards_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce18cd50752c9630b709b4682e0981a6eb84ff02083ab66823db3e99a885c55 +size 12648409 diff --git a/workflows/references/submit/gtrnadb_ids.txt b/workflows/references/submit/gtrnadb_ids.txt new file mode 100644 index 000000000..9db72c482 --- /dev/null +++ b/workflows/references/submit/gtrnadb_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58bd8991d16966046140cc57f884da3050286b0510f1abff4b2acca783ab6bc7 +size 5291180 diff --git a/workflows/references/submit/hgnc_ids.txt b/workflows/references/submit/hgnc_ids.txt new file mode 100644 index 000000000..1582c2b80 --- /dev/null +++ b/workflows/references/submit/hgnc_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6decd826f276b1b43d19c23341fb1a30f35ba86a19db05692b0d3195e6b478 +size 79668 diff --git a/workflows/references/submit/mirbase_ids.txt b/workflows/references/submit/mirbase_ids.txt new file mode 100644 index 000000000..bb2816ec1 --- /dev/null +++ b/workflows/references/submit/mirbase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384472d7ce2e65e9e693572341b6d7bd8258b6f1cd626f4799835ee4276ee008 +size 3829708 diff --git a/workflows/references/submit/mirgenedb_ids.txt b/workflows/references/submit/mirgenedb_ids.txt new file mode 100644 index 000000000..d1bdada0c --- /dev/null +++ b/workflows/references/submit/mirgenedb_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da77cb18cdbb27b64b9a174f004436391d446921b01c0b5d188f97b2c5d8dd97 +size 1209980 diff --git a/workflows/references/submit/pdbe_ids.txt b/workflows/references/submit/pdbe_ids.txt new file mode 100644 index 000000000..99950072f --- /dev/null +++ b/workflows/references/submit/pdbe_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8625ddfbdeccf32a95629d71f6fa4057e932b0062b1baa9589c7aaf971febc10 +size 98027 diff --git a/workflows/references/submit/pombase_ids.txt b/workflows/references/submit/pombase_ids.txt new file mode 100644 index 000000000..5b54cd100 --- /dev/null +++ b/workflows/references/submit/pombase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfae65f806baeb9cd5e45aff4c10c3949909e40289d41ed403a37b3aef2ee4bc +size 373012 diff --git a/workflows/references/submit/refseq_ids.txt b/workflows/references/submit/refseq_ids.txt new file mode 100644 index 000000000..536cfae9a --- 
/dev/null +++ b/workflows/references/submit/refseq_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d874ce9eb8fbc20bd251a54f344f3a590cd3fedc0d6f4a5bd801da3074d6374d +size 2902887 diff --git a/workflows/references/submit/rfam_ids.txt b/workflows/references/submit/rfam_ids.txt new file mode 100644 index 000000000..e84a2fe53 --- /dev/null +++ b/workflows/references/submit/rfam_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a9a091da295c6f9ed877e6990473b6878d9307ecdef840b4f947cc028b051f +size 43048703 diff --git a/workflows/references/submit/sgd_ids.txt b/workflows/references/submit/sgd_ids.txt new file mode 100644 index 000000000..5d38566a7 --- /dev/null +++ b/workflows/references/submit/sgd_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6886370d74b311a808323eb08eaedbd674c19d801b9e77aaf421fe229a4342e +size 9182 diff --git a/workflows/references/submit/tair_ids.txt b/workflows/references/submit/tair_ids.txt new file mode 100644 index 000000000..782ea8393 --- /dev/null +++ b/workflows/references/submit/tair_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b7561d9883f16bb8fea61d4652e3a22b946b7a9411ece4ccce312c41242a354 +size 190481 diff --git a/workflows/references/submit/wormbase_ids.txt b/workflows/references/submit/wormbase_ids.txt new file mode 100644 index 000000000..998d179e5 --- /dev/null +++ b/workflows/references/submit/wormbase_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a22355e705541c86481b4efe7865ea0942a39722ca9f389d1f5215c22d00240 +size 1537664 diff --git a/workflows/references/submit/zfin_ids.txt b/workflows/references/submit/zfin_ids.txt new file mode 100644 index 000000000..d3aada3da --- /dev/null +++ b/workflows/references/submit/zfin_ids.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a0f93922a5d5005c7c9889a2d03096ec51f389e200a809e1a3ac66dad8f7a4 +size 65577 diff --git a/workflows/rfam-scan.nf b/workflows/rfam-scan.nf index 25e775221..ddd392b9f 100644 --- a/workflows/rfam-scan.nf +++ b/workflows/rfam-scan.nf @@ -24,9 +24,10 @@ process generate_files { } process sequences { - memory '20GB' + memory '4GB' + queue short containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" - clusterOptions '-R "rusage[scratch=4000]"' + // clusterOptions '-R "rusage[scratch=4000]"' input: tuple path(version), path(active_xrefs), path(computed), path(compute_missing) @@ -50,6 +51,7 @@ process scan { memory { params.rfam.memory * params.rfam.cpus } errorStrategy 'ignore' containerOptions "--contain --workdir $baseDir/work/tmp --bind $baseDir" + queue 'short' input: tuple path(version), path('sequences.fasta'), path(cm_files) diff --git a/workflows/utils/slack.nf b/workflows/utils/slack.nf new file mode 100644 index 000000000..06a15ef5e --- /dev/null +++ b/workflows/utils/slack.nf @@ -0,0 +1,47 @@ +process slack_message { + + input: + val(message) + + """ + rnac notify step "Import Workflow" "$message" + """ + +} + + +process slack_file { + + input: + path(message) + + """ + rnac notify file "$message" + """ + +} + + +import groovy.json.JsonSlurper + +// A groovy function for use in closures - uses groovy's own URL class to make the request +def slack_closure(msg) { + def configFile = new File("secrets.json"); + def config = new JsonSlurper().parseFile(configFile, 'UTF-8'); + + def post = new URL(config.SLACK_WEBHOOK).openConnection(); + post.setRequestMethod("POST") + 
post.setDoOutput(true); + post.setRequestProperty("Content-Type", "application/json"); + + def payload = "{\"text\" : \"$msg\" }" + + + post.getOutputStream().write(payload.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (postRC != 200) { + println("Something went wrong calling slack webhook!"); + // getInputStream() throws for HTTP error responses; read the error stream instead + println(post.getErrorStream()?.getText()); + } + +}
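`slack_closure` is written to be called from Groovy closures rather than from a process. An illustrative hook-up, assuming `slack.nf` is included where the handler is defined and `secrets.json` sits in the launch directory:

```groovy
// Sketch only: wiring slack_closure into Nextflow's workflow event handlers.
workflow.onComplete {
    def status = workflow.success ? 'finished OK' : 'failed'
    slack_closure("Weekly import ${status} after ${workflow.duration}")
}

workflow.onError {
    slack_closure("Weekly import errored: ${workflow.errorMessage}")
}
```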