RNAcentral · afg1 · Oct 28, 2022 · May 11, 2022 · May 11, 2022 · May 11, 2022
diff --git a/.editorconfig b/.editorconfig
@@ -34,3 +34,6 @@ indent_size = 2
 [*.yaml]
 indent_style = space
 indent_size = 2
+
+[*.nf]
+indent_size = 2
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+workflows/references/submit/*.txt filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -0,0 +1,83 @@
+# This workflow builds the import pipeline container and pushes it to Docker
+# Hub; once that is done the image is converted to Singularity as well.
+# The plan is to include unit tests later.
+
+
+name: Building Pipeline Containers
+
+on:
+  push:
+    branches:
+      - dev
+jobs:
+
+  # Announce on Slack that a build has been triggered.
+  starting-notification:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Initial notification
+        uses: rtCamp/action-slack-notify@v2
+        env:
+          SLACK_MESSAGE: 'Creating new pipeline image in docker hub'
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          MSG_MINIMAL: true
+
+  # Build the image from the repository's Dockerfile and push it to Docker Hub.
+  create-docker-image:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: docker login
+        env:
+          DOCKER_USER: ${{ secrets.DOCKER_USER }}
+          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
+        # The password is piped through stdin so that it is never passed as a
+        # command-line argument (docker itself warns that -p is insecure).
+        run: echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USER" --password-stdin
+
+      - name: docker build
+        run: docker build -f Dockerfile -t rnacentral/rnacentral-import-pipeline .
+
+      - name: docker push
+        run: docker push rnacentral/rnacentral-import-pipeline
+
+  # Report on Slack once the Docker image has been pushed.
+  finished-notification:
+    needs:
+      - create-docker-image
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Finished notification
+        uses: rtCamp/action-slack-notify@v2
+        env:
+          SLACK_MESSAGE: 'New pipeline image pushed to docker hub'
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          MSG_MINIMAL: true
+
+  # Run the reusable singularity.yaml workflow after the image has been pushed.
+  singularity-conversion:
+    needs:
+      - create-docker-image
+    uses: rnacentral/rnacentral-import-pipeline/.github/workflows/singularity.yaml@dev
+    secrets: inherit
+
+
+  # Report on Slack once the Singularity conversion job has finished.
+  finished-singularity:
+    needs:
+      - singularity-conversion
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Finished notification
+        uses: rtCamp/action-slack-notify@v2
+        env:
+          SLACK_MESSAGE: 'New singularity image pushed to ghcr'
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          MSG_MINIMAL: true
diff --git a/.github/workflows/singularity.yaml b/.github/workflows/singularity.yaml
@@ -0,0 +1,32 @@
+# This workflow converts the pipeline's Docker image to a Singularity image
+# and stores the result in ghcr so that it is easier to pull.
+
+name: Singularity Build
+on: workflow_call
+
+
+jobs:
+  run_conversion:
+    name: "Pull docker image and convert"
+    runs-on: ubuntu-latest
+
+    container:
+      image: quay.io/singularity/singularity:v3.8.1
+      options: --privileged
+
+    steps:
+      - name: "Pull image"
+        run: |
+          singularity pull --name rnacentral-rnacentral-import-pipeline-latest.sif docker://rnacentral/rnacentral-import-pipeline:latest
+
+      - name: "Push to ghcr"
+        # Secrets are handed over as environment variables instead of being
+        # expanded into the script text, so the shell never re-parses them.
+        env:
+          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GHCR_USERNAME: ${{ secrets.GHCR_USERNAME }}
+        run: |
+          echo "$GHCR_TOKEN" | singularity remote login -u "$GHCR_USERNAME" --password-stdin oras://ghcr.io
+          # Registry repository names must be lowercase; GITHUB_REPOSITORY keeps the owner's case.
+          image_repo="$(echo "$GITHUB_REPOSITORY" | tr '[:upper:]' '[:lower:]')"
+          singularity push rnacentral-rnacentral-import-pipeline-latest.sif "oras://ghcr.io/${image_repo}:latest"
diff --git a/.gitignore b/.gitignore
@@ -101,3 +101,8 @@ stubs
 .envrc
 workflows/references/results
 workflows/references/metadata
+workflows/references/backup
+workflows/references/submit/previous-release
+workflows/references/manually_annotated/from*
+workflows/references/manually_annotated/results
+singularity/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,11 +1,28 @@
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.2.0
+    rev: v4.3.0
     hooks:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer
     -   id: check-yaml
 -   repo: https://github.com/psf/black
-    rev: 19.3b0
+    rev: 22.6.0
     hooks:
     -   id: black
+-   repo: https://github.com/pycqa/isort
+    rev: 5.10.1
+    hooks:
+    -   id: isort
+        args: ["--profile", "black", "--filter-files"]
+        name: isort (python)
+# -   repo: https://github.com/doublify/pre-commit-rust
+#     rev: v1.0
+#     hooks:
+#     -   id: fmt
+#     -   id: cargo-check
+#     -   id: clippy
+-   repo: https://github.com/python-poetry/poetry
+    rev: '1.2.0rc1'
+    hooks:
+    -   id: poetry-check
+    # -   id: poetry-lock
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.7-buster
+FROM python:3.8-buster
 
 ENV RNA /rna
 
@@ -46,6 +46,7 @@ RUN apt-get install -y \
     unzip \
     wget
 
+
 # Install Infernal
 RUN \
     cd $RNA/ && \
@@ -94,6 +95,7 @@ RUN pip3 install -r $RNACENTRAL_IMPORT_PIPELINE/requirements.txt
 
 RUN python3 -m textblob.download_corpora
 
+
 WORKDIR /
 
 COPY openssl/openssl.cnf /etc/ssl/

diff --git a/Makefile b/Makefile
@@ -13,13 +13,25 @@ requirements-dev.txt: requirements-dev.in
 
 rust:
 	cargo build --release
-	cp target/release/json2fasta bin
-	cp target/release/split-ena bin
-	cp target/release/expand-urs bin
-	cp target/release/precompute bin
-	cp target/release/search-export bin
-	cp target/release/ftp-export bin
-	cp target/release/json2dfasta bin
+	mv -f target/release/json2fasta bin
+	mv -f target/release/split-ena bin
+	mv -f target/release/expand-urs bin
+	mv -f target/release/precompute bin
+	mv -f target/release/search-export bin
+	mv -f target/release/ftp-export bin
+	mv -f target/release/json2dfasta bin
+	mv -f target/release/expression-parse bin
+
+clean:  # -f: an already-missing binary must not abort the cleanup before `cargo clean`
+	rm -f bin/json2fasta
+	rm -f bin/split-ena
+	rm -f bin/expand-urs
+	rm -f bin/precompute
+	rm -f bin/search-export
+	rm -f bin/ftp-export
+	rm -f bin/json2dfasta
+	rm -f bin/expression-parse
+	cargo clean
 
 docker: Dockerfile requirements.txt .dockerignore
 	docker build -t "$(docker)" .

diff --git a/analyze.nf b/analyze.nf
@@ -7,13 +7,29 @@ include { genome_mapping } from './workflows/genome-mapping'
 include { r2dt } from './workflows/r2dt'
 include { rfam_scan } from './workflows/rfam-scan'
 
+include { slack_closure } from './workflows/utils/slack'
+include { slack_message } from './workflows/utils/slack'
+
 workflow analyze {
   take: ready
   emit: done
   main:
+    Channel.of("Starting analyze pipeline") | slack_message
     ready | (genome_mapping & rfam_scan & r2dt & cpat) | mix | collect | set { done }
 }
 
 workflow {
   analyze(Channel.of('ready'))
 }
+
+
+workflow.onComplete {
+  // onComplete also fires after a failed run, which onError already reports,
+  // so only announce completion when the run actually succeeded.
+  if (workflow.success) slack_closure("Analyze workflow completed")
+}
+
+workflow.onError {
+  // Fires when the run aborts on an error.
+  slack_closure("Analyze workflow hit an error and crashed")
+}
diff --git a/bin/check_ids.py b/bin/check_ids.py
@@ -26,10 +26,11 @@
 words.update(ignore_ids)
 special_char = re.compile('[@!#$%^&()<>?/\[\]\'}{~:]')
 nts = re.compile('^[acgu]+$')
+numbers_and_dash = re.compile(r'^\d+[\-]\d+$')  # do not use ids like 6-1, 260-1, etc (raw string: \d is not a valid str escape)
 
 
 def check_id(item):
-    if item.isnumeric() or item.lower() in words:
+    if item.isnumeric() or item.lower() in words or numbers_and_dash.search(item):
         result = None
     elif len(item) > 2 and not special_char.search(item) and not nts.search(item.lower()) and "\\" not in item:
         result = item
@@ -47,55 +48,72 @@ def main(database, filename, output):
     """
     Check ids and create file that will be used by RNAcentral-references.
     """
-    remove_dot = ["ensembl_gene", "ensembl_gencode_gene", "ensembl_metazoa_gene"]
-    split_on_comma = ["flybase_gene_synonym", "pombase_gene_synonym", "refseq_gene_synonym", "hgnc_gene_synonym"]
+    remove_dot = ["ensembl", "ensembl_gencode", "ensembl_metazoa"]
+    split_on_comma = ["flybase", "hgnc", "pombase", "refseq"]
+    rfam_ignore = [
+        "30_255", "30_292", "5S_rRNA", "5_8S_rRNA", "6A", "6S", "7SK", "C4", "CRISPR-DR10", "CRISPR-DR11",
+        "CRISPR-DR12", "CRISPR-DR13", "CRISPR-DR14", "CRISPR-DR15", "CRISPR-DR16", "CRISPR-DR17", "CRISPR-DR18",
+        "CRISPR-DR19", "CRISPR-DR2", "CRISPR-DR20", "CRISPR-DR21", "CRISPR-DR22", "CRISPR-DR23", "CRISPR-DR24",
+        "CRISPR-DR25", "CRISPR-DR26", "CRISPR-DR27", "CRISPR-DR28", "CRISPR-DR29", "CRISPR-DR3", "CRISPR-DR30",
+        "CRISPR-DR31", "CRISPR-DR32", "CRISPR-DR33", "CRISPR-DR34", "CRISPR-DR35", "CRISPR-DR36", "CRISPR-DR37",
+        "CRISPR-DR38", "CRISPR-DR39", "CRISPR-DR4", "CRISPR-DR40", "CRISPR-DR41", "CRISPR-DR42", "CRISPR-DR43",
+        "CRISPR-DR44", "CRISPR-DR45", "CRISPR-DR46", "CRISPR-DR47", "CRISPR-DR48", "CRISPR-DR49", "CRISPR-DR5",
+        "CRISPR-DR50", "CRISPR-DR51", "CRISPR-DR52", "CRISPR-DR53", "CRISPR-DR54", "CRISPR-DR55", "CRISPR-DR56",
+        "CRISPR-DR57", "CRISPR-DR58", "CRISPR-DR6", "CRISPR-DR60", "CRISPR-DR61", "CRISPR-DR62", "CRISPR-DR63",
+        "CRISPR-DR64", "CRISPR-DR65", "CRISPR-DR66", "CRISPR-DR7", "CRISPR-DR8", "CRISPR-DR9", "F6", "Hairpin",
+        "Hairpin-meta1", "Hairpin-meta2", "Hatchet", "P1", "P10", "P11", "P13", "P14", "P15", "P17", "P18", "P2", "P24",
+        "P26", "P27", "P31", "P33", "P34", "P35", "P36", "P37", "P4", "P5", "P6", "P8", "P9", "ROSE", "S35", "S414",
+        "S774", "S808", "SAM", "SL1", "SL2", "U1", "U11", "U12", "U1_yeast", "U2", "U3", "U4", "U4atac", "U5", "U54",
+        "U6", "U6atac", "U7", "U8", "VA", "csRNA", "drum", "g2", "pRNA", "sar", "sul1", "t44", "tRNA", "tRNA-Sec",
+        "tmRNA", "tp2", "tracrRNA"
+    ]
 
     with open(filename, 'r') as input_file:
         with open(output, 'w') as output_file:
             while line := input_file.readline():
                 line = line.rstrip()
                 line = line.split('|')
-
-                if len(line) == 4:
-                    get_gene = line[0]
-                    get_primary_id = line[1]
-                    urs = line[2]
-                    taxid = line[3]
-
-                    # remove "."
-                    if database in remove_dot and "." in get_gene:
-                        get_gene = get_gene.split('.')[0]
-
-                    # split on ","
-                    gene_results = []
-                    if database in split_on_comma:
-                        gene_list = get_gene.split(',')
-                        for item in gene_list:
-                            item = check_id(item)
-                            if item:
-                                gene_results.append(item)
-
-                    if gene_results:
-                        primary_id = check_id(get_primary_id)
-                        for gene in gene_results:
-                            if gene and primary_id and gene != primary_id:
-                                output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
-                    else:
-                        gene = check_id(get_gene)
-                        primary_id = check_id(get_primary_id)
-                        if gene and primary_id and gene != primary_id:
-                            output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
-
-                else:
-                    get_primary_id = line[0]
-                    urs = line[1]
-                    taxid = line[2]
-
-                    # check if it is a valid id
-                    primary_id = check_id(get_primary_id)
-
-                    if primary_id:
-                        output_file.write(primary_id + '|' + urs + '_' + taxid + '\n')
+                urs = line[0]
+                taxid = line[1]
+                primary_id = check_id(line[2])
+                if primary_id and database in remove_dot and "." in primary_id:
+                    primary_id = primary_id.split('.')[0]
+
+                if primary_id and line[3:]:
+                    for item in line[3:]:
+                        if item:
+                            get_id = item
+                        else:
+                            continue
+
+                        # ignore some optional_id from Rfam
+                        if database == "rfam" and get_id in rfam_ignore:
+                            output_file.write('|' + primary_id + '|' + urs + '_' + taxid + '\n')
+                            continue
+
+                        # remove "."
+                        if database in remove_dot and "." in get_id:
+                            get_id = get_id.split('.')[0]
+
+                        # split on ","
+                        results = []
+                        if database in split_on_comma:
+                            list_of_ids = get_id.split(',')
+                            for elem in list_of_ids:
+                                elem = check_id(elem)
+                                if elem:
+                                    results.append(elem)
+
+                        if results:
+                            for db_id in results:
+                                if db_id != primary_id:
+                                    output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
+                        else:
+                            db_id = check_id(get_id)
+                            if db_id and db_id != primary_id:
+                                output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
+                elif primary_id:
+                    output_file.write(primary_id + '|' + urs + '_' + taxid + '\n')
 
 
 if __name__ == '__main__':
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		workflows/references/submit/*.txt filter=lfs diff=lfs merge=lfs -text