Commit b0db47f

Merge pull request #151 from RNAcentral/dev
Update master with changes from dev for Release 21
2 parents 97d8900 + 657f542 commit b0db47f

175 files changed (+4956 / -788 lines)


Diff for: .editorconfig (+3)

@@ -34,3 +34,6 @@ indent_size = 2
 [*.yaml]
 indent_style = space
 indent_size = 2
+
+[*.nf]
+indent_size = 2

Diff for: .gitattributes (+1)

@@ -0,0 +1 @@
+workflows/references/submit/*.txt filter=lfs diff=lfs merge=lfs -text

Diff for: .github/workflows/main.yaml (+75)

@@ -0,0 +1,75 @@
+# Thid workflow will build and push the import pipeline container.
+# the plan later will be to include unit tests as well
+
+
+name: Building Pipeline Containers
+
+on:
+  push:
+    branches:
+      'dev'
+jobs:
+
+  starting-notification:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Intital notification
+        uses: rtCamp/action-slack-notify@v2
+        env:
+          SLACK_MESSAGE: 'Creating new pipeline image in docker hub'
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          MSG_MINIMAL: true
+
+  create-docker-image:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: docker login
+        env:
+          DOCKER_USER: ${{ secrets.DOCKER_USER }}
+          DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
+        run: docker login -u $DOCKER_USER -p $DOCKER_PASSWORD
+
+      - name: docker build
+        run: docker build -f Dockerfile -t rnacentral/rnacentral-import-pipeline .
+
+      - name: docker push
+        run: docker push rnacentral/rnacentral-import-pipeline
+
+  finished-notification:
+    needs:
+      - create-docker-image
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Finished notification
+        uses: rtCamp/action-slack-notify@v2
+        env:
+          SLACK_MESSAGE: 'New pipeline image pushed to docker hub'
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          MSG_MINIMAL: true
+
+  singularity-conversion:
+    needs:
+      - create-docker-image
+    uses: rnacentral/rnacentral-import-pipeline/.github/workflows/singularity.yaml@dev
+    secrets: inherit
+
+
+  finished-singularity:
+    needs:
+      - singularity-conversion
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Finished notification
+        uses: rtCamp/action-slack-notify@v2
+        env:
+          SLACK_MESSAGE: 'New singularity image pushed to ghcr'
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          MSG_MINIMAL: true

Diff for: .github/workflows/singularity.yaml (+25)

@@ -0,0 +1,25 @@
+# This workflow runs the conversion to singularity and stores the result in the
+# ghcr so we can pull it easier
+
+name: Singularity Build
+on: workflow_call
+
+
+jobs:
+  run_conversion:
+    name: "Pull docker image and convert"
+    runs-on: ubuntu-latest
+
+    container:
+      image: quay.io/singularity/singularity:v3.8.1
+      options: --privileged
+
+    steps:
+      - name: "Pull image"
+        run: |
+          singularity pull --name rnacentral-rnacentral-import-pipeline-latest.sif docker://rnacentral/rnacentral-import-pipeline:latest
+
+      - name: "Push to ghcr"
+        run: |
+          echo ${{ secrets.GITHUB_TOKEN }} | singularity remote login -u ${{ secrets.GHCR_USERNAME }} --password-stdin oras://ghcr.io
+          singularity push rnacentral-rnacentral-import-pipeline-latest.sif oras://ghcr.io/${GITHUB_REPOSITORY}:latest

Diff for: .gitignore (+5)

@@ -101,3 +101,8 @@ stubs
 .envrc
 workflows/references/results
 workflows/references/metadata
+workflows/references/backup
+workflows/references/submit/previous-release
+workflows/references/manually_annotated/from*
+workflows/references/manually_annotated/results
+singularity/*

Diff for: .pre-commit-config.yaml (+19, -2)

@@ -1,11 +1,28 @@
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.2.0
+  rev: v4.3.0
   hooks:
   - id: trailing-whitespace
   - id: end-of-file-fixer
   - id: check-yaml
 - repo: https://github.com/psf/black
-  rev: 19.3b0
+  rev: 22.6.0
   hooks:
   - id: black
+- repo: https://github.com/pycqa/isort
+  rev: 5.10.1
+  hooks:
+  - id: isort
+    args: ["--profile", "black", "--filter-files"]
+    name: isort (python)
+# - repo: https://github.com/doublify/pre-commit-rust
+#   rev: v1.0
+#   hooks:
+#   - id: fmt
+#   - id: cargo-check
+#   - id: clippy
+- repo: https://github.com/python-poetry/poetry
+  rev: '1.2.0rc1'
+  hooks:
+  - id: poetry-check
+  # - id: poetry-lock

Diff for: Dockerfile (+3, -1)

@@ -1,4 +1,4 @@
-FROM python:3.7-buster
+FROM python:3.8-buster

 ENV RNA /rna

@@ -46,6 +46,7 @@ RUN apt-get install -y \
     unzip \
     wget

+
 # Install Infernal
 RUN \
     cd $RNA/ && \
@@ -94,6 +95,7 @@ RUN pip3 install -r $RNACENTRAL_IMPORT_PIPELINE/requirements.txt

 RUN python3 -m textblob.download_corpora

+
 WORKDIR /

 COPY openssl/openssl.cnf /etc/ssl/

Diff for: Makefile (+19, -7)

@@ -13,13 +13,25 @@ requirements-dev.txt: requirements-dev.in

 rust:
 	cargo build --release
-	cp target/release/json2fasta bin
-	cp target/release/split-ena bin
-	cp target/release/expand-urs bin
-	cp target/release/precompute bin
-	cp target/release/search-export bin
-	cp target/release/ftp-export bin
-	cp target/release/json2dfasta bin
+	mv -f target/release/json2fasta bin
+	mv -f target/release/split-ena bin
+	mv -f target/release/expand-urs bin
+	mv -f target/release/precompute bin
+	mv -f target/release/search-export bin
+	mv -f target/release/ftp-export bin
+	mv -f target/release/json2dfasta bin
+	mv -f target/release/expression-parse bin
+
+clean:
+	rm bin/json2fasta
+	rm bin/split-ena
+	rm bin/expand-urs
+	rm bin/precompute
+	rm bin/search-export
+	rm bin/ftp-export
+	rm bin/json2dfasta
+	rm bin/expression-parse
+	cargo clean

 docker: Dockerfile requirements.txt .dockerignore
 	docker build -t "$(docker)" .

Diff for: analyze.nf (+16)

@@ -7,13 +7,29 @@ include { genome_mapping } from './workflows/genome-mapping'
 include { r2dt } from './workflows/r2dt'
 include { rfam_scan } from './workflows/rfam-scan'

+include { slack_closure } from './workflows/utils/slack'
+include { slack_message } from './workflows/utils/slack'
+
 workflow analyze {
   take: ready
   emit: done
   main:
+    Channel.of("Starting analyze pipeline") | slack_message
     ready | (genome_mapping & rfam_scan & r2dt & cpat) | mix | collect | set { done }
 }

 workflow {
   analyze(Channel.of('ready'))
 }
+
+
+workflow.onComplete {
+  slack_closure("Analyze workflow completed")
+
+}
+
+workflow.onError {
+
+  slack_closure("Analyze workflow hit an error and crashed")
+
+}

Diff for: bin/check_ids.py (+62, -44)

@@ -26,10 +26,11 @@
 words.update(ignore_ids)
 special_char = re.compile('[@!#$%^&()<>?/\[\]\'}{~:]')
 nts = re.compile('^[acgu]+$')
+numbers_and_dash = re.compile('^\d+[\-]\d+$')  # do not use ids like 6-1, 260-1, etc


 def check_id(item):
-    if item.isnumeric() or item.lower() in words:
+    if item.isnumeric() or item.lower() in words or numbers_and_dash.search(item):
         result = None
     elif len(item) > 2 and not special_char.search(item) and not nts.search(item.lower()) and "\\" not in item:
         result = item
@@ -47,55 +48,72 @@ def main(database, filename, output):
     """
     Check ids and create file that will be used by RNAcentral-references.
     """
-    remove_dot = ["ensembl_gene", "ensembl_gencode_gene", "ensembl_metazoa_gene"]
-    split_on_comma = ["flybase_gene_synonym", "pombase_gene_synonym", "refseq_gene_synonym", "hgnc_gene_synonym"]
+    remove_dot = ["ensembl", "ensembl_gencode", "ensembl_metazoa"]
+    split_on_comma = ["flybase", "hgnc", "pombase", "refseq"]
+    rfam_ignore = [
+        "30_255", "30_292", "5S_rRNA", "5_8S_rRNA", "6A", "6S", "7SK", "C4", "CRISPR-DR10", "CRISPR-DR11",
+        "CRISPR-DR12", "CRISPR-DR13", "CRISPR-DR14", "CRISPR-DR15", "CRISPR-DR16", "CRISPR-DR17", "CRISPR-DR18",
+        "CRISPR-DR19", "CRISPR-DR2", "CRISPR-DR20", "CRISPR-DR21", "CRISPR-DR22", "CRISPR-DR23", "CRISPR-DR24",
+        "CRISPR-DR25", "CRISPR-DR26", "CRISPR-DR27", "CRISPR-DR28", "CRISPR-DR29", "CRISPR-DR3", "CRISPR-DR30",
+        "CRISPR-DR31", "CRISPR-DR32", "CRISPR-DR33", "CRISPR-DR34", "CRISPR-DR35", "CRISPR-DR36", "CRISPR-DR37",
+        "CRISPR-DR38", "CRISPR-DR39", "CRISPR-DR4", "CRISPR-DR40", "CRISPR-DR41", "CRISPR-DR42", "CRISPR-DR43",
+        "CRISPR-DR44", "CRISPR-DR45", "CRISPR-DR46", "CRISPR-DR47", "CRISPR-DR48", "CRISPR-DR49", "CRISPR-DR5",
+        "CRISPR-DR50", "CRISPR-DR51", "CRISPR-DR52", "CRISPR-DR53", "CRISPR-DR54", "CRISPR-DR55", "CRISPR-DR56",
+        "CRISPR-DR57", "CRISPR-DR58", "CRISPR-DR6", "CRISPR-DR60", "CRISPR-DR61", "CRISPR-DR62", "CRISPR-DR63",
+        "CRISPR-DR64", "CRISPR-DR65", "CRISPR-DR66", "CRISPR-DR7", "CRISPR-DR8", "CRISPR-DR9", "F6", "Hairpin",
+        "Hairpin-meta1", "Hairpin-meta2", "Hatchet", "P1", "P10", "P11", "P13", "P14", "P15", "P17", "P18", "P2", "P24",
+        "P26", "P27", "P31", "P33", "P34", "P35", "P36", "P37", "P4", "P5", "P6", "P8", "P9", "ROSE", "S35", "S414",
+        "S774", "S808", "SAM", "SL1", "SL2", "U1", "U11", "U12", "U1_yeast", "U2", "U3", "U4", "U4atac", "U5", "U54",
+        "U6", "U6atac", "U7", "U8", "VA", "csRNA", "drum", "g2", "pRNA", "sar", "sul1", "t44", "tRNA", "tRNA-Sec",
+        "tmRNA", "tp2", "tracrRNA"
+    ]

     with open(filename, 'r') as input_file:
         with open(output, 'w') as output_file:
             while line := input_file.readline():
                 line = line.rstrip()
                 line = line.split('|')
-
-                if len(line) == 4:
-                    get_gene = line[0]
-                    get_primary_id = line[1]
-                    urs = line[2]
-                    taxid = line[3]
-
-                    # remove "."
-                    if database in remove_dot and "." in get_gene:
-                        get_gene = get_gene.split('.')[0]
-
-                    # split on ","
-                    gene_results = []
-                    if database in split_on_comma:
-                        gene_list = get_gene.split(',')
-                        for item in gene_list:
-                            item = check_id(item)
-                            if item:
-                                gene_results.append(item)
-
-                    if gene_results:
-                        primary_id = check_id(get_primary_id)
-                        for gene in gene_results:
-                            if gene and primary_id and gene != primary_id:
-                                output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
-                    else:
-                        gene = check_id(get_gene)
-                        primary_id = check_id(get_primary_id)
-                        if gene and primary_id and gene != primary_id:
-                            output_file.write(gene + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
-
-                else:
-                    get_primary_id = line[0]
-                    urs = line[1]
-                    taxid = line[2]
-
-                    # check if it is a valid id
-                    primary_id = check_id(get_primary_id)
-
-                    if primary_id:
-                        output_file.write(primary_id + '|' + urs + '_' + taxid + '\n')
+                urs = line[0]
+                taxid = line[1]
+                primary_id = check_id(line[2])
+                if primary_id and database in remove_dot and "." in primary_id:
+                    primary_id = primary_id.split('.')[0]
+
+                if primary_id and line[3:]:
+                    for item in line[3:]:
+                        if item:
+                            get_id = item
+                        else:
+                            continue
+
+                        # ignore some optional_id from Rfam
+                        if database == "rfam" and get_id in rfam_ignore:
+                            output_file.write('|' + primary_id + '|' + urs + '_' + taxid + '\n')
+                            continue
+
+                        # remove "."
+                        if database in remove_dot and "." in get_id:
+                            get_id = get_id.split('.')[0]
+
+                        # split on ","
+                        results = []
+                        if database in split_on_comma:
+                            list_of_ids = get_id.split(',')
+                            for elem in list_of_ids:
+                                elem = check_id(elem)
+                                if elem:
+                                    results.append(elem)
+
+                        if results:
+                            for db_id in results:
+                                if db_id != primary_id:
+                                    output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
+                        else:
+                            db_id = check_id(get_id)
+                            if db_id and db_id != primary_id:
+                                output_file.write(db_id + '|' + primary_id + '|' + urs + '_' + taxid + '\n')
+                elif primary_id:
+                    output_file.write(primary_id + '|' + urs + '_' + taxid + '\n')


 if __name__ == '__main__':
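
Note on the reworked bin/check_ids.py: each input line is now read as a pipe-separated record (line[0] is the URS, line[1] the taxid, line[2] the primary id, line[3:] any optional ids), and candidate ids are filtered through check_id before being written out. The sketch below is a minimal, self-contained illustration of those filtering rules only; the COMMON_WORDS set and the sample ids are hypothetical stand-ins for the word/ignore list the real script builds, and the function body is simplified relative to the script.

import re

# Hypothetical stand-in for the English-word/ignore list used by check_ids.py.
COMMON_WORDS = {"gene", "protein", "human"}

SPECIAL_CHAR = re.compile(r"[@!#$%^&()<>?/\[\]'}{~:]")
NTS = re.compile(r"^[acgu]+$")                # plain nucleotide strings are not useful ids
NUMBERS_AND_DASH = re.compile(r"^\d+-\d+$")   # new in this commit: reject ids like 6-1, 260-1


def check_id(item):
    """Return the id if it looks usable for literature search, otherwise None."""
    if item.isnumeric() or item.lower() in COMMON_WORDS or NUMBERS_AND_DASH.search(item):
        return None
    if len(item) > 2 and not SPECIAL_CHAR.search(item) and not NTS.search(item.lower()) and "\\" not in item:
        return item
    return None


for candidate in ["260-1", "acgu", "42", "SCARNA10"]:
    print(candidate, "->", check_id(candidate))  # only SCARNA10 survives the filters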
