Skip to content

Commit 0746a84

Browse files
authored
Merge branch 'dev' into cleanup-text-mining
2 parents a7f926f + 5b2a7a8 commit 0746a84

File tree

15 files changed

+766
-611
lines changed

15 files changed

+766
-611
lines changed

bin/litscan-get-articles.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,13 @@ def create_xml_file(results, directory):
8181
@click.argument('directory')
8282
def main(database, directory):
8383
"""
84-
Get the data that will be used by the search index
84+
Get the data that will be used by the search index.
85+
I tried to fetch all data in one query (query available at the end of this
86+
file), but that requires a large amount of RAM. A second approach was made
87+
using two queries (the second query was to fetch organism data), but that
88+
still requires 65GB of RAM. This task usually takes about 3 hours and for
89+
now it will stay like this.
90+
8591
:param database: params to connect to the db
8692
:param directory: directory to store xml files
8793
:return: None
@@ -173,3 +179,21 @@ def main(database, directory):
173179

174180
if __name__ == "__main__":
175181
main()
182+
183+
# cursor.execute("""
184+
# SELECT
185+
# a.pmcid, a.title, a.abstract, a.author, a.pmid, a.doi, a.year, a.journal, a.score, a.cited_by, a.type,
186+
# r.id AS result_id, r.job_id, r.id_in_title, r.id_in_abstract, r.id_in_body,
187+
# j.display_id,
188+
# abs_s.sentence AS abstract_sentence,
189+
# bod_s.sentence AS body_sentence,
190+
# ma.urs AS manually_annotated
191+
# FROM litscan_article a
192+
# LEFT JOIN litscan_result r ON r.pmcid = a.pmcid
193+
# LEFT JOIN litscan_job j ON r.job_id = j.job_id
194+
# LEFT JOIN (SELECT result_id, sentence FROM litscan_abstract_sentence ORDER BY length(sentence) DESC LIMIT 1) abs_s ON abs_s.result_id = r.id
195+
# LEFT JOIN (SELECT result_id, sentence FROM litscan_body_sentence ORDER BY length(sentence) DESC LIMIT 1) bod_s ON bod_s.result_id = r.id
196+
# LEFT JOIN litscan_manually_annotated ma ON ma.pmcid = a.pmcid
197+
# WHERE a.retracted IS NOT TRUE;
198+
# """)
199+
# rows = cursor.fetchall()

bin/litscan-get-unique-ids.sh

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,17 @@
44
file=$1
55
database=$2
66

7-
# read file line by line
8-
while IFS= read -r line; do
9-
IFS=$"|"
10-
tmp=($line)
11-
if [[ ${#tmp[*]} = 2 ]]; then
12-
job_id="${tmp[0]}"
13-
urs="${tmp[1]}"
14-
else
15-
job_id="${tmp[0]}"
16-
primary_id="${tmp[1]}"
17-
urs="${tmp[2]}"
18-
fi
7+
awk -F'|' '{
8+
if (NF == 2) {
9+
job_id = $1;
10+
urs = $2;
11+
} else {
12+
job_id = $1;
13+
primary_id = $2;
14+
urs = $3;
15+
}
1916
20-
if [[ -n "${job_id}" ]]; then
21-
echo ${job_id} >> ${database}_all_ids.txt
22-
fi
23-
24-
if [[ -n "${primary_id}" ]]; then
25-
echo ${primary_id} >> ${database}_all_ids.txt
26-
fi
27-
28-
if [[ -n "${urs}" ]]; then
29-
echo ${urs} >> ${database}_all_ids.txt
30-
fi
31-
done < ${file}
32-
33-
# create file with unique ids
34-
cat ${database}_all_ids.txt | sort | uniq > ${database}_ids.txt
17+
if (job_id) print job_id;
18+
if (primary_id) print primary_id;
19+
if (urs) print urs;
20+
}' "$file" | LC_ALL=C sort -u > "${database}_ids.txt"

bin/litscan-upload-ids.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ function submitJob
1717
curl -X POST \
1818
-H "Content-Type:application/json" \
1919
-d "{\"id\": \"${job_id}\"}" \
20-
http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file};
20+
http://45.88.80.122:8080/api/submit-job
2121
}
2222

2323
# loop through the file

poetry.lock

Lines changed: 440 additions & 380 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,12 @@ scikit-learn = "^1.1.3"
3030
semver = "^2.13.0"
3131
slack_sdk = "^3.19.4"
3232
sqlitedict = "^2.0.0"
33-
TatSu = "5.8.2"
33+
TatSu = "5.10.6"
34+
textblob = "0.15.3"
3435
throttler = "^1.2.0"
3536
nltk = "^3.8.1"
3637
openpyxl = "^3.0.10"
37-
pybedtools = {git = "https://github.com/afg1/pybedtools.git"}
38+
pybedtools = "^0.10.0"
3839
psycopg2-binary = "^2.9.7"
3940

4041
[tool.poetry.group.dev.dependencies]

rnacentral_pipeline/rnacentral/ftp_export/id_mapping.py

Lines changed: 54 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -14,71 +14,68 @@
1414
"""
1515

1616
import csv
17+
import re
18+
import typing as ty
1719

18-
from rnacentral_pipeline import psql
19-
20-
21-
def gene(result):
22-
"""
23-
Convert the gene name into a standardized format.
24-
"""
25-
26-
if result["database"] == "ENSEMBL":
27-
return result["optional_id"]
20+
from attr import frozen
2821

29-
if result["database"] == "MIRBASE":
30-
return result["optional_id"]
31-
32-
if result["rna_type"] == "piRNA" and result["database"] == "ENA":
33-
return result["product"]
34-
35-
name = result["gene"] or ""
36-
name = name.replace("\t", " ")
37-
return name
38-
39-
40-
def accession(result):
41-
"""
42-
Produce the accession for the result. This will compute the accession
43-
depending on the database.
44-
"""
45-
46-
if result["database"] == "ENA" or result["database"] == "HGNC":
47-
return result["accession"]
48-
if result["database"] == "PDBE":
49-
return "%s_%s" % (result["external_id"], result["optional_id"])
50-
return result["external_id"]
51-
52-
53-
def database(result):
54-
"""
55-
Normalize the database name.
56-
"""
57-
58-
if result["database"] == "PDBE":
59-
return "PDB"
60-
return result["database"]
22+
from rnacentral_pipeline import psql
6123

6224

63-
def as_entry(result):
64-
"""
65-
Produce the final result list for writing.
66-
"""
67-
return [
68-
result["upi"],
69-
database(result),
70-
accession(result),
71-
result["taxid"],
72-
result["rna_type"],
73-
gene(result),
74-
]
25+
@frozen
26+
class IdMapping:
27+
upi: str
28+
accession: str
29+
taxid: int
30+
external_id: str
31+
optional_id: str
32+
rna_type: str
33+
gene: str
34+
product: str
35+
database: str
36+
37+
def entry(self) -> ty.List[str]:
38+
database = self.database
39+
accession = self.external_id
40+
gene = self.gene or ""
41+
gene = gene.replace("\t", " ")
42+
43+
if self.database == "PDBE":
44+
database = "PDB"
45+
accession = "%s_%s" % (self.external_id, self.optional_id)
46+
elif self.database == "HGNC":
47+
accession = self.accession
48+
elif self.database == "ENA":
49+
if self.rna_type == "piRNA":
50+
gene = self.product
51+
accession = self.accession
52+
elif self.database == "MIRBASE":
53+
gene = self.optional_id
54+
elif self.database == "ENSEMBL":
55+
gene = self.optional_id
56+
elif self.database == "RFAM":
57+
if self.rna_type == "pre_miRNA":
58+
acc_range, endpoints, _ = self.accession.split(":", 2)
59+
acc = re.sub(r"\.\d\d+$", "", acc_range)
60+
start, stop = endpoints.split("..", 1)
61+
gene = f"{acc}/{start}-{stop}"
62+
63+
return [
64+
self.upi,
65+
database,
66+
accession,
67+
str(self.taxid),
68+
self.rna_type,
69+
gene,
70+
]
7571

7672

7773
def generate_file(json_file, output):
7874
"""
7975
This will generate a TSV mapping file given the input TSV.
8076
"""
8177

82-
entries = psql.json_handler(json_file)
83-
data = map(as_entry, entries)
84-
csv.writer(output, delimiter="\t").writerows(data)
78+
writer = csv.writer(output, delimiter="\t")
79+
for entry in psql.json_handler(json_file):
80+
data = IdMapping(**entry)
81+
writer.writerow(data.entry())

rnacentral_pipeline/rnacentral/genome_mapping/igv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def check_url(file_list: List[str], name: str, path: str, assembly_id: str) -> D
7272

7373
def create_json(species: str, assembly_id: str, output: IO[str]) -> None:
7474
with ftp(FTP_SERVER) as conn:
75-
path = "pub/databases/RNAcentral/.genome-browser"
75+
path = "pub/databases/RNAcentral/.genome-browser-dev"
7676
conn.cwd(path)
7777
file_list = conn.nlst()
7878
name = f"{species}.{assembly_id}"

0 commit comments

Comments
 (0)