Skip to content

Commit 7a53178

Browse files
committed
#12 - Create file with annotations of up to a thousand articles
1 parent e4b9245 commit 7a53178

File tree

1 file changed

+53
-29
lines changed

1 file changed

+53
-29
lines changed

export_results.py

+53-29
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import asyncio
1515
import json
1616
import os
17+
import random
18+
import string
1719
import sqlalchemy as sa
1820

1921
from aiopg.sa import create_engine
@@ -24,6 +26,41 @@
2426
load_dotenv()
2527

2628

29+
async def create_json_file(annotations):
    """
    Creates the Json file that will be used by Europe PMC

    The file is written inside the "json_files" directory (created when it
    does not exist yet) as "RNAcentral_annotations_<16 random characters>".
    Its content is a JSON array with one serialised element of
    ``annotations`` per line.

    :param annotations: list of results
    :return: none
    """
    # random string to use in filename
    random_char = "".join(random.choices(string.ascii_uppercase + string.digits, k=16))

    # serialise before touching the filesystem, so a row that cannot be
    # encoded does not leave an empty file behind
    rows = ",\n".join(json.dumps(row, ensure_ascii=False) for row in annotations)

    # open() does not create missing parent directories
    os.makedirs("json_files", exist_ok=True)

    with open("json_files/" + "RNAcentral_annotations_" + random_char, "w", encoding="utf-8") as outfile:
        outfile.write("[" + rows + "]\n")
40+
41+
42+
async def create_annotation(json_obj, exact, section, job_id, urs):
    """
    Function to add annotation

    Builds one annotation, appends it to ``json_obj["anns"]`` in place and
    returns it.

    :param json_obj: json object
    :param exact: sentence
    :param section: section from which the sentence was taken
    :param job_id: identifiers (ids), gene names or synonyms
    :param urs: URS related to the job_id
    :return: new annotation
    """
    annotation = {
        "exact": exact,
        "section": section,
        "tags": [
            {
                "name": job_id,
                "uri": "https://rnacentral.org/rna/" + urs.upper()
            }
        ]
    }
    # list.append() returns None, so append first and return the dict itself
    json_obj["anns"].append(annotation)
    return annotation
62+
63+
2764
async def export_results():
2865
"""
2966
Function to export results to Europe PMC.
@@ -36,6 +73,9 @@ async def export_results():
3673
host = os.getenv("host")
3774
password = os.getenv("pass")
3875

76+
# list of annotations
77+
annotations = []
78+
3979
async with create_engine(user=user, database=database, host=host, password=password) as engine:
4080
# get list of articles
4181
query = (sa.select([Article.c.pmcid, Article.c.title]).select_from(Article).where(~Article.c.retracted)) # noqa
@@ -100,14 +140,7 @@ async def export_results():
100140

101141
# add annotation in case the article title contains the job_id
102142
if "urs" in result and result["id_in_title"]:
103-
json_obj["anns"].append({
104-
"exact": item["title"],
105-
"section": "title",
106-
"tags": [{
107-
"name": result["job_id"],
108-
"uri": "https://rnacentral.org/rna/" + result["urs"].upper()
109-
}]
110-
})
143+
await create_annotation(json_obj, item["title"], "title", result["job_id"], result["urs"])
111144

112145
# add annotation if the abstract contains the job_id
113146
if "urs" in result and result["id_in_abstract"]:
@@ -117,17 +150,10 @@ async def export_results():
117150
WHERE result_id=:result_id ORDER BY length(sentence) DESC LIMIT 1'''
118151
)
119152
async for row in connection.execute(abstract_sql, result_id=result["id"]):
120-
abstract_sentence = row.sentence
121-
122-
if "found in an image, table or supplementary material" not in abstract_sentence:
123-
json_obj["anns"].append({
124-
"exact": abstract_sentence,
125-
"section": "abstract",
126-
"tags": [{
127-
"name": result["job_id"],
128-
"uri": "https://rnacentral.org/rna/" + result["urs"].upper()
129-
}]
130-
})
153+
abs_sentence = row.sentence
154+
155+
if "found in an image, table or supplementary material" not in abs_sentence:
156+
await create_annotation(json_obj, abs_sentence, "abstract", result["job_id"], result["urs"])
131157

132158
# add annotation if the body of the article contains the job_id
133159
if "urs" in result and result["id_in_body"]:
@@ -140,18 +166,16 @@ async def export_results():
140166
body_sentence = row.sentence
141167

142168
if "found in an image, table or supplementary material" not in body_sentence:
143-
json_obj["anns"].append({
144-
"exact": body_sentence,
145-
"section": "body",
146-
"tags": [{
147-
"name": result["job_id"],
148-
"uri": "https://rnacentral.org/rna/" + result["urs"].upper()
149-
}]
150-
})
169+
await create_annotation(json_obj, body_sentence, "body", result["job_id"], result["urs"])
151170

152171
if json_obj["anns"]:
153-
with open("json_files/" + item["pmcid"] + ".json", "w") as outfile:
154-
outfile.write(json.dumps(json_obj, ensure_ascii=False, indent=4))
172+
# add annotation
173+
annotations.append(json_obj)
174+
175+
# every file must have less than 10000 rows, where each row represents an individual
176+
# article with all associated annotations.
177+
for i in range(0, len(annotations), 9999):
178+
await create_json_file(annotations[i:i + 9999])
155179

156180

157181
if __name__ == "__main__":

0 commit comments

Comments
 (0)