14
14
import asyncio
15
15
import json
16
16
import os
17
+ import random
18
+ import string
17
19
import sqlalchemy as sa
18
20
19
21
from aiopg .sa import create_engine
24
26
load_dotenv ()
25
27
26
28
29
async def create_json_file(annotations):
    """
    Creates the Json file that will be used by Europe PMC.

    The file is written into the "json_files" directory (created on demand)
    and is named "RNAcentral_annotations_" followed by 16 random uppercase
    letters/digits. Its content is a JSON array in which the serialised rows
    are separated by a comma and a line break.

    :param annotations: list of results
    :return: none
    """
    # make sure the output directory exists, otherwise open() raises
    # FileNotFoundError on a fresh checkout
    os.makedirs("json_files", exist_ok=True)

    # random string to use in filename
    random_char = "".join(random.choices(string.ascii_uppercase + string.digits, k=16))

    # serialise every row on its own; ensure_ascii=False keeps non-ASCII text readable
    rows = ",\n ".join(json.dumps(row, ensure_ascii=False) for row in annotations)

    # NOTE: this is a regular blocking write even though the function is a coroutine
    with open("json_files/" + "RNAcentral_annotations_" + random_char, "w", encoding="utf-8") as outfile:
        outfile.write("[" + rows + "]\n ")
40
+
41
+
42
async def create_annotation(json_obj, exact, section, job_id, urs):
    """
    Function to add annotation.

    Builds a new annotation, appends it to json_obj["anns"] and returns it.

    :param json_obj: json object; must already contain an "anns" list
    :param exact: sentence
    :param section: section from which the sentence was taken
    :param job_id: identifiers (ids), gene names or synonyms
    :param urs: URS related to the job_id
    :return: new annotation
    """
    annotation = {
        "exact": exact,
        "section": section,
        "tags": [
            {
                "name": job_id,
                # the URS is upper-cased before being appended to the RNAcentral URL
                "uri": "https://rnacentral.org/rna/" + urs.upper(),
            }
        ],
    }
    json_obj["anns"].append(annotation)

    # list.append() returns None, so return the annotation itself as documented
    return annotation
62
+
63
+
27
64
async def export_results ():
28
65
"""
29
66
Function to export results to Europe PMC.
@@ -36,6 +73,9 @@ async def export_results():
36
73
host = os .getenv ("host" )
37
74
password = os .getenv ("pass" )
38
75
76
+ # list of annotations
77
+ annotations = []
78
+
39
79
async with create_engine (user = user , database = database , host = host , password = password ) as engine :
40
80
# get list of articles
41
81
query = (sa .select ([Article .c .pmcid , Article .c .title ]).select_from (Article ).where (~ Article .c .retracted )) # noqa
@@ -100,14 +140,7 @@ async def export_results():
100
140
101
141
# add annotation in case the article title contains the job_id
102
142
if "urs" in result and result ["id_in_title" ]:
103
- json_obj ["anns" ].append ({
104
- "exact" : item ["title" ],
105
- "section" : "title" ,
106
- "tags" : [{
107
- "name" : result ["job_id" ],
108
- "uri" : "https://rnacentral.org/rna/" + result ["urs" ].upper ()
109
- }]
110
- })
143
+ await create_annotation (json_obj , item ["title" ], "title" , result ["job_id" ], result ["urs" ])
111
144
112
145
# add annotation if the abstract contains the job_id
113
146
if "urs" in result and result ["id_in_abstract" ]:
@@ -117,17 +150,10 @@ async def export_results():
117
150
WHERE result_id=:result_id ORDER BY length(sentence) DESC LIMIT 1'''
118
151
)
119
152
async for row in connection .execute (abstract_sql , result_id = result ["id" ]):
120
- abstract_sentence = row .sentence
121
-
122
- if "found in an image, table or supplementary material" not in abstract_sentence :
123
- json_obj ["anns" ].append ({
124
- "exact" : abstract_sentence ,
125
- "section" : "abstract" ,
126
- "tags" : [{
127
- "name" : result ["job_id" ],
128
- "uri" : "https://rnacentral.org/rna/" + result ["urs" ].upper ()
129
- }]
130
- })
153
+ abs_sentence = row .sentence
154
+
155
+ if "found in an image, table or supplementary material" not in abs_sentence :
156
+ await create_annotation (json_obj , abs_sentence , "abstract" , result ["job_id" ], result ["urs" ])
131
157
132
158
# add annotation if the body of the article contains the job_id
133
159
if "urs" in result and result ["id_in_body" ]:
@@ -140,18 +166,16 @@ async def export_results():
140
166
body_sentence = row .sentence
141
167
142
168
if "found in an image, table or supplementary material" not in body_sentence :
143
- json_obj ["anns" ].append ({
144
- "exact" : body_sentence ,
145
- "section" : "body" ,
146
- "tags" : [{
147
- "name" : result ["job_id" ],
148
- "uri" : "https://rnacentral.org/rna/" + result ["urs" ].upper ()
149
- }]
150
- })
169
+ await create_annotation (json_obj , body_sentence , "body" , result ["job_id" ], result ["urs" ])
151
170
152
171
if json_obj ["anns" ]:
153
- with open ("json_files/" + item ["pmcid" ] + ".json" , "w" ) as outfile :
154
- outfile .write (json .dumps (json_obj , ensure_ascii = False , indent = 4 ))
172
+ # add annotation
173
+ annotations .append (json_obj )
174
+
175
+ # every file must have less than 10000 rows, where each row represents an individual
176
+ # article with all associated annotations.
177
+ for i in range (0 , len (annotations ), 9999 ):
178
+ await create_json_file (annotations [i :i + 9999 ])
155
179
156
180
157
181
if __name__ == "__main__" :
0 commit comments