30
30
# # add caching of already searched terms
31
31
# # add default values for functions
32
32
33
-
34
- # # PARANOID SETUP
35
- # # delete venv
36
- # # purge pip cache
37
- # # create new venv and enter it
38
- # # install wheel
39
- # # install packages from requirements file
40
-
33
+ # TODO: the setting below silences the pandas warning
34
+ # SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame
35
+ # but the assignments that trigger it should really be fixed instead of hidden
36
+ pd .options .mode .chained_assignment = None # default='warn'
41
37
42
38
logger = logging .getLogger (__name__ )
43
39
click_log .basic_config (logger )
@@ -106,7 +102,7 @@ def ols_term_search(term, chars_to_whiteout, ontology_param, qf_param, rowcount_
106
102
'rows=' + str (rowcount_param ) + '&' + \
107
103
qf_param
108
104
109
- # logger.debug(request_string)
105
+ logger .debug (request_string )
110
106
111
107
# this gets matching terms but doesn't show why they matched
112
108
response_param = session_param .get (request_string )
@@ -218,25 +214,18 @@ def get_ols_term_annotations(iri_param, ontology_param, session_param, ols_terms
218
214
help = """how much of a cosine distance will you tolerate
219
215
when comparing an enum name to a term lable or synonym?""" ,
220
216
default = 0.05 , show_default = True )
221
- # looks like click already provides -v, --verbosity LVL
222
- @click .option ('--log_level' ,
223
- help = "at what level do you want to see logging messages?" ,
224
- default = 'INFO' , type = click .Choice (['DEBUG' , 'INFO' , 'WARNING' , 'ERROR' , 'CRITICAL' ]), show_default = True )
225
217
@click .option ('--query_field_string' ,
226
218
help = """do you want to define a custom list of fields to search in?
227
219
The default settings work well in most cases.""" ,
228
220
default = '' , show_default = True )
229
221
@click .option ('--test_sample_size' ,
230
222
help = """if greater than 0, the enum name list will be samples at this size before mapping.""" ,
231
223
default = 0 , show_default = True )
232
- def make_iot_yaml (modelfile , all_mappings_fn , requested_enum_name , whiteout_chars , ontology_string ,
233
- ols_search_base_url , ols_terms_based_url , desired_row_count , shingle_size , max_cosine ,
234
- overwrite_meaning , log_level , query_field_string , test_sample_size ):
224
+ def enum_annotator (modelfile , all_mappings_fn , requested_enum_name , whiteout_chars , ontology_string ,
225
+ ols_search_base_url , ols_terms_based_url , desired_row_count , shingle_size , max_cosine ,
226
+ overwrite_meaning , query_field_string , test_sample_size ):
235
227
# show entire width of data frames
236
228
pd .set_option ('display.expand_frame_repr' , False )
237
- # DEBUG, INFO, WARNING, ERROR, or CRITICAL
238
- # allow log messages to be sent to a file, too?
239
- logger .setLevel (log_level )
240
229
241
230
# GLOBALS within this method
242
231
blank_row = {'title' : '' , 'id' : '' , 'iri' : '' , 'is_defining_ontology' : '' ,
@@ -255,13 +244,13 @@ def make_iot_yaml(modelfile, all_mappings_fn, requested_enum_name, whiteout_char
255
244
256
245
requested_pvs_names = get_pv_names (requested_pvs_obj )
257
246
requested_pvs_names .sort ()
258
- # logger.debug(requested_pvs_names)
247
+ logger .debug (requested_pvs_names )
259
248
260
249
ontologies_phrased = make_ontolgy_phrase (ontology_string )
261
- # logger.debug(ontologies_phrased)
250
+ logger .debug (ontologies_phrased )
262
251
263
252
qf_phrased = make_qf_phrase (query_field_string )
264
- # logger.debug(qf_phrased)
253
+ logger .debug (qf_phrased )
265
254
266
255
cosine_obj = make_cosine_obj (shingle_size )
267
256
# logger.debug(cosine_obj)
@@ -293,13 +282,15 @@ def make_iot_yaml(modelfile, all_mappings_fn, requested_enum_name, whiteout_char
293
282
# # could look at growth in enum_name_mappings
294
283
295
284
enum_name_mapping_frame = enum_name_mappings .get ()
285
+ # logger.debug(enum_name_mapping_frame)
296
286
297
287
term_and_source = enum_name_mapping_frame .loc [enum_name_mapping_frame ['raw_query' ].eq (pv_name )]
298
288
299
289
term_and_source = term_and_source [["iri" , "ontology_name" ]]
300
290
term_and_source .drop_duplicates (inplace = True )
301
291
term_and_source = term_and_source .loc [~ term_and_source ['iri' ].eq ("" )]
302
292
term_and_source .sort_values (["iri" , "ontology_name" ], inplace = True )
293
+ logger .debug (term_and_source )
303
294
304
295
term_and_source = term_and_source .to_dict (orient = "records" )
305
296
@@ -314,8 +305,11 @@ def make_iot_yaml(modelfile, all_mappings_fn, requested_enum_name, whiteout_char
314
305
suffixes = ('_term' , '_ano' ))
315
306
316
307
for_str_dist = raw_through_annotations [["tidied_query" , "name" ]]
308
+ # 20211215 0912
309
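+ # NOTE(review): the column assignments below appear to trigger SettingWithCopyWarning ("A value is trying to be set on a copy of a slice from a DataFrame"), since for_str_dist is a column subset of raw_through_annotations - confirm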
317
310
for_str_dist ["tidied_query_lc" ] = for_str_dist ["tidied_query" ].str .lower ()
318
311
for_str_dist ["name_lc" ] = for_str_dist ["name" ].str .lower ()
312
+ logger .debug (for_str_dist )
319
313
320
314
# favoring simplicity over efficiency
321
315
# ie may be string-comparing some duplicates
@@ -344,17 +338,18 @@ def make_iot_yaml(modelfile, all_mappings_fn, requested_enum_name, whiteout_char
344
338
for_str_dist = pd .DataFrame (new_pair_list )
345
339
for_str_dist .drop (labels = ["tidied_query_lc" , "name_lc" ], axis = 1 , inplace = True )
346
340
347
- # logger. debug(for_str_dist)
348
-
341
+ # was debug
342
+ logger . debug ( for_str_dist )
349
343
raw_through_dist = raw_through_annotations .merge (for_str_dist , how = "left" , on = ["tidied_query" , "name" ])
350
-
351
344
all_mappings_frame = []
352
345
353
346
new_enum = linkml_runtime .linkml_model .EnumDefinition (name = requested_enum_name )
354
347
# logger.info(new_enum)
355
348
349
+ # looping inside the same loop ?!
356
350
for i in requested_pvs_names :
357
- # logger.info(i)
351
+ # todo unnest loop?
352
+ logger .debug (i )
358
353
ce = requested_pvs_obj [i ]
359
354
cr = raw_through_dist .loc [raw_through_dist ["raw_query" ].eq (i )]
360
355
all_mappings_frame .append (cr )
@@ -365,6 +360,7 @@ def make_iot_yaml(modelfile, all_mappings_fn, requested_enum_name, whiteout_char
365
360
with_min_row_count = len (with_min .index )
366
361
if with_min_row_count > 0 :
367
362
with_min = with_min .drop (labels = ['xrefs' ], axis = 1 )
363
+ with_min ['description' ] = str (with_min ['description' ])
368
364
with_min .drop_duplicates (inplace = True )
369
365
deduped_row_count = len (with_min .index )
370
366
# # I'm surprised that there aren't any 2+ equally good mappings here
@@ -388,24 +384,29 @@ def make_iot_yaml(modelfile, all_mappings_fn, requested_enum_name, whiteout_char
388
384
new_enum .permissible_values [i ] = ce
389
385
390
386
all_mappings_frame = pd .concat (all_mappings_frame )
391
- all_mappings_frame .drop (labels = "xrefs" , axis = 1 , inplace = True )
387
+
388
+ all_mappings_frame .to_csv ("all_mappings_frame.tsv" , sep = "\t " , index = False )
389
+
390
+ all_mappings_frame ['description' ] = str (all_mappings_frame ['description' ])
391
+ all_mappings_frame ['xrefs' ] = str (all_mappings_frame ['xrefs' ])
392
+
392
393
all_mappings_frame .drop_duplicates (inplace = True )
394
+
393
395
all_mappings_frame .to_csv (all_mappings_fn , sep = "\t " , index = False )
394
396
395
397
gs_tq_vc = all_mappings_frame ["raw_query" ].value_counts ()
398
+ logger .info ("Number of hits for each term, regardless of quality. Doesn't show terms with 0 hits." )
396
399
logger .info (gs_tq_vc )
397
400
398
401
current_schema .enums [requested_enum_name ] = new_enum
399
402
400
403
string_dumped_schema = yaml_dumper .dumps (current_schema )
401
404
402
- logger .debug (string_dumped_schema )
403
-
405
+ # TODO: decide whether the dumped schema should go to STDOUT or to an output file
404
406
print (string_dumped_schema )
405
-
406
- # with open(xxx, 'w') as file:
407
- # documents = yaml.safe_dump(string_dumped_schema, file)
407
+ # # with open(xxx, 'w') as file:
408
+ # # documents = yaml.safe_dump(string_dumped_schema, file)
408
409
409
410
410
411
if __name__ == '__main__' :
411
- make_iot_yaml ()
412
+ enum_annotator ()
0 commit comments