10
10
11
11
logger = logging .getLogger (__name__ )
12
12
13
- _BIOCHEM_FOLDER = ' Biochemistry'
13
+ _BIOCHEM_FOLDER = " Biochemistry"
14
14
15
15
ALIAS_CPD_IDENTIFIERS_ORG = {
16
16
"BiGG" : "bigg.metabolite" ,
@@ -139,13 +139,13 @@ def load_metabolites_from_df(
139
139
if cpd_id in structures :
140
140
if "SMILE" in structures [cpd_id ]:
141
141
smiles = structures [cpd_id ]["SMILE" ]
142
- aliases_annotation [' SMILE' ] = smiles
142
+ aliases_annotation [" SMILE" ] = smiles
143
143
if "InChI" in structures [cpd_id ]:
144
144
inchi = structures [cpd_id ]["InChI" ]
145
- aliases_annotation [' InChI' ] = inchi
145
+ aliases_annotation [" InChI" ] = inchi
146
146
if "InChIKey" in structures [cpd_id ]:
147
147
inchi_key = structures [cpd_id ]["InChIKey" ]
148
- aliases_annotation [' InChIKey' ] = inchi_key
148
+ aliases_annotation [" InChIKey" ] = inchi_key
149
149
inchi_key = None if pd .isna (inchi_key ) or len (inchi_key ) == 0 else inchi_key
150
150
other_names = set ()
151
151
if cpd_id in names :
@@ -193,31 +193,44 @@ def _load_aliases_df(df_aliases, seed_index=1, source_index=3, alias_id_index=2)
193
193
return aliases
194
194
195
195
196
- def _load_metabolites (database_path : str , aliases = None , names = None , structures = None ) -> dict :
196
+ def _load_metabolites (
197
+ database_path : str , aliases = None , names = None , structures = None
198
+ ) -> dict :
197
199
if aliases is None :
198
200
aliases = {}
199
201
if names is None :
200
202
names = {}
201
203
if structures is None :
202
204
structures = {}
203
205
metabolites = {}
204
- contents = os .listdir (f' { database_path } /{ _BIOCHEM_FOLDER } ' )
206
+ contents = os .listdir (f" { database_path } /{ _BIOCHEM_FOLDER } " )
205
207
for f in contents :
206
- if f .startswith (' compound_' ) and f .endswith (' .json' ):
207
- with open (f' { database_path } /{ _BIOCHEM_FOLDER } /{ f } ' , 'r' ) as fh :
208
+ if f .startswith (" compound_" ) and f .endswith (" .json" ):
209
+ with open (f" { database_path } /{ _BIOCHEM_FOLDER } /{ f } " , "r" ) as fh :
208
210
_compounds_data = json .load (fh )
209
211
for o in _compounds_data :
210
- if 'id' in o and o ['id' ]:
212
+ if "id" in o and o ["id" ]:
211
213
cpd_names = set ()
212
- if o ['id' ] in names :
213
- cpd_names |= names [o ['id' ]]
214
- cpd = ModelSEEDCompound2 (o ['id' ], o .get ('formula' ),
215
- o .get ('name' ), o .get ('charge' ), '' ,
216
- o .get ('abbreviation' ), cpd_names ,
217
- o .get ('mass' ), o .get ('deltag' ), o .get ('deltagerr' ),
218
- o .get ('is_core' ), o .get ('is_obsolete' ), None ,
219
- o .get ('pka' ), o .get ('pkb' ),
220
- o .get ('source' ))
214
+ if o ["id" ] in names :
215
+ cpd_names |= names [o ["id" ]]
216
+ cpd = ModelSEEDCompound2 (
217
+ o ["id" ],
218
+ o .get ("formula" ),
219
+ o .get ("name" ),
220
+ o .get ("charge" ),
221
+ "" ,
222
+ o .get ("abbreviation" ),
223
+ cpd_names ,
224
+ o .get ("mass" ),
225
+ o .get ("deltag" ),
226
+ o .get ("deltagerr" ),
227
+ o .get ("is_core" ),
228
+ o .get ("is_obsolete" ),
229
+ None ,
230
+ o .get ("pka" ),
231
+ o .get ("pkb" ),
232
+ o .get ("source" ),
233
+ )
221
234
if cpd .id in aliases :
222
235
cpd .annotation .update (aliases [cpd .id ])
223
236
if cpd .id in structures :
@@ -226,62 +239,83 @@ def _load_metabolites(database_path: str, aliases=None, names=None, structures=N
226
239
if len (v ) == 1 :
227
240
cpd .annotation [alias_type ] = list (v )[0 ]
228
241
else :
229
- logger .warning (f'multiple { alias_type } structures found for { cpd .id } ' )
242
+ logger .warning (
243
+ f"multiple { alias_type } structures found for { cpd .id } "
244
+ )
230
245
metabolites [cpd .id ] = cpd
231
246
else :
232
- print (' error' , o )
233
- #print(_compounds_data[0].keys())
247
+ print (" error" , o )
248
+ # print(_compounds_data[0].keys())
234
249
return metabolites
235
250
236
251
237
- def _load_reactions (database_path : str , metabolites : dict , aliases = None , names = None , ec_numbers = None ) -> (dict , dict ):
252
+ def _load_reactions (
253
+ database_path : str , metabolites : dict , aliases = None , names = None , ec_numbers = None
254
+ ) -> (dict , dict ):
238
255
if aliases is None :
239
256
aliases = {}
240
257
if names is None :
241
258
names = {}
242
259
if ec_numbers is None :
243
260
ec_numbers = {}
244
261
reactions = {}
245
- contents = os .listdir (f' { database_path } /{ _BIOCHEM_FOLDER } ' )
262
+ contents = os .listdir (f" { database_path } /{ _BIOCHEM_FOLDER } " )
246
263
metabolites_indexed = {}
247
264
for f in contents :
248
- if f .startswith (' reaction_' ) and f .endswith (' .json' ):
249
- with open (f' { database_path } /{ _BIOCHEM_FOLDER } /{ f } ' , 'r' ) as fh :
265
+ if f .startswith (" reaction_" ) and f .endswith (" .json" ):
266
+ with open (f" { database_path } /{ _BIOCHEM_FOLDER } /{ f } " , "r" ) as fh :
250
267
_reactions_data = json .load (fh )
251
268
for o in _reactions_data :
252
- if 'id' in o and o ['id' ]:
269
+ if "id" in o and o ["id" ]:
253
270
rxn_names = set ()
254
- if o ['id' ] in names :
255
- rxn_names |= names [o ['id' ]]
256
- lower_bound , upper_bound = get_reaction_constraints_from_direction (o .get ('reversibility' ))
257
- stoichiometry = o .get ('stoichiometry' )
271
+ if o ["id" ] in names :
272
+ rxn_names |= names [o ["id" ]]
273
+ (
274
+ lower_bound ,
275
+ upper_bound ,
276
+ ) = get_reaction_constraints_from_direction (
277
+ o .get ("reversibility" )
278
+ )
279
+ stoichiometry = o .get ("stoichiometry" )
258
280
reaction_metabolites = {}
259
281
for s in stoichiometry :
260
- cmp_token = s [' compartment' ]
261
- value = s [' coefficient' ]
262
- cpd = metabolites [s [' compound' ]]
282
+ cmp_token = s [" compartment" ]
283
+ value = s [" coefficient" ]
284
+ cpd = metabolites [s [" compound" ]]
263
285
cpd_index_id = f"{ cpd .id } _{ cmp_token } "
264
286
if cpd_index_id not in metabolites_indexed :
265
287
cpd_token = cpd .copy ()
266
288
cpd_token .id = f"{ cpd .id } _{ cmp_token } "
267
289
cpd_token .base_id = cpd .id
268
290
cpd_token .compartment = cmp_token
269
291
metabolites_indexed [cpd_index_id ] = cpd_token
270
- reaction_metabolites [metabolites_indexed [cpd_index_id ]] = value
271
- rxn = ModelSEEDReaction2 (o ['id' ], o .get ('name' ), '' , lower_bound , upper_bound ,
272
- '' , rxn_names ,
273
- o .get ('deltag' ), o .get ('deltagerr' ),
274
- o .get ('is_obsolete' ), None ,
275
- o .get ('status' ), o .get ('source' ))
292
+ reaction_metabolites [
293
+ metabolites_indexed [cpd_index_id ]
294
+ ] = value
295
+ rxn = ModelSEEDReaction2 (
296
+ o ["id" ],
297
+ o .get ("name" ),
298
+ "" ,
299
+ lower_bound ,
300
+ upper_bound ,
301
+ "" ,
302
+ rxn_names ,
303
+ o .get ("deltag" ),
304
+ o .get ("deltagerr" ),
305
+ o .get ("is_obsolete" ),
306
+ None ,
307
+ o .get ("status" ),
308
+ o .get ("source" ),
309
+ )
276
310
rxn .add_metabolites (reaction_metabolites )
277
311
if rxn .id in aliases :
278
312
rxn .annotation .update (aliases [rxn .id ])
279
313
if rxn .id in ec_numbers :
280
- rxn .annotation [' ec-code' ] = ec_numbers [rxn .id ]
314
+ rxn .annotation [" ec-code" ] = ec_numbers [rxn .id ]
281
315
metabolites [cpd .id ] = cpd
282
316
reactions [rxn .id ] = rxn
283
317
else :
284
- logger .error (f' failed to read reaction record { o } ' )
318
+ logger .error (f" failed to read reaction record { o } " )
285
319
286
320
return reactions , metabolites_indexed
287
321
@@ -389,7 +423,7 @@ def __init__(self, compounds: list, reactions: list, compound_tokens: list):
389
423
self .reactions = DictList ()
390
424
self .compounds += compounds
391
425
self .reactions += reactions
392
- self .reactions += compound_tokens
426
+ self .compound_tokens += compound_tokens
393
427
394
428
self .inchi_key_lookup = {}
395
429
self .metabolite_reactions = {}
@@ -399,12 +433,12 @@ def __init__(self, compounds: list, reactions: list, compound_tokens: list):
399
433
def _index_inchi (self ):
400
434
for m in self .compounds :
401
435
if m .inchi_key :
402
- f , s , p = m .inchi_key .split ('-' )
436
+ f , s , p = m .inchi_key .split ("-" )
403
437
if f not in self .inchi_key_lookup :
404
438
self .inchi_key_lookup [f ] = {}
405
439
if s not in self .inchi_key_lookup [f ]:
406
440
self .inchi_key_lookup [f ][s ] = set ()
407
- proton_pair = (m .id , p )
441
+ proton_pair = (m .id , p )
408
442
if proton_pair not in self .inchi_key_lookup [f ][s ]:
409
443
self .inchi_key_lookup [f ][s ].add (proton_pair )
410
444
@@ -415,7 +449,7 @@ def reactions_by_alias(self, alias, value):
415
449
pass
416
450
417
451
def find_compounds_by_inchi_key (self , inchi_key , exact = True ):
418
- f , s , p = inchi_key .split ('-' )
452
+ f , s , p = inchi_key .split ("-" )
419
453
if exact and f in self .inchi_key_lookup and s in self .inchi_key_lookup [f ]:
420
454
# x is tuple (cpd.id, protonation)
421
455
return [self .compounds .get_by_id (x [0 ]) for x in self .inchi_key_lookup [f ][s ]]
@@ -763,40 +797,75 @@ def from_local_old(path):
763
797
764
798
765
799
def from_local (database_path : str ):
766
- contents = os .listdir (f' { database_path } /Biochemistry/' )
767
- if ' compounds.tsv' in contents :
800
+ contents = os .listdir (f" { database_path } /Biochemistry/" )
801
+ if " compounds.tsv" in contents :
768
802
return from_local_old (database_path )
769
803
770
- compound_aliases_url = f'{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Compound_Aliases.txt'
771
- reaction_aliases_url = f'{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Reaction_Aliases.txt'
772
- compound_aliases = _load_aliases_df (pd .read_csv (compound_aliases_url , index_col = None , sep = '\t ' ))
773
- reaction_aliases = _load_aliases_df (pd .read_csv (reaction_aliases_url , index_col = None , sep = '\t ' ))
804
+ compound_aliases_url = (
805
+ f"{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Compound_Aliases.txt"
806
+ )
807
+ reaction_aliases_url = (
808
+ f"{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Reaction_Aliases.txt"
809
+ )
810
+ compound_aliases = _load_aliases_df (
811
+ pd .read_csv (compound_aliases_url , index_col = None , sep = "\t " )
812
+ )
813
+ reaction_aliases = _load_aliases_df (
814
+ pd .read_csv (reaction_aliases_url , index_col = None , sep = "\t " )
815
+ )
774
816
775
- compound_structures_url = f'{ database_path } /Biochemistry/Structures/Unique_ModelSEED_Structures.txt'
776
- compound_structures = _load_aliases_df (pd .read_csv (compound_structures_url , index_col = None , sep = '\t ' ),
777
- source_index = 2 , alias_id_index = 6 )
817
+ compound_structures_url = (
818
+ f"{ database_path } /Biochemistry/Structures/Unique_ModelSEED_Structures.txt"
819
+ )
820
+ compound_structures = _load_aliases_df (
821
+ pd .read_csv (compound_structures_url , index_col = None , sep = "\t " ),
822
+ source_index = 2 ,
823
+ alias_id_index = 6 ,
824
+ )
778
825
779
- compound_names_url = f'{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Compound_Names.txt'
780
- reaction_names_url = f'{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Reaction_Names.txt'
781
- compound_names = _load_aliases_df (pd .read_csv (compound_names_url , index_col = None , sep = '\t ' ))
782
- reaction_names = _load_aliases_df (pd .read_csv (reaction_names_url , index_col = None , sep = '\t ' ))
826
+ compound_names_url = (
827
+ f"{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Compound_Names.txt"
828
+ )
829
+ reaction_names_url = (
830
+ f"{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Reaction_Names.txt"
831
+ )
832
+ compound_names = _load_aliases_df (
833
+ pd .read_csv (compound_names_url , index_col = None , sep = "\t " )
834
+ )
835
+ reaction_names = _load_aliases_df (
836
+ pd .read_csv (reaction_names_url , index_col = None , sep = "\t " )
837
+ )
783
838
784
- reaction_ecs_url = f'{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Reaction_ECs.txt'
785
- reaction_ecs = _load_aliases_df (pd .read_csv (reaction_ecs_url , index_col = None , sep = '\t ' ))
839
+ reaction_ecs_url = (
840
+ f"{ database_path } /Biochemistry/Aliases/Unique_ModelSEED_Reaction_ECs.txt"
841
+ )
842
+ reaction_ecs = _load_aliases_df (
843
+ pd .read_csv (reaction_ecs_url , index_col = None , sep = "\t " )
844
+ )
786
845
787
846
# build metabolites unpack names
788
- metabolites = _load_metabolites (database_path , compound_aliases ,
789
- {k :v ['name' ] for k , v in compound_names .items ()},
790
- compound_structures )
847
+ metabolites = _load_metabolites (
848
+ database_path ,
849
+ compound_aliases ,
850
+ {k : v ["name" ] for k , v in compound_names .items ()},
851
+ compound_structures ,
852
+ )
791
853
792
854
# build reactions unpack names, ecs
793
- reactions , metabolite_tokens = _load_reactions (database_path , metabolites , reaction_aliases ,
794
- {k :v ['name' ] for k , v in reaction_names .items ()},
795
- {k :v ['Enzyme Class' ] for k , v in reaction_ecs .items ()})
796
- database = ModelSEEDDatabase (metabolites .values (), reactions .values (), metabolite_tokens .values ())
855
+ reactions , metabolite_tokens = _load_reactions (
856
+ database_path ,
857
+ metabolites ,
858
+ reaction_aliases ,
859
+ {k : v ["name" ] for k , v in reaction_names .items ()},
860
+ {k : v ["Enzyme Class" ] for k , v in reaction_ecs .items ()},
861
+ )
862
+ database = ModelSEEDDatabase (
863
+ metabolites .values (), reactions .values (), metabolite_tokens .values ()
864
+ )
797
865
798
866
return database
799
867
868
+
800
869
def get_names_from_df (df ):
801
870
names = {}
802
871
for t in df .itertuples ():
0 commit comments