Skip to content

Commit 11d4c70

Browse files
authored
Merge pull request #91 from Fxe/dev
bug fixes and python version
2 parents 28de76a + 29afcdd commit 11d4c70

File tree

8 files changed

+175
-92
lines changed

8 files changed

+175
-92
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
language: python
22
python:
3-
- 3.6
43
- 3.7
54
- 3.8
5+
- 3.9
66
before_install:
77
- python --version
88
- pip install -U pip

examples/Others/Biochem.ipynb

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,8 @@
1313
"cell_type": "markdown",
1414
"metadata": {},
1515
"source": [
16-
"### Load the database object from local github repository"
16+
"### Load the database object from local github repository\n",
17+
"https://github.com/ModelSEED/ModelSEEDDatabase"
1718
]
1819
},
1920
{

modelseedpy/biochem/modelseed_biochem.py

Lines changed: 136 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
logger = logging.getLogger(__name__)
1212

13-
_BIOCHEM_FOLDER = 'Biochemistry'
13+
_BIOCHEM_FOLDER = "Biochemistry"
1414

1515
ALIAS_CPD_IDENTIFIERS_ORG = {
1616
"BiGG": "bigg.metabolite",
@@ -139,13 +139,13 @@ def load_metabolites_from_df(
139139
if cpd_id in structures:
140140
if "SMILE" in structures[cpd_id]:
141141
smiles = structures[cpd_id]["SMILE"]
142-
aliases_annotation['SMILE'] = smiles
142+
aliases_annotation["SMILE"] = smiles
143143
if "InChI" in structures[cpd_id]:
144144
inchi = structures[cpd_id]["InChI"]
145-
aliases_annotation['InChI'] = inchi
145+
aliases_annotation["InChI"] = inchi
146146
if "InChIKey" in structures[cpd_id]:
147147
inchi_key = structures[cpd_id]["InChIKey"]
148-
aliases_annotation['InChIKey'] = inchi_key
148+
aliases_annotation["InChIKey"] = inchi_key
149149
inchi_key = None if pd.isna(inchi_key) or len(inchi_key) == 0 else inchi_key
150150
other_names = set()
151151
if cpd_id in names:
@@ -193,31 +193,44 @@ def _load_aliases_df(df_aliases, seed_index=1, source_index=3, alias_id_index=2)
193193
return aliases
194194

195195

196-
def _load_metabolites(database_path: str, aliases=None, names=None, structures=None) -> dict:
196+
def _load_metabolites(
197+
database_path: str, aliases=None, names=None, structures=None
198+
) -> dict:
197199
if aliases is None:
198200
aliases = {}
199201
if names is None:
200202
names = {}
201203
if structures is None:
202204
structures = {}
203205
metabolites = {}
204-
contents = os.listdir(f'{database_path}/{_BIOCHEM_FOLDER}')
206+
contents = os.listdir(f"{database_path}/{_BIOCHEM_FOLDER}")
205207
for f in contents:
206-
if f.startswith('compound_') and f.endswith('.json'):
207-
with open(f'{database_path}/{_BIOCHEM_FOLDER}/{f}', 'r') as fh:
208+
if f.startswith("compound_") and f.endswith(".json"):
209+
with open(f"{database_path}/{_BIOCHEM_FOLDER}/{f}", "r") as fh:
208210
_compounds_data = json.load(fh)
209211
for o in _compounds_data:
210-
if 'id' in o and o['id']:
212+
if "id" in o and o["id"]:
211213
cpd_names = set()
212-
if o['id'] in names:
213-
cpd_names |= names[o['id']]
214-
cpd = ModelSEEDCompound2(o['id'], o.get('formula'),
215-
o.get('name'), o.get('charge'), '',
216-
o.get('abbreviation'), cpd_names,
217-
o.get('mass'), o.get('deltag'), o.get('deltagerr'),
218-
o.get('is_core'), o.get('is_obsolete'), None,
219-
o.get('pka'), o.get('pkb'),
220-
o.get('source'))
214+
if o["id"] in names:
215+
cpd_names |= names[o["id"]]
216+
cpd = ModelSEEDCompound2(
217+
o["id"],
218+
o.get("formula"),
219+
o.get("name"),
220+
o.get("charge"),
221+
"",
222+
o.get("abbreviation"),
223+
cpd_names,
224+
o.get("mass"),
225+
o.get("deltag"),
226+
o.get("deltagerr"),
227+
o.get("is_core"),
228+
o.get("is_obsolete"),
229+
None,
230+
o.get("pka"),
231+
o.get("pkb"),
232+
o.get("source"),
233+
)
221234
if cpd.id in aliases:
222235
cpd.annotation.update(aliases[cpd.id])
223236
if cpd.id in structures:
@@ -226,62 +239,83 @@ def _load_metabolites(database_path: str, aliases=None, names=None, structures=N
226239
if len(v) == 1:
227240
cpd.annotation[alias_type] = list(v)[0]
228241
else:
229-
logger.warning(f'multiple {alias_type} structures found for {cpd.id}')
242+
logger.warning(
243+
f"multiple {alias_type} structures found for {cpd.id}"
244+
)
230245
metabolites[cpd.id] = cpd
231246
else:
232-
print('error', o)
233-
#print(_compounds_data[0].keys())
247+
print("error", o)
248+
# print(_compounds_data[0].keys())
234249
return metabolites
235250

236251

237-
def _load_reactions(database_path: str, metabolites: dict, aliases=None, names=None, ec_numbers=None) -> (dict, dict):
252+
def _load_reactions(
253+
database_path: str, metabolites: dict, aliases=None, names=None, ec_numbers=None
254+
) -> (dict, dict):
238255
if aliases is None:
239256
aliases = {}
240257
if names is None:
241258
names = {}
242259
if ec_numbers is None:
243260
ec_numbers = {}
244261
reactions = {}
245-
contents = os.listdir(f'{database_path}/{_BIOCHEM_FOLDER}')
262+
contents = os.listdir(f"{database_path}/{_BIOCHEM_FOLDER}")
246263
metabolites_indexed = {}
247264
for f in contents:
248-
if f.startswith('reaction_') and f.endswith('.json'):
249-
with open(f'{database_path}/{_BIOCHEM_FOLDER}/{f}', 'r') as fh:
265+
if f.startswith("reaction_") and f.endswith(".json"):
266+
with open(f"{database_path}/{_BIOCHEM_FOLDER}/{f}", "r") as fh:
250267
_reactions_data = json.load(fh)
251268
for o in _reactions_data:
252-
if 'id' in o and o['id']:
269+
if "id" in o and o["id"]:
253270
rxn_names = set()
254-
if o['id'] in names:
255-
rxn_names |= names[o['id']]
256-
lower_bound, upper_bound = get_reaction_constraints_from_direction(o.get('reversibility'))
257-
stoichiometry = o.get('stoichiometry')
271+
if o["id"] in names:
272+
rxn_names |= names[o["id"]]
273+
(
274+
lower_bound,
275+
upper_bound,
276+
) = get_reaction_constraints_from_direction(
277+
o.get("reversibility")
278+
)
279+
stoichiometry = o.get("stoichiometry")
258280
reaction_metabolites = {}
259281
for s in stoichiometry:
260-
cmp_token = s['compartment']
261-
value = s['coefficient']
262-
cpd = metabolites[s['compound']]
282+
cmp_token = s["compartment"]
283+
value = s["coefficient"]
284+
cpd = metabolites[s["compound"]]
263285
cpd_index_id = f"{cpd.id}_{cmp_token}"
264286
if cpd_index_id not in metabolites_indexed:
265287
cpd_token = cpd.copy()
266288
cpd_token.id = f"{cpd.id}_{cmp_token}"
267289
cpd_token.base_id = cpd.id
268290
cpd_token.compartment = cmp_token
269291
metabolites_indexed[cpd_index_id] = cpd_token
270-
reaction_metabolites[metabolites_indexed[cpd_index_id]] = value
271-
rxn = ModelSEEDReaction2(o['id'], o.get('name'), '', lower_bound, upper_bound,
272-
'', rxn_names,
273-
o.get('deltag'), o.get('deltagerr'),
274-
o.get('is_obsolete'), None,
275-
o.get('status'), o.get('source'))
292+
reaction_metabolites[
293+
metabolites_indexed[cpd_index_id]
294+
] = value
295+
rxn = ModelSEEDReaction2(
296+
o["id"],
297+
o.get("name"),
298+
"",
299+
lower_bound,
300+
upper_bound,
301+
"",
302+
rxn_names,
303+
o.get("deltag"),
304+
o.get("deltagerr"),
305+
o.get("is_obsolete"),
306+
None,
307+
o.get("status"),
308+
o.get("source"),
309+
)
276310
rxn.add_metabolites(reaction_metabolites)
277311
if rxn.id in aliases:
278312
rxn.annotation.update(aliases[rxn.id])
279313
if rxn.id in ec_numbers:
280-
rxn.annotation['ec-code'] = ec_numbers[rxn.id]
314+
rxn.annotation["ec-code"] = ec_numbers[rxn.id]
281315
metabolites[cpd.id] = cpd
282316
reactions[rxn.id] = rxn
283317
else:
284-
logger.error(f'failed to read reaction record {o}')
318+
logger.error(f"failed to read reaction record {o}")
285319

286320
return reactions, metabolites_indexed
287321

@@ -389,7 +423,7 @@ def __init__(self, compounds: list, reactions: list, compound_tokens: list):
389423
self.reactions = DictList()
390424
self.compounds += compounds
391425
self.reactions += reactions
392-
self.reactions += compound_tokens
426+
self.compound_tokens += compound_tokens
393427

394428
self.inchi_key_lookup = {}
395429
self.metabolite_reactions = {}
@@ -399,12 +433,12 @@ def __init__(self, compounds: list, reactions: list, compound_tokens: list):
399433
def _index_inchi(self):
400434
for m in self.compounds:
401435
if m.inchi_key:
402-
f, s, p = m.inchi_key.split('-')
436+
f, s, p = m.inchi_key.split("-")
403437
if f not in self.inchi_key_lookup:
404438
self.inchi_key_lookup[f] = {}
405439
if s not in self.inchi_key_lookup[f]:
406440
self.inchi_key_lookup[f][s] = set()
407-
proton_pair = (m.id , p)
441+
proton_pair = (m.id, p)
408442
if proton_pair not in self.inchi_key_lookup[f][s]:
409443
self.inchi_key_lookup[f][s].add(proton_pair)
410444

@@ -415,7 +449,7 @@ def reactions_by_alias(self, alias, value):
415449
pass
416450

417451
def find_compounds_by_inchi_key(self, inchi_key, exact=True):
418-
f, s, p = inchi_key.split('-')
452+
f, s, p = inchi_key.split("-")
419453
if exact and f in self.inchi_key_lookup and s in self.inchi_key_lookup[f]:
420454
# x is tuple (cpd.id, protonation)
421455
return [self.compounds.get_by_id(x[0]) for x in self.inchi_key_lookup[f][s]]
@@ -763,40 +797,75 @@ def from_local_old(path):
763797

764798

765799
def from_local(database_path: str):
766-
contents = os.listdir(f'{database_path}/Biochemistry/')
767-
if 'compounds.tsv' in contents:
800+
contents = os.listdir(f"{database_path}/Biochemistry/")
801+
if "compounds.tsv" in contents:
768802
return from_local_old(database_path)
769803

770-
compound_aliases_url = f'{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Aliases.txt'
771-
reaction_aliases_url = f'{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Aliases.txt'
772-
compound_aliases = _load_aliases_df(pd.read_csv(compound_aliases_url, index_col=None, sep='\t'))
773-
reaction_aliases = _load_aliases_df(pd.read_csv(reaction_aliases_url, index_col=None, sep='\t'))
804+
compound_aliases_url = (
805+
f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Aliases.txt"
806+
)
807+
reaction_aliases_url = (
808+
f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Aliases.txt"
809+
)
810+
compound_aliases = _load_aliases_df(
811+
pd.read_csv(compound_aliases_url, index_col=None, sep="\t")
812+
)
813+
reaction_aliases = _load_aliases_df(
814+
pd.read_csv(reaction_aliases_url, index_col=None, sep="\t")
815+
)
774816

775-
compound_structures_url = f'{database_path}/Biochemistry/Structures/Unique_ModelSEED_Structures.txt'
776-
compound_structures = _load_aliases_df(pd.read_csv(compound_structures_url, index_col=None, sep='\t'),
777-
source_index=2, alias_id_index=6)
817+
compound_structures_url = (
818+
f"{database_path}/Biochemistry/Structures/Unique_ModelSEED_Structures.txt"
819+
)
820+
compound_structures = _load_aliases_df(
821+
pd.read_csv(compound_structures_url, index_col=None, sep="\t"),
822+
source_index=2,
823+
alias_id_index=6,
824+
)
778825

779-
compound_names_url = f'{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Names.txt'
780-
reaction_names_url = f'{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Names.txt'
781-
compound_names = _load_aliases_df(pd.read_csv(compound_names_url, index_col=None, sep='\t'))
782-
reaction_names = _load_aliases_df(pd.read_csv(reaction_names_url, index_col=None, sep='\t'))
826+
compound_names_url = (
827+
f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Compound_Names.txt"
828+
)
829+
reaction_names_url = (
830+
f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_Names.txt"
831+
)
832+
compound_names = _load_aliases_df(
833+
pd.read_csv(compound_names_url, index_col=None, sep="\t")
834+
)
835+
reaction_names = _load_aliases_df(
836+
pd.read_csv(reaction_names_url, index_col=None, sep="\t")
837+
)
783838

784-
reaction_ecs_url = f'{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_ECs.txt'
785-
reaction_ecs = _load_aliases_df(pd.read_csv(reaction_ecs_url, index_col=None, sep='\t'))
839+
reaction_ecs_url = (
840+
f"{database_path}/Biochemistry/Aliases/Unique_ModelSEED_Reaction_ECs.txt"
841+
)
842+
reaction_ecs = _load_aliases_df(
843+
pd.read_csv(reaction_ecs_url, index_col=None, sep="\t")
844+
)
786845

787846
# build metabolites unpack names
788-
metabolites = _load_metabolites(database_path, compound_aliases,
789-
{k:v['name'] for k, v in compound_names.items()},
790-
compound_structures)
847+
metabolites = _load_metabolites(
848+
database_path,
849+
compound_aliases,
850+
{k: v["name"] for k, v in compound_names.items()},
851+
compound_structures,
852+
)
791853

792854
# build reactions unpack names, ecs
793-
reactions, metabolite_tokens = _load_reactions(database_path, metabolites, reaction_aliases,
794-
{k:v['name'] for k, v in reaction_names.items()},
795-
{k:v['Enzyme Class'] for k, v in reaction_ecs.items()})
796-
database = ModelSEEDDatabase(metabolites.values(), reactions.values(), metabolite_tokens.values())
855+
reactions, metabolite_tokens = _load_reactions(
856+
database_path,
857+
metabolites,
858+
reaction_aliases,
859+
{k: v["name"] for k, v in reaction_names.items()},
860+
{k: v["Enzyme Class"] for k, v in reaction_ecs.items()},
861+
)
862+
database = ModelSEEDDatabase(
863+
metabolites.values(), reactions.values(), metabolite_tokens.values()
864+
)
797865

798866
return database
799867

868+
800869
def get_names_from_df(df):
801870
names = {}
802871
for t in df.itertuples():

0 commit comments

Comments
 (0)