Skip to content

Commit fe848a0

Browse files
author
Corentin
committed
working on ontology converter
1 parent 7e6984b commit fe848a0

File tree

4 files changed

+343
-147
lines changed

4 files changed

+343
-147
lines changed

.python-version

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.10.9
1+
3.10

notebooks/import_ontology.ipynb

+164-144
Original file line numberDiff line numberDiff line change
@@ -2,187 +2,207 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 23,
5+
"execution_count": null,
66
"metadata": {},
7-
"outputs": [
8-
{
9-
"ename": "TypeError",
10-
"evalue": "dumps() got multiple values for argument 'format'",
11-
"output_type": "error",
12-
"traceback": [
13-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
14-
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
15-
"Cell \u001b[0;32mIn[23], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpronto\u001b[39;00m \u001b[39mimport\u001b[39;00m Ontology\n\u001b[1;32m 2\u001b[0m go \u001b[39m=\u001b[39m Ontology(\u001b[39m\"\u001b[39m\u001b[39mgoslim_agr.obo\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m go_json \u001b[39m=\u001b[39m go\u001b[39m.\u001b[39;49mdumps(f, \u001b[39mformat\u001b[39;49m\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mjson\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
16-
"\u001b[0;31mTypeError\u001b[0m: dumps() got multiple values for argument 'format'"
17-
]
18-
}
19-
],
7+
"outputs": [],
208
"source": [
219
"from pronto import Ontology\n",
22-
"go = Ontology(\"goslim_agr.obo\")\n",
10+
"go = Ontology(\"go.obo\")\n",
2311
"go"
2412
]
2513
},
2614
{
2715
"cell_type": "code",
28-
"execution_count": 12,
16+
"execution_count": null,
2917
"metadata": {},
3018
"outputs": [],
3119
"source": [
3220
"with open(\"ms.json\", \"wb\") as f:\n",
33-
" go.dumps(f, format=\"json\")"
21+
" go.dump(f, format=\"json\")"
3422
]
3523
},
3624
{
3725
"cell_type": "code",
38-
"execution_count": 33,
26+
"execution_count": null,
3927
"metadata": {},
40-
"outputs": [
41-
{
42-
"data": {
43-
"text/plain": [
44-
"dict_keys(['nodes', 'edges', 'id', 'lbl', 'meta', 'equivalentNodesSets', 'logicalDefinitionAxioms', 'domainRangeAxioms', 'propertyChainAxioms'])"
45-
]
46-
},
47-
"execution_count": 33,
48-
"metadata": {},
49-
"output_type": "execute_result"
50-
}
51-
],
28+
"outputs": [],
29+
"source": [
30+
"import json\n",
31+
"with open(\"ms.json\", \"r\") as f:\n",
32+
" go = json.load(f)"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": null,
38+
"metadata": {},
39+
"outputs": [],
5240
"source": [
5341
"go[\"graphs\"][0].keys()"
5442
]
5543
},
5644
{
5745
"cell_type": "code",
58-
"execution_count": 29,
46+
"execution_count": null,
5947
"metadata": {},
6048
"outputs": [],
6149
"source": [
62-
"import json\n",
63-
"with open(\"ms.json\", \"r\") as f:\n",
64-
" go = json.load(f)"
50+
"go[\"graphs\"][0][\"nodes\"][0]"
6551
]
6652
},
6753
{
6854
"cell_type": "code",
69-
"execution_count": 36,
55+
"execution_count": null,
7056
"metadata": {},
71-
"outputs": [
72-
{
73-
"data": {
74-
"text/plain": [
75-
"{'definition': None,\n",
76-
" 'comments': [],\n",
77-
" 'subsets': ['chebi_ph7_3',\n",
78-
" 'gocheck_do_not_annotate',\n",
79-
" 'gocheck_do_not_manually_annotate',\n",
80-
" 'goslim_agr',\n",
81-
" 'goslim_aspergillus',\n",
82-
" 'goslim_candida',\n",
83-
" 'goslim_chembl',\n",
84-
" 'goslim_drosophila',\n",
85-
" 'goslim_flybase_ribbon',\n",
86-
" 'goslim_generic',\n",
87-
" 'goslim_metagenomics',\n",
88-
" 'goslim_mouse',\n",
89-
" 'goslim_pir',\n",
90-
" 'goslim_plant',\n",
91-
" 'goslim_pombe',\n",
92-
" 'goslim_synapse',\n",
93-
" 'goslim_yeast',\n",
94-
" 'prokaryote_subset'],\n",
95-
" 'xrefs': [],\n",
96-
" 'synonyms': [],\n",
97-
" 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion',\n",
98-
" 'val': '1.2',\n",
99-
" 'xrefs': [],\n",
100-
" 'meta': None},\n",
101-
" {'pred': 'http://purl.obolibrary.org/obo/owl_versionInfo',\n",
102-
" 'val': '2023-07-27',\n",
103-
" 'xrefs': [],\n",
104-
" 'meta': None}],\n",
105-
" 'version': 'http://purl.obolibrary.org/obo/go/subsets/goslim_agr/go/2023-07-27/subsets/goslim_agr.owl/go/subsets/goslim_agr.owl',\n",
106-
" 'deprecated': False}"
107-
]
108-
},
109-
"execution_count": 36,
110-
"metadata": {},
111-
"output_type": "execute_result"
112-
}
113-
],
57+
"outputs": [],
58+
"source": [
59+
"edge_dict: dict = {}\n",
60+
"for relationship in go[\"graphs\"][0][\"edges\"]:\n",
61+
" parent_list = edge_dict.get(relationship[\"sub\"].split(\"/\")[-1], [])\n",
62+
" parent_list.append((relationship[\"obj\"].split(\"/\")[-1], relationship[\"pred\"]))\n",
63+
" edge_dict[relationship[\"sub\"].split(\"/\")[-1]] = parent_list"
64+
]
65+
},
66+
{
67+
"cell_type": "code",
68+
"execution_count": null,
69+
"metadata": {},
70+
"outputs": [],
11471
"source": [
115-
"go[\"graphs\"][0][\"meta\"]"
72+
"edge_dict"
11673
]
11774
},
11875
{
11976
"cell_type": "code",
120-
"execution_count": 26,
77+
"execution_count": null,
12178
"metadata": {},
122-
"outputs": [
123-
{
124-
"name": "stdout",
125-
"output_type": "stream",
126-
"text": [
127-
"Term('GO:0000003', name='reproduction')\n",
128-
"Term('GO:0002376', name='immune system process')\n",
129-
"Term('GO:0003677', name='DNA binding')\n",
130-
"Term('GO:0003700', name='DNA-binding transcription factor activity')\n",
131-
"Term('GO:0003723', name='RNA binding')\n",
132-
"Term('GO:0003824', name='catalytic activity')\n",
133-
"Term('GO:0005102', name='signaling receptor binding')\n",
134-
"Term('GO:0005198', name='structural molecule activity')\n",
135-
"Term('GO:0005215', name='transporter activity')\n",
136-
"Term('GO:0005576', name='extracellular region')\n",
137-
"Term('GO:0005634', name='nucleus')\n",
138-
"Term('GO:0005694', name='chromosome')\n",
139-
"Term('GO:0005739', name='mitochondrion')\n",
140-
"Term('GO:0005768', name='endosome')\n",
141-
"Term('GO:0005773', name='vacuole')\n",
142-
"Term('GO:0005783', name='endoplasmic reticulum')\n",
143-
"Term('GO:0005794', name='Golgi apparatus')\n",
144-
"Term('GO:0005829', name='cytosol')\n",
145-
"Term('GO:0005856', name='cytoskeleton')\n",
146-
"Term('GO:0005886', name='plasma membrane')\n",
147-
"Term('GO:0005975', name='carbohydrate metabolic process')\n",
148-
"Term('GO:0006259', name='DNA metabolic process')\n",
149-
"Term('GO:0006629', name='lipid metabolic process')\n",
150-
"Term('GO:0007049', name='cell cycle')\n",
151-
"Term('GO:0007610', name='behavior')\n",
152-
"Term('GO:0008092', name='cytoskeletal protein binding')\n",
153-
"Term('GO:0008134', name='transcription factor binding')\n",
154-
"Term('GO:0008283', name='cell population proliferation')\n",
155-
"Term('GO:0008289', name='lipid binding')\n",
156-
"Term('GO:0009056', name='catabolic process')\n",
157-
"Term('GO:0012501', name='programmed cell death')\n",
158-
"Term('GO:0016043', name='cellular component organization')\n",
159-
"Term('GO:0016070', name='RNA metabolic process')\n",
160-
"Term('GO:0019538', name='protein metabolic process')\n",
161-
"Term('GO:0023052', name='signaling')\n",
162-
"Term('GO:0030054', name='cell junction')\n",
163-
"Term('GO:0030154', name='cell differentiation')\n",
164-
"Term('GO:0030234', name='enzyme regulator activity')\n",
165-
"Term('GO:0030246', name='carbohydrate binding')\n",
166-
"Term('GO:0031410', name='cytoplasmic vesicle')\n",
167-
"Term('GO:0032502', name='developmental process')\n",
168-
"Term('GO:0032991', name='protein-containing complex')\n",
169-
"Term('GO:0036094', name='small molecule binding')\n",
170-
"Term('GO:0038023', name='signaling receptor activity')\n",
171-
"Term('GO:0042592', name='homeostatic process')\n",
172-
"Term('GO:0042995', name='cell projection')\n",
173-
"Term('GO:0045202', name='synapse')\n",
174-
"Term('GO:0050877', name='nervous system process')\n",
175-
"Term('GO:0050896', name='response to stimulus')\n",
176-
"Term('GO:0051234', name='establishment of localization')\n",
177-
"Term('GO:0097367', name='carbohydrate derivative binding')\n",
178-
"Term('GO:1901135', name='carbohydrate derivative metabolic process')\n",
179-
"Term('GO:0046872', name='metal ion binding')\n"
180-
]
181-
}
182-
],
79+
"outputs": [],
80+
"source": [
81+
"for go_term in go[\"graphs\"][0][\"nodes\"]:\n",
82+
" if go_term[\"type\"] != \"CLASS\":\n",
83+
" print(go_term)"
84+
]
85+
},
86+
{
87+
"cell_type": "code",
88+
"execution_count": null,
89+
"metadata": {},
90+
"outputs": [],
91+
"source": [
92+
"names: list[str] = []\n",
93+
"id: list[str] = []\n",
94+
"desc: list[str] = []\n",
95+
"synonymes: list[list[str]] = []\n",
96+
"\n",
97+
"for go_term in go[\"graphs\"][0][\"nodes\"]:\n",
98+
" if go_term[\"type\"] == \"CLASS\":\n",
99+
" id.append(go_term[\"id\"].split(\"/\")[-1])\n",
100+
" names.append(go_term[\"lbl\"])\n",
101+
" desc.append(go_term[\"meta\"][\"definition\"][\"val\"])\n",
102+
" synonymes.append([syn[\"val\"] for syn in go_term[\"meta\"][\"synonyms\"]])"
103+
]
104+
},
105+
{
106+
"cell_type": "code",
107+
"execution_count": null,
108+
"metadata": {},
109+
"outputs": [],
110+
"source": []
111+
},
112+
{
113+
"cell_type": "code",
114+
"execution_count": null,
115+
"metadata": {},
116+
"outputs": [],
117+
"source": [
118+
"import jsonschema\n",
119+
"from jsonschema import validate\n",
120+
"\n",
121+
"impatient_json: list[dict] = []\n",
122+
"impatient_json_schema = {\n",
123+
" \"type\": \"object\",\n",
124+
" \"properties\": {\n",
125+
" \"id\": {\"type\": \"string\"},\n",
126+
" \"text\": {\"type\": \"string\"},\n",
127+
" \"icon\": {\"type\": \"boolean\"},\n",
128+
" \"data\": {\n",
129+
" \"type\": \"object\",\n",
130+
" \"properties\": {\n",
131+
" \"description\": {\"type\": \"string\"},\n",
132+
" \"synonymes\": {\"type\": \"string\"},\n",
133+
" \"phenotype_datamined\": {\"type\": \"string\"},\n",
134+
" \"gene_datamined\": {\"type\": \"string\"},\n",
135+
" \"alternative_language\": {\"type\": \"string\"},\n",
136+
" \"correlates_with\": {\"type\": \"string\"},\n",
137+
" \"image_annotation\": {\"type\": \"boolean\"},\n",
138+
" \"hex_color\": {\"type\": \"string\", \"pattern\": \"^#[0-9a-fA-F]{6}$\"},\n",
139+
" \"hpo_datamined\": {\"type\": \"string\"},\n",
140+
" },\n",
141+
" \"required\": [\n",
142+
" \"description\",\n",
143+
" \"synonymes\",\n",
144+
" \"phenotype_datamined\",\n",
145+
" \"gene_datamined\",\n",
146+
" \"alternative_language\",\n",
147+
" \"correlates_with\",\n",
148+
" \"image_annotation\",\n",
149+
" \"hex_color\",\n",
150+
" \"hpo_datamined\",\n",
151+
" ],\n",
152+
" },\n",
153+
" \"parent\": {\"type\": \"string\"},\n",
154+
" },\n",
155+
" \"required\": [\"id\", \"text\", \"icon\", \"data\", \"parent\"],\n",
156+
"}\n",
157+
"\n",
158+
"for index in range(len(id)):\n",
159+
" impatient_json.append(\n",
160+
" {\n",
161+
" \"id\": id[index].replace(\"_\", \":\"),\n",
162+
" \"text\": names[index],\n",
163+
" \"icon\": True,\n",
164+
" \"data\": {\n",
165+
" \"description\": desc[index],\n",
166+
" \"synonymes\": ','.join(synonymes[index]),\n",
167+
" \"phenotype_datamined\": \"\",\n",
168+
" \"gene_datamined\": \"\",\n",
169+
" \"alternative_language\": names[index],\n",
170+
" \"correlates_with\": \"\",\n",
171+
" \"image_annotation\": True if index==0 else False,\n",
172+
" \"hex_color\": \"#FFFFFF\",\n",
173+
" \"hpo_datamined\": \"\",\n",
174+
" },\n",
175+
" \"parent\": \"#\",\n",
176+
" }\n",
177+
" )\n",
178+
" \n",
179+
"for child, parent in edge_dict.items():\n",
180+
" try:\n",
181+
" index_term = id.index(child)\n",
182+
" except ValueError:\n",
183+
" print(f\"Term {child} not found in the list of terms\")\n",
184+
" continue\n",
185+
" # Only the first parent is kept, so we are losing information.\n",
186+
" impatient_json[index_term][\"parent\"] = parent[0][0].replace(\"_\", \":\")"
187+
]
188+
},
189+
{
190+
"cell_type": "code",
191+
"execution_count": null,
192+
"metadata": {},
193+
"outputs": [],
194+
"source": [
195+
"json.dump(impatient_json, open(\"impatient.json\", \"w\"))"
196+
]
197+
},
198+
{
199+
"cell_type": "code",
200+
"execution_count": null,
201+
"metadata": {},
202+
"outputs": [],
183203
"source": [
184-
"for terms in go.terms():\n",
185-
" print(terms)"
204+
"for idx, json_data in enumerate(impatient_json, start=1):\n",
205+
" validate(instance=json_data, schema=impatient_json_schema)"
186206
]
187207
}
188208
],
@@ -202,7 +222,7 @@
202222
"name": "python",
203223
"nbconvert_exporter": "python",
204224
"pygments_lexer": "ipython3",
205-
"version": "3.8.16"
225+
"version": "3.10.9"
206226
},
207227
"orig_nbformat": 4
208228
},

0 commit comments

Comments
 (0)