|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 23, |
| 5 | + "execution_count": null, |
6 | 6 | "metadata": {},
|
7 |
| - "outputs": [ |
8 |
| - { |
9 |
| - "ename": "TypeError", |
10 |
| - "evalue": "dumps() got multiple values for argument 'format'", |
11 |
| - "output_type": "error", |
12 |
| - "traceback": [ |
13 |
| - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", |
14 |
| - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", |
15 |
| - "Cell \u001b[0;32mIn[23], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpronto\u001b[39;00m \u001b[39mimport\u001b[39;00m Ontology\n\u001b[1;32m 2\u001b[0m go \u001b[39m=\u001b[39m Ontology(\u001b[39m\"\u001b[39m\u001b[39mgoslim_agr.obo\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m go_json \u001b[39m=\u001b[39m go\u001b[39m.\u001b[39;49mdumps(f, \u001b[39mformat\u001b[39;49m\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mjson\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", |
16 |
| - "\u001b[0;31mTypeError\u001b[0m: dumps() got multiple values for argument 'format'" |
17 |
| - ] |
18 |
| - } |
19 |
| - ], |
| 7 | + "outputs": [], |
20 | 8 | "source": [
|
21 | 9 | "from pronto import Ontology\n",
|
22 |
| - "go = Ontology(\"goslim_agr.obo\")\n", |
| 10 | + "go = Ontology(\"go.obo\")\n", |
23 | 11 | "go"
|
24 | 12 | ]
|
25 | 13 | },
|
26 | 14 | {
|
27 | 15 | "cell_type": "code",
|
28 |
| - "execution_count": 12, |
| 16 | + "execution_count": null, |
29 | 17 | "metadata": {},
|
30 | 18 | "outputs": [],
|
31 | 19 | "source": [
|
32 | 20 | "with open(\"ms.json\", \"wb\") as f:\n",
|
33 |
| - " go.dumps(f, format=\"json\")" |
| 21 | + " go.dump(f, format=\"json\")" |
34 | 22 | ]
|
35 | 23 | },
|
36 | 24 | {
|
37 | 25 | "cell_type": "code",
|
38 |
| - "execution_count": 33, |
| 26 | + "execution_count": null, |
39 | 27 | "metadata": {},
|
40 |
| - "outputs": [ |
41 |
| - { |
42 |
| - "data": { |
43 |
| - "text/plain": [ |
44 |
| - "dict_keys(['nodes', 'edges', 'id', 'lbl', 'meta', 'equivalentNodesSets', 'logicalDefinitionAxioms', 'domainRangeAxioms', 'propertyChainAxioms'])" |
45 |
| - ] |
46 |
| - }, |
47 |
| - "execution_count": 33, |
48 |
| - "metadata": {}, |
49 |
| - "output_type": "execute_result" |
50 |
| - } |
51 |
| - ], |
| 28 | + "outputs": [], |
| 29 | + "source": [ |
| 30 | + "import json\n", |
| 31 | + "with open(\"ms.json\", \"r\", encoding=\"utf-8\") as f:  # decode explicitly instead of relying on the locale default\n", |
| 32 | + "    go = json.load(f)" |
| 33 | + ] |
| 34 | + }, |
| 35 | + { |
| 36 | + "cell_type": "code", |
| 37 | + "execution_count": null, |
| 38 | + "metadata": {}, |
| 39 | + "outputs": [], |
52 | 40 | "source": [
|
53 | 41 | "go[\"graphs\"][0].keys()"
|
54 | 42 | ]
|
55 | 43 | },
|
56 | 44 | {
|
57 | 45 | "cell_type": "code",
|
58 |
| - "execution_count": 29, |
| 46 | + "execution_count": null, |
59 | 47 | "metadata": {},
|
60 | 48 | "outputs": [],
|
61 | 49 | "source": [
|
62 |
| - "import json\n", |
63 |
| - "with open(\"ms.json\", \"r\") as f:\n", |
64 |
| - " go = json.load(f)" |
| 50 | + "go[\"graphs\"][0][\"nodes\"][0]" |
65 | 51 | ]
|
66 | 52 | },
|
67 | 53 | {
|
68 | 54 | "cell_type": "code",
|
69 |
| - "execution_count": 36, |
| 55 | + "execution_count": null, |
70 | 56 | "metadata": {},
|
71 |
| - "outputs": [ |
72 |
| - { |
73 |
| - "data": { |
74 |
| - "text/plain": [ |
75 |
| - "{'definition': None,\n", |
76 |
| - " 'comments': [],\n", |
77 |
| - " 'subsets': ['chebi_ph7_3',\n", |
78 |
| - " 'gocheck_do_not_annotate',\n", |
79 |
| - " 'gocheck_do_not_manually_annotate',\n", |
80 |
| - " 'goslim_agr',\n", |
81 |
| - " 'goslim_aspergillus',\n", |
82 |
| - " 'goslim_candida',\n", |
83 |
| - " 'goslim_chembl',\n", |
84 |
| - " 'goslim_drosophila',\n", |
85 |
| - " 'goslim_flybase_ribbon',\n", |
86 |
| - " 'goslim_generic',\n", |
87 |
| - " 'goslim_metagenomics',\n", |
88 |
| - " 'goslim_mouse',\n", |
89 |
| - " 'goslim_pir',\n", |
90 |
| - " 'goslim_plant',\n", |
91 |
| - " 'goslim_pombe',\n", |
92 |
| - " 'goslim_synapse',\n", |
93 |
| - " 'goslim_yeast',\n", |
94 |
| - " 'prokaryote_subset'],\n", |
95 |
| - " 'xrefs': [],\n", |
96 |
| - " 'synonyms': [],\n", |
97 |
| - " 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion',\n", |
98 |
| - " 'val': '1.2',\n", |
99 |
| - " 'xrefs': [],\n", |
100 |
| - " 'meta': None},\n", |
101 |
| - " {'pred': 'http://purl.obolibrary.org/obo/owl_versionInfo',\n", |
102 |
| - " 'val': '2023-07-27',\n", |
103 |
| - " 'xrefs': [],\n", |
104 |
| - " 'meta': None}],\n", |
105 |
| - " 'version': 'http://purl.obolibrary.org/obo/go/subsets/goslim_agr/go/2023-07-27/subsets/goslim_agr.owl/go/subsets/goslim_agr.owl',\n", |
106 |
| - " 'deprecated': False}" |
107 |
| - ] |
108 |
| - }, |
109 |
| - "execution_count": 36, |
110 |
| - "metadata": {}, |
111 |
| - "output_type": "execute_result" |
112 |
| - } |
113 |
| - ], |
| 57 | + "outputs": [], |
| 58 | + "source": [ |
| 59 | + "edge_dict: dict = {}\n", |
| 60 | + "for edge in go[\"graphs\"][0][\"edges\"]:  # group (object id, predicate) pairs under each subject id\n", |
| 61 | + "    child_id = edge[\"sub\"].split(\"/\")[-1]\n", |
| 62 | + "    parent_link = (edge[\"obj\"].split(\"/\")[-1], edge[\"pred\"])\n", |
| 63 | + "    edge_dict.setdefault(child_id, []).append(parent_link)" |
| 64 | + ] |
| 65 | + }, |
| 66 | + { |
| 67 | + "cell_type": "code", |
| 68 | + "execution_count": null, |
| 69 | + "metadata": {}, |
| 70 | + "outputs": [], |
114 | 71 | "source": [
|
115 |
| - "go[\"graphs\"][0][\"meta\"]" |
| 72 | + "edge_dict" |
116 | 73 | ]
|
117 | 74 | },
|
118 | 75 | {
|
119 | 76 | "cell_type": "code",
|
120 |
| - "execution_count": 26, |
| 77 | + "execution_count": null, |
121 | 78 | "metadata": {},
|
122 |
| - "outputs": [ |
123 |
| - { |
124 |
| - "name": "stdout", |
125 |
| - "output_type": "stream", |
126 |
| - "text": [ |
127 |
| - "Term('GO:0000003', name='reproduction')\n", |
128 |
| - "Term('GO:0002376', name='immune system process')\n", |
129 |
| - "Term('GO:0003677', name='DNA binding')\n", |
130 |
| - "Term('GO:0003700', name='DNA-binding transcription factor activity')\n", |
131 |
| - "Term('GO:0003723', name='RNA binding')\n", |
132 |
| - "Term('GO:0003824', name='catalytic activity')\n", |
133 |
| - "Term('GO:0005102', name='signaling receptor binding')\n", |
134 |
| - "Term('GO:0005198', name='structural molecule activity')\n", |
135 |
| - "Term('GO:0005215', name='transporter activity')\n", |
136 |
| - "Term('GO:0005576', name='extracellular region')\n", |
137 |
| - "Term('GO:0005634', name='nucleus')\n", |
138 |
| - "Term('GO:0005694', name='chromosome')\n", |
139 |
| - "Term('GO:0005739', name='mitochondrion')\n", |
140 |
| - "Term('GO:0005768', name='endosome')\n", |
141 |
| - "Term('GO:0005773', name='vacuole')\n", |
142 |
| - "Term('GO:0005783', name='endoplasmic reticulum')\n", |
143 |
| - "Term('GO:0005794', name='Golgi apparatus')\n", |
144 |
| - "Term('GO:0005829', name='cytosol')\n", |
145 |
| - "Term('GO:0005856', name='cytoskeleton')\n", |
146 |
| - "Term('GO:0005886', name='plasma membrane')\n", |
147 |
| - "Term('GO:0005975', name='carbohydrate metabolic process')\n", |
148 |
| - "Term('GO:0006259', name='DNA metabolic process')\n", |
149 |
| - "Term('GO:0006629', name='lipid metabolic process')\n", |
150 |
| - "Term('GO:0007049', name='cell cycle')\n", |
151 |
| - "Term('GO:0007610', name='behavior')\n", |
152 |
| - "Term('GO:0008092', name='cytoskeletal protein binding')\n", |
153 |
| - "Term('GO:0008134', name='transcription factor binding')\n", |
154 |
| - "Term('GO:0008283', name='cell population proliferation')\n", |
155 |
| - "Term('GO:0008289', name='lipid binding')\n", |
156 |
| - "Term('GO:0009056', name='catabolic process')\n", |
157 |
| - "Term('GO:0012501', name='programmed cell death')\n", |
158 |
| - "Term('GO:0016043', name='cellular component organization')\n", |
159 |
| - "Term('GO:0016070', name='RNA metabolic process')\n", |
160 |
| - "Term('GO:0019538', name='protein metabolic process')\n", |
161 |
| - "Term('GO:0023052', name='signaling')\n", |
162 |
| - "Term('GO:0030054', name='cell junction')\n", |
163 |
| - "Term('GO:0030154', name='cell differentiation')\n", |
164 |
| - "Term('GO:0030234', name='enzyme regulator activity')\n", |
165 |
| - "Term('GO:0030246', name='carbohydrate binding')\n", |
166 |
| - "Term('GO:0031410', name='cytoplasmic vesicle')\n", |
167 |
| - "Term('GO:0032502', name='developmental process')\n", |
168 |
| - "Term('GO:0032991', name='protein-containing complex')\n", |
169 |
| - "Term('GO:0036094', name='small molecule binding')\n", |
170 |
| - "Term('GO:0038023', name='signaling receptor activity')\n", |
171 |
| - "Term('GO:0042592', name='homeostatic process')\n", |
172 |
| - "Term('GO:0042995', name='cell projection')\n", |
173 |
| - "Term('GO:0045202', name='synapse')\n", |
174 |
| - "Term('GO:0050877', name='nervous system process')\n", |
175 |
| - "Term('GO:0050896', name='response to stimulus')\n", |
176 |
| - "Term('GO:0051234', name='establishment of localization')\n", |
177 |
| - "Term('GO:0097367', name='carbohydrate derivative binding')\n", |
178 |
| - "Term('GO:1901135', name='carbohydrate derivative metabolic process')\n", |
179 |
| - "Term('GO:0046872', name='metal ion binding')\n" |
180 |
| - ] |
181 |
| - } |
182 |
| - ], |
| 79 | + "outputs": [], |
| 80 | + "source": [ |
| 81 | + "for go_term in go[\"graphs\"][0][\"nodes\"]:\n", |
| 82 | + " if go_term[\"type\"] != \"CLASS\":\n", |
| 83 | + " print(go_term)" |
| 84 | + ] |
| 85 | + }, |
| 86 | + { |
| 87 | + "cell_type": "code", |
| 88 | + "execution_count": null, |
| 89 | + "metadata": {}, |
| 90 | + "outputs": [], |
| 91 | + "source": [ |
| 92 | + "names: list[str] = []\n", |
| 93 | + "id: list[str] = []\n", |
| 94 | + "desc: list[str] = []\n", |
| 95 | + "synonymes: list[list[str]] = []\n", |
| 96 | + "for go_term in go[\"graphs\"][0][\"nodes\"]:\n", |
| 97 | + "    if go_term[\"type\"] == \"CLASS\":\n", |
| 98 | + "        meta = go_term.get(\"meta\") or {}  # \"meta\"/\"definition\"/\"synonyms\" may be null or absent; use empty defaults\n", |
| 99 | + "        id.append(go_term[\"id\"].split(\"/\")[-1])\n", |
| 100 | + "        names.append(go_term[\"lbl\"])\n", |
| 101 | + "        desc.append((meta.get(\"definition\") or {}).get(\"val\") or \"\")\n", |
| 102 | + "        synonymes.append([syn[\"val\"] for syn in meta.get(\"synonyms\") or []])" |
| 103 | + ] |
| 104 | + }, |
| 105 | + { |
| 106 | + "cell_type": "code", |
| 107 | + "execution_count": null, |
| 108 | + "metadata": {}, |
| 109 | + "outputs": [], |
| 110 | + "source": [] |
| 111 | + }, |
| 112 | + { |
| 113 | + "cell_type": "code", |
| 114 | + "execution_count": null, |
| 115 | + "metadata": {}, |
| 116 | + "outputs": [], |
| 117 | + "source": [ |
| 118 | + "import jsonschema\n", |
| 119 | + "from jsonschema import validate\n", |
| 120 | + "\n", |
| 121 | + "impatient_json: list[dict] = []\n", |
| 122 | + "impatient_json_schema = {\n", |
| 123 | + " \"type\": \"object\",\n", |
| 124 | + " \"properties\": {\n", |
| 125 | + " \"id\": {\"type\": \"string\"},\n", |
| 126 | + " \"text\": {\"type\": \"string\"},\n", |
| 127 | + " \"icon\": {\"type\": \"boolean\"},\n", |
| 128 | + " \"data\": {\n", |
| 129 | + " \"type\": \"object\",\n", |
| 130 | + " \"properties\": {\n", |
| 131 | + " \"description\": {\"type\": \"string\"},\n", |
| 132 | + " \"synonymes\": {\"type\": \"string\"},\n", |
| 133 | + " \"phenotype_datamined\": {\"type\": \"string\"},\n", |
| 134 | + " \"gene_datamined\": {\"type\": \"string\"},\n", |
| 135 | + " \"alternative_language\": {\"type\": \"string\"},\n", |
| 136 | + " \"correlates_with\": {\"type\": \"string\"},\n", |
| 137 | + " \"image_annotation\": {\"type\": \"boolean\"},\n", |
| 138 | + " \"hex_color\": {\"type\": \"string\", \"pattern\": \"^#[0-9a-fA-F]{6}$\"},\n", |
| 139 | + " \"hpo_datamined\": {\"type\": \"string\"},\n", |
| 140 | + " },\n", |
| 141 | + " \"required\": [\n", |
| 142 | + " \"description\",\n", |
| 143 | + " \"synonymes\",\n", |
| 144 | + " \"phenotype_datamined\",\n", |
| 145 | + " \"gene_datamined\",\n", |
| 146 | + " \"alternative_language\",\n", |
| 147 | + " \"correlates_with\",\n", |
| 148 | + " \"image_annotation\",\n", |
| 149 | + " \"hex_color\",\n", |
| 150 | + " \"hpo_datamined\",\n", |
| 151 | + " ],\n", |
| 152 | + " },\n", |
| 153 | + " \"parent\": {\"type\": \"string\"},\n", |
| 154 | + " },\n", |
| 155 | + " \"required\": [\"id\", \"text\", \"icon\", \"data\", \"parent\"],\n", |
| 156 | + "}\n", |
| 157 | + "\n", |
| 158 | + "position_of: dict[str, int] = {}  # term id -> first index in `id`, for O(1) parent lookup\n", |
| 159 | + "for index, term_id in enumerate(id):\n", |
| 160 | + "    position_of.setdefault(term_id, index)\n", |
| 161 | + "    impatient_json.append(\n", |
| 162 | + "        {\n", |
| 163 | + "            \"id\": term_id.replace(\"_\", \":\"),\n", |
| 164 | + "            \"text\": names[index],\n", |
| 165 | + "            \"icon\": True,\n", |
| 166 | + "            \"data\": {\n", |
| 167 | + "                \"description\": desc[index],\n", |
| 168 | + "                \"synonymes\": ','.join(synonymes[index]),\n", |
| 169 | + "                \"phenotype_datamined\": \"\",\n", |
| 170 | + "                \"gene_datamined\": \"\",\n", |
| 171 | + "                \"alternative_language\": names[index],\n", |
| 172 | + "                \"correlates_with\": \"\",\n", |
| 173 | + "                \"image_annotation\": index == 0,\n", |
| 174 | + "                \"hex_color\": \"#FFFFFF\",\n", |
| 175 | + "                \"hpo_datamined\": \"\",\n", |
| 176 | + "            },\n", |
| 177 | + "            \"parent\": \"#\",  # placeholder; replaced below when the term has a recorded parent\n", |
| 178 | + "        }\n", |
| 179 | + "    )\n", |
| 180 | + "\n", |
| 181 | + "for child, parent in edge_dict.items():\n", |
| 182 | + "    if child not in position_of:\n", |
| 183 | + "        print(f\"Term {child} not found in the list of terms\")\n", |
| 184 | + "        continue\n", |
| 185 | + "    # Only the first recorded parent is kept, so terms with several parents lose information.\n", |
| 186 | + "    impatient_json[position_of[child]][\"parent\"] = parent[0][0].replace(\"_\", \":\")" |
| 187 | + ] |
| 188 | + }, |
| 189 | + { |
| 190 | + "cell_type": "code", |
| 191 | + "execution_count": null, |
| 192 | + "metadata": {}, |
| 193 | + "outputs": [], |
| 194 | + "source": [ |
| 195 | + "with open(\"impatient.json\", \"w\") as out_file:  # context manager closes and flushes the file deterministically\n", |
| 196 | + "    json.dump(impatient_json, out_file)" |
| 196 | + ] |
| 197 | + }, |
| 198 | + { |
| 199 | + "cell_type": "code", |
| 200 | + "execution_count": null, |
| 201 | + "metadata": {}, |
| 202 | + "outputs": [], |
183 | 203 | "source": [
|
184 |
| - "for terms in go.terms():\n", |
185 |
| - " print(terms)" |
| 204 | + "for json_data in impatient_json:  # check every record against the schema; the first failure raises\n", |
| 205 | + "    validate(instance=json_data, schema=impatient_json_schema)" |
186 | 206 | ]
|
187 | 207 | }
|
188 | 208 | ],
|
|
202 | 222 | "name": "python",
|
203 | 223 | "nbconvert_exporter": "python",
|
204 | 224 | "pygments_lexer": "ipython3",
|
205 |
| - "version": "3.8.16" |
| 225 | + "version": "3.10.9" |
206 | 226 | },
|
207 | 227 | "orig_nbformat": 4
|
208 | 228 | },
|
|
0 commit comments