Skip to content

Commit 00e6f56

Browse files
author
Corentin
committed
ontology conversion class
1 parent fe848a0 commit 00e6f56

File tree

1 file changed

+97
-164
lines changed

1 file changed

+97
-164
lines changed

notebooks/import_ontology.ipynb

Lines changed: 97 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,93 @@
11
{
22
"cells": [
3-
{
4-
"cell_type": "code",
5-
"execution_count": null,
6-
"metadata": {},
7-
"outputs": [],
8-
"source": [
9-
"from pronto import Ontology\n",
10-
"go = Ontology(\"go.obo\")\n",
11-
"go"
12-
]
13-
},
14-
{
15-
"cell_type": "code",
16-
"execution_count": null,
17-
"metadata": {},
18-
"outputs": [],
19-
"source": [
20-
"with open(\"ms.json\", \"wb\") as f:\n",
21-
" go.dump(f, format=\"json\")"
22-
]
23-
},
243
{
254
"cell_type": "code",
265
"execution_count": null,
276
"metadata": {},
287
"outputs": [],
298
"source": [
309
"import json\n",
31-
"with open(\"ms.json\", \"r\") as f:\n",
32-
" go = json.load(f)"
33-
]
34-
},
35-
{
36-
"cell_type": "code",
37-
"execution_count": null,
38-
"metadata": {},
39-
"outputs": [],
40-
"source": [
41-
"go[\"graphs\"][0].keys()"
42-
]
43-
},
44-
{
45-
"cell_type": "code",
46-
"execution_count": null,
47-
"metadata": {},
48-
"outputs": [],
49-
"source": [
50-
"go[\"graphs\"][0][\"nodes\"][0]"
51-
]
52-
},
53-
{
54-
"cell_type": "code",
55-
"execution_count": null,
56-
"metadata": {},
57-
"outputs": [],
58-
"source": [
59-
"edge_dict: dict = {}\n",
60-
"for relationship in go[\"graphs\"][0][\"edges\"]:\n",
61-
" parent_list = edge_dict.get(relationship[\"sub\"].split(\"/\")[-1], [])\n",
62-
" parent_list.append((relationship[\"obj\"].split(\"/\")[-1], relationship[\"pred\"]))\n",
63-
" edge_dict[relationship[\"sub\"].split(\"/\")[-1]] = parent_list"
64-
]
65-
},
66-
{
67-
"cell_type": "code",
68-
"execution_count": null,
69-
"metadata": {},
70-
"outputs": [],
71-
"source": [
72-
"edge_dict"
73-
]
74-
},
75-
{
76-
"cell_type": "code",
77-
"execution_count": null,
78-
"metadata": {},
79-
"outputs": [],
80-
"source": [
81-
"for go_term in go[\"graphs\"][0][\"nodes\"]:\n",
82-
" if go_term[\"type\"] != \"CLASS\":\n",
83-
" print(go_term)"
10+
"import random\n",
11+
"from pronto import Ontology, Definition\n",
12+
"\n",
13+
"class ImpatientVocab():\n",
14+
" def __init__(self) -> None:\n",
15+
" self.used_colors: list[str] = []\n",
16+
" self.impatient_json: list[dict] = []\n",
17+
" self.impatient_onto: Ontology = None\n",
18+
" self.list_of_terms: list[str] = []\n",
19+
"\n",
20+
" def load_json(self, path: str) -> list[dict]:\n",
21+
" self.impatient_json = json.load(open(path, \"r\"))\n",
22+
" return self.impatient_json\n",
23+
" \n",
24+
" def load_ontology(self, path: str) -> Ontology:\n",
25+
" self.impatient_onto = Ontology(path)\n",
26+
" return self.impatient_onto\n",
27+
" \n",
28+
" def json_to_onto(self) -> Ontology:\n",
29+
" self.impatient_onto = Ontology()\n",
30+
" for term in self.impatient_json:\n",
31+
" added_term = self.impatient_onto.create_term(term[\"id\"].replace(\"_\", \":\"))\n",
32+
" added_term.name = term[\"text\"]\n",
33+
" for syn in term[\"data\"][\"synonymes\"].split(\",\"):\n",
34+
" if syn != \"\":\n",
35+
" added_term.add_synonym(syn, scope=\"EXACT\")\n",
36+
" if term[\"data\"][\"description\"] != \"\":\n",
37+
" added_term.definition = Definition(term[\"data\"][\"description\"])\n",
38+
" if term[\"parent\"] != \"#\":\n",
39+
" added_term.superclasses().add(self.impatient_onto[term[\"parent\"].replace(\"_\", \":\")])\n",
40+
" \n",
41+
" self.list_of_terms.append(added_term)\n",
42+
" return self.impatient_onto\n",
43+
" \n",
44+
" def onto_to_json(self) -> list[dict]:\n",
45+
" self.impatient_json = []\n",
46+
" index = 0\n",
47+
" for term in self.impatient_onto.terms():\n",
48+
" relationships = []\n",
49+
" for rel in term.superclasses():\n",
50+
" relationships.append(rel.id)\n",
51+
" relationships.pop(0)\n",
52+
" self.impatient_json.append(\n",
53+
" {\n",
54+
" \"id\": term.id.replace(\"_\", \":\"),\n",
55+
" \"text\": term.name,\n",
56+
" \"icon\": True,\n",
57+
" \"data\": {\n",
58+
" \"description\": term.definition if term.definition is not None else \"\",\n",
59+
" \"synonymes\": \",\".join([syn.description for syn in term.synonyms]),\n",
60+
" \"phenotype_datamined\": \"\",\n",
61+
" \"gene_datamined\": \"\",\n",
62+
" \"alternative_language\": term.name,\n",
63+
" \"correlates_with\": \"\",\n",
64+
" \"image_annotation\": True if index == 0 else False,\n",
65+
" \"hex_color\": self._generate_hex_color(),\n",
66+
" \"hpo_datamined\": \"\",\n",
67+
" },\n",
68+
" \"parent\": relationships[0].replace(\"_\", \":\") if relationships != [] else \"#\"\n",
69+
" }\n",
70+
" )\n",
71+
" index += 1\n",
72+
" return self.impatient_json\n",
73+
" \n",
74+
" def _generate_hex_color(self):\n",
75+
" while True:\n",
76+
" # Generate a random hex color\n",
77+
" color = \"#{:06x}\".format(random.randint(0, 0xFFFFFF))\n",
78+
" # Check if the color has already been used\n",
79+
" if color not in self.used_colors:\n",
80+
" # Add the color to the list of used colors and return it\n",
81+
" self.used_colors.append(color)\n",
82+
" return color\n",
83+
" \n",
84+
" def dump_onto(self, path: str) -> None:\n",
85+
" with open(path, \"wb\") as f:\n",
86+
" self.impatient_onto.dump(f, format=\"obo\")\n",
87+
"\n",
88+
" def dump_json(self, path: str) -> None:\n",
89+
" with open(path, \"w\") as f:\n",
90+
" json.dump(self.impatient_json, f, indent=2)"
8491
]
8592
},
8693
{
@@ -89,101 +96,22 @@
8996
"metadata": {},
9097
"outputs": [],
9198
"source": [
92-
"names: list[str] = []\n",
93-
"id: list[str] = []\n",
94-
"desc: list[str] = []\n",
95-
"synonymes: list[list[str]] = []\n",
96-
"\n",
97-
"for go_term in go[\"graphs\"][0][\"nodes\"]:\n",
98-
" if go_term[\"type\"] == \"CLASS\":\n",
99-
" id.append(go_term[\"id\"].split(\"/\")[-1])\n",
100-
" names.append(go_term[\"lbl\"])\n",
101-
" desc.append(go_term[\"meta\"][\"definition\"][\"val\"])\n",
102-
" synonymes.append([syn[\"val\"] for syn in go_term[\"meta\"][\"synonyms\"]])"
# Round-trip the demo vocabulary: Impatient JSON -> pronto Ontology -> OBO file.
vocab = ImpatientVocab()
vocab.load_json("ontology.json.demo")
vocab.json_to_onto()
vocab.dump_onto("ontology_imp.obo")
103103
]
104104
},
105-
{
106-
"cell_type": "code",
107-
"execution_count": null,
108-
"metadata": {},
109-
"outputs": [],
110-
"source": []
111-
},
112105
{
113106
"cell_type": "code",
114107
"execution_count": null,
115108
"metadata": {},
116109
"outputs": [],
117110
"source": [
118-
"import jsonschema\n",
119-
"from jsonschema import validate\n",
120-
"\n",
121-
"impatient_json: list[dict] = []\n",
122-
"impatient_json_schema = {\n",
123-
" \"type\": \"object\",\n",
124-
" \"properties\": {\n",
125-
" \"id\": {\"type\": \"string\"},\n",
126-
" \"text\": {\"type\": \"string\"},\n",
127-
" \"icon\": {\"type\": \"boolean\"},\n",
128-
" \"data\": {\n",
129-
" \"type\": \"object\",\n",
130-
" \"properties\": {\n",
131-
" \"description\": {\"type\": \"string\"},\n",
132-
" \"synonymes\": {\"type\": \"string\"},\n",
133-
" \"phenotype_datamined\": {\"type\": \"string\"},\n",
134-
" \"gene_datamined\": {\"type\": \"string\"},\n",
135-
" \"alternative_language\": {\"type\": \"string\"},\n",
136-
" \"correlates_with\": {\"type\": \"string\"},\n",
137-
" \"image_annotation\": {\"type\": \"boolean\"},\n",
138-
" \"hex_color\": {\"type\": \"string\", \"pattern\": \"^#[0-9a-fA-F]{6}$\"},\n",
139-
" \"hpo_datamined\": {\"type\": \"string\"},\n",
140-
" },\n",
141-
" \"required\": [\n",
142-
" \"description\",\n",
143-
" \"synonymes\",\n",
144-
" \"phenotype_datamined\",\n",
145-
" \"gene_datamined\",\n",
146-
" \"alternative_language\",\n",
147-
" \"correlates_with\",\n",
148-
" \"image_annotation\",\n",
149-
" \"hex_color\",\n",
150-
" \"hpo_datamined\",\n",
151-
" ],\n",
152-
" },\n",
153-
" \"parent\": {\"type\": \"string\"},\n",
154-
" },\n",
155-
" \"required\": [\"id\", \"text\", \"icon\", \"data\", \"parent\"],\n",
156-
"}\n",
157-
"\n",
158-
"for index in range(len(id)):\n",
159-
" impatient_json.append(\n",
160-
" {\n",
161-
" \"id\": id[index].replace(\"_\", \":\"),\n",
162-
" \"text\": names[index],\n",
163-
" \"icon\": True,\n",
164-
" \"data\": {\n",
165-
" \"description\": desc[index],\n",
166-
" \"synonymes\": ','.join(synonymes[index]),\n",
167-
" \"phenotype_datamined\": \"\",\n",
168-
" \"gene_datamined\": \"\",\n",
169-
" \"alternative_language\": names[index],\n",
170-
" \"correlates_with\": \"\",\n",
171-
" \"image_annotation\": True if index==0 else False,\n",
172-
" \"hex_color\": \"#FFFFFF\",\n",
173-
" \"hpo_datamined\": \"\",\n",
174-
" },\n",
175-
" \"parent\": \"#\",\n",
176-
" }\n",
177-
" )\n",
178-
" \n",
179-
"for child, parent in edge_dict.items():\n",
180-
" try:\n",
181-
" index_term = id.index(child)\n",
182-
" except ValueError:\n",
183-
" print(f\"Term {child} not found in the list of terms\")\n",
184-
" continue\n",
185-
"    # Only one parent, so yes, we are losing information.\n",
186-
" impatient_json[index_term][\"parent\"] = parent[0][0].replace(\"_\", \":\")"
# Convert the GO slim (AGR subset) OBO ontology to the Impatient JSON format.
converter = ImpatientVocab()
converter.load_ontology("goslim_agr.obo")
converter.onto_to_json()
converter.dump_json("obo_to_json_GO.json")
187115
]
188116
},
189117
{
@@ -192,7 +120,10 @@
192120
"metadata": {},
193121
"outputs": [],
194122
"source": [
195-
"json.dump(impatient_json, open(\"impatient.json\", \"w\"))"
# Convert the OBO file produced above back to Impatient JSON.
converter = ImpatientVocab()
converter.load_ontology("ontology_imp.obo")
converter.onto_to_json()
converter.dump_json("obo_to_json_IMP.json")
196127
]
197128
},
198129
{
@@ -201,8 +132,10 @@
201132
"metadata": {},
202133
"outputs": [],
203134
"source": [
204-
"for idx, json_data in enumerate(impatient_json, start=1):\n",
205-
" validate(instance=json_data, schema=impatient_json_schema)"
# Convert the Human Phenotype Ontology (OWL) to Impatient JSON.
converter = ImpatientVocab()
converter.load_ontology("hp.owl")
converter.onto_to_json()
converter.dump_json("obo_to_json_HPO.json")
206139
]
207140
}
208141
],

0 commit comments

Comments
 (0)