Merge pull request #48 from GeneDx/obo
remove phenopy as dependency
vgainullin authored Mar 8, 2021
2 parents c4f7c09 + f60e3f4 commit 9cc1a9c
Showing 9 changed files with 109 additions and 36 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/pythonpackage.yml
@@ -20,13 +20,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pipenv
pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
pip install -r requirements_temp.txt
pip install pytest flake8 pytest-cov
python setup.py develop
pip install -e .
- name: Test with unittest
run: |
pipenv run pytest tests
pytest tests
2 changes: 0 additions & 2 deletions Pipfile
@@ -13,8 +13,6 @@ nltk = "==3.4.5"
spacy = "==2.2.4"
scispacy = "==0.2.4"
negspacy = "==0.1.9"
phenopy = {git = "https://github.com/GeneDx/phenopy.git", editable = true}
networkx = "*"
gensim = "==3.8.1"

[requires]
2 changes: 2 additions & 0 deletions setup.py
@@ -30,6 +30,8 @@
'scispacy==0.2.4',
'negspacy==0.1.9',
'networkx',
'obonet',
'requests',
'gensim==3.8.1',


26 changes: 19 additions & 7 deletions tests/test_extract.py
@@ -4,7 +4,7 @@
from txt2hpo.extract import Extractor, Data, group_sequence
from txt2hpo.data import load_model
from tests.test_cases import *
from txt2hpo.util import hpo_network
from txt2hpo.util import hpo_network, non_phenos


class ExtractPhenotypesTestCase(unittest.TestCase):
@@ -152,7 +152,8 @@ def test_hpo_big_text_spellcheck_on(self):
def test_hpo_big_text_spellcheck_off(self):
# test parsing a page
extract = Extractor(max_neighbors=2, correct_spelling=False, remove_overlapping=True)
self.assertEqual(extract.hpo(test_case11_text).n_entries, 7)
res = extract.hpo(test_case11_text)
self.assertEqual(res.n_entries, 7)

def test_hpo_big_text_spellcheck_off_max3(self):
# test parsing a page
@@ -319,6 +320,12 @@ def test_extract_json_property(self):
resp = extract.hpo("Wide gait and a wide mouth")
self.assertEqual(truth, resp.json)

def test_extract_full_context(self):
extract = Extractor(max_neighbors=2, correct_spelling=False, phenotypes_only=False)
resp = extract.hpo("X linked")
self.assertEqual(resp.entries[0]['hpid'][0], 'HP:0001417')
self.assertEqual(resp.entries[0]['type'], 'mode_of_inheritance')

def test_extract_without_negated(self):

# negation should not apply if negation is part of matched string
@@ -388,8 +395,12 @@ def test_multiple_matches(self):
resp = extract.hpo("Coloboma, microphthalmia, macrocephaly, ear pit.")
self.assertEqual(set(resp.hpids), set(['HP:0000589', 'HP:0004467', 'HP:0000568', 'HP:0000256']))

def test_handing_term_hyphenation(self):
extract = Extractor(correct_spelling=False, remove_overlapping=True, resolve_conflicts=True, max_neighbors=3)
def test_handling_term_hyphenation(self):
extract = Extractor(correct_spelling=False,
remove_overlapping=True,
resolve_conflicts=True,
max_neighbors=2,
phenotypes_only=False)
hyphenated_phenos = \
[
(hpo_network.nodes()[x]['name'], x) for x in hpo_network.nodes() \
@@ -401,10 +412,11 @@ def test_handing_term_hyphenation(self):

]
# Phenotypes where word-order is important is a limitation of current parsing method
known_bugs = ['HP:0000510', 'HP:0030932']
long_phenos = ['HP:0011654', 'HP:0410303']
known_bugs = ['HP:0000510', 'HP:0030932', 'HP:0001215']
long_phenos = ['HP:0011654', 'HP:0410303', 'HP:0000654', 'HP:0000847', 'HP:0000864', 'HP:0000877', 'HP:0001074']
hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in known_bugs + long_phenos]

hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in non_phenos]
hyphenated_phenos = hyphenated_phenos[:10]
for test in hyphenated_phenos:
# current version is not expected to extract very long phenotypes
hpids = extract.hpo(test[0]).hpids
17 changes: 17 additions & 0 deletions txt2hpo/config.py
@@ -1,6 +1,8 @@
import configparser
import logging
import os
import requests

from gensim.models import KeyedVectors
from txt2hpo import __project__, __version__

@@ -58,6 +60,21 @@
wv.save(d2v_vw_path)
config['models']['doc2vec'] = d2v_vw_path

config['hpo'] = {}
obo_path = os.path.join(data_directory, 'hp.obo')

if os.path.isfile(obo_path):
config['hpo']['obo'] = obo_path
else:
url = "http://purl.obolibrary.org/obo/hp.obo"
r = requests.get(url, allow_redirects=True)
with open(obo_path, 'wb') as fh:
fh.write(r.content)
if os.path.isfile(obo_path):
config['hpo']['obo'] = obo_path
else:
logger.critical("Unable to download hp.obo from %s", url)

config['data'] = {}
spellcheck_vocab_path = os.path.join(os.path.dirname(__file__), 'data/spellcheck_vocab_upd032020.json')
config['data']['spellcheck_vocab'] = spellcheck_vocab_path
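The config change above downloads hp.obo on first run and reuses the cached copy afterwards. A minimal standalone sketch of that pattern; the `ensure_obo` helper and its signature are illustrative, not part of txt2hpo:

```python
import os
import requests

def ensure_obo(data_directory, url="http://purl.obolibrary.org/obo/hp.obo"):
    """Hypothetical helper: fetch hp.obo once, then reuse the local copy."""
    obo_path = os.path.join(data_directory, 'hp.obo')
    if not os.path.isfile(obo_path):
        r = requests.get(url, allow_redirects=True)
        r.raise_for_status()  # fail loudly rather than caching an error page
        with open(obo_path, 'wb') as fh:
            fh.write(r.content)
    return obo_path
```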
31 changes: 26 additions & 5 deletions txt2hpo/extract.py
@@ -11,7 +11,7 @@
from txt2hpo.nlp import st
from txt2hpo.data import load_model
from txt2hpo.build_tree import search_tree, build_search_tree
from txt2hpo.util import remove_key
from txt2hpo.util import remove_key, non_phenos


class Data(object):
@@ -29,8 +29,11 @@ def add(self,entry):
def remove(self, item):
self.entries.remove(item)

def remove_tagged(self, tag, state=True):
to_remove = [entry for entry in self.entries if entry[tag] is state]
def remove_tagged(self, tag, state=True, status=True):
if status is True:
to_remove = [entry for entry in self.entries if entry[tag] == state]
else:
to_remove = [entry for entry in self.entries if entry[tag] != state]
for element in to_remove:
self.remove(element)

@@ -46,13 +49,24 @@ def detect_negation(self):
entry['matched_words'] = []
entry['is_negated'] = True if set(entry['negated']).intersection(set(entry['matched_words'])) else False

def label_terms(self):
for entry in self.entries:
for hpid in entry['hpid']:
if hpid in non_phenos:
entry['type'] = non_phenos[hpid]
else:
entry['type'] = 'phenotype'

def remove_non_phenos(self):
self.remove_tagged('type', state='phenotype', status=False)

def remove_negated(self):
self.detect_negation()
self.remove_tagged('is_negated')

def remove_overlapping(self):
self._mark_overlapping()
self.remove_tagged('is_longest', False)
self.remove_tagged('is_longest', state=False)

def _mark_overlapping(self):
"""
@@ -126,6 +140,7 @@ def entries_sans_context(self):
result = remove_key(result, 'context')
result = remove_key(result, 'matched_tokens')
result = remove_key(result, 'is_longest')
result = remove_key(result, 'type')
return result

@property
@@ -157,7 +172,8 @@ def __init__(self, correct_spelling=True,
model=None,
custom_synonyms=None,
negation_language="en",
chunk_by='phrase'
chunk_by='phrase',
phenotypes_only=True,
):

self.correct_spelling = correct_spelling
@@ -169,6 +185,7 @@ def __init__(self, correct_spelling=True,
self.context_window = context_window
self.negation_model = nlp_model(negation_language=negation_language)
self.chunk_by = chunk_by
self.phenotypes_only = phenotypes_only
if custom_synonyms:
self.search_tree = build_search_tree(custom_synonyms=custom_synonyms)
else:
@@ -243,6 +260,10 @@ def hpo(self, text):
if self.remove_overlapping:
extracted_terms.remove_overlapping()

extracted_terms.label_terms()
if self.phenotypes_only:
extracted_terms.remove_non_phenos()

return extracted_terms

def find_hpo_terms(self, phen_groups, stemmed_tokens, tokens, base_index):
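Combined with `label_terms()`, the new `phenotypes_only` flag controls whether non-phenotype terms survive extraction. A usage sketch; the input and expected values mirror `test_extract_full_context` above:

```python
from txt2hpo.extract import Extractor

# phenotypes_only=False keeps non-phenotype terms, each labeled by label_terms()
extract = Extractor(correct_spelling=False, phenotypes_only=False)
resp = extract.hpo("X linked")
resp.entries[0]['hpid']  # ['HP:0001417']
resp.entries[0]['type']  # 'mode_of_inheritance'

# With the default phenotypes_only=True, remove_non_phenos() drops every
# entry whose 'type' is not 'phenotype'.
```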
3 changes: 2 additions & 1 deletion txt2hpo/nlp.py
@@ -32,6 +32,7 @@ def nlp_model(negation_language="en"):

return nlp


try:
import en_core_sci_sm
nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner", "lemmatizer"])
@@ -53,7 +54,7 @@ def nlp_model(negation_language="en"):

# these are used in hpo as part of phenotype definition, should block from filtering
remove_from_stops = "first second third fourth fifth under over front back behind ca above below without no not "
remove_from_stops += "out up side right left more less during than take move full few all to i"
remove_from_stops += "out up side right left more less during than take move full few all to i "

for not_a_stop in remove_from_stops.split(" "):
nlp_sans_ner.vocab[not_a_stop].is_stop = False
8 changes: 7 additions & 1 deletion txt2hpo/summarize.py
@@ -4,10 +4,16 @@
import numpy as np
from txt2hpo.util import group_pairs, summarize_tuples, df_from_tuples
from txt2hpo.config import logger
from phenopy.util import half_product
from functools import reduce


def half_product(num_rows, num_columns):
"""yield combinations and the diagonal"""
for m in range(0, num_rows):
for n in range(m, num_columns):
yield (m, n)


def phenotype_distance(extracted_hpos):
"""
Given the return from hpo, find the normalized distance between all terms in the document.
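The inlined `half_product` generator replaces the helper previously imported from `phenopy.util`; it yields the upper triangle of an index grid, diagonal included. For example:

```python
list(half_product(3, 3))
# [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]
```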
49 changes: 34 additions & 15 deletions txt2hpo/util.py
@@ -1,24 +1,43 @@
import pandas as pd

import math
from phenopy.config import config as phenopy_config
from phenopy.build_hpo import generate_annotated_hpo_network

import obonet
import re
import sys
import subprocess
import os


obo_file = phenopy_config.get('hpo', 'obo_file')

disease_to_phenotype_file = phenopy_config.get('hpo', 'disease_to_phenotype_file')

hpo_network, alt2prim, disease_records = \
generate_annotated_hpo_network(obo_file,
disease_to_phenotype_file,
annotations_file=None,
ages_distribution_file=None
)
import networkx as nx
from txt2hpo.config import config

# hpo_network = obonet.read_obo(obo_file)

obo_file = config.get('hpo', 'obo')

hpo_network = obonet.read_obo(obo_file)
for node_id, data in hpo_network.nodes(data=True):
# clean synonyms
synonyms = []
if 'synonym' in data:
for synonym in data['synonym']:
synonyms.append(synonym)
hpo_network.nodes[node_id]['synonyms'] = re.findall(r'"(.*?)"', ','.join(synonyms))

# roots for non-phenotype nodes
non_phenotypes = {
'mortality_aging': 'HP:0040006',
'mode_of_inheritance': 'HP:0000005',
'clinical_modifier': 'HP:0012823',
'frequency': 'HP:0040279',
'clinical_course': 'HP:0031797',
}

non_phenos = {}
# remove non-phenotype branches
for name, hpo_id in non_phenotypes.items():
if hpo_id in hpo_network.nodes:
children = nx.ancestors(hpo_network, hpo_id)
for hpid in [hpo_id] + list(children):
non_phenos[hpid] = name


def group_pairs(phenotype_pairs):
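With phenopy gone, the HPO graph now comes straight from `obonet`, and `non_phenos` maps every term under a non-phenotype root to its branch name. A quick sketch of querying the result; term names are as published in the HPO:

```python
from txt2hpo.util import hpo_network, non_phenos

hpo_network.nodes['HP:0000005']['name']  # 'Mode of inheritance'
non_phenos['HP:0001417']                 # 'mode_of_inheritance' (X-linked inheritance)

# Note: obonet edges point from child term to parent term, so
# nx.ancestors(hpo_network, root) returns the root's ontology descendants,
# which is why it is used above to collect whole non-phenotype branches.
```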
