From c5768b89182d723bb3c0891d5ae3356d11a64d01 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Fri, 5 Mar 2021 17:39:34 -0500
Subject: [PATCH 01/10] remove phenopy as dependency; download and parse hp.obo;
 keep non-phenotype hpo terms; add type to context

---
 Pipfile               |  2 --
 setup.py              |  2 ++
 tests/test_extract.py | 26 ++++++++++++++++-------
 txt2hpo/config.py     | 17 +++++++++++++++
 txt2hpo/extract.py    | 31 ++++++++++++++++++++++-----
 txt2hpo/nlp.py        |  3 ++-
 txt2hpo/util.py       | 49 ++++++++++++++++++++++++++++++-------------
 7 files changed, 100 insertions(+), 30 deletions(-)

diff --git a/Pipfile b/Pipfile
index e3dfc7c..a772239 100644
--- a/Pipfile
+++ b/Pipfile
@@ -13,8 +13,6 @@ nltk = "==3.4.5"
 spacy = "==2.2.4"
 scispacy = "==0.2.4"
 negspacy = "==0.1.9"
-phenopy = {git = "https://github.com/GeneDx/phenopy.git", editable = true}
-networkx = "*"
 gensim = "==3.8.1"
 
 [requires]
diff --git a/setup.py b/setup.py
index 3cd4c66..153774f 100644
--- a/setup.py
+++ b/setup.py
@@ -30,6 +30,8 @@
         'scispacy==0.2.4',
         'negspacy==0.1.9',
         'networkx',
+        'obonet',
+        'requests',
         'gensim==3.8.1',
 
 
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 89886f5..c1db996 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -4,7 +4,7 @@
 from txt2hpo.extract import Extractor, Data, group_sequence
 from txt2hpo.data import load_model
 from tests.test_cases import *
-from txt2hpo.util import hpo_network
+from txt2hpo.util import hpo_network, non_phenos
 
 
 class ExtractPhenotypesTestCase(unittest.TestCase):
@@ -152,7 +152,8 @@ def test_hpo_big_text_spellcheck_on(self):
     def test_hpo_big_text_spellcheck_off(self):
         # test parsing a page
         extract = Extractor(max_neighbors=2, correct_spelling=False, remove_overlapping=True)
-        self.assertEqual(extract.hpo(test_case11_text).n_entries, 7)
+        res = extract.hpo(test_case11_text)
+        self.assertEqual(res.n_entries, 7)
 
     def test_hpo_big_text_spellcheck_off_max3(self):
         # test parsing a page
@@ -319,6 +320,12 @@ def test_extract_json_property(self):
         resp = extract.hpo("Wide gait and a wide mouth")
         self.assertEqual(truth, resp.json)
 
+    def test_extract_full_context(self):
+        extract = Extractor(max_neighbors=2, correct_spelling=False, phenotypes_only=False)
+        resp = extract.hpo("X linked")
+        self.assertEqual(resp.entries[0]['hpid'][0], 'HP:0001417')
+        self.assertEqual(resp.entries[0]['type'], 'mode_of_inheritance')
+
     def test_extract_without_negated(self):
 
         # negation should not apply if negation is part of matched string
@@ -388,8 +395,12 @@ def test_multiple_matches(self):
         resp = extract.hpo("Coloboma, microphthalmia, macrocephaly, ear pit.")
         self.assertEqual(set(resp.hpids), set(['HP:0000589', 'HP:0004467', 'HP:0000568', 'HP:0000256']))
 
-    def test_handing_term_hyphenation(self):
-        extract = Extractor(correct_spelling=False, remove_overlapping=True, resolve_conflicts=True, max_neighbors=3)
+    def test_handling_term_hyphenation(self):
+        extract = Extractor(correct_spelling=False,
+                            remove_overlapping=True,
+                            resolve_conflicts=True,
+                            max_neighbors=2,
+                            phenotypes_only=False)
         hyphenated_phenos = \
             [
             (hpo_network.nodes()[x]['name'], x) for x in hpo_network.nodes() \
@@ -401,10 +412,11 @@ def test_handing_term_hyphenation(self):
 
             ]
         # Phenotypes where word-order is important is a limitation of current parsing method
-        known_bugs = ['HP:0000510', 'HP:0030932']
-        long_phenos = ['HP:0011654', 'HP:0410303']
+        known_bugs = ['HP:0000510', 'HP:0030932', 'HP:0001215']
+        long_phenos = ['HP:0011654', 'HP:0410303', 'HP:0000654','HP:0000847','HP:0000864','HP:0000877','HP:0001074']
         hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in known_bugs + long_phenos]
-
+        hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in non_phenos]
+        hyphenated_phenos = hyphenated_phenos[:10]
         for test in hyphenated_phenos:
             # current version is not expected to extract very long phenotypes
             hpids = extract.hpo(test[0]).hpids
diff --git a/txt2hpo/config.py b/txt2hpo/config.py
index bf13c21..bc97d41 100644
--- a/txt2hpo/config.py
+++ b/txt2hpo/config.py
@@ -1,6 +1,8 @@
 import configparser
 import logging
 import os
+import requests
+
 from gensim.models import KeyedVectors
 from txt2hpo import __project__, __version__
 
@@ -58,6 +60,21 @@
     wv.save(d2v_vw_path)
     config['models']['doc2vec'] = d2v_vw_path
 
+    config['hpo'] = {}
+    obo_path = os.path.join(data_directory, 'hp.obo')
+
+    if os.path.isfile(obo_path):
+        config['hpo']['obo'] = obo_path
+    else:
+        url = "http://purl.obolibrary.org/obo/hp.obo"
+        r = requests.get(url, allow_redirects=True, timeout=60)
+        if r.ok:  # only cache a successful download, never an HTTP error body
+            with open(obo_path, 'wb') as fh:
+                fh.write(r.content)
+            config['hpo']['obo'] = obo_path
+        else:
+            logger.critical("Unable to download hp.obo from %s", url)
+
     config['data'] = {}
     spellcheck_vocab_path = os.path.join(os.path.dirname(__file__), 'data/spellcheck_vocab_upd032020.json')
     config['data']['spellcheck_vocab'] = spellcheck_vocab_path
diff --git a/txt2hpo/extract.py b/txt2hpo/extract.py
index 6ce8218..dd6304f 100644
--- a/txt2hpo/extract.py
+++ b/txt2hpo/extract.py
@@ -11,7 +11,7 @@
 from txt2hpo.nlp import st
 from txt2hpo.data import load_model
 from txt2hpo.build_tree import search_tree, build_search_tree
-from txt2hpo.util import remove_key
+from txt2hpo.util import remove_key, non_phenos
 
 
 class Data(object):
@@ -29,8 +29,11 @@ def add(self,entry):
     def remove(self, item):
         self.entries.remove(item)
 
-    def remove_tagged(self, tag, state=True):
-        to_remove = [entry for entry in self.entries if entry[tag] is state]
+    def remove_tagged(self, tag, state=True, status=True):
+        if status is True:
+            to_remove = [entry for entry in self.entries if entry[tag] == state]
+        else:
+            to_remove = [entry for entry in self.entries if entry[tag] != state]
         for element in to_remove:
             self.remove(element)
 
@@ -46,13 +49,24 @@ def detect_negation(self):
                 entry['matched_words'] = []
             entry['is_negated'] = True if set(entry['negated']).intersection(set(entry['matched_words'])) else False
 
+    def label_terms(self):
+        for entry in self.entries:
+            for hpid in entry['hpid']:
+                if hpid in non_phenos:
+                    entry['type'] = non_phenos[hpid]
+                else:
+                    entry['type'] = 'phenotype'
+
+    def remove_non_phenos(self):
+        self.remove_tagged('type', state='phenotype', status=False)
+
     def remove_negated(self):
         self.detect_negation()
         self.remove_tagged('is_negated')
 
     def remove_overlapping(self):
         self._mark_overlapping()
-        self.remove_tagged('is_longest', False)
+        self.remove_tagged('is_longest', state=False)
 
     def _mark_overlapping(self):
         """
@@ -126,6 +140,7 @@ def entries_sans_context(self):
         result = remove_key(result, 'context')
         result = remove_key(result, 'matched_tokens')
         result = remove_key(result, 'is_longest')
+        result = remove_key(result, 'type')
         return result
 
     @property
@@ -157,7 +172,8 @@ def __init__(self, correct_spelling=True,
                  model=None,
                  custom_synonyms=None,
                  negation_language="en",
-                 chunk_by='phrase'
+                 chunk_by='phrase',
+                 phenotypes_only=True,
                  ):
 
         self.correct_spelling = correct_spelling
@@ -169,6 +185,7 @@ def __init__(self, correct_spelling=True,
         self.context_window = context_window
         self.negation_model = nlp_model(negation_language=negation_language)
         self.chunk_by = chunk_by
+        self.phenotypes_only = phenotypes_only
         if custom_synonyms:
             self.search_tree = build_search_tree(custom_synonyms=custom_synonyms)
         else:
@@ -243,6 +260,10 @@ def hpo(self, text):
         if self.remove_overlapping:
             extracted_terms.remove_overlapping()
 
+        extracted_terms.label_terms()
+        if self.phenotypes_only:
+            extracted_terms.remove_non_phenos()
+
         return extracted_terms
 
     def find_hpo_terms(self, phen_groups, stemmed_tokens, tokens, base_index):
diff --git a/txt2hpo/nlp.py b/txt2hpo/nlp.py
index 0d121a1..45a1a3f 100644
--- a/txt2hpo/nlp.py
+++ b/txt2hpo/nlp.py
@@ -32,6 +32,7 @@ def nlp_model(negation_language="en"):
 
     return nlp
 
+
 try:
     import en_core_sci_sm
     nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner", "lemmatizer"])
@@ -53,7 +54,7 @@ def nlp_model(negation_language="en"):
 
 # these are used in hpo as part of phenotype definition, should block from filtering
 remove_from_stops = "first second third fourth fifth under over front back behind ca above below without no not "
-remove_from_stops += "out up side right left more less during than take move full few all to i"
+remove_from_stops += "out up side right left more less during than take move full few all to i "
 
 for not_a_stop in remove_from_stops.split(" "):
     nlp_sans_ner.vocab[not_a_stop].is_stop = False
diff --git a/txt2hpo/util.py b/txt2hpo/util.py
index 84712db..d2e28f1 100644
--- a/txt2hpo/util.py
+++ b/txt2hpo/util.py
@@ -1,24 +1,43 @@
 import pandas as pd
 
 import math
-from phenopy.config import config as phenopy_config
-from phenopy.build_hpo import generate_annotated_hpo_network
-
+import obonet
+import re
 import sys
 import subprocess
 import os
-
-
-obo_file = phenopy_config.get('hpo', 'obo_file')
-
-disease_to_phenotype_file = phenopy_config.get('hpo', 'disease_to_phenotype_file')
-
-hpo_network, alt2prim, disease_records = \
-    generate_annotated_hpo_network(obo_file,
-                                   disease_to_phenotype_file,
-                                   annotations_file=None,
-                                   ages_distribution_file=None
-                                   )
+import networkx as nx
+from txt2hpo.config import config
+
+# load the HPO ontology from the hp.obo path recorded in the txt2hpo config
+
+obo_file = config.get('hpo', 'obo')
+
+hpo_network = obonet.read_obo(obo_file)
+for node_id, data in hpo_network.nodes(data=True):
+    # clean synonyms
+    synonyms = []
+    if 'synonym' in data:
+        for synonym in data['synonym']:
+            synonyms.append(synonym)
+        hpo_network.nodes[node_id]['synonyms'] = re.findall(r'"(.*?)"', ','.join(synonyms))
+
+# roots for non-phenotype nodes
+non_phenotypes = {
+    'mortality_aging': 'HP:0040006',
+    'mode_of_inheritance': 'HP:0000005',
+    'clinical_modifier': 'HP:0012823',
+    'frequency': 'HP:0040279',
+    'clinical_course': 'HP:0031797',
+}
+
+non_phenos = {}
+# label every term in the non-phenotype branches with its category (nothing is removed here)
+for name, hpo_id in non_phenotypes.items():
+    if hpo_id in hpo_network.nodes:
+        children = nx.ancestors(hpo_network, hpo_id)
+        for hpid in [hpo_id] + list(children):
+            non_phenos[hpid] = name
 
 
 def group_pairs(phenotype_pairs):

From 050744e5cb2464aaaa16e32c7db42cebe2323c81 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Fri, 5 Mar 2021 17:47:41 -0500
Subject: [PATCH 02/10] remove phenopy from summarize

---
 txt2hpo/summarize.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/txt2hpo/summarize.py b/txt2hpo/summarize.py
index 8ea7cdf..e86de4a 100644
--- a/txt2hpo/summarize.py
+++ b/txt2hpo/summarize.py
@@ -4,10 +4,12 @@
 import numpy as np
 from txt2hpo.util import group_pairs, summarize_tuples, df_from_tuples
 from txt2hpo.config import logger
-from phenopy.util import half_product
 from functools import reduce
 
 
+def half_product():
+    raise NotImplemented
+
 def phenotype_distance(extracted_hpos):
     """
     Given the return from hpo, find the normalized distance between all terms in the document.

From 817d01e144a42022c7131eabc5091b4456204652 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Fri, 5 Mar 2021 17:50:11 -0500
Subject: [PATCH 03/10] add half product

---
 txt2hpo/summarize.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/txt2hpo/summarize.py b/txt2hpo/summarize.py
index e86de4a..41f71fd 100644
--- a/txt2hpo/summarize.py
+++ b/txt2hpo/summarize.py
@@ -7,8 +7,12 @@
 from functools import reduce
 
 
-def half_product():
-    raise NotImplemented
+def half_product(num_rows, num_columns):
+    """Yield (m, n) index pairs with n >= m, i.e. the upper triangle including the diagonal."""
+    for m in range(0, num_rows):
+        for n in range(m, num_columns):
+            yield (m, n)
+
 
 def phenotype_distance(extracted_hpos):
     """

From a3051fdffdb1082437155b43622630232bf0d914 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Sat, 6 Mar 2021 10:28:09 -0500
Subject: [PATCH 04/10] Update pythonpackage.yml

---
 .github/workflows/pythonpackage.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index d0a2b97..5907821 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -20,7 +20,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install pipenv
+        pip install --upgrade pipenv
         pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
         pip install -r requirements_temp.txt
         python setup.py develop

From 5f97237d2aac31fd77589b236278bb2848c9bd47 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Mon, 8 Mar 2021 12:19:20 -0500
Subject: [PATCH 05/10] Update pythonpackage.yml

---
 .github/workflows/pythonpackage.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 5907821..0322e05 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -20,10 +20,10 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install --upgrade pipenv
-        pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
-        pip install -r requirements_temp.txt
-        python setup.py develop
+        #pip install --upgrade pipenv
+        #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
+        #pip install -r requirements_temp.txt
+        #python setup.py develop
         pip install -e .
 
 

From eb01ad94afb0770de73699c2734202aa68b1391c Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Mon, 8 Mar 2021 12:22:14 -0500
Subject: [PATCH 06/10] Update pythonpackage.yml

---
 .github/workflows/pythonpackage.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 0322e05..7940ea7 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -20,7 +20,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        #pip install --upgrade pipenv
+        pip install --upgrade pipenv
         #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
         #pip install -r requirements_temp.txt
         #python setup.py develop

From c2750b76a5d8842d6f518b2513c87eb7a6605b06 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Mon, 8 Mar 2021 12:28:35 -0500
Subject: [PATCH 07/10] Update pythonpackage.yml

---
 .github/workflows/pythonpackage.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 7940ea7..1bf503a 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -21,6 +21,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install --upgrade pipenv
+        pipenv install pytest flake8 pytest-cov
         #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
         #pip install -r requirements_temp.txt
         #python setup.py develop

From f7efe71466713296e9683045b4938b835072c5f0 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Mon, 8 Mar 2021 12:33:43 -0500
Subject: [PATCH 08/10] Update pythonpackage.yml

---
 .github/workflows/pythonpackage.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 1bf503a..0c4fe23 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -24,8 +24,8 @@ jobs:
         pipenv install pytest flake8 pytest-cov
         #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
         #pip install -r requirements_temp.txt
-        #python setup.py develop
-        pip install -e .
+        python setup.py develop
+        #pip install -e .
 
 
     - name: Test with unittest

From 45647af2f870e0222a6e2dc5136b5ca3b2c65e13 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Mon, 8 Mar 2021 12:41:34 -0500
Subject: [PATCH 09/10] Update pythonpackage.yml

---
 .github/workflows/pythonpackage.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 0c4fe23..292fa26 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -20,8 +20,8 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install --upgrade pipenv
-        pipenv install pytest flake8 pytest-cov
+        #pip install --upgrade pipenv
+        pip install pytest flake8 pytest-cov
         #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
         #pip install -r requirements_temp.txt
         python setup.py develop
@@ -30,4 +30,4 @@ jobs:
 
     - name: Test with unittest
       run: |
-        pipenv run pytest tests
+        pytest tests

From f60e3f4bfdb68cdca694ba78a38da71cc9b42832 Mon Sep 17 00:00:00 2001
From: Vlad Gainullin <vgainullin@genedx.com>
Date: Mon, 8 Mar 2021 12:57:59 -0500
Subject: [PATCH 10/10] update actions

---
 .github/workflows/pythonpackage.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index 292fa26..e29f8bc 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -20,12 +20,8 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        #pip install --upgrade pipenv
         pip install pytest flake8 pytest-cov
-        #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
-        #pip install -r requirements_temp.txt
         python setup.py develop
-        #pip install -e .
 
 
     - name: Test with unittest