From c5768b89182d723bb3c0891d5ae3356d11a64d01 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Fri, 5 Mar 2021 17:39:34 -0500 Subject: [PATCH 01/10] remove phenopy as dependency download and parse hp obo keep non-phenotype hpo terms add type to context --- Pipfile | 2 -- setup.py | 2 ++ tests/test_extract.py | 26 ++++++++++++++++------- txt2hpo/config.py | 17 +++++++++++++++ txt2hpo/extract.py | 31 ++++++++++++++++++++++----- txt2hpo/nlp.py | 3 ++- txt2hpo/util.py | 49 ++++++++++++++++++++++++++++++------------- 7 files changed, 100 insertions(+), 30 deletions(-) diff --git a/Pipfile b/Pipfile index e3dfc7c..a772239 100644 --- a/Pipfile +++ b/Pipfile @@ -13,8 +13,6 @@ nltk = "==3.4.5" spacy = "==2.2.4" scispacy = "==0.2.4" negspacy = "==0.1.9" -phenopy = {git = "https://github.com/GeneDx/phenopy.git", editable = true} -networkx = "*" gensim = "==3.8.1" [requires] diff --git a/setup.py b/setup.py index 3cd4c66..153774f 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,8 @@ 'scispacy==0.2.4', 'negspacy==0.1.9', 'networkx', + 'obonet', + 'requests', 'gensim==3.8.1', diff --git a/tests/test_extract.py b/tests/test_extract.py index 89886f5..c1db996 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -4,7 +4,7 @@ from txt2hpo.extract import Extractor, Data, group_sequence from txt2hpo.data import load_model from tests.test_cases import * -from txt2hpo.util import hpo_network +from txt2hpo.util import hpo_network, non_phenos class ExtractPhenotypesTestCase(unittest.TestCase): @@ -152,7 +152,8 @@ def test_hpo_big_text_spellcheck_on(self): def test_hpo_big_text_spellcheck_off(self): # test parsing a page extract = Extractor(max_neighbors=2, correct_spelling=False, remove_overlapping=True) - self.assertEqual(extract.hpo(test_case11_text).n_entries, 7) + res = extract.hpo(test_case11_text) + self.assertEqual(res.n_entries, 7) def test_hpo_big_text_spellcheck_off_max3(self): # test parsing a page @@ -319,6 +320,12 @@ def test_extract_json_property(self): resp = extract.hpo("Wide gait and a wide mouth") self.assertEqual(truth, resp.json) + def test_extract_full_context(self): + extract = Extractor(max_neighbors=2, correct_spelling=False, phenotypes_only=False) + resp = extract.hpo("X linked") + self.assertEqual(resp.entries[0]['hpid'][0], 'HP:0001417') + self.assertEqual(resp.entries[0]['type'], 'mode_of_inheritance') + def test_extract_without_negated(self): # negation should not apply if negation is part of matched string @@ -388,8 +395,12 @@ def test_multiple_matches(self): resp = extract.hpo("Coloboma, microphthalmia, macrocephaly, ear pit.") self.assertEqual(set(resp.hpids), set(['HP:0000589', 'HP:0004467', 'HP:0000568', 'HP:0000256'])) - def test_handing_term_hyphenation(self): - extract = Extractor(correct_spelling=False, remove_overlapping=True, resolve_conflicts=True, max_neighbors=3) + def test_handling_term_hyphenation(self): + extract = Extractor(correct_spelling=False, + remove_overlapping=True, + resolve_conflicts=True, + max_neighbors=2, + phenotypes_only=False) hyphenated_phenos = \ [ (hpo_network.nodes()[x]['name'], x) for x in hpo_network.nodes() \ @@ -401,10 +412,11 @@ def test_handing_term_hyphenation(self): ] # Phenotypes where word-order is important is a limitation of current parsing method - known_bugs = ['HP:0000510', 'HP:0030932'] - long_phenos = ['HP:0011654', 'HP:0410303'] + known_bugs = ['HP:0000510', 'HP:0030932', 'HP:0001215'] + long_phenos = ['HP:0011654', 'HP:0410303', 'HP:0000654','HP:0000847','HP:0000864','HP:0000877','HP:0001074'] hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in known_bugs + long_phenos] - + hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in non_phenos] + hyphenated_phenos = hyphenated_phenos[:10] for test in hyphenated_phenos: # current version is not expected to extract very long phenotypes hpids = extract.hpo(test[0]).hpids diff --git a/txt2hpo/config.py b/txt2hpo/config.py index bf13c21..bc97d41 100644 --- a/txt2hpo/config.py +++ b/txt2hpo/config.py @@ -1,6 +1,8 @@ import configparser import logging import os +import requests + from gensim.models import KeyedVectors from txt2hpo import __project__, __version__ @@ -58,6 +60,21 @@ wv.save(d2v_vw_path) config['models']['doc2vec'] = d2v_vw_path + config['hpo'] = {} + obo_path = os.path.join(data_directory, 'hp.obo') + + if os.path.isfile(obo_path): + config['hpo']['obo'] = obo_path + else: + url = "http://purl.obolibrary.org/obo/hp.obo" + r = requests.get(url, allow_redirects=True) + with open(obo_path, 'wb') as fh: + fh.write(r.content) + if os.path.isfile(obo_path): + config['hpo']['obo'] = obo_path + else: + logger.critical("Unable to download hp.obo from ", url) + config['data'] = {} spellcheck_vocab_path = os.path.join(os.path.dirname(__file__), 'data/spellcheck_vocab_upd032020.json') config['data']['spellcheck_vocab'] = spellcheck_vocab_path diff --git a/txt2hpo/extract.py b/txt2hpo/extract.py index 6ce8218..dd6304f 100644 --- a/txt2hpo/extract.py +++ b/txt2hpo/extract.py @@ -11,7 +11,7 @@ from txt2hpo.nlp import st from txt2hpo.data import load_model from txt2hpo.build_tree import search_tree, build_search_tree -from txt2hpo.util import remove_key +from txt2hpo.util import remove_key, non_phenos class Data(object): @@ -29,8 +29,11 @@ def add(self,entry): def remove(self, item): self.entries.remove(item) - def remove_tagged(self, tag, state=True): - to_remove = [entry for entry in self.entries if entry[tag] is state] + def remove_tagged(self, tag, state=True, status=True): + if status is True: + to_remove = [entry for entry in self.entries if entry[tag] == state] + else: + to_remove = [entry for entry in self.entries if entry[tag] != state] for element in to_remove: self.remove(element) @@ -46,13 +49,24 @@ def detect_negation(self): entry['matched_words'] = [] entry['is_negated'] = True if set(entry['negated']).intersection(set(entry['matched_words'])) else False + def label_terms(self): + for entry in self.entries: + for hpid in entry['hpid']: + if hpid in non_phenos: + entry['type'] = non_phenos[hpid] + else: + entry['type'] = 'phenotype' + + def remove_non_phenos(self): + self.remove_tagged('type', state='phenotype', status=False) + def remove_negated(self): self.detect_negation() self.remove_tagged('is_negated') def remove_overlapping(self): self._mark_overlapping() - self.remove_tagged('is_longest', False) + self.remove_tagged('is_longest', state=False) def _mark_overlapping(self): """ @@ -126,6 +140,7 @@ def entries_sans_context(self): result = remove_key(result, 'context') result = remove_key(result, 'matched_tokens') result = remove_key(result, 'is_longest') + result = remove_key(result, 'type') return result @property @@ -157,7 +172,8 @@ def __init__(self, correct_spelling=True, model=None, custom_synonyms=None, negation_language="en", - chunk_by='phrase' + chunk_by='phrase', + phenotypes_only=True, ): self.correct_spelling = correct_spelling @@ -169,6 +185,7 @@ def __init__(self, correct_spelling=True, self.context_window = context_window self.negation_model = nlp_model(negation_language=negation_language) self.chunk_by = chunk_by + self.phenotypes_only = phenotypes_only if custom_synonyms: self.search_tree = build_search_tree(custom_synonyms=custom_synonyms) else: @@ -243,6 +260,10 @@ def hpo(self, text): if self.remove_overlapping: extracted_terms.remove_overlapping() + extracted_terms.label_terms() + if self.phenotypes_only: + extracted_terms.remove_non_phenos() + return extracted_terms def find_hpo_terms(self, phen_groups, stemmed_tokens, tokens, base_index): diff --git a/txt2hpo/nlp.py b/txt2hpo/nlp.py index 0d121a1..45a1a3f 100644 --- a/txt2hpo/nlp.py +++ b/txt2hpo/nlp.py @@ -32,6 +32,7 @@ def nlp_model(negation_language="en"): return nlp + try: import en_core_sci_sm nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner", "lemmatizer"]) @@ -53,7 +54,7 @@ def nlp_model(negation_language="en"): # these are used in hpo as part of phenotype definition, should block from filtering remove_from_stops = "first second third fourth fifth under over front back behind ca above below without no not " -remove_from_stops += "out up side right left more less during than take move full few all to i" +remove_from_stops += "out up side right left more less during than take move full few all to i " for not_a_stop in remove_from_stops.split(" "): nlp_sans_ner.vocab[not_a_stop].is_stop = False diff --git a/txt2hpo/util.py b/txt2hpo/util.py index 84712db..d2e28f1 100644 --- a/txt2hpo/util.py +++ b/txt2hpo/util.py @@ -1,24 +1,43 @@ import pandas as pd import math -from phenopy.config import config as phenopy_config -from phenopy.build_hpo import generate_annotated_hpo_network - +import obonet +import re import sys import subprocess import os - - -obo_file = phenopy_config.get('hpo', 'obo_file') - -disease_to_phenotype_file = phenopy_config.get('hpo', 'disease_to_phenotype_file') - -hpo_network, alt2prim, disease_records = \ - generate_annotated_hpo_network(obo_file, - disease_to_phenotype_file, - annotations_file=None, - ages_distribution_file=None - ) +import networkx as nx +from txt2hpo.config import config + +# hpo_network = obonet.read_obo(obo_file) + +obo_file = config.get('hpo', 'obo') + +hpo_network = obonet.read_obo(obo_file) +for node_id, data in hpo_network.nodes(data=True): + # clean synonyms + synonyms = [] + if 'synonym' in data: + for synonym in data['synonym']: + synonyms.append(synonym) + hpo_network.nodes[node_id]['synonyms'] = re.findall(r'"(.*?)"', ','.join(synonyms)) + +# roots for non-phenotype nodes +non_phenotypes = { + 'mortality_aging': 'HP:0040006', + 'mode_of_inheritance': 'HP:0000005', + 'clinical_modifier': 'HP:0012823', + 'frequency': 'HP:0040279', + 'clinical_course': 'HP:0031797', +} + +non_phenos = {} +# remove non-phenotype branches +for name, hpo_id in non_phenotypes.items(): + if hpo_id in hpo_network.nodes: + children = nx.ancestors(hpo_network, hpo_id) + for hpid in [hpo_id] + list(children): + non_phenos[hpid] = name def group_pairs(phenotype_pairs): From 050744e5cb2464aaaa16e32c7db42cebe2323c81 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Fri, 5 Mar 2021 17:47:41 -0500 Subject: [PATCH 02/10] remove phenopy from summarize --- txt2hpo/summarize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/txt2hpo/summarize.py b/txt2hpo/summarize.py index 8ea7cdf..e86de4a 100644 --- a/txt2hpo/summarize.py +++ b/txt2hpo/summarize.py @@ -4,10 +4,12 @@ import numpy as np from txt2hpo.util import group_pairs, summarize_tuples, df_from_tuples from txt2hpo.config import logger -from phenopy.util import half_product from functools import reduce +def half_product(): + raise NotImplemented + def phenotype_distance(extracted_hpos): """ Given the return from hpo, find the normalized distance between all terms in the document. From 817d01e144a42022c7131eabc5091b4456204652 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Fri, 5 Mar 2021 17:50:11 -0500 Subject: [PATCH 03/10] add half product --- txt2hpo/summarize.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/txt2hpo/summarize.py b/txt2hpo/summarize.py index e86de4a..41f71fd 100644 --- a/txt2hpo/summarize.py +++ b/txt2hpo/summarize.py @@ -7,8 +7,12 @@ from functools import reduce -def half_product(): - raise NotImplemented +def half_product(num_rows, num_columns): + """yield combinations and the diagonal""" + for m in range(0, num_rows): + for n in range(m, num_columns): + yield (m, n) + def phenotype_distance(extracted_hpos): """ From a3051fdffdb1082437155b43622630232bf0d914 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Sat, 6 Mar 2021 10:28:09 -0500 Subject: [PATCH 04/10] Update pythonpackage.yml --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index d0a2b97..5907821 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -20,7 +20,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pipenv + pip install --upgrade pipenv pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt pip install -r requirements_temp.txt python setup.py develop From 5f97237d2aac31fd77589b236278bb2848c9bd47 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Mon, 8 Mar 2021 12:19:20 -0500 Subject: [PATCH 05/10] Update pythonpackage.yml --- .github/workflows/pythonpackage.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 5907821..0322e05 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -20,10 +20,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install --upgrade pipenv - pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt - pip install -r requirements_temp.txt - python setup.py develop + #pip install --upgrade pipenv + #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt + #pip install -r requirements_temp.txt + #python setup.py develop pip install -e . From eb01ad94afb0770de73699c2734202aa68b1391c Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Mon, 8 Mar 2021 12:22:14 -0500 Subject: [PATCH 06/10] Update pythonpackage.yml --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 0322e05..7940ea7 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -20,7 +20,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - #pip install --upgrade pipenv + pip install --upgrade pipenv #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt #pip install -r requirements_temp.txt #python setup.py develop From c2750b76a5d8842d6f518b2513c87eb7a6605b06 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Mon, 8 Mar 2021 12:28:35 -0500 Subject: [PATCH 07/10] Update pythonpackage.yml --- .github/workflows/pythonpackage.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 7940ea7..1bf503a 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -21,6 +21,7 @@ jobs: run: | python -m pip install --upgrade pip pip install --upgrade pipenv + pipenv install pytest flake8 pytest-cov #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt #pip install -r requirements_temp.txt #python setup.py develop From f7efe71466713296e9683045b4938b835072c5f0 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Mon, 8 Mar 2021 12:33:43 -0500 Subject: [PATCH 08/10] Update pythonpackage.yml --- .github/workflows/pythonpackage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 1bf503a..0c4fe23 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -24,8 +24,8 @@ jobs: pipenv install pytest flake8 pytest-cov #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt #pip install -r requirements_temp.txt - #python setup.py develop - pip install -e . + python setup.py develop + #pip install -e . - name: Test with unittest From 45647af2f870e0222a6e2dc5136b5ca3b2c65e13 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Mon, 8 Mar 2021 12:41:34 -0500 Subject: [PATCH 09/10] Update pythonpackage.yml --- .github/workflows/pythonpackage.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 0c4fe23..292fa26 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -20,8 +20,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install --upgrade pipenv - pipenv install pytest flake8 pytest-cov + #pip install --upgrade pipenv + pip install pytest flake8 pytest-cov #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt #pip install -r requirements_temp.txt python setup.py develop @@ -30,4 +30,4 @@ jobs: - name: Test with unittest run: | - pipenv run pytest tests + pytest tests From f60e3f4bfdb68cdca694ba78a38da71cc9b42832 Mon Sep 17 00:00:00 2001 From: Vlad Gainullin Date: Mon, 8 Mar 2021 12:57:59 -0500 Subject: [PATCH 10/10] update actions --- .github/workflows/pythonpackage.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 292fa26..e29f8bc 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -20,12 +20,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - #pip install --upgrade pipenv pip install pytest flake8 pytest-cov - #pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt - #pip install -r requirements_temp.txt python setup.py develop - #pip install -e . - name: Test with unittest