Merge pull request #48 from GeneDx/obo
remove phenopy as dependency
vgainullin authored Mar 8, 2021
2 parents c4f7c09 + f60e3f4 commit 9cc1a9c
Showing 9 changed files with 109 additions and 36 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/pythonpackage.yml
@@ -20,13 +20,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pipenv
pipenv lock --keep-outdated --dev --requirements > requirements_temp.txt
pip install -r requirements_temp.txt
pip install pytest flake8 pytest-cov
python setup.py develop
pip install -e .
- name: Test with unittest
run: |
pipenv run pytest tests
pytest tests
2 changes: 0 additions & 2 deletions Pipfile
@@ -13,8 +13,6 @@ nltk = "==3.4.5"
spacy = "==2.2.4"
scispacy = "==0.2.4"
negspacy = "==0.1.9"
phenopy = {git = "https://github.com/GeneDx/phenopy.git", editable = true}
networkx = "*"
gensim = "==3.8.1"

[requires]
2 changes: 2 additions & 0 deletions setup.py
@@ -30,6 +30,8 @@
'scispacy==0.2.4',
'negspacy==0.1.9',
'networkx',
'obonet',
'requests',
'gensim==3.8.1',


26 changes: 19 additions & 7 deletions tests/test_extract.py
@@ -4,7 +4,7 @@
from txt2hpo.extract import Extractor, Data, group_sequence
from txt2hpo.data import load_model
from tests.test_cases import *
from txt2hpo.util import hpo_network
from txt2hpo.util import hpo_network, non_phenos


class ExtractPhenotypesTestCase(unittest.TestCase):
@@ -152,7 +152,8 @@ def test_hpo_big_text_spellcheck_on(self):
def test_hpo_big_text_spellcheck_off(self):
# test parsing a page
extract = Extractor(max_neighbors=2, correct_spelling=False, remove_overlapping=True)
self.assertEqual(extract.hpo(test_case11_text).n_entries, 7)
res = extract.hpo(test_case11_text)
self.assertEqual(res.n_entries, 7)

def test_hpo_big_text_spellcheck_off_max3(self):
# test parsing a page
@@ -319,6 +320,12 @@ def test_extract_json_property(self):
resp = extract.hpo("Wide gait and a wide mouth")
self.assertEqual(truth, resp.json)

def test_extract_full_context(self):
extract = Extractor(max_neighbors=2, correct_spelling=False, phenotypes_only=False)
resp = extract.hpo("X linked")
self.assertEqual(resp.entries[0]['hpid'][0], 'HP:0001417')
self.assertEqual(resp.entries[0]['type'], 'mode_of_inheritance')

def test_extract_without_negated(self):

# negation should not apply if negation is part of matched string
@@ -388,8 +395,12 @@ def test_multiple_matches(self):
resp = extract.hpo("Coloboma, microphthalmia, macrocephaly, ear pit.")
self.assertEqual(set(resp.hpids), set(['HP:0000589', 'HP:0004467', 'HP:0000568', 'HP:0000256']))

def test_handing_term_hyphenation(self):
extract = Extractor(correct_spelling=False, remove_overlapping=True, resolve_conflicts=True, max_neighbors=3)
def test_handling_term_hyphenation(self):
extract = Extractor(correct_spelling=False,
remove_overlapping=True,
resolve_conflicts=True,
max_neighbors=2,
phenotypes_only=False)
hyphenated_phenos = \
[
(hpo_network.nodes()[x]['name'], x) for x in hpo_network.nodes() \
@@ -401,10 +412,11 @@ def test_handing_term_hyphenation(self):

]
# Phenotypes where word-order is important is a limitation of current parsing method
known_bugs = ['HP:0000510', 'HP:0030932']
long_phenos = ['HP:0011654', 'HP:0410303']
known_bugs = ['HP:0000510', 'HP:0030932', 'HP:0001215']
long_phenos = ['HP:0011654', 'HP:0410303', 'HP:0000654', 'HP:0000847', 'HP:0000864', 'HP:0000877', 'HP:0001074']
hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in known_bugs + long_phenos]

hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in non_phenos]
hyphenated_phenos = hyphenated_phenos[:10]
for test in hyphenated_phenos:
# current version is not expected to extract very long phenotypes
hpids = extract.hpo(test[0]).hpids
17 changes: 17 additions & 0 deletions txt2hpo/config.py
@@ -1,6 +1,8 @@
import configparser
import logging
import os
import requests

from gensim.models import KeyedVectors
from txt2hpo import __project__, __version__

@@ -58,6 +60,21 @@
wv.save(d2v_vw_path)
config['models']['doc2vec'] = d2v_vw_path

config['hpo'] = {}
obo_path = os.path.join(data_directory, 'hp.obo')

if os.path.isfile(obo_path):
config['hpo']['obo'] = obo_path
else:
url = "http://purl.obolibrary.org/obo/hp.obo"
r = requests.get(url, allow_redirects=True)
with open(obo_path, 'wb') as fh:
fh.write(r.content)
if os.path.isfile(obo_path):
config['hpo']['obo'] = obo_path
else:
logger.critical("Unable to download hp.obo from %s", url)

config['data'] = {}
spellcheck_vocab_path = os.path.join(os.path.dirname(__file__), 'data/spellcheck_vocab_upd032020.json')
config['data']['spellcheck_vocab'] = spellcheck_vocab_path
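The config change above downloads hp.obo on first run and reuses the cached copy afterwards. A minimal standalone sketch of that pattern; the `ensure_obo` helper and its signature are illustrative, not part of txt2hpo:

```python
import os
import requests

def ensure_obo(data_directory, url="http://purl.obolibrary.org/obo/hp.obo"):
    """Hypothetical helper: fetch hp.obo once, then reuse the local copy."""
    obo_path = os.path.join(data_directory, 'hp.obo')
    if not os.path.isfile(obo_path):
        r = requests.get(url, allow_redirects=True)
        r.raise_for_status()  # fail loudly rather than caching an error page
        with open(obo_path, 'wb') as fh:
            fh.write(r.content)
    return obo_path
```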
31 changes: 26 additions & 5 deletions txt2hpo/extract.py
@@ -11,7 +11,7 @@
from txt2hpo.nlp import st
from txt2hpo.data import load_model
from txt2hpo.build_tree import search_tree, build_search_tree
from txt2hpo.util import remove_key
from txt2hpo.util import remove_key, non_phenos


class Data(object):
@@ -29,8 +29,11 @@ def add(self,entry):
def remove(self, item):
self.entries.remove(item)

def remove_tagged(self, tag, state=True):
to_remove = [entry for entry in self.entries if entry[tag] is state]
def remove_tagged(self, tag, state=True, status=True):
if status is True:
to_remove = [entry for entry in self.entries if entry[tag] == state]
else:
to_remove = [entry for entry in self.entries if entry[tag] != state]
for element in to_remove:
self.remove(element)

@@ -46,13 +49,24 @@ def detect_negation(self):
entry['matched_words'] = []
entry['is_negated'] = True if set(entry['negated']).intersection(set(entry['matched_words'])) else False

def label_terms(self):
for entry in self.entries:
for hpid in entry['hpid']:
if hpid in non_phenos:
entry['type'] = non_phenos[hpid]
else:
entry['type'] = 'phenotype'

def remove_non_phenos(self):
self.remove_tagged('type', state='phenotype', status=False)

def remove_negated(self):
self.detect_negation()
self.remove_tagged('is_negated')

def remove_overlapping(self):
self._mark_overlapping()
self.remove_tagged('is_longest', False)
self.remove_tagged('is_longest', state=False)

def _mark_overlapping(self):
"""
@@ -126,6 +140,7 @@ def entries_sans_context(self):
result = remove_key(result, 'context')
result = remove_key(result, 'matched_tokens')
result = remove_key(result, 'is_longest')
result = remove_key(result, 'type')
return result

@property
@@ -157,7 +172,8 @@ def __init__(self, correct_spelling=True,
model=None,
custom_synonyms=None,
negation_language="en",
chunk_by='phrase'
chunk_by='phrase',
phenotypes_only=True,
):

self.correct_spelling = correct_spelling
@@ -169,6 +185,7 @@ def __init__(self, correct_spelling=True,
self.context_window = context_window
self.negation_model = nlp_model(negation_language=negation_language)
self.chunk_by = chunk_by
self.phenotypes_only = phenotypes_only
if custom_synonyms:
self.search_tree = build_search_tree(custom_synonyms=custom_synonyms)
else:
@@ -243,6 +260,10 @@ def hpo(self, text):
if self.remove_overlapping:
extracted_terms.remove_overlapping()

extracted_terms.label_terms()
if self.phenotypes_only:
extracted_terms.remove_non_phenos()

return extracted_terms

def find_hpo_terms(self, phen_groups, stemmed_tokens, tokens, base_index):
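Combined with `label_terms()`, the new `phenotypes_only` flag controls whether non-phenotype terms survive extraction. A usage sketch; the input and expected values mirror `test_extract_full_context` above:

```python
from txt2hpo.extract import Extractor

# phenotypes_only=False keeps non-phenotype terms, each labeled by label_terms()
extract = Extractor(correct_spelling=False, phenotypes_only=False)
resp = extract.hpo("X linked")
resp.entries[0]['hpid']  # ['HP:0001417']
resp.entries[0]['type']  # 'mode_of_inheritance'

# With the default phenotypes_only=True, remove_non_phenos() drops every
# entry whose 'type' is not 'phenotype'.
```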
3 changes: 2 additions & 1 deletion txt2hpo/nlp.py
@@ -32,6 +32,7 @@ def nlp_model(negation_language="en"):

return nlp


try:
import en_core_sci_sm
nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner", "lemmatizer"])
@@ -53,7 +54,7 @@ def nlp_model(negation_language="en"):

# these are used in hpo as part of phenotype definition, should block from filtering
remove_from_stops = "first second third fourth fifth under over front back behind ca above below without no not "
remove_from_stops += "out up side right left more less during than take move full few all to i"
remove_from_stops += "out up side right left more less during than take move full few all to i "

for not_a_stop in remove_from_stops.split(" "):
nlp_sans_ner.vocab[not_a_stop].is_stop = False
8 changes: 7 additions & 1 deletion txt2hpo/summarize.py
@@ -4,10 +4,16 @@
import numpy as np
from txt2hpo.util import group_pairs, summarize_tuples, df_from_tuples
from txt2hpo.config import logger
from phenopy.util import half_product
from functools import reduce


def half_product(num_rows, num_columns):
"""yield combinations and the diagonal"""
for m in range(0, num_rows):
for n in range(m, num_columns):
yield (m, n)


def phenotype_distance(extracted_hpos):
"""
Given the return from hpo, find the normalized distance between all terms in the document.
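The inlined `half_product` generator replaces the helper previously imported from `phenopy.util`; it yields the upper triangle of an index grid, diagonal included. For example:

```python
list(half_product(3, 3))
# [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]
```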
49 changes: 34 additions & 15 deletions txt2hpo/util.py
@@ -1,24 +1,43 @@
import pandas as pd

import math
from phenopy.config import config as phenopy_config
from phenopy.build_hpo import generate_annotated_hpo_network

import obonet
import re
import sys
import subprocess
import os


obo_file = phenopy_config.get('hpo', 'obo_file')

disease_to_phenotype_file = phenopy_config.get('hpo', 'disease_to_phenotype_file')

hpo_network, alt2prim, disease_records = \
generate_annotated_hpo_network(obo_file,
disease_to_phenotype_file,
annotations_file=None,
ages_distribution_file=None
)
import networkx as nx
from txt2hpo.config import config

# hpo_network = obonet.read_obo(obo_file)

obo_file = config.get('hpo', 'obo')

hpo_network = obonet.read_obo(obo_file)
for node_id, data in hpo_network.nodes(data=True):
# clean synonyms
synonyms = []
if 'synonym' in data:
for synonym in data['synonym']:
synonyms.append(synonym)
hpo_network.nodes[node_id]['synonyms'] = re.findall(r'"(.*?)"', ','.join(synonyms))

# roots for non-phenotype nodes
non_phenotypes = {
'mortality_aging': 'HP:0040006',
'mode_of_inheritance': 'HP:0000005',
'clinical_modifier': 'HP:0012823',
'frequency': 'HP:0040279',
'clinical_course': 'HP:0031797',
}

non_phenos = {}
# remove non-phenotype branches
for name, hpo_id in non_phenotypes.items():
if hpo_id in hpo_network.nodes:
children = nx.ancestors(hpo_network, hpo_id)
for hpid in [hpo_id] + list(children):
non_phenos[hpid] = name


def group_pairs(phenotype_pairs):
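With phenopy gone, the HPO graph now comes straight from `obonet`, and `non_phenos` maps every term under a non-phenotype root to its branch name. A quick sketch of querying the result; term names are as published in the HPO:

```python
from txt2hpo.util import hpo_network, non_phenos

hpo_network.nodes['HP:0000005']['name']  # 'Mode of inheritance'
non_phenos['HP:0001417']                 # 'mode_of_inheritance' (X-linked inheritance)

# Note: obonet edges point from child term to parent term, so
# nx.ancestors(hpo_network, root) returns the root's ontology descendants,
# which is why it is used above to collect whole non-phenotype branches.
```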
