From 4c17be0b84785be5e37c043de2be0b922e37a032 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Wed, 27 Mar 2024 00:35:58 -0400 Subject: [PATCH] fixing linting and test helpers --- README.md | 134 ++++++++++++++++++-------------------- lint.sh | 2 +- mhcgnomes/dataframe.py | 3 +- mhcgnomes/function_api.py | 2 +- tests/common.py | 30 ++++++++- 5 files changed, 96 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index 31710cb..9411729 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,12 @@ - - Build Status - +[![Tests](https://github.com/pirl-unc/mhcgnomes/actions/workflows/tests.yml/badge.svg)](https://github.com/pirl-unc/mhcgnomes/actions/workflows/tests.yml) - Coverage Status +Coverage Status - PyPI +PyPI - -![](https://raw.githubusercontent.com/til-unc/mhcgnomes/main/gnome-red-text.png) +![](https://raw.githubusercontent.com/til-unc/mhcgnomes/main/gnome-red-text.png) # mhcgnomes: Parsing MHC nomenclature in the wild @@ -23,10 +20,10 @@ aims to correctly parse every name in [IEDB](http://www.iedb.org/), [IMGT/HLA](h In [1]: mhcgnomes.parse("HLA-A0201") Out[1]: Allele( gene=Gene( - species=Species(name="Homo sapiens', prefix="HLA"), - name="A"), - allele_fields=("02", "01"), - annotations=(), + species=Species(name="Homo sapiens', prefix="HLA"), + name="A"), + allele_fields=("02", "01"), + annotations=(), mutations=()) In [2]: mhcgnomes.parse("HLA-A0201").to_string() @@ -43,40 +40,38 @@ Despite the valiant efforts of groups such as the [Comparative MHC Nomenclature For example, these all refer to the same MHC protein sequence: -* "HLA-A\*02:01" -* "HLA-A02:01" -* "HLA-A:02:01" -* "HLA-A0201" - +- "HLA-A\*02:01" +- "HLA-A02:01" +- "HLA-A:02:01" +- "HLA-A0201" Additionally, for human alleles, the species prefix is often omitted: -* "A\*02:01" -* "A\*0201" -* "A02:01" -* "A:02:01" -* "A0201" - +- "A\*02:01" +- "A\*0201" +- "A02:01" +- "A:02:01" +- "A0201" ### Annotations -Sometimes, alleles are bundled with modifier suffixes which specify +Sometimes, alleles are bundled with modifier suffixes which specify the functionality or abundance of the MHC. Here's an example with an allele which is secreted instead of membrane-bound: -* "HLA-A\*02:01:01S" +- "HLA-A\*02:01:01S" -These are collected in the `annotations` field of an +These are collected in the `annotations` field of an [`Allele`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele.py) result. ### Mutations -MHC proteins are sometimes described in terms of mutations to a known allele. +MHC proteins are sometimes described in terms of mutations to a known allele. -* "HLA-B\*08:01 N80I mutant" +- "HLA-B\*08:01 N80I mutant" -These mutations are collected in the `mutations` field of an +These mutations are collected in the `mutations` field of an [`Allele`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele.py) result. ### Beyond humans @@ -85,78 +80,75 @@ To make things worse, several model organisms (like mice and rats) use archaic naming systems, where there is no notion of allele groups or four/six/eight digit alleles but every allele is simply given a name, such as: -* "H2-Kk" -* "RT1-9.5f" - +- "H2-Kk" +- "RT1-9.5f" In the above example "H2"/"RT1" correspond to species, "K"/"9.5" are the gene names and "k"/"f" are the allele names. -To make these even worse, the name of a species is subject to variation (e.g. "H2" vs. "H-2") as well as drift over time (e.g. ChLA -> MhcPatr -> Patr). +To make these even worse, the name of a species is subject to variation (e.g. "H2" vs. "H-2") as well as drift over time (e.g. ChLA -> MhcPatr -> Patr). ### Serotypes, haplotypes, and other named entitites Besides alleles are also other named MHC related entities you'll encounter in immunological data. Closely related to alleles are serotypes, which effectively denote a grouping of alleles that are all recognized by the same antibody: -* "HLA-A2" -* "A2" +- "HLA-A2" +- "A2" -In many datasets the exact allele is not known but an experiment might note the genetic background of a model animal, resulting in loose haplotype restrictions such as: +In many datasets the exact allele is not known but an experiment might note the genetic background of a model animal, resulting in loose haplotype restrictions such as: -* "H2-k class I" +- "H2-k class I" -Yes, good luck disambiguating "H2-k" the haplotype from "H2-K" the gene, especially since capitalization is not stable enough to be relied on for parsing. +Yes, good luck disambiguating "H2-k" the haplotype from "H2-K" the gene, especially since capitalization is not stable enough to be relied on for parsing. -In some cases immunological data comes only with a denoted species (e.g. "mouse"), a gene (e.g. "HLA-A"), or an MHC class ("human class I"). MHCgnomes has a structured representation for all of these cases and more. +In some cases immunological data comes only with a denoted species (e.g. "mouse"), a gene (e.g. "HLA-A"), or an MHC class ("human class I"). MHCgnomes has a structured representation for all of these cases and more. ## Parsing strategy -It is a fool's errand to curate *all* possible MHC allele names since that list grows daily as the MHC loci of more people (and non-human animals) are sequenced. Instead, MHCgnomes contains an ontology of curated species and genes and then attempts to parse any given string into a multiple candidates of the following types: - -* [`Species`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/species.py) -* [`Gene`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/gene.py) -* [`Allele`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele.py) -* [`AlleleWithoutGene`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele_without_gene.py) -* [`Class2Pair`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/class2_pair.py) -* [`Class2Locus`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/class2_locus.py) -* [`MhcClass`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/mhc_class.py) -* [`Haplotype`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/haplotype.py) -* [`Serotype`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/serotype.py) - - -The set of candidate interpretations for each string are then -ranked according to heuristic rules. For example, a string will be -preferentially interpreted as an [`Allele`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele.py) rather +It is a fool's errand to curate _all_ possible MHC allele names since that list grows daily as the MHC loci of more people (and non-human animals) are sequenced. Instead, MHCgnomes contains an ontology of curated species and genes and then attempts to parse any given string into a multiple candidates of the following types: + +- [`Species`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/species.py) +- [`Gene`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/gene.py) +- [`Allele`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele.py) +- [`AlleleWithoutGene`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele_without_gene.py) +- [`Class2Pair`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/class2_pair.py) +- [`Class2Locus`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/class2_locus.py) +- [`MhcClass`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/mhc_class.py) +- [`Haplotype`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/haplotype.py) +- [`Serotype`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/serotype.py) + +The set of candidate interpretations for each string are then +ranked according to heuristic rules. For example, a string will be +preferentially interpreted as an [`Allele`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/allele.py) rather than a [`Serotype`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/serotype.py) -or [`Haplotype`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/haplotype.py). - +or [`Haplotype`](https://github.com/til-unc/mhcgnomes/blob/main/mhcgnomes/haplotype.py). ## How many digits per field? Originally alleles for many genes were numbered with two digits: -* "HLA-MICB\*01" +- "HLA-MICB\*01" But as the number of identified alleles increased, the number of -fields specifying a distinct protein increase to two. This became +fields specifying a distinct protein increase to two. This became conventionally called a "four digit" format, since each field has two -digits. Yet, as the number of identified alleles continued to increase, then -the number of digits per field has often increased from two to three: +digits. Yet, as the number of identified alleles continued to increase, then +the number of digits per field has often increased from two to three: -* "MICB\*002:01" -* "HLA-A00201" -* "A:002:01" -* "A\*00201" +- "MICB\*002:01" +- "HLA-A00201" +- "A:002:01" +- "A\*00201" These are not always currently treated as equivalent to allele strings with two digits in their first field, but that feature is in the works. -However, if databases such as [IPD-MHC](https://www.ebi.ac.uk/ipd/mhc/) or [IMGT-HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) recorded an older form of an allele, then MHCgnomes can optionally map it onto the modern version (including capturing differences in numbers of digits per field). +However, if databases such as [IPD-MHC](https://www.ebi.ac.uk/ipd/mhc/) or [IMGT-HLA](https://www.ebi.ac.uk/ipd/imgt/hla/) recorded an older form of an allele, then MHCgnomes can optionally map it onto the modern version (including capturing differences in numbers of digits per field). ## References -* [IPD-MHC: nomenclature requirements for the non-human major histocompatibility complex in the next-generation sequencing era](https://link.springer.com/article/10.1007%2Fs00251-018-1072-4) -* [Comparative MHC nomenclature: report from the ISAG/IUIS-VIC committee 2018]() -* [ISAG/IUIS-VIC Comparative MHC Nomenclature -Committee report, 2005](https://link.springer.com/content/pdf/10.1007%2Fs00251-005-0071-4.pdf) -* [Marsupial MHC Class II β Genes Are Not Orthologous to the Eutherian β Gene Families]() -* [Nomenclature for factors of the SLA system, update 2008](https://www.ncbi.nlm.nih.gov/pubmed/19317739) +- [IPD-MHC: nomenclature requirements for the non-human major histocompatibility complex in the next-generation sequencing era](https://link.springer.com/article/10.1007%2Fs00251-018-1072-4) +- [Comparative MHC nomenclature: report from the ISAG/IUIS-VIC committee 2018]() +- [ISAG/IUIS-VIC Comparative MHC Nomenclature + Committee report, 2005](https://link.springer.com/content/pdf/10.1007%2Fs00251-005-0071-4.pdf) +- [Marsupial MHC Class II β Genes Are Not Orthologous to the Eutherian β Gene Families]() +- [Nomenclature for factors of the SLA system, update 2008](https://www.ncbi.nlm.nih.gov/pubmed/19317739) diff --git a/lint.sh b/lint.sh index 60d83b6..d521fb6 100755 --- a/lint.sh +++ b/lint.sh @@ -7,7 +7,7 @@ set -o errexit # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and # - https://bitbucket.org/logilab/pylint/issues/58 -find mhcgnomes test -name '*.py' \ +find mhcgnomes tests -name '*.py' \ | xargs pylint \ --errors-only \ --disable=unsubscriptable-object,not-an-iterable,no-member diff --git a/mhcgnomes/dataframe.py b/mhcgnomes/dataframe.py index 7007e38..fed85b7 100644 --- a/mhcgnomes/dataframe.py +++ b/mhcgnomes/dataframe.py @@ -12,7 +12,6 @@ import pandas as pd -from .function_api import parse def dataframe_from_parsed_objects(parsed_objects): records = [ @@ -23,5 +22,7 @@ def dataframe_from_parsed_objects(parsed_objects): def dataframe_from_string_list(names): + from .function_api import parse + parsed_objects = [parse(name) for name in names] return dataframe_from_parsed_objects(parsed_objects) diff --git a/mhcgnomes/function_api.py b/mhcgnomes/function_api.py index 79a400c..046b906 100644 --- a/mhcgnomes/function_api.py +++ b/mhcgnomes/function_api.py @@ -11,8 +11,8 @@ # limitations under the License. from .common import cache -from .parser import Parser from .parser import ( + Parser, DEFAULT_SPECIES_PREFIX, USE_ALLELE_ALIASES, GENE_SEPS, diff --git a/tests/common.py b/tests/common.py index 3dae4b2..f956a6e 100644 --- a/tests/common.py +++ b/tests/common.py @@ -10,6 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from functools import wraps + def ok_(a, s=None): if s is None: assert a @@ -56,4 +58,30 @@ def almost_eq_(a, b, tol=1e-6, s=None): if s is None: assert abs(a - b) < tol else: - assert abs(a - b) < tol, s \ No newline at end of file + assert abs(a - b) < tol, s + +class assert_raises: + def __init__(self, *exception_types): + self.exception_types = exception_types + + def __enter__(self): + pass + + def to_string(self): + return " or ".join(["%s" % e for e in self.exception_types]) + + def __exit__(self, type, value, traceback): + if type is None: + raise AssertionError("Expected exception %s not raised" % self.to_string()) + if type not in self.exception_types: + raise AssertionError("Expected exception %s, got %s" % (self.to_string(), type)) + return True + +def raises(*exception_types): + def decorator(f): + @wraps(f) + def wrapper(*args, **kwargs): + with assert_raises(*exception_types): + f(*args, **kwargs) + return wrapper + return decorator \ No newline at end of file