diff --git a/deeprank/domain/amino_acid.py b/deeprank/domain/amino_acid.py index eff2b95..9112413 100644 --- a/deeprank/domain/amino_acid.py +++ b/deeprank/domain/amino_acid.py @@ -1,48 +1,332 @@ from deeprank.models.amino_acid import AminoAcid +from deeprank.models.polarity import Polarity -alanine = AminoAcid("Alanine", "ALA", "A") -arginine = AminoAcid("Arginine", "ARG", "R") -asparagine = AminoAcid("Asparagine", "ASN", "N") -aspartate = AminoAcid("Aspartate", "ASP", "D") -protonated_aspartate = AminoAcid("Protonated Aspartate", "ASH", "D") -cysteine = AminoAcid("Cysteine", "CYS", "C") -cysteine_metal = AminoAcid("Cysteine Metal Ligand", "CYM", "C") -cysteine_iron = AminoAcid("Cysteine Iron Ligand", "CFE", "C") -cysteine_zinc = AminoAcid("Cysteine Zinc Ligand", "CYF", "C") -cysteine_phosphate = AminoAcid("Cysteine Phosphate", "CSP", "C") -glutamate = AminoAcid("Glutamate", "GLU", "E") -protonated_glutamate = AminoAcid("Protonated Glutamate", "GLH", "E") -glutamine = AminoAcid("Glutamine", "GLN", "Q") -glycine = AminoAcid("Glycine", "GLY", "G") -histidine = AminoAcid("Histidine", "HIS", "H") -histidine_phophate = AminoAcid("Histidine Phosphate", "NEP", "H") -isoleucine = AminoAcid("Isoleucine", "ILE", "I") -leucine = AminoAcid("Leucine", "LEU", "L") -lysine = AminoAcid("Lysine", "LYS", "K") -methionine = AminoAcid("Methionine", "MET", "M") -phenylalanine = AminoAcid("Phenylalanine", "PHE", "F") -proline = AminoAcid("Proline", "PRO", "P") -serine = AminoAcid("Serine", "SER", "S") -threonine = AminoAcid("Threonine", "THR", "T") -tryptophan = AminoAcid("Tryptophan", "TRP", "W") -tyrosine = AminoAcid("Tyrosine", "TYR", "Y") -valine = AminoAcid("Valine", "VAL", "V") -selenocysteine = AminoAcid("Selenocysteine", "SEC", "U") -pyrrolysine = AminoAcid("Pyrrolysine", "PYL", "O") -alysine = AminoAcid("Alysine", "ALY", "K") -methyllysine = AminoAcid("Methyllysine", "MLZ", "K") -dimethyllysine = AminoAcid("Dimethyllysine", "MLY", "K") -trimethyllysine = AminoAcid("Trimethyllysine", "3ML", "K") -epsilon_mthionine = AminoAcid("Epsilon Methionine", "MSE", "M") -hydroxy_proline = AminoAcid("Hydroxy Proline", "HYP", "P") -serine_phosphate = AminoAcid("Serine Phosphate", "SEP", "S") -threonine_phosphate = AminoAcid("Threonine Phosphate", "TOP", "T") -tyrosine_phosphate = AminoAcid("Tyrosine Phosphate", "TYP", "Y") -tyrosine_sulphate = AminoAcid("Tyrosine Sulphate", "TYS", "Y") -tyrosine_phosphate = AminoAcid("Tyrosine Phosphate", "PTR", "Y") -cyclohexane_alanine = AminoAcid("Cyclohexane Alanine", "CHX", "?") -unknown_amino_acid = AminoAcid("Unknown", "XXX", "X") +alanine = AminoAcid("Alanine", "ALA", "A", + charge=-0.37, + polarity=Polarity.APOLAR, + size=1, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +arginine = AminoAcid("Arginine", "ARG", "R", + charge=-1.65, + polarity=Polarity.POSITIVE_CHARGE, + size=7, + count_hydrogen_bond_donors=5, + count_hydrogen_bond_acceptors=0 +) + +asparagine = AminoAcid("Asparagine", "ASN", "N", + charge=-1.22, + polarity=Polarity.POLAR, + size=4, + count_hydrogen_bond_donors=2, + count_hydrogen_bond_acceptors=2 +) + +aspartate = AminoAcid("Aspartate", "ASP", "D", + charge=-1.37, + polarity=Polarity.NEGATIVE_CHARGE, + size=4, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=4 +) + +protonated_aspartate = AminoAcid("Protonated Aspartate", "ASH", "D", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=4, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=3 +) + +cysteine = AminoAcid("Cysteine", "CYS", "C", + charge=-0.64, + polarity=Polarity.POLAR, + size=2, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +cysteine_metal = AminoAcid("Cysteine Metal Ligand", "CYM", "C", + charge=-0.64, + polarity=Polarity.POLAR, + size=2, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +cysteine_iron = AminoAcid("Cysteine Iron Ligand", "CFE", "C", + charge=-0.64, + polarity=Polarity.POLAR, + size=2, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +cysteine_zinc = AminoAcid("Cysteine Zinc Ligand", "CYF", "C", + charge=-0.64, + polarity=Polarity.POLAR, + size=2, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +cysteine_phosphate = AminoAcid("Cysteine Phosphate", "CSP", "C", + charge=-0.64, + polarity=Polarity.POLAR, + size=2, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) +glutamate = AminoAcid("Glutamate", "GLU", "E", + charge=-1.37, + polarity=Polarity.NEGATIVE_CHARGE, + size=5, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=4, +) +protonated_glutamate = AminoAcid("Protonated Glutamate", "GLH", "E", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=4, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=3 +) + +glutamine = AminoAcid("Glutamine", "GLN", "Q", + charge=-1.22, + polarity=Polarity.POLAR, + size=5, + count_hydrogen_bond_donors=2, + count_hydrogen_bond_acceptors=2, +) + +glycine = AminoAcid("Glycine", "GLY", "G", + charge=-0.37, + polarity=Polarity.APOLAR, + size=0, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +histidine = AminoAcid("Histidine", "HIS", "H", + charge=-0.29, + polarity=Polarity.POLAR, + size=6, + count_hydrogen_bond_donors=2, + count_hydrogen_bond_acceptors=2 +) + +histidine_phosphate = AminoAcid("Histidine Phosphate", "NEP", "H", + charge=-0.29, + polarity=Polarity.POLAR, + size=6, + count_hydrogen_bond_donors=2, + count_hydrogen_bond_acceptors=2 +) + +isoleucine = AminoAcid("Isoleucine", "ILE", "I", + charge=-0.37, + polarity=Polarity.APOLAR, + size=4, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0, +) + +leucine = AminoAcid("Leucine", "LEU", "L", + charge=-0.37, + polarity=Polarity.APOLAR, + size=4, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +lysine = AminoAcid("Lysine", "LYS", "K", + charge=-0.36, + polarity=Polarity.POSITIVE_CHARGE, + size=5, + count_hydrogen_bond_donors=3, + count_hydrogen_bond_acceptors=0 +) + +methionine = AminoAcid("Methionine", "MET", "M", + charge=-0.37, + polarity=Polarity.APOLAR, + size=4, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +phenylalanine = AminoAcid("Phenylalanine", "PHE", "F", + charge=-0.37, + polarity=Polarity.APOLAR, + size=7, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +proline = AminoAcid("Proline", "PRO", "P", + charge=0.0, + polarity=Polarity.APOLAR, + size=3, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +serine = AminoAcid("Serine", "SER", "S", + charge=-0.80, + polarity=Polarity.POLAR, + size=2, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=2 +) + +threonine = AminoAcid("Threonine", "THR", "T", + charge=-0.80, + polarity=Polarity.POLAR, + size=3, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=2 +) + +tryptophan = AminoAcid("Tryptophan", "TRP", "W", + charge=-0.79, + polarity=Polarity.POLAR, + size=10, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=0 +) + +tyrosine = AminoAcid("Tyrosine", "TYR", "Y", + charge=-0.80, + polarity=Polarity.POLAR, + size=8, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=1 +) + +valine = AminoAcid("Valine", "VAL", "V", + charge=-0.37, + polarity=Polarity.APOLAR, + size=3, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +selenocysteine = AminoAcid("Selenocysteine", "SEC", "U", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=2, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=2, +) + +pyrrolysine = AminoAcid("Pyrrolysine", "PYL", "O", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=13, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=4 +) + +alysine = AminoAcid("Alysine", "ALY", "K", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=13, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=4 +) + +methyllysine = AminoAcid("Methyllysine", "MLZ", "K", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=14, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=4 +) + +dimethyllysine = AminoAcid("Dimethyllysine", "MLY", "K", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=15, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=4 +) + +trimethyllysine = AminoAcid("Trimethyllysine", "3ML", "K", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=16, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=4 +) + +epsilon_methionine = AminoAcid("Epsilon Methionine", "MSE", "M", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.APOLAR, + size=4, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +hydroxy_proline = AminoAcid("Hydroxy Proline", "HYP", "P", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.POLAR, + size=4, + count_hydrogen_bond_donors=1, + count_hydrogen_bond_acceptors=2 +) + +serine_phosphate = AminoAcid("Serine Phosphate", "SEP", "S", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.NEGATIVE_CHARGE, + size=7, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=4 +) + +threonine_phosphate = AminoAcid("Threonine Phosphate", "TOP", "T", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.NEGATIVE_CHARGE, + size=8, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=4 +) + +tyrosine_phosphate = AminoAcid("Tyrosine Phosphate", "TYP", "Y", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.NEGATIVE_CHARGE, + size=13, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=4 +) + +tyrosine_sulphate = AminoAcid("Tyrosine Sulphate", "TYS", "Y", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.NEGATIVE_CHARGE, + size=13, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=4 +) + +cyclohexane_alanine = AminoAcid("Cyclohexane Alanine", "CHX", "?", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.APOLAR, + size=7, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) + +unknown_amino_acid = AminoAcid("Unknown", "XXX", "X", + charge=0.0, # unknown, but needs a numerical value + polarity=Polarity.APOLAR, + size=0, + count_hydrogen_bond_donors=0, + count_hydrogen_bond_acceptors=0 +) amino_acids = [alanine, arginine, asparagine, aspartate, cysteine, glutamate, glutamine, glycine, diff --git a/deeprank/features/amino_acid_swap.py b/deeprank/features/amino_acid_swap.py new file mode 100644 index 0000000..36c5e0f --- /dev/null +++ b/deeprank/features/amino_acid_swap.py @@ -0,0 +1,70 @@ +from typing import List, Union + +import h5py +import numpy +from pdb2sql import pdb2sql + +from deeprank.models.environment import Environment +from deeprank.models.variant import PdbVariantSelection +from deeprank.operate.pdb import get_pdb_path + + + +SIZE_DELTA_FEATURE_NAME = "size_delta" +CHARGE_DELTA_FEATURE_NAME = "charge_delta" +POLARITY_DELTA_FEATURE_NAME = "polarity_delta" +HBACCEPTOR_DELTA_FEATURE_NAME = "hb_acceptor_delta" +HBDONOR_DELTA_FEATURE_NAME = "hb_donor_delta" + + +def _store_feature(feature_group_xyz: h5py.Group, + atom_positions: numpy.array, + feature_name: str, + value: Union[float, int, numpy.array]): + """ + atom_positions: [n_atom, 3] x,y,z per atom + value: one float or a vector + """ + + if type(value) == float or type(value) == int: + values = [value] + else: + values = list(value) + + data = [list(position) + values for position in atom_positions] + + feature_group_xyz.create_dataset(feature_name, data=data) + + +def __compute_feature__(environment: Environment, + distance_cutoff: float, + feature_group: h5py.Group, + variant: PdbVariantSelection): + + "computes features that express the difference in physiochemical properties of the swapped amino acid" + + # get the C-alpha position + pdb = pdb2sql(get_pdb_path(environment.pdb_root, variant.pdb_ac)) + if variant.insertion_code is not None: + atom_positions = pdb.get("x,y,z", chainID=variant.chain_id, resSeq=variant.residue_number, iCode=variant.insertion_code) + else: + atom_positions = pdb.get("x,y,z", chainID=variant.chain_id, resSeq=variant.residue_number) + + wildtype_amino_acid = variant.wildtype_amino_acid + variant_amino_acid = variant.variant_amino_acid + + size_delta = variant_amino_acid.size - wildtype_amino_acid.size + _store_feature(feature_group, atom_positions, SIZE_DELTA_FEATURE_NAME, size_delta) + + charge_delta = variant_amino_acid.charge - wildtype_amino_acid.charge + _store_feature(feature_group, atom_positions, CHARGE_DELTA_FEATURE_NAME, charge_delta) + + polarity_delta = variant_amino_acid.polarity.onehot - wildtype_amino_acid.polarity.onehot + _store_feature(feature_group, atom_positions, POLARITY_DELTA_FEATURE_NAME, polarity_delta) + + hb_acceptor_delta = variant_amino_acid.count_hydrogen_bond_acceptors - wildtype_amino_acid.count_hydrogen_bond_acceptors + _store_feature(feature_group, atom_positions, HBACCEPTOR_DELTA_FEATURE_NAME, hb_acceptor_delta) + + hb_donor_delta = variant_amino_acid.count_hydrogen_bond_donors - wildtype_amino_acid.count_hydrogen_bond_donors + _store_feature(feature_group, atom_positions, HBDONOR_DELTA_FEATURE_NAME, hb_donor_delta) + diff --git a/deeprank/features/neighbour_profile.py b/deeprank/features/neighbour_profile.py index 5fa14ce..c18aee8 100644 --- a/deeprank/features/neighbour_profile.py +++ b/deeprank/features/neighbour_profile.py @@ -43,7 +43,7 @@ def get_c_alpha_pos(environment, variant): db = pdb2sql(pdb_path) try: if variant.insertion_code is not None: - position = db.get("x,y,z", chainID=variant.chain_id, resSeq=variant.residue_number, iCode=variant, name="CA")[0] + position = db.get("x,y,z", chainID=variant.chain_id, resSeq=variant.residue_number, iCode=variant.insertion_code, name="CA")[0] else: position = db.get("x,y,z", chainID=variant.chain_id, resSeq=variant.residue_number, name="CA")[0] diff --git a/deeprank/models/amino_acid.py b/deeprank/models/amino_acid.py index d138440..cf0e086 100644 --- a/deeprank/models/amino_acid.py +++ b/deeprank/models/amino_acid.py @@ -1,11 +1,23 @@ +from deeprank.models.polarity import Polarity class AminoAcid: - def __init__(self, name, code, letter): + def __init__(self, name: str, code: str, letter: str, + charge: float, polarity: Polarity, size: int, + count_hydrogen_bond_donors: int, + count_hydrogen_bond_acceptors: int): + self.name = name self.code = code self.letter = letter + # these settings apply to the side chain + self._size = size + self._charge = charge + self._polarity = polarity + self._count_hydrogen_bond_donors = count_hydrogen_bond_donors + self._count_hydrogen_bond_acceptors = count_hydrogen_bond_acceptors + def __repr__(self): return self.name @@ -14,3 +26,24 @@ def __eq__(self, other): def __hash__(self): return hash(self.name) + + @property + def count_hydrogen_bond_donors(self) -> int: + return self._count_hydrogen_bond_donors + + @property + def count_hydrogen_bond_acceptors(self) -> int: + return self._count_hydrogen_bond_acceptors + + @property + def charge(self) -> float: + return self._charge + + @property + def polarity(self) -> Polarity: + return self._polarity + + @property + def size(self) -> int: + return self._size + diff --git a/deeprank/models/polarity.py b/deeprank/models/polarity.py new file mode 100644 index 0000000..505009b --- /dev/null +++ b/deeprank/models/polarity.py @@ -0,0 +1,20 @@ +import numpy + +from enum import Enum + + +class Polarity(Enum): + "a value to express a residue's polarity" + + APOLAR = 0 + POLAR = 1 + NEGATIVE_CHARGE = 2 + POSITIVE_CHARGE = 3 + + @property + def onehot(self): + t = numpy.zeros(4) + t[self.value] = 1.0 + + return t + diff --git a/scripts/preprocess_bioprodict.py b/scripts/preprocess_bioprodict.py index c520fac..7e7821d 100755 --- a/scripts/preprocess_bioprodict.py +++ b/scripts/preprocess_bioprodict.py @@ -337,6 +337,7 @@ def get_subset(variants): device=device) feature_modules = ["deeprank.features.atomic_contacts", + "deeprank.features.amino_acid_swap", "deeprank.features.accessibility"] target_modules = ["deeprank.targets.variant_class"] diff --git a/test/features/test_amino_acid_swap.py b/test/features/test_amino_acid_swap.py new file mode 100644 index 0000000..357a79c --- /dev/null +++ b/test/features/test_amino_acid_swap.py @@ -0,0 +1,40 @@ +from tempfile import mkstemp +import os +import h5py + +from deeprank.features.amino_acid_swap import ( + SIZE_DELTA_FEATURE_NAME, + CHARGE_DELTA_FEATURE_NAME, + POLARITY_DELTA_FEATURE_NAME, + HBACCEPTOR_DELTA_FEATURE_NAME, + HBDONOR_DELTA_FEATURE_NAME, + __compute_feature__ +) +from deeprank.models.environment import Environment +from deeprank.models.variant import PdbVariantSelection +from deeprank.domain.amino_acid import glycine, tryptophan + + +def test_compute_features(): + + environment = Environment(pdb_root="test/data/pdb", device="cpu") + + variant = PdbVariantSelection("101M", "A", 25, glycine, tryptophan) + + tmp_file, tmp_path = mkstemp(suffix=".h5py") + os.close(tmp_file) + + try: + with h5py.File(tmp_path, 'w') as f5: + + xyz_group = f5.require_group('xyz') + + __compute_feature__(environment, 10.0, xyz_group, variant) + + assert SIZE_DELTA_FEATURE_NAME in xyz_group + assert CHARGE_DELTA_FEATURE_NAME in xyz_group + assert POLARITY_DELTA_FEATURE_NAME in xyz_group + assert HBACCEPTOR_DELTA_FEATURE_NAME in xyz_group + assert HBDONOR_DELTA_FEATURE_NAME in xyz_group + finally: + os.remove(tmp_path)