update md5s for alphafolddb and handle unknown residue types in esm

KiddoZhu · KiddoZhu · commit c8155f40485c · 2023-07-16T15:41:44.000-04:00
diff --git a/conda/torchdrug/meta.yaml b/conda/torchdrug/meta.yaml
@@ -7,10 +7,10 @@ source:
 
 requirements:
   host:
-    - python >=3.7,<3.10
+    - python >=3.7,<3.11
     - pip
   run:
-    - python >=3.7,<3.10
+    - python >=3.7,<3.11
     - pytorch >=1.8.0
     - pytorch-scatter >=2.0.8
     - pytorch-cluster >=1.5.9
diff --git a/setup.py b/setup.py
@@ -43,7 +43,7 @@
             "lmdb",
             "fair-esm",
         ],
-        python_requires=">=3.7,<3.10",
+        python_requires=">=3.7,<3.11",
         classifiers=[
             "Development Status :: 4 - Beta",
             'Intended Audience :: Developers',
diff --git a/torchdrug/data/feature.py b/torchdrug/data/feature.py
@@ -50,6 +50,7 @@ def onehot(x, vocab, allow_unknown=False):
     return feature
 
 
+# TODO: atom_default is too slow
 @R.register("features.atom.default")
 def atom_default(atom):
     """Default atom feature.
@@ -331,6 +332,7 @@ def molecule_default(mol):
     """Default molecule feature."""
     return ExtendedConnectivityFingerprint(mol)
 
+
 ECFP = ExtendedConnectivityFingerprint
 
 
diff --git a/torchdrug/data/protein.py b/torchdrug/data/protein.py
@@ -295,7 +295,7 @@ def from_sequence(cls, sequence, atom_feature="default", bond_feature="default",
         """
         if atom_feature is None and bond_feature is None and residue_feature == "default":
             return cls._residue_from_sequence(sequence)
-        
+
         mol = Chem.MolFromSequence(sequence)
         if mol is None:
             raise ValueError("Invalid sequence `%s`" % sequence)
diff --git a/torchdrug/datasets/alphafolddb.py b/torchdrug/datasets/alphafolddb.py
@@ -18,10 +18,10 @@ class AlphaFoldDB(data.ProteinDataset):
     Parameters:
         path (str): path to store the dataset
         species_id (int, optional): the id of species to be loaded. The species are numbered
-            by the order appeared on https://alphafold.ebi.ac.uk/download (0-20 for model 
+            in the order they appear on https://alphafold.ebi.ac.uk/download (0-20 for model
             organism proteomes, 21 for Swiss-Prot)
-        split_id (int, optional): the id of split to be loaded. To avoid large memory consumption 
-            for one dataset, we have cut each species into several splits, each of which contains 
+        split_id (int, optional): the id of split to be loaded. To avoid large memory consumption
+            for one dataset, we have cut each species into several splits, each of which contains
             at most 22000 proteins.
         verbose (int, optional): output verbose level
         **kwargs
@@ -60,46 +60,52 @@ class AlphaFoldDB(data.ProteinDataset):
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000579_71421_HAEIN_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000429_85962_HELPY_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000007841_1125630_KLEPH_v2.tar",
-        "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008153_5671_LEIIN_v2.tar",
+        # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008153_5671_LEIIN_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000078237_100816_9PEZI1_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000806_272631_MYCLE_v2.tar",
-        "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001584_83332_MYCTU_v2.tar",
+        # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001584_83332_MYCTU_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000020681_1299332_MYCUL_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000535_242231_NEIG1_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000006304_1133849_9NOCA1_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000024404_6282_ONCVO_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002059_502779_PARBA_v2.tar",
-        "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001450_36329_PLAF7_v2.tar",
+        # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001450_36329_PLAF7_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002438_208964_PSEAE_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001014_99287_SALTY_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008854_6183_SCHMA_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002716_300267_SHIDS_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000018087_1391915_SPOS1_v2.tar",
-        "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008816_93061_STAA8_v2.tar",
+        # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008816_93061_STAA8_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000586_171101_STRR6_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000035681_6248_STRER_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000030665_36087_TRITR_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008524_185431_TRYB2_v2.tar",
-        "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002296_353153_TRYCC_v2.tar",
+        # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002296_353153_TRYCC_v2.tar",
         "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000270924_6293_WUCBA_v2.tar"
     ]
     md5s = [
-        "4cd5f596ebfc3d45d9f6b647dc5684af", "9e26602ba2d9f233ef4fcf82703ddb59",
-        "60a09db1e1c47a98763d09879784f536", "a0ab562b7372f149673c4518f949501f", 
-        "6205138b14fb7e7ec09b366e3e4f294b", "31f31359cd7254f82304e3886440bdd3", 
-        "a590096e65461ed4eb092b2147b97f0b", "8f1e120f372995644a7101ad58e5b2ae", 
-        "9a659c4aed2a8b833478dcd5fffc5fd8", "95d775f2ae271cf50a101c73335cd250", 
-        "e5b12da43f5bd77298ca50e19706bdeb", "90e953abba9c8fe202e0adf825c0dfcc", 
-        "38a11553c7e2d00482281e74f7daf321", "2bcdfe2c37154a355fe4e8150c279c13", 
-        "580a55e56a44fed935f0101c37a8c4ab", "b8d08a9033d111429fadb4e25820f9f7", 
-        "59d1167f414a86cbccfb204791fea0eb", "dfde6b44026f19a88f1abc8ac2798ce6", 
-        "a1c2047a16130d61cac4db23b2f5b560", "e4d4b72df8d075aeb607dcb095210304", 
-        "5cdad48c799ffd723636cae26433f1f9", "98a7c13987f578277bfb66ac48a1e242", 
+        "4cd5f596ebfc3d45d9f6b647dc5684af", "b89bee5507f78f971417cc8fd75b40f7", "a6459a1f1a0a22fbf25f1c05c2889ae3",
+        "24dfba8ab93dbf3f51e7db6b912dd6b4", "6b81b3086ed9e57e04a54f148ecf974c", "a50f4fd9f581c89e79e1b2857e54b786",
+        "fdd16245769bf1f7d91a0e285ac00e52", "66b9750c511182bc5f8ee71fe2ab2a17", "5dadeb5aac704025cac33f7557794858",
+        "99b22e0f050d845782d914becbfe4d2f", "da938dfae4fabf6e144f4b5ede5885ec", "2003c09d437cfb4093552c588a33e06d",
+        "fba59f386cfa33af3f70ae664b7feac0", "d7a1a6c02213754ee1a1ffb3b41ad4ba", "8a0e8deadffec2aba3b7edd6534b7481",
+        "1854d0bbcf819de1de7b0cfdb6d32b2e", "d9720e3809db6916405db096b520c236", "6b918e9e4d645b12a80468bcea805f1f",
+        "ed0eefe927eb8c3b81cf87eaabbb8d6e", "051369e0dc8fed4798c8b2c68e6cbe2e", "b05ff57164167851651c625dca66ed28",
+        "68e7a6e57bd43cb52e344b3190073387", "75d027ac7833f284fda65ea620353e8a", "7d85bb2ee4130096a6d905ab8d726bcc",
+        "63498210c88e8bfb1a7346c4ddf73bb1", "5bf2211304ef91d60bb3838ec12d89cd", "4981758eb8980e9df970ac6113e4084c",
+        "322431789942595b599d2b86670f41b3", "35d7b32e37bcc23d02b12b03b1e0c093", "1b8847dd786fa41b5b38f5e7aa58b813",
+        "126bdbe59fa82d55bfa098b710bdf650", "6c6d3248ed943dd7137637fc92d7ba37", "532203c6877433df5651b95d27685825",
+        "6e7112411da5843bec576271c44e0a0a", "0e4f913a9b4672b0ad3cc9c4f2de5c8d", "a138d0060b2e8a0ef1f90cf3ab7b7ca0",
+        "04d491dd1c679e91b5a2f3b9f14db555", "889c051e39305614accdff00414bfa67", "cd87cf24e5135c9d729940194ccc65c8",
+        "75eb8bfe866cf3040f4c08a566c32bc1", "fd8e6ddb9c159aab781a11c287c85feb", "b91a2e103980b96f755712f2b559ad66",
+        "26187d09b093649686d7c158aa4fd113", "62e16894bb4b8951a82befd24ad4ee21", "85c001df1d91788bf3cc1f97230b1dac",
+        "91a25af808351757b101a8c9c787db9e", "8b3e8645cc4c2484c331759b9d1df5bc", "e8a76a6ab290e6743233510e8d1eb4a5",
+        "38280bd7804f4c060b0775c4abed9b89"
     ]
     species_nsplit = [
         2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 20,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+        1, 1, 1, 1, 1, #1, 1, 1, 1, 1
     ]
     split_length = 22000
 
@@ -111,7 +117,7 @@ def __init__(self, path, species_id=0, split_id=0, verbose=1, **kwargs):
 
         species_name = os.path.basename(self.urls[species_id])[:-4]
         if split_id >= self.species_nsplit[species_id]:
-            raise ValueError("Split id %d should be less than %d in species %s" % 
+            raise ValueError("Split id %d should be less than %d in species %s" %
                             (split_id, self.species_nsplit[species_id], species_name))
         self.processed_file = "%s_%d.pkl.gz" % (species_name, split_id)
         pkl_file = os.path.join(path, self.processed_file)
diff --git a/torchdrug/datasets/gene_ontology.py b/torchdrug/datasets/gene_ontology.py
@@ -13,8 +13,8 @@
 @utils.copy_args(data.ProteinDataset.load_pdbs)
 class GeneOntology(data.ProteinDataset):
     """
-    A set of proteins with their 3D structures and GO terms. These terms classify proteins 
-    into hierarchically related functional classes organized into three ontologies: molecular 
+    A set of proteins with their 3D structures and GO terms. These terms classify proteins
+    into hierarchically related functional classes organized into three ontologies: molecular
     function (MF), biological process (BP) and cellular component (CC).
 
     Statistics (test_cutoff=0.95):
@@ -51,7 +51,7 @@ def __init__(self, path, branch="MF", test_cutoff=0.95, verbose=1, **kwargs):
         zip_file = utils.download(self.url, path, md5=self.md5)
         path = os.path.join(utils.extract(zip_file), "GeneOntology")
         pkl_file = os.path.join(path, self.processed_file)
-        
+
         csv_file = os.path.join(path, "nrPDB-GO_test.csv")
         pdb_ids = []
         with open(csv_file, "r") as fin:
diff --git a/torchdrug/layers/common.py b/torchdrug/layers/common.py
@@ -329,10 +329,10 @@ class SinusoidalPositionEmbedding(nn.Module):
     Positional embedding based on sine and cosine functions, proposed in `Attention Is All You Need`_.
 
     .. _Attention Is All You Need:
-        https://arxiv.org/pdf/1706.03762.pdf
+       https://arxiv.org/pdf/1706.03762.pdf
 
     Parameters:
-        output_dim (int): output dimension
+       output_dim (int): output dimension
     """
 
     def __init__(self, output_dim):
diff --git a/torchdrug/layers/conv.py b/torchdrug/layers/conv.py
@@ -810,4 +810,4 @@ def message_and_aggregate(self, graph, input):
                                       dim_size=graph.num_node * graph.num_relation)
             update += edge_update
 
-        return update.view(graph.num_node, self.num_relation * self.input_dim)
+        return update.view(graph.num_node, self.num_relation * self.input_dim)
diff --git a/torchdrug/models/esm.py b/torchdrug/models/esm.py
@@ -71,7 +71,7 @@ class EvolutionaryScaleModeling(nn.Module, core.Configurable):
         "ESM-2-3B": 36,
         "ESM-2-15B": 48,
     }
-    
+
     max_input_length = 1024 - 2
 
     def __init__(self, path, model="ESM-1b", readout="mean"):
@@ -82,6 +82,7 @@ def __init__(self, path, model="ESM-1b", readout="mean"):
         self.path = path
 
         _model, alphabet = self.load_weight(path, model)
+        self.alphabet = alphabet
         mapping = self.construct_mapping(alphabet)
         self.output_dim = self.output_dim[model]
         self.model = _model
@@ -111,7 +112,7 @@ def load_weight(self, path, model):
         return esm.pretrained.load_model_and_alphabet_core(model_name, model_data, regression_data)
 
     def construct_mapping(self, alphabet):
-        mapping = [0] * len(data.Protein.id2residue_symbol)
+        mapping = [-1] * max(len(data.Protein.id2residue_symbol), len(self.alphabet))
         for i, token in data.Protein.id2residue_symbol.items():
             mapping[i] = alphabet.get_idx(token)
         mapping = torch.tensor(mapping)
@@ -133,6 +134,7 @@ def forward(self, graph, input, all_loss=None, metric=None):
         """
         input = graph.residue_type
         input = self.mapping[input]
+        input[input == -1] = graph.residue_type[input == -1]
         size = graph.num_residues
         if (size > self.max_input_length).any():
             warnings.warn("ESM can only encode proteins within %d residues. Truncate the input to fit into ESM."