@@ -18,10 +18,10 @@ class AlphaFoldDB(data.ProteinDataset):
18
18
Parameters:
19
19
path (str): path to store the dataset
20
20
species_id (int, optional): the id of species to be loaded. The species are numbered
21
- by the order appeared on https://alphafold.ebi.ac.uk/download (0-20 for model
21
+ by the order in which they appear on https://alphafold.ebi.ac.uk/download (0-20 for model
22
22
organism proteomes, 21 for Swiss-Prot)
23
- split_id (int, optional): the id of split to be loaded. To avoid large memory consumption
24
- for one dataset, we have cut each species into several splits, each of which contains
23
+ split_id (int, optional): the id of the split to be loaded. To avoid large memory consumption
24
+ for one dataset, we have cut each species into several splits, each of which contains
25
25
at most 22000 proteins.
26
26
verbose (int, optional): output verbose level
27
27
**kwargs
@@ -60,46 +60,52 @@ class AlphaFoldDB(data.ProteinDataset):
60
60
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000579_71421_HAEIN_v2.tar" ,
61
61
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000429_85962_HELPY_v2.tar" ,
62
62
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000007841_1125630_KLEPH_v2.tar" ,
63
- "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008153_5671_LEIIN_v2.tar" ,
63
+ # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008153_5671_LEIIN_v2.tar",
64
64
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000078237_100816_9PEZI1_v2.tar" ,
65
65
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000806_272631_MYCLE_v2.tar" ,
66
- "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001584_83332_MYCTU_v2.tar" ,
66
+ # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001584_83332_MYCTU_v2.tar",
67
67
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000020681_1299332_MYCUL_v2.tar" ,
68
68
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000535_242231_NEIG1_v2.tar" ,
69
69
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000006304_1133849_9NOCA1_v2.tar" ,
70
70
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000024404_6282_ONCVO_v2.tar" ,
71
71
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002059_502779_PARBA_v2.tar" ,
72
- "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001450_36329_PLAF7_v2.tar" ,
72
+ # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001450_36329_PLAF7_v2.tar",
73
73
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002438_208964_PSEAE_v2.tar" ,
74
74
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000001014_99287_SALTY_v2.tar" ,
75
75
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008854_6183_SCHMA_v2.tar" ,
76
76
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002716_300267_SHIDS_v2.tar" ,
77
77
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000018087_1391915_SPOS1_v2.tar" ,
78
- "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008816_93061_STAA8_v2.tar" ,
78
+ # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008816_93061_STAA8_v2.tar",
79
79
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000000586_171101_STRR6_v2.tar" ,
80
80
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000035681_6248_STRER_v2.tar" ,
81
81
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000030665_36087_TRITR_v2.tar" ,
82
82
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000008524_185431_TRYB2_v2.tar" ,
83
- "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002296_353153_TRYCC_v2.tar" ,
83
+ # "https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000002296_353153_TRYCC_v2.tar",
84
84
"https://ftp.ebi.ac.uk/pub/databases/alphafold/v2/UP000270924_6293_WUCBA_v2.tar"
85
85
]
86
86
md5s = [
87
- "4cd5f596ebfc3d45d9f6b647dc5684af" , "9e26602ba2d9f233ef4fcf82703ddb59" ,
88
- "60a09db1e1c47a98763d09879784f536" , "a0ab562b7372f149673c4518f949501f" ,
89
- "6205138b14fb7e7ec09b366e3e4f294b" , "31f31359cd7254f82304e3886440bdd3" ,
90
- "a590096e65461ed4eb092b2147b97f0b" , "8f1e120f372995644a7101ad58e5b2ae" ,
91
- "9a659c4aed2a8b833478dcd5fffc5fd8" , "95d775f2ae271cf50a101c73335cd250" ,
92
- "e5b12da43f5bd77298ca50e19706bdeb" , "90e953abba9c8fe202e0adf825c0dfcc" ,
93
- "38a11553c7e2d00482281e74f7daf321" , "2bcdfe2c37154a355fe4e8150c279c13" ,
94
- "580a55e56a44fed935f0101c37a8c4ab" , "b8d08a9033d111429fadb4e25820f9f7" ,
95
- "59d1167f414a86cbccfb204791fea0eb" , "dfde6b44026f19a88f1abc8ac2798ce6" ,
96
- "a1c2047a16130d61cac4db23b2f5b560" , "e4d4b72df8d075aeb607dcb095210304" ,
97
- "5cdad48c799ffd723636cae26433f1f9" , "98a7c13987f578277bfb66ac48a1e242" ,
87
+ "4cd5f596ebfc3d45d9f6b647dc5684af" , "b89bee5507f78f971417cc8fd75b40f7" , "a6459a1f1a0a22fbf25f1c05c2889ae3" ,
88
+ "24dfba8ab93dbf3f51e7db6b912dd6b4" , "6b81b3086ed9e57e04a54f148ecf974c" , "a50f4fd9f581c89e79e1b2857e54b786" ,
89
+ "fdd16245769bf1f7d91a0e285ac00e52" , "66b9750c511182bc5f8ee71fe2ab2a17" , "5dadeb5aac704025cac33f7557794858" ,
90
+ "99b22e0f050d845782d914becbfe4d2f" , "da938dfae4fabf6e144f4b5ede5885ec" , "2003c09d437cfb4093552c588a33e06d" ,
91
+ "fba59f386cfa33af3f70ae664b7feac0" , "d7a1a6c02213754ee1a1ffb3b41ad4ba" , "8a0e8deadffec2aba3b7edd6534b7481" ,
92
+ "1854d0bbcf819de1de7b0cfdb6d32b2e" , "d9720e3809db6916405db096b520c236" , "6b918e9e4d645b12a80468bcea805f1f" ,
93
+ "ed0eefe927eb8c3b81cf87eaabbb8d6e" , "051369e0dc8fed4798c8b2c68e6cbe2e" , "b05ff57164167851651c625dca66ed28" ,
94
+ "68e7a6e57bd43cb52e344b3190073387" , "75d027ac7833f284fda65ea620353e8a" , "7d85bb2ee4130096a6d905ab8d726bcc" ,
95
+ "63498210c88e8bfb1a7346c4ddf73bb1" , "5bf2211304ef91d60bb3838ec12d89cd" , "4981758eb8980e9df970ac6113e4084c" ,
96
+ "322431789942595b599d2b86670f41b3" , "35d7b32e37bcc23d02b12b03b1e0c093" , "1b8847dd786fa41b5b38f5e7aa58b813" ,
97
+ "126bdbe59fa82d55bfa098b710bdf650" , "6c6d3248ed943dd7137637fc92d7ba37" , "532203c6877433df5651b95d27685825" ,
98
+ "6e7112411da5843bec576271c44e0a0a" , "0e4f913a9b4672b0ad3cc9c4f2de5c8d" , "a138d0060b2e8a0ef1f90cf3ab7b7ca0" ,
99
+ "04d491dd1c679e91b5a2f3b9f14db555" , "889c051e39305614accdff00414bfa67" , "cd87cf24e5135c9d729940194ccc65c8" ,
100
+ "75eb8bfe866cf3040f4c08a566c32bc1" , "fd8e6ddb9c159aab781a11c287c85feb" , "b91a2e103980b96f755712f2b559ad66" ,
101
+ "26187d09b093649686d7c158aa4fd113" , "62e16894bb4b8951a82befd24ad4ee21" , "85c001df1d91788bf3cc1f97230b1dac" ,
102
+ "91a25af808351757b101a8c9c787db9e" , "8b3e8645cc4c2484c331759b9d1df5bc" , "e8a76a6ab290e6743233510e8d1eb4a5" ,
103
+ "38280bd7804f4c060b0775c4abed9b89"
98
104
]
99
105
species_nsplit = [
100
106
2 , 1 , 1 , 2 , 1 , 1 , 1 , 3 , 2 , 1 , 1 , 1 , 1 , 2 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 20 ,
101
107
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
102
- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1
108
+ 1 , 1 , 1 , 1 , 1 , # 1, 1, 1, 1, 1
103
109
]
104
110
split_length = 22000
105
111
@@ -111,7 +117,7 @@ def __init__(self, path, species_id=0, split_id=0, verbose=1, **kwargs):
111
117
112
118
species_name = os .path .basename (self .urls [species_id ])[:- 4 ]
113
119
if split_id >= self .species_nsplit [species_id ]:
114
- raise ValueError ("Split id %d should be less than %d in species %s" %
120
+ raise ValueError ("Split id %d should be less than %d in species %s" %
115
121
(split_id , self .species_nsplit [species_id ], species_name ))
116
122
self .processed_file = "%s_%d.pkl.gz" % (species_name , split_id )
117
123
pkl_file = os .path .join (path , self .processed_file )
0 commit comments