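"""Benchmark DeepChem graph models (GraphConvModel, DTNNModel, MPNNModel),
wrapped as Torina models, on the delaney, lipophilicity, sampl and qm9
datasets over a range of train-set sizes. Grid-estimation results are
written as CSV files to settings.results_dir; runs are parallelized over a
dask.distributed cluster (see the command-line options at the bottom)."""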
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import settings
import deepchem as dc
import numpy as np
import os
from dask import delayed, compute, dataframe
from dask.distributed import Client
from rdkit import Chem
import pandas as pd
from time import time
import sys
sys.path.append(settings.torina_parent_dir)
from Torina.Model.Model import Model
from Torina.Model.utils import grid_estimation
from commons import *
# wrappers around DeepChem models to allow a fair comparison with the other results
class GraphConv(Model):
    """Wrapper around deepchem.models.GraphConvModel to allow for Torina.Model functionalities.
    Hyperparameters are fixed; values are taken from
    https://github.com/wcjordan/schrodingerdeepchem/blob/master/examples/benchmark.py"""

    def __init__(self):
        hps = {
            'batch_size': 128,
            'nb_epoch': 20,
            'learning_rate': 0.0005,
            'n_filters': 128,
            'n_fully_connected_nodes': 256,
            'seed': 123
        }
        self.model = dc.models.GraphConvModel(n_tasks=1,
                                              graph_conv_layers=[hps["n_filters"], hps["n_filters"]],
                                              dense_layer_size=hps["n_fully_connected_nodes"],
                                              batch_size=32,
                                              mode='regression')

    def train(self, x, y):
        dataset = dc.data.DiskDataset.from_numpy(x, y=y, ids=range(len(x)))
        self.model.fit(dataset)

    def predict(self, X):
        return self.model.predict_on_batch(X)
class DeepTensorNN(Model):
    """Wrapper around deepchem.models.DTNNModel to allow for Torina.Model functionalities.
    Model hyperparameters are taken from
    https://github.com/deepchem/deepchem/blob/master/examples/qm9/qm9_DTNN.py"""

    def __init__(self):
        self.model = dc.models.DTNNModel(n_tasks=1,
                                         batch_size=50,
                                         n_embedding=20,
                                         n_distance=51,
                                         distance_min=-1.,
                                         distance_max=9.2,
                                         n_hidden=15,
                                         mode='regression')

    def train(self, x, y):
        dataset = dc.data.DiskDataset.from_numpy(x, y=y, ids=range(len(x)))
        self.model.fit(dataset)

    def predict(self, X):
        return self.model.predict_on_batch(X)
class MessagePassingNN(Model):
    """Wrapper around deepchem.models.MPNNModel to allow for Torina.Model functionalities.
    DeepChem default hyperparameters are used."""

    def __init__(self):
        self.model = dc.models.MPNNModel(n_tasks=1)

    def train(self, x, y):
        dataset = dc.data.DiskDataset.from_numpy(x, y=y, ids=range(len(x)))
        self.model.fit(dataset)

    def predict(self, X):
        return self.model.predict_on_batch(X)
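# Illustrative sketch (not executed here): all three wrappers share the same
# minimal train/predict interface, so they can also be used directly on
# featurized data. `vectorized_inputs` is set by data_prep below; `labels` is
# assumed to be the matching Torina data attribute holding the targets.
#
#     model = GraphConv()
#     model.train(train.vectorized_inputs, train.labels)
#     predictions = model.predict(test.vectorized_inputs)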
def data_prep(target_label, train_size, model, dataset, sample='all'):
    # loaders maps dataset names to loader functions (imported from commons)
    data = loaders[dataset](target_label, normalization_method='z_score', pad_smiles=False)
    if sample != 'all':
        data = data.sample(sample)
    # make a set of rd-molecules for the featurizer, keeping the indices of
    # SMILES strings that RDKit can parse
    mols = []
    idxs = []
    for i, s in enumerate(data.vectorized_inputs):
        mol = Chem.MolFromSmiles(''.join(s), sanitize=True)
        if mol is None:
            continue
        mols.append(mol)
        idxs.append(i)
    # filter data to include only rdkit-readable mols
    data = data.data_from_idxs(idxs)
    # featurize data according to the requested model
    if model == "GC":
        featurizer = dc.feat.ConvMolFeaturizer()
        data.vectorized_inputs = featurizer.featurize(mols)
    elif model == "DTNN":
        featurizer = dc.feat.CoulombMatrix(29)
        data.vectorized_inputs = featurizer.featurize(mols)
        data.remove_entries(['empty_arrays'])
    else:
        # default (e.g. MPNN): graph convolution features
        featurizer = dc.feat.ConvMolFeaturizer()
        data.vectorized_inputs = featurizer.featurize(mols)
    # split into train / validation / test; for large train sets shrink the
    # validation fraction so the groups still fit
    if train_size <= 0.8:
        return data.split_to_groups([train_size, 0.1], add_fill_group=True, random_seed=0)
    else:
        return data.split_to_groups([train_size, 0.05], add_fill_group=True, random_seed=0)
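# Hypothetical call, for illustration only (label names depend on the dataset
# loader in commons):
#     train, val, test = data_prep('solubility', 0.1, 'GC', 'delaney')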
def run_fit(target_label, train_size, model, dataset, sample='all'):
    # find the first unused results-file name for this (label, size, model, dataset) combination
    counter = 0
    fname = lambda x: "{}_{}_{}_{}_{}.csv".format(x, target_label, train_size, model, dataset)
    while os.path.isfile(os.path.join(settings.results_dir, fname(counter))):
        counter += 1
    results_file = os.path.join(settings.results_dir, fname(counter))
    train, val, test = data_prep(target_label, train_size, model, dataset, sample=sample)
    models_dict = {
        "GC": GraphConv,
        "DTNN": DeepTensorNN,
        "MPNN": MessagePassingNN
    }
    # running grid optimization
    additional_descrps = {'model': model, 'train_size': len(train), 'label': target_label, 'count': counter}
    grid_estimation(models_dict[model],
                    train,
                    [("val", val), ("test", test)],
                    estimators=['r_squared', 'rmse', 'mae', 'mare'],
                    additional_descriptors=additional_descrps,
                    write_to=results_file,
                    init_kwargs={})
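# With the naming scheme above, the first run for e.g. label 'gap', train size
# 0.1, model 'GC' on qm9 would write to <settings.results_dir>/0_gap_0.1_GC_qm9.csv
# ('gap' is an illustrative label; the real ones come from settings.qm9_labels).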
def main():
    # running on the delaney, lipophilicity and sampl datasets
    parallel_args_scan(run_fit,
                       [[1], [0.1, 0.5, 0.8], ['DTNN', "GC"], ["delaney", "lipophilicity", "sampl"]],
                       addtional_kwargs={},
                       scheduler='distributed')
    # running on QM9
    parallel_args_scan(run_fit,
                       [settings.qm9_labels, [0.001, 0.01, 0.1], ['DTNN', "GC"], ["qm9"]],
                       addtional_kwargs={},
                       scheduler='distributed')
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="Parser for running files")
    parser.add_argument("-n_workers", type=int, default=1)
    parser.add_argument("-threads_per_worker", type=int, default=1)
    parser.add_argument("-memory_limit", type=str, default="2GB", help="maximum amount of memory per worker, e.g. '4GB'")
    args = parser.parse_args()
    client = Client(n_workers=args.n_workers, threads_per_worker=args.threads_per_worker, memory_limit=args.memory_limit)
    main()
    client.close()
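# Example invocation (assuming settings.py points at valid data and results
# directories):
#     python DeepChemRun.py -n_workers 4 -threads_per_worker 2 -memory_limit 4GB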