import os
import numpy as np
import pandas as pd
from kgcnn.data.base import MemoryGraphDataset
from kgcnn.data.download import DownloadDataset


class MD17RevisedDataset(DownloadDataset, MemoryGraphDataset):
    r"""Store and process trajectories from the :obj:`MD17Revised` dataset.

    The information of the readme file is given below:

    The molecules are taken from the original MD17 dataset by
    `Chmiela et al. <https://www.science.org/doi/10.1126/sciadv.1603015>`__ . For each molecule, 100,000 structures
    were selected, and the energies and forces were recalculated at the PBE/def2-SVP level of theory using very
    tight SCF convergence and a very dense DFT integration grid. As such, the dataset is practically free from
    numerical noise.

    One warning: as the structures are taken from a molecular dynamics simulation (i.e. time-series data), they are
    not guaranteed to be independent samples. This is easily evident from the auto-correlation function of the
    original MD17 dataset.

    In short: DO NOT train a model on more than 1000 samples from this dataset. Data already published with 50K
    samples on the original MD17 dataset should be considered meaningless due to this fact and due to the noise in
    the original data.

    The data:

    The ten molecules are saved in Numpy ``.npz`` format. The keys correspond to (see the loading sketch after
    this list):

    - 'nuclear_charges': The nuclear charges for the molecule.
    - 'coords': The coordinates for each conformation (in units of Angstrom).
    - 'energies': The total energy of each conformation (in units of kcal/mol).
    - 'forces': The cartesian forces of each conformation (in units of kcal/mol/Angstrom).
    - 'old_indices': The index of each conformation in the original MD17 dataset.
    - 'old_energies': The energy of each conformation taken from the original MD17 dataset.
    - 'old_forces': The forces of each conformation taken from the original MD17 dataset.
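
    A minimal sketch of inspecting one of the raw ``.npz`` files directly with Numpy; the local file path below and
    the expected array shapes are illustrative assumptions (``n_atoms`` depends on the molecule):

    .. code-block:: python

        import numpy as np

        data = np.load("rmd17/npz_data/rmd17_ethanol.npz")  # hypothetical local copy of one trajectory
        print(list(data.keys()))        # the keys listed above
        print(data["coords"].shape)     # expected (n_samples, n_atoms, 3), in Angstrom
        print(data["energies"].shape)   # expected (n_samples,), in kcal/mol
        print(data["forces"].shape)     # expected (n_samples, n_atoms, 3), in kcal/mol/Angstrom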

    Note that for Azobenzene only 99,988 samples are available: 11 DFT calculations failed because of a van der
    Waals clash, and the original dataset itself only contained 99,999 structures.

    Data splits:

    Five training and test splits are saved in CSV format, containing the corresponding indices.

    References:

        (1) Anders Christensen, O. Anatole von Lilienfeld, Revised MD17 dataset, Materials Cloud Archive 2020.82
            (2020), doi: 10.24435/materialscloud:wy-kn.
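
    A brief usage sketch for loading a single trajectory through this class; the import path and the indexed
    graph-property access follow common kgcnn conventions and are given here as assumptions for illustration:

    .. code-block:: python

        from kgcnn.data.datasets.MD17RevisedDataset import MD17RevisedDataset

        dataset = MD17RevisedDataset(trajectory_name="ethanol")  # downloads, unpacks and reads into memory
        print(len(dataset))            # number of conformations in the trajectory
        print(dataset[0]["energies"])  # total energy of the first conformation, shape (1,), in kcal/mol
        print(dataset[0]["train"])     # numbers (1-5) of the CSV train splits that contain this sample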
"""

    download_info = {
        "dataset_name": "MD17Revised",
        "data_directory_name": "MD17Revised",
        "download_url": "https://archive.materialscloud.org/record/file?filename=rmd17.tar.bz2&record_id=466",
        "download_file_name": "rmd17.tar.bz2",
        "unpack_tar": True,
        "unpack_zip": False,
        "unpack_directory_name": "rmd17"
    }
    possible_trajectory_names = ["aspirin", "azobenzene", "benzene", "ethanol", "malonaldehyde", "naphthalene",
                                 "paracetamol", "salicylic", "toluene", "uracil"]

    def __init__(self, trajectory_name: str = None, reload=False, verbose=10):
        """Initialize the MD17Revised dataset for a specific trajectory.

        Args:
            trajectory_name (str): Name of the trajectory to load. Must be one of
                :obj:`possible_trajectory_names`.
            reload (bool): Whether to reload the data and make a new dataset. Default is False.
            verbose (int): Print progress or info for processing, where 60 means silent. Default is 10.
        """
        self.data_keys = None
        self.trajectory_name = trajectory_name
        if trajectory_name not in self.possible_trajectory_names:
            raise ValueError(
                "Name for trajectory '%s' not found. Choose: %s." % (trajectory_name, self.possible_trajectory_names))
        MemoryGraphDataset.__init__(self, dataset_name="MD17Revised", verbose=verbose)
        DownloadDataset.__init__(self, **self.download_info, reload=reload, verbose=verbose)
        self.data_directory = os.path.join(self.data_main_dir, self.data_directory_name,
                                           self.unpack_directory_name, "rmd17")
        self.file_name = "rmd17_%s.npz" % self.trajectory_name
        # Append the trajectory name so that each trajectory gets its own dataset name.
        self.dataset_name = self.dataset_name + "_" + self.trajectory_name
        if self.fits_in_memory:
            self.read_in_memory()

    def _get_trajectory_from_npz(self, file_path: str = None):
        """Load the raw ``.npz`` trajectory file for the chosen molecule."""
        if file_path is None:
            file_dir = os.path.join(self.data_directory, "npz_data")
            file_path = os.path.join(file_dir, self.file_name)
        return np.load(file_path)

    def _get_train_test_splits(self):
        """Read the five train and test index splits shipped as CSV files with the dataset."""
        file_dir = os.path.join(self.data_directory, "splits")

        def read_splits(file_name: str) -> list:
            return [
                np.squeeze(pd.read_csv(
                    os.path.join(file_dir, file_name % i), header=None).values, axis=-1) for i in range(1, 6)]

        return read_splits("index_train_0%i.csv"), read_splits("index_test_0%i.csv")

    def read_in_memory(self):
        """Read the dataset trajectory into memory.

        Returns:
            self.
        """
        data = self._get_trajectory_from_npz()
        self.data_keys = list(data.keys())
        # Per-conformation arrays are assigned directly as graph properties.
        for key in ["coords", "forces", "old_indices", "old_energies", "old_forces"]:
            self.assign_property(key, [x for x in data[key]])
        # Energies are scalars per conformation; expand them to shape (1,) for consistency.
        self.assign_property("energies", [np.expand_dims(x, axis=-1) for x in data["energies"]])
        # Nuclear charges are identical for all conformations of one molecule.
        node_number = data["nuclear_charges"]
        self.assign_property("nuclear_charges", [np.array(node_number) for _ in range(len(self))])

        # Add splits to self.
        splits_train, splits_test = self._get_train_test_splits()
        property_train = []
        property_test = []
        for i in range(len(self)):
            is_train = []
            is_test = []
            for j, split in enumerate(splits_train):
                if i in split:
                    is_train.append(j + 1)
            for j, split in enumerate(splits_test):
                if i in split:
                    is_test.append(j + 1)
            # Store the (1-based) split numbers that contain this sample, or None if it is not used.
            if len(is_train) > 0:
                property_train.append(np.array(is_train, dtype="int"))
            else:
                property_train.append(None)
            if len(is_test) > 0:
                property_test.append(np.array(is_test, dtype="int"))
            else:
                property_test.append(None)
        self.assign_property("train", property_train)
        self.assign_property("test", property_test)
        return self
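

# Minimal usage sketch: recover the sample indices that belong to the first CSV train split from the "train"
# property assigned in ``read_in_memory``. The ``obtain_property`` call follows kgcnn's MemoryGraphList API
# (assumed here), and running this requires downloading the dataset archive first.
if __name__ == "__main__":
    dataset = MD17RevisedDataset(trajectory_name="ethanol")
    train_property = dataset.obtain_property("train")
    # Each entry holds the (1-based) numbers of the train splits containing that sample, or None.
    idx_split_1 = [i for i, s in enumerate(train_property) if s is not None and 1 in s]
    print("Number of samples in train split 1:", len(idx_split_1))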