Skip to content

Commit 1e41b34

Browse files
committed
Merge branch 'master' of github.com:ppdebreuck/modnet
2 parents fdb1301 + e3c9208 commit 1e41b34

File tree

5 files changed

+272
-27
lines changed

5 files changed

+272
-27
lines changed

modnet/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.3"
1+
__version__ = "0.4.4"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
"""This submodule contains the `Matminer2024FastFeaturizer` class. """
2+
3+
import numpy as np
4+
import modnet.featurizers
5+
import contextlib
6+
7+
8+
class Matminer2024FastFeaturizer(modnet.featurizers.MODFeaturizer):
    """A set of efficient featurizers for features implemented in matminer
    at time of creation (matminer v0.9.2 from 2024).

    Removes featurizers that are known to be slow (i.e., orders of magnitude
    more intensive to compute than the rest of the featurizers).

    """

    def __init__(
        self,
        fast_oxid: bool = True,
        continuous_only: bool = True,
    ):
        """Creates the featurizer and imports all featurizer functions.

        Parameters:
            fast_oxid: Whether to use the accelerated oxidation state parameters within
                pymatgen when constructing features that constrain oxidation states such
                that all sites with the same species in a structure will have the same
                oxidation state (recommended if featurizing any structure
                with large unit cells).
            continuous_only: Whether to keep only the features that are continuous
                with respect to the composition (only for composition featurizers).
                Discontinuous features may lead to discontinuities in the model predictions.

        """

        super().__init__()
        # Forwarded to `clean_df` by every `featurize_*` method below.
        self.drop_allnan = False
        self.fast_oxid = fast_oxid
        self.continuous_only = continuous_only
        self.load_featurizers()

    def load_featurizers(self) -> None:
        """Imports the matminer featurizers and stores the preset instances
        on `composition_featurizers`, `oxid_composition_featurizers`,
        `structure_featurizers` and `site_featurizers`.

        The matminer imports are done lazily, with stdout redirected to
        `None` so that anything printed during import/construction is
        silenced.

        """
        with contextlib.redirect_stdout(None):
            from matminer.featurizers.composition import (
                BandCenter,
                ElementFraction,
                ElementProperty,
                Stoichiometry,
                TMetalFraction,
                ValenceOrbital,
            )
            from matminer.featurizers.structure import (
                DensityFeatures,
                EwaldEnergy,
                GlobalSymmetryFeatures,
                StructuralComplexity,
            )
            from matminer.utils.data import (
                DemlData,
                PymatgenData,
            )

            pymatgen_features = [
                "block",
                "mendeleev_no",
                "electrical_resistivity",
                "velocity_of_sound",
                "thermal_conductivity",
                "bulk_modulus",
                "coefficient_of_linear_thermal_expansion",
            ]

            deml_features = [
                "atom_radius",
                "molar_vol",
                "heat_fusion",
                "boiling_point",
                "heat_cap",
                "first_ioniz",
                "electric_pol",
                "GGAU_Etot",
                "mus_fere",
                "FERE correction",
            ]

            # Start from the "magpie" preset but restrict the statistics to
            # the same two that are used for the other element-property sets.
            magpie_featurizer = ElementProperty.from_preset("magpie")
            magpie_featurizer.stats = ["mean", "avg_dev"]

            pymatgen_featurizer = ElementProperty(
                data_source=PymatgenData(),
                stats=["mean", "avg_dev"],
                features=pymatgen_features,
            )

            deml_featurizer = ElementProperty(
                data_source=DemlData(),
                stats=["mean", "avg_dev"],
                features=deml_features,
            )

            self.composition_featurizers = (
                BandCenter(),
                ElementFraction(),
                magpie_featurizer,
                pymatgen_featurizer,
                deml_featurizer,
                Stoichiometry(p_list=[2, 3, 5, 7, 10]),
                TMetalFraction(),
                ValenceOrbital(props=["frac"]),
            )

            # No oxidation-state-dependent composition featurizers in this preset.
            self.oxid_composition_featurizers = []

            self.structure_featurizers = (
                DensityFeatures(),
                EwaldEnergy(),
                GlobalSymmetryFeatures(),
                StructuralComplexity(),
            )

            # No site featurizers in this preset.
            self.site_featurizers = []

    @staticmethod
    def _centrosymmetric_to_int(x) -> int:
        """Encodes a centrosymmetry flag as 0/1, mapping missing values to 0.

        NaN never compares equal to anything (including itself), so it must
        be detected explicitly; otherwise its truthiness would encode a
        missing value as 1.

        """
        if x is None:
            return 0
        if isinstance(x, (float, np.floating)) and np.isnan(x):
            return 0
        return 1 if x else 0

    def featurize_composition(self, df):
        """Applies the preset composition featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Parameters:
            df: The dataframe handed to the parent class'
                `featurize_composition`.

        Returns:
            The featurized dataframe after `modnet.featurizers.clean_df`.

        """
        from pymatgen.core.periodic_table import Element

        df = super().featurize_composition(df)

        if self.composition_featurizers and not self.continuous_only:
            _orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}

            # `AtomicOrbitals` is not among the featurizers loaded by this
            # preset, so only encode these columns when they actually exist
            # (e.g. if the featurizer list was customised); indexing them
            # unconditionally raises a KeyError.
            for column in (
                "AtomicOrbitals|HOMO_character",
                "AtomicOrbitals|LUMO_character",
            ):
                if column in df.columns:
                    df[column] = df[column].map(_orbitals)

            for column in (
                "AtomicOrbitals|HOMO_element",
                "AtomicOrbitals|LUMO_element",
            ):
                if column in df.columns:
                    # Element symbol -> atomic number; non-string entries -> -1.
                    df[column] = df[column].apply(
                        lambda x: -1 if not isinstance(x, str) else Element(x).Z
                    )

        if self.continuous_only:
            # These are additional features that have shown discontinuities in my tests.
            # Hopefully, I got them all...
            df.drop(
                columns=[
                    "ElementProperty|DemlData mean electric_pol",
                    "ElementProperty|DemlData mean FERE correction",
                    "ElementProperty|DemlData mean GGAU_Etot",
                    "ElementProperty|DemlData mean heat_fusion",
                    "ElementProperty|DemlData mean mus_fere",
                ],
                inplace=True,
                errors="ignore",
            )

        if self.oxid_composition_featurizers:
            df.drop(columns=["IonProperty|max ionic char"], inplace=True)

        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)

    def featurize_structure(self, df):
        """Applies the preset structural featurizers to the input dataframe,
        encodes the categorical symmetry features as integers and cleans
        the output dataframe.

        Parameters:
            df: The dataframe handed to the parent class'
                `featurize_structure`.

        Returns:
            The featurized dataframe after `modnet.featurizers.clean_df`.

        """

        if self.structure_featurizers:
            df = super().featurize_structure(df)

            _crystal_system = {
                "cubic": 1,
                "tetragonal": 2,
                # pymatgen spells this crystal system "orthorhombic"; the
                # misspelled key is kept so that any data using the old
                # spelling is still mapped to the same integer.
                "orthorhombic": 3,
                "orthorombic": 3,
                "hexagonal": 4,
                "trigonal": 5,
                "monoclinic": 6,
                "triclinic": 7,
            }

            crystal_system_col = "GlobalSymmetryFeatures|crystal_system"
            centrosymmetric_col = "GlobalSymmetryFeatures|is_centrosymmetric"

            if crystal_system_col in df.columns:
                df[crystal_system_col] = df[crystal_system_col].map(_crystal_system)
            if centrosymmetric_col in df.columns:
                df[centrosymmetric_col] = df[centrosymmetric_col].map(
                    self._centrosymmetric_to_int
                )

        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)

    def featurize_site(self, df):
        """Applies the preset site featurizers to the input dataframe,
        renames some fields and cleans the output dataframe.

        Parameters:
            df: The dataframe handed to the parent class' `featurize_site`.

        Returns:
            The featurized dataframe, without the columns that contain
            only zeros, after `modnet.featurizers.clean_df`.

        """

        # rename some features for backwards compatibility with pretrained models
        aliases = {
            "GeneralizedRadialDistributionFunction": "GeneralizedRDF",
            "AGNIFingerprints": "AGNIFingerPrint",
            "BondOrientationalParameter": "BondOrientationParameter",
        }
        df = super().featurize_site(df, aliases=aliases)
        # Keep only the columns that have at least one non-zero entry.
        df = df.loc[:, (df != 0).any(axis=0)]

        return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)

modnet/models/ensemble.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,11 @@ def fit(
144144
pool.join()
145145

146146
def predict(
147-
self, test_data: MODData, return_unc=False, return_prob=False
147+
self,
148+
test_data: MODData,
149+
return_unc: bool = False,
150+
return_prob: bool = False,
151+
remap_out_of_bounds: bool = True,
148152
) -> pd.DataFrame:
149153
"""Predict the target values for the passed MODData.
150154
@@ -154,6 +158,7 @@ def predict(
154158
return_prob: For a classification task only: whether to return the probability of each
155159
class OR only return the most probable class.
156160
return_unc: whether to return a second dataframe containing the uncertainties
161+
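remap_out_of_bounds: whether to replace regression predictions lying more than 25% of the training target range outside that range with values drawn uniformly at random from within it.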
157162
158163
Returns:
159164
A `pandas.DataFrame` containing the predicted values of the targets.
@@ -163,7 +168,11 @@ class OR only return the most probable class.
163168

164169
all_predictions = []
165170
for i in range(self.n_models):
166-
p = self.models[i].predict(test_data, return_prob=return_prob)
171+
p = self.models[i].predict(
172+
test_data,
173+
return_prob=return_prob,
174+
remap_out_of_bounds=remap_out_of_bounds,
175+
)
167176
all_predictions.append(p.values)
168177

169178
p_mean = np.array(all_predictions).mean(axis=0)

modnet/models/vanilla.py

+23-15
Original file line numberDiff line numberDiff line change
@@ -693,14 +693,20 @@ def fit_preset(
693693

694694
return models, val_losses, best_learning_curve, learning_curves, best_preset
695695

696-
def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
696+
def predict(
697+
self,
698+
test_data: MODData,
699+
return_prob: bool = False,
700+
remap_out_of_bounds: bool = True,
701+
) -> pd.DataFrame:
697702
"""Predict the target values for the passed MODData.
698703
699704
Parameters:
700705
test_data: A featurized and feature-selected `MODData`
701706
object containing the descriptors used in training.
702707
return_prob: For classification tasks only: whether to return the probability of each
703708
class OR only return the most probable class.
709+
remap_out_of_bounds: Whether to replace regression predictions lying more than 25% of the training target range outside that range with values drawn uniformly at random from within it.
704710
705711
Returns:
706712
A `pandas.DataFrame` containing the predicted values of the targets.
@@ -724,20 +730,22 @@ class OR only return the most probable class.
724730
p = [p]
725731

726732
# post-process based on training data
727-
if max(self.num_classes.values()) <= 2: # regression
728-
for i, vals in enumerate(p):
729-
yrange = self.max_y[i] - self.min_y[i]
730-
upper_bound = self.max_y[i] + 0.25 * yrange
731-
lower_bound = self.min_y[i] - 0.25 * yrange
732-
for j in range(len(self.targets_groups[i])):
733-
out_of_range_idxs = np.where(
734-
(vals[:, j] < lower_bound[j]) | (vals[:, j] > upper_bound[j])
735-
)
736-
vals[out_of_range_idxs, j] = (
737-
np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
738-
* (yrange[j])
739-
+ self.min_y[i][j]
740-
)
733+
if remap_out_of_bounds:
734+
if max(self.num_classes.values()) <= 2: # regression
735+
for i, vals in enumerate(p):
736+
yrange = self.max_y[i] - self.min_y[i]
737+
upper_bound = self.max_y[i] + 0.25 * yrange
738+
lower_bound = self.min_y[i] - 0.25 * yrange
739+
for j in range(len(self.targets_groups[i])):
740+
out_of_range_idxs = np.where(
741+
(vals[:, j] < lower_bound[j])
742+
| (vals[:, j] > upper_bound[j])
743+
)
744+
vals[out_of_range_idxs, j] = (
745+
np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
746+
* (yrange[j])
747+
+ self.min_y[i][j]
748+
)
741749

742750
p_dic = {}
743751

modnet/preprocessing.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -664,8 +664,12 @@ def __init__(
664664
LOG.info(f"Loaded {self.featurizer.__class__.__name__} featurizer.")
665665

666666
if target_names is not None:
667+
if isinstance(target_names, str):
668+
target_names = [target_names]
667669
if np.shape(targets)[-1] != len(target_names):
668-
raise ValueError("Target names must be supplied for every target.")
670+
raise ValueError(
671+
f"Target names must be supplied for every target: {np.shape(targets)} vs {target_names=}"
672+
)
669673
elif targets is not None:
670674
if len(np.shape(targets)) == 1:
671675
target_names = ["prop0"]
@@ -681,16 +685,20 @@ def __init__(
681685
"List of IDs (`structure_ids`) provided must be unique."
682686
)
683687

684-
if len(structure_ids) != len(materials):
685-
raise ValueError(
686-
"List of IDs (`structure_ids`) must have same length as list of structure."
687-
)
688+
if materials is not None:
689+
if len(structure_ids) != len(materials):
690+
raise ValueError(
691+
"List of IDs (`structure_ids`) must have same length as list of structure."
692+
)
688693

689694
else:
690-
num_entries = (
691-
len(materials) if materials is not None else len(df_featurized)
692-
)
693-
structure_ids = [f"id{i}" for i in range(num_entries)]
695+
if df_featurized is not None:
696+
structure_ids = df_featurized.index
697+
else:
698+
num_entries = (
699+
len(materials) if materials is not None else len(df_featurized)
700+
)
701+
structure_ids = [f"id{i}" for i in range(num_entries)]
694702

695703
if targets is not None:
696704
# set up dataframe for targets with columns (id, property_1, ..., property_n)
@@ -816,6 +824,7 @@ def feature_selection(
816824
n_jobs: max. number of processes to use when calculating cross NMI.
817825
ignore_names (List): Optional list of property names to ignore during feature selection.
818826
Feature selection will be performed w.r.t. all properties except the ones in ignore_names.
827+
random_state (int): Seed used to compute the NMI.
819828
820829
"""
821830
if getattr(self, "df_featurized", None) is None:

0 commit comments

Comments
 (0)