From 922b2b7924085d91410ecfdc86e950373477a4d6 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Mon, 27 Jun 2022 12:04:23 +0200 Subject: [PATCH 1/8] Upgraded pymatgen and matminer requirements --- README.md | 6 ------ modnet/featurizers/featurizers.py | 8 ++++---- modnet/preprocessing.py | 10 +++++----- setup.py | 8 ++++---- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d4ffd405..72a4b761 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,6 @@ activate the environment: conda activate modnet ``` -Then, install pymatgen v2020.8.13 with conda, which will bundle several pre-built dependencies (e.g., numpy, scipy): - -```shell -conda install -c conda-forge pymatgen=2020.8.13 -``` - Finally, install MODNet from PyPI with pip: ```bash diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 0835668c..0fd3ec77 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -70,7 +70,7 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: The featurized DataFrame. @@ -137,7 +137,7 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame, or an empty @@ -184,7 +184,7 @@ def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame. 
@@ -206,7 +206,7 @@ def featurize_site( Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. aliases: optional dictionary to map matminer output column names to new aliases, mostly used for backwards-compatibility. diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 8cf3bed5..7b888eee 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -13,7 +13,7 @@ from typing import Dict, List, Union, Optional, Callable, Hashable, Iterable, Tuple from functools import partial -from pymatgen import Structure, Composition +from pymatgen.core import Structure, Composition from sklearn.feature_selection import mutual_info_regression, mutual_info_classif from sklearn.utils import resample @@ -539,14 +539,14 @@ def merge_ranked(lists: List[List[Hashable]]) -> List[Hashable]: class MODData: -    """The MODData class takes takes a list of `pymatgen.Structure` +    """The MODData class takes a list of `pymatgen.core.structure.Structure` objects and creates a `pandas.DataFrame` that contains many matminer features per structure. It then uses mutual information between features and targets, and between the features themselves, to perform feature selection using relevance-redundancy indices. Attributes: -    df_structure (pd.DataFrame): dataframe storing the `pymatgen.Structure` +    df_structure (pd.DataFrame): dataframe storing the `pymatgen.core.structure.Structure` representations for each structured, indexed by ID. df_targets (pd.Dataframe): dataframe storing the prediction targets per structure, indexed by ID. 
@@ -906,12 +906,12 @@ def rebalance(self): @property def structures(self) -> List[Union[Structure, CompositionContainer]]: -        """Returns the list of `pymatgen.Structure` objects.""" +        """Returns the list of `pymatgen.core.structure.Structure` objects.""" return list(self.df_structure["structure"]) @property def compositions(self) -> List[Union[Structure, CompositionContainer]]: -        """Returns the list of materials as`pymatgen.Composition` objects.""" +        """Returns the list of materials as `pymatgen.core.composition.Composition` objects.""" return [s.composition for s in self.df_structure["structure"]] @property diff --git a/setup.py b/setup.py index 45d311d3..bb05700a 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,10 @@ "pandas>=0.25.3", "tensorflow>=2.4", "tensorflow-probability>=0.12", - "pymatgen>=2020,<2020.9", - "matminer>=0.6.2", - "numpy>=1.18.3", - "scikit-learn>=0.23,<0.24", + "pymatgen>=2022.5.17", + "matminer>=0.7.6", + "numpy>=1.22.3", + "scikit-learn>=1.1.0", ], tests_require=tests_require, test_suite="modnet.tests", From 62c482571b9a1ba0d6f3a825f095204475f20153 Mon Sep 17 00:00:00 2001 From: ppdebreuck Date: Tue, 11 Jul 2023 16:57:26 +0200 Subject: [PATCH 2/8] backward compatibility warning --- modnet/models/vanilla.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index 77119444..bb145796 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -846,6 +846,11 @@ def _restore_model(self): fill_value=-1, ).fit(np.zeros((1, self.n_feat))), ) + if not hasattr(self, "targets_groups"): + self.targets_groups = [x for subl in self.targets for x in subl] + LOG.warning( + "Installed modnet version (v>=0.4.0) does not match loaded model (v<0.4.0) and may result in errors. Please retrain or change your modnet version !" 
+ ) def save(self, filename: str) -> None: """Save the `MODNetModel` to filename: From a408f0ac02120fc63b9f1987d618deb2583958f6 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 14 Jul 2023 09:04:03 +0200 Subject: [PATCH 3/8] Possibility to remove all NaNs features or not after featurization. --- modnet/featurizers/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modnet/featurizers/utils.py b/modnet/featurizers/utils.py index 2297b6ef..3d54863e 100644 --- a/modnet/featurizers/utils.py +++ b/modnet/featurizers/utils.py @@ -3,12 +3,13 @@ __all__ = ("clean_df",) -def clean_df(df): +def clean_df(df, drop_allnan: bool = True): """Cleans dataframe by dropping missing values, replacing NaN's and infinities and selecting only columns containing numerical data. Args: df (pd.DataFrame): the dataframe to clean. + drop_allnan: if True, clean_df will remove features that are fully NaNs. Returns: pandas.DataFrame: the cleaned dataframe. @@ -16,7 +17,8 @@ def clean_df(df): """ df = df.select_dtypes(include="number") - df = df.dropna(axis=1, how="all") + if drop_allnan: + df = df.dropna(axis=1, how="all") df = df.replace([np.inf, -np.inf, np.nan], np.nan) return df From b9700c721e303f09ae455109ddc86d413d924cd6 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 14 Jul 2023 09:09:48 +0200 Subject: [PATCH 4/8] Arg in featurize. 
--- modnet/preprocessing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index d8f5e2e7..f34914ba 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -706,7 +706,9 @@ def __init__( self.df_structure = pd.DataFrame({"id": structure_ids, "structure": materials}) self.df_structure.set_index("id", inplace=True) - def featurize(self, fast: bool = False, db_file=None, n_jobs=None): + def featurize( + self, fast: bool = False, db_file=None, n_jobs=None, drop_allnan: bool = True + ): """For the input structures, construct many matminer features and save a featurized dataframe. If `db_file` is specified, this method will try to load previous feature calculations for each @@ -720,6 +722,7 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None): Note : The database will be downloaded in this case, and takes around 2GB of space on your drive ! db_file: Deprecated. Do Not use this anymore. + drop_allnan: if True, features that are fully NaNs will be removed. """ @@ -778,7 +781,7 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None): df_final = self.featurizer.featurize(self.df_structure) # replace infinite values by nan that are handled during the fit - df_final = clean_df(df_final) + df_final = clean_df(df_final, drop_allnan=drop_allnan) self.df_featurized = df_final LOG.info("Data has successfully been featurized!") From 7f41fd795c3a42edc737bcbcffb4e166050a79d9 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 14 Jul 2023 09:36:24 +0200 Subject: [PATCH 5/8] Arg in preset because there are clean_df there as well. 
--- .../featurizers/presets/matminer_all_2023.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py index 2e2a4e4b..3e26d60f 100644 --- a/modnet/featurizers/presets/matminer_all_2023.py +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -16,7 +16,12 @@ class MatminerAll2023Featurizer(modnet.featurizers.MODFeaturizer): """ - def __init__(self, fast_oxid: bool = False, continuous_only: bool = False): + def __init__( + self, + fast_oxid: bool = False, + continuous_only: bool = False, + drop_allnan: bool = True, + ): """Creates the featurizer and imports all featurizer functions. Parameters: @@ -28,12 +33,14 @@ def __init__(self, fast_oxid: bool = False, continuous_only: bool = False): continuous_only: Whether to keep only the features that are continuous with respect to the composition (only for composition featurizers). Discontinuous features may lead to discontinuities in the model predictions. + drop_allnan: if True, features that are fully NaNs will be removed. 
""" super().__init__() self.fast_oxid = fast_oxid self.continuous_only = continuous_only + self.drop_allnan = drop_allnan self.load_featurizers() def load_featurizers(self): @@ -323,7 +330,7 @@ def featurize_composition(self, df): if self.oxid_composition_featurizers: df.drop(columns=["IonProperty|max ionic char"], inplace=True) - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) def featurize_structure(self, df): """Applies the preset structural featurizers to the input dataframe, @@ -359,7 +366,7 @@ def _int_map(x): "GlobalSymmetryFeatures|is_centrosymmetric" ].map(_int_map) - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) def featurize_site(self, df): """Applies the preset site featurizers to the input dataframe, @@ -376,7 +383,7 @@ def featurize_site(self, df): df = super().featurize_site(df, aliases=aliases) df = df.loc[:, (df != 0).any(axis=0)] - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) class CompositionOnlyMatminerAll2023Featurizer(MatminerAll2023Featurizer): @@ -391,9 +398,14 @@ def __init__( self, continuous_only: bool = False, oxidation_featurizers: bool = False, + drop_allnan: bool = True, fast_oxid: bool = False, ): - super().__init__(fast_oxid=fast_oxid, continuous_only=continuous_only) + super().__init__( + fast_oxid=fast_oxid, + continuous_only=continuous_only, + drop_allnan=drop_allnan, + ) self.fast_oxid = fast_oxid self.structure_featurizers = () self.site_featurizers = () From 5874ef9e71a2b750e96f194901c1b2d2c2afca0f Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 14 Jul 2023 12:15:15 +0200 Subject: [PATCH 6/8] Easier setting of drop_allnan. 
--- modnet/featurizers/featurizers.py | 7 ++++++- modnet/featurizers/presets/debreuck_2020.py | 6 +++--- modnet/featurizers/presets/matminer_2023.py | 6 +++--- modnet/featurizers/presets/matminer_all_2023.py | 5 ----- modnet/preprocessing.py | 2 ++ 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 4422c570..49bbcca0 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -48,16 +48,18 @@ class MODFeaturizer(abc.ABC): site_stats: Tuple[str] = ("mean", "std_dev") featurizer_mode: str = "multi" - def __init__(self, n_jobs=None): + def __init__(self, n_jobs=None, drop_allnan: bool = True): """Initialise the MODFeaturizer object with a requested number of threads to use during featurization. Arguments: n_jobs: The number of threads to use. If `None`, matminer will use `multiprocessing.cpu_count()` by default. + drop_allnan: if True, features that are fully NaNs will be removed. """ self.set_n_jobs(n_jobs) + self.set_drop_allnan(drop_allnan) def set_n_jobs(self, n_jobs: Optional[int]): """Set the no. of threads to pass to matminer for featurizer @@ -70,6 +72,9 @@ def set_n_jobs(self, n_jobs: Optional[int]): """ self._n_jobs = n_jobs + def set_drop_allnan(self, drop_allnan: bool = True): + self.drop_allnan = drop_allnan + def featurize(self, df: pd.DataFrame) -> pd.DataFrame: """Run all of the preset featurizers on the input dataframe. 
diff --git a/modnet/featurizers/presets/debreuck_2020.py b/modnet/featurizers/presets/debreuck_2020.py index dd588f14..065506bf 100644 --- a/modnet/featurizers/presets/debreuck_2020.py +++ b/modnet/featurizers/presets/debreuck_2020.py @@ -175,7 +175,7 @@ def featurize_composition(self, df): lambda x: -1 if not isinstance(x, str) else Element(x).Z ) - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) def featurize_structure(self, df): """Applies the preset structural featurizers to the input dataframe, @@ -226,7 +226,7 @@ def _int_map(x): "GlobalSymmetryFeatures|is_centrosymmetric" ].map(_int_map) - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) def featurize_site(self, df): """Applies the preset site featurizers to the input dataframe, @@ -243,7 +243,7 @@ def featurize_site(self, df): df = super().featurize_site(df, aliases=aliases) df = df.loc[:, (df != 0).any(axis=0)] - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) class CompositionOnlyFeaturizer(DeBreuck2020Featurizer): diff --git a/modnet/featurizers/presets/matminer_2023.py b/modnet/featurizers/presets/matminer_2023.py index 0d67a2a6..7e557b69 100644 --- a/modnet/featurizers/presets/matminer_2023.py +++ b/modnet/featurizers/presets/matminer_2023.py @@ -179,7 +179,7 @@ def featurize_composition(self, df): else: df.drop(columns=["IonProperty|max ionic char"], inplace=True) - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) def featurize_structure(self, df): """Applies the preset structural featurizers to the input dataframe, @@ -215,7 +215,7 @@ def _int_map(x): "GlobalSymmetryFeatures|is_centrosymmetric" ].map(_int_map) - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) def featurize_site(self, df): 
"""Applies the preset site featurizers to the input dataframe, @@ -232,7 +232,7 @@ def featurize_site(self, df): df = super().featurize_site(df, aliases=aliases) df = df.loc[:, (df != 0).any(axis=0)] - return modnet.featurizers.clean_df(df) + return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan) class CompositionOnlyMatminer2023Featurizer(Matminer2023Featurizer): diff --git a/modnet/featurizers/presets/matminer_all_2023.py b/modnet/featurizers/presets/matminer_all_2023.py index 3e26d60f..bfdfdc9e 100644 --- a/modnet/featurizers/presets/matminer_all_2023.py +++ b/modnet/featurizers/presets/matminer_all_2023.py @@ -20,7 +20,6 @@ def __init__( self, fast_oxid: bool = False, continuous_only: bool = False, - drop_allnan: bool = True, ): """Creates the featurizer and imports all featurizer functions. @@ -33,14 +32,12 @@ def __init__( continuous_only: Whether to keep only the features that are continuous with respect to the composition (only for composition featurizers). Discontinuous features may lead to discontinuities in the model predictions. - drop_allnan: if True, features that are fully NaNs will be removed. 
""" super().__init__() self.fast_oxid = fast_oxid self.continuous_only = continuous_only - self.drop_allnan = drop_allnan self.load_featurizers() def load_featurizers(self): @@ -398,13 +395,11 @@ def __init__( self, continuous_only: bool = False, oxidation_featurizers: bool = False, - drop_allnan: bool = True, fast_oxid: bool = False, ): super().__init__( fast_oxid=fast_oxid, continuous_only=continuous_only, - drop_allnan=drop_allnan, ) self.fast_oxid = fast_oxid self.structure_featurizers = () diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index f34914ba..f7690d02 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -740,6 +740,8 @@ def featurize( if n_jobs is not None: self.featurizer.set_n_jobs(n_jobs) + self.featurizer.set_drop_allnan(drop_allnan) + if self.df_featurized is not None: raise RuntimeError("Not overwriting existing featurized dataframe.") From bdc8cf15c57a5b01b89a36e4af6403e4771852b2 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 14 Jul 2023 12:21:18 +0200 Subject: [PATCH 7/8] Let this for another PR. --- modnet/models/vanilla.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index bb145796..77119444 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -846,11 +846,6 @@ def _restore_model(self): fill_value=-1, ).fit(np.zeros((1, self.n_feat))), ) - if not hasattr(self, "targets_groups"): - self.targets_groups = [x for subl in self.targets for x in subl] - LOG.warning( - "Installed modnet version (v>=0.4.0) does not match loaded model (v<0.4.0) and may result in errors. Please retrain or change your modnet version !" 
- ) def save(self, filename: str) -> None: """Save the `MODNetModel` to filename: From 476b93a800b1d4e948ce4c52a7827bf2bb2a4a55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 09:51:53 +0000 Subject: [PATCH 8/8] Bump tensorflow from 2.11.0 to 2.13.0 Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.11.0 to 2.13.0. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.11.0...v2.13.0) --- updated-dependencies: - dependency-name: tensorflow dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f82c7c1d..2a70e616 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -tensorflow==2.11.0 +tensorflow==2.13.0 tensorflow-probability==0.19.0 pandas==1.5.2 pymatgen==2023.7.20