diff --git a/install_local.sh b/install_local.sh index 9f7d7df..c067cbe 100644 --- a/install_local.sh +++ b/install_local.sh @@ -10,11 +10,17 @@ micromamba install ndcctools -c conda-forge # install psiflow pip install -e psiflow[dev] -# install PyTorch and MACE +# install PyTorch and MACE -- CUDA pip install torch --index-url https://download.pytorch.org/whl/cu128 pip install mace-torch pip install cuequivariance cuequivariance-torch cuequivariance-ops-torch-cu12 +# install PyTorch and MACE -- ROCM +pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 +pip install mace-torch +pip install openequivariance +python -c "import openequivariance" # compile some binary, maybe + # install basic PLUMED and python API micromamba install plumed -c conda-forge pip install plumed diff --git a/psiflow/abandonware.py b/psiflow/abandonware.py new file mode 100644 index 0000000..91e4982 --- /dev/null +++ b/psiflow/abandonware.py @@ -0,0 +1,75 @@ +""" +TODO: these functions seem to not be used anywhere in the codebase + probably a good idea to remove these eventually +""" + + + +@typeguard.typechecked +def _check_distances(state: Geometry, threshold: float) -> Geometry: + """ + Check if all interatomic distances in a Geometry are above a threshold. + + Args: + state: Geometry instance to check. + threshold: Minimum allowed interatomic distance. + + Returns: + Geometry: The input Geometry if all distances are above the threshold, otherwise NullState. + + Note: + This function is wrapped as a Parsl app and executed using the default_htex executor. + """ + from ase.geometry.geometry import find_mic + + if state == NullState: + return NullState + nrows = int(len(state) * (len(state) - 1) / 2) + deltas = np.zeros((nrows, 3)) + count = 0 + for i in range(len(state) - 1): + for j in range(i + 1, len(state)): + deltas[count] = state.per_atom.positions[i] - state.per_atom.positions[j] + count += 1 + assert count == nrows + if state.periodic: + deltas, _ = find_mic(deltas, state.cell) + check = np.all(np.linalg.norm(deltas, axis=1) > threshold) + if check: + return state + else: + return NullState + + +check_distances = python_app(_check_distances, executors=["default_htex"]) + + + +def test_metrics(dataset): + # TODO: extracted from test_data.test_data_extract + + data = dataset[:2] + Dataset([NullState]) + dataset[3:5] + forces = data.get("forces", elements=["Cu"]) + reference = np.zeros((5, 4, 3)) + reference[2, :] = np.nan # ensure nan is in same place + reference[:, 0] = np.nan # ensure nan is in same place + value = compute_rmse(forces, reference) + + # last three atoms are Cu + forces = np.zeros((5, 4, 3)) + for i in range(5): + forces[i, :] = data[i].result().per_atom.forces + forces[:, 0] = np.nan + assert np.allclose( + value.result(), + compute_rmse(forces, forces * np.zeros_like(forces)).result(), + ) + unreduced = compute_rmse( + forces, forces * np.zeros_like(forces), reduce=False + ).result() + assert len(unreduced) == 5 + unreduced_ = unreduced[np.array([0, 1, 3, 4], dtype=int)] + assert np.allclose( + np.sqrt(np.mean(np.square(unreduced_))), + value.result(), + ) \ No newline at end of file diff --git a/psiflow/compute.py b/psiflow/compute.py new file mode 100644 index 0000000..a6d1802 --- /dev/null +++ b/psiflow/compute.py @@ -0,0 +1,252 @@ +from pathlib import Path +from typing import Callable, ClassVar, Optional, Union, Type, Any, TypeAlias +from collections.abc import Sequence +from dataclasses import dataclass + +import numpy as np +from parsl.app.app import join_app, python_app +from parsl.dataflow.futures import AppFuture, DataFuture + +import psiflow +from psiflow.geometry import Geometry, PER_ATOM_FIELDS, DEFAULT_PROPERTIES, MISSING +from psiflow.data import Dataset +from psiflow.data.utils import insert +from psiflow.functions import Function +from psiflow.utils.apps import pack + +# TODO: what with missing values in arrays? + + +ComputeInput: TypeAlias = ( + Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry +) + + +@dataclass +class ComputeResult: + """Container to hold and manipulate the results from compute/apply operations.""" + + n_atoms: np.ndarray + data: dict[str, np.ndarray] + + def __post_init__(self): + self.cutoffs = self.n_atoms.cumsum() + + def __getattr__(self, item): + """Enable parsl deferred_getitem on AppFuture""" + if item == "data": + raise AttributeError() # prevent RecursionError from pickle + return self.data[item] + + @property + def keys(self) -> set[str]: + return set(self.data.keys()) + + def get(self, key: str, per_geom: bool = False) -> list | np.ndarray: + arr = self.data[key] + if not per_geom: + return arr + + if self.cutoffs[-1] == arr.shape[0]: + data_list = np.array_split(arr, self.cutoffs[:-1]) + else: + data_list = list(arr) + return data_list + + def to_dict(self) -> dict[str, list]: + """Convert to a dict that is extract/insert compliant for geometries""" + return {k: self.get(k, per_geom=True) for k in self.keys} + + @classmethod + def from_data(cls, n_atoms: np.ndarray, data: dict[str, list]): + values = {} + for k, v in data.items(): + assert len(v) == n_atoms.size + if not np.iterable(v[0]): + values[k] = np.array(v) + elif len(v[0]) == n_atoms[0] and len(v[-1]) == n_atoms[-1]: + values[k] = np.concatenate(v) # assume per-atom property + else: + values[k] = np.stack(v) + return cls(np.array(n_atoms), values) + + +def _apply( + states: Sequence[Geometry], + function_cls: Type[Function], + inputs: Sequence = (), + parsl_resource_specification: dict = {}, + **parameters, +) -> ComputeResult: + assert function_cls is not None + function = function_cls(**parameters) # psiflow.functions.Function subclass + output_dict = function.compute(states) + n_atoms = np.array([len(geom) for geom in states]) + return ComputeResult.from_data(n_atoms, output_dict) + + +@python_app(executors=["default_threads"]) +def concatenate_results(*results: ComputeResult) -> ComputeResult: + """""" + n_atoms_list = [result.n_atoms for result in results] + n_atoms = np.concatenate(n_atoms_list) + + data = {} + for k in results[0].keys: + values = [result.get(k) for result in results] + data[k] = np.concatenate(values) + + return ComputeResult(n_atoms, data) + + +@python_app(executors=["default_threads"]) +def aggregate_results( + *results: ComputeResult, coefficients: Optional[np.ndarray] = None +) -> ComputeResult: + """""" + if coefficients is None: + coefficients = np.ones(len(results)) + assert len(coefficients) == len(results) + + n_atoms = results[0].n_atoms + for result in results[1:]: + assert np.allclose(n_atoms, result.n_atoms) + + data = {k: np.zeros_like(v) for k, v in results[0].data.items()} + for i, result in enumerate(results): + for k in data: + data[k] += result.get(k) * coefficients[i] + + return ComputeResult(n_atoms, data) + + +@join_app +def batch_apply( + states: list[Geometry], + apply_apps: Sequence[Callable], + batch_size: int, + reduce_func: Callable, +) -> AppFuture: + """Apply a set of apps to batches of data""" + # TODO: holds everything in memory -- what with very large datasets? + n_batch = len(states) // batch_size + 1 * (len(states) % batch_size > 0) + batches = [arr.tolist() for arr in np.array_split(states, n_batch)] + + output = [] + for batch in batches: + futures = [] + for app in apply_apps: + future = app(batch) + futures.append(future) + future = reduce_func(*futures) + output.append(future) + + return concatenate_results(*output) + + +def compute( + arg: ComputeInput, + apply_apps: Sequence[Callable], + reduce_func: Union[python_app, Callable] = aggregate_results, + batch_size: Optional[int] = None, +) -> AppFuture: + """ + Compute results by applying apps to the input data. + + Args: + arg: Input data to compute on. + apply_apps: Apps to apply to the data. + reduce_func: Function to reduce results. + batch_size: Optional batch size for processing. + """ + states = input_to_geometries(arg) + if batch_size is None: + futures = [] + for app in apply_apps: + future = app(states) + futures.append(future) + future = reduce_func(*futures) + else: + future = batch_apply(states, apply_apps, batch_size, reduce_func) + + return future + + +metric_func_map: dict[str, Callable] = { + "RMSE": lambda arr1, arr2: np.mean((arr1 - arr2) ** 2) ** 0.5, + "MAE": lambda arr1, arr2: np.abs(arr1 - arr2).mean(), +} + + +def _compare_results( + result1: ComputeResult, + result2: Optional[ComputeResult] = None, + metric: str = "RMSE", + reduce: bool = True, + **kwargs: list | np.ndarray, +) -> dict: + """""" + assert (result2 is None) != (len(kwargs) == 0) # xor + if kwargs: + result2 = ComputeResult.from_data(result1.n_atoms, kwargs) + elif not np.allclose(result1.cutoffs, result2.cutoffs): + raise ValueError("Results cannot be compared") + + metric_func = metric_func_map[metric] + keys = result1.keys.intersection(result2.keys) + + out = {} + for k in keys: + if reduce: + arr1, arr2 = result1.get(k), result2.get(k) + out[k] = metric_func(arr1, arr2) + else: + list1, list2 = result1.get(k, per_geom=True), result2.get(k, per_geom=True) + out[k] = [metric_func(v1, v2) for v1, v2 in zip(list1, list2)] + + return out + + +compare_results = python_app(_compare_results, executors=["default_threads"]) + + +def _compare_arrays(arr1: Sequence, arr2: Sequence, metric: str = "RMSE") -> float: + """""" + metric_func = metric_func_map[metric] + arr1, arr2 = np.array(arr1), np.array(arr2) + return metric_func(arr1, arr2) + + +compare_arrays = python_app(_compare_arrays, executors=["default_threads"]) + + +def input_to_geometries(data: ComputeInput) -> AppFuture: + """Convert ComputeInput into a sequence of geometries (as a future)""" + # Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry + + @python_app(executors=["default_threads"]) + def prep_input(data: Geometry | Sequence[Geometry]) -> list[Geometry]: + # make sure apply apps get consistent input + if isinstance(data, Geometry): + data = [data] + return data + + if isinstance(data, Dataset): + return data.geometries() + elif isinstance(data, Geometry): + return pack(data) + elif isinstance(data, AppFuture): + return prep_input(data) + elif isinstance(data, list): + return pack(*data) + else: + assert False + + +@python_app(executors=["default_threads"]) +def insert_results( + states: Sequence[Geometry], result: ComputeResult +) -> Sequence[Geometry]: + """""" + insert(states, result.to_dict()) + return states diff --git a/psiflow/data/__init__.py b/psiflow/data/__init__.py index b91798a..7ade023 100644 --- a/psiflow/data/__init__.py +++ b/psiflow/data/__init__.py @@ -1,2 +1,2 @@ -from .dataset import Computable, Dataset, aggregate_multiple, compute # noqa: F401 -from .utils import compute_mae, compute_rmse # noqa: F401 +from .dataset import Dataset +from .file import read_frames, count_frames \ No newline at end of file diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index 4a86f39..52bbf00 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -1,38 +1,29 @@ -from __future__ import annotations # necessary for type-guarding class methods - -import math from pathlib import Path -from typing import Callable, ClassVar, Optional, Union +from typing import Optional, Union -import numpy as np -import typeguard -from parsl.app.app import join_app, python_app -from parsl.app.python import PythonApp from parsl.data_provider.files import File from parsl.dataflow.futures import AppFuture, DataFuture import psiflow -from psiflow.geometry import QUANTITIES, Geometry -from psiflow.utils.apps import copy_data_future, pack +from psiflow.geometry import Geometry +from psiflow.utils.apps import copy_data_future -from .utils import ( - align_axes, - app_filter, - apply_offset, - assign_identifiers, - batch_frames, - clean_frames, +from psiflow.data.file import ( count_frames, - extract_quantities, get_elements, - get_train_valid_indices, - insert_quantities, join_frames, - not_null, read_frames, - reset_frames, - shuffle, write_frames, + split_frames, + shuffle_frames, + apply_offset, + reset_frames, + clean_frames, + align_axes, + filter_frames, + extract_quantities, + extract_quantities_per_atom, + assign_identifiers, ) @@ -48,34 +39,23 @@ class Dataset: def __init__( self, - states: Optional[list[Union[AppFuture, Geometry]], AppFuture], + states: Optional[list[AppFuture | Geometry] | AppFuture] = None, extxyz: Optional[psiflow._DataFuture] = None, ): """ - Initialize a Dataset. + Initialize a Dataset. Provide either states or extxyz. Args: states: List of Geometry instances or AppFutures representing geometries. - extxyz: Optional DataFuture representing an existing extxyz file. - - Note: - Either states or extxyz should be provided, not both. """ - if extxyz is not None: - assert states is None + if extxyz is not None: # takes precedence over states self.extxyz = extxyz - else: - assert states is not None - if not isinstance(states, list): # AppFuture[list, geometry] - extra_states = states - states = [] - else: - extra_states = None - self.extxyz = write_frames( - *states, - extra_states=extra_states, - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] + return + + if not isinstance(states, list): # AppFuture[list, geometry] + states = [states] + file = psiflow.context().new_file("data_", ".xyz") + self.extxyz = write_frames(*states, outputs=[file]).outputs[0] def length(self) -> AppFuture: """ @@ -84,25 +64,19 @@ def length(self) -> AppFuture: Returns: AppFuture: Future representing the number of structures. """ - return count_frames(inputs=[self.extxyz]) + return count_frames(self.extxyz) - def shuffle(self): + def shuffle(self) -> "Dataset": """ Shuffle the order of structures in the dataset. - - Returns: - Dataset: A new Dataset with shuffled structures. """ - extxyz = shuffle( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + extxyz = shuffle_frames(self.extxyz, outputs=[file]).outputs[0] + return Dataset(extxyz=extxyz) def __getitem__( - self, - index: Union[int, slice, list[int], AppFuture], - ) -> Union[Dataset, AppFuture]: + self, index: int | slice | list[int] | AppFuture + ) -> Dataset | AppFuture: """ Get a subset of the dataset or a single structure. @@ -112,36 +86,23 @@ def __getitem__( Returns: Union[Dataset, AppFuture]: A new Dataset or an AppFuture of a single Geometry. """ + future_frames = read_frames(self.extxyz, indices=index) if isinstance(index, int): - future = read_frames( - [index], - inputs=[self.extxyz], - outputs=[], # will return Geometry as Future - ) - return future[0] - else: # slice, list, AppFuture - extxyz = read_frames( - index, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) - - def save(self, path: Union[Path, str]) -> DataFuture: - """ - Save the dataset to a file. + return future_frames[0] # will return Geometry as Future - Args: - path: Path to save the dataset. + # slice, list, AppFuture + file = psiflow.context().new_file("data_", ".xyz") + extxyz = write_frames(future_frames, outputs=[file]).outputs[0] + return Dataset(extxyz=extxyz) + def save(self, path: Path | str) -> DataFuture: + """ + Save the dataset to a file. Returns: DataFuture: Future representing the file to which will be saved. """ path = psiflow.resolve_and_check(Path(path)) - future = copy_data_future( - inputs=[self.extxyz], - outputs=[File(str(path))], - ) + future = copy_data_future(inputs=[self.extxyz], outputs=[File(path)]) return future.outputs[0] def geometries(self) -> AppFuture: @@ -151,226 +112,121 @@ def geometries(self) -> AppFuture: Returns: AppFuture: Future representing a list of Geometry instances. """ - return read_frames(inputs=[self.extxyz]) + return read_frames(self.extxyz) def __add__(self, dataset: Dataset) -> Dataset: """ Concatenate two datasets. - - Args: - dataset: Another Dataset to add to this one. - - Returns: - Dataset: A new Dataset containing structures from both datasets. """ - extxyz = join_frames( - inputs=[self.extxyz, dataset.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = join_frames(inputs=[self.extxyz, dataset.extxyz], outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - def subtract_offset(self, **atomic_energies: Union[float, AppFuture]) -> Dataset: + def subtract_offset(self, **atomic_energies: float | AppFuture) -> Dataset: """ Subtract atomic energy offsets from the dataset. - - Args: - **atomic_energies: Atomic energies for each element. - - Returns: - Dataset: A new Dataset with adjusted energies. """ assert len(atomic_energies) > 0 - extxyz = apply_offset( - True, - **atomic_energies, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = apply_offset( + self.extxyz, subtract=True, **atomic_energies, outputs=[file] + ) + return Dataset(extxyz=future.outputs[0]) def add_offset(self, **atomic_energies) -> Dataset: """ Add atomic energy offsets to the dataset. - - Args: - **atomic_energies: Atomic energies for each element. - - Returns: - Dataset: A new Dataset with adjusted energies. """ assert len(atomic_energies) > 0 - extxyz = apply_offset( - False, - **atomic_energies, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = apply_offset( + self.extxyz, subtract=False, **atomic_energies, outputs=[file] + ) + return Dataset(extxyz=future.outputs[0]) - def elements(self): + def elements(self) -> AppFuture: """ Get the set of elements present in the dataset. Returns: AppFuture: Future representing a set of element symbols. """ - return get_elements(inputs=[self.extxyz]) + return get_elements(self.extxyz) - def reset(self): + def reset(self) -> Dataset: """ Reset all structures in the dataset. - - Returns: - Dataset: A new Dataset with reset structures. """ - extxyz = reset_frames( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = reset_frames(self.extxyz, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - def clean(self): + def clean(self) -> Dataset: """ Clean all structures in the dataset. + """ + file = psiflow.context().new_file("data_", ".xyz") + future = clean_frames(self.extxyz, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - Returns: - Dataset: A new Dataset with cleaned structures. + def get(self, *quantities: str) -> tuple[AppFuture, ...]: """ - extxyz = clean_frames( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + Extract specified quantities from the dataset. + """ + future_dict = extract_quantities(self.extxyz, quantities) + return tuple(future_dict[q] for q in quantities) - def get( + def get_per_atom( self, *quantities: str, atom_indices: Optional[list[int]] = None, elements: Optional[list[str]] = None, - ): + ) -> tuple[AppFuture, ...]: """ - Extract specified quantities from the dataset. + Extract specified per atom quantities from the dataset. Args: *quantities: Names of quantities to extract. atom_indices: Optional list of atom indices to consider. elements: Optional list of element symbols to consider. - - Returns: - Union[AppFuture, tuple[AppFuture, ...]]: Future(s) representing the extracted quantities. - """ - result = extract_quantities( - quantities, - atom_indices, - elements, - inputs=[self.extxyz], - ) - if len(quantities) == 1: - return result[0] - else: - return tuple([result[i] for i in range(len(quantities))]) - - def evaluate( - self, - computable: Computable, - batch_size: Optional[int] = None, - ) -> Dataset: """ - Evaluate a Computable on the dataset. - - Args: - computable: Computable object to evaluate. - batch_size: Optional batch size for evaluation. - - Returns: - Dataset: A new Dataset with evaluation results. - """ - # TODO: remove this functionality? - from psiflow.hamiltonians import Hamiltonian - - if not isinstance(computable, Hamiltonian): - # avoid extracting and inserting the same quantities - return computable.compute_dataset(self) - - # use Hamiltonian.compute method - if batch_size is not None: - outputs = computable.compute(self, batch_size=batch_size) - else: - outputs = computable.compute(self) # use default from computable - if not isinstance(outputs, list): # compute unpacks for only one property - outputs = [outputs] - future = insert_quantities( - quantities=tuple(computable.outputs), - arrays=pack(*outputs), - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], + future_dict = extract_quantities_per_atom( + self.extxyz, quantities, atom_indices, elements ) - return Dataset(None, future.outputs[0]) + return tuple(future_dict[q] for q in quantities) - def filter( - self, - quantity: str, - ) -> Dataset: + def filter(self, quantity: str) -> Dataset: """ Filter the dataset based on a specified quantity. - - Args: - quantity: The quantity to filter on. - - Returns: - Dataset: A new Dataset containing only structures that pass the filter. - """ - assert quantity in QUANTITIES - extxyz = app_filter( - quantity, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) - - def not_null(self) -> Dataset: """ - Remove null states from the dataset. + # TODO: where is this used? + file = psiflow.context().new_file("data_", ".xyz") + future = filter_frames(self.extxyz, quantity, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - Returns: - Dataset: A new Dataset without null states. - """ - extxyz = not_null( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) - - def align_axes(self): + def align_axes(self) -> Dataset: """ Adopt a canonical orientation for all (periodic) structures in the dataset. - - Returns: - Dataset: A new Dataset with aligned structures. """ - extxyz = align_axes( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = align_axes(self.extxyz, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - def split(self, fraction, shuffle=True): # auto-shuffles + def split(self, fraction: float, shuffle: bool = True) -> tuple[Dataset, Dataset]: """ Split the dataset into training and validation sets. Args: fraction: Fraction of data to use for training. shuffle: Whether to shuffle before splitting. - - Returns: - tuple[Dataset, Dataset]: Training and validation datasets. """ - train, valid = get_train_valid_indices( - self.length(), - fraction, - shuffle, + assert 0 <= fraction <= 1 + file_train = psiflow.context().new_file("data_", ".xyz") + file_val = psiflow.context().new_file("data_", ".xyz") + future = split_frames( + self.extxyz, fraction, shuffle, outputs=[file_train, file_val] ) - return self.__getitem__(train), self.__getitem__(valid) + return Dataset(extxyz=future.outputs[0]), Dataset(extxyz=future.outputs[1]) def assign_identifiers( self, identifier: Union[int, AppFuture, None] = None @@ -384,273 +240,19 @@ def assign_identifiers( Returns: AppFuture: Future representing the next available identifier. """ - result = assign_identifiers( - identifier, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ) - self.extxyz = result.outputs[0] - return result + # TODO: what is the use case for this? + # this is the only method that changes inplace instead of returning a new Dataset + future = assign_identifiers(self.extxyz, identifier) + future_states, future_id = future[0], future[1] + file = psiflow.context().new_file("data_", ".xyz") + self.extxyz = write_frames(future_states, outputs=[file]).outputs[0] + return future_id @classmethod - def load( - cls, - path_xyz: Union[Path, str], - ) -> Dataset: + def load(cls, path_xyz: Union[Path, str]) -> Dataset: """ Load a dataset from a file. - - Args: - path_xyz: Path to the XYZ file. - - Returns: - Dataset: Loaded dataset. """ path_xyz = psiflow.resolve_and_check(Path(path_xyz)) assert path_xyz.exists() # needs to be locally accessible - return cls(None, extxyz=File(str(path_xyz))) - - -@typeguard.typechecked -def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]: - """ - Concatenate multiple lists of arrays. - - Args: - *args: Lists of numpy arrays to concatenate. - - Returns: - list[np.ndarray]: List of concatenated arrays. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - - def pad_arrays( - arrays: list[np.ndarray], - pad_dimension: int = 1, - ) -> list[np.ndarray]: - ndims = np.array([len(a.shape) for a in arrays]) - assert np.all(ndims == ndims[0]) - assert np.all(pad_dimension < ndims) - - pad_size = max([a.shape[pad_dimension] for a in arrays]) - for i in range(len(arrays)): - shape = list(arrays[i].shape) - shape[pad_dimension] = pad_size - shape[pad_dimension] - padding = np.zeros(tuple(shape)) + np.nan - arrays[i] = np.concatenate((arrays[i], padding), axis=pad_dimension) - return arrays - - narrays = len(args[0]) - for arg in args: - assert isinstance(arg, list) - assert all([len(a) == narrays for a in args]) - - concatenated = [] - for i in range(narrays): - arrays = [arg[i] for arg in args] - if len(arrays[0].shape) > 1: - pad_arrays(arrays) - concatenated.append(np.concatenate(tuple(arrays))) - return concatenated - - -concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"]) - - -@typeguard.typechecked -def _aggregate_multiple( - *arrays_list, - coefficients: Optional[np.ndarray] = None, -) -> list[np.ndarray]: - """ - Aggregate multiple lists of arrays with optional coefficients. - - Args: - *arrays_list: Lists of arrays to aggregate. - coefficients: Optional coefficients for weighted aggregation. - - Returns: - list[np.ndarray]: List of aggregated arrays. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - if coefficients is None: - coefficients = np.ones(len(arrays_list)) - else: - assert len(coefficients) == len(arrays_list) - - results = [np.zeros(a.shape) for a in arrays_list[0]] - for i, arrays in enumerate(arrays_list): - for j, array in enumerate(arrays): - results[j] += coefficients[i] * array - return results - - -aggregate_multiple = python_app(_aggregate_multiple, executors=["default_threads"]) - - -@join_app -@typeguard.typechecked -def batch_apply( - apply_apps: tuple[Union[PythonApp, Callable]], - arg: Union[Dataset, list[Geometry]], - batch_size: int, - length: int, - outputs: list = [], - reduce_func: Optional[PythonApp] = None, - **app_kwargs, -) -> AppFuture: - """ - Apply a set of apps to batches of data. - - Args: - apply_apps: Tuple of PythonApps or Callables to apply. - arg: Dataset or list of Geometries to process. - batch_size: Size of each batch. - length: Total number of items to process. - outputs: List of output files. - reduce_func: Optional function to reduce results. - **app_kwargs: Additional keyword arguments for the apps. - - Returns: - AppFuture: Future representing the result of batch application. - - Note: - This function is wrapped as a Parsl join_app. - """ - nbatches = math.ceil(length / batch_size) - batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)] - future = batch_frames(batch_size, inputs=[arg.extxyz], outputs=batches) - output_futures = [] - for i in range(nbatches): - futures = [] - for app in apply_apps: - f = app( - None, - inputs=[future.outputs[i]], - **app_kwargs, - ) - futures.append(f) - reduced = reduce_func(*futures) - output_futures.append(reduced) - future = concatenate_multiple(*output_futures) - return future - - -@python_app(executors=["default_threads"]) -def get_length(arg): - """ - Get the length of the input argument. - - Args: - arg: Input to get the length of. - - Returns: - int: Length of the input. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - if isinstance(arg, list): - return len(arg) - else: - return 1 - - -@typeguard.typechecked -def compute( - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *apply_apps: Union[PythonApp, Callable], - outputs_: Union[str, list[str], tuple[str, ...], None] = None, - reduce_func: Union[PythonApp, Callable] = aggregate_multiple, - batch_size: Optional[int] = None, -) -> Union[list[AppFuture], AppFuture]: - """ - Compute results by applying apps to the input data. - - Args: - arg: Input data to compute on. - *apply_apps: Apps to apply to the data. - outputs_: Names of output quantities. - reduce_func: Function to reduce results. - batch_size: Optional batch size for processing. - - Returns: - Union[list[AppFuture], AppFuture]: Future(s) representing computation results. - """ - if type(outputs_) is str: - outputs_ = [outputs_] - if batch_size is not None: - if isinstance(arg, Dataset): - length = arg.length() - else: - length = get_length(arg) - # convert to Dataset for convenience - arg = Dataset(arg) - future = batch_apply( - apply_apps, - arg, - batch_size, - length, - outputs_=outputs_, - reduce_func=reduce_func, - ) - else: - futures = [] - if isinstance(arg, Dataset): - for app in apply_apps: - future = app( - None, - outputs_=outputs_, - inputs=[arg.extxyz], - ) - futures.append(future) - else: - for app in apply_apps: - future = app( - arg, - outputs_=outputs_, - inputs=[], - ) - futures.append(future) - future = reduce_func(*futures) - if len(outputs_) == 1: - return future[0] - else: - return [future[i] for i in range(len(outputs_))] - - -@typeguard.typechecked -class Computable: - """ - Base class for computable objects. - - Attributes: - outputs (ClassVar[tuple[str, ...]]): Names of output quantities. - batch_size (ClassVar[Optional[int]]): Default batch size for computation. - """ - - outputs: ClassVar[tuple[str, ...]] = () - batch_size: ClassVar[Optional[int]] = None - - def compute( - self, - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *outputs: Optional[str], - batch_size: Optional[int] = -1, # if -1: take class default - ) -> Union[list[AppFuture], AppFuture]: - """ - Compute results for the given input. - - Args: - arg: Input data to compute on. - outputs: Names of output quantities. - batch_size: Batch size for computation. - - Returns: - Union[list[AppFuture], AppFuture]: Future(s) representing computation results. - """ - raise NotImplementedError + return cls(extxyz=File(path_xyz)) diff --git a/psiflow/data/file.py b/psiflow/data/file.py new file mode 100644 index 0000000..f86093b --- /dev/null +++ b/psiflow/data/file.py @@ -0,0 +1,239 @@ +import re +import shutil +import subprocess +from pathlib import Path +from functools import partial +from typing import Optional, Generator, TypeAlias, Any, Callable +from collections.abc import Sequence + +import numpy as np +from ase.data import chemical_symbols +from parsl import python_app, File + +from psiflow.geometry import Geometry +from psiflow.data.utils import ( + get_unique_numbers, + apply_energy_offset, + filter_quantity, + extract, + extract_per_atom, + assign_ids, +) + + +FileLike: TypeAlias = str | Path | File + + +def iter_read_frames(file: FileLike) -> Generator[list[str]]: + """Yields text data to instantiate geometries""" + frame_regex = re.compile(r"^\d+$") + with open(file, "r") as f: + for line in f: + if frame_regex.match(_ := line.strip()): + n = int(_) + yield [line] + [f.readline() for _i in range(n + 1)] + + +def _write_frames( + *states: Geometry | list[Geometry], outputs: Sequence[File] = () +) -> None: + """ + Write Geometry instances to a file. + + Args: + states: Variable number of (lists of) Geometry instances to write. + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path. + """ + assert len(outputs) == 1 + data = [] + for d in states: + if isinstance(d, list): + data.extend(d) + else: + data.append(d) + with open(outputs[0], "w") as f: + f.write("".join([geom.to_string() for geom in data])) + + +def _read_frames( + file: FileLike, indices: Optional[slice | list[int] | int] = None +) -> list[Geometry]: + """ + Read Geometry instances from a file. + + Args: + file: DataFuture representing the input file path containing the geometry data. + indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer. + + """ + if indices is None: + # read everything + data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)] + return [Geometry.from_string(s) for s in data] + + # figure out what frames to read + # TODO: reads twice + indices always wrap modulo nframes which might be unexpected + length = _count_frames(file) + if isinstance(indices, slice): + indices = list(range(length)[indices]) + elif isinstance(indices, int): + indices = [indices] + + if length > 0: + indices = [i % length for i in indices] # for negative indices and wrapping + indices_ = set(indices) # for *much* faster 'i in indices' + + data = {} + for i, geom_str_list in enumerate(iter_read_frames(file)): + if i in indices_: + data[i] = Geometry.from_string("".join(geom_str_list)) + + # return in original order + return [data[i] for i in indices] + + +def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None: + """ + Join multiple frame files into a single file. + + Args: + inputs: List of Parsl futures. Each element should be a DataFuture + representing an input file path containing geometry data. + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path where joined frames will be written. + """ + assert len(outputs) == 1 + + with open(outputs[0], "wb") as destination: + for input_file in inputs: + with open(input_file, "rb") as source: + shutil.copyfileobj(source, destination) + + +def _count_frames(file: FileLike) -> int: + """Count the number of frames in a file.""" + path = Path(file) + threshold = 10 * 1024 * 1024 # 10 MB in bytes + file_size = path.stat().st_size + + if file_size > threshold: # use grep + cmd = f"grep -Fc Properties {path}" + result = subprocess.check_output(cmd.split()) + return int(result.strip()) + return len([_ for _ in iter_read_frames(path)]) + + +def _get_elements(*files: File) -> set[str]: + """ + Get the set of elements present in all frames of a sequence of file. + """ + frames = [geom for file in files for geom in _read_frames(file)] + return {chemical_symbols[i] for i in get_unique_numbers(frames)} + + +def _split_frames( + file: FileLike, fraction: float, shuffle: bool, outputs: Sequence[File] = () +) -> None: + """Split into training and validation sets""" + assert len(outputs) == 2 + assert 0 <= fraction <= 1 + frames = _read_frames(file) + order = np.arange(len(frames)) + if shuffle: + np.random.shuffle(order) + + n_train = int(len(frames) * fraction) + train_ids = order[:n_train] + val_ids = order[n_train:] + + train = [frames[i] for i in train_ids] + val = [frames[i] for i in val_ids] + _write_frames(train, outputs=outputs[:1]) + _write_frames(val, outputs=outputs[1:]) + + +write_frames = python_app(_write_frames, executors=["default_threads"]) +read_frames = python_app(_read_frames, executors=["default_threads"]) +join_frames = python_app(_join_frames, executors=["default_threads"]) +count_frames = python_app(_count_frames, executors=["default_threads"]) +get_elements = python_app(_get_elements, executors=["default_threads"]) +split_frames = python_app(_split_frames, executors=["default_threads"]) + + +def _read_write_wrapper( + execute: Callable, # Parsl throws if this is named 'func' + file: FileLike, + *args: Any, + outputs: Sequence[File] = (), + **kwargs: Any, +) -> None: + """ + Wrapper function to make simple Geometry functions work for Dataset instances. + Essentially does (i) read (ii) modify (iii) write. + + Args: + file: DataFuture representing the input file path containing the geometry data. + execute: Function that will get executed for the geometries in file + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path where reset frames will be written. + """ + assert len(outputs) == 1 + frames_in = _read_frames(file) + frames_out = execute(frames_in, *args, **kwargs) + if frames_out is None: # execute can alter in-place + frames_out = frames_in + _write_frames(frames_out, outputs=outputs) + + +def _reset_frames(states: Sequence[Geometry]) -> None: + for geom in states: + geom.reset() + + +def _clean_frames(states: Sequence[Geometry]) -> None: + for geom in states: + geom.clean() + + +def _align_axes(states: Sequence[Geometry]) -> None: + for geom in states: + geom.align_axes() + + +def _shuffle_frames(states: Sequence[Geometry]) -> None: + np.random.shuffle(states) + + +read_write_app = python_app(_read_write_wrapper, executors=["default_threads"]) +reset_frames = partial(read_write_app, _reset_frames) +clean_frames = partial(read_write_app, _clean_frames) +align_axes = partial(read_write_app, _align_axes) +shuffle_frames = partial(read_write_app, _shuffle_frames) + +apply_offset = partial(read_write_app, apply_energy_offset) +filter_frames = partial(read_write_app, filter_quantity) + + +def _read_wrapper( + execute: Callable, # Parsl throws if this is named 'func' + file: FileLike, + *args: Any, + **kwargs: Any, +) -> Any: + """ + Wrapper function to make simple Geometry functions work for Dataset instances. + Essentially does (i) read (ii) modify (iii) return. + + Args: + file: DataFuture representing the input file path containing the geometry data. + execute: Function that will get executed for the geometries in file + """ + frames_in = _read_frames(file) + return execute(frames_in, *args, **kwargs) + + +read_app = python_app(_read_wrapper, executors=["default_threads"]) +extract_quantities = partial(read_app, extract) +extract_quantities_per_atom = partial(read_app, extract_per_atom) +assign_identifiers = partial(read_app, assign_ids) diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py index 0f54936..d9bb37a 100644 --- a/psiflow/data/utils.py +++ b/psiflow/data/utils.py @@ -1,941 +1,156 @@ -import re -import shutil -from typing import Optional, Union, Sequence +from typing import Optional +from collections.abc import Sequence import numpy as np -import typeguard -from ase.data import atomic_numbers, chemical_symbols -from parsl.app.app import python_app -from parsl.dataflow.futures import AppFuture +from ase.data import atomic_numbers +from parsl import python_app -from psiflow.geometry import Geometry, NullState, _assign_identifier, create_outputs +from psiflow.geometry import ( + Geometry, + get_atomic_energy, + DEFAULT_PROPERTIES, + PER_ATOM_FIELDS, + MISSING, +) +# TODO: some of these have side effects.. -@typeguard.typechecked -def _write_frames( - *states: Geometry, - extra_states: Union[Geometry, list[Geometry], None] = None, - outputs: list = [], -) -> None: - """ - Write Geometry instances to a file. - - Args: - *states: Variable number of Geometry instances to write. - extra_states: Additional Geometry instance(s) to write. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - all_states = list(states) - if extra_states is not None: - if isinstance(extra_states, list): - all_states += extra_states - else: # single geometry - all_states.append(extra_states) - with open(outputs[0], "w") as f: - for state in all_states: # avoid double newline by using strip! - f.write(state.to_string().strip() + "\n") - - -write_frames = python_app(_write_frames, executors=["default_threads"]) - - -@typeguard.typechecked -def _read_frames( - indices: Union[None, slice, list[int], int] = None, - inputs: list = [], - outputs: list = [], -) -> Optional[list[Geometry]]: - """ - Read Geometry instances from a file. - - Args: - indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing the geometry data. - outputs: List of Parsl futures. If provided, the first element should be - a DataFuture representing the output file path where the selected - geometries will be written. - - Returns: - Optional[list[Geometry]]: List of read Geometry instances if no output - is specified, otherwise None. - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - frame_index = 0 - frame_regex = re.compile(r"^\d+$") - length = _count_frames(inputs=inputs) - _all = range(length) - if isinstance(indices, slice): - indices = list(_all[indices]) - elif isinstance(indices, int): - indices = [list(_all)[indices]] - - if isinstance(indices, list): - if length > 0: - indices = [i % length for i in indices] # for negative indices and wrapping - indices_ = set(indices) # for *much* faster 'i in indices' - else: - assert indices is None - indices_ = None - - data = [] - with open(inputs[0], "r") as f: - while True: - line = f.readline() - if not line: - break - if frame_regex.match(line.strip()): - natoms = int(line.strip()) - - # currently at position frame_index, check if to be read - _ = [f.readline() for _i in range(natoms + 1)] - if indices_ is None or frame_index in indices_: - data.append("".join([line] + _)) - else: - data.append(None) - frame_index += 1 - - if indices is not None: # sort states accordingly - data = [data[i] for i in indices] - - if len(outputs) > 0: - with open(outputs[0], "w") as f: - f.write("\n".join([d.strip() for d in data if d is not None])) - f.write("\n") - else: - geometries = [Geometry.from_string(s) for s in data if s is not None] - return geometries - - -read_frames = python_app(_read_frames, executors=["default_threads"]) - - -@typeguard.typechecked -def _extract_quantities( - quantities: tuple[str, ...], - atom_indices: Optional[list[int]], - elements: Optional[list[str]], - *extra_data: Geometry, - inputs: list = [], -) -> tuple[np.ndarray, ...]: +def extract(states: Sequence[Geometry], quantities: Sequence[str]) -> dict[str, list]: """ Extract specified quantities from Geometry instances. - - Args: - quantities: Tuple of quantity names to extract. - atom_indices: List of atom indices to consider. - elements: List of element symbols to consider. - *extra_data: Additional Geometry instances. - inputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the input file path containing geometry data. - - Returns: - tuple[np.ndarray, ...]: Tuple of arrays containing extracted quantities. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - if not len(extra_data): - assert len(inputs) == 1 - data = _read_frames(inputs=inputs) - else: - assert len(inputs) == 0 - data = list(extra_data) - order_names = list(set([k for g in data for k in g.order])) - natoms = np.array([len(geometry) for geometry in data], dtype=int) - max_natoms = np.max(natoms) + data = {} - arrays = create_outputs(quantities, data) - for i, geometry in enumerate(data): - mask = get_index_element_mask( - geometry.per_atom.numbers, - atom_indices, - elements, - natoms_padded=int(max_natoms), - ) - natoms = len(geometry) - for j, quantity in enumerate(quantities): - if quantity == "positions": - arrays[j][i, mask, :] = geometry.per_atom.positions[mask[:natoms]] - elif quantity == "forces": - arrays[j][i, mask, :] = geometry.per_atom.forces[mask[:natoms]] - elif quantity == "cell": - arrays[j][i, :, :] = geometry.cell - elif quantity == "stress": - if geometry.stress is not None: - arrays[j][i, :, :] = geometry.stress - elif quantity == "numbers": - arrays[j][i, :] = geometry.numbers - elif quantity == "energy": - if geometry.energy is not None: - arrays[j][i] = geometry.energy - elif quantity == "delta": - if geometry.delta is not None: - arrays[j][i] = geometry.delta - elif quantity == "per_atom_energy": - if geometry.energy is not None: - arrays[j][i] = geometry.per_atom_energy - elif quantity == "phase": - if geometry.phase is not None: - arrays[j][i] = geometry.phase - elif quantity == "logprob": - if geometry.logprob is not None: - arrays[j][i, :] = geometry.logprob - elif quantity == "identifier": - if geometry.identifier is not None: - arrays[j][i] = geometry.identifier - elif quantity in order_names: - if quantity in geometry.order: - arrays[j][i] = geometry.order[quantity] - return tuple(arrays) + # figure out where to find quantities + per_atom, per_structure, unknown = [], [], [] + for q in set(quantities): + if q in PER_ATOM_FIELDS: + per_atom.append(q) + elif q in DEFAULT_PROPERTIES: + per_structure.append(q) + else: + unknown.append(q) + for q in per_atom: + data[q] = [getattr(geom.per_atom, q, MISSING) for geom in states] + for q in per_structure: + data[q] = [getattr(geom, q, MISSING) for geom in states] + for q in unknown: + # try both options + data[q] = [] + for geom in states: + value = getattr(geom, q, MISSING) + if value is MISSING: + value = getattr(geom.per_atom, q, MISSING) + data[q].append(value) -extract_quantities = python_app(_extract_quantities, executors=["default_threads"]) + return data -@typeguard.typechecked -def _insert_quantities( - quantities: tuple[str, ...], - arrays: Sequence[np.ndarray], - data: Optional[list[Geometry]] = None, - inputs: list = [], - outputs: list = [], -) -> None: +def insert(states: Sequence[Geometry], data: dict[str, list]) -> None: """ - Insert quantities into Geometry instances. - - Args: - quantities: Tuple of quantity names to insert. - arrays: List of arrays containing the quantities to insert. - data: List of Geometry instances to update. - inputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the output file path where updated geometries will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. + Insert quantities from data into Geometry instances. """ - if data is None: - assert len(inputs) == 1 - data = _read_frames(inputs=inputs) - else: - assert len(inputs) == 0 - max_natoms = max([len(geometry) for geometry in data]) - - for i, geometry in enumerate(data): - if geometry == NullState: - continue - mask = get_index_element_mask( - geometry.per_atom.numbers, - None, - None, - natoms_padded=int(max_natoms), - ) - natoms = len(geometry) - for j, quantity in enumerate(quantities): - if quantity == "positions": - geometry.per_atom.positions[:natoms, :] = arrays[j][i, mask, :] - elif quantity == "forces": - geometry.per_atom.forces[mask[:natoms]] = arrays[j][i, mask, :] - elif quantity == "cell": - geometry.cell = arrays[j][i, :, :] - elif quantity == "stress": - geometry.stress = arrays[j][i, :, :] - elif quantity == "numbers": - geometry.numbers = arrays[j][i, :] - elif quantity == "energy": - geometry.energy = arrays[j][i] - elif quantity == "delta": - geometry.delta = arrays[j][i] - elif quantity == "per_atom_energy": - geometry.per_atom_energy = arrays[j][i] - elif quantity == "phase": - geometry.phase = arrays[j][i] - elif quantity == "logprob": - geometry.logprob = arrays[j][i, :] - elif quantity == "identifier": - geometry.identifier = arrays[j][i] - elif quantity in geometry.order: - geometry.order[quantity] = arrays[j][i] + for q, values in data.items(): + assert len(states) == len(values) + for geom, v in zip(states, values): + if isinstance(v, np.ndarray) and len(geom) == len(v): + setattr(geom.per_atom, q, v) else: - raise ValueError("unknown quantity {}".format(quantity)) - if len(outputs) > 0: - _write_frames(*data, outputs=[outputs[0]]) - - -insert_quantities = python_app(_insert_quantities, executors=["default_threads"]) - - -@typeguard.typechecked -def _check_distances(state: Geometry, threshold: float) -> Geometry: - """ - Check if all interatomic distances in a Geometry are above a threshold. - - Args: - state: Geometry instance to check. - threshold: Minimum allowed interatomic distance. - - Returns: - Geometry: The input Geometry if all distances are above the threshold, otherwise NullState. - - Note: - This function is wrapped as a Parsl app and executed using the default_htex executor. - """ - from ase.geometry.geometry import find_mic - - if state == NullState: - return NullState - nrows = int(len(state) * (len(state) - 1) / 2) - deltas = np.zeros((nrows, 3)) - count = 0 - for i in range(len(state) - 1): - for j in range(i + 1, len(state)): - deltas[count] = state.per_atom.positions[i] - state.per_atom.positions[j] - count += 1 - assert count == nrows - if state.periodic: - deltas, _ = find_mic(deltas, state.cell) - check = np.all(np.linalg.norm(deltas, axis=1) > threshold) - if check: - return state - else: - return NullState - - -check_distances = python_app(_check_distances, executors=["default_htex"]) - - -@typeguard.typechecked -def _assign_identifiers( - identifier: Optional[int], - inputs: list = [], - outputs: list = [], -) -> int: - """ - Assign identifiers to Geometry instances in a file. - - Args: - identifier: Starting identifier value. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where updated geometries will be written. - - Returns: - int: Next available identifier. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(slice(None), inputs=[inputs[0]]) - states = [] - if identifier is None: # do not assign but look for max - identifier = -1 - for geometry in data: - if geometry.identifier is not None: - identifier = max(identifier, geometry.identifier) - identifier += 1 - for geometry in data: # assign those which were not yet assigned - if geometry.identifier is None: - geometry, identifier = _assign_identifier(geometry, identifier) - states.append(geometry) - _write_frames(*states, outputs=[outputs[0]]) - return identifier - else: - for geometry in data: - geometry, identifier = _assign_identifier(geometry, identifier) - states.append(geometry) - _write_frames(*states, outputs=[outputs[0]]) - return identifier - - -assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"]) - - -@typeguard.typechecked -def _join_frames( - inputs: list = [], - outputs: list = [], -): - """ - Join multiple frame files into a single file. - - Args: - inputs: List of Parsl futures. Each element should be a DataFuture - representing an input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where joined frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert len(outputs) == 1 - - with open(outputs[0], "wb") as destination: - for input_file in inputs: - with open(input_file, "rb") as source: - shutil.copyfileobj(source, destination) - - -join_frames = python_app(_join_frames, executors=["default_threads"]) - - -@typeguard.typechecked -def _count_frames(inputs: list = []) -> int: - """ - Count the number of frames in a file. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - - Returns: - int: Number of frames in the file. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - nframes = 0 - frame_regex = re.compile(r"^\d+$") - with open(inputs[0], "r") as f: - for line in f: - if frame_regex.match(line.strip()): - nframes += 1 - natoms = int(line.strip()) - _ = [f.readline() for _i in range(natoms + 1)] - return nframes - - -count_frames = python_app(_count_frames, executors=["default_threads"]) - - -@typeguard.typechecked -def _reset_frames(inputs: list = [], outputs: list = []) -> None: - """ - Reset all frames in a file. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where reset frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(inputs=[inputs[0]]) - for geometry in data: - geometry.reset() - _write_frames(*data, outputs=[outputs[0]]) - - -reset_frames = python_app(_reset_frames, executors=["default_threads"]) - - -@typeguard.typechecked -def _clean_frames(inputs: list = [], outputs: list = []) -> None: - """ - Clean all frames in a file. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where cleaned frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(inputs=[inputs[0]]) - for geometry in data: - geometry.clean() - _write_frames(*data, outputs=[outputs[0]]) - - -clean_frames = python_app(_clean_frames, executors=["default_threads"]) - - -@typeguard.typechecked -def _apply_offset( - subtract: bool, - inputs: list = [], - outputs: list = [], - **atomic_energies: float, -) -> None: - """ - Apply an energy offset to all frames in a file. - - Args: - subtract: Whether to subtract or add the offset. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where updated frames will be written. - **atomic_energies: Atomic energies for each element. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert len(inputs) == 1 - assert len(outputs) == 1 - data = _read_frames(inputs=[inputs[0]]) - numbers = [atomic_numbers[e] for e in atomic_energies.keys()] - all_numbers = [n for geometry in data for n in set(geometry.per_atom.numbers)] - for n in all_numbers: - if n != 0: # from NullState - assert n in numbers - for geometry in data: - if geometry == NullState: - continue - natoms = len(geometry) - energy = geometry.energy - for number in numbers: - natoms_per_number = np.sum(geometry.per_atom.numbers == number) - if natoms_per_number == 0: - continue - element = chemical_symbols[number] - multiplier = -1 if subtract else 1 - energy += multiplier * natoms_per_number * atomic_energies[element] - natoms -= natoms_per_number - assert natoms == 0 # all atoms accounted for - geometry.energy = energy - _write_frames(*data, outputs=[outputs[0]]) - - -apply_offset = python_app(_apply_offset, executors=["default_threads"]) - - -@typeguard.typechecked -def _get_elements(inputs: list = []) -> set[str]: - """ - Get the set of elements present in all frames of a file. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - - Returns: - set[str]: Set of element symbols. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(inputs=[inputs[0]]) - return set([chemical_symbols[n] for g in data for n in g.per_atom.numbers]) - - -get_elements = python_app(_get_elements, executors=["default_threads"]) - - -@typeguard.typechecked -def _align_axes(inputs: list = [], outputs: list = []) -> None: - """ - Align axes for all frames in a file. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where aligned frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(inputs=[inputs[0]]) - for geometry in data: - geometry.align_axes() - _write_frames(*data, outputs=[outputs[0]]) - - -align_axes = python_app(_align_axes, executors=["default_threads"]) - - -@typeguard.typechecked -def _not_null(inputs: list = [], outputs: list = []) -> list[bool]: - """ - Check which frames in a file are not null states. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the output file path where non-null frames will be written. - - Returns: - list[bool]: List of boolean values indicating non-null states. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - frame_regex = re.compile(r"^\d+$") - - data = [] - mask = [] - with open(inputs[0], "r") as f: - while True: - line = f.readline() - if not line: - break - if frame_regex.match(line.strip()): - natoms = int(line.strip()) - _ = [f.readline() for _i in range(natoms + 1)] - if natoms == 1: - if _[-1].strip()[0] == "X": # check if element(-1) == X - mask.append(False) - continue - data.append("".join([line] + _)) - mask.append(True) - if len(outputs) > 0: - with open(outputs[0], "w") as f: - f.write("\n".join(data)) - return mask - - -not_null = python_app(_not_null, executors=["default_threads"]) - - -@typeguard.typechecked -def _app_filter( + setattr(geom, q, v) + + +def extract_per_atom( + states: Sequence[Geometry], + quantities: Sequence[str], + atom_indices: Optional[Sequence[int]] = None, + elements: Optional[Sequence[str]] = None, +) -> dict[str, list]: + """ + Extract per atom quantities from Geometry instances, filtering based on atom_indices or elements. + """ + if atom_indices is None and elements is None: + # no filtering + data = {} + for k in quantities: + data[k] = [getattr(geom.per_atom, k, MISSING) for geom in states] + return data + + data = {k: [] for k in quantities} + numbers = {atomic_numbers[s] for s in elements or ()} + for geom in states: + # mask out unwanted rows + mask = np.zeros(len(geom), dtype=bool) + for n in numbers: + mask[geom.numbers == n] = True + if atom_indices is not None: + mask[atom_indices] = True + + for k in quantities: + value = getattr(geom.per_atom, k, MISSING) + if not value is MISSING: + value = value[mask] + data[k].append(value) + + return data + + +def filter_quantity( + states: Sequence[Geometry], quantity: str, - inputs: list = [], - outputs: list = [], -) -> None: +) -> list[Geometry]: """ Filter frames based on a specified quantity. - - Args: - quantity: The quantity to filter on. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where filtered frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - data = _read_frames(inputs=[inputs[0]]) - i = 0 - while i < len(data): - if quantity == "forces": - if np.all(np.invert(np.isnan(data[i].per_atom.forces))): - retain = True - else: - retain = False - elif quantity == "cell": - if not np.allclose(data[i].cell, 0.0): - retain = True - else: - retain = False - elif hasattr(data[i], quantity) and getattr(data[i], quantity) is not None: - retain = True - else: - retain = False - - # pop if necessary: - if retain: - i += 1 - else: - data.pop(i) - - _write_frames(*data, outputs=[outputs[0]]) + data = extract(states, [quantity])[quantity] + return [geom for i, geom in enumerate(states) if not data[i] is MISSING] -app_filter = python_app( - _app_filter, executors=["default_threads"] -) # filter is protected +def get_unique_numbers(states: Sequence[Geometry]) -> set[int]: + """Returns a set of unique atom numbers found across all states.""" + numbers = set(el for geom in states for el in np.unique(geom.per_atom.numbers)) + return {int(n) for n in numbers} -@typeguard.typechecked -def _shuffle( - inputs: list = [], - outputs: list = [], +def apply_energy_offset( + states: Sequence[Geometry], subtract: bool, **atomic_energies: float ) -> None: - """ - Shuffle the order of frames in a file. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where shuffled frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(inputs=[inputs[0]]) - indices = np.arange(len(data)) - np.random.shuffle(indices) - - shuffled = [data[int(i)] for i in indices] - _write_frames(*shuffled, outputs=[outputs[0]]) - - -shuffle = python_app(_shuffle, executors=["default_threads"]) - - -@typeguard.typechecked -def _train_valid_indices( - effective_nstates: int, - train_valid_split: float, - shuffle: bool, -) -> tuple[list[int], list[int]]: - """ - Generate indices for train and validation splits. - - Args: - effective_nstates: Total number of states. - train_valid_split: Fraction of states to use for training. - shuffle: Whether to shuffle the indices. - - Returns: - tuple[list[int], list[int]]: Lists of indices for training and validation sets. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - ntrain = int(np.floor(effective_nstates * train_valid_split)) - nvalid = effective_nstates - ntrain - assert ntrain > 0 - assert nvalid > 0 - order = np.arange(ntrain + nvalid, dtype=int) - if shuffle: - np.random.shuffle(order) - train_list = list(order[:ntrain]) - valid_list = list(order[ntrain : (ntrain + nvalid)]) - return [int(i) for i in train_list], [int(i) for i in valid_list] - - -train_valid_indices = python_app(_train_valid_indices, executors=["default_threads"]) - - -@typeguard.typechecked -def get_train_valid_indices( - effective_nstates: AppFuture, - train_valid_split: float, - shuffle: bool, -) -> tuple[AppFuture, AppFuture]: - """ - Get futures for train and validation indices. - - Args: - effective_nstates: Future representing the total number of states. - train_valid_split: Fraction of states to use for training. - shuffle: Whether to shuffle the indices. - - Returns: - tuple[AppFuture, AppFuture]: Futures for training and validation indices. - """ - future = train_valid_indices(effective_nstates, train_valid_split, shuffle) - return future[0], future[1] - - -@typeguard.typechecked -def get_index_element_mask( - numbers: np.ndarray, - atom_indices: Optional[list[int]], - elements: Optional[list[str]], - natoms_padded: Optional[int] = None, -) -> np.ndarray: - """ - Generate a mask for atom indices and elements. - - Args: - numbers: Array of atomic numbers. - atom_indices: List of atom indices to include. - elements: List of element symbols to include. - natoms_padded: Total number of atoms including padding. - - Returns: - np.ndarray: Boolean mask array. - """ - mask = np.array([True] * len(numbers)) - - if elements is not None: - numbers_to_include = [atomic_numbers[e] for e in elements] - mask_elements = np.array([False] * len(numbers)) - for number in numbers_to_include: - mask_elements = np.logical_or(mask_elements, (numbers == number)) - mask = np.logical_and(mask, mask_elements) - - if natoms_padded is not None: - assert natoms_padded >= len(numbers) - padding = natoms_padded - len(numbers) - mask = np.concatenate((mask, np.array([False] * padding)), axis=0).astype(bool) - - if atom_indices is not None: # below padding - mask_indices = np.array([False] * len(mask)) - mask_indices[np.array(atom_indices)] = True - mask = np.logical_and(mask, mask_indices) - - return mask - - -@typeguard.typechecked -def _compute_rmse( - array0: np.ndarray, - array1: np.ndarray, - reduce: bool = True, -) -> Union[float, np.ndarray]: - """ - Compute the Root Mean Square Error (RMSE) between two arrays. - - Args: - array0: First array. - array1: Second array. - reduce: Whether to reduce the result to a single value. - - Returns: - Union[float, np.ndarray]: RMSE value(s). - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert array0.shape == array1.shape - assert np.all(np.isnan(array0) == np.isnan(array1)) - - se = (array0 - array1) ** 2 - se = se.reshape(se.shape[0], -1) - - if reduce: # across both dimensions - mask = np.logical_not(np.isnan(se)) - return float(np.sqrt(np.mean(se[mask]))) - else: # retain first dimension - if se.ndim == 1: - return se + """Apply an energy offset to all geometries""" + numbers_data = get_unique_numbers(states) + numbers_kwargs = {atomic_numbers[e] for e in atomic_energies.keys()} + assert numbers_data == numbers_kwargs, "Provide atomic energies for all elements.." + + for geom in states: + energy = get_atomic_energy(geom, atomic_energies) + if subtract: + geom.energy -= energy else: - values = np.empty(len(se)) - for i in range(len(se)): - if np.all(np.isnan(se[i])): - values[i] = np.nan - else: - mask = np.logical_not(np.isnan(se[i])) - value = np.sqrt(np.mean(se[i][mask])) - values[i] = value - return values + geom.energy += energy -compute_rmse = python_app(_compute_rmse, executors=["default_threads"]) - - -@typeguard.typechecked -def _compute_mae( - array0, - array1, - reduce: bool = True, -) -> Union[float, np.ndarray]: +def assign_ids( + states: Sequence[Geometry], identifier: Optional[int] = None +) -> tuple[Sequence[Geometry], int]: """ - Compute the Mean Absolute Error (MAE) between two arrays. - - Args: - array0: First array. - array1: Second array. - reduce: Whether to reduce the result to a single value. - - Returns: - Union[float, np.ndarray]: MAE value(s). - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert array0.shape == array1.shape - mask0 = np.logical_not(np.isnan(array0)) - mask1 = np.logical_not(np.isnan(array1)) - assert np.all(mask0 == mask1) - ae = np.abs(array0 - array1) - to_reduce = tuple(range(1, array0.ndim)) - mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce)) - ae = ae[mask0].reshape(np.sum(1 * mask), -1) - if reduce: # across both dimensions - return float(np.sqrt(np.mean(ae))) - else: # retain first dimension - return np.sqrt(np.mean(ae, axis=1)) - - -compute_mae = python_app(_compute_mae, executors=["default_threads"]) - - -@typeguard.typechecked -def _batch_frames( - batch_size: int, - inputs: list = [], - outputs: list = [], -) -> Optional[list[Geometry]]: + Assign unique identifiers to Geometry instances, starting at provided identifier. """ - Split frames into batches. - - Args: - batch_size: Number of frames per batch. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. Each element should be a DataFuture - representing an output file path for each batch. - - Returns: - Optional[list[Geometry]]: List of Geometry instances if no outputs are specified. + # TODO: when would we want to supply the starting identifier? + if identifier is None: + # find largest existing value and add one + ids = extract(states, ["identifier"])["identifier"] + identifier = max([i for i in ids if i is not MISSING], default=0) + 1 - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - frame_regex = re.compile(r"^\d+$") + for geom in states: + if hasattr(geom, "identifier"): + continue + geom.identifier = identifier + identifier += 1 - data = [] - batch_index = 0 - with open(inputs[0], "r") as f: - while True: - line = f.readline() - if not line: - break - if frame_regex.match(line.strip()): - natoms = int(line.strip()) - _ = [f.readline() for _i in range(natoms + 1)] - data.append("".join([line] + _)) - if len(data) == batch_size: - with open(outputs[batch_index], "w") as g: - g.write("\n".join(data)) - data = [] - batch_index += 1 - else: # write and clear - pass - if len(data) > 0: - with open(outputs[batch_index], "w") as f: - f.write("\n".join([d.strip() for d in data if d is not None])) - f.write("\n") - batch_index += 1 - assert batch_index == len(outputs) + return states, identifier -batch_frames = python_app(_batch_frames, executors=["default_threads"]) +@python_app(executors=["default_threads"]) +def check_equality(state0: Geometry, state1: Geometry) -> bool: + """Check if two Geometry instances are equal""" + return state0 == state1 diff --git a/psiflow/execution.py b/psiflow/execution.py index 26e7f3a..8b47ff7 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -345,13 +345,7 @@ def wrap_in_timeout(self, command: str) -> str: return command # noop # send SIGTERM after max_runtime, follow with SIGKILL 30s later - return f"timeout -k 30s {self.max_runtime}s {command}" - - # def wrap_in_srun(self, command: str) -> str: - # # TODO: stub -- this does not work - # if self.provider is None: - # return command # noop - # return f"srun -t 1 -c $CORES {command}" + return f"timeout -v -k 30s {self.max_runtime}s {command}" def _create_threadpool(self, path: Path) -> ThreadPoolExecutor: max_threads = self.kwargs["max_threads"] @@ -464,7 +458,7 @@ def __init__( ): super().__init__(**kwargs) - if self.use_gpu and self.kwargs["gpus_per_task"] > 1: + if self.use_gpu and self.kwargs.get("gpus_per_task", 1) > 1: raise ConfigurationError("No Hamiltonian can do multi-GPU evaluation") # i-Pi will kill client connections after no response for timeout seconds diff --git a/psiflow/functions.py b/psiflow/functions.py index a461e7d..11b64e7 100644 --- a/psiflow/functions.py +++ b/psiflow/functions.py @@ -4,17 +4,16 @@ import logging from dataclasses import dataclass, field from pathlib import Path -from typing import Optional, Type, Union, Any +from typing import Optional, Union from collections.abc import Sequence import numpy as np -from ase import Atoms from ase.calculators.calculator import Calculator from ase.data import atomic_masses from ase.units import fs, kJ, mol, nm from ase.stress import voigt_6_to_full_3x3_stress -from psiflow.geometry import Geometry, NullState, create_outputs +from psiflow.geometry import Geometry def format_output( @@ -43,18 +42,15 @@ def __call__( def compute( self, - geometries: list[Geometry], - ) -> dict[str, float | np.ndarray]: + geometries: Sequence[Geometry], + ) -> dict[str, list[float | np.ndarray]]: """Evaluate multiple geometries and merge data into single arrays""" - value, grad_pos, grad_cell = create_outputs(self.outputs, geometries) + data = {k: [] for k in self.outputs} for i, geometry in enumerate(geometries): - if geometry == NullState: # TODO: remove - continue out = self(geometry) - value[i] = out["energy"] - grad_pos[i, : len(geometry)] = out["forces"] - grad_cell[i] = out["stress"] - return {"energy": value, "forces": grad_pos, "stress": grad_cell} + for k, v in out.items(): + data[k].append(v) + return data @dataclass(frozen=True) @@ -145,7 +141,7 @@ def __call__( @staticmethod def _geometry_to_key(geometry: Geometry) -> tuple: - return tuple([geometry.periodic]) + tuple(geometry.per_atom.numbers) + return tuple([geometry.periodic]) + tuple(geometry.numbers) @dataclass(frozen=True) @@ -154,7 +150,7 @@ def __call__( self, geometry: Geometry, ) -> dict[str, float | np.ndarray]: - return format_output(geometry, 0) + return format_output(geometry, 0.0) @dataclass(frozen=True) @@ -177,7 +173,6 @@ def __call__( @dataclass(frozen=True) class MACEFunction(Function): - # TODO: why are some arguments separated and others 'calc_kwargs'? model_path: str | Path ncores: int device: str @@ -212,7 +207,7 @@ def __call__( self, geometry: Geometry, ) -> dict[str, float | np.ndarray]: - atoms = geometry_to_atoms(geometry) + atoms = geometry.to_atoms(structural_only=True) self.calc.calculate(atoms) return format_output(geometry, **self.calc.results) @@ -222,7 +217,7 @@ class DispersionFunction(Function): method: str damping: str = "d3bj" params_tweaks: Optional[dict[str, float]] = None - num_threads: int = 1 + num_threads: int = 4 calc: Calculator = field(init=False) @@ -243,37 +238,12 @@ def __call__( self, geometry: Geometry, ) -> dict[str, float | np.ndarray]: - atoms = geometry_to_atoms(geometry) + atoms = geometry.to_atoms(structural_only=True) self.calc.calculate(atoms) return format_output(geometry, **self.calc.results) -def _apply( - arg: Union[Geometry, list[Geometry], None], - outputs_: tuple[str, ...], - function_cls: Type[Function], - wait_for: Any = None, - inputs: Sequence = (), - parsl_resource_specification: dict = {}, - **parameters, -) -> Optional[list[np.ndarray]]: - from psiflow.data.utils import _read_frames - - # TODO: why pass geoms through arg or inputs? - # should we not have a single keyword for input structures? - # or at least reserve inputs for arbitrary futures (to wait on)? - - assert function_cls is not None - if arg is None: - states = _read_frames(inputs=[inputs[0]]) - elif not isinstance(arg, list): - states = [arg] - else: - states = arg - function = function_cls(**parameters) - output_dict = function.compute(states) - output_arrays = [output_dict[k] for k in outputs_] - return output_arrays + def function_from_json(path: Union[str, Path], **kwargs) -> Function: @@ -296,13 +266,3 @@ def function_from_json(path: Union[str, Path], **kwargs) -> Function: data[key] = value function = function_cls(**data) return function - - -def geometry_to_atoms(geom: Geometry) -> Atoms: - """Only structural information""" - return Atoms( - positions=geom.per_atom.positions, - numbers=geom.per_atom.numbers, - cell=np.copy(geom.cell) if geom.periodic else None, - pbc=geom.periodic, - ) diff --git a/psiflow/geometry.py b/psiflow/geometry.py index c9400c4..357b5f8 100644 --- a/psiflow/geometry.py +++ b/psiflow/geometry.py @@ -1,463 +1,377 @@ -from __future__ import annotations # necessary for type-guarding class methods - +import io +import pickle +import copy from pathlib import Path -from typing import Optional, Union +from typing import Optional, Any, Final +import ase.calculators.calculator import numpy as np -import typeguard +import numpy.typing as npt from ase import Atoms -from ase.data import atomic_masses, chemical_symbols -from ase.io.extxyz import key_val_dict_to_str, key_val_str_to_dict_regex -from parsl.app.app import python_app +from ase.data import atomic_masses, chemical_symbols, atomic_numbers +from ase.io.extxyz import key_val_dict_to_str, read_xyz, key_val_str_to_dict import psiflow -per_atom_dtype = np.dtype( - [ - ("numbers", np.uint8), - ("positions", np.float64, (3,)), - ("forces", np.float64, (3,)), - ] -) - -QUANTITIES = [ - "positions", - "cell", - "numbers", - "energy", - "per_atom_energy", - "forces", - "stress", - "delta", - "logprob", - "phase", - "identifier", -] - - -@typeguard.typechecked + +# TODO: all per-atom attributes are cast to float when writing, making read-write not a noop + + +# these are always accessible for every geometry +DEFAULT_PROPERTIES = "per_atom", "cell", "energy", "stress" +PER_ATOM_FIELDS = "numbers", "positions", "forces" + + +class MissingType: + """Placeholder sentinel for missing data fields - replaces None""" + + def __repr__(self): + return "" + + def __bool__(self): + return False + + +MISSING: Final = MissingType() + + +class PerAtom: + """ + Holds 'per atom' arrays like positions and forces. + Must contain 'numbers' and 'positions', can contain anything else. + All arrays should be 2-dimensional with shape [n_atoms, *]. + """ + + numbers: npt.NDArray[np.uint8] + positions: npt.NDArray[np.float64] + forces: npt.NDArray[np.float64] + + def __init__( + self, + numbers: npt.NDArray[np.uint8], + positions: npt.NDArray[np.float64], + **kwargs: npt.NDArray, + ): + self.numbers = numbers + self.positions = positions + for k, v in kwargs.items(): + setattr(self, k, v) + + def __setattr__(self, name: str, value: np.ndarray) -> None: + if value is MISSING: + return # do not set MISSING values + elif name == "numbers": + value = value.astype(int) + elif len(value) != len(self): + raise ValueError(f"Field '{name}' is not a per-atom property..") + if value.ndim != 2: + value = value.reshape(-1, 1) # 1D arrays + super().__setattr__(name, value) + + def __getattr__(self, name): + # only runs if the attribute isn't found normally + if name == "forces": + return MISSING # forces should always be accessible + raise AttributeError( + f"'{type(self).__name__}' instance has no attribute '{name}'" + ) + + def __len__(self) -> int: + return self.numbers.shape[0] + + def __str__(self) -> str: + return f"PerAtom[{len(self)}]{set(vars(self))}" + + def attributes(self) -> dict[str, Any]: + """Return all optional per-atom attributes""" + return { + k: v for k, v in vars(self).items() if k not in ("numbers", "positions") + } + + def reset(self): + """Wipes optional fields""" + for attr in self.attributes(): + delattr(self, attr) + + def to_string(self) -> tuple[str, str]: + """Creates extxyz header and body""" + symbols = [chemical_symbols[i] for i in self.numbers.flatten()] + data = {"species": np.array(symbols).reshape(-1, 1), "pos": self.positions} + data |= self.attributes() + + # create extxyz header + header = "Properties=species:S:1:pos:R:3" + for k, arr in data.items(): + if k in header: + continue + header += f":{k}:R:{arr.shape[1]}" + + # create a structured array with all fields + dtypes = [] + for k, arr in data.items(): + dtypes += [arr.dtype] * arr.shape[1] + dtype = [(str(i), t) for i, t in enumerate(dtypes)] + + structured = np.zeros(len(self), dtype=dtype) + i = 0 + for k, arr in data.items(): + for j in range(arr.shape[1]): + structured[str(i)] = arr[:, j] + i += 1 + + # write to string + formats = [] + format_map = {"species": "%-2s"} + for k, arr in data.items(): + dformat = format_map.get(k, "%16.8f") + formats += [dformat] * arr.shape[1] + buffer = io.StringIO() + np.savetxt(buffer, structured, fmt=formats) + txt = buffer.getvalue() + + return header, txt + + @classmethod + def from_string(cls, s: str, props: Optional[str] = None): + """Reverses to_string""" + props = (props or "species:S:1:pos:R:3") + ":" + dtype_map = {"R": "f8", "S": "i8"} + + # figure out what columns belong to which property + keys, dtypes = [], [] + while props: + k, t, n, props = props.split(":", 3) + keys += [k] * int(n) + dtypes += [dtype_map[t]] * int(n) + dtype = [(str(i), t) for i, t in enumerate(dtypes)] + + # convert species to numbers automatically + conv = {0: lambda s: atomic_numbers[s]} + structured = np.loadtxt(io.StringIO(s), dtype=dtype, converters=conv) + + # group subarrays per property + data = {k: [] for k in set(keys)} + for i, k in enumerate(structured.dtype.names): + arr = structured[k] + data[keys[i]].append(arr) + arrays = {k: np.stack(v, axis=-1) for k, v in data.items()} + + arrays["numbers"] = arrays.pop("species") + arrays["positions"] = arrays.pop("pos") + return cls(**arrays) + + class Geometry: """ Represents an atomic structure with associated properties. - This class encapsulates the atomic structure, including atom positions, cell parameters, and various physical properties such as energy and forces. Attributes: - per_atom (np.recarray): Record array containing per-atom properties. - cell (np.ndarray): 3x3 array representing the unit cell vectors. - order (dict): Dictionary to store custom ordering information. + per_atom (PerAtom): Container with per-atom properties. + cell (Optional[np.ndarray]): 3x3 array representing the unit cell vectors (or None for isolated molecules). energy (Optional[float]): Total energy of the system. - stress (Optional[np.ndarray]): Stress tensor of the system. - delta (Optional[float]): Delta value, if applicable. - phase (Optional[str]): Phase information, if applicable. - logprob (Optional[np.ndarray]): Log probability values, if applicable. - stdout (Optional[str]): Standard output information, if applicable. - identifier (Optional[int]): Unique identifier for the geometry. + stress (Optional[np.ndarray]): Stress tensor of the system (3x3 array). """ - per_atom: np.recarray - cell: np.ndarray - order: dict - energy: Optional[float] - stress: Optional[np.ndarray] - delta: Optional[float] - phase: Optional[str] - logprob: Optional[np.ndarray] - stdout: Optional[str] - identifier: Optional[int] + per_atom: PerAtom + cell: Optional[np.ndarray] + energy: float + stress: np.ndarray def __init__( self, - per_atom: np.recarray, - cell: np.ndarray, - order: Optional[dict] = None, - energy: Optional[float] = None, - stress: Optional[np.ndarray] = None, - delta: Optional[float] = None, - phase: Optional[str] = None, - logprob: Optional[np.ndarray] = None, - stdout: Optional[str] = None, - identifier: Optional[int] = None, + per_atom: PerAtom, + cell: Optional[np.ndarray] = None, + **kwargs: Any, ): - """ - Initialize a Geometry instance, though the preferred way of instantiating - proceeds via the `from_data` or `from_atoms` class methods - - Args: - per_atom (np.recarray): Record array containing per-atom properties. - cell (np.ndarray): 3x3 array representing the unit cell vectors. - order (Optional[dict], optional): Custom ordering information. Defaults to None. - energy (Optional[float], optional): Total energy of the system. Defaults to None. - stress (Optional[np.ndarray], optional): Stress tensor of the system. Defaults to None. - delta (Optional[float], optional): Delta value. Defaults to None. - phase (Optional[str], optional): Phase information. Defaults to None. - logprob (Optional[np.ndarray], optional): Log probability values. Defaults to None. - stdout (Optional[str], optional): Standard output information. Defaults to None. - identifier (Optional[int], optional): Unique identifier for the geometry. Defaults to None. - """ - self.per_atom = per_atom.astype(per_atom_dtype) # copies data - self.cell = cell.astype(np.float64) - assert self.cell.shape == (3, 3) - if order is None: - order = {} - self.order = order - self.energy = energy - self.stress = stress - self.delta = delta - self.phase = phase - self.logprob = logprob - self.stdout = stdout - self.identifier = identifier - - def reset(self): - """ - Reset all computed properties of the geometry to their default values. - """ - self.energy = None - self.stress = None - self.delta = None - self.phase = None - self.logprob = None - self.per_atom.forces[:] = np.nan - - def clean(self): - """ - Clean the geometry by resetting properties and removing additional information. - """ - self.reset() - self.order = {} - self.stdout = None - self.identifier = None - - def __eq__(self, other) -> bool: - """ - Check if two Geometry instances are equal. - - Args: - other: The other object to compare with. - - Returns: - bool: True if the geometries are equal, False otherwise. - """ - if not isinstance(other, Geometry): - return False - # have to check separately for np.allclose due to different dtypes - equal = True - equal = equal and (len(self) == len(other)) - equal = equal and (self.periodic == other.periodic) - if not equal: - return False - equal = equal and np.allclose(self.per_atom.numbers, other.per_atom.numbers) - equal = equal and np.allclose(self.per_atom.positions, other.per_atom.positions) - equal = equal and np.allclose(self.cell, other.cell) - return bool(equal) - - def align_axes(self): - """ - Align the axes of the unit cell to a canonical representation for periodic systems. - """ - if self.periodic: # only do something if periodic: - positions = self.per_atom.positions - cell = self.cell - transform_lower_triangular(positions, cell, reorder=False) - reduce_box_vectors(cell) - - def __len__(self): - """ - Get the number of atoms in the geometry. - - Returns: - int: The number of atoms. - """ - return len(self.per_atom) - - def to_string(self) -> str: - """ - Convert the Geometry instance to a string representation in extended XYZ format. + """Initialize a Geometry instance. Preferably instantiate through class methods.""" + self.per_atom = per_atom + self.cell = cell + + # set optional attributes + for k, v in kwargs.items(): + setattr(self, k, v) + + def __setattr__(self, name: str, value: Any) -> None: + if value is MISSING: + return # do not set MISSING values + + # enforce format for DEFAULT_PROPERTIES + if name == "cell" and value is not None: + assert isinstance(value, np.ndarray) and value.shape == (3, 3) + elif name == "energy": + value = float(value) + # assert isinstance(value, float) + elif name == "stress": + if self.cell is None: + raise ValueError("Cannot assign stress to molecules..") + assert isinstance(value, np.ndarray) and value.shape == (3, 3) + + super().__setattr__(name, value) + + def __getattr__(self, name): + # only runs if the attribute isn't found normally + if name in ("energy", "stress"): + return MISSING # should always be accessible + raise AttributeError( + f"'{type(self).__name__}' instance has no attribute '{name}'" + ) - Returns: - str: String representation of the geometry. - """ - if self.periodic: - comment = 'Lattice="' - comment += " ".join([str(x) for x in np.reshape(self.cell.T, 9, order="F")]) - comment += '" pbc="T T T" ' - else: - comment = 'pbc="F F F" ' - - write_forces = not np.any(np.isnan(self.per_atom.forces)) - comment += "Properties=species:S:1:pos:R:3" - if write_forces: - comment += ":forces:R:3" - comment += " " - - keys = [ - "energy", - "stress", - "delta", - "phase", - "logprob", - "stdout", - "identifier", - ] - values_dict = {} - for key in keys: - value = getattr(self, key) - if value is None: + def attributes(self) -> dict[str, Any]: + """Return all non-MISSING instance attributes""" + return {k: v for k, v in vars(self).items() if k != "per_atom"} + + def reset(self) -> None: + """Reset all computed properties to their default values""" + self.per_atom.reset() + for k in {"energy", "stress"}.intersection(self.attributes()): + delattr(self, k) + + def clean(self) -> None: + """Remove all non-structural information ('numbers', 'positions', 'cell')""" + self.per_atom.reset() + for k in self.attributes(): + if k == "cell": continue - if type(value) is np.ndarray: - if np.all(np.isnan(value)): - continue - values_dict[key] = value - for key, value in self.order.items(): - values_dict["order_" + key] = value - comment += key_val_dict_to_str(values_dict) - lines = ["{}".format(len(self))] - lines.append("{}".format(comment)) - fmt = " ".join(["%2s"] + 3 * ["%16.8f"]) + " " - if write_forces: - fmt += " ".join(3 * ["%16.8f"]) - for i in range(len(self)): - entry = (chemical_symbols[self.per_atom.numbers[i]],) - entry = entry + tuple(self.per_atom.positions[i]) - if write_forces: - entry = entry + tuple(self.per_atom.forces[i]) - lines.append(fmt % entry) - return "\n".join(lines) - - def save(self, path_xyz: Union[Path, str]): - """ - Save the Geometry instance to an XYZ file. - - Args: - path_xyz (Union[Path, str]): Path to save the XYZ file. - """ - path_xyz = psiflow.resolve_and_check(Path(path_xyz)) - with open(path_xyz, "w") as f: - f.write(self.to_string()) + delattr(self, k) + + def align_axes(self) -> None: + """Align the cell axes to a canonical representation for periodic systems""" + if not self.periodic: + return + positions = self.per_atom.positions + cell = self.cell + transform_lower_triangular(positions, cell, reorder=False) + reduce_box_vectors(cell) def copy(self) -> Geometry: - """ - Create a deep copy of the Geometry instance. + """Create a deep copy of the Geometry instance""" + return pickle.loads(pickle.dumps(self)) - Returns: - Geometry: A new Geometry instance with the same data. - """ - return Geometry.from_string(self.to_string()) + def to_string(self) -> str: + """Convert the Geometry instance to a string representation in extended XYZ format""" + data = self.attributes() + data.pop("cell") # cell needs special treatment + if self.periodic: + data["Lattice"] = self.cell.T # ase does Fortran ordering + data["pbc"] = "T T T" + info_str = key_val_dict_to_str(data) + properties, txt = self.per_atom.to_string() + header = " ".join([properties, info_str]) + return "\n".join([str(len(self)), header, txt]) + + def save(self, path_xyz: Path | str) -> None: + """Save the Geometry instance to an XYZ file""" + path_xyz = psiflow.resolve_and_check(Path(path_xyz)) + path_xyz.write_text(self.to_string()) + + def to_atoms(self, structural_only: bool = False) -> Atoms: + """Convert the Geometry instance to an Atoms object""" + if structural_only: + return Atoms( + positions=self.per_atom.positions, + numbers=self.numbers, + cell=self.cell, + pbc=self.periodic, + ) + + # use the ASE extxyz reader + s = self.to_string() + return next(read_xyz(io.StringIO(s), index=0)) + + def __eq__(self, other: "Geometry") -> bool: + """Check if two Geometry instances are structurally equal""" + if ( + isinstance(other, Geometry) + and len(self) == len(other) + and self.periodic == other.periodic + and (not self.periodic or np.allclose(self.cell, other.cell)) + and np.allclose(self.per_atom.numbers, other.per_atom.numbers) + and np.allclose(self.per_atom.positions, other.per_atom.positions) + ): + return True + return False + + def __len__(self) -> int: + """Get the number of atoms in the geometry""" + return len(self.per_atom) @classmethod - def from_string(cls, s: str, natoms: Optional[int] = None) -> Optional[Geometry]: - """ - Create a Geometry instance from a string representation in extended XYZ format. - - Args: - s (str): String representation of the geometry. - natoms (Optional[int], optional): Number of atoms (if known). Defaults to None. - - Returns: - Optional[Geometry]: A new Geometry instance, or None if the string is empty. - """ - if len(s) == 0: - return None - if not natoms: # natoms in s - lines = s.strip().split("\n") - natoms = int(lines[0]) - lines = lines[1:] + def from_string(cls, s: str) -> Geometry: + """Create a Geometry instance from a string representation in extended XYZ format""" + n_atoms, header, body = s.strip().split("\n", 2) + data = key_val_str_to_dict(header) + data.pop("pbc", None) # geometry derives pbc from cell + if "Lattice" in data: + data["cell"] = data.pop("Lattice").T # ase does Fortran ordering else: - lines = s.rstrip().split( - "\n" - ) # i-PI nonperiodic starts with empty -> rstrip! - assert len(lines) == natoms + 1 - comment = lines[0] - comment_dict = key_val_str_to_dict_regex(comment) - - # read and format per_atom data - column_indices = {} - if "Properties" in comment_dict: - properties = comment_dict["Properties"].split(":") - count = 0 - for i in range(len(properties) // 3): - name = properties[3 * i] - ncolumns = int(properties[3 * i + 2]) - column_indices[name] = count - count += ncolumns - assert "pos" in column_indices # positions need to be there - - per_atom = np.recarray(natoms, dtype=per_atom_dtype) - per_atom.forces[:] = np.nan - POS_INDEX = column_indices.get("pos", 1) - FORCES_INDEX = column_indices.get("forces", None) - for i in range(natoms): - values = lines[i + 1].split() - per_atom.numbers[i] = chemical_symbols.index(values[0]) - per_atom.positions[i, :] = [ - float(_) for _ in values[POS_INDEX : POS_INDEX + 3] - ] - if FORCES_INDEX is not None: - per_atom.forces[i, :] = [ - float(_) for _ in values[FORCES_INDEX : FORCES_INDEX + 3] - ] - - order = {} - for key, value in comment_dict.items(): - if key.startswith("order_"): - order[key.replace("order_", "")] = value - - # TODO: this needs a better solution - cell = comment_dict.pop("Lattice", np.zeros((3, 3))) - if isinstance(cell, str): - # most likely an array of NaNs - cell = np.array(cell.split()).reshape((3, 3)) - cell = cell.T # transposed! - pbc = sum(comment_dict.pop("pbc", [])) == 3 - geometry = cls( - per_atom=per_atom, - cell=cell if pbc else np.zeros((3, 3)), - energy=comment_dict.pop("energy", None), - stress=comment_dict.pop("stress", None), - delta=comment_dict.pop("delta", None), - phase=comment_dict.pop("phase", None), - logprob=comment_dict.pop("logprob", None), - stdout=comment_dict.pop("stdout", None), - identifier=comment_dict.pop("identifier", None), - order=order, - ) - return geometry - - @classmethod - def load(cls, path_xyz: Union[Path, str]) -> Geometry: - """ - Load a Geometry instance from an XYZ file. + data["cell"] = None + per_atom = PerAtom.from_string(body, data.pop("Properties", None)) - Args: - path_xyz (Union[Path, str]): Path to the XYZ file. + assert len(per_atom) == int(n_atoms) + return Geometry(per_atom, **data) - Returns: - Geometry: A new Geometry instance loaded from the file. - """ + @classmethod + def load(cls, path_xyz: Path | str) -> "Geometry": + """Load a Geometry instance from an XYZ file""" path_xyz = psiflow.resolve_and_check(Path(path_xyz)) - assert path_xyz.exists() - with open(path_xyz, "r") as f: - content = f.read() + content = path_xyz.read_text() return cls.from_string(content) - @property - def periodic(self): - """ - Check if the geometry is periodic. - - Returns: - bool: True if the geometry is periodic, False otherwise. - """ - return np.any(self.cell) - - @property - def per_atom_energy(self): - """ - Calculate the energy per atom. - - Returns: - Optional[float]: Energy per atom if total energy is available, None otherwise. - """ - if self.energy is None: - return None - else: - return self.energy / len(self) - - @property - def volume(self): - """ - Calculate the volume of the unit cell. - - Returns: - float: Volume of the unit cell for periodic systems, np.nan for non-periodic systems. - """ - if not self.periodic: - return np.nan - else: - return np.linalg.det(self.cell) - - @property - def atomic_masses(self): - """ - Get the atomic masses of the atoms in the geometry. - - Returns: - np.ndarray: Array of atomic masses. - """ - return np.array([atomic_masses[n] for n in self.per_atom.numbers]) - @classmethod def from_data( - cls, - numbers: np.ndarray, - positions: np.ndarray, - cell: Optional[np.ndarray], + cls, numbers: np.ndarray, positions: np.ndarray, cell: Optional[np.ndarray] ) -> Geometry: - """ - Create a Geometry instance from atomic numbers, positions, and cell data. - - Args: - numbers (np.ndarray): Array of atomic numbers. - positions (np.ndarray): Array of atomic positions. - cell (Optional[np.ndarray]): Unit cell vectors (or None for non-periodic systems). - - Returns: - Geometry: A new Geometry instance. - """ - per_atom = np.recarray(len(numbers), dtype=per_atom_dtype) - per_atom.numbers[:] = numbers - per_atom.positions[:] = positions - per_atom.forces[:] = np.nan - if cell is not None: - cell = cell.copy() - else: - cell = np.zeros((3, 3)) - return Geometry(per_atom, cell) + """Create a Geometry instance from atomic numbers, positions, and cell data""" + per_atom = PerAtom(numbers.copy(), positions.copy()) + return Geometry(per_atom, cell=copy.copy(cell)) @classmethod def from_atoms(cls, atoms: Atoms) -> Geometry: - """ - Create a Geometry instance from an ASE Atoms object. - - Args: - atoms (Atoms): ASE Atoms object. - - Returns: - Geometry: A new Geometry instance. - """ - per_atom = np.recarray(len(atoms), dtype=per_atom_dtype) - per_atom.numbers[:] = atoms.numbers.astype(np.uint8) - per_atom.positions[:] = atoms.get_positions() - per_atom.forces[:] = atoms.arrays.get("forces", np.nan) # TODO: ASE stores forces in calc now - if np.any(atoms.pbc): - cell = np.array(atoms.cell) - else: - cell = np.zeros((3, 3)) - geometry = cls(per_atom, cell) - geometry.energy = atoms.info.get("energy", None) - geometry.stress = atoms.info.get("stress", None) - geometry.delta = atoms.info.get("delta", None) - geometry.phase = atoms.info.get("phase", None) - geometry.logprob = atoms.info.get("logprob", None) - geometry.stdout = atoms.info.get("stdout", None) - geometry.identifier = atoms.info.get("identifier", None) - return geometry - - -def new_nullstate(): - """ - Create a new null state Geometry. + """Create a Geometry instance from an ASE Atoms object""" + per_atom = PerAtom(**atoms.arrays) + data = atoms.info + if all(atoms.pbc): + data["cell"] = atoms.cell.array + if atoms.calc is not None: + # ASE does stupid calc things + try: + data["energy"] = atoms.get_potential_energy() + per_atom.forces = atoms.get_forces() + data["stress"] = atoms.get_stress(voigt=False) + except ase.calculators.calculator.PropertyNotImplementedError: + pass # property was not stored in calc + return cls(per_atom, **data) - Returns: - Geometry: A Geometry instance representing a null state. - """ - return Geometry.from_data(np.zeros(1), np.zeros((1, 3)), None) + @property + def periodic(self) -> bool: + """Check if the geometry is periodic""" + return not self.cell is None + @property + def per_atom_energy(self) -> float | MISSING: + """Calculate the per-atom energy""" + if self.energy is MISSING: + return MISSING + return self.energy / len(self) -# use universal dummy state -NullState = new_nullstate() + @property + def volume(self) -> Optional[float]: + """Calculate the volume of the unit cell""" + if not self.periodic: + return None + return np.linalg.det(self.cell) + + @property + def atomic_masses(self) -> npt.NDArray: + """Get the atomic masses of the atoms in the geometry""" + return np.array([atomic_masses[n] for n in self.numbers]) + + @property + def numbers(self) -> npt.NDArray: + """Get a flattened version of per_atom.numbers""" + return self.per_atom.numbers.flatten() def is_lower_triangular(cell: np.ndarray) -> bool: @@ -554,7 +468,6 @@ def reduce_box_vectors(cell: np.ndarray): cell[1, :] = cell[1, :] - cell[0, :] * np.round(cell[1, 0] / cell[0, 0]) -@typeguard.typechecked def get_mass_matrix(geometry: Geometry) -> np.ndarray: """ Compute the mass matrix for a given geometry. @@ -573,7 +486,6 @@ def get_mass_matrix(geometry: Geometry) -> np.ndarray: return np.outer(sqrt_inv, sqrt_inv) -@typeguard.typechecked def mass_weight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: """ Apply mass-weighting to a Hessian matrix. @@ -590,7 +502,6 @@ def mass_weight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: return hessian * get_mass_matrix(geometry) -@typeguard.typechecked def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: """ Remove mass-weighting from a Hessian matrix. @@ -607,105 +518,14 @@ def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: return hessian / get_mass_matrix(geometry) -def create_outputs(quantities: list[str], data: list[Geometry]) -> list[np.ndarray]: - """ - Create output arrays for specified quantities from a list of Geometry instances. - - Args: - quantities (list[str]): List of quantity names to extract. - data (list[Geometry]): List of Geometry instances. - - Returns: - list[np.ndarray]: List of arrays containing the requested quantities. - """ - order_names = list(set([k for g in data for k in g.order])) - assert all([q in QUANTITIES + order_names for q in quantities]) - natoms = np.array([len(geometry) for geometry in data], dtype=int) - max_natoms = np.max(natoms) - nframes = len(data) - nprob = 0 - max_phase = 0 - for state in data: - if state.logprob is not None: - nprob = max(len(state.logprob), nprob) - if state.phase is not None: - max_phase = max(len(state.phase), max_phase) - - arrays = [] - for quantity in quantities: - if quantity in ["positions", "forces"]: - array = np.empty((nframes, max_natoms, 3), dtype=np.float64) - array[:] = np.nan - elif quantity in ["cell", "stress"]: - array = np.empty((nframes, 3, 3), dtype=np.float64) - array[:] = np.nan - elif quantity in ["numbers"]: - array = np.empty((nframes, max_natoms), dtype=np.uint8) - array[:] = 0 - elif quantity in ["energy", "delta", "per_atom_energy"]: - array = np.empty((nframes,), dtype=np.float64) - array[:] = np.nan - elif quantity in ["phase"]: - array = np.empty((nframes,), dtype=(np.unicode_, max_phase)) - array[:] = "" - elif quantity in ["logprob"]: - array = np.empty((nframes, nprob), dtype=np.float64) - array[:] = np.nan - elif quantity in ["identifier"]: - array = np.empty((nframes,), dtype=np.int64) - array[:] = -1 - elif quantity in order_names: - array = np.empty((nframes,), dtype=np.float64) - array[:] = np.nan - else: - raise AssertionError("missing quantity in if/else") - arrays.append(array) - return arrays - - -def _assign_identifier( - state: Geometry, - identifier: int, - discard: bool = False, -) -> tuple[Geometry, int]: - """ - Assign an identifier to a Geometry instance. - - Args: - state (Geometry): Input Geometry instance. - identifier (int): Identifier to assign. - discard (bool, optional): Whether to discard the state. Defaults to False. - - Returns: - tuple[Geometry, int]: Updated Geometry and next available identifier. - """ - if (state == NullState) or discard: - return state, identifier - else: - assert state.identifier is None - state.identifier = identifier - return state, identifier + 1 - - -assign_identifier = python_app(_assign_identifier, executors=["default_threads"]) - - -@typeguard.typechecked -def _check_equality( - state0: Geometry, - state1: Geometry, -) -> bool: - """ - Check if two Geometry instances are equal. - - Args: - state0 (Geometry): First Geometry instance. - state1 (Geometry): Second Geometry instance. - - Returns: - bool: True if the Geometry instances are equal, False otherwise. - """ - return state0 == state1 - - -check_equality = python_app(_check_equality, executors=["default_threads"]) +def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> float: + """Compute the total atomic energy based on single atom energies""" + total = 0 + numbers, counts = np.unique(geometry.numbers, return_counts=True) + for number, count in zip(numbers, counts): + symbol = chemical_symbols[number] + try: + total += count * atomic_energies[symbol] + except KeyError: + raise KeyError(f"No atomic energy value for symbol '{symbol}'..") + return float(total) diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py index f101188..dd65fc4 100644 --- a/psiflow/hamiltonians.py +++ b/psiflow/hamiltonians.py @@ -11,7 +11,15 @@ from parsl.dataflow.futures import AppFuture, Future import psiflow -from psiflow.data import Computable, Dataset, aggregate_multiple, compute +from psiflow.data import Dataset +from psiflow.compute import ( + aggregate_results, + compute, + _apply, + ComputeInput, + ComputeResult, + insert_results, +) from psiflow.functions import ( EinsteinCrystalFunction, HarmonicFunction, @@ -19,18 +27,12 @@ PlumedFunction, ZeroFunction, DispersionFunction, - _apply, ) from psiflow.geometry import Geometry from psiflow.utils._plumed import remove_comments_printflush from psiflow.utils.apps import get_attribute from psiflow.utils.io import dump_json - -# TODO: comparison logic in __eq__ only works for hamiltonians without futures -# TODO: dataclasses automatically generate __eq__ - - logger = logging.getLogger(__name__) # logging per module @@ -39,23 +41,20 @@ apply_modelevaluation = python_app(_apply, executors=["ModelEvaluation"]) -# TODO: why have the Computable class? -class Hamiltonian(Computable): - batch_size = 1000 +class Hamiltonian: outputs: ClassVar[tuple] = ("energy", "forces", "stress") function_name: ClassVar[str] - def compute( - self, - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *outputs: Optional[str], - batch_size: Optional[int] = -1, # if -1: take class default TODO: why? - ) -> Union[list[AppFuture], AppFuture]: - if len(outputs) == 0: - outputs = tuple(self.__class__.outputs) - if batch_size == -1: - batch_size = self.__class__.batch_size - return compute(arg, self.get_app(), outputs_=outputs, batch_size=batch_size) + def compute(self, arg: ComputeInput, batch_size: int = 1000) -> AppFuture: + """Evaluate hamiltonian function on input data and return ComputeResult""" + return compute(arg, [self.get_app()], batch_size=batch_size) + + def evaluate(self, arg: Dataset, batch_size: int = 1000) -> Dataset: + """Return new dataset labelled with hamiltonian function""" + geoms = arg.geometries() + result = self.compute(geoms, batch_size) + future = insert_results(geoms, result) + return Dataset(future) def __eq__(self, other: "Hamiltonian") -> bool: raise NotImplementedError @@ -107,25 +106,12 @@ def __init__( self.hamiltonians = list(hamiltonians) self.coefficients = list(coefficients) - def compute( # override compute for efficient batching - self, - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - outputs: Union[str, list[str], None] = None, - batch_size: Optional[int] = None, - ) -> Union[list[AppFuture], AppFuture]: - if outputs is None: - outputs = list(self.__class__.outputs) + def compute(self, arg: ComputeInput, batch_size: int = 1000) -> AppFuture: apply_apps = [h.get_app() for h in self.hamiltonians] reduce_func = partial( - aggregate_multiple, coefficients=np.array(self.coefficients) - ) - return compute( - arg, - *apply_apps, - outputs_=outputs, - reduce_func=reduce_func, - batch_size=batch_size, + aggregate_results, coefficients=np.array(self.coefficients) ) + return compute(arg, apply_apps, reduce_func=reduce_func, batch_size=batch_size) def __eq__(self, hamiltonian: Hamiltonian) -> bool: if type(hamiltonian) is not MixtureHamiltonian: @@ -223,9 +209,6 @@ def serialize(self, **kwargs) -> list[DataFuture]: class Zero(Hamiltonian): function_name: ClassVar[str] = "ZeroFunction" - def __init__(self): - pass - def get_app(self) -> Callable: return partial(apply_threads, function_cls=ZeroFunction) @@ -311,7 +294,7 @@ def get_app(self) -> Callable: return partial( apply_htex, function_cls=PlumedFunction, - wait_for=self.external, + inputs=[self.external], **self.parameters(), ) @@ -428,7 +411,7 @@ def get_app(self) -> Callable: return partial( apply_modelevaluation, function_cls=MACEFunction, - wait_for=self.external, + inputs=[self.external], parsl_resource_specification=evaluation.wq_resources(1), **self.parameters(include_env=True), ) diff --git a/psiflow/learning.py b/psiflow/learning.py index 11cf8d0..5dca750 100644 --- a/psiflow/learning.py +++ b/psiflow/learning.py @@ -8,13 +8,14 @@ import numpy as np import typeguard from parsl.app.app import python_app -from parsl.dataflow.futures import AppFuture +from parsl.dataflow.futures import AppFuture, Future import psiflow from psiflow.data import Dataset -from psiflow.geometry import Geometry, NullState, assign_identifier +from psiflow.geometry import Geometry#, NullState, assign_identifier from psiflow.hamiltonians import Hamiltonian, Zero from psiflow.metrics import Metrics +from psiflow.models import MACE # from psiflow.models import Model from psiflow.reference import Reference from psiflow.sampling import SimulationOutput, Walker, sample @@ -24,6 +25,24 @@ logger = logging.getLogger(__name__) # logging per module +# TODO: reset_model method to remove existing MLP + train/val split +# TODO: some plugin for data discarding + walker reset +# TODO: some plugin for data selection / uncertainty thing +# TODO: some plugin for wandb +# TODO: add_data method after init? +# TODO: clean_data method to throw away ridiculous structures +# TODO: remove psiflow.wait calls -- but still have blocking functionality + +# TODO: PASSIVE LEARNING +# 1. walk MD + store structures + track which structure belongs to which walker +# 2. some data selection / cleanup step +# 3. reference single point + hamiltonian eval +# 4. compare geometries + filter + discard + keep track of walkers to reset +# 5. update train/val and retrain + + + +# TODO: remove -> merge into one app @typeguard.typechecked def _compute_error( state0: Geometry, @@ -45,7 +64,7 @@ def _compute_error( compute_error = python_app(_compute_error, executors=["default_threads"]) - +# TODO: remove -> merge into one app @typeguard.typechecked def _exceeds_error( errors: np.ndarray, @@ -56,7 +75,7 @@ def _exceeds_error( exceeds_error = python_app(_exceeds_error, executors=["default_threads"]) - +# TODO: remove -> merge into one app @typeguard.typechecked def evaluate_outputs( outputs: list[SimulationOutput], @@ -99,18 +118,24 @@ def evaluate_outputs( return identifier, data, resets -@typeguard.typechecked -# @psiflow.serializable +class Thresholds: + """Container for hyperparams""" + pass + +@psiflow.register_serializable class Learning: + root: Path + model: MACE reference: Reference - path_output: str - identifier: Union[AppFuture, int] train_valid_split: float - error_thresholds_for_reset: list[Optional[float]] - error_thresholds_for_discard: list[Optional[float]] - metrics: Metrics + identifier: Union[AppFuture, int] + error_tresholds: Optional[Thresholds] + # error_thresholds_for_reset: list[Optional[float]] + # error_thresholds_for_discard: list[Optional[float]] + # metrics: Metrics iteration: int - data: Dataset + wait_for: Future + # data: Dataset def __init__( self, @@ -123,6 +148,8 @@ def __init__( wandb_project: Optional[str] = None, initial_data: Optional[Dataset] = None, ): + # TODO: accept model argument + # TODO: differentiate between fresh start <> restart self.reference = reference self.path_output = str(path_output) Path(self.path_output).mkdir(exist_ok=True, parents=True) @@ -145,6 +172,7 @@ def __init__( self.iteration = -1 def update(self, learning: Learning) -> None: + # TODO: does this make sense? assert self.path_output == learning.path_output self.train_valid_split = learning.train_valid_split self.error_thresholds_for_reset = learning.error_thresholds_for_reset @@ -156,6 +184,7 @@ def update(self, learning: Learning) -> None: self.iteration = learning.iteration def skip(self, name: str) -> bool: + # TODO: upon restart, should we not immediately find how far we got and what the next action should be? if not (Path(self.path_output) / name).exists(): return False else: # check that the iteration has completed, or delete @@ -168,6 +197,7 @@ def skip(self, name: str) -> bool: return True def save(self, name: str, model: Model, walkers: list[Walker]) -> None: + # TODO: fully revamp this method model.save(Path(self.path_output) / name) self.data.save(Path(self.path_output) / name / "data.xyz") @@ -216,6 +246,7 @@ def passive_learning( steps: int, **sampling_kwargs, ) -> tuple[Model, list[Walker]]: + # TODO: remove model argument + return only walkers self.iteration += 1 name = "{}_{}".format(self.iteration, "passive_learning") if self.skip(name): @@ -231,6 +262,9 @@ def passive_learning( step = steps nevaluations = steps // step for _i in range(nevaluations): + # TODO: splits into multiple simulations and does data evaluation + validation in steps + # -- do not think we want this.. The instruction is to collect multiple geometries per walker + # -- because we should handle all data validation in one go (errors / uncertainty ...) outputs = sample( walkers, steps=step, @@ -238,8 +272,10 @@ def passive_learning( **sampling_kwargs, ) # only apply thresholds to forces, not to energies since large offset can exist + # TODO: give users the choice? threshold_reset = [None, self.error_thresholds_for_reset[1]] threshold_discard = [None, self.error_thresholds_for_discard[1]] + # TODO: do single point evaluations here identifier, data, _ = evaluate_outputs( # ignore resets outputs, hamiltonian, @@ -252,6 +288,7 @@ def passive_learning( self.identifier = identifier self.data += data # automatically sorted based on identifier + # TODO: keep consistent train/val splits and do not reset model train, valid = self.data.split(self.train_valid_split) # also shuffles model.reset() model.initialize(train) diff --git a/psiflow/metrics.py b/psiflow/metrics.py index 7b7e351..60cb99c 100644 --- a/psiflow/metrics.py +++ b/psiflow/metrics.py @@ -386,7 +386,8 @@ def _update_logs( inputs: list = [], outputs: list = [], ): - from psiflow.data.utils import _compute_rmse, _extract_quantities, _read_frames + from psiflow.data.utils import _extract_quantities, _read_frames + from psiflow.compute import _compute_rmse from psiflow.utils.io import _load_metrics, _save_metrics data0 = _read_frames(inputs=[inputs[1]]) diff --git a/psiflow/models/mace.py b/psiflow/models/mace.py index cd8c41b..6d2c8d7 100644 --- a/psiflow/models/mace.py +++ b/psiflow/models/mace.py @@ -22,7 +22,6 @@ # TODO: when changing the training dataset, the computed avg_num_neighbors will also change, # making old checkpoints inconsistent -# TODO: sanitise_config consistently called? # TODO: training fails when batch_size > dataset (see github issue) # TODO: restart_latest functionality to continue training?? @@ -59,8 +58,7 @@ def sanitise_config(config: dict) -> dict: def format_E0s(atomic_energies: dict) -> dict | str: - """""" - # TODO: MACE fails when atomic energy is not defined for every element.. + """MACE will fail if atomic energy is not defined for all elements""" if atomic_energies: return {atomic_numbers[k]: v for k, v in atomic_energies.items()} else: diff --git a/psiflow/reference/cp2k_.py b/psiflow/reference/cp2k_.py index 014a1e4..6000624 100644 --- a/psiflow/reference/cp2k_.py +++ b/psiflow/reference/cp2k_.py @@ -4,8 +4,6 @@ from typing import Optional from collections.abc import Sequence -import numpy as np -from ase.data import chemical_symbols from ase.units import Bohr, Ha from cp2k_input_tools.generator import CP2KInputGenerator from cp2k_input_tools.parser import CP2KInputParserSimplified @@ -130,9 +128,6 @@ def get_single_atom_references(self, element: str) -> dict[int, Reference]: else: input_section["scf"] = {"ot": {"minimizer": "CG"}} - # necessary for oxygen calculation, at least in 2024.1 - # TODO: not an allowed keyword in 2025.2 - # input_section["scf"]["ignore_convergence_failure"] = "TRUE" input_str = dict_to_str(input_dict) references = {} @@ -153,13 +148,12 @@ def create_input(self, geom: Geometry) -> tuple[bool, Optional[File]]: section_copy = copy.deepcopy(section) section.pop("topology", None) # remove topology section - # insert geometry and cell - symbols = np.array(chemical_symbols)[geom.per_atom.numbers] - coord = [ - f"{s:5} {p[0]:<15.8f} {p[1]:<15.8f} {p[2]:<15.8f}" - for s, p in zip(symbols, geom.per_atom.positions) - ] - section["coord"] = {"*": coord} + # insert coordinates + _, body = geom.per_atom.to_string() + coord_list = body.split("\n")[:-1] + section["coord"] = {"*": coord_list} + + # insert cell cell = {} for i, vector in enumerate(["A", "B", "C"]): cell[vector] = "{} {} {}".format(*geom.cell[i]) diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py index 1a6d6d7..38ef513 100644 --- a/psiflow/reference/reference.py +++ b/psiflow/reference/reference.py @@ -1,6 +1,5 @@ -import warnings import logging -from typing import Optional, Union, Callable, Optional +from typing import Callable, Optional from collections.abc import Sequence from pathlib import Path from functools import partial @@ -14,12 +13,15 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import extract_quantities -from psiflow.geometry import Geometry, NullState +from psiflow.geometry import Geometry from psiflow.utils.apps import copy_app_future from psiflow.utils.parse import LineNotFoundError, get_task_name_id +# TODO: existing Reference instances do not react to changes in execution environment +# e.g., altering max_runtime is not retroactively updated + + logger = logging.getLogger(__name__) # logging per module @@ -31,13 +33,14 @@ class Status(Enum): def update_geometry(geom: Geometry, data: dict) -> Geometry: """""" + # TODO: is the metadata not more a logging thing? # data should contain 'stdout' and 'status' keys task_name, task_id = get_task_name_id(data["stdout"]) logger.info(f'Task "{task_name}" (ID {task_id}): {data["status"].name}') geom = geom.copy() geom.reset() - geom.order["status"], geom.order["task_id"] = data["status"].name, task_id + geom.metadata = {'status': data["status"].name, 'task_id': task_id} if data["status"] != Status.SUCCESS: return geom @@ -47,18 +50,18 @@ def update_geometry(geom: Geometry, data: dict) -> Geometry: shift = data["positions"][0] - geom.per_atom.positions[0] if not np.allclose(data["positions"], geom.per_atom.positions + shift, atol=1e-6): # output does not match geometry up to a translation - geom.order["status"] = Status.INCONSISTENT + geom.status = Status.INCONSISTENT return geom geom.energy = data["energy"] - geom.order["runtime"] = data.get("runtime") + geom.metadata["runtime"] = data.get("runtime") if "forces" in data: - geom.per_atom.forces[:] = data["forces"] + geom.per_atom.forces = data["forces"] return geom -@join_app -def get_minimum_energy(element: str, **kwargs) -> AppFuture[float]: +@python_app(executors=['default_threads']) +def get_minimum_energy(element: str, **kwargs) -> AppFuture: energies = {m: state.energy or np.inf for m, state in kwargs.items()} energy = min(energies.values()) msg = [ @@ -67,11 +70,11 @@ def get_minimum_energy(element: str, **kwargs) -> AppFuture[float]: ] logger.info('\n'.join(msg)) assert not np.isinf(energy), f"Atomic energy calculation of '{element}' failed" - return copy_app_future(energy) # TODO: why? + return energy def get_spin_multiplicities(element: str) -> list[int]: - """TODO: rethink this""" + """TODO: redo this""" # max S = N * 1/2, max mult = 2 * S + 1 from ase.symbols import atomic_numbers @@ -84,18 +87,6 @@ def get_spin_multiplicities(element: str) -> list[int]: return mults -@join_app -def compute_dataset( - dataset: Dataset, - length: int, - reference: "Reference", -) -> list[AppFuture[Geometry]]: - logger.info(f"Performing {length} {reference.__class__.__name__} calculations.") - geometries = dataset.geometries() # read it once - evaluated = [reference.evaluate(geometries[i]) for i in range(length)] - return evaluated - - def _execute( bash_template: str, inputs: list[File], @@ -125,38 +116,15 @@ def __init__( if (n := self.n_cores) is not None: assert n <= definition.spec["cores"] - def compute( - self, - arg: Union[Dataset, Geometry, AppFuture, list], - *outputs: Optional[Union[str, tuple]], - ): - # TODO: deprecate? Reference evaluations are too costly - for output in outputs: - if output not in self.outputs: - raise ValueError("output {} not in {}".format(output, self.outputs)) - - # TODO: convert_to_dataset util? - if isinstance(arg, Dataset): - dataset = arg - elif isinstance(arg, list): - dataset = Dataset(arg) - elif isinstance(arg, AppFuture) or isinstance(arg, Geometry): - dataset = Dataset([arg]) - else: - raise TypeError - - # TODO: writing to dataset first is wasted overhead, - # but extract_quantities does not accept AppFuture[list[Geometry]] (yet?) - future_dataset = self.compute_dataset(dataset) - outputs = outputs or self.outputs - future_data = extract_quantities( - tuple(outputs), None, None, inputs=[future_dataset.extxyz] - ) - if len(outputs) == 1: - return future_data[0] - return [future_data[_] for _ in range(len(outputs))] + def __call__(self, geometry: Geometry | AppFuture) -> AppFuture: + return evaluate(geometry, self) + + def evaluate(self, dataset: Dataset) -> Dataset: + """Return a new labelled dataset""" + future = evaluate_geometries(dataset.geometries(), self) + return Dataset(future) - def compute_atomic_energy(self, element, box_size=None) -> AppFuture[float]: + def compute_atomic_energy(self, element, box_size=None) -> AppFuture: references = self.get_single_atom_references(element) state = Geometry.from_data( numbers=np.array([atomic_numbers[element]]), @@ -165,11 +133,10 @@ def compute_atomic_energy(self, element, box_size=None) -> AppFuture[float]: ) futures = {} for mult, reference in references.items(): - futures[str(mult)] = reference.evaluate(state) - return get_minimum_energy(element, **futures) + futures[str(mult)] = reference(state) + return get_minimum_energy(element, **futures) # float def get_execute_app(self) -> Callable: - # TODO: how often is this called? context = psiflow.context() definition = context.definitions[self.executor] return partial( @@ -179,13 +146,6 @@ def get_execute_app(self) -> Callable: label=self._execute_label, ) - def evaluate(self, geometry: Geometry | AppFuture) -> AppFuture: - return evaluate(geometry, self) - - def compute_dataset(self, dataset: Dataset) -> Dataset: - future = compute_dataset(dataset, dataset.length(), self) - return Dataset(future) - def get_single_atom_references(self, element: str) -> dict[int, "Reference"]: raise NotImplementedError @@ -206,7 +166,7 @@ def _process_output( try: data = reference.parse_output(stdout) except LineNotFoundError: - data = {"status": Status.FAILED} # TODO: find out what went wrong? + data = {"status": Status.FAILED} data |= {"stdout": Path(inputs[0]), "stderr": Path(inputs[1])} return update_geometry(geom, data) @@ -218,21 +178,18 @@ def _process_output( def evaluate( geom: Geometry, reference: Reference, - execute_func: Optional[Callable] = None, + execute_app: Optional[Callable] = None, ) -> AppFuture: """""" - if geom == NullState: # TODO: remove this - warnings.warn("Skipping NullState..") - return copy_app_future(geom) - flag, *files = reference.create_input(geom=geom) - if not flag: # TODO: should we reset geom? + if not flag: + geom.reset() # remove fields to indicate no evaluation happened return copy_app_future(geom) # do the actual evaluation - if execute_func is None: - execute_func = reference.get_execute_app() - future = execute_func(inputs=files) + if execute_app is None: + execute_app = reference.get_execute_app() + future = execute_app(inputs=files) future = process_output( geom=geom, @@ -240,3 +197,15 @@ def evaluate( inputs=[future.stdout, future.stderr, future], # wait for future ) return future + + +@join_app +def evaluate_geometries( + states: Sequence[Geometry], + reference: "Reference", +) -> list[AppFuture]: + msg = f"Performing {len(states)} {reference.__class__.__name__} calculations." + logger.info(msg) + execute_app = reference.get_execute_app() + evaluated = [evaluate(geom, reference, execute_app) for geom in states] + return evaluated diff --git a/psiflow/sampling/_ase.py b/psiflow/sampling/_ase.py index e66ae3d..cb23c1a 100644 --- a/psiflow/sampling/_ase.py +++ b/psiflow/sampling/_ase.py @@ -1,6 +1,5 @@ """ Structure optimisation through ASE -TODO: do we need to check for very large forces? """ import os @@ -169,9 +168,10 @@ def main(args: SimpleNamespace): Path(args.output_xyz).touch() return - atoms.set_constraint() # usually noop if not any(atoms.pbc): - atoms.cell = None # remove meaningless cell + # remove meaningless cell and stress + atoms.cell = None + atoms.calc.results.pop("stress", None) ase.io.write(FILE_OUT, atoms, format="extxyz") shutil.copy(FILE_OUT, args.output_xyz) diff --git a/psiflow/sampling/ase.py b/psiflow/sampling/ase.py index 69dde69..8a7f9e2 100644 --- a/psiflow/sampling/ase.py +++ b/psiflow/sampling/ase.py @@ -7,7 +7,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import write_frames +from psiflow.data.file import write_frames from psiflow.geometry import Geometry from psiflow.hamiltonians import Hamiltonian from psiflow.utils.io import _dump_json diff --git a/psiflow/sampling/driver.py b/psiflow/sampling/driver.py index b56b852..463201f 100644 --- a/psiflow/sampling/driver.py +++ b/psiflow/sampling/driver.py @@ -74,7 +74,10 @@ def compute_structure(self, cell, pos): unit_to_internal("energy", "electronvolt", energy), np.float64 ) force_ipi = np.asarray(unit_to_internal("force", "ev/ang", forces), np.float64) - vir_calc = -stress * self.geometry.volume + if self.geometry.periodic: + vir_calc = -stress * self.geometry.volume + else: + vir_calc = np.zeros_like(stress) vir_ipi = np.array( unit_to_internal("energy", "electronvolt", vir_calc.T), dtype=np.float64 ) diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py index 3d09880..4d7cd0c 100644 --- a/psiflow/sampling/optimize.py +++ b/psiflow/sampling/optimize.py @@ -9,7 +9,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import write_frames +from psiflow.data.file import write_frames from psiflow.geometry import Geometry from psiflow.hamiltonians import Hamiltonian, MACEHamiltonian from psiflow.sampling.sampling import ( @@ -192,7 +192,7 @@ def optimize( parsl_resource_specification=definition.wq_resources(1), ) - final = Dataset(None, result.outputs[0]).evaluate(hamiltonian)[-1] + final = hamiltonian.evaluate(Dataset(extxyz=result.outputs[0]))[-1] if keep_trajectory: trajectory = Dataset(None, result.outputs[1]) return final, trajectory diff --git a/psiflow/sampling/output.py b/psiflow/sampling/output.py index 9daec95..7f08719 100644 --- a/psiflow/sampling/output.py +++ b/psiflow/sampling/output.py @@ -100,7 +100,7 @@ def _parse_simulation_data( task_id = get_task_name_id(simulation_stdout)[-1] logger.info(f"Simulation [ID {task_id}]: {status}") - return status, data, temperature, state.order.pop("time") + return status, data, temperature, state.time parse_simulation_data = python_app( diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index 117f8b7..433623a 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -12,7 +12,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import read_frames +from psiflow.data.file import read_frames from psiflow.hamiltonians import Hamiltonian, MixtureHamiltonian, Zero, MACEHamiltonian from psiflow.sampling.output import ( DEFAULT_OBSERVABLES, @@ -192,7 +192,6 @@ def setup_ensemble( bias_weights_list = [] # add hamiltonian components that are not shared between all walkers as bias - # TODO: do we always want to do this? for comp in components: if not comp.shared: force = ET.Element("force", forcefield=comp.name) @@ -429,7 +428,6 @@ def make_driver_commands( driver_kwargs: list[dict], file_xyz: File, files_hamiltonian: list[File] ) -> list[str]: """""" - # TODO: what if 'file_xyz' contains multiple geometries of different size? assert len(driver_kwargs) >= len(files_hamiltonian) default = f'i-pi-driver-py -u -S "" -m custom -P {PATH_DRIVER} -a {{address}} -o {{options}} &' @@ -538,6 +536,7 @@ def _sample( ) smotion = setup_smotion(coupling, plumed_list) + # TODO: validate all arguments together + consistently # make sure at least one checkpoint is being written if checkpoint_step is None: # default to every 5% of simulation progress if step is None: @@ -635,7 +634,7 @@ def _sample( # process MD output simulation_outputs = [] - final_states = read_frames(inputs=result.outputs[:1]) + final_states = read_frames(result.outputs[0]) for idx, walker in enumerate(walkers): # TODO: order_parameter out of commission # if walker.order_parameter is not None: diff --git a/psiflow/sampling/server.py b/psiflow/sampling/server.py index d0bc4e5..b1eaae2 100644 --- a/psiflow/sampling/server.py +++ b/psiflow/sampling/server.py @@ -46,7 +46,7 @@ def parse_checkpoint(file_xml: str | Path) -> list[Geometry]: # get current internal system time ensemble_data = sys.ensemble.fetch() time = ensemble_data.time * (_hbar / (_e * Ha)) * 1e12 - geometry.order["time"], geometry.order["name"] = time, name + geometry.time, geometry.name = time, name geometries.append(geometry) return geometries @@ -63,8 +63,6 @@ def insert_addresses(input_xml: ET.Element) -> None: address.text = str(Path.cwd() / address.text.strip()) - - def wait_for_clients(input_xml, timeout: int = 60) -> None: """Make sure clients have initialised successfully""" @@ -72,7 +70,7 @@ def wait_for_clients(input_xml, timeout: int = 60) -> None: sockets = [] xml_str = ET.tostring(input_xml, encoding="unicode") for line in xml_str.splitlines(): - if 'address' in line: + if "address" in line: sockets.append(line.split(">")[1].split("<")[0]) for _ in range(timeout): @@ -116,7 +114,7 @@ def run(start_xyz: str, input_xml: str): def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None: - from psiflow.data.utils import _write_frames + from psiflow.data.file import _write_frames print("Starting cleanup") with open(INPUT_XML, "r") as f: @@ -129,7 +127,7 @@ def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None: states = parse_checkpoint("output.checkpoint") for state in states: if np.allclose(state.cell, NONPERIODIC_CELL): - state.cell[:] = 0.0 + state.cell = None _write_frames(*states, outputs=[output_xyz]) print("Moved checkpoint geometries") @@ -140,7 +138,7 @@ def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None: out = subprocess.run( "i-pi-remdsort input.xml", shell=True, capture_output=True, text=True ) - assert out.returncode == 0 # TODO: what if it isn't? + assert out.returncode == 0 print("REMDSORT") output_props = _.split(",") if (_ := output_props) else [] diff --git a/psiflow/sampling/utils.py b/psiflow/sampling/utils.py index 72ffb36..37f94f1 100644 --- a/psiflow/sampling/utils.py +++ b/psiflow/sampling/utils.py @@ -38,7 +38,7 @@ def check_forces( if not np.sum(exceeded): return indices = np.arange(len(geometry))[exceeded] - numbers = geometry.per_atom.numbers[exceeded] + numbers = geometry.numbers[exceeded] symbols = [chemical_symbols[n] for n in numbers] raise ForceMagnitudeException( "\nforce exceeded {} eV/A for atoms {}" diff --git a/psiflow/sampling/walker.py b/psiflow/sampling/walker.py index d26d243..1c22cad 100644 --- a/psiflow/sampling/walker.py +++ b/psiflow/sampling/walker.py @@ -12,10 +12,9 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import Geometry, check_equality +from psiflow.geometry import Geometry +from psiflow.data.utils import check_equality from psiflow.hamiltonians import Hamiltonian, Zero, combine_hamiltonians - -# from psiflow.order_parameters import OrderParameter from psiflow.sampling.metadynamics import Metadynamics from psiflow.utils.apps import copy_app_future @@ -186,7 +185,7 @@ def _get_minimum_energy_states( def quench(walkers: list[Walker], dataset: Dataset) -> None: """Assign the lowest energy geometry in dataset to every walker""" hamiltonians = combine_hamiltonians([w.hamiltonian for w in walkers]) - energies = [h.compute(dataset, "energy") for h in hamiltonians.hamiltonians] + energies = [h.compute(dataset).energy for h in hamiltonians.hamiltonians] coefficients = [ hamiltonians.get_coefficients(walker.hamiltonian * 1.0) for walker in walkers ] diff --git a/psiflow/utils/apps.py b/psiflow/utils/apps.py index d4e7937..134ad2a 100644 --- a/psiflow/utils/apps.py +++ b/psiflow/utils/apps.py @@ -1,6 +1,7 @@ import shutil import textwrap from typing import Any, Union +from collections.abc import Sequence from pathlib import Path import numpy as np @@ -38,8 +39,8 @@ def _compute_sum(a, b): def _copy_data_future( pass_on_exist: bool = False, - inputs: list[File] = [], - outputs: list[File] = [], + inputs: Sequence[File] = (), + outputs: Sequence[File] = () ) -> None: assert len(inputs) == 1 assert len(outputs) == 1 @@ -49,7 +50,6 @@ def _copy_data_future( shutil.copyfile(inputs[0], outputs[0]) else: # no need to copy empty file pass - return copy_data_future = python_app(_copy_data_future, executors=["default_threads"]) diff --git a/psiflow/utils/wq.py b/psiflow/utils/wq.py index 4ecc01d..1059ae5 100644 --- a/psiflow/utils/wq.py +++ b/psiflow/utils/wq.py @@ -1,4 +1,5 @@ # TODO: this probably does not work in a nested way +# or when join apps submit tasks at some point in the future (after exiting the context) import logging diff --git a/pyproject.toml b/pyproject.toml index 98e13db..4b058d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "psiflow" -version = "4.0.4" +version = "4.0.5" description = "Library for developing interatomic potentials" readme = "README.md" requires-python = ">=3.10" @@ -14,7 +14,7 @@ dependencies = [ "pyyaml>=6.0", # "numpy>=1.22.3, <2", "parsl==2026.02.23", - "prettytable", + "prettytable", # TODO: what is this used for? "psutil", # "cp2k-input-tools @ git+https://github.com/cp2k/cp2k-input-tools.git@3b9929735dcb3c8c0620a548b1fe20efecbad077", # need 2024.1 "ipi @ git+https://github.com/i-pi/i-pi.git@v3.1.10", diff --git a/tests/conftest.py b/tests/conftest.py index 58d2068..6ae4928 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -96,7 +96,7 @@ def generate_emt_cu_data(nstates, amplitude, supercell=None) -> list[ase.Atoms]: return atoms_list -@pytest.fixture +@pytest.fixture # TODO: do we need to regenerate this every test? def dataset(context) -> Dataset: data = generate_emt_cu_data(20, 0.2) data += generate_emt_cu_data(5, 0.15, supercell=np.diag([1, 2, 1])) diff --git a/tests/test_data.py b/tests/test_data.py index 738993f..d0fd49a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,4 +1,3 @@ -import copy import os import numpy as np @@ -8,15 +7,10 @@ from parsl.app.futures import DataFuture from parsl.data_provider.files import File -import psiflow -from psiflow.data import Dataset, compute_rmse -from psiflow.data.utils import ( - _read_frames, - _write_frames, - get_index_element_mask, - read_frames, -) -from psiflow.geometry import Geometry, NullState, check_equality +from psiflow.geometry import Geometry, MISSING +from psiflow.data import Dataset +from psiflow.data.file import _read_frames, _write_frames +from psiflow.data.utils import insert def test_geometry(tmp_path): @@ -29,9 +23,7 @@ def test_geometry(tmp_path): _ = np.array(geometry.per_atom.positions) atoms = Atoms( - numbers=np.arange(1, 6), - positions=geometry.per_atom.positions, - pbc=False, + numbers=np.arange(1, 6), positions=geometry.per_atom.positions, pbc=False ) geometry_ = Geometry.from_atoms(atoms) assert geometry == geometry_ @@ -46,108 +38,124 @@ def test_geometry(tmp_path): address = geometry.per_atom.positions.ctypes.data address_ = geometry_.per_atom.positions.ctypes.data assert address != address_ # should copy - assert np.allclose( - geometry.per_atom.positions, - geometry_.per_atom.positions, - ) + assert np.allclose(geometry.per_atom.positions, geometry_.per_atom.positions) geometry.align_axes() assert geometry.per_atom.positions.ctypes.data == address # should not copy - fake_data = [] + # create atoms dataset + atoms = [] for i in range(10): - atoms = Atoms( + at = Atoms( numbers=np.arange(i + 5), positions=np.random.uniform(-3, 3, size=(i + 5, 3)), pbc=False, ) - fake_data.append(atoms) - fake_data[2].info["energy"] = 1.0 - fake_data[3].arrays["forces"] = np.random.uniform(-3, 3, size=(8, 3)) - fake_data[1].info["stdout"] = "abcd" - fake_data[6].info["logprob"] = np.array([-23.1, 22.1]) - fake_data[7].cell = np.random.uniform(-3, 4, size=(3, 3)) - fake_data[7].pbc = True - - write(tmp_path / "test.xyz", fake_data) - - data = read_frames(inputs=[str(tmp_path / "test.xyz")]).result() + atoms.append(at) + atoms[2].info["energy"] = 1.0 + atoms[3].arrays["forces"] = np.random.uniform(-3, 3, size=(8, 3)) + atoms[1].info["stdout"] = "abcd" + atoms[6].info["logprob"] = np.array([-23.1, 22.1]) + atoms[7].cell = np.random.uniform(-3, 4, size=(3, 3)) + atoms[7].pbc = True + + write(tmp_path / "atoms.xyz", atoms) + + # check reading from extxyz + data = _read_frames(tmp_path / "atoms.xyz") assert data is not None # should return when no output file is given - assert len(data) == len(fake_data) - for i in range(10): - assert np.allclose( - fake_data[i].get_positions(), - data[i].per_atom.positions, - ) - assert np.allclose( - fake_data[i].numbers, - data[i].per_atom.numbers, - ) - assert data[i] == Geometry.from_atoms(fake_data[i]) + assert len(data) == len(atoms) + for at, geom in zip(atoms, data): + assert geom == Geometry.from_atoms(at) assert data[2].energy == 1.0 assert data[2].per_atom_energy == 1.0 / 7 - assert data[3].per_atom_energy is None - assert np.allclose( - data[3].per_atom.forces, - fake_data[3].arrays["forces"], - ) + assert data[3].per_atom_energy is MISSING + assert np.allclose(data[3].per_atom.forces, atoms[3].arrays["forces"]) assert data[1].stdout == "abcd" assert type(data[6].logprob) is np.ndarray - assert np.allclose( - data[6].logprob, - fake_data[6].info["logprob"], - ) + assert np.allclose(data[6].logprob, atoms[6].info["logprob"]) assert data[7].periodic - assert np.allclose( - data[7].cell, - np.array(fake_data[7].cell), - ) - assert np.allclose( - data[6].cell, - np.zeros((3, 3)), - ) - geometry = read_frames(indices=[3], inputs=[str(tmp_path / "test.xyz")]).result()[0] + assert np.allclose(data[7].cell, atoms[7].cell.array) + assert data[6].cell is None + + geometry = _read_frames(tmp_path / "atoms.xyz", indices=[3])[0] assert geometry == data[3] assert not geometry == data[4] - # check writing - data = read_frames( - inputs=[str(tmp_path / "test.xyz")], - outputs=[File(str(tmp_path / "test_.xyz"))], - ).result() - assert data is None - - data = read_frames(inputs=[File(str(tmp_path / "test.xyz"))]).result() - data_ = read_frames(inputs=[File(str(tmp_path / "test_.xyz"))]).result() - + # check writing to extxyz + geoms = [Geometry.from_atoms(at) for at in atoms] + _write_frames(geoms, outputs=[File(tmp_path / "geometries.xyz")]) + atoms_ = read(tmp_path / "geometries.xyz", ":") + for at, at_ in zip(atoms, atoms_): + assert np.allclose(at.cell, at_.cell) + data = at.arrays | at.info | (at.calc.results if at.calc else {}) + data_ = at_.arrays | at_.info | (at_.calc.results if at_.calc else {}) + for k in data: + if k == "cell": + pass + elif isinstance(data[k], np.ndarray): + assert np.allclose(data[k], data_[k]) + else: + assert data[k] == data_[k] + + data = _read_frames(tmp_path / "atoms.xyz") + data_ = _read_frames(tmp_path / "geometries.xyz") for state, state_ in zip(data, data_): assert state == state_ assert state.energy == state_.energy - if state.stress is not None: - assert np.allclose( - state.stress, - state_.stress, - ) - assert state.delta == state_.delta - assert state.phase == state_.phase - if state.logprob is not None: - assert np.allclose(state.logprob, state_.logprob) - assert state.stdout == state_.stdout - assert state.identifier == state.identifier + assert state.stress is MISSING or np.allclose(state.stress, state_.stress) + attrs = state.attributes() + attrs_ = state_.attributes() + for val, val_ in zip(attrs.values(), attrs_.values()): + if isinstance(val, np.ndarray): + assert np.allclose(val, val_) + else: + assert val == val_ state.save(tmp_path / "geo.xyz") - state.save(str(tmp_path / "geo.xyz")) assert state == Geometry.load(tmp_path / "geo.xyz") + # default attributes have type (and shape) requirements + with pytest.raises(TypeError): + state.energy = None + with pytest.raises(AssertionError): + state.cell = [[1, 2, 3]] * 3 + + +def test_custom_attributes(dataset): + """Do geometries behave for non-default attributes?""" + geom = dataset[0].result() + + geom.custom1 = False + geom.custom2 = [42, "42"] + geom.per_atom.custom1 = np.ones(len(geom), dtype=int) * 42 + geom.per_atom.custom2 = np.random.random(len(geom)).astype(bool) + + string = geom.to_string() # casts per-atom properties to float.. + geom_ = Geometry.from_string(string) + atoms = geom.to_atoms() + + assert geom.custom1 == geom_.custom1 + assert geom.custom2 == geom_.custom2 + assert np.allclose(geom.per_atom.custom1, geom_.per_atom.custom1) + assert np.allclose(geom.per_atom.custom2, geom_.per_atom.custom2) + + assert geom.custom1 == atoms.info["custom1"] + assert geom.custom2 == atoms.info["custom2"] + assert np.allclose(geom.per_atom.custom1, atoms.arrays["custom1"]) + assert np.allclose(geom.per_atom.custom2, atoms.arrays["custom2"]) + def test_readwrite_cycle(dataset, tmp_path): + # TODO: much overlap with test_geometry + tmp_file = File(tmp_path / "test.xyz") + data = dataset[:4].geometries().result() - data[2].order["test"] = 324 - _write_frames(*data, outputs=[str(tmp_path / "test.xyz")]) + data[2].test = 324 + _write_frames(*data, outputs=[tmp_file]) loaded = Dataset(data) - assert "test" in loaded[2].result().order - - states = _read_frames(inputs=[str(tmp_path / "test.xyz")]) - assert "test" in states[2].order + assert hasattr(loaded[2].result(), "test") + states = _read_frames(tmp_file) + assert hasattr(states[2], "test") s = """3 energy=3.0 phase=c7eq Properties=species:S:1:pos:R:3:momenta:R:3:forces:R:3 @@ -155,19 +163,15 @@ def test_readwrite_cycle(dataset, tmp_path): O 1 2 3 4 5 6 7 8 9 F 2 3 4 5 6 7 8 9 10 """ - geometry = Geometry.from_string(s, natoms=None) + geometry = Geometry.from_string(s) assert len(geometry) == 3 assert geometry.energy == 3.0 assert geometry.phase == "c7eq" assert not geometry.periodic - assert np.all(np.logical_not(np.isnan(geometry.per_atom.forces))) - assert np.allclose( - geometry.per_atom.numbers, - np.array([6, 8, 9]), - ) + assert not np.any(np.isnan(geometry.per_atom.forces)) + assert np.allclose(geometry.numbers, np.array([6, 8, 9])) assert np.allclose( - geometry.per_atom.forces, - np.array([[6,7,8], [7,8,9], [8,9,10]]), + geometry.per_atom.forces, np.array([[6, 7, 8], [7, 8, 9], [8, 9, 10]]) ) s = """7 @@ -182,33 +186,41 @@ def test_readwrite_cycle(dataset, tmp_path): geometry = Geometry.from_string(s) assert not geometry.periodic assert len(geometry) == 7 - assert geometry.energy is None + assert geometry.energy is MISSING - geometry = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None) - geometry.stress = np.array([np.nan] * 9).reshape(3, 3) - assert "nan" not in geometry.to_string() - - -def test_dataset_empty(tmp_path): - dataset = Dataset([]) - assert dataset.length().result() == 0 - assert isinstance(dataset.extxyz, DataFuture) - path_xyz = tmp_path / "test.xyz" - dataset.save(path_xyz) # ensure the copy is executed before assert - psiflow.wait() - assert os.path.isfile(path_xyz) - with pytest.raises(ValueError): # cannot save outside cwd - dataset.save(path_xyz.parents[3] / "test.xyz") + geom = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None) + with pytest.raises(ValueError): # cannot assign stress to molecules + geom.stress = np.array([np.nan] * 9).reshape(3, 3) + string = geom.to_string() + assert "stress" not in string + assert "pbc" not in string + geom_ = Geometry.from_string(string) + assert geom_.stress is MISSING -def test_dataset_append(dataset): +def test_dataset(dataset, tmp_path): + """Verifying basic dataset functionality""" l = dataset.length().result() # noqa: E741 geometries = dataset.geometries().result() assert len(geometries) == l assert type(geometries) is list assert type(geometries[0]) is Geometry for i in range(l): - assert check_equality(geometries[i], dataset[i]).result() + assert geometries[i] == dataset[i].result() + + path_xyz = tmp_path / "data.xyz" + dataset.save(path_xyz).result() + assert os.path.isfile(path_xyz) + with pytest.raises(ValueError): # cannot save outside cwd + dataset.save(path_xyz.parents[3] / "test.xyz") + + dataset = Dataset([]) + assert dataset.length().result() == 0 + assert isinstance(dataset.extxyz, DataFuture) + + +def test_dataset_append(dataset): + l = dataset.length().result() empty = Dataset([]) # use [] instead of None empty += dataset assert l == empty.length().result() @@ -216,19 +228,16 @@ def test_dataset_append(dataset): assert added.length().result() == 2 * l assert dataset.length().result() == l # must not have changed + +def test_dataset_align(dataset): # test canonical transformation transformed = dataset.align_axes() - for i in range(dataset.length().result()): - geometry = dataset[i].result() - geometry.align_axes() - assert np.allclose( - geometry.per_atom.positions, - transformed[i].result().per_atom.positions, - ) - assert np.allclose( - geometry.cell, - transformed[i].result().cell, - ) + geoms = dataset.geometries().result() + geoms_ = transformed.geometries().result() + for geom, geom_ in zip(geoms, geoms_): + geom.align_axes() + assert np.allclose(geom.per_atom.positions, geom_.per_atom.positions) + assert np.allclose(geom.cell, geom_.cell) def test_dataset_slice(dataset): @@ -255,63 +264,6 @@ def test_dataset_slice(dataset): for i, geometry in enumerate(subset.geometries().result()): assert geometry == dataset[i].result() - -def test_dataset_from_xyz(tmp_path, dataset): - path_xyz = tmp_path / "data.xyz" - dataset.save(path_xyz) - psiflow.wait() - _ = Dataset.load(path_xyz) - atoms_list = read(path_xyz, index=":") - - for i in range(dataset.length().result()): - assert np.allclose( - dataset[i].result().per_atom.positions, - atoms_list[i].get_positions(), - ) - assert dataset[i].result().energy == atoms_list[i].calc.results["energy"] - - -def test_index_element_mask(): - numbers = np.array([1, 1, 1, 6, 6, 8, 8, 8]) - elements = ["H"] - indices = [0, 1, 4, 5] - assert np.allclose( - np.array([True, True, False, False, False, False, False, False]), - get_index_element_mask(numbers, indices, elements), - ) - elements = ["H", "O"] - indices = [0, 1, 4, 5] - assert np.allclose( - np.array([True, True, False, False, False, True, False, False]), - get_index_element_mask(numbers, indices, elements), - ) - elements = ["H", "O"] - indices = [3] - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([False] * len(numbers)), - ) - elements = ["H", "O"] - indices = None - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([True, True, True, False, False, True, True, True]), - ) - elements = None - indices = [0, 1, 2, 3, 5] - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([True, True, True, True, False, True, False, False]), - ) - elements = ["Cl"] # not present at all - indices = None - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([False] * len(numbers)), - ) - - -def test_dataset_gather(dataset): indices = [0, 3, 2, 6, 1] gathered = dataset[indices] assert gathered.length().result() == len(indices) @@ -322,180 +274,204 @@ def test_dataset_gather(dataset): ) +def test_dataset_from_xyz(tmp_path, dataset): + path_xyz = tmp_path / "data.xyz" + dataset.save(path_xyz).result() + atoms_list = read(path_xyz, index=":") + geoms = dataset.geometries().result() + for geom, at in zip(geoms, atoms_list): + assert np.allclose(geom.per_atom.positions, at.get_positions()) + assert geom.energy == at.calc.results["energy"] + + def test_data_elements(dataset): - assert "H" in dataset.elements().result() - assert "Cu" in dataset.elements().result() - assert len(dataset.elements().result()) == 2 + elements = dataset.elements().result() + assert "H" in elements + assert "Cu" in elements + assert len(elements) == 2 def test_data_reset(dataset): dataset = dataset.reset() - assert dataset[0].result().energy is None + geometries = dataset.geometries().result() + for geom in geometries: + assert geom.energy is MISSING + assert geom.per_atom.forces is MISSING def test_data_split(dataset): train, valid = dataset.split(0.9) - assert ( - train.length().result() + valid.length().result() == dataset.length().result() - ) + train_geoms = train.geometries().result() + valid_geoms = valid.geometries().result() + assert len(train_geoms) + len(valid_geoms) == dataset.length().result() + for geom_t in train_geoms: + for geom_v in valid_geoms: + assert geom_t != geom_v -def test_identifier(dataset): - data = dataset + Dataset([NullState, NullState, dataset[0]]) +def test_identifier(dataset, dataset_h2): + data = dataset + dataset_h2 identifier = data.assign_identifiers(0) - assert identifier.result() == dataset.length().result() + 1 - assert identifier.result() == data.not_null().length().result() - for i in range(data.length().result()): - if not data[i].result() == NullState: - assert data[i].result().identifier < identifier.result() - # assert data[i].result().reference_status - data = data.clean() # also removes identifier - for i in range(data.length().result()): - s = data[i].result() - if not s == NullState: - assert s.identifier is None - identifier = data.assign_identifiers(10) - assert identifier.result() == 10 + data.not_null().length().result() - identifier = dataset.assign_identifiers(10) - assert ( - dataset.assign_identifiers().result() - == 10 + dataset.not_null().length().result() - ) - for i in range(dataset.length().result()): - s = dataset[i].result() - if not s == NullState: - assert s.identifier >= 10 - - identifier = data.assign_identifiers() - data = data.clean() - assert ( - data.assign_identifiers(identifier).result() - == identifier.result() + data.not_null().length().result() - ) + assert identifier.result() == data.length().result() + + data = data.clean() # removes identifier + geoms = data.geometries().result() + for geom in geoms: + assert not hasattr(geom, "identifier") + + identifier = dataset.assign_identifiers(5).result() + geoms = dataset.geometries().result() + assert identifier == len(geoms) + 5 + for geom in geoms: + assert geom.identifier < identifier + + # set initial value + geoms = dataset_h2.geometries().result() + geoms[10].identifier = 42 + + # count from 0 + data = Dataset(geoms) + identifier = data.assign_identifiers(0).result() + geoms_ = data.geometries().result() + assert identifier == len(geoms) - 1 # skips over one geometry + assert geoms_[0].identifier == 0 + assert geoms_[10].identifier == 42 + assert geoms_[-1].identifier == identifier - 1 + + # count from highest + data = Dataset(geoms) + identifier = data.assign_identifiers().result() + geoms_ = data.geometries().result() + assert identifier == len(geoms) + 42 # skips over one geometry + assert geoms_[0].identifier == 43 + assert geoms_[10].identifier == 42 + assert geoms_[-1].identifier == identifier - 1 def test_data_offset(dataset): - atomic_energies = { - "H": 34, - "Cu": 12, - } - data = dataset.subtract_offset(**atomic_energies) - data_ = data.add_offset(**atomic_energies) - for i in range(dataset.length().result()): - natoms = len(dataset[i].result()) - offset = (natoms - 1) * atomic_energies["Cu"] + atomic_energies["H"] - assert np.allclose( - data[i].result().energy, - dataset[i].result().energy - offset, - ) - assert np.allclose( # unchanged - data[i].result().per_atom.forces, - dataset[i].result().per_atom.forces, - ) - assert np.allclose( - data_[i].result().energy, - dataset[i].result().energy, - ) + atomic_energies = {"H": 34, "Cu": 12} + data0 = dataset.subtract_offset(**atomic_energies) + data1 = data0.add_offset(**atomic_energies) + + geoms = dataset.geometries().result() + geoms0 = data0.geometries().result() + geoms1 = data1.geometries().result() + + for g, g0, g1 in zip(geoms, geoms0, geoms1): + offset = (len(g) - 1) * atomic_energies["Cu"] + atomic_energies["H"] + assert np.allclose(g0.energy, g.energy - offset) + assert np.allclose(g0.per_atom.forces, g.per_atom.forces) + assert np.allclose(g1.energy, g.energy) def test_data_extract(dataset): state = dataset[0].result() - state.energy = None + state.reset() state.identifier = 0 state.delta = 6 + # check Dataset.get data = dataset[:5] + dataset[-5:] + Dataset([state]) energy, forces, identifier = data.get("energy", "forces", "identifier") energy = energy.result() forces = forces.result() identifier = identifier.result() + assert energy[-1] is MISSING and forces[-1] is MISSING + assert identifier[-1] == 0 for i, geometry in enumerate(data.geometries().result()): if geometry.energy: assert np.allclose(geometry.energy, energy[i]) - n = len(geometry) - assert np.allclose( - geometry.per_atom.forces, - forces[i][:n, :], - ) - if identifier[i].item() != -1: + try: + assert np.allclose(geometry.per_atom.forces, forces[i]) + except TypeError: + assert geometry.per_atom.forces is MISSING and forces[i] is MISSING + if hasattr(geometry, "identifier"): assert geometry.identifier == identifier[i] - if i < 10: - assert identifier[i] == -1 - assert np.isnan(np.mean(energy)) - assert np.isnan(np.mean(forces)) - psiflow.wait() - - data = dataset[:2] + Dataset([NullState]) + dataset[3:5] - forces = data.get("forces", elements=["Cu"]) - reference = np.zeros((5, 4, 3)) - reference[2, :] = np.nan # ensure nan is in same place - reference[:, 0] = np.nan # ensure nan is in same place - value = compute_rmse(forces, reference) - - # last three atoms are Cu - forces = np.zeros((5, 4, 3)) - for i in range(5): - forces[i, :] = data[i].result().per_atom.forces - forces[:, 0] = np.nan - assert np.allclose( - value.result(), - compute_rmse(forces, forces * np.zeros_like(forces)).result(), - ) - unreduced = compute_rmse( - forces, forces * np.zeros_like(forces), reduce=False - ).result() - assert len(unreduced) == 5 - unreduced_ = unreduced[np.array([0, 1, 3, 4], dtype=int)] - assert np.allclose( - np.sqrt(np.mean(np.square(unreduced_))), - value.result(), - ) - # try weirdly specific indices - forces = np.zeros((5, 4, 3)) - for i in range(5): - g = data[i].result() - if len(g) >= 4: - forces[i, 3] = data[i].result().per_atom.forces[3] - forces[i, 1] = data[i].result().per_atom.forces[1] + # check extract-insert round-trip + geoms = data.clean().geometries().result() + data_dict = {"energy": energy, "forces": forces, "identifier": identifier} + insert(geoms, data_dict) + for geom, geom_ in zip(geoms, data.geometries().result()): + e, e_ = geom.energy, geom_.energy + f, f_ = geom.per_atom.forces, geom_.per_atom.forces + assert e == e_ + if f is MISSING: + assert f_ is MISSING else: - forces[i, :] = np.nan - forces_ = data.get("forces", atom_indices=[3, 1]).result() - mask = np.invert(np.isnan(forces_)) - assert np.allclose( - forces[mask], - forces_[mask], - ) + assert np.allclose(f, f_) + if hasattr(geom, "identifier"): + assert geom.identifier == geom_.identifier + + # check Dataset.get_per_atom + geoms = data.geometries().result() + forces = [geom.per_atom.forces for geom in geoms] + numbers = [geom.numbers for geom in geoms] + + forces_ = data.get_per_atom("forces", elements=["Cu"])[0].result() + for n, f, f_ in zip(numbers, forces, forces_): + # filter on Cu + if f is MISSING: + assert f_ is MISSING + else: + assert np.allclose(f[n == 29], f_) + + ids = [0, 2] + forces_ = data.get_per_atom("forces", atom_indices=ids)[0].result() + for f, f_ in zip(forces, forces_): + # filter on ids + if f is MISSING: + assert f_ is MISSING + else: + assert np.allclose(f[ids], f_) - # check order parameters + # check custom attributes s = Geometry.from_string( """ 3 -order_distance=2.3 energy=4 +distance=2.3 energy=4 O 0 0 0 H 1 1 1 H -1 1 1 """, ) - states = [copy.deepcopy(s) for _ in range(5)] - - states[1].order = {} - states[2].order = {"some": 4.2} - states[3].order["distance"] = 2.2 + states = [s.copy() for _ in range(5)] + states[1].order = {"answer": 42} + states[2].some = 4.2 + states[3].distance = 2.2 data = Dataset(states) - some, distance = data.get("some", "distance") + some, distance, energy, order = data.get("some", "distance", "energy", "order") some = some.result() distance = distance.result() - assert np.isnan(some[1]) - assert np.allclose(some[2], 4.2) - assert np.allclose(distance[0], 2.3) - assert np.allclose(distance[3], 2.2) - assert np.isnan(distance[2]) + energy = energy.result() + order = order.result() + assert some == [MISSING, MISSING, 4.2, MISSING, MISSING] + assert distance == [2.3, 2.3, 2.3, 2.2, 2.3] + assert energy == [4, 4, 4, 4, 4] + assert order == [MISSING, {"answer": 42}, MISSING, MISSING, MISSING] def test_filter(dataset, dataset_h2): - data = dataset + dataset_h2 + Dataset([NullState]) - data = data.shuffle() - assert data.filter("cell").length().result() == dataset.length().result() - assert data.filter("energy").length().result() == dataset.length().result() - assert data.filter("forces").length().result() == dataset.length().result() + geom = dataset[0].result() + geom.flagged = True + geom.reset() + data = dataset + dataset_h2 + Dataset([geom]) + + data0 = data.shuffle() + data1 = data.filter("cell") + data2 = data.filter("energy") + data3 = data.filter("forces") + data4 = data.filter("flagged") + + l = dataset.length().result() + l_ = dataset_h2.length().result() + l0 = data0.length().result() + l1 = data1.length().result() + l2 = data2.length().result() + l3 = data3.length().result() + l4 = data4.length().result() + + assert l0 == l + l_ + 1 # all geometries + assert l0 == l1 # cell always defined + assert l2 == l and l3 == l + assert l4 == 1 diff --git a/tests/test_free_energy.py b/tests/test_free_energy.py index ecacc3e..c1bc53a 100644 --- a/tests/test_free_energy.py +++ b/tests/test_free_energy.py @@ -1,15 +1,16 @@ import numpy as np from ase.units import _c, kB, second +import psiflow from psiflow.free_energy import ( Integration, compute_frequencies, compute_harmonic, harmonic_free_energy, ) -from psiflow.geometry import check_equality from psiflow.hamiltonians import EinsteinCrystal, Harmonic, MACEHamiltonian from psiflow.sampling.ase import optimize +from psiflow.data.utils import check_equality def test_integration_simple(dataset): @@ -39,7 +40,7 @@ def test_integration_simple(dataset): # manual computation of delta gradient delta = -0.1 * harmonic - energies = delta.compute(integration.outputs[i].trajectory, "energy") + energies = delta.compute(integration.outputs[i].trajectory).energy assert np.allclose( state.gradients["delta"].result(), np.mean(energies.result()) / (kB * state.temperature), diff --git a/tests/test_function.py b/tests/test_function.py index 39ec1ca..22b305e 100644 --- a/tests/test_function.py +++ b/tests/test_function.py @@ -5,11 +5,14 @@ from parsl.data_provider.files import File # type: ignore import psiflow +from psiflow.geometry import MISSING from psiflow.functions import ( EinsteinCrystalFunction, HarmonicFunction, MACEFunction, PlumedFunction, + DispersionFunction, + ZeroFunction, function_from_json, ) from psiflow.hamiltonians import ( @@ -18,6 +21,7 @@ MixtureHamiltonian, PlumedHamiltonian, D3Hamiltonian, + MACEHamiltonian, Zero, ) from psiflow.utils._plumed import remove_comments_printflush, set_path_in_plumed @@ -25,34 +29,39 @@ from psiflow.utils.io import dump_json from psiflow.utils.apps import get_attribute from psiflow.serialization import CLS_KEY, deserialize_hook +from psiflow.compute import compare_results def test_einstein_crystal(dataset): future_geom = dataset[0] + geom = future_geom.result() test_dataset = dataset[:4].reset() - geometries = test_dataset.geometries().result() + test_geoms = test_dataset.geometries().result() function = EinsteinCrystalFunction( - force_constant=1.0, centers=future_geom.result().per_atom.positions + force_constant=1.0, centers=geom.per_atom.positions, volume=geom.volume ) hamiltonian = EinsteinCrystal.from_geometry(future_geom, force_constant=1.0) - energy, forces, stress = function.compute(geometries).values() - forces1, stress1, energy1 = hamiltonian.compute( - test_dataset, "forces", "stress", "energy" - ) - forces2 = hamiltonian.compute(test_dataset, "forces", batch_size=3) - - assert np.all(energy >= 0) - assert energy[0] == 0 - assert geometries[0].energy is None - assert np.allclose( # forces point to centers - forces, - function.centers.reshape(1, -1, 3) - test_dataset.get("positions").result(), - ) - assert np.allclose(energy1.result(), energy) - assert np.allclose(forces1.result(), forces) - assert np.allclose(forces1.result(), forces2.result()) + out0 = function.compute(test_geoms) + assert all(e >= 0 for e in out0["energy"]) + assert out0["energy"][0] == 0 + assert test_geoms[0].energy is MISSING + positions = test_dataset.get("positions")[0].result() + centers = function.centers.reshape(1, -1, 3) + for f, p in zip(out0["forces"], positions): + assert np.allclose(f, centers - p) # forces point to centers + + out1 = hamiltonian.compute(test_dataset) + out2 = hamiltonian.compute(test_dataset, batch_size=3) + + error1 = compare_results(out1, **out0) + error2 = compare_results(out1, out2) + + for k, rmse in error1.result().items(): + assert np.isclose(rmse, 0.0) + for k, rmse in error2.result().items(): + assert np.isclose(rmse, 0.0) def test_einstein_force(dataset): @@ -72,7 +81,9 @@ def test_einstein_force(dataset): forces_[i, j] = (-1.0) * sign * force_constant * delta forces.append(forces_) - forces_ = einstein.compute(geometries, "forces").result() + # per atom arrays are concatenated - not stacked + forces_ = einstein.compute(geometries).result().forces + forces_ = forces_.reshape(-1, len(reference), 3) assert np.allclose(forces, forces_) @@ -91,9 +102,7 @@ def test_get_filename_hills(): plumed_input = remove_comments_printflush(plumed_input) plumed_input = set_path_in_plumed(plumed_input, "METAD", "/tmp/my_input") plumed_input = set_path_in_plumed(plumed_input, "METADD", "/tmp/my_input") - assert ( - plumed_input.strip() - == """ + assert plumed_input.strip() == """ RESTART UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: VOLUME @@ -101,12 +110,12 @@ def test_get_filename_hills(): METAD ARG=CV0 SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad FILE=/tmp/my_input sdld METADD ARG=CV SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad sdld FILE=/tmp/my_input """.strip() - ) def test_plumed_function(tmp_path, dataset, dataset_h2): data = dataset + dataset_h2 geometries = data.geometries().result() + f = 1 / (kJ / mol) * 10 # eV --> kJ/mol and nm --> A plumed_str = """ D1: DISTANCE ATOMS=1,2 NOPBC CV: BIASVALUE arg=D1 @@ -114,12 +123,13 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): function = PlumedFunction(plumed_str) outputs = function.compute(geometries) - f = 1 / (kJ / mol) * 10 # eV --> kJ/mol and nm --> A - positions = data.get("positions").result() - manual = np.linalg.norm(positions[:, 0, :] - positions[:, 1, :], axis=1) - assert np.allclose(outputs["energy"] * f, manual) - gradient = (positions[:, 0, :] - positions[:, 1, :]) / manual.reshape(-1, 1) - assert np.allclose(outputs["forces"][:, 0, :] * f, gradient * (-1.0)) + [positions] = data.get("positions") + manual = np.array([np.linalg.norm(p[0] - p[1]) for p in positions.result()]) + gradient = [(p[0] - p[1]) / d for p, d in zip(positions.result(), manual)] + assert np.allclose(np.array(outputs["energy"]) * f, manual) + for grad, force in zip(gradient, outputs["forces"]): + # only compare first force component + assert np.allclose(force[0] * f, grad * (-1.0)) # test volume bias geometries = dataset.geometries().result() @@ -132,12 +142,12 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): function = PlumedFunction(plumed_input) hamiltonian = PlumedHamiltonian(plumed_input) - energy, forces, stress = function.compute(geometries).values() - energy_, forces_, stress_ = hamiltonian.compute(dataset) + data = function.compute(geometries) + out = hamiltonian.compute(dataset).result() energy_manual = (volumes - 50) ** 2 * (kJ / mol) / 2 - assert np.allclose(energy, energy_manual) - assert np.allclose(energy, energy_.result()) - assert np.allclose(stress, stress_.result()) + assert np.allclose(data["energy"], energy_manual) + assert np.allclose(data["energy"], out.energy) + assert np.allclose(data["stress"], out.stress) # use external grid as bias, check that file is read test_set = dataset[:10] @@ -154,22 +164,19 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: DISTANCE ATOMS=1,2 NOPBC METAD ARG=CV PACE=1 SIGMA=3 HEIGHT=342 FILE={} -""".format( - path_hills - ) +""".format(path_hills) hamiltonian = PlumedHamiltonian(plumed_input, File(path_hills)) - energy = hamiltonian.compute(test_set, "energy") + energy = hamiltonian.compute(test_set).result().energy # compute bias energy manually - positions = test_set.get("positions").result() - vecs = positions[:, 0, :] - positions[:, 1, :] - distance = np.linalg.norm(vecs, axis=1).reshape(-1, 1) sigma = 2 * np.ones((1, 2)) height = np.array([70, 70]).reshape(1, -1) * (kJ / mol) # unit consistency center = np.array([2.5, 2.6]).reshape(1, -1) - energy_per_hill = height * np.exp((distance - center) ** 2 / (-2 * sigma**2)) + positions = test_set.get("positions")[0].result() + dists = np.array([np.linalg.norm(p[0] - p[1]) for p in positions]).reshape(-1, 1) + energy_per_hill = height * np.exp((dists - center) ** 2 / (-2 * sigma**2)) energy_ = np.sum(energy_per_hill, axis=1) - assert np.allclose(energy.result(), energy_, atol=1e-3) + assert np.allclose(energy, energy_, atol=1e-3) # check that hills file didn't change with open(path_hills, "r") as f: @@ -177,7 +184,7 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): def test_harmonic_function(dataset): - test_set = dataset[:10] + test_set = dataset[:5] geometries = test_set.geometries().result() reference = geometries[0] hess = np.eye(3 * len(reference)) @@ -185,30 +192,59 @@ def test_harmonic_function(dataset): function = HarmonicFunction(reference.per_atom.positions, hess, reference.energy) einstein = EinsteinCrystalFunction(1.0, reference.per_atom.positions) harmonic = Harmonic.from_geometry(dataset[0], hess) + assert Harmonic.outputs == ("energy", "forces", "stress") - energy, forces, _ = function.compute(geometries).values() - energy1, forces1, _ = einstein.compute(geometries).values() - energy2, forces2, _ = harmonic.compute(test_set) + out0 = function.compute(geometries) + out1 = einstein.compute(geometries) + result = harmonic.compute(test_set) + err0 = compare_results(result, **out0).result() + err1 = compare_results(result, **out1).result() - assert Harmonic.outputs == ("energy", "forces", "stress") - assert np.allclose(energy - reference.energy, energy1) - assert np.allclose(forces, forces1) - assert np.allclose(energy2.result() - reference.energy, energy1, atol=1e-5) - assert np.allclose(forces2.result(), forces1) + assert np.allclose(np.array(out0["energy"]) - reference.energy, out1["energy"]) + assert np.isclose(err0["energy"], 0) + assert np.isclose(err1["energy"], reference.energy) + assert np.allclose(out0["forces"], out1["forces"]) + assert np.isclose(err0["forces"], 0) + assert np.isclose(err1["forces"], 0) def test_dispersion_function(dataset): + test_set = dataset[:3] + function = DispersionFunction(method="pbe", damping="d3bj") hamiltonian = D3Hamiltonian(method="pbe", damping="d3bj") - energy = hamiltonian.compute(dataset[-1], "energy").result() - assert energy is not None - assert energy < 0.0 # dispersion is attractive - subset = dataset[:3] - data = subset.evaluate(hamiltonian) - energy, forces = hamiltonian.compute(subset, "energy", "forces") + out = function.compute(test_set.geometries().result()) + result = hamiltonian.compute(test_set) + error = compare_results(result, **out) + for k, rmse in error.result().items(): + assert np.isclose(rmse, 0.0) + assert np.all(np.array(out["energy"]) < 0.0) # dispersion is attractive + + +def test_zero(dataset): + test_set = dataset[:10] + function = ZeroFunction() + hamiltonian = Zero() + + out = function.compute(test_set.geometries().result()) + result = hamiltonian.compute(test_set).result() + error = compare_results(result, **out) + for k, rmse in error.result().items(): + assert np.isclose(rmse, 0.0) + assert np.allclose(result.energy, 0.0) + assert np.allclose(result.forces, 0.0) - assert np.allclose(data.get("energy").result(), energy.result()) - assert len(forces.result().shape) == 3 + +def test_mace(dataset, mace_foundation): + test_set = dataset[:3] + function = MACEFunction(mace_foundation, 1, "cpu", "float32") + hamiltonian = MACEHamiltonian(mace_foundation) + + out = function.compute(test_set.geometries().result()) + result = hamiltonian.compute(test_set).result() + error = compare_results(result, **out) + for k, rmse in error.result().items(): + assert np.isclose(rmse, 0.0, atol=1e-6) # single precision def test_hamiltonian_arithmetic(dataset): @@ -234,11 +270,11 @@ def test_hamiltonian_arithmetic(dataset): zero = Zero() assert hamiltonian == hamiltonian + zero assert 2 * hamiltonian + zero == 2 * hamiltonian - scaled_m = 0.5 * hamiltonian + scaled_h = 0.5 * hamiltonian scaled_h1 = EinsteinCrystal.from_geometry(future, 0.5) - assert len(scaled_m) == 1 - assert scaled_m.get_coefficient(hamiltonian) == 0.5 - assert scaled_m.get_coefficient(scaled_h1) is None + assert len(scaled_h) == 1 + assert scaled_h.get_coefficient(hamiltonian) == 0.5 + assert scaled_h.get_coefficient(scaled_h1) is None scaled_h2 = EinsteinCrystal.from_geometry(future, 4.0) mixture = hamiltonian + scaled_h2 assert len(mixture) == 2 @@ -246,32 +282,26 @@ def test_hamiltonian_arithmetic(dataset): assert mixture.get_coefficients(mixture) == (1, 1) assert mixture.get_coefficients(hamiltonian + scaled_h1) is None - energy_m = scaled_m.compute(test_set, "energy") - energy_h1 = scaled_h1.compute(test_set, "energy") - energy, forces, _ = hamiltonian.compute(test_set) - energy_, forces_, _ = mixture.compute(test_set) - assert np.allclose(energy_m.result(), energy_h1.result()) - assert np.allclose(energy_.result(), 5 * energy.result()) - assert np.allclose(forces_.result(), 5 * forces.result()) + result_h = scaled_h.compute(test_set) + result_h1 = scaled_h1.compute(test_set) + error = compare_results(result_h, result_h1) + out = hamiltonian.compute(test_set) + out_ = mixture.compute(test_set) + error, out, out_ = error.result(), out.result(), out_.result() - energy, forces, stress = zero.compute(test_set) - assert np.allclose(energy.result(), 0.0) - geometries = [dataset[i].result() for i in [0, -1]] - natoms = [len(geometry) for geometry in geometries] - forces = zero.compute(geometries, "forces", batch_size=1).result() - assert np.all(forces[0, : natoms[0]] == 0.0) - assert np.all(forces[-1, : natoms[1]] == 0.0) + for k, v in error.items(): + assert np.isclose(v, 0) + assert np.allclose(out.energy * 5, out_.energy) + assert np.allclose(out.forces * 5, out_.forces) - -def test_subtract(dataset): - einstein = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0) + # check subtraction + einstein = EinsteinCrystal.from_geometry(future, force_constant=1.0) h = einstein - einstein assert isinstance(h, MixtureHamiltonian) assert np.allclose(h.coefficients, 0.0) def test_hamiltonian_serialize(dataset): - data = json.loads(psiflow.serialize(Zero()).result()) assert data[CLS_KEY] == "Zero" zero = psiflow.deserialize(json.dumps(data)).result() @@ -281,9 +311,7 @@ def test_hamiltonian_serialize(dataset): UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: VOLUME RESTRAINT ARG=CV AT={center} KAPPA={kappa} - """.format( - center=100, kappa=1 / (kJ / mol) - ) + """.format(center=100, kappa=1 / (kJ / mol)) plumed = PlumedHamiltonian(plumed_input) einstein = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0) @@ -308,27 +336,12 @@ def test_hamiltonian_serialize(dataset): assert h == h_ test_set = dataset[:10] - energy_e = einstein.compute(test_set, "energy") - energy_e_ = einstein_.compute(test_set, "energy") - energy_m = mixed.compute(test_set, "energy") - energy_m_ = mixed_.compute(test_set, "energy") - assert np.allclose(energy_e.result(), energy_e_.result()) - assert np.allclose(energy_m.result(), energy_m_.result()) - - -def test_evaluate(dataset): - hamiltonian = EinsteinCrystal.from_geometry(geometry=dataset[0], force_constant=1.0) - data = dataset[:10].reset() - evaluated = data.evaluate(hamiltonian, batch_size=None) - evaluated_ = data.evaluate(hamiltonian, batch_size=2) - energy = hamiltonian.compute(evaluated, "energy") - - geometries = evaluated.geometries().result() - geometries_ = evaluated_.geometries().result() - for geom, geom_, e in zip(geometries, geometries_, energy.result()): - assert not np.all(np.isnan(geom.per_atom.forces)) - assert geom == geom_ - assert geom.energy == e + out1 = einstein.compute(test_set) + out1_ = einstein_.compute(test_set) + out2 = mixed.compute(test_set) + out2_ = mixed_.compute(test_set) + assert np.allclose(out1.result().energy, out1_.result().energy) + assert np.allclose(out2.result().energy, out2_.result().energy) def test_json_dump(): @@ -368,15 +381,36 @@ def test_function_from_json(tmp_path, dataset): UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: DISTANCE ATOMS=1,2 NOPBC METAD ARG=CV PACE=1 SIGMA=3 HEIGHT=342 FILE={} -""".format( - path_hills - ) +""".format(path_hills) hamiltonian = PlumedHamiltonian(plumed_input, File(path_hills)) future = hamiltonian.serialize_function() future.result() # ensure file exists function = function_from_json(future.filepath) - energies = hamiltonian.compute(dataset, "energy") + energies = hamiltonian.compute(dataset).result().energy energies_ = function.compute(dataset.geometries().result())["energy"] - assert np.allclose(energies.result(), energies_) + assert np.allclose(energies, energies_) + + +def test_evaluate(dataset): + test_set = dataset[:10].reset() + hamiltonian = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0) + + out0 = hamiltonian.evaluate(test_set) + out1 = hamiltonian.evaluate(test_set, batch_size=None) + result = hamiltonian.compute(test_set) + + geoms0 = out0.geometries().result() + geoms1 = out1.geometries().result() + data = result.result().to_dict() + + for i in range(len(geoms0)): + geom0, geom1 = geoms0[i], geoms1[i] + assert np.isclose(geom0.energy, geom1.energy) + assert np.isclose(geom0.energy, data["energy"][i]) + assert np.allclose(geom0.per_atom.forces, geom1.per_atom.forces) + assert np.allclose(geom0.per_atom.forces, data["forces"][i]) + assert np.allclose(geom0.stress, geom1.stress) + assert np.allclose(geom0.stress, data["stress"][i]) + diff --git a/tests/test_learning.py b/tests/test_learning.py index 430a6ca..416a09c 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -3,7 +3,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import new_nullstate +# from psiflow.geometry import new_nullstate from psiflow.hamiltonians import EinsteinCrystal from psiflow.learning import Learning, evaluate_outputs from psiflow.metrics import Metrics, _create_table, parse_walker_log, reconstruct_dtypes diff --git a/tests/test_models.py b/tests/test_models.py index 9476975..7da4e80 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -4,7 +4,7 @@ from parsl.app.futures import DataFuture import psiflow -from psiflow.data import compute_rmse +from psiflow.compute import compare_arrays from psiflow.hamiltonians import MACEHamiltonian from psiflow.models import MACE from psiflow.models.mace import KEY_ATOMIC_ENERGIES, KEY_ITERATION, MODEL_DIRS @@ -59,19 +59,16 @@ def test_mace_train(gpu, mace_config, dataset, tmp_path): validation = dataset[-5:] path = tmp_path / "mace" model = MACE.create(path, mace_config) + [data] = validation.get(key) model.train(training, validation) hamiltonian = model.create_hamiltonian() - rmse0 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation0 = hamiltonian.evaluate(validation) + [data0] = validation0.get(key) future_train = model.train(training, validation) hamiltonian = model.create_hamiltonian() - rmse1 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation1 = hamiltonian.evaluate(validation) + [data1] = validation1.get(key) future_train.result() # wait for second training run with pytest.raises(AssertionError): @@ -81,25 +78,25 @@ def test_mace_train(gpu, mace_config, dataset, tmp_path): # train from load model_ = MACE.load(path) hamiltonian = model_.create_hamiltonian() - rmse2 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation2 = hamiltonian.evaluate(validation) + [data2] = validation2.get(key) model_.train(training, validation) hamiltonian = model_.create_hamiltonian() - rmse3 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation3 = hamiltonian.evaluate(validation) + [data3] = validation3.get(key) - print(rmse0.result()) - print(rmse1.result()) - print(rmse2.result()) - print(rmse3.result()) + rmse0 = compare_arrays(data, data0).result() + rmse1 = compare_arrays(data, data1).result() + rmse2 = compare_arrays(data, data2).result() + rmse3 = compare_arrays(data, data3).result() - assert rmse0.result() > rmse1.result() - assert np.isclose(rmse1.result(), rmse2.result()) - assert rmse2.result() > rmse3.result() + print(rmse0) + print(rmse1) + print(rmse2) + print(rmse3) + assert rmse0 > rmse1 + assert np.isclose(rmse1, rmse2) + assert rmse2 > rmse3 def test_mace_hamiltonian(dataset, mace_foundation): @@ -114,8 +111,11 @@ def test_mace_hamiltonian(dataset, mace_foundation): assert hamiltonian1 == hamiltonian2 hamiltonian2.update_kwargs(enable_cueq=True) - e0 = hamiltonian0.compute(dataset, "energy") - e1 = hamiltonian1.compute(dataset, "energy") - e2 = hamiltonian2.compute(dataset, "energy") - assert np.allclose(e0.result(), e1.result()) - assert np.allclose(e0.result(), e2.result()) + out0 = hamiltonian0.compute(dataset) + out1 = hamiltonian1.compute(dataset) + out2 = hamiltonian2.compute(dataset) + out0, out1, out2 = out0.result(), out1.result(), out2.result() + for key in out0.keys: + # single precision + assert np.allclose(out0.get(key), out1.get(key), atol=1e-6) + assert np.allclose(out0.get(key), out2.get(key), atol=1e-6) diff --git a/tests/test_reference.py b/tests/test_reference.py index 416974a..fcce746 100644 --- a/tests/test_reference.py +++ b/tests/test_reference.py @@ -5,7 +5,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import Geometry, NullState +from psiflow.geometry import Geometry, MISSING from psiflow.reference import CP2K, GPAW, ORCA, create_orca_input from psiflow.reference.reference import Status from psiflow.reference.cp2k_ import ( @@ -198,23 +198,19 @@ def test_cp2k_parse_output(): def test_cp2k_success(simple_cp2k_input, geom_h2_p): reference = CP2K(simple_cp2k_input) - future = reference.evaluate(geom_h2_p) - future_null = reference.evaluate(NullState) + future = reference(geom_h2_p) assert isinstance(future, AppFuture) geom_out = future.result() - geom_null = future_null.result() ref_energy = -1.167407360449355 * Ha ref_forces = np.array([[-0.00968014, 0.0, 0.0], [0.00967947, 0.0, 0.0]]) * Ha / Bohr - assert geom_out != NullState assert geom_out.energy is not None assert not np.any(np.isnan(geom_out.per_atom.forces)) assert np.allclose(ref_energy, geom_out.energy) assert np.allclose(ref_forces, geom_out.per_atom.forces, atol=1e-5) - assert geom_null == NullState # check whether NullState evaluates to NullState # check number of mpi processes - stdout, _ = get_task_logs(geom_out.order["task_id"]) + stdout, _ = get_task_logs(geom_out.metadata["task_id"]) lines = stdout.read_text().split("\n") for line in lines: if "Total number of message passing processes" in line: @@ -295,12 +291,12 @@ def test_cp2k_failure(geom_h2_p): &END FORCE_EVAL """ # incorrect input file reference = CP2K(cp2k_input) - future = reference.evaluate(geom_h2_p) + future = reference(geom_h2_p) assert isinstance(future, AppFuture) geom_out = future.result() - assert geom_out.energy is None - assert np.all(np.isnan(geom_out.per_atom.forces)) - stdout, _ = get_task_logs(geom_out.order["task_id"]) + assert geom_out.energy is MISSING + assert geom_out.per_atom.forces is MISSING + stdout, _ = get_task_logs(geom_out.metadata["task_id"]) assert "ABORT" in stdout.read_text() # verify error is captured @@ -311,25 +307,34 @@ def test_cp2k_memory(simple_cp2k_input): positions=np.random.uniform(0, 20, size=(4000, 3)), cell=20 * np.eye(3), # box way too large ) - energy, forces = reference.compute(geometry) - energy, forces = energy.result(), forces.result() - assert np.all(np.isnan(energy)) + geom_fail = reference(geometry).result() + assert geom_fail.energy is MISSING + assert geom_fail.per_atom.forces is MISSING + assert geom_fail.metadata["status"] == Status.FAILED.name @pytest.mark.filterwarnings("ignore:Original input file not found") def test_cp2k_timeout(simple_cp2k_input, geom_h2_p): + definition = psiflow.context().definitions["CP2K"] + runtime = definition.max_runtime + definition.max_runtime = 1 # seconds + reference = CP2K(simple_cp2k_input) geom_h2_p.cell = 20 * np.eye(3) # box way too large - energy, forces = reference.compute(Dataset([geom_h2_p])) - energy, forces = energy.result(), forces.result() - assert np.all(np.isnan(energy)) + + geom_fail = reference(geom_h2_p).result() + assert geom_fail.energy is MISSING + assert geom_fail.per_atom.forces is MISSING + assert geom_fail.metadata["status"] == Status.FAILED.name + + definition.max_runtime = runtime # restore value def test_cp2k_energy(simple_cp2k_input, geom_h2_p): reference = CP2K(simple_cp2k_input, outputs=("energy",)) - geom_out = reference.evaluate(geom_h2_p).result() + geom_out = reference(geom_h2_p).result() assert geom_out.energy is not None - assert np.all(np.isnan(geom_out.per_atom.forces)) + assert geom_out.per_atom.forces is MISSING @pytest.mark.filterwarnings("ignore:Original input file not found") @@ -337,6 +342,7 @@ def test_cp2k_atomic_energies(simple_cp2k_input): # use energy-only because why not reference = CP2K(simple_cp2k_input, outputs=("energy",)) energy = reference.compute_atomic_energy("H", box_size=4) + print(energy.result()) assert abs(energy.result() - (-13.6)) < 1 # reasonably close to exact value @@ -380,27 +386,30 @@ def test_cp2k_posthf(geom_h2_p): &END FORCE_EVAL """ reference = CP2K(cp2k_input_str, outputs=("energy",)) - assert reference.evaluate(geom_h2_p).result().energy is not None + assert reference(geom_h2_p).result().energy is not MISSING -def test_gpaw_single(dataset_h2): +def test_gpaw(dataset_h2): + # basic functionality parameters = dict(mode="fd", nbands=0, xc="LDA", h=0.3, minimal_box_multiple=2) gpaw = GPAW(parameters) future_in = dataset_h2[0] - future_out = gpaw.evaluate(future_in) - future_energy = gpaw.compute(dataset_h2[:1])[0] + future_out = gpaw(future_in) + dataset_out = gpaw.evaluate(dataset_h2[:2]) future_energy_zr = gpaw.compute_atomic_energy("Zr", box_size=9) - gpaw = GPAW({"askdfj": "asdfk"}) # invalid input - future_fail = gpaw.evaluate(future_in) - geom_in, geom_out = future_in.result(), future_out.result() - energy, energy_zr = future_energy.result(), future_energy_zr.result() + # invalid input + gpaw = GPAW({"askdfj": "asdfk"}) + future_fail = gpaw(future_in) - assert geom_out.energy is not None and geom_out.energy < 0.0 + geom_in, geom_out = future_in.result(), future_out.result() + geoms_out = dataset_out.geometries().result() + assert geom_out.energy is not MISSING and geom_out.energy < 0.0 assert np.allclose(geom_out.per_atom.positions, geom_in.per_atom.positions) - assert np.allclose(geom_out.energy, energy) - assert energy_zr == 0.0 - assert future_fail.result().energy is None + assert future_energy_zr.result() == 0.0 + for geom in geoms_out: + assert geom.energy is not MISSING and geom.energy < 0.0 + assert future_fail.result().energy is MISSING # TODO: enable once we have an ORCA container diff --git a/tests/test_sampling.py b/tests/test_sampling.py index 0f3ccf8..152968c 100644 --- a/tests/test_sampling.py +++ b/tests/test_sampling.py @@ -5,7 +5,6 @@ from ase.units import Bohr import psiflow -from psiflow.geometry import check_equality from psiflow.hamiltonians import EinsteinCrystal, PlumedHamiltonian, MACEHamiltonian from psiflow.sampling.optimize import ( optimize as optimize_ipi, @@ -29,6 +28,8 @@ ) from psiflow.sampling.output import Status +from psiflow.data.utils import check_equality + def test_walkers(dataset): future = dataset[0] @@ -120,11 +121,9 @@ def test_parse_checkpoint(checkpoint): file = psiflow.context().new_file("input_", ".xml") Path(file.filepath).write_text(checkpoint) states = parse_checkpoint(file.filepath) - assert "time" in states[0].order - assert np.allclose( - states[0].cell, - np.array([[1, 0.0, 0], [0.1, 2, 0], [0, 0, 3]]) * Bohr, - ) + cell = np.array([[1, 0.0, 0], [0.1, 2, 0], [0, 0, 3]]) * Bohr + assert hasattr(states[0], "time") + assert np.allclose(states[0].cell, cell) def test_sample(dataset, mace_foundation): @@ -143,13 +142,9 @@ def test_sample(dataset, mace_foundation): METAD ARG=CV PACE=5 SIGMA=0.05 HEIGHT=5 """ metadynamics = Metadynamics(plumed_str) + mixture = 0.9 * plumed + einstein walkers = [ - Walker( - start=future, - temperature=300, - metadynamics=metadynamics, - hamiltonian=0.9 * plumed + einstein, - ), + Walker(start=future, metadynamics=metadynamics, hamiltonian=mixture), Walker(start=future, temperature=600, hamiltonian=einstein), Walker(start=future, temperature=450, hamiltonian=einstein), Walker(start=future, temperature=600, hamiltonian=einstein), @@ -182,11 +177,11 @@ def test_sample(dataset, mace_foundation): outputs[1]["potential{electronvolt}"].result(), ] energies_ = [ - (0.9 * plumed + einstein).compute(outputs[0].trajectory, "energy"), - einstein.compute(outputs[1].trajectory, "energy"), + mixture.compute(outputs[0].trajectory).result().energy, + einstein.compute(outputs[1].trajectory).result().energy, ] - assert len(energies[0]) == len(energies_[0].result()) - assert np.allclose(energies[0], energies_[0].result()) + assert len(energies[0]) == len(energies_[0]) + assert np.allclose(energies[0], energies_[0]) time = outputs[0]["time{picosecond}"].result() assert np.allclose(time, np.arange(0, 51, 5) * 5e-4) @@ -237,7 +232,7 @@ def test_sample(dataset, mace_foundation): checkpoint_step=1, fix_com=True, # otherwise temperature won't match ) - energy = hamiltonian.compute(dataset[1], "energy") + energy = hamiltonian.compute(dataset[1]).energy energy_ = output["potential{electronvolt}"] assert np.allclose(energy.result()[0], energy_.result()[0], atol=1e-5) assert np.allclose(output.time.result(), 10 * 0.5 / 1000)