From 482b16c8cac13ab09c62475965499d49e2fd458e Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 1 Apr 2026 20:13:55 +0200 Subject: [PATCH 01/16] Update geometry.py - per_atom quantities are now stored in a custom Container class, rather than a np.recarray - cleaned up geometry attributes - new to_atoms() function - redid a lot of internals - start removing NullState --- psiflow/geometry.py | 827 ++++++++++++++++++++------------------------ 1 file changed, 373 insertions(+), 454 deletions(-) diff --git a/psiflow/geometry.py b/psiflow/geometry.py index c9400c4..a2fa7e5 100644 --- a/psiflow/geometry.py +++ b/psiflow/geometry.py @@ -1,463 +1,379 @@ -from __future__ import annotations # necessary for type-guarding class methods - +import io +import pickle from pathlib import Path -from typing import Optional, Union +from types import SimpleNamespace +from typing import Optional, Union, Any +from collections.abc import Sequence +import ase.calculators.calculator import numpy as np -import typeguard +import numpy.typing as npt from ase import Atoms -from ase.data import atomic_masses, chemical_symbols -from ase.io.extxyz import key_val_dict_to_str, key_val_str_to_dict_regex +from ase.data import atomic_masses, chemical_symbols, atomic_numbers +from ase.io.extxyz import key_val_dict_to_str, read_xyz, key_val_str_to_dict from parsl.app.app import python_app import psiflow -per_atom_dtype = np.dtype( - [ - ("numbers", np.uint8), - ("positions", np.float64, (3,)), - ("forces", np.float64, (3,)), - ] -) - -QUANTITIES = [ - "positions", - "cell", - "numbers", - "energy", - "per_atom_energy", - "forces", - "stress", - "delta", - "logprob", - "phase", - "identifier", -] - - -@typeguard.typechecked +# TODO: docstrings +# TODO: custom attributes + + +class Container: + """ + Holds 'per atom' arrays like positions and forces + For every attribute, the first array dimension should be n_atoms + """ + + def __init__( + self, + numbers: npt.NDArray[np.uint8], + positions: npt.NDArray[np.float64], + **kwargs: npt.NDArray, + ): + self.numbers = numbers.astype(int).reshape(-1, 1) + self.positions = positions + self._check(positions) + for k, v in kwargs.items(): + setattr(self, k, v) + + def __setattr__(self, name, value): + if not self._check(value): + raise ValueError(f"Field '{name}' is not a per-atom property..") + super().__setattr__(name, value) + + def __getattr__(self, name): + # only runs if the attribute isn't found normally + raise AttributeError( + f"'{type(self).__name__}' instance has no attribute '{name}'" + ) + + def __len__(self) -> int: + return self.numbers.shape[0] + + def __str__(self) -> str: + return f"Container[{len(self)}](keys={set(vars(self))})" + + def _check(self, arr: npt.NDArray) -> bool: + try: + return (arr.shape[0] == len(self)) and arr.ndim == 2 + except AttributeError: + return arr.ndim == 2 # numbers not yet defined + + def reset(self): + """Wipes optional fields""" + attrs = [k for k in vars(self) if k not in ("numbers", "positions")] + for attr in attrs: + delattr(self, attr) + + def to_string(self) -> tuple[str, str]: + symbols = [chemical_symbols[i] for i in self.numbers.flatten()] + data = {"species": np.array(symbols).reshape(-1, 1), "pos": self.positions} + if hasattr(self, "forces"): + data["forces"] = self.forces + + # create extxyz header + header = "Properties=species:S:1:pos:R:3" + for k, arr in data.items(): + if k in header: + continue + header += f":{k}:R:{arr.shape[1]}" + + # create a structured array with all fields + dtypes = [] + for k, arr in data.items(): + dtypes += [arr.dtype] * arr.shape[1] + dtype = [(str(i), t) for i, t in enumerate(dtypes)] + + structured = np.zeros(len(self), dtype=dtype) + i = 0 + for k, arr in data.items(): + for j in range(arr.shape[1]): + structured[str(i)] = arr[:, j] + i += 1 + + # write to string + formats = [] + format_map = {"species": "%-2s"} + for k, arr in data.items(): + dformat = format_map.get(k, "%16.8f") + formats += [dformat] * arr.shape[1] + buffer = io.StringIO() + np.savetxt(buffer, structured, fmt=formats) + txt = buffer.getvalue() + + return header, txt + + @classmethod + def from_string(cls, s: str, props: Optional[str] = None): + props = (props or "species:S:1:pos:R:3") + ":" + dtype_map = {"R": "f8", "S": "i8"} + + # figure out what columns belong to which property + keys, dtypes = [], [] + while props: + k, t, n, props = props.split(":", 3) + keys += [k] * int(n) + dtypes += [dtype_map[t]] * int(n) + dtype = [(str(i), t) for i, t in enumerate(dtypes)] + + # convert species to numbers automatically + conv = {0: lambda s: atomic_numbers[s]} + structured = np.loadtxt(io.StringIO(s), dtype=dtype, converters=conv) + + # group subarrays per property + data = {k: [] for k in set(keys)} + for i, k in enumerate(structured.dtype.names): + arr = structured[k] + data[keys[i]].append(arr) + arrays = {k: np.stack(v, axis=-1) for k, v in data.items()} + + arrays["numbers"] = arrays.pop("species") + arrays["positions"] = arrays.pop("pos") + return cls(**arrays) + + class Geometry: """ Represents an atomic structure with associated properties. - This class encapsulates the atomic structure, including atom positions, cell parameters, and various physical properties such as energy and forces. Attributes: - per_atom (np.recarray): Record array containing per-atom properties. + per_atom (Container): Record array containing per-atom properties. cell (np.ndarray): 3x3 array representing the unit cell vectors. - order (dict): Dictionary to store custom ordering information. energy (Optional[float]): Total energy of the system. stress (Optional[np.ndarray]): Stress tensor of the system. - delta (Optional[float]): Delta value, if applicable. - phase (Optional[str]): Phase information, if applicable. - logprob (Optional[np.ndarray]): Log probability values, if applicable. - stdout (Optional[str]): Standard output information, if applicable. - identifier (Optional[int]): Unique identifier for the geometry. """ - per_atom: np.recarray - cell: np.ndarray - order: dict + per_atom: Container + cell: npt.NDArray energy: Optional[float] stress: Optional[np.ndarray] - delta: Optional[float] - phase: Optional[str] - logprob: Optional[np.ndarray] - stdout: Optional[str] - identifier: Optional[int] + meta: SimpleNamespace def __init__( self, - per_atom: np.recarray, - cell: np.ndarray, - order: Optional[dict] = None, + per_atom: Container, energy: Optional[float] = None, + cell: Optional[np.ndarray] = None, stress: Optional[np.ndarray] = None, - delta: Optional[float] = None, - phase: Optional[str] = None, - logprob: Optional[np.ndarray] = None, - stdout: Optional[str] = None, - identifier: Optional[int] = None, + **kwargs: Any, ): """ Initialize a Geometry instance, though the preferred way of instantiating - proceeds via the `from_data` or `from_atoms` class methods - - Args: - per_atom (np.recarray): Record array containing per-atom properties. - cell (np.ndarray): 3x3 array representing the unit cell vectors. - order (Optional[dict], optional): Custom ordering information. Defaults to None. - energy (Optional[float], optional): Total energy of the system. Defaults to None. - stress (Optional[np.ndarray], optional): Stress tensor of the system. Defaults to None. - delta (Optional[float], optional): Delta value. Defaults to None. - phase (Optional[str], optional): Phase information. Defaults to None. - logprob (Optional[np.ndarray], optional): Log probability values. Defaults to None. - stdout (Optional[str], optional): Standard output information. Defaults to None. - identifier (Optional[int], optional): Unique identifier for the geometry. Defaults to None. - """ - self.per_atom = per_atom.astype(per_atom_dtype) # copies data - self.cell = cell.astype(np.float64) - assert self.cell.shape == (3, 3) - if order is None: - order = {} - self.order = order + proceeds via the class methods + """ + self.per_atom = per_atom self.energy = energy + if cell is not None: + assert cell.shape == (3, 3) + self.cell = cell + if stress is not None: + assert stress.shape == (3, 3) self.stress = stress - self.delta = delta - self.phase = phase - self.logprob = logprob - self.stdout = stdout - self.identifier = identifier + self.meta = SimpleNamespace(**kwargs) - def reset(self): + def reset(self) -> None: """ Reset all computed properties of the geometry to their default values. """ + self.per_atom.reset() self.energy = None self.stress = None - self.delta = None - self.phase = None - self.logprob = None - self.per_atom.forces[:] = np.nan - def clean(self): + def clean(self) -> None: """ Clean the geometry by resetting properties and removing additional information. """ self.reset() - self.order = {} - self.stdout = None - self.identifier = None + self.meta = SimpleNamespace() - def __eq__(self, other) -> bool: - """ - Check if two Geometry instances are equal. - - Args: - other: The other object to compare with. - - Returns: - bool: True if the geometries are equal, False otherwise. - """ - if not isinstance(other, Geometry): - return False - # have to check separately for np.allclose due to different dtypes - equal = True - equal = equal and (len(self) == len(other)) - equal = equal and (self.periodic == other.periodic) - if not equal: - return False - equal = equal and np.allclose(self.per_atom.numbers, other.per_atom.numbers) - equal = equal and np.allclose(self.per_atom.positions, other.per_atom.positions) - equal = equal and np.allclose(self.cell, other.cell) - return bool(equal) - - def align_axes(self): + def align_axes(self) -> None: """ Align the axes of the unit cell to a canonical representation for periodic systems. """ - if self.periodic: # only do something if periodic: - positions = self.per_atom.positions - cell = self.cell - transform_lower_triangular(positions, cell, reorder=False) - reduce_box_vectors(cell) + if not self.periodic: + return + positions = self.per_atom.positions + cell = self.cell + transform_lower_triangular(positions, cell, reorder=False) + reduce_box_vectors(cell) - def __len__(self): + def copy(self) -> Geometry: """ - Get the number of atoms in the geometry. - - Returns: - int: The number of atoms. + Create a deep copy of the Geometry instance. """ - return len(self.per_atom) + return pickle.loads(pickle.dumps(self)) def to_string(self) -> str: """ Convert the Geometry instance to a string representation in extended XYZ format. - - Returns: - str: String representation of the geometry. """ + data = {k: getattr(self, k) for k in ("energy", "stress")} if self.periodic: - comment = 'Lattice="' - comment += " ".join([str(x) for x in np.reshape(self.cell.T, 9, order="F")]) - comment += '" pbc="T T T" ' + data["Lattice"] = self.cell.T # ase does Fortran ordering + data["pbc"] = "T T T" else: - comment = 'pbc="F F F" ' - - write_forces = not np.any(np.isnan(self.per_atom.forces)) - comment += "Properties=species:S:1:pos:R:3" - if write_forces: - comment += ":forces:R:3" - comment += " " - - keys = [ - "energy", - "stress", - "delta", - "phase", - "logprob", - "stdout", - "identifier", - ] - values_dict = {} - for key in keys: - value = getattr(self, key) - if value is None: - continue - if type(value) is np.ndarray: - if np.all(np.isnan(value)): - continue - values_dict[key] = value - for key, value in self.order.items(): - values_dict["order_" + key] = value - comment += key_val_dict_to_str(values_dict) - lines = ["{}".format(len(self))] - lines.append("{}".format(comment)) - fmt = " ".join(["%2s"] + 3 * ["%16.8f"]) + " " - if write_forces: - fmt += " ".join(3 * ["%16.8f"]) - for i in range(len(self)): - entry = (chemical_symbols[self.per_atom.numbers[i]],) - entry = entry + tuple(self.per_atom.positions[i]) - if write_forces: - entry = entry + tuple(self.per_atom.forces[i]) - lines.append(fmt % entry) - return "\n".join(lines) - - def save(self, path_xyz: Union[Path, str]): + data["pbc"] = "F F F" + data |= vars(self.meta) + data = {k: v for k, v in data.items() if v is not None} + info_str = key_val_dict_to_str(data) + properties, txt = self.per_atom.to_string() + header = " ".join([properties, info_str]) + return "\n".join([str(len(self)), header, txt]) + + def save(self, path_xyz: Path | str) -> None: """ Save the Geometry instance to an XYZ file. - - Args: - path_xyz (Union[Path, str]): Path to save the XYZ file. """ path_xyz = psiflow.resolve_and_check(Path(path_xyz)) - with open(path_xyz, "w") as f: - f.write(self.to_string()) + path_xyz.write_text(self.to_string()) - def copy(self) -> Geometry: + def to_atoms(self, structural_only: bool = False) -> Atoms: """ - Create a deep copy of the Geometry instance. + Convert the Geometry instance to an Atoms object. + """ + if structural_only: + return Atoms( + positions=self.per_atom.positions, + numbers=self.numbers, + cell=self.cell, + pbc=self.periodic, + ) + + # use the ASE extxyz reader + s = self.to_string() + return next(read_xyz(io.StringIO(s), index=0)) + + def __eq__(self, other: "Geometry") -> bool: + """ + Check if two Geometry instances are structurally equal. + """ + if ( + isinstance(other, Geometry) + and len(self) == len(other) + and self.periodic == other.periodic + and (not self.periodic or np.allclose(self.cell, other.cell)) + and np.allclose(self.per_atom.numbers, other.per_atom.numbers) + and np.allclose(self.per_atom.positions, other.per_atom.positions) + ): + return True + return False - Returns: - Geometry: A new Geometry instance with the same data. + def __len__(self) -> int: """ - return Geometry.from_string(self.to_string()) + Get the number of atoms in the geometry. + """ + return len(self.per_atom) @classmethod - def from_string(cls, s: str, natoms: Optional[int] = None) -> Optional[Geometry]: + def from_string(cls, s: str) -> Geometry: """ Create a Geometry instance from a string representation in extended XYZ format. - - Args: - s (str): String representation of the geometry. - natoms (Optional[int], optional): Number of atoms (if known). Defaults to None. - - Returns: - Optional[Geometry]: A new Geometry instance, or None if the string is empty. """ - if len(s) == 0: - return None - if not natoms: # natoms in s - lines = s.strip().split("\n") - natoms = int(lines[0]) - lines = lines[1:] + n_atoms, header, body = s.strip().split("\n", 2) + data = key_val_str_to_dict(header) + data.pop("pbc", None) + if "Lattice" in data: + data["cell"] = data.pop("Lattice").T # ase does Fortran ordering else: - lines = s.rstrip().split( - "\n" - ) # i-PI nonperiodic starts with empty -> rstrip! - assert len(lines) == natoms + 1 - comment = lines[0] - comment_dict = key_val_str_to_dict_regex(comment) - - # read and format per_atom data - column_indices = {} - if "Properties" in comment_dict: - properties = comment_dict["Properties"].split(":") - count = 0 - for i in range(len(properties) // 3): - name = properties[3 * i] - ncolumns = int(properties[3 * i + 2]) - column_indices[name] = count - count += ncolumns - assert "pos" in column_indices # positions need to be there - - per_atom = np.recarray(natoms, dtype=per_atom_dtype) - per_atom.forces[:] = np.nan - POS_INDEX = column_indices.get("pos", 1) - FORCES_INDEX = column_indices.get("forces", None) - for i in range(natoms): - values = lines[i + 1].split() - per_atom.numbers[i] = chemical_symbols.index(values[0]) - per_atom.positions[i, :] = [ - float(_) for _ in values[POS_INDEX : POS_INDEX + 3] - ] - if FORCES_INDEX is not None: - per_atom.forces[i, :] = [ - float(_) for _ in values[FORCES_INDEX : FORCES_INDEX + 3] - ] - - order = {} - for key, value in comment_dict.items(): - if key.startswith("order_"): - order[key.replace("order_", "")] = value - - # TODO: this needs a better solution - cell = comment_dict.pop("Lattice", np.zeros((3, 3))) - if isinstance(cell, str): - # most likely an array of NaNs - cell = np.array(cell.split()).reshape((3, 3)) - cell = cell.T # transposed! - pbc = sum(comment_dict.pop("pbc", [])) == 3 - geometry = cls( - per_atom=per_atom, - cell=cell if pbc else np.zeros((3, 3)), - energy=comment_dict.pop("energy", None), - stress=comment_dict.pop("stress", None), - delta=comment_dict.pop("delta", None), - phase=comment_dict.pop("phase", None), - logprob=comment_dict.pop("logprob", None), - stdout=comment_dict.pop("stdout", None), - identifier=comment_dict.pop("identifier", None), - order=order, - ) - return geometry + data["cell"] = None + per_atom = Container.from_string(body, data.pop("Properties", None)) + + assert len(per_atom) == int(n_atoms) + return Geometry(per_atom, **data) @classmethod - def load(cls, path_xyz: Union[Path, str]) -> Geometry: + def load(cls, path_xyz: Path | str) -> "Geometry": """ Load a Geometry instance from an XYZ file. - - Args: - path_xyz (Union[Path, str]): Path to the XYZ file. - - Returns: - Geometry: A new Geometry instance loaded from the file. """ path_xyz = psiflow.resolve_and_check(Path(path_xyz)) - assert path_xyz.exists() - with open(path_xyz, "r") as f: - content = f.read() + content = path_xyz.read_text() return cls.from_string(content) + @classmethod + def from_data( + cls, numbers: np.ndarray, positions: np.ndarray, cell: Optional[np.ndarray] + ) -> Geometry: + """ + Create a Geometry instance from atomic numbers, positions, and cell data. + """ + per_atom = Container(numbers.copy(), positions.copy()) + cell = cell if cell is None else cell.copy() + return Geometry(per_atom, cell=cell) + + @classmethod + def from_atoms(cls, atoms: Atoms) -> Geometry: + """ + Create a Geometry instance from an ASE Atoms object. + """ + per_atom = Container(**atoms.arrays) + data = atoms.info + if all(atoms.pbc): + data["cell"] = atoms.cell.array + if atoms.calc is not None: + # ASE does stupid calc things + try: + data["energy"] = atoms.get_potential_energy() + per_atom.forces = atoms.get_forces() + except ase.calculators.calculator.PropertyNotImplementedError: + pass # property was not stored in calc + return cls(per_atom, **data) + @property - def periodic(self): + def periodic(self) -> bool: """ Check if the geometry is periodic. - - Returns: - bool: True if the geometry is periodic, False otherwise. """ return np.any(self.cell) @property - def per_atom_energy(self): + def per_atom_energy(self) -> Optional[float]: """ Calculate the energy per atom. - - Returns: - Optional[float]: Energy per atom if total energy is available, None otherwise. """ if self.energy is None: return None - else: - return self.energy / len(self) + return self.energy / len(self) @property - def volume(self): + def volume(self) -> Optional[float]: """ Calculate the volume of the unit cell. - - Returns: - float: Volume of the unit cell for periodic systems, np.nan for non-periodic systems. """ if not self.periodic: - return np.nan - else: - return np.linalg.det(self.cell) - - @property - def atomic_masses(self): - """ - Get the atomic masses of the atoms in the geometry. - - Returns: - np.ndarray: Array of atomic masses. - """ - return np.array([atomic_masses[n] for n in self.per_atom.numbers]) + return None + return np.linalg.det(self.cell) - @classmethod - def from_data( - cls, - numbers: np.ndarray, - positions: np.ndarray, - cell: Optional[np.ndarray], - ) -> Geometry: + @property + def atomic_masses(self) -> npt.NDArray: """ - Create a Geometry instance from atomic numbers, positions, and cell data. - - Args: - numbers (np.ndarray): Array of atomic numbers. - positions (np.ndarray): Array of atomic positions. - cell (Optional[np.ndarray]): Unit cell vectors (or None for non-periodic systems). - - Returns: - Geometry: A new Geometry instance. + Get the atomic masses of the atoms in the geometry. """ - per_atom = np.recarray(len(numbers), dtype=per_atom_dtype) - per_atom.numbers[:] = numbers - per_atom.positions[:] = positions - per_atom.forces[:] = np.nan - if cell is not None: - cell = cell.copy() - else: - cell = np.zeros((3, 3)) - return Geometry(per_atom, cell) + return np.array([atomic_masses[n] for n in self.per_atom.numbers]).flatten() - @classmethod - def from_atoms(cls, atoms: Atoms) -> Geometry: + @property + def numbers(self) -> npt.NDArray: """ - Create a Geometry instance from an ASE Atoms object. - - Args: - atoms (Atoms): ASE Atoms object. - - Returns: - Geometry: A new Geometry instance. + Get a flattened version of the per_atom.numbers array. """ - per_atom = np.recarray(len(atoms), dtype=per_atom_dtype) - per_atom.numbers[:] = atoms.numbers.astype(np.uint8) - per_atom.positions[:] = atoms.get_positions() - per_atom.forces[:] = atoms.arrays.get("forces", np.nan) # TODO: ASE stores forces in calc now - if np.any(atoms.pbc): - cell = np.array(atoms.cell) - else: - cell = np.zeros((3, 3)) - geometry = cls(per_atom, cell) - geometry.energy = atoms.info.get("energy", None) - geometry.stress = atoms.info.get("stress", None) - geometry.delta = atoms.info.get("delta", None) - geometry.phase = atoms.info.get("phase", None) - geometry.logprob = atoms.info.get("logprob", None) - geometry.stdout = atoms.info.get("stdout", None) - geometry.identifier = atoms.info.get("identifier", None) - return geometry - - -def new_nullstate(): - """ - Create a new null state Geometry. + return self.per_atom.numbers.flatten() - Returns: - Geometry: A Geometry instance representing a null state. - """ - return Geometry.from_data(np.zeros(1), np.zeros((1, 3)), None) + +# def new_nullstate(): +# """ +# Create a new null state Geometry. +# +# Returns: +# Geometry: A Geometry instance representing a null state. +# """ +# return Geometry.from_data(np.zeros(1), np.zeros((1, 3)), None) # use universal dummy state -NullState = new_nullstate() +# NullState = new_nullstate() def is_lower_triangular(cell: np.ndarray) -> bool: @@ -554,7 +470,6 @@ def reduce_box_vectors(cell: np.ndarray): cell[1, :] = cell[1, :] - cell[0, :] * np.round(cell[1, 0] / cell[0, 0]) -@typeguard.typechecked def get_mass_matrix(geometry: Geometry) -> np.ndarray: """ Compute the mass matrix for a given geometry. @@ -573,7 +488,6 @@ def get_mass_matrix(geometry: Geometry) -> np.ndarray: return np.outer(sqrt_inv, sqrt_inv) -@typeguard.typechecked def mass_weight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: """ Apply mass-weighting to a Hessian matrix. @@ -590,7 +504,6 @@ def mass_weight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: return hessian * get_mass_matrix(geometry) -@typeguard.typechecked def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: """ Remove mass-weighting from a Hessian matrix. @@ -607,104 +520,110 @@ def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: return hessian / get_mass_matrix(geometry) -def create_outputs(quantities: list[str], data: list[Geometry]) -> list[np.ndarray]: - """ - Create output arrays for specified quantities from a list of Geometry instances. - - Args: - quantities (list[str]): List of quantity names to extract. - data (list[Geometry]): List of Geometry instances. - - Returns: - list[np.ndarray]: List of arrays containing the requested quantities. - """ - order_names = list(set([k for g in data for k in g.order])) - assert all([q in QUANTITIES + order_names for q in quantities]) - natoms = np.array([len(geometry) for geometry in data], dtype=int) - max_natoms = np.max(natoms) - nframes = len(data) - nprob = 0 - max_phase = 0 - for state in data: - if state.logprob is not None: - nprob = max(len(state.logprob), nprob) - if state.phase is not None: - max_phase = max(len(state.phase), max_phase) - - arrays = [] - for quantity in quantities: - if quantity in ["positions", "forces"]: - array = np.empty((nframes, max_natoms, 3), dtype=np.float64) - array[:] = np.nan - elif quantity in ["cell", "stress"]: - array = np.empty((nframes, 3, 3), dtype=np.float64) - array[:] = np.nan - elif quantity in ["numbers"]: - array = np.empty((nframes, max_natoms), dtype=np.uint8) - array[:] = 0 - elif quantity in ["energy", "delta", "per_atom_energy"]: - array = np.empty((nframes,), dtype=np.float64) - array[:] = np.nan - elif quantity in ["phase"]: - array = np.empty((nframes,), dtype=(np.unicode_, max_phase)) - array[:] = "" - elif quantity in ["logprob"]: - array = np.empty((nframes, nprob), dtype=np.float64) - array[:] = np.nan - elif quantity in ["identifier"]: - array = np.empty((nframes,), dtype=np.int64) - array[:] = -1 - elif quantity in order_names: - array = np.empty((nframes,), dtype=np.float64) - array[:] = np.nan - else: - raise AssertionError("missing quantity in if/else") - arrays.append(array) - return arrays - - -def _assign_identifier( - state: Geometry, - identifier: int, - discard: bool = False, -) -> tuple[Geometry, int]: - """ - Assign an identifier to a Geometry instance. - - Args: - state (Geometry): Input Geometry instance. - identifier (int): Identifier to assign. - discard (bool, optional): Whether to discard the state. Defaults to False. - - Returns: - tuple[Geometry, int]: Updated Geometry and next available identifier. - """ - if (state == NullState) or discard: - return state, identifier - else: - assert state.identifier is None - state.identifier = identifier - return state, identifier + 1 - - -assign_identifier = python_app(_assign_identifier, executors=["default_threads"]) - - -@typeguard.typechecked -def _check_equality( - state0: Geometry, - state1: Geometry, -) -> bool: - """ - Check if two Geometry instances are equal. - - Args: - state0 (Geometry): First Geometry instance. - state1 (Geometry): Second Geometry instance. - - Returns: - bool: True if the Geometry instances are equal, False otherwise. - """ +def get_unique_numbers(states: Sequence[Geometry]) -> set[int]: + """Returns a set of unique atom numbers found across all states.""" + numbers = set(el for geom in states for el in np.unique(geom.per_atom.numbers)) + return {int(n) for n in numbers} + + +def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> float: + """Compute the total atomic energy based on provided single atom energies.""" + total = 0 + numbers, counts = np.unique(geometry.numbers, return_counts=True) + for number, count in zip(numbers, counts): + symbol = chemical_symbols[number] + try: + total += count * atomic_energies[symbol] + except KeyError: + raise KeyError(f"No atomic energy value for symbol '{symbol}'..") + return float(total) + + +# def create_outputs(quantities: list[str], data: list[Geometry]) -> list[np.ndarray]: +# """ +# Create output arrays for specified quantities from a list of Geometry instances. +# +# Args: +# quantities (list[str]): List of quantity names to extract. +# data (list[Geometry]): List of Geometry instances. +# +# Returns: +# list[np.ndarray]: List of arrays containing the requested quantities. +# """ +# order_names = list(set([k for g in data for k in g.order])) +# assert all([q in QUANTITIES + order_names for q in quantities]) +# natoms = np.array([len(geometry) for geometry in data], dtype=int) +# max_natoms = np.max(natoms) +# nframes = len(data) +# nprob = 0 +# max_phase = 0 +# for state in data: +# if state.logprob is not None: +# nprob = max(len(state.logprob), nprob) +# if state.phase is not None: +# max_phase = max(len(state.phase), max_phase) +# +# arrays = [] +# for quantity in quantities: +# if quantity in ["positions", "forces"]: +# array = np.empty((nframes, max_natoms, 3), dtype=np.float64) +# array[:] = np.nan +# elif quantity in ["cell", "stress"]: +# array = np.empty((nframes, 3, 3), dtype=np.float64) +# array[:] = np.nan +# elif quantity in ["numbers"]: +# array = np.empty((nframes, max_natoms), dtype=np.uint8) +# array[:] = 0 +# elif quantity in ["energy", "delta", "per_atom_energy"]: +# array = np.empty((nframes,), dtype=np.float64) +# array[:] = np.nan +# elif quantity in ["phase"]: +# array = np.empty((nframes,), dtype=(np.unicode_, max_phase)) +# array[:] = "" +# elif quantity in ["logprob"]: +# array = np.empty((nframes, nprob), dtype=np.float64) +# array[:] = np.nan +# elif quantity in ["identifier"]: +# array = np.empty((nframes,), dtype=np.int64) +# array[:] = -1 +# elif quantity in order_names: +# array = np.empty((nframes,), dtype=np.float64) +# array[:] = np.nan +# else: +# raise AssertionError("missing quantity in if/else") +# arrays.append(array) +# return arrays + + +# def _assign_identifier( +# state: Geometry, +# identifier: int, +# discard: bool = False, +# ) -> tuple[Geometry, int]: +# """ +# Assign an identifier to a Geometry instance. +# +# Args: +# state (Geometry): Input Geometry instance. +# identifier (int): Identifier to assign. +# discard (bool, optional): Whether to discard the state. Defaults to False. +# +# Returns: +# tuple[Geometry, int]: Updated Geometry and next available identifier. +# """ +# if (state == NullState) or discard: +# return state, identifier +# else: +# assert state.identifier is None +# state.identifier = identifier +# return state, identifier + 1 + + +# assign_identifier = python_app(_assign_identifier, executors=["default_threads"]) + + +def _check_equality(state0: Geometry, state1: Geometry) -> bool: + """ Check if two Geometry instances are equal""" return state0 == state1 From 43fde16269cf375d33674bd18ba5153a8d5dccfe Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 1 Apr 2026 20:15:59 +0200 Subject: [PATCH 02/16] updating data.utils - API updates - some rewrites - functionaility unchanged --- psiflow/data/dataset.py | 192 +++++++------------------ psiflow/data/utils.py | 307 ++++++++++++++-------------------------- 2 files changed, 158 insertions(+), 341 deletions(-) diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index 4a86f39..b67003a 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -1,18 +1,15 @@ -from __future__ import annotations # necessary for type-guarding class methods - import math from pathlib import Path from typing import Callable, ClassVar, Optional, Union import numpy as np -import typeguard from parsl.app.app import join_app, python_app from parsl.app.python import PythonApp from parsl.data_provider.files import File from parsl.dataflow.futures import AppFuture, DataFuture import psiflow -from psiflow.geometry import QUANTITIES, Geometry +from psiflow.geometry import Geometry from psiflow.utils.apps import copy_data_future, pack from .utils import ( @@ -48,7 +45,7 @@ class Dataset: def __init__( self, - states: Optional[list[Union[AppFuture, Geometry]], AppFuture], + states: Optional[list[AppFuture | Geometry] | AppFuture] = None, extxyz: Optional[psiflow._DataFuture] = None, ): """ @@ -56,26 +53,15 @@ def __init__( Args: states: List of Geometry instances or AppFutures representing geometries. - extxyz: Optional DataFuture representing an existing extxyz file. - - Note: - Either states or extxyz should be provided, not both. """ - if extxyz is not None: - assert states is None + if extxyz is not None: # takes precedence over states self.extxyz = extxyz - else: - assert states is not None - if not isinstance(states, list): # AppFuture[list, geometry] - extra_states = states - states = [] - else: - extra_states = None - self.extxyz = write_frames( - *states, - extra_states=extra_states, - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] + return + + if not isinstance(states, list): # AppFuture[list, geometry] + states = [states] + file = psiflow.context().new_file("data_", ".xyz") + self.extxyz = write_frames(*states, outputs=[file]).outputs[0] def length(self) -> AppFuture: """ @@ -84,25 +70,19 @@ def length(self) -> AppFuture: Returns: AppFuture: Future representing the number of structures. """ - return count_frames(inputs=[self.extxyz]) + return count_frames(self.extxyz) - def shuffle(self): + def shuffle(self) -> "Dataset": """ Shuffle the order of structures in the dataset. - - Returns: - Dataset: A new Dataset with shuffled structures. """ - extxyz = shuffle( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + extxyz = shuffle(self.extxyz, outputs=[file]).outputs[0] + return Dataset(extxyz=extxyz) def __getitem__( - self, - index: Union[int, slice, list[int], AppFuture], - ) -> Union[Dataset, AppFuture]: + self, index: int | slice | list[int] | AppFuture + ) -> Dataset | AppFuture: """ Get a subset of the dataset or a single structure. @@ -112,36 +92,23 @@ def __getitem__( Returns: Union[Dataset, AppFuture]: A new Dataset or an AppFuture of a single Geometry. """ + future_frames = read_frames(self.extxyz, indices=index) if isinstance(index, int): - future = read_frames( - [index], - inputs=[self.extxyz], - outputs=[], # will return Geometry as Future - ) - return future[0] - else: # slice, list, AppFuture - extxyz = read_frames( - index, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + return future_frames[0] # will return Geometry as Future - def save(self, path: Union[Path, str]) -> DataFuture: + # slice, list, AppFuture + file = psiflow.context().new_file("data_", ".xyz") + extxyz = write_frames(future_frames, outputs=[file]).outputs[0] + return Dataset(extxyz=extxyz) + + def save(self, path: Path | str) -> DataFuture: """ Save the dataset to a file. - - Args: - path: Path to save the dataset. - Returns: DataFuture: Future representing the file to which will be saved. """ path = psiflow.resolve_and_check(Path(path)) - future = copy_data_future( - inputs=[self.extxyz], - outputs=[File(str(path))], - ) + future = copy_data_future(inputs=[self.extxyz], outputs=[File(path)]) return future.outputs[0] def geometries(self) -> AppFuture: @@ -151,96 +118,63 @@ def geometries(self) -> AppFuture: Returns: AppFuture: Future representing a list of Geometry instances. """ - return read_frames(inputs=[self.extxyz]) + return read_frames(self.extxyz) def __add__(self, dataset: Dataset) -> Dataset: """ Concatenate two datasets. - - Args: - dataset: Another Dataset to add to this one. - - Returns: - Dataset: A new Dataset containing structures from both datasets. """ - extxyz = join_frames( - inputs=[self.extxyz, dataset.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = join_frames(inputs=[self.extxyz, dataset.extxyz], outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - def subtract_offset(self, **atomic_energies: Union[float, AppFuture]) -> Dataset: + def subtract_offset(self, **atomic_energies: float | AppFuture) -> Dataset: """ Subtract atomic energy offsets from the dataset. - - Args: - **atomic_energies: Atomic energies for each element. - - Returns: - Dataset: A new Dataset with adjusted energies. """ assert len(atomic_energies) > 0 - extxyz = apply_offset( - True, - **atomic_energies, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = apply_offset( + self.extxyz, subtract=True, **atomic_energies, outputs=[file] + ) + return Dataset(extxyz=future.outputs[0]) def add_offset(self, **atomic_energies) -> Dataset: """ Add atomic energy offsets to the dataset. - - Args: - **atomic_energies: Atomic energies for each element. - - Returns: - Dataset: A new Dataset with adjusted energies. """ assert len(atomic_energies) > 0 - extxyz = apply_offset( - False, - **atomic_energies, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = apply_offset( + self.extxyz, subtract=False, **atomic_energies, outputs=[file] + ) + return Dataset(extxyz=future.outputs[0]) - def elements(self): + def elements(self) -> AppFuture: """ Get the set of elements present in the dataset. Returns: AppFuture: Future representing a set of element symbols. """ - return get_elements(inputs=[self.extxyz]) + return get_elements(self.extxyz) - def reset(self): + def reset(self) -> Dataset: """ Reset all structures in the dataset. - - Returns: - Dataset: A new Dataset with reset structures. """ - extxyz = reset_frames( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = reset_frames(self.extxyz, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - def clean(self): + def clean(self) -> Dataset: """ Clean all structures in the dataset. - - Returns: - Dataset: A new Dataset with cleaned structures. """ - extxyz = clean_frames( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + + file = psiflow.context().new_file("data_", ".xyz") + future = clean_frames(self.extxyz, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) def get( self, @@ -335,10 +269,8 @@ def not_null(self) -> Dataset: Returns: Dataset: A new Dataset without null states. """ - extxyz = not_null( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] + file = psiflow.context().new_file("data_", ".xyz") + extxyz = not_null(inputs=[self.extxyz], outputs=[file]).outputs[0] return Dataset(None, extxyz) def align_axes(self): @@ -393,25 +325,15 @@ def assign_identifiers( return result @classmethod - def load( - cls, - path_xyz: Union[Path, str], - ) -> Dataset: + def load(cls, path_xyz: Union[Path, str]) -> Dataset: """ Load a dataset from a file. - - Args: - path_xyz: Path to the XYZ file. - - Returns: - Dataset: Loaded dataset. """ path_xyz = psiflow.resolve_and_check(Path(path_xyz)) assert path_xyz.exists() # needs to be locally accessible - return cls(None, extxyz=File(str(path_xyz))) + return cls(extxyz=File(path_xyz)) -@typeguard.typechecked def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]: """ Concatenate multiple lists of arrays. @@ -459,7 +381,6 @@ def pad_arrays( concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"]) -@typeguard.typechecked def _aggregate_multiple( *arrays_list, coefficients: Optional[np.ndarray] = None, @@ -493,7 +414,6 @@ def _aggregate_multiple( @join_app -@typeguard.typechecked def batch_apply( apply_apps: tuple[Union[PythonApp, Callable]], arg: Union[Dataset, list[Geometry]], @@ -560,7 +480,6 @@ def get_length(arg): return 1 -@typeguard.typechecked def compute( arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], *apply_apps: Union[PythonApp, Callable], @@ -623,7 +542,6 @@ def compute( return [future[i] for i in range(len(outputs_))] -@typeguard.typechecked class Computable: """ Base class for computable objects. diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py index 0f54936..1ca0e03 100644 --- a/psiflow/data/utils.py +++ b/psiflow/data/utils.py @@ -1,119 +1,94 @@ import re import shutil -from typing import Optional, Union, Sequence +import subprocess +from pathlib import Path +from typing import Optional, Union, Sequence, Generator, TypeAlias import numpy as np import typeguard from ase.data import atomic_numbers, chemical_symbols -from parsl.app.app import python_app +from parsl import python_app, File from parsl.dataflow.futures import AppFuture -from psiflow.geometry import Geometry, NullState, _assign_identifier, create_outputs +from psiflow.geometry import Geometry, get_atomic_energy, get_unique_numbers + +# TODO: Note: This function is wrapped as a Parsl app and executed using the default_threads executor. +# TODO: loosen type hints? +# TODO: inputs/outputs for apps sometimes behave weird + + +FileLike: TypeAlias = str | Path | File + +def iter_read_frames(file: FileLike) -> Generator[list[str]]: + """Yields text data to instantiate geometries""" + frame_regex = re.compile(r"^\d+$") + with open(file, "r") as f: + for line in f: + if frame_regex.match(_ := line.strip()): + n = int(_) + yield [line] + [f.readline() for _i in range(n + 1)] -@typeguard.typechecked def _write_frames( - *states: Geometry, - extra_states: Union[Geometry, list[Geometry], None] = None, - outputs: list = [], + *states: Geometry | list[Geometry], outputs: Sequence[File] = () ) -> None: """ Write Geometry instances to a file. Args: - *states: Variable number of Geometry instances to write. - extra_states: Additional Geometry instance(s) to write. + states: Variable number of (lists of) Geometry instances to write. outputs: List of Parsl futures. The first element should be a DataFuture representing the output file path. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - all_states = list(states) - if extra_states is not None: - if isinstance(extra_states, list): - all_states += extra_states - else: # single geometry - all_states.append(extra_states) + assert len(outputs) == 1 + data = [] + for d in states: + if isinstance(d, list): + data.extend(d) + else: + data.append(d) with open(outputs[0], "w") as f: - for state in all_states: # avoid double newline by using strip! - f.write(state.to_string().strip() + "\n") + f.write("".join([geom.to_string() for geom in data])) write_frames = python_app(_write_frames, executors=["default_threads"]) -@typeguard.typechecked def _read_frames( - indices: Union[None, slice, list[int], int] = None, - inputs: list = [], - outputs: list = [], -) -> Optional[list[Geometry]]: + file: FileLike, indices: Optional[slice | list[int] | int] = None +) -> list[Geometry]: """ Read Geometry instances from a file. Args: + file: DataFuture representing the input file path containing the geometry data. indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing the geometry data. - outputs: List of Parsl futures. If provided, the first element should be - a DataFuture representing the output file path where the selected - geometries will be written. - Returns: - Optional[list[Geometry]]: List of read Geometry instances if no output - is specified, otherwise None. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - frame_index = 0 - frame_regex = re.compile(r"^\d+$") - length = _count_frames(inputs=inputs) - _all = range(length) + if indices is None: + # read everything + data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)] + return [Geometry.from_string(s) for s in data] + + # figure out what frames to read + # TODO: reads twice + indices always wrap modulo nframes which might be unexpected + length = _count_frames(file) if isinstance(indices, slice): - indices = list(_all[indices]) + indices = list(range(length)[indices]) elif isinstance(indices, int): - indices = [list(_all)[indices]] - - if isinstance(indices, list): - if length > 0: - indices = [i % length for i in indices] # for negative indices and wrapping - indices_ = set(indices) # for *much* faster 'i in indices' - else: - assert indices is None - indices_ = None - - data = [] - with open(inputs[0], "r") as f: - while True: - line = f.readline() - if not line: - break - if frame_regex.match(line.strip()): - natoms = int(line.strip()) + indices = [indices] - # currently at position frame_index, check if to be read - _ = [f.readline() for _i in range(natoms + 1)] - if indices_ is None or frame_index in indices_: - data.append("".join([line] + _)) - else: - data.append(None) - frame_index += 1 + if length > 0: + indices = [i % length for i in indices] # for negative indices and wrapping + indices_ = set(indices) # for *much* faster 'i in indices' - if indices is not None: # sort states accordingly - data = [data[i] for i in indices] + data = {} + for i, geom_str_list in enumerate(iter_read_frames(file)): + if i in indices_: + data[i] = Geometry.from_string("".join(geom_str_list)) - if len(outputs) > 0: - with open(outputs[0], "w") as f: - f.write("\n".join([d.strip() for d in data if d is not None])) - f.write("\n") - else: - geometries = [Geometry.from_string(s) for s in data if s is not None] - return geometries + # return in original order + return [data[i] for i in indices] read_frames = python_app(_read_frames, executors=["default_threads"]) @@ -365,11 +340,7 @@ def _assign_identifiers( assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"]) -@typeguard.typechecked -def _join_frames( - inputs: list = [], - outputs: list = [], -): +def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None: """ Join multiple frame files into a single file. @@ -378,12 +349,6 @@ def _join_frames( representing an input file path containing geometry data. outputs: List of Parsl futures. The first element should be a DataFuture representing the output file path where joined frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ assert len(outputs) == 1 @@ -396,157 +361,101 @@ def _join_frames( join_frames = python_app(_join_frames, executors=["default_threads"]) -@typeguard.typechecked -def _count_frames(inputs: list = []) -> int: - """ - Count the number of frames in a file. +def _count_frames(file: FileLike) -> int: + """Count the number of frames in a file.""" + path = Path(file) + threshold = 10 * 1024 * 1024 # 10 MB in bytes + file_size = path.stat().st_size - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - - Returns: - int: Number of frames in the file. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - nframes = 0 - frame_regex = re.compile(r"^\d+$") - with open(inputs[0], "r") as f: - for line in f: - if frame_regex.match(line.strip()): - nframes += 1 - natoms = int(line.strip()) - _ = [f.readline() for _i in range(natoms + 1)] - return nframes + if file_size > threshold: # use grep + cmd = f"grep -Fc Properties {path}" + result = subprocess.check_output(cmd.split()) + return int(result.strip()) + return len([_ for _ in iter_read_frames(path)]) count_frames = python_app(_count_frames, executors=["default_threads"]) -@typeguard.typechecked -def _reset_frames(inputs: list = [], outputs: list = []) -> None: +def _reset_frames(file: FileLike, outputs: Sequence[File] = ()) -> None: """ Reset all frames in a file. Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. + file: DataFuture representing the input file path containing the geometry data. outputs: List of Parsl futures. The first element should be a DataFuture representing the output file path where reset frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - data = _read_frames(inputs=[inputs[0]]) + assert len(outputs) == 1 + data = _read_frames(file) for geometry in data: geometry.reset() - _write_frames(*data, outputs=[outputs[0]]) + _write_frames(*data, outputs=outputs) reset_frames = python_app(_reset_frames, executors=["default_threads"]) -@typeguard.typechecked -def _clean_frames(inputs: list = [], outputs: list = []) -> None: +def _clean_frames(file: FileLike, outputs: Sequence[File] = ()) -> None: """ Clean all frames in a file. Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. + file: DataFuture representing the input file path containing the geometry data. outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where cleaned frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. + representing the output file path where reset frames will be written. """ - data = _read_frames(inputs=[inputs[0]]) + assert len(outputs) == 1 + data = _read_frames(file) for geometry in data: geometry.clean() - _write_frames(*data, outputs=[outputs[0]]) + _write_frames(*data, outputs=outputs) clean_frames = python_app(_clean_frames, executors=["default_threads"]) -@typeguard.typechecked def _apply_offset( - subtract: bool, - inputs: list = [], - outputs: list = [], - **atomic_energies: float, + file: FileLike, subtract: bool, outputs: Sequence[File] = (), **atomic_energies: float ) -> None: """ Apply an energy offset to all frames in a file. Args: + file: DataFuture representing the input file path containing the geometry data. subtract: Whether to subtract or add the offset. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. outputs: List of Parsl futures. The first element should be a DataFuture representing the output file path where updated frames will be written. **atomic_energies: Atomic energies for each element. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - assert len(inputs) == 1 assert len(outputs) == 1 - data = _read_frames(inputs=[inputs[0]]) - numbers = [atomic_numbers[e] for e in atomic_energies.keys()] - all_numbers = [n for geometry in data for n in set(geometry.per_atom.numbers)] - for n in all_numbers: - if n != 0: # from NullState - assert n in numbers - for geometry in data: - if geometry == NullState: - continue - natoms = len(geometry) - energy = geometry.energy - for number in numbers: - natoms_per_number = np.sum(geometry.per_atom.numbers == number) - if natoms_per_number == 0: - continue - element = chemical_symbols[number] - multiplier = -1 if subtract else 1 - energy += multiplier * natoms_per_number * atomic_energies[element] - natoms -= natoms_per_number - assert natoms == 0 # all atoms accounted for - geometry.energy = energy - _write_frames(*data, outputs=[outputs[0]]) + frames = _read_frames(file) + numbers_data = get_unique_numbers(frames) + numbers_kwargs = {atomic_numbers[e] for e in atomic_energies.keys()} + assert numbers_data == numbers_kwargs, "Provide atomic energies for all elements.." + + for geom in frames: + energy = get_atomic_energy(geom, atomic_energies) + if subtract: + geom.energy -= energy + else: + geom.energy += energy + + _write_frames(frames, outputs=outputs) apply_offset = python_app(_apply_offset, executors=["default_threads"]) -@typeguard.typechecked -def _get_elements(inputs: list = []) -> set[str]: +def _get_elements(*files: File) -> set[str]: """ - Get the set of elements present in all frames of a file. + Get the set of elements present in all frames of a sequence of file. Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - - Returns: - set[str]: Set of element symbols. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. + inputs: List of Parsl DataFuture """ - data = _read_frames(inputs=[inputs[0]]) - return set([chemical_symbols[n] for g in data for n in g.per_atom.numbers]) + frames = [geom for file in files for geom in _read_frames(file)] + return {chemical_symbols[i] for i in get_unique_numbers(frames)} get_elements = python_app(_get_elements, executors=["default_threads"]) @@ -569,7 +478,7 @@ def _align_axes(inputs: list = [], outputs: list = []) -> None: Note: This function is wrapped as a Parsl app and executed using the default_threads executor. """ - data = _read_frames(inputs=[inputs[0]]) + data = _read_frames(inputs[0]) for geometry in data: geometry.align_axes() _write_frames(*data, outputs=[outputs[0]]) @@ -676,32 +585,22 @@ def _app_filter( ) # filter is protected -@typeguard.typechecked def _shuffle( - inputs: list = [], - outputs: list = [], + file: FileLike, + outputs: Sequence[File] = (), ) -> None: """ Shuffle the order of frames in a file. Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. + file: DataFuture representing the input file path containing the geometry data. outputs: List of Parsl futures. The first element should be a DataFuture representing the output file path where shuffled frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - data = _read_frames(inputs=[inputs[0]]) - indices = np.arange(len(data)) - np.random.shuffle(indices) - - shuffled = [data[int(i)] for i in indices] - _write_frames(*shuffled, outputs=[outputs[0]]) + assert len(outputs) == 1 + frames = _read_frames(file) + np.random.shuffle(frames) + _write_frames(frames, outputs=outputs) shuffle = python_app(_shuffle, executors=["default_threads"]) From e825ea655cb94e09bd3250d53d34f48b9d834609 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 1 Apr 2026 23:20:00 +0200 Subject: [PATCH 03/16] dataset update v1 - Dataset.get simplified - cannot filter non per-atom attributes - Dataset.get_per_atom now does filtering bases on ids or element - Dataset.evaluate has got to go, I think - Dataset.not_null removed reimplemented _extract_quantities, _insert_quantities: - arbitrary quantities - no longer forces everything into arrays - use MISSING vaue to indicate missing data more changes in data.utils, but not different functionality --- psiflow/data/dataset.py | 101 +++------ psiflow/data/utils.py | 438 ++++++++++++++-------------------------- 2 files changed, 187 insertions(+), 352 deletions(-) diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index b67003a..9df0142 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -25,11 +25,11 @@ get_train_valid_indices, insert_quantities, join_frames, - not_null, read_frames, reset_frames, shuffle, write_frames, + extract_quantities_per_atom, ) @@ -171,55 +171,46 @@ def clean(self) -> Dataset: """ Clean all structures in the dataset. """ - file = psiflow.context().new_file("data_", ".xyz") future = clean_frames(self.extxyz, outputs=[file]) return Dataset(extxyz=future.outputs[0]) - def get( + def get(self, *quantities: str) -> tuple[AppFuture, ...]: + """ + Extract specified quantities from the dataset. + """ + future_states = read_frames(self.extxyz) + future_dict = extract_quantities(future_states, quantities) + return tuple(future_dict[q] for q in quantities) + + def get_per_atom( self, *quantities: str, atom_indices: Optional[list[int]] = None, elements: Optional[list[str]] = None, - ): + ) -> tuple[AppFuture, ...]: """ - Extract specified quantities from the dataset. + Extract specified per atom quantities from the dataset. Args: *quantities: Names of quantities to extract. atom_indices: Optional list of atom indices to consider. elements: Optional list of element symbols to consider. - - Returns: - Union[AppFuture, tuple[AppFuture, ...]]: Future(s) representing the extracted quantities. """ - result = extract_quantities( - quantities, - atom_indices, - elements, - inputs=[self.extxyz], + future_states = read_frames(self.extxyz) + future_dict = extract_quantities_per_atom( + future_states, quantities, atom_indices, elements ) - if len(quantities) == 1: - return result[0] - else: - return tuple([result[i] for i in range(len(quantities))]) + return tuple(future_dict[q] for q in quantities) def evaluate( - self, - computable: Computable, - batch_size: Optional[int] = None, + self, computable: Computable, batch_size: Optional[int] = None ) -> Dataset: """ Evaluate a Computable on the dataset. - - Args: - computable: Computable object to evaluate. - batch_size: Optional batch size for evaluation. - - Returns: - Dataset: A new Dataset with evaluation results. """ # TODO: remove this functionality? + # or this should be the (only) way to label a dataset? from psiflow.hamiltonians import Hamiltonian if not isinstance(computable, Hamiltonian): @@ -233,58 +224,30 @@ def evaluate( outputs = computable.compute(self) # use default from computable if not isinstance(outputs, list): # compute unpacks for only one property outputs = [outputs] - future = insert_quantities( - quantities=tuple(computable.outputs), - arrays=pack(*outputs), - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ) - return Dataset(None, future.outputs[0]) + data = {k: v for k, v in zip(computable.outputs, outputs)} - def filter( - self, - quantity: str, - ) -> Dataset: - """ - Filter the dataset based on a specified quantity. - - Args: - quantity: The quantity to filter on. + file = psiflow.context().new_file("data_", ".xyz") - Returns: - Dataset: A new Dataset containing only structures that pass the filter. - """ - assert quantity in QUANTITIES - extxyz = app_filter( - quantity, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + future = read_frames(self.extxyz) + future = insert_quantities(future, data) + extxyz = write_frames(future, outputs=[file]).outputs[0] + return Dataset(extxyz=extxyz) - def not_null(self) -> Dataset: + def filter(self, quantity: str) -> Dataset: """ - Remove null states from the dataset. - - Returns: - Dataset: A new Dataset without null states. + Filter the dataset based on a specified quantity. """ file = psiflow.context().new_file("data_", ".xyz") - extxyz = not_null(inputs=[self.extxyz], outputs=[file]).outputs[0] - return Dataset(None, extxyz) + future = app_filter(self.extxyz, quantity, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) - def align_axes(self): + def align_axes(self) -> Dataset: """ Adopt a canonical orientation for all (periodic) structures in the dataset. - - Returns: - Dataset: A new Dataset with aligned structures. """ - extxyz = align_axes( - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ).outputs[0] - return Dataset(None, extxyz) + file = psiflow.context().new_file("data_", ".xyz") + future = align_axes(self.extxyz, outputs=[file]) + return Dataset(extxyz=future.outputs[0]) def split(self, fraction, shuffle=True): # auto-shuffles """ diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py index 1ca0e03..3f77308 100644 --- a/psiflow/data/utils.py +++ b/psiflow/data/utils.py @@ -19,6 +19,20 @@ FileLike: TypeAlias = str | Path | File + +class MissingType: + """Placeholder sentinel for missing data fields""" + + def __repr__(self): + return "" + + def __bool__(self): + return False + + +MISSING = MissingType() + + def iter_read_frames(file: FileLike) -> Generator[list[str]]: """Yields text data to instantiate geometries""" frame_regex = re.compile(r"^\d+$") @@ -94,166 +108,96 @@ def _read_frames( read_frames = python_app(_read_frames, executors=["default_threads"]) -@typeguard.typechecked def _extract_quantities( - quantities: tuple[str, ...], - atom_indices: Optional[list[int]], - elements: Optional[list[str]], - *extra_data: Geometry, - inputs: list = [], -) -> tuple[np.ndarray, ...]: + states: Sequence[Geometry], quantities: Sequence[str] +) -> dict[str, list]: """ Extract specified quantities from Geometry instances. - - Args: - quantities: Tuple of quantity names to extract. - atom_indices: List of atom indices to consider. - elements: List of element symbols to consider. - *extra_data: Additional Geometry instances. - inputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the input file path containing geometry data. - - Returns: - tuple[np.ndarray, ...]: Tuple of arrays containing extracted quantities. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - if not len(extra_data): - assert len(inputs) == 1 - data = _read_frames(inputs=inputs) - else: - assert len(inputs) == 0 - data = list(extra_data) - order_names = list(set([k for g in data for k in g.order])) - natoms = np.array([len(geometry) for geometry in data], dtype=int) - max_natoms = np.max(natoms) - - arrays = create_outputs(quantities, data) - for i, geometry in enumerate(data): - mask = get_index_element_mask( - geometry.per_atom.numbers, - atom_indices, - elements, - natoms_padded=int(max_natoms), - ) - natoms = len(geometry) - for j, quantity in enumerate(quantities): - if quantity == "positions": - arrays[j][i, mask, :] = geometry.per_atom.positions[mask[:natoms]] - elif quantity == "forces": - arrays[j][i, mask, :] = geometry.per_atom.forces[mask[:natoms]] - elif quantity == "cell": - arrays[j][i, :, :] = geometry.cell - elif quantity == "stress": - if geometry.stress is not None: - arrays[j][i, :, :] = geometry.stress - elif quantity == "numbers": - arrays[j][i, :] = geometry.numbers - elif quantity == "energy": - if geometry.energy is not None: - arrays[j][i] = geometry.energy - elif quantity == "delta": - if geometry.delta is not None: - arrays[j][i] = geometry.delta - elif quantity == "per_atom_energy": - if geometry.energy is not None: - arrays[j][i] = geometry.per_atom_energy - elif quantity == "phase": - if geometry.phase is not None: - arrays[j][i] = geometry.phase - elif quantity == "logprob": - if geometry.logprob is not None: - arrays[j][i, :] = geometry.logprob - elif quantity == "identifier": - if geometry.identifier is not None: - arrays[j][i] = geometry.identifier - elif quantity in order_names: - if quantity in geometry.order: - arrays[j][i] = geometry.order[quantity] - return tuple(arrays) + data = {k: [] for k in quantities} + for k in quantities: + if k in ("cell", "energy", "stress"): + for geom in states: + value = getattr(geom, k, MISSING) + data[k].append(value) + continue + for geom in states: + value = getattr(geom.per_atom, k, MISSING) + if value is MISSING: + value = getattr(geom.meta, k, MISSING) + data[k].append(value) + return data extract_quantities = python_app(_extract_quantities, executors=["default_threads"]) -@typeguard.typechecked def _insert_quantities( - quantities: tuple[str, ...], - arrays: Sequence[np.ndarray], - data: Optional[list[Geometry]] = None, - inputs: list = [], - outputs: list = [], -) -> None: + states: Sequence[Geometry], data: dict[str, list] +) -> Sequence[Geometry]: """ - Insert quantities into Geometry instances. - - Args: - quantities: Tuple of quantity names to insert. - arrays: List of arrays containing the quantities to insert. - data: List of Geometry instances to update. - inputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the output file path where updated geometries will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. + Insert quantities from data into Geometry instances. """ - if data is None: - assert len(inputs) == 1 - data = _read_frames(inputs=inputs) - else: - assert len(inputs) == 0 - max_natoms = max([len(geometry) for geometry in data]) + for q, values in data.items(): + assert len(states) == len(values) - for i, geometry in enumerate(data): - if geometry == NullState: + if q in ("cell", "energy", "stress"): + for geom, v in zip(states, values): + setattr(geom, q, v) continue - mask = get_index_element_mask( - geometry.per_atom.numbers, - None, - None, - natoms_padded=int(max_natoms), - ) - natoms = len(geometry) - for j, quantity in enumerate(quantities): - if quantity == "positions": - geometry.per_atom.positions[:natoms, :] = arrays[j][i, mask, :] - elif quantity == "forces": - geometry.per_atom.forces[mask[:natoms]] = arrays[j][i, mask, :] - elif quantity == "cell": - geometry.cell = arrays[j][i, :, :] - elif quantity == "stress": - geometry.stress = arrays[j][i, :, :] - elif quantity == "numbers": - geometry.numbers = arrays[j][i, :] - elif quantity == "energy": - geometry.energy = arrays[j][i] - elif quantity == "delta": - geometry.delta = arrays[j][i] - elif quantity == "per_atom_energy": - geometry.per_atom_energy = arrays[j][i] - elif quantity == "phase": - geometry.phase = arrays[j][i] - elif quantity == "logprob": - geometry.logprob = arrays[j][i, :] - elif quantity == "identifier": - geometry.identifier = arrays[j][i] - elif quantity in geometry.order: - geometry.order[quantity] = arrays[j][i] + + for geom, v in zip(states, values): + if isinstance(v, np.ndarray) and len(geom) == len(v): + setattr(geom.per_atom, q, v) else: - raise ValueError("unknown quantity {}".format(quantity)) - if len(outputs) > 0: - _write_frames(*data, outputs=[outputs[0]]) + setattr(geom.meta, q, v) + + return states insert_quantities = python_app(_insert_quantities, executors=["default_threads"]) +def _extract_quantities_per_atom( + states: Sequence[Geometry], + quantities: Sequence[str], + atom_indices: Optional[Sequence[int]] = None, + elements: Optional[Sequence[str]] = None, +) -> dict[str, list]: + """ + Extract per atom quantities from Geometry instances, filtering based on atom_indices or elements. + """ + if atom_indices is None and elements is None: + # no filtering + data = {} + for k in quantities: + data[k] = [getattr(geom.per_atom, k, MISSING) for geom in states] + return data + + data = {k: [] for k in quantities} + numbers = {atomic_numbers[s] for s in elements or ()} + for geom in states: + # mask out unwanted rows + mask = np.zeros(len(geom), dtype=bool) + for n in numbers: + mask[geom.numbers == n] = True + if atom_indices is not None: + mask[atom_indices] = True + + for k in quantities: + value = getattr(geom.per_atom, k, MISSING) + if not value is MISSING: + value = value[mask] + data[k].append(value) + + return data + + +extract_quantities_per_atom = python_app( + _extract_quantities_per_atom, executors=["default_threads"] +) + + @typeguard.typechecked def _check_distances(state: Geometry, threshold: float) -> Geometry: """ @@ -416,7 +360,10 @@ def _clean_frames(file: FileLike, outputs: Sequence[File] = ()) -> None: def _apply_offset( - file: FileLike, subtract: bool, outputs: Sequence[File] = (), **atomic_energies: float + file: FileLike, + subtract: bool, + outputs: Sequence[File] = (), + **atomic_energies: float, ) -> None: """ Apply an energy offset to all frames in a file. @@ -461,128 +408,52 @@ def _get_elements(*files: File) -> set[str]: get_elements = python_app(_get_elements, executors=["default_threads"]) -@typeguard.typechecked -def _align_axes(inputs: list = [], outputs: list = []) -> None: +def _align_axes(file: FileLike, outputs: Sequence[File] = ()) -> None: """ Align axes for all frames in a file. Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. + file: DataFuture representing the input file path containing the geometry data. outputs: List of Parsl futures. The first element should be a DataFuture representing the output file path where aligned frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - data = _read_frames(inputs[0]) + assert len(outputs) == 1 + data = _read_frames(file) for geometry in data: geometry.align_axes() - _write_frames(*data, outputs=[outputs[0]]) + _write_frames(*data, outputs=outputs) align_axes = python_app(_align_axes, executors=["default_threads"]) -@typeguard.typechecked -def _not_null(inputs: list = [], outputs: list = []) -> list[bool]: - """ - Check which frames in a file are not null states. - - Args: - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. If provided, the first element should be a DataFuture - representing the output file path where non-null frames will be written. - - Returns: - list[bool]: List of boolean values indicating non-null states. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. +def _app_filter(file: FileLike, quantity: str, outputs: Sequence[File] = ()) -> None: """ - frame_regex = re.compile(r"^\d+$") - - data = [] - mask = [] - with open(inputs[0], "r") as f: - while True: - line = f.readline() - if not line: - break - if frame_regex.match(line.strip()): - natoms = int(line.strip()) - _ = [f.readline() for _i in range(natoms + 1)] - if natoms == 1: - if _[-1].strip()[0] == "X": # check if element(-1) == X - mask.append(False) - continue - data.append("".join([line] + _)) - mask.append(True) - if len(outputs) > 0: - with open(outputs[0], "w") as f: - f.write("\n".join(data)) - return mask - - -not_null = python_app(_not_null, executors=["default_threads"]) - - -@typeguard.typechecked -def _app_filter( - quantity: str, - inputs: list = [], - outputs: list = [], -) -> None: + Filter frames based on a specified quantity and writes to first output. """ - Filter frames based on a specified quantity. - - Args: - quantity: The quantity to filter on. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where filtered frames will be written. - - Returns: - None - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(inputs=[inputs[0]]) - i = 0 - while i < len(data): - if quantity == "forces": - if np.all(np.invert(np.isnan(data[i].per_atom.forces))): - retain = True - else: - retain = False - elif quantity == "cell": - if not np.allclose(data[i].cell, 0.0): - retain = True - else: - retain = False - elif hasattr(data[i], quantity) and getattr(data[i], quantity) is not None: - retain = True - else: - retain = False + # TODO: where is this used? + data = _read_frames(file) + out = [] + + # None does not count + for geom in data: + if (quantity in ("cell", "energy", "stress")) and getattr( + geom, quantity, None + ) is not None: + out.append(geom) + continue - # pop if necessary: - if retain: - i += 1 - else: - data.pop(i) + value = getattr(geom.per_atom, quantity, None) + if value is None: + value = getattr(geom.meta, quantity, None) + if value is not None: + out.append(geom) - _write_frames(*data, outputs=[outputs[0]]) + _write_frames(out, outputs=outputs) -app_filter = python_app( - _app_filter, executors=["default_threads"] -) # filter is protected +# filter is protected +app_filter = python_app(_app_filter, executors=["default_threads"]) def _shuffle( @@ -662,46 +533,47 @@ def get_train_valid_indices( return future[0], future[1] -@typeguard.typechecked -def get_index_element_mask( - numbers: np.ndarray, - atom_indices: Optional[list[int]], - elements: Optional[list[str]], - natoms_padded: Optional[int] = None, -) -> np.ndarray: - """ - Generate a mask for atom indices and elements. - - Args: - numbers: Array of atomic numbers. - atom_indices: List of atom indices to include. - elements: List of element symbols to include. - natoms_padded: Total number of atoms including padding. - - Returns: - np.ndarray: Boolean mask array. - """ - mask = np.array([True] * len(numbers)) - - if elements is not None: - numbers_to_include = [atomic_numbers[e] for e in elements] - mask_elements = np.array([False] * len(numbers)) - for number in numbers_to_include: - mask_elements = np.logical_or(mask_elements, (numbers == number)) - mask = np.logical_and(mask, mask_elements) - - if natoms_padded is not None: - assert natoms_padded >= len(numbers) - padding = natoms_padded - len(numbers) - mask = np.concatenate((mask, np.array([False] * padding)), axis=0).astype(bool) - - if atom_indices is not None: # below padding - mask_indices = np.array([False] * len(mask)) - mask_indices[np.array(atom_indices)] = True - mask = np.logical_and(mask, mask_indices) - - return mask - +# @typeguard.typechecked +# def get_index_element_mask( +# numbers: np.ndarray, +# atom_indices: Optional[list[int]], +# elements: Optional[list[str]], +# natoms_padded: Optional[int] = None, +# ) -> np.ndarray: +# """ +# Generate a mask for atom indices and elements. +# +# Args: +# numbers: Array of atomic numbers. +# atom_indices: List of atom indices to include. +# elements: List of element symbols to include. +# natoms_padded: Total number of atoms including padding. +# +# Returns: +# np.ndarray: Boolean mask array. +# """ +# mask = np.array([True] * len(numbers)) +# +# if elements is not None: +# numbers_to_include = [atomic_numbers[e] for e in elements] +# mask_elements = np.array([False] * len(numbers)) +# for number in numbers_to_include: +# mask_elements = np.logical_or(mask_elements, (numbers == number)) +# mask = np.logical_and(mask, mask_elements) +# +# if natoms_padded is not None: +# assert natoms_padded >= len(numbers) +# padding = natoms_padded - len(numbers) +# mask = np.concatenate((mask, np.array([False] * padding)), axis=0).astype(bool) +# +# if atom_indices is not None: # below padding +# mask_indices = np.array([False] * len(mask)) +# mask_indices[np.array(atom_indices)] = True +# mask = np.logical_and(mask, mask_indices) +# +# return mask + +# TODO: these do not belong here @typeguard.typechecked def _compute_rmse( From 40fd431e2d07be0b236c445685122898af86adde Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Thu, 2 Apr 2026 11:37:03 +0200 Subject: [PATCH 04/16] some restructuring --- psiflow/abandonware.py | 51 +++++ psiflow/compute.py | 338 ++++++++++++++++++++++++++++ psiflow/data/__init__.py | 3 +- psiflow/data/dataset.py | 270 +--------------------- psiflow/data/utils.py | 472 +++++++++++++-------------------------- psiflow/geometry.py | 123 +--------- 6 files changed, 563 insertions(+), 694 deletions(-) create mode 100644 psiflow/abandonware.py create mode 100644 psiflow/compute.py diff --git a/psiflow/abandonware.py b/psiflow/abandonware.py new file mode 100644 index 0000000..5d9f9b0 --- /dev/null +++ b/psiflow/abandonware.py @@ -0,0 +1,51 @@ +""" +TODO: these functions seem to not be used anywhere in the codebase + probably a good idea to remove these eventually +""" + + +def _check_equality(state0: Geometry, state1: Geometry) -> bool: + """Check if two Geometry instances are equal""" + return state0 == state1 + + +check_equality = python_app(_check_equality, executors=["default_threads"]) + + +@typeguard.typechecked +def _check_distances(state: Geometry, threshold: float) -> Geometry: + """ + Check if all interatomic distances in a Geometry are above a threshold. + + Args: + state: Geometry instance to check. + threshold: Minimum allowed interatomic distance. + + Returns: + Geometry: The input Geometry if all distances are above the threshold, otherwise NullState. + + Note: + This function is wrapped as a Parsl app and executed using the default_htex executor. + """ + from ase.geometry.geometry import find_mic + + if state == NullState: + return NullState + nrows = int(len(state) * (len(state) - 1) / 2) + deltas = np.zeros((nrows, 3)) + count = 0 + for i in range(len(state) - 1): + for j in range(i + 1, len(state)): + deltas[count] = state.per_atom.positions[i] - state.per_atom.positions[j] + count += 1 + assert count == nrows + if state.periodic: + deltas, _ = find_mic(deltas, state.cell) + check = np.all(np.linalg.norm(deltas, axis=1) > threshold) + if check: + return state + else: + return NullState + + +check_distances = python_app(_check_distances, executors=["default_htex"]) diff --git a/psiflow/compute.py b/psiflow/compute.py new file mode 100644 index 0000000..110ab33 --- /dev/null +++ b/psiflow/compute.py @@ -0,0 +1,338 @@ +import math +from typing import Callable, ClassVar, Optional, Union + +import numpy as np +import typeguard +from parsl import python_app +from parsl.app.app import join_app, python_app +from parsl.app.python import PythonApp +from parsl.dataflow.futures import AppFuture, DataFuture + +import psiflow +from psiflow.geometry import Geometry +from psiflow.data import Dataset +from psiflow.data.utils import batch_frames + + +def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]: + """ + Concatenate multiple lists of arrays. + + Args: + *args: Lists of numpy arrays to concatenate. + + Returns: + list[np.ndarray]: List of concatenated arrays. + + Note: + This function is wrapped as a Parsl app and executed using the default_threads executor. + """ + + def pad_arrays( + arrays: list[np.ndarray], + pad_dimension: int = 1, + ) -> list[np.ndarray]: + ndims = np.array([len(a.shape) for a in arrays]) + assert np.all(ndims == ndims[0]) + assert np.all(pad_dimension < ndims) + + pad_size = max([a.shape[pad_dimension] for a in arrays]) + for i in range(len(arrays)): + shape = list(arrays[i].shape) + shape[pad_dimension] = pad_size - shape[pad_dimension] + padding = np.zeros(tuple(shape)) + np.nan + arrays[i] = np.concatenate((arrays[i], padding), axis=pad_dimension) + return arrays + + narrays = len(args[0]) + for arg in args: + assert isinstance(arg, list) + assert all([len(a) == narrays for a in args]) + + concatenated = [] + for i in range(narrays): + arrays = [arg[i] for arg in args] + if len(arrays[0].shape) > 1: + pad_arrays(arrays) + concatenated.append(np.concatenate(tuple(arrays))) + return concatenated + + +concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"]) + + +def _aggregate_multiple( + *arrays_list, + coefficients: Optional[np.ndarray] = None, +) -> list[np.ndarray]: + """ + Aggregate multiple lists of arrays with optional coefficients. + + Args: + *arrays_list: Lists of arrays to aggregate. + coefficients: Optional coefficients for weighted aggregation. + + Returns: + list[np.ndarray]: List of aggregated arrays. + + Note: + This function is wrapped as a Parsl app and executed using the default_threads executor. + """ + if coefficients is None: + coefficients = np.ones(len(arrays_list)) + else: + assert len(coefficients) == len(arrays_list) + + results = [np.zeros(a.shape) for a in arrays_list[0]] + for i, arrays in enumerate(arrays_list): + for j, array in enumerate(arrays): + results[j] += coefficients[i] * array + return results + + +aggregate_multiple = python_app(_aggregate_multiple, executors=["default_threads"]) + + +@join_app +def batch_apply( + apply_apps: tuple[Union[PythonApp, Callable]], + arg: Union[Dataset, list[Geometry]], + batch_size: int, + length: int, + outputs: list = [], + reduce_func: Optional[PythonApp] = None, + **app_kwargs, +) -> AppFuture: + """ + Apply a set of apps to batches of data. + + Args: + apply_apps: Tuple of PythonApps or Callables to apply. + arg: Dataset or list of Geometries to process. + batch_size: Size of each batch. + length: Total number of items to process. + outputs: List of output files. + reduce_func: Optional function to reduce results. + **app_kwargs: Additional keyword arguments for the apps. + + Returns: + AppFuture: Future representing the result of batch application. + + Note: + This function is wrapped as a Parsl join_app. + """ + nbatches = math.ceil(length / batch_size) + batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)] + future = batch_frames(batch_size, inputs=[arg.extxyz], outputs=batches) + output_futures = [] + for i in range(nbatches): + futures = [] + for app in apply_apps: + f = app( + None, + inputs=[future.outputs[i]], + **app_kwargs, + ) + futures.append(f) + reduced = reduce_func(*futures) + output_futures.append(reduced) + future = concatenate_multiple(*output_futures) + return future + + +@python_app(executors=["default_threads"]) +def get_length(arg): + """ + Get the length of the input argument. + + Args: + arg: Input to get the length of. + + Returns: + int: Length of the input. + + Note: + This function is wrapped as a Parsl app and executed using the default_threads executor. + """ + if isinstance(arg, list): + return len(arg) + else: + return 1 + + +def compute( + arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], + *apply_apps: Union[PythonApp, Callable], + outputs_: Union[str, list[str], tuple[str, ...], None] = None, + reduce_func: Union[PythonApp, Callable] = aggregate_multiple, + batch_size: Optional[int] = None, +) -> Union[list[AppFuture], AppFuture]: + """ + Compute results by applying apps to the input data. + + Args: + arg: Input data to compute on. + *apply_apps: Apps to apply to the data. + outputs_: Names of output quantities. + reduce_func: Function to reduce results. + batch_size: Optional batch size for processing. + + Returns: + Union[list[AppFuture], AppFuture]: Future(s) representing computation results. + """ + if type(outputs_) is str: + outputs_ = [outputs_] + if batch_size is not None: + if isinstance(arg, Dataset): + length = arg.length() + else: + length = get_length(arg) + # convert to Dataset for convenience + arg = Dataset(arg) + future = batch_apply( + apply_apps, + arg, + batch_size, + length, + outputs_=outputs_, + reduce_func=reduce_func, + ) + else: + futures = [] + if isinstance(arg, Dataset): + for app in apply_apps: + future = app( + None, + outputs_=outputs_, + inputs=[arg.extxyz], + ) + futures.append(future) + else: + for app in apply_apps: + future = app( + arg, + outputs_=outputs_, + inputs=[], + ) + futures.append(future) + future = reduce_func(*futures) + if len(outputs_) == 1: + return future[0] + else: + return [future[i] for i in range(len(outputs_))] + + +class Computable: + """ + Base class for computable objects. + + Attributes: + outputs (ClassVar[tuple[str, ...]]): Names of output quantities. + batch_size (ClassVar[Optional[int]]): Default batch size for computation. + """ + + outputs: ClassVar[tuple[str, ...]] = () + batch_size: ClassVar[Optional[int]] = None + + def compute( + self, + arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], + *outputs: Optional[str], + batch_size: Optional[int] = -1, # if -1: take class default + ) -> Union[list[AppFuture], AppFuture]: + """ + Compute results for the given input. + + Args: + arg: Input data to compute on. + outputs: Names of output quantities. + batch_size: Batch size for computation. + + Returns: + Union[list[AppFuture], AppFuture]: Future(s) representing computation results. + """ + raise NotImplementedError + + +@typeguard.typechecked +def _compute_rmse( + array0: np.ndarray, + array1: np.ndarray, + reduce: bool = True, +) -> Union[float, np.ndarray]: + """ + Compute the Root Mean Square Error (RMSE) between two arrays. + + Args: + array0: First array. + array1: Second array. + reduce: Whether to reduce the result to a single value. + + Returns: + Union[float, np.ndarray]: RMSE value(s). + + Note: + This function is wrapped as a Parsl app and executed using the default_threads executor. + """ + assert array0.shape == array1.shape + assert np.all(np.isnan(array0) == np.isnan(array1)) + + se = (array0 - array1) ** 2 + se = se.reshape(se.shape[0], -1) + + if reduce: # across both dimensions + mask = np.logical_not(np.isnan(se)) + return float(np.sqrt(np.mean(se[mask]))) + else: # retain first dimension + if se.ndim == 1: + return se + else: + values = np.empty(len(se)) + for i in range(len(se)): + if np.all(np.isnan(se[i])): + values[i] = np.nan + else: + mask = np.logical_not(np.isnan(se[i])) + value = np.sqrt(np.mean(se[i][mask])) + values[i] = value + return values + + +compute_rmse = python_app(_compute_rmse, executors=["default_threads"]) + + +@typeguard.typechecked +def _compute_mae( + array0, + array1, + reduce: bool = True, +) -> Union[float, np.ndarray]: + """ + Compute the Mean Absolute Error (MAE) between two arrays. + + Args: + array0: First array. + array1: Second array. + reduce: Whether to reduce the result to a single value. + + Returns: + Union[float, np.ndarray]: MAE value(s). + + Note: + This function is wrapped as a Parsl app and executed using the default_threads executor. + """ + assert array0.shape == array1.shape + mask0 = np.logical_not(np.isnan(array0)) + mask1 = np.logical_not(np.isnan(array1)) + assert np.all(mask0 == mask1) + ae = np.abs(array0 - array1) + to_reduce = tuple(range(1, array0.ndim)) + mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce)) + ae = ae[mask0].reshape(np.sum(1 * mask), -1) + if reduce: # across both dimensions + return float(np.sqrt(np.mean(ae))) + else: # retain first dimension + return np.sqrt(np.mean(ae, axis=1)) + + +compute_mae = python_app(_compute_mae, executors=["default_threads"]) diff --git a/psiflow/data/__init__.py b/psiflow/data/__init__.py index b91798a..13a9804 100644 --- a/psiflow/data/__init__.py +++ b/psiflow/data/__init__.py @@ -1,2 +1 @@ -from .dataset import Computable, Dataset, aggregate_multiple, compute # noqa: F401 -from .utils import compute_mae, compute_rmse # noqa: F401 +from .dataset import Dataset diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index 9df0142..1dafd24 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -1,28 +1,22 @@ -import math from pathlib import Path -from typing import Callable, ClassVar, Optional, Union +from typing import Optional, Union -import numpy as np -from parsl.app.app import join_app, python_app -from parsl.app.python import PythonApp from parsl.data_provider.files import File from parsl.dataflow.futures import AppFuture, DataFuture import psiflow from psiflow.geometry import Geometry -from psiflow.utils.apps import copy_data_future, pack +from psiflow.utils.apps import copy_data_future from .utils import ( align_axes, app_filter, apply_offset, assign_identifiers, - batch_frames, clean_frames, count_frames, extract_quantities, get_elements, - get_train_valid_indices, insert_quantities, join_frames, read_frames, @@ -30,6 +24,7 @@ shuffle, write_frames, extract_quantities_per_atom, + split_frames, ) @@ -204,7 +199,7 @@ def get_per_atom( return tuple(future_dict[q] for q in quantities) def evaluate( - self, computable: Computable, batch_size: Optional[int] = None + self, computable: "Computable", batch_size: Optional[int] = None ) -> Dataset: """ Evaluate a Computable on the dataset. @@ -249,23 +244,21 @@ def align_axes(self) -> Dataset: future = align_axes(self.extxyz, outputs=[file]) return Dataset(extxyz=future.outputs[0]) - def split(self, fraction, shuffle=True): # auto-shuffles + def split(self, fraction: float, shuffle: bool = True) -> tuple[Dataset, Dataset]: """ Split the dataset into training and validation sets. Args: fraction: Fraction of data to use for training. shuffle: Whether to shuffle before splitting. - - Returns: - tuple[Dataset, Dataset]: Training and validation datasets. """ - train, valid = get_train_valid_indices( - self.length(), - fraction, - shuffle, + assert 0 <= fraction <= 1 + file_train = psiflow.context().new_file("data_", ".xyz") + file_val = psiflow.context().new_file("data_", ".xyz") + future = split_frames( + self.extxyz, fraction, shuffle, outputs=[file_train, file_val] ) - return self.__getitem__(train), self.__getitem__(valid) + return Dataset(extxyz=future.outputs[0]), Dataset(extxyz=future.outputs[1]) def assign_identifiers( self, identifier: Union[int, AppFuture, None] = None @@ -279,6 +272,7 @@ def assign_identifiers( Returns: AppFuture: Future representing the next available identifier. """ + # TODO: what is the use case for this? result = assign_identifiers( identifier, inputs=[self.extxyz], @@ -295,243 +289,3 @@ def load(cls, path_xyz: Union[Path, str]) -> Dataset: path_xyz = psiflow.resolve_and_check(Path(path_xyz)) assert path_xyz.exists() # needs to be locally accessible return cls(extxyz=File(path_xyz)) - - -def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]: - """ - Concatenate multiple lists of arrays. - - Args: - *args: Lists of numpy arrays to concatenate. - - Returns: - list[np.ndarray]: List of concatenated arrays. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - - def pad_arrays( - arrays: list[np.ndarray], - pad_dimension: int = 1, - ) -> list[np.ndarray]: - ndims = np.array([len(a.shape) for a in arrays]) - assert np.all(ndims == ndims[0]) - assert np.all(pad_dimension < ndims) - - pad_size = max([a.shape[pad_dimension] for a in arrays]) - for i in range(len(arrays)): - shape = list(arrays[i].shape) - shape[pad_dimension] = pad_size - shape[pad_dimension] - padding = np.zeros(tuple(shape)) + np.nan - arrays[i] = np.concatenate((arrays[i], padding), axis=pad_dimension) - return arrays - - narrays = len(args[0]) - for arg in args: - assert isinstance(arg, list) - assert all([len(a) == narrays for a in args]) - - concatenated = [] - for i in range(narrays): - arrays = [arg[i] for arg in args] - if len(arrays[0].shape) > 1: - pad_arrays(arrays) - concatenated.append(np.concatenate(tuple(arrays))) - return concatenated - - -concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"]) - - -def _aggregate_multiple( - *arrays_list, - coefficients: Optional[np.ndarray] = None, -) -> list[np.ndarray]: - """ - Aggregate multiple lists of arrays with optional coefficients. - - Args: - *arrays_list: Lists of arrays to aggregate. - coefficients: Optional coefficients for weighted aggregation. - - Returns: - list[np.ndarray]: List of aggregated arrays. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - if coefficients is None: - coefficients = np.ones(len(arrays_list)) - else: - assert len(coefficients) == len(arrays_list) - - results = [np.zeros(a.shape) for a in arrays_list[0]] - for i, arrays in enumerate(arrays_list): - for j, array in enumerate(arrays): - results[j] += coefficients[i] * array - return results - - -aggregate_multiple = python_app(_aggregate_multiple, executors=["default_threads"]) - - -@join_app -def batch_apply( - apply_apps: tuple[Union[PythonApp, Callable]], - arg: Union[Dataset, list[Geometry]], - batch_size: int, - length: int, - outputs: list = [], - reduce_func: Optional[PythonApp] = None, - **app_kwargs, -) -> AppFuture: - """ - Apply a set of apps to batches of data. - - Args: - apply_apps: Tuple of PythonApps or Callables to apply. - arg: Dataset or list of Geometries to process. - batch_size: Size of each batch. - length: Total number of items to process. - outputs: List of output files. - reduce_func: Optional function to reduce results. - **app_kwargs: Additional keyword arguments for the apps. - - Returns: - AppFuture: Future representing the result of batch application. - - Note: - This function is wrapped as a Parsl join_app. - """ - nbatches = math.ceil(length / batch_size) - batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)] - future = batch_frames(batch_size, inputs=[arg.extxyz], outputs=batches) - output_futures = [] - for i in range(nbatches): - futures = [] - for app in apply_apps: - f = app( - None, - inputs=[future.outputs[i]], - **app_kwargs, - ) - futures.append(f) - reduced = reduce_func(*futures) - output_futures.append(reduced) - future = concatenate_multiple(*output_futures) - return future - - -@python_app(executors=["default_threads"]) -def get_length(arg): - """ - Get the length of the input argument. - - Args: - arg: Input to get the length of. - - Returns: - int: Length of the input. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - if isinstance(arg, list): - return len(arg) - else: - return 1 - - -def compute( - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *apply_apps: Union[PythonApp, Callable], - outputs_: Union[str, list[str], tuple[str, ...], None] = None, - reduce_func: Union[PythonApp, Callable] = aggregate_multiple, - batch_size: Optional[int] = None, -) -> Union[list[AppFuture], AppFuture]: - """ - Compute results by applying apps to the input data. - - Args: - arg: Input data to compute on. - *apply_apps: Apps to apply to the data. - outputs_: Names of output quantities. - reduce_func: Function to reduce results. - batch_size: Optional batch size for processing. - - Returns: - Union[list[AppFuture], AppFuture]: Future(s) representing computation results. - """ - if type(outputs_) is str: - outputs_ = [outputs_] - if batch_size is not None: - if isinstance(arg, Dataset): - length = arg.length() - else: - length = get_length(arg) - # convert to Dataset for convenience - arg = Dataset(arg) - future = batch_apply( - apply_apps, - arg, - batch_size, - length, - outputs_=outputs_, - reduce_func=reduce_func, - ) - else: - futures = [] - if isinstance(arg, Dataset): - for app in apply_apps: - future = app( - None, - outputs_=outputs_, - inputs=[arg.extxyz], - ) - futures.append(future) - else: - for app in apply_apps: - future = app( - arg, - outputs_=outputs_, - inputs=[], - ) - futures.append(future) - future = reduce_func(*futures) - if len(outputs_) == 1: - return future[0] - else: - return [future[i] for i in range(len(outputs_))] - - -class Computable: - """ - Base class for computable objects. - - Attributes: - outputs (ClassVar[tuple[str, ...]]): Names of output quantities. - batch_size (ClassVar[Optional[int]]): Default batch size for computation. - """ - - outputs: ClassVar[tuple[str, ...]] = () - batch_size: ClassVar[Optional[int]] = None - - def compute( - self, - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *outputs: Optional[str], - batch_size: Optional[int] = -1, # if -1: take class default - ) -> Union[list[AppFuture], AppFuture]: - """ - Compute results for the given input. - - Args: - arg: Input data to compute on. - outputs: Names of output quantities. - batch_size: Batch size for computation. - - Returns: - Union[list[AppFuture], AppFuture]: Future(s) representing computation results. - """ - raise NotImplementedError diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py index 3f77308..b9c6b94 100644 --- a/psiflow/data/utils.py +++ b/psiflow/data/utils.py @@ -2,18 +2,16 @@ import shutil import subprocess from pathlib import Path -from typing import Optional, Union, Sequence, Generator, TypeAlias +from typing import Optional, Sequence, Generator, TypeAlias import numpy as np import typeguard from ase.data import atomic_numbers, chemical_symbols from parsl import python_app, File -from parsl.dataflow.futures import AppFuture from psiflow.geometry import Geometry, get_atomic_energy, get_unique_numbers # TODO: Note: This function is wrapped as a Parsl app and executed using the default_threads executor. -# TODO: loosen type hints? # TODO: inputs/outputs for apps sometimes behave weird @@ -33,81 +31,6 @@ def __bool__(self): MISSING = MissingType() -def iter_read_frames(file: FileLike) -> Generator[list[str]]: - """Yields text data to instantiate geometries""" - frame_regex = re.compile(r"^\d+$") - with open(file, "r") as f: - for line in f: - if frame_regex.match(_ := line.strip()): - n = int(_) - yield [line] + [f.readline() for _i in range(n + 1)] - - -def _write_frames( - *states: Geometry | list[Geometry], outputs: Sequence[File] = () -) -> None: - """ - Write Geometry instances to a file. - - Args: - states: Variable number of (lists of) Geometry instances to write. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path. - """ - assert len(outputs) == 1 - data = [] - for d in states: - if isinstance(d, list): - data.extend(d) - else: - data.append(d) - with open(outputs[0], "w") as f: - f.write("".join([geom.to_string() for geom in data])) - - -write_frames = python_app(_write_frames, executors=["default_threads"]) - - -def _read_frames( - file: FileLike, indices: Optional[slice | list[int] | int] = None -) -> list[Geometry]: - """ - Read Geometry instances from a file. - - Args: - file: DataFuture representing the input file path containing the geometry data. - indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer. - - """ - if indices is None: - # read everything - data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)] - return [Geometry.from_string(s) for s in data] - - # figure out what frames to read - # TODO: reads twice + indices always wrap modulo nframes which might be unexpected - length = _count_frames(file) - if isinstance(indices, slice): - indices = list(range(length)[indices]) - elif isinstance(indices, int): - indices = [indices] - - if length > 0: - indices = [i % length for i in indices] # for negative indices and wrapping - indices_ = set(indices) # for *much* faster 'i in indices' - - data = {} - for i, geom_str_list in enumerate(iter_read_frames(file)): - if i in indices_: - data[i] = Geometry.from_string("".join(geom_str_list)) - - # return in original order - return [data[i] for i in indices] - - -read_frames = python_app(_read_frames, executors=["default_threads"]) - - def _extract_quantities( states: Sequence[Geometry], quantities: Sequence[str] ) -> dict[str, list]: @@ -198,90 +121,79 @@ def _extract_quantities_per_atom( ) -@typeguard.typechecked -def _check_distances(state: Geometry, threshold: float) -> Geometry: - """ - Check if all interatomic distances in a Geometry are above a threshold. +def iter_read_frames(file: FileLike) -> Generator[list[str]]: + """Yields text data to instantiate geometries""" + frame_regex = re.compile(r"^\d+$") + with open(file, "r") as f: + for line in f: + if frame_regex.match(_ := line.strip()): + n = int(_) + yield [line] + [f.readline() for _i in range(n + 1)] - Args: - state: Geometry instance to check. - threshold: Minimum allowed interatomic distance. - Returns: - Geometry: The input Geometry if all distances are above the threshold, otherwise NullState. +def _write_frames( + *states: Geometry | list[Geometry], outputs: Sequence[File] = () +) -> None: + """ + Write Geometry instances to a file. - Note: - This function is wrapped as a Parsl app and executed using the default_htex executor. + Args: + states: Variable number of (lists of) Geometry instances to write. + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path. """ - from ase.geometry.geometry import find_mic - - if state == NullState: - return NullState - nrows = int(len(state) * (len(state) - 1) / 2) - deltas = np.zeros((nrows, 3)) - count = 0 - for i in range(len(state) - 1): - for j in range(i + 1, len(state)): - deltas[count] = state.per_atom.positions[i] - state.per_atom.positions[j] - count += 1 - assert count == nrows - if state.periodic: - deltas, _ = find_mic(deltas, state.cell) - check = np.all(np.linalg.norm(deltas, axis=1) > threshold) - if check: - return state - else: - return NullState + assert len(outputs) == 1 + data = [] + for d in states: + if isinstance(d, list): + data.extend(d) + else: + data.append(d) + with open(outputs[0], "w") as f: + f.write("".join([geom.to_string() for geom in data])) -check_distances = python_app(_check_distances, executors=["default_htex"]) +write_frames = python_app(_write_frames, executors=["default_threads"]) -@typeguard.typechecked -def _assign_identifiers( - identifier: Optional[int], - inputs: list = [], - outputs: list = [], -) -> int: +def _read_frames( + file: FileLike, indices: Optional[slice | list[int] | int] = None +) -> list[Geometry]: """ - Assign identifiers to Geometry instances in a file. + Read Geometry instances from a file. Args: - identifier: Starting identifier value. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where updated geometries will be written. - - Returns: - int: Next available identifier. + file: DataFuture representing the input file path containing the geometry data. + indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer. - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. """ - data = _read_frames(slice(None), inputs=[inputs[0]]) - states = [] - if identifier is None: # do not assign but look for max - identifier = -1 - for geometry in data: - if geometry.identifier is not None: - identifier = max(identifier, geometry.identifier) - identifier += 1 - for geometry in data: # assign those which were not yet assigned - if geometry.identifier is None: - geometry, identifier = _assign_identifier(geometry, identifier) - states.append(geometry) - _write_frames(*states, outputs=[outputs[0]]) - return identifier - else: - for geometry in data: - geometry, identifier = _assign_identifier(geometry, identifier) - states.append(geometry) - _write_frames(*states, outputs=[outputs[0]]) - return identifier + if indices is None: + # read everything + data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)] + return [Geometry.from_string(s) for s in data] + # figure out what frames to read + # TODO: reads twice + indices always wrap modulo nframes which might be unexpected + length = _count_frames(file) + if isinstance(indices, slice): + indices = list(range(length)[indices]) + elif isinstance(indices, int): + indices = [indices] -assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"]) + if length > 0: + indices = [i % length for i in indices] # for negative indices and wrapping + indices_ = set(indices) # for *much* faster 'i in indices' + + data = {} + for i, geom_str_list in enumerate(iter_read_frames(file)): + if i in indices_: + data[i] = Geometry.from_string("".join(geom_str_list)) + + # return in original order + return [data[i] for i in indices] + + +read_frames = python_app(_read_frames, executors=["default_threads"]) def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None: @@ -477,186 +389,33 @@ def _shuffle( shuffle = python_app(_shuffle, executors=["default_threads"]) -@typeguard.typechecked -def _train_valid_indices( - effective_nstates: int, - train_valid_split: float, - shuffle: bool, -) -> tuple[list[int], list[int]]: - """ - Generate indices for train and validation splits. - - Args: - effective_nstates: Total number of states. - train_valid_split: Fraction of states to use for training. - shuffle: Whether to shuffle the indices. - - Returns: - tuple[list[int], list[int]]: Lists of indices for training and validation sets. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - ntrain = int(np.floor(effective_nstates * train_valid_split)) - nvalid = effective_nstates - ntrain - assert ntrain > 0 - assert nvalid > 0 - order = np.arange(ntrain + nvalid, dtype=int) +def _split_frames( + file: FileLike, fraction: float, shuffle: bool, outputs: Sequence[File] = () +) -> None: + """Split into training and validation sets""" + assert len(outputs) == 2 + assert 0 <= fraction <= 1 + frames = _read_frames(file) + order = np.arange(len(frames)) if shuffle: np.random.shuffle(order) - train_list = list(order[:ntrain]) - valid_list = list(order[ntrain : (ntrain + nvalid)]) - return [int(i) for i in train_list], [int(i) for i in valid_list] - - -train_valid_indices = python_app(_train_valid_indices, executors=["default_threads"]) - - -@typeguard.typechecked -def get_train_valid_indices( - effective_nstates: AppFuture, - train_valid_split: float, - shuffle: bool, -) -> tuple[AppFuture, AppFuture]: - """ - Get futures for train and validation indices. - - Args: - effective_nstates: Future representing the total number of states. - train_valid_split: Fraction of states to use for training. - shuffle: Whether to shuffle the indices. - - Returns: - tuple[AppFuture, AppFuture]: Futures for training and validation indices. - """ - future = train_valid_indices(effective_nstates, train_valid_split, shuffle) - return future[0], future[1] - - -# @typeguard.typechecked -# def get_index_element_mask( -# numbers: np.ndarray, -# atom_indices: Optional[list[int]], -# elements: Optional[list[str]], -# natoms_padded: Optional[int] = None, -# ) -> np.ndarray: -# """ -# Generate a mask for atom indices and elements. -# -# Args: -# numbers: Array of atomic numbers. -# atom_indices: List of atom indices to include. -# elements: List of element symbols to include. -# natoms_padded: Total number of atoms including padding. -# -# Returns: -# np.ndarray: Boolean mask array. -# """ -# mask = np.array([True] * len(numbers)) -# -# if elements is not None: -# numbers_to_include = [atomic_numbers[e] for e in elements] -# mask_elements = np.array([False] * len(numbers)) -# for number in numbers_to_include: -# mask_elements = np.logical_or(mask_elements, (numbers == number)) -# mask = np.logical_and(mask, mask_elements) -# -# if natoms_padded is not None: -# assert natoms_padded >= len(numbers) -# padding = natoms_padded - len(numbers) -# mask = np.concatenate((mask, np.array([False] * padding)), axis=0).astype(bool) -# -# if atom_indices is not None: # below padding -# mask_indices = np.array([False] * len(mask)) -# mask_indices[np.array(atom_indices)] = True -# mask = np.logical_and(mask, mask_indices) -# -# return mask - -# TODO: these do not belong here - -@typeguard.typechecked -def _compute_rmse( - array0: np.ndarray, - array1: np.ndarray, - reduce: bool = True, -) -> Union[float, np.ndarray]: - """ - Compute the Root Mean Square Error (RMSE) between two arrays. - - Args: - array0: First array. - array1: Second array. - reduce: Whether to reduce the result to a single value. - - Returns: - Union[float, np.ndarray]: RMSE value(s). - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert array0.shape == array1.shape - assert np.all(np.isnan(array0) == np.isnan(array1)) - - se = (array0 - array1) ** 2 - se = se.reshape(se.shape[0], -1) - - if reduce: # across both dimensions - mask = np.logical_not(np.isnan(se)) - return float(np.sqrt(np.mean(se[mask]))) - else: # retain first dimension - if se.ndim == 1: - return se - else: - values = np.empty(len(se)) - for i in range(len(se)): - if np.all(np.isnan(se[i])): - values[i] = np.nan - else: - mask = np.logical_not(np.isnan(se[i])) - value = np.sqrt(np.mean(se[i][mask])) - values[i] = value - return values + n_train = int(len(frames) * fraction) + train_ids = order[:n_train] + val_ids = order[n_train:] + train = [frames[i] for i in train_ids] + val = [frames[i] for i in val_ids] + _write_frames(train, outputs=outputs[:1]) + _write_frames(val, outputs=outputs[1:]) -compute_rmse = python_app(_compute_rmse, executors=["default_threads"]) +split_frames = python_app(_split_frames, executors=["default_threads"]) -@typeguard.typechecked -def _compute_mae( - array0, - array1, - reduce: bool = True, -) -> Union[float, np.ndarray]: - """ - Compute the Mean Absolute Error (MAE) between two arrays. - Args: - array0: First array. - array1: Second array. - reduce: Whether to reduce the result to a single value. - - Returns: - Union[float, np.ndarray]: MAE value(s). - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert array0.shape == array1.shape - mask0 = np.logical_not(np.isnan(array0)) - mask1 = np.logical_not(np.isnan(array1)) - assert np.all(mask0 == mask1) - ae = np.abs(array0 - array1) - to_reduce = tuple(range(1, array0.ndim)) - mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce)) - ae = ae[mask0].reshape(np.sum(1 * mask), -1) - if reduce: # across both dimensions - return float(np.sqrt(np.mean(ae))) - else: # retain first dimension - return np.sqrt(np.mean(ae, axis=1)) - - -compute_mae = python_app(_compute_mae, executors=["default_threads"]) +# TODO +# TODO: think about where the parts below should live +# TODO @typeguard.typechecked @@ -710,3 +469,78 @@ def _batch_frames( batch_frames = python_app(_batch_frames, executors=["default_threads"]) + + +# def _assign_identifier( +# state: Geometry, +# identifier: int, +# discard: bool = False, +# ) -> tuple[Geometry, int]: +# """ +# Assign an identifier to a Geometry instance. +# +# Args: +# state (Geometry): Input Geometry instance. +# identifier (int): Identifier to assign. +# discard (bool, optional): Whether to discard the state. Defaults to False. +# +# Returns: +# tuple[Geometry, int]: Updated Geometry and next available identifier. +# """ +# if (state == NullState) or discard: +# return state, identifier +# else: +# assert state.identifier is None +# state.identifier = identifier +# return state, identifier + 1 +# +# +# assign_identifier = python_app(_assign_identifier, executors=["default_threads"]) +# + + +@typeguard.typechecked +def _assign_identifiers( + identifier: Optional[int], + inputs: list = [], + outputs: list = [], +) -> int: + """ + Assign identifiers to Geometry instances in a file. + + Args: + identifier: Starting identifier value. + inputs: List of Parsl futures. The first element should be a DataFuture + representing the input file path containing geometry data. + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path where updated geometries will be written. + + Returns: + int: Next available identifier. + + Note: + This function is wrapped as a Parsl app and executed using the default_threads executor. + """ + data = _read_frames(slice(None), inputs=[inputs[0]]) + states = [] + if identifier is None: # do not assign but look for max + identifier = -1 + for geometry in data: + if geometry.identifier is not None: + identifier = max(identifier, geometry.identifier) + identifier += 1 + for geometry in data: # assign those which were not yet assigned + if geometry.identifier is None: + geometry, identifier = _assign_identifier(geometry, identifier) + states.append(geometry) + _write_frames(*states, outputs=[outputs[0]]) + return identifier + else: + for geometry in data: + geometry, identifier = _assign_identifier(geometry, identifier) + states.append(geometry) + _write_frames(*states, outputs=[outputs[0]]) + return identifier + + +assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"]) diff --git a/psiflow/geometry.py b/psiflow/geometry.py index a2fa7e5..abac90e 100644 --- a/psiflow/geometry.py +++ b/psiflow/geometry.py @@ -11,7 +11,6 @@ from ase import Atoms from ase.data import atomic_masses, chemical_symbols, atomic_numbers from ase.io.extxyz import key_val_dict_to_str, read_xyz, key_val_str_to_dict -from parsl.app.app import python_app import psiflow @@ -19,7 +18,7 @@ # TODO: custom attributes -class Container: +class PerAtom: """ Holds 'per atom' arrays like positions and forces For every attribute, the first array dimension should be n_atoms @@ -52,7 +51,7 @@ def __len__(self) -> int: return self.numbers.shape[0] def __str__(self) -> str: - return f"Container[{len(self)}](keys={set(vars(self))})" + return f"PerAtom[{len(self)}](keys={set(vars(self))})" def _check(self, arr: npt.NDArray) -> bool: try: @@ -140,21 +139,20 @@ class Geometry: and various physical properties such as energy and forces. Attributes: - per_atom (Container): Record array containing per-atom properties. + per_atom (PerAtom): Record array containing per-atom properties. cell (np.ndarray): 3x3 array representing the unit cell vectors. energy (Optional[float]): Total energy of the system. stress (Optional[np.ndarray]): Stress tensor of the system. """ - per_atom: Container + per_atom: PerAtom cell: npt.NDArray energy: Optional[float] stress: Optional[np.ndarray] - meta: SimpleNamespace def __init__( self, - per_atom: Container, + per_atom: PerAtom, energy: Optional[float] = None, cell: Optional[np.ndarray] = None, stress: Optional[np.ndarray] = None, @@ -279,7 +277,7 @@ def from_string(cls, s: str) -> Geometry: data["cell"] = data.pop("Lattice").T # ase does Fortran ordering else: data["cell"] = None - per_atom = Container.from_string(body, data.pop("Properties", None)) + per_atom = PerAtom.from_string(body, data.pop("Properties", None)) assert len(per_atom) == int(n_atoms) return Geometry(per_atom, **data) @@ -300,7 +298,7 @@ def from_data( """ Create a Geometry instance from atomic numbers, positions, and cell data. """ - per_atom = Container(numbers.copy(), positions.copy()) + per_atom = PerAtom(numbers.copy(), positions.copy()) cell = cell if cell is None else cell.copy() return Geometry(per_atom, cell=cell) @@ -309,7 +307,7 @@ def from_atoms(cls, atoms: Atoms) -> Geometry: """ Create a Geometry instance from an ASE Atoms object. """ - per_atom = Container(**atoms.arrays) + per_atom = PerAtom(**atoms.arrays) data = atoms.info if all(atoms.pbc): data["cell"] = atoms.cell.array @@ -362,20 +360,6 @@ def numbers(self) -> npt.NDArray: return self.per_atom.numbers.flatten() -# def new_nullstate(): -# """ -# Create a new null state Geometry. -# -# Returns: -# Geometry: A Geometry instance representing a null state. -# """ -# return Geometry.from_data(np.zeros(1), np.zeros((1, 3)), None) - - -# use universal dummy state -# NullState = new_nullstate() - - def is_lower_triangular(cell: np.ndarray) -> bool: """ Check if a cell matrix is lower triangular. @@ -537,94 +521,3 @@ def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> except KeyError: raise KeyError(f"No atomic energy value for symbol '{symbol}'..") return float(total) - - -# def create_outputs(quantities: list[str], data: list[Geometry]) -> list[np.ndarray]: -# """ -# Create output arrays for specified quantities from a list of Geometry instances. -# -# Args: -# quantities (list[str]): List of quantity names to extract. -# data (list[Geometry]): List of Geometry instances. -# -# Returns: -# list[np.ndarray]: List of arrays containing the requested quantities. -# """ -# order_names = list(set([k for g in data for k in g.order])) -# assert all([q in QUANTITIES + order_names for q in quantities]) -# natoms = np.array([len(geometry) for geometry in data], dtype=int) -# max_natoms = np.max(natoms) -# nframes = len(data) -# nprob = 0 -# max_phase = 0 -# for state in data: -# if state.logprob is not None: -# nprob = max(len(state.logprob), nprob) -# if state.phase is not None: -# max_phase = max(len(state.phase), max_phase) -# -# arrays = [] -# for quantity in quantities: -# if quantity in ["positions", "forces"]: -# array = np.empty((nframes, max_natoms, 3), dtype=np.float64) -# array[:] = np.nan -# elif quantity in ["cell", "stress"]: -# array = np.empty((nframes, 3, 3), dtype=np.float64) -# array[:] = np.nan -# elif quantity in ["numbers"]: -# array = np.empty((nframes, max_natoms), dtype=np.uint8) -# array[:] = 0 -# elif quantity in ["energy", "delta", "per_atom_energy"]: -# array = np.empty((nframes,), dtype=np.float64) -# array[:] = np.nan -# elif quantity in ["phase"]: -# array = np.empty((nframes,), dtype=(np.unicode_, max_phase)) -# array[:] = "" -# elif quantity in ["logprob"]: -# array = np.empty((nframes, nprob), dtype=np.float64) -# array[:] = np.nan -# elif quantity in ["identifier"]: -# array = np.empty((nframes,), dtype=np.int64) -# array[:] = -1 -# elif quantity in order_names: -# array = np.empty((nframes,), dtype=np.float64) -# array[:] = np.nan -# else: -# raise AssertionError("missing quantity in if/else") -# arrays.append(array) -# return arrays - - -# def _assign_identifier( -# state: Geometry, -# identifier: int, -# discard: bool = False, -# ) -> tuple[Geometry, int]: -# """ -# Assign an identifier to a Geometry instance. -# -# Args: -# state (Geometry): Input Geometry instance. -# identifier (int): Identifier to assign. -# discard (bool, optional): Whether to discard the state. Defaults to False. -# -# Returns: -# tuple[Geometry, int]: Updated Geometry and next available identifier. -# """ -# if (state == NullState) or discard: -# return state, identifier -# else: -# assert state.identifier is None -# state.identifier = identifier -# return state, identifier + 1 - - -# assign_identifier = python_app(_assign_identifier, executors=["default_threads"]) - - -def _check_equality(state0: Geometry, state1: Geometry) -> bool: - """ Check if two Geometry instances are equal""" - return state0 == state1 - - -check_equality = python_app(_check_equality, executors=["default_threads"]) From 4978a83367e212dc86fffed05ce316d1881b7796 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Fri, 10 Apr 2026 11:09:22 +0200 Subject: [PATCH 05/16] more restructuring separated functions/apps for geometries from functions/apps for datasets --- psiflow/compute.py | 12 +- psiflow/data/dataset.py | 108 ++++---- psiflow/data/file.py | 273 +++++++++++++++++++++ psiflow/data/utils.py | 529 +++++----------------------------------- 4 files changed, 399 insertions(+), 523 deletions(-) create mode 100644 psiflow/data/file.py diff --git a/psiflow/compute.py b/psiflow/compute.py index 110ab33..22461ef 100644 --- a/psiflow/compute.py +++ b/psiflow/compute.py @@ -3,15 +3,13 @@ import numpy as np import typeguard -from parsl import python_app from parsl.app.app import join_app, python_app -from parsl.app.python import PythonApp from parsl.dataflow.futures import AppFuture, DataFuture import psiflow from psiflow.geometry import Geometry from psiflow.data import Dataset -from psiflow.data.utils import batch_frames +from psiflow.data.file import batch_frames def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]: @@ -95,12 +93,12 @@ def _aggregate_multiple( @join_app def batch_apply( - apply_apps: tuple[Union[PythonApp, Callable]], + apply_apps: tuple[Union[python_app, Callable]], arg: Union[Dataset, list[Geometry]], batch_size: int, length: int, outputs: list = [], - reduce_func: Optional[PythonApp] = None, + reduce_func: Optional[python_app] = None, **app_kwargs, ) -> AppFuture: """ @@ -162,9 +160,9 @@ def get_length(arg): def compute( arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *apply_apps: Union[PythonApp, Callable], + *apply_apps: Union[python_app, Callable], outputs_: Union[str, list[str], tuple[str, ...], None] = None, - reduce_func: Union[PythonApp, Callable] = aggregate_multiple, + reduce_func: Union[python_app, Callable] = aggregate_multiple, batch_size: Optional[int] = None, ) -> Union[list[AppFuture], AppFuture]: """ diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index 1dafd24..df79073 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -8,26 +8,28 @@ from psiflow.geometry import Geometry from psiflow.utils.apps import copy_data_future -from .utils import ( - align_axes, - app_filter, - apply_offset, - assign_identifiers, - clean_frames, +from psiflow.data.file import ( count_frames, - extract_quantities, get_elements, - insert_quantities, join_frames, read_frames, - reset_frames, - shuffle, write_frames, - extract_quantities_per_atom, split_frames, + shuffle_frames, + apply_offset, + reset_frames, + clean_frames, + align_axes, + filter_frames, + extract_quantities, + extract_quantities_per_atom, + assign_identifiers, ) +# TODO: do all operations need to generate a new file? + + @psiflow.register_serializable class Dataset: """ @@ -44,7 +46,7 @@ def __init__( extxyz: Optional[psiflow._DataFuture] = None, ): """ - Initialize a Dataset. + Initialize a Dataset. Provide either states or extxyz. Args: states: List of Geometry instances or AppFutures representing geometries. @@ -72,7 +74,7 @@ def shuffle(self) -> "Dataset": Shuffle the order of structures in the dataset. """ file = psiflow.context().new_file("data_", ".xyz") - extxyz = shuffle(self.extxyz, outputs=[file]).outputs[0] + extxyz = shuffle_frames(self.extxyz, outputs=[file]).outputs[0] return Dataset(extxyz=extxyz) def __getitem__( @@ -174,8 +176,7 @@ def get(self, *quantities: str) -> tuple[AppFuture, ...]: """ Extract specified quantities from the dataset. """ - future_states = read_frames(self.extxyz) - future_dict = extract_quantities(future_states, quantities) + future_dict = extract_quantities(self.extxyz, quantities) return tuple(future_dict[q] for q in quantities) def get_per_atom( @@ -192,48 +193,48 @@ def get_per_atom( atom_indices: Optional list of atom indices to consider. elements: Optional list of element symbols to consider. """ - future_states = read_frames(self.extxyz) future_dict = extract_quantities_per_atom( - future_states, quantities, atom_indices, elements + self.extxyz, quantities, atom_indices, elements ) return tuple(future_dict[q] for q in quantities) - def evaluate( - self, computable: "Computable", batch_size: Optional[int] = None - ) -> Dataset: - """ - Evaluate a Computable on the dataset. - """ - # TODO: remove this functionality? - # or this should be the (only) way to label a dataset? - from psiflow.hamiltonians import Hamiltonian - - if not isinstance(computable, Hamiltonian): - # avoid extracting and inserting the same quantities - return computable.compute_dataset(self) - - # use Hamiltonian.compute method - if batch_size is not None: - outputs = computable.compute(self, batch_size=batch_size) - else: - outputs = computable.compute(self) # use default from computable - if not isinstance(outputs, list): # compute unpacks for only one property - outputs = [outputs] - data = {k: v for k, v in zip(computable.outputs, outputs)} - - file = psiflow.context().new_file("data_", ".xyz") - - future = read_frames(self.extxyz) - future = insert_quantities(future, data) - extxyz = write_frames(future, outputs=[file]).outputs[0] - return Dataset(extxyz=extxyz) + # def evaluate( + # self, computable: "Computable", batch_size: Optional[int] = None + # ) -> Dataset: + # """ + # Evaluate a Computable on the dataset. + # """ + # # TODO: remove this functionality? + # # or this should be the (only) way to label a dataset? + # from psiflow.hamiltonians import Hamiltonian + # + # if not isinstance(computable, Hamiltonian): + # # avoid extracting and inserting the same quantities + # return computable.compute_dataset(self) + # + # # use Hamiltonian.compute method + # if batch_size is not None: + # outputs = computable.compute(self, batch_size=batch_size) + # else: + # outputs = computable.compute(self) # use default from computable + # if not isinstance(outputs, list): # compute unpacks for only one property + # outputs = [outputs] + # data = {k: v for k, v in zip(computable.outputs, outputs)} + # + # file = psiflow.context().new_file("data_", ".xyz") + # + # future = read_frames(self.extxyz) + # future = insert_quantities(future, data) + # extxyz = write_frames(future, outputs=[file]).outputs[0] + # return Dataset(extxyz=extxyz) def filter(self, quantity: str) -> Dataset: """ Filter the dataset based on a specified quantity. """ + # TODO: where is this used? file = psiflow.context().new_file("data_", ".xyz") - future = app_filter(self.extxyz, quantity, outputs=[file]) + future = filter_frames(self.extxyz, quantity, outputs=[file]) return Dataset(extxyz=future.outputs[0]) def align_axes(self) -> Dataset: @@ -273,13 +274,12 @@ def assign_identifiers( AppFuture: Future representing the next available identifier. """ # TODO: what is the use case for this? - result = assign_identifiers( - identifier, - inputs=[self.extxyz], - outputs=[psiflow.context().new_file("data_", ".xyz")], - ) - self.extxyz = result.outputs[0] - return result + # TODO: this is the only method that changes inplace instead of returning a new Dataset + future = assign_identifiers(self.extxyz, identifier) + future_states, future_id = future[0], future[1] + file = psiflow.context().new_file("data_", ".xyz") + self.extxyz = write_frames(future_states, outputs=[file]).outputs[0] + return future_id @classmethod def load(cls, path_xyz: Union[Path, str]) -> Dataset: diff --git a/psiflow/data/file.py b/psiflow/data/file.py new file mode 100644 index 0000000..b2c3902 --- /dev/null +++ b/psiflow/data/file.py @@ -0,0 +1,273 @@ +import re +import shutil +import subprocess +from pathlib import Path +from functools import partial +from typing import Optional, Generator, TypeAlias, Any, Callable +from collections.abc import Sequence + +import numpy as np +from ase.data import chemical_symbols +from parsl import python_app, File + +from psiflow.geometry import Geometry +from psiflow.data.utils import ( + get_unique_numbers, + apply_energy_offset, + filter_quantity, + extract, + extract_per_atom, + assign_ids, +) + +# TODO: some operations have side effects, which could be unexpected + + +FileLike: TypeAlias = str | Path | File + + +def iter_read_frames(file: FileLike) -> Generator[list[str]]: + """Yields text data to instantiate geometries""" + frame_regex = re.compile(r"^\d+$") + with open(file, "r") as f: + for line in f: + if frame_regex.match(_ := line.strip()): + n = int(_) + yield [line] + [f.readline() for _i in range(n + 1)] + + +def _write_frames( + *states: Geometry | list[Geometry], outputs: Sequence[File] = () +) -> None: + """ + Write Geometry instances to a file. + + Args: + states: Variable number of (lists of) Geometry instances to write. + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path. + """ + assert len(outputs) == 1 + data = [] + for d in states: + if isinstance(d, list): + data.extend(d) + else: + data.append(d) + with open(outputs[0], "w") as f: + f.write("".join([geom.to_string() for geom in data])) + + +def _read_frames( + file: FileLike, indices: Optional[slice | list[int] | int] = None +) -> list[Geometry]: + """ + Read Geometry instances from a file. + + Args: + file: DataFuture representing the input file path containing the geometry data. + indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer. + + """ + if indices is None: + # read everything + data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)] + return [Geometry.from_string(s) for s in data] + + # figure out what frames to read + # TODO: reads twice + indices always wrap modulo nframes which might be unexpected + length = _count_frames(file) + if isinstance(indices, slice): + indices = list(range(length)[indices]) + elif isinstance(indices, int): + indices = [indices] + + if length > 0: + indices = [i % length for i in indices] # for negative indices and wrapping + indices_ = set(indices) # for *much* faster 'i in indices' + + data = {} + for i, geom_str_list in enumerate(iter_read_frames(file)): + if i in indices_: + data[i] = Geometry.from_string("".join(geom_str_list)) + + # return in original order + return [data[i] for i in indices] + + +def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None: + """ + Join multiple frame files into a single file. + + Args: + inputs: List of Parsl futures. Each element should be a DataFuture + representing an input file path containing geometry data. + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path where joined frames will be written. + """ + assert len(outputs) == 1 + + with open(outputs[0], "wb") as destination: + for input_file in inputs: + with open(input_file, "rb") as source: + shutil.copyfileobj(source, destination) + + +def _count_frames(file: FileLike) -> int: + """Count the number of frames in a file.""" + path = Path(file) + threshold = 10 * 1024 * 1024 # 10 MB in bytes + file_size = path.stat().st_size + + if file_size > threshold: # use grep + cmd = f"grep -Fc Properties {path}" + result = subprocess.check_output(cmd.split()) + return int(result.strip()) + return len([_ for _ in iter_read_frames(path)]) + + +def _get_elements(*files: File) -> set[str]: + """ + Get the set of elements present in all frames of a sequence of file. + """ + frames = [geom for file in files for geom in _read_frames(file)] + return {chemical_symbols[i] for i in get_unique_numbers(frames)} + + +def _split_frames( + file: FileLike, fraction: float, shuffle: bool, outputs: Sequence[File] = () +) -> None: + """Split into training and validation sets""" + assert len(outputs) == 2 + assert 0 <= fraction <= 1 + frames = _read_frames(file) + order = np.arange(len(frames)) + if shuffle: + np.random.shuffle(order) + + n_train = int(len(frames) * fraction) + train_ids = order[:n_train] + val_ids = order[n_train:] + + train = [frames[i] for i in train_ids] + val = [frames[i] for i in val_ids] + _write_frames(train, outputs=outputs[:1]) + _write_frames(val, outputs=outputs[1:]) + + +def _batch_frames( + file: FileLike, batch_size: int, outputs: Sequence[File] = () +) -> list[list[Geometry]]: + """ + Split frames into batches. + + Args: + file: DataFuture representing the input file path containing the geometry data. + batch_size: Number of frames per batch. + outputs: List of Parsl futures. Each element should be a DataFuture + representing an output file path for each batch. + """ + # TODO: why pass batch size? + # TODO: will not be great for large files + frames = _read_frames(file) + + batches, batch = [], [] + for i, geom in enumerate(frames): + batch.append(geom) + if (i + 1) % batch_size == 0: + batches.append(batch) + batch = [] + + if not outputs: + return batches + + assert len(outputs) == len(batches) + for file, batch in zip(outputs, batches): + _write_frames(batch, outputs=[file]) + return batches + + +write_frames = python_app(_write_frames, executors=["default_threads"]) +read_frames = python_app(_read_frames, executors=["default_threads"]) +join_frames = python_app(_join_frames, executors=["default_threads"]) +count_frames = python_app(_count_frames, executors=["default_threads"]) +get_elements = python_app(_get_elements, executors=["default_threads"]) +split_frames = python_app(_split_frames, executors=["default_threads"]) +batch_frames = python_app(_batch_frames, executors=["default_threads"]) + + +def _read_write_wrapper( + execute: Callable, # Parsl throws if this is named 'func' + file: FileLike, + *args: Any, + outputs: Sequence[File] = (), + **kwargs: Any, +) -> None: + """ + Wrapper function to make simple Geometry functions work for Dataset instances. + Essentially does (i) read (ii) modify (iii) write. + + Args: + file: DataFuture representing the input file path containing the geometry data. + execute: Function that will get executed for the geometries in file + outputs: List of Parsl futures. The first element should be a DataFuture + representing the output file path where reset frames will be written. + """ + assert len(outputs) == 1 + frames_in = _read_frames(file) + frames_out = execute(frames_in, *args, **kwargs) + if frames_out is None: # execute can alter in-place + frames_out = frames_in + _write_frames(frames_out, outputs=outputs) + + +def _reset_frames(states: Sequence[Geometry]) -> None: + for geom in states: + geom.reset() + + +def _clean_frames(states: Sequence[Geometry]) -> None: + for geom in states: + geom.clean() + + +def _align_axes(states: Sequence[Geometry]) -> None: + for geom in states: + geom.align_axes() + + +def _shuffle_frames(states: Sequence[Geometry]) -> None: + np.random.shuffle(states) + + +read_write_app = python_app(_read_write_wrapper, executors=["default_threads"]) +reset_frames = partial(read_write_app, _reset_frames) +clean_frames = partial(read_write_app, _clean_frames) +align_axes = partial(read_write_app, _align_axes) +shuffle_frames = partial(read_write_app, _shuffle_frames) + +apply_offset = partial(read_write_app, apply_energy_offset) +filter_frames = partial(read_write_app, filter_quantity) + + +def _read_wrapper( + execute: Callable, # Parsl throws if this is named 'func' + file: FileLike, + *args: Any, + **kwargs: Any, +) -> Any: + """ + Wrapper function to make simple Geometry functions work for Dataset instances. + Essentially does (i) read (ii) modify (iii) return. + + Args: + file: DataFuture representing the input file path containing the geometry data. + execute: Function that will get executed for the geometries in file + """ + frames_in = _read_frames(file) + return execute(frames_in, *args, **kwargs) + +read_app = python_app(_read_wrapper, executors=["default_threads"]) +extract_quantities = partial(read_app, extract) +extract_quantities_per_atom = partial(read_app, extract_per_atom) +assign_identifiers = partial(read_app, assign_ids) diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py index b9c6b94..8cfa5a1 100644 --- a/psiflow/data/utils.py +++ b/psiflow/data/utils.py @@ -1,87 +1,68 @@ -import re -import shutil -import subprocess -from pathlib import Path -from typing import Optional, Sequence, Generator, TypeAlias +from typing import Optional +from collections.abc import Sequence import numpy as np -import typeguard -from ase.data import atomic_numbers, chemical_symbols -from parsl import python_app, File - -from psiflow.geometry import Geometry, get_atomic_energy, get_unique_numbers - -# TODO: Note: This function is wrapped as a Parsl app and executed using the default_threads executor. -# TODO: inputs/outputs for apps sometimes behave weird - - -FileLike: TypeAlias = str | Path | File - - -class MissingType: - """Placeholder sentinel for missing data fields""" - - def __repr__(self): - return "" - - def __bool__(self): - return False +from ase.data import atomic_numbers + +from psiflow.geometry import ( + Geometry, + get_atomic_energy, + DEFAULT_PROPERTIES, + PER_ATOM_FIELDS, + MISSING, +) -MISSING = MissingType() +# TODO: some of these have side effects.. -def _extract_quantities( - states: Sequence[Geometry], quantities: Sequence[str] -) -> dict[str, list]: +def extract(states: Sequence[Geometry], quantities: Sequence[str]) -> dict[str, list]: """ Extract specified quantities from Geometry instances. """ - data = {k: [] for k in quantities} - for k in quantities: - if k in ("cell", "energy", "stress"): - for geom in states: - value = getattr(geom, k, MISSING) - data[k].append(value) - continue + data = {} + + # figure out where to find quantities + per_atom, per_structure, unknown = [], [], [] + for q in set(quantities): + if q in PER_ATOM_FIELDS: + per_atom.append(q) + elif q in DEFAULT_PROPERTIES: + per_structure.append(q) + else: + unknown.append(q) + + for q in per_atom: + data[q] = [getattr(geom.per_atom, q, MISSING) for geom in states] + for q in per_structure: + data[q] = [getattr(geom, q, MISSING) for geom in states] + for q in unknown: + # try both options + data[q] = [] for geom in states: - value = getattr(geom.per_atom, k, MISSING) + value = getattr(geom, q, MISSING) if value is MISSING: - value = getattr(geom.meta, k, MISSING) - data[k].append(value) - return data + value = getattr(geom.per_atom, q, MISSING) + data[q].append(value) - -extract_quantities = python_app(_extract_quantities, executors=["default_threads"]) + return data -def _insert_quantities( - states: Sequence[Geometry], data: dict[str, list] -) -> Sequence[Geometry]: +def insert(states: Sequence[Geometry], data: dict[str, list]) -> Sequence[Geometry]: """ Insert quantities from data into Geometry instances. """ for q, values in data.items(): assert len(states) == len(values) - - if q in ("cell", "energy", "stress"): - for geom, v in zip(states, values): - setattr(geom, q, v) - continue - for geom, v in zip(states, values): if isinstance(v, np.ndarray) and len(geom) == len(v): setattr(geom.per_atom, q, v) else: - setattr(geom.meta, q, v) - + setattr(geom, q, v) return states -insert_quantities = python_app(_insert_quantities, executors=["default_threads"]) - - -def _extract_quantities_per_atom( +def extract_per_atom( states: Sequence[Geometry], quantities: Sequence[str], atom_indices: Optional[Sequence[int]] = None, @@ -116,431 +97,55 @@ def _extract_quantities_per_atom( return data -extract_quantities_per_atom = python_app( - _extract_quantities_per_atom, executors=["default_threads"] -) - - -def iter_read_frames(file: FileLike) -> Generator[list[str]]: - """Yields text data to instantiate geometries""" - frame_regex = re.compile(r"^\d+$") - with open(file, "r") as f: - for line in f: - if frame_regex.match(_ := line.strip()): - n = int(_) - yield [line] + [f.readline() for _i in range(n + 1)] - - -def _write_frames( - *states: Geometry | list[Geometry], outputs: Sequence[File] = () -) -> None: - """ - Write Geometry instances to a file. - - Args: - states: Variable number of (lists of) Geometry instances to write. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path. - """ - assert len(outputs) == 1 - data = [] - for d in states: - if isinstance(d, list): - data.extend(d) - else: - data.append(d) - with open(outputs[0], "w") as f: - f.write("".join([geom.to_string() for geom in data])) - - -write_frames = python_app(_write_frames, executors=["default_threads"]) - - -def _read_frames( - file: FileLike, indices: Optional[slice | list[int] | int] = None +def filter_quantity( + states: Sequence[Geometry], + quantity: str, ) -> list[Geometry]: """ - Read Geometry instances from a file. - - Args: - file: DataFuture representing the input file path containing the geometry data. - indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer. - - """ - if indices is None: - # read everything - data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)] - return [Geometry.from_string(s) for s in data] - - # figure out what frames to read - # TODO: reads twice + indices always wrap modulo nframes which might be unexpected - length = _count_frames(file) - if isinstance(indices, slice): - indices = list(range(length)[indices]) - elif isinstance(indices, int): - indices = [indices] - - if length > 0: - indices = [i % length for i in indices] # for negative indices and wrapping - indices_ = set(indices) # for *much* faster 'i in indices' - - data = {} - for i, geom_str_list in enumerate(iter_read_frames(file)): - if i in indices_: - data[i] = Geometry.from_string("".join(geom_str_list)) - - # return in original order - return [data[i] for i in indices] - - -read_frames = python_app(_read_frames, executors=["default_threads"]) - - -def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None: - """ - Join multiple frame files into a single file. - - Args: - inputs: List of Parsl futures. Each element should be a DataFuture - representing an input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where joined frames will be written. - """ - assert len(outputs) == 1 - - with open(outputs[0], "wb") as destination: - for input_file in inputs: - with open(input_file, "rb") as source: - shutil.copyfileobj(source, destination) - - -join_frames = python_app(_join_frames, executors=["default_threads"]) - - -def _count_frames(file: FileLike) -> int: - """Count the number of frames in a file.""" - path = Path(file) - threshold = 10 * 1024 * 1024 # 10 MB in bytes - file_size = path.stat().st_size - - if file_size > threshold: # use grep - cmd = f"grep -Fc Properties {path}" - result = subprocess.check_output(cmd.split()) - return int(result.strip()) - return len([_ for _ in iter_read_frames(path)]) - - -count_frames = python_app(_count_frames, executors=["default_threads"]) - - -def _reset_frames(file: FileLike, outputs: Sequence[File] = ()) -> None: - """ - Reset all frames in a file. - - Args: - file: DataFuture representing the input file path containing the geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where reset frames will be written. - """ - assert len(outputs) == 1 - data = _read_frames(file) - for geometry in data: - geometry.reset() - _write_frames(*data, outputs=outputs) - - -reset_frames = python_app(_reset_frames, executors=["default_threads"]) - - -def _clean_frames(file: FileLike, outputs: Sequence[File] = ()) -> None: + Filter frames based on a specified quantity. """ - Clean all frames in a file. - - Args: - file: DataFuture representing the input file path containing the geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where reset frames will be written. - """ - assert len(outputs) == 1 - data = _read_frames(file) - for geometry in data: - geometry.clean() - _write_frames(*data, outputs=outputs) + data = extract(states, [quantity])[quantity] + return [geom for i, geom in enumerate(states) if not data[i] is MISSING] -clean_frames = python_app(_clean_frames, executors=["default_threads"]) +def get_unique_numbers(states: Sequence[Geometry]) -> set[int]: + """Returns a set of unique atom numbers found across all states.""" + numbers = set(el for geom in states for el in np.unique(geom.per_atom.numbers)) + return {int(n) for n in numbers} -def _apply_offset( - file: FileLike, - subtract: bool, - outputs: Sequence[File] = (), - **atomic_energies: float, +def apply_energy_offset( + states: Sequence[Geometry], subtract: bool, **atomic_energies: float ) -> None: - """ - Apply an energy offset to all frames in a file. - - Args: - file: DataFuture representing the input file path containing the geometry data. - subtract: Whether to subtract or add the offset. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where updated frames will be written. - **atomic_energies: Atomic energies for each element. - """ - assert len(outputs) == 1 - frames = _read_frames(file) - numbers_data = get_unique_numbers(frames) + """Apply an energy offset to all geometries""" + numbers_data = get_unique_numbers(states) numbers_kwargs = {atomic_numbers[e] for e in atomic_energies.keys()} assert numbers_data == numbers_kwargs, "Provide atomic energies for all elements.." - for geom in frames: + for geom in states: energy = get_atomic_energy(geom, atomic_energies) if subtract: geom.energy -= energy else: geom.energy += energy - _write_frames(frames, outputs=outputs) - - -apply_offset = python_app(_apply_offset, executors=["default_threads"]) - -def _get_elements(*files: File) -> set[str]: +def assign_ids( + states: Sequence[Geometry], identifier: Optional[int] = None +) -> tuple[Sequence[Geometry], int]: """ - Get the set of elements present in all frames of a sequence of file. - - Args: - inputs: List of Parsl DataFuture + Assign unique identifiers to Geometry instances, starting at provided identifier. """ - frames = [geom for file in files for geom in _read_frames(file)] - return {chemical_symbols[i] for i in get_unique_numbers(frames)} - - -get_elements = python_app(_get_elements, executors=["default_threads"]) - + # TODO: when would we want to supply the starting identifier? + if identifier is None: + # find largest existing value and add one + ids = extract(states, ["identifier"])["identifier"] + identifier = max([i for i in ids if i is not MISSING], default=0) + 1 -def _align_axes(file: FileLike, outputs: Sequence[File] = ()) -> None: - """ - Align axes for all frames in a file. - - Args: - file: DataFuture representing the input file path containing the geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where aligned frames will be written. - """ - assert len(outputs) == 1 - data = _read_frames(file) - for geometry in data: - geometry.align_axes() - _write_frames(*data, outputs=outputs) - - -align_axes = python_app(_align_axes, executors=["default_threads"]) - - -def _app_filter(file: FileLike, quantity: str, outputs: Sequence[File] = ()) -> None: - """ - Filter frames based on a specified quantity and writes to first output. - """ - # TODO: where is this used? - data = _read_frames(file) - out = [] - - # None does not count - for geom in data: - if (quantity in ("cell", "energy", "stress")) and getattr( - geom, quantity, None - ) is not None: - out.append(geom) + for geom in states: + if hasattr(geom, "identifier"): continue - - value = getattr(geom.per_atom, quantity, None) - if value is None: - value = getattr(geom.meta, quantity, None) - if value is not None: - out.append(geom) - - _write_frames(out, outputs=outputs) - - -# filter is protected -app_filter = python_app(_app_filter, executors=["default_threads"]) - - -def _shuffle( - file: FileLike, - outputs: Sequence[File] = (), -) -> None: - """ - Shuffle the order of frames in a file. - - Args: - file: DataFuture representing the input file path containing the geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where shuffled frames will be written. - """ - assert len(outputs) == 1 - frames = _read_frames(file) - np.random.shuffle(frames) - _write_frames(frames, outputs=outputs) - - -shuffle = python_app(_shuffle, executors=["default_threads"]) - - -def _split_frames( - file: FileLike, fraction: float, shuffle: bool, outputs: Sequence[File] = () -) -> None: - """Split into training and validation sets""" - assert len(outputs) == 2 - assert 0 <= fraction <= 1 - frames = _read_frames(file) - order = np.arange(len(frames)) - if shuffle: - np.random.shuffle(order) - - n_train = int(len(frames) * fraction) - train_ids = order[:n_train] - val_ids = order[n_train:] - - train = [frames[i] for i in train_ids] - val = [frames[i] for i in val_ids] - _write_frames(train, outputs=outputs[:1]) - _write_frames(val, outputs=outputs[1:]) - - -split_frames = python_app(_split_frames, executors=["default_threads"]) - - -# TODO -# TODO: think about where the parts below should live -# TODO - - -@typeguard.typechecked -def _batch_frames( - batch_size: int, - inputs: list = [], - outputs: list = [], -) -> Optional[list[Geometry]]: - """ - Split frames into batches. - - Args: - batch_size: Number of frames per batch. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. Each element should be a DataFuture - representing an output file path for each batch. - - Returns: - Optional[list[Geometry]]: List of Geometry instances if no outputs are specified. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - frame_regex = re.compile(r"^\d+$") - - data = [] - batch_index = 0 - with open(inputs[0], "r") as f: - while True: - line = f.readline() - if not line: - break - if frame_regex.match(line.strip()): - natoms = int(line.strip()) - _ = [f.readline() for _i in range(natoms + 1)] - data.append("".join([line] + _)) - if len(data) == batch_size: - with open(outputs[batch_index], "w") as g: - g.write("\n".join(data)) - data = [] - batch_index += 1 - else: # write and clear - pass - if len(data) > 0: - with open(outputs[batch_index], "w") as f: - f.write("\n".join([d.strip() for d in data if d is not None])) - f.write("\n") - batch_index += 1 - assert batch_index == len(outputs) - - -batch_frames = python_app(_batch_frames, executors=["default_threads"]) - - -# def _assign_identifier( -# state: Geometry, -# identifier: int, -# discard: bool = False, -# ) -> tuple[Geometry, int]: -# """ -# Assign an identifier to a Geometry instance. -# -# Args: -# state (Geometry): Input Geometry instance. -# identifier (int): Identifier to assign. -# discard (bool, optional): Whether to discard the state. Defaults to False. -# -# Returns: -# tuple[Geometry, int]: Updated Geometry and next available identifier. -# """ -# if (state == NullState) or discard: -# return state, identifier -# else: -# assert state.identifier is None -# state.identifier = identifier -# return state, identifier + 1 -# -# -# assign_identifier = python_app(_assign_identifier, executors=["default_threads"]) -# - - -@typeguard.typechecked -def _assign_identifiers( - identifier: Optional[int], - inputs: list = [], - outputs: list = [], -) -> int: - """ - Assign identifiers to Geometry instances in a file. - - Args: - identifier: Starting identifier value. - inputs: List of Parsl futures. The first element should be a DataFuture - representing the input file path containing geometry data. - outputs: List of Parsl futures. The first element should be a DataFuture - representing the output file path where updated geometries will be written. - - Returns: - int: Next available identifier. - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - data = _read_frames(slice(None), inputs=[inputs[0]]) - states = [] - if identifier is None: # do not assign but look for max - identifier = -1 - for geometry in data: - if geometry.identifier is not None: - identifier = max(identifier, geometry.identifier) + geom.identifier = identifier identifier += 1 - for geometry in data: # assign those which were not yet assigned - if geometry.identifier is None: - geometry, identifier = _assign_identifier(geometry, identifier) - states.append(geometry) - _write_frames(*states, outputs=[outputs[0]]) - return identifier - else: - for geometry in data: - geometry, identifier = _assign_identifier(geometry, identifier) - states.append(geometry) - _write_frames(*states, outputs=[outputs[0]]) - return identifier - -assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"]) + return states, identifier From 7a50537f2549aa5606b8b8b19bb7356adc25a122 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Fri, 10 Apr 2026 14:34:07 +0200 Subject: [PATCH 06/16] Update geometry.py - MISSING replaces None for missing values - Geometry always has numbers, positions, cell - Geometry returns MISSING or value for forces, energy, stress - other missing attributes raise --- psiflow/geometry.py | 124 ++++++++++++++++++++++++++++++-------------- 1 file changed, 84 insertions(+), 40 deletions(-) diff --git a/psiflow/geometry.py b/psiflow/geometry.py index abac90e..9c0cd29 100644 --- a/psiflow/geometry.py +++ b/psiflow/geometry.py @@ -1,8 +1,8 @@ import io import pickle +import copy from pathlib import Path -from types import SimpleNamespace -from typing import Optional, Union, Any +from typing import Optional, Union, Any, Final from collections.abc import Sequence import ase.calculators.calculator @@ -15,7 +15,24 @@ import psiflow # TODO: docstrings -# TODO: custom attributes +# TODO: custom attributes writing + +# these are always accessible for every geometry +DEFAULT_PROPERTIES = "per_atom", "cell", "energy", "stress" +PER_ATOM_FIELDS = "numbers", "positions", "forces" + + +class MissingType: + """Placeholder sentinel for missing data fields - replaces None""" + + def __repr__(self): + return "" + + def __bool__(self): + return False + + +MISSING: Final = MissingType() class PerAtom: @@ -24,6 +41,10 @@ class PerAtom: For every attribute, the first array dimension should be n_atoms """ + numbers: npt.NDArray[np.uint8] + positions: npt.NDArray[np.float64] + forces: npt.NDArray[np.float64] + def __init__( self, numbers: npt.NDArray[np.uint8], @@ -32,17 +53,20 @@ def __init__( ): self.numbers = numbers.astype(int).reshape(-1, 1) self.positions = positions - self._check(positions) for k, v in kwargs.items(): setattr(self, k, v) def __setattr__(self, name, value): - if not self._check(value): + if value is MISSING: + return # do not set MISSING values + elif not self._check(value): raise ValueError(f"Field '{name}' is not a per-atom property..") super().__setattr__(name, value) def __getattr__(self, name): # only runs if the attribute isn't found normally + if name == "forces": + return MISSING # forces should always be accessible raise AttributeError( f"'{type(self).__name__}' instance has no attribute '{name}'" ) @@ -51,7 +75,7 @@ def __len__(self) -> int: return self.numbers.shape[0] def __str__(self) -> str: - return f"PerAtom[{len(self)}](keys={set(vars(self))})" + return f"PerAtom[{len(self)}]{set(vars(self))}" def _check(self, arr: npt.NDArray) -> bool: try: @@ -68,7 +92,7 @@ def reset(self): def to_string(self) -> tuple[str, str]: symbols = [chemical_symbols[i] for i in self.numbers.flatten()] data = {"species": np.array(symbols).reshape(-1, 1), "pos": self.positions} - if hasattr(self, "forces"): + if not self.forces is MISSING: data["forces"] = self.forces # create extxyz header @@ -146,16 +170,14 @@ class Geometry: """ per_atom: PerAtom - cell: npt.NDArray - energy: Optional[float] - stress: Optional[np.ndarray] + cell: Optional[np.ndarray] + energy: float + stress: np.ndarray def __init__( self, per_atom: PerAtom, - energy: Optional[float] = None, cell: Optional[np.ndarray] = None, - stress: Optional[np.ndarray] = None, **kwargs: Any, ): """ @@ -163,29 +185,58 @@ def __init__( proceeds via the class methods """ self.per_atom = per_atom - self.energy = energy - if cell is not None: - assert cell.shape == (3, 3) self.cell = cell - if stress is not None: - assert stress.shape == (3, 3) - self.stress = stress - self.meta = SimpleNamespace(**kwargs) + + # set optional attributes + for k, v in kwargs.items(): + setattr(self, k, v) + + def __setattr__(self, name, value): + if value is MISSING: + return # do not set MISSING values + + # enforce format for DEFAULT_PROPERTIES + if name == "cell" and value is not None: + assert isinstance(value, np.ndarray) and value.shape == (3, 3) + elif name == "energy": + value = float(value) + # assert isinstance(value, float) + elif name == "stress": + assert isinstance(value, np.ndarray) and value.shape == (3, 3) + + super().__setattr__(name, value) + + def __getattr__(self, name): + # only runs if the attribute isn't found normally + if name in ("energy", "stress"): + return MISSING # should always be accessible + raise AttributeError( + f"'{type(self).__name__}' instance has no attribute '{name}'" + ) + + def attributes(self) -> dict[str, Any]: + """Return all non-MISSING instance attributes""" + + return {k: v for k, v in vars(self).items() if k != "per_atom"} def reset(self) -> None: """ Reset all computed properties of the geometry to their default values. """ self.per_atom.reset() - self.energy = None - self.stress = None + for k in ("energy", "stress"): + if hasattr(self, k): + delattr(self, k) def clean(self) -> None: """ Clean the geometry by resetting properties and removing additional information. """ - self.reset() - self.meta = SimpleNamespace() + self.per_atom.reset() + for k in self.attributes(): + if k == "cell": + continue + delattr(self, k) def align_axes(self) -> None: """ @@ -208,14 +259,13 @@ def to_string(self) -> str: """ Convert the Geometry instance to a string representation in extended XYZ format. """ - data = {k: getattr(self, k) for k in ("energy", "stress")} + data = self.attributes() + data.pop("cell") # cell needs special treatment if self.periodic: data["Lattice"] = self.cell.T # ase does Fortran ordering data["pbc"] = "T T T" else: data["pbc"] = "F F F" - data |= vars(self.meta) - data = {k: v for k, v in data.items() if v is not None} info_str = key_val_dict_to_str(data) properties, txt = self.per_atom.to_string() header = " ".join([properties, info_str]) @@ -272,7 +322,7 @@ def from_string(cls, s: str) -> Geometry: """ n_atoms, header, body = s.strip().split("\n", 2) data = key_val_str_to_dict(header) - data.pop("pbc", None) + data.pop("pbc", None) # geometry derives pbc from cell if "Lattice" in data: data["cell"] = data.pop("Lattice").T # ase does Fortran ordering else: @@ -299,8 +349,7 @@ def from_data( Create a Geometry instance from atomic numbers, positions, and cell data. """ per_atom = PerAtom(numbers.copy(), positions.copy()) - cell = cell if cell is None else cell.copy() - return Geometry(per_atom, cell=cell) + return Geometry(per_atom, cell=copy.copy(cell)) @classmethod def from_atoms(cls, atoms: Atoms) -> Geometry: @@ -316,6 +365,7 @@ def from_atoms(cls, atoms: Atoms) -> Geometry: try: data["energy"] = atoms.get_potential_energy() per_atom.forces = atoms.get_forces() + data["stress"] = atoms.get_stress(voigt=False) except ase.calculators.calculator.PropertyNotImplementedError: pass # property was not stored in calc return cls(per_atom, **data) @@ -325,15 +375,15 @@ def periodic(self) -> bool: """ Check if the geometry is periodic. """ - return np.any(self.cell) + return not self.cell is None @property - def per_atom_energy(self) -> Optional[float]: + def per_atom_energy(self) -> float | MISSING: """ Calculate the energy per atom. """ - if self.energy is None: - return None + if self.energy is MISSING: + return MISSING return self.energy / len(self) @property @@ -350,7 +400,7 @@ def atomic_masses(self) -> npt.NDArray: """ Get the atomic masses of the atoms in the geometry. """ - return np.array([atomic_masses[n] for n in self.per_atom.numbers]).flatten() + return np.array([atomic_masses[n] for n in self.numbers]) @property def numbers(self) -> npt.NDArray: @@ -504,12 +554,6 @@ def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: return hessian / get_mass_matrix(geometry) -def get_unique_numbers(states: Sequence[Geometry]) -> set[int]: - """Returns a set of unique atom numbers found across all states.""" - numbers = set(el for geom in states for el in np.unique(geom.per_atom.numbers)) - return {int(n) for n in numbers} - - def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> float: """Compute the total atomic energy based on provided single atom energies.""" total = 0 From 04e93b0fd3681706c51155679e385a375c44966d Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Fri, 10 Apr 2026 15:02:30 +0200 Subject: [PATCH 07/16] update imports + test_data --- psiflow/abandonware.py | 38 +- psiflow/data/dataset.py | 1 - psiflow/data/file.py | 1 + psiflow/data/utils.py | 11 +- psiflow/functions.py | 17 +- psiflow/hamiltonians.py | 3 +- psiflow/learning.py | 2 +- psiflow/metrics.py | 3 +- psiflow/reference/reference.py | 6 +- psiflow/sampling/ase.py | 2 +- psiflow/sampling/optimize.py | 2 +- psiflow/sampling/sampling.py | 2 +- psiflow/sampling/walker.py | 5 +- psiflow/utils/apps.py | 6 +- pyproject.toml | 4 +- tests/conftest.py | 2 +- tests/test_data.py | 610 ++++++++++++++++----------------- 17 files changed, 362 insertions(+), 353 deletions(-) diff --git a/psiflow/abandonware.py b/psiflow/abandonware.py index 5d9f9b0..91e4982 100644 --- a/psiflow/abandonware.py +++ b/psiflow/abandonware.py @@ -4,13 +4,6 @@ """ -def _check_equality(state0: Geometry, state1: Geometry) -> bool: - """Check if two Geometry instances are equal""" - return state0 == state1 - - -check_equality = python_app(_check_equality, executors=["default_threads"]) - @typeguard.typechecked def _check_distances(state: Geometry, threshold: float) -> Geometry: @@ -49,3 +42,34 @@ def _check_distances(state: Geometry, threshold: float) -> Geometry: check_distances = python_app(_check_distances, executors=["default_htex"]) + + + +def test_metrics(dataset): + # TODO: extracted from test_data.test_data_extract + + data = dataset[:2] + Dataset([NullState]) + dataset[3:5] + forces = data.get("forces", elements=["Cu"]) + reference = np.zeros((5, 4, 3)) + reference[2, :] = np.nan # ensure nan is in same place + reference[:, 0] = np.nan # ensure nan is in same place + value = compute_rmse(forces, reference) + + # last three atoms are Cu + forces = np.zeros((5, 4, 3)) + for i in range(5): + forces[i, :] = data[i].result().per_atom.forces + forces[:, 0] = np.nan + assert np.allclose( + value.result(), + compute_rmse(forces, forces * np.zeros_like(forces)).result(), + ) + unreduced = compute_rmse( + forces, forces * np.zeros_like(forces), reduce=False + ).result() + assert len(unreduced) == 5 + unreduced_ = unreduced[np.array([0, 1, 3, 4], dtype=int)] + assert np.allclose( + np.sqrt(np.mean(np.square(unreduced_))), + value.result(), + ) \ No newline at end of file diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index df79073..b5ad65c 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -26,7 +26,6 @@ assign_identifiers, ) - # TODO: do all operations need to generate a new file? diff --git a/psiflow/data/file.py b/psiflow/data/file.py index b2c3902..21bc507 100644 --- a/psiflow/data/file.py +++ b/psiflow/data/file.py @@ -267,6 +267,7 @@ def _read_wrapper( frames_in = _read_frames(file) return execute(frames_in, *args, **kwargs) + read_app = python_app(_read_wrapper, executors=["default_threads"]) extract_quantities = partial(read_app, extract) extract_quantities_per_atom = partial(read_app, extract_per_atom) diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py index 8cfa5a1..d9bb37a 100644 --- a/psiflow/data/utils.py +++ b/psiflow/data/utils.py @@ -3,6 +3,7 @@ import numpy as np from ase.data import atomic_numbers +from parsl import python_app from psiflow.geometry import ( Geometry, @@ -12,7 +13,6 @@ MISSING, ) - # TODO: some of these have side effects.. @@ -48,7 +48,7 @@ def extract(states: Sequence[Geometry], quantities: Sequence[str]) -> dict[str, return data -def insert(states: Sequence[Geometry], data: dict[str, list]) -> Sequence[Geometry]: +def insert(states: Sequence[Geometry], data: dict[str, list]) -> None: """ Insert quantities from data into Geometry instances. """ @@ -59,7 +59,6 @@ def insert(states: Sequence[Geometry], data: dict[str, list]) -> Sequence[Geomet setattr(geom.per_atom, q, v) else: setattr(geom, q, v) - return states def extract_per_atom( @@ -149,3 +148,9 @@ def assign_ids( identifier += 1 return states, identifier + + +@python_app(executors=["default_threads"]) +def check_equality(state0: Geometry, state1: Geometry) -> bool: + """Check if two Geometry instances are equal""" + return state0 == state1 diff --git a/psiflow/functions.py b/psiflow/functions.py index a461e7d..13d3ebb 100644 --- a/psiflow/functions.py +++ b/psiflow/functions.py @@ -8,13 +8,12 @@ from collections.abc import Sequence import numpy as np -from ase import Atoms from ase.calculators.calculator import Calculator from ase.data import atomic_masses from ase.units import fs, kJ, mol, nm from ase.stress import voigt_6_to_full_3x3_stress -from psiflow.geometry import Geometry, NullState, create_outputs +from psiflow.geometry import Geometry def format_output( @@ -212,7 +211,7 @@ def __call__( self, geometry: Geometry, ) -> dict[str, float | np.ndarray]: - atoms = geometry_to_atoms(geometry) + atoms = geometry.to_atoms(structural_only=True) self.calc.calculate(atoms) return format_output(geometry, **self.calc.results) @@ -243,7 +242,7 @@ def __call__( self, geometry: Geometry, ) -> dict[str, float | np.ndarray]: - atoms = geometry_to_atoms(geometry) + atoms = geometry.to_atoms(structural_only=True) self.calc.calculate(atoms) return format_output(geometry, **self.calc.results) @@ -296,13 +295,3 @@ def function_from_json(path: Union[str, Path], **kwargs) -> Function: data[key] = value function = function_cls(**data) return function - - -def geometry_to_atoms(geom: Geometry) -> Atoms: - """Only structural information""" - return Atoms( - positions=geom.per_atom.positions, - numbers=geom.per_atom.numbers, - cell=np.copy(geom.cell) if geom.periodic else None, - pbc=geom.periodic, - ) diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py index f101188..e337daa 100644 --- a/psiflow/hamiltonians.py +++ b/psiflow/hamiltonians.py @@ -11,7 +11,8 @@ from parsl.dataflow.futures import AppFuture, Future import psiflow -from psiflow.data import Computable, Dataset, aggregate_multiple, compute +from psiflow.data import Dataset +from psiflow.compute import Computable, aggregate_multiple, compute from psiflow.functions import ( EinsteinCrystalFunction, HarmonicFunction, diff --git a/psiflow/learning.py b/psiflow/learning.py index 11cf8d0..a1193c9 100644 --- a/psiflow/learning.py +++ b/psiflow/learning.py @@ -12,7 +12,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import Geometry, NullState, assign_identifier +from psiflow.geometry import Geometry#, NullState, assign_identifier from psiflow.hamiltonians import Hamiltonian, Zero from psiflow.metrics import Metrics # from psiflow.models import Model diff --git a/psiflow/metrics.py b/psiflow/metrics.py index 7b7e351..60cb99c 100644 --- a/psiflow/metrics.py +++ b/psiflow/metrics.py @@ -386,7 +386,8 @@ def _update_logs( inputs: list = [], outputs: list = [], ): - from psiflow.data.utils import _compute_rmse, _extract_quantities, _read_frames + from psiflow.data.utils import _extract_quantities, _read_frames + from psiflow.compute import _compute_rmse from psiflow.utils.io import _load_metrics, _save_metrics data0 = _read_frames(inputs=[inputs[1]]) diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py index 1a6d6d7..cfbfe10 100644 --- a/psiflow/reference/reference.py +++ b/psiflow/reference/reference.py @@ -14,8 +14,8 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import extract_quantities -from psiflow.geometry import Geometry, NullState +from psiflow.data.utils import extract +from psiflow.geometry import Geometry from psiflow.utils.apps import copy_app_future from psiflow.utils.parse import LineNotFoundError, get_task_name_id @@ -149,7 +149,7 @@ def compute( # but extract_quantities does not accept AppFuture[list[Geometry]] (yet?) future_dataset = self.compute_dataset(dataset) outputs = outputs or self.outputs - future_data = extract_quantities( + future_data = extract( tuple(outputs), None, None, inputs=[future_dataset.extxyz] ) if len(outputs) == 1: diff --git a/psiflow/sampling/ase.py b/psiflow/sampling/ase.py index 69dde69..8a7f9e2 100644 --- a/psiflow/sampling/ase.py +++ b/psiflow/sampling/ase.py @@ -7,7 +7,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import write_frames +from psiflow.data.file import write_frames from psiflow.geometry import Geometry from psiflow.hamiltonians import Hamiltonian from psiflow.utils.io import _dump_json diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py index 3d09880..a5e97df 100644 --- a/psiflow/sampling/optimize.py +++ b/psiflow/sampling/optimize.py @@ -9,7 +9,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import write_frames +from psiflow.data.file import write_frames from psiflow.geometry import Geometry from psiflow.hamiltonians import Hamiltonian, MACEHamiltonian from psiflow.sampling.sampling import ( diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index 117f8b7..8650f03 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -12,7 +12,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import read_frames +from psiflow.data.file import read_frames from psiflow.hamiltonians import Hamiltonian, MixtureHamiltonian, Zero, MACEHamiltonian from psiflow.sampling.output import ( DEFAULT_OBSERVABLES, diff --git a/psiflow/sampling/walker.py b/psiflow/sampling/walker.py index d26d243..8d9ada5 100644 --- a/psiflow/sampling/walker.py +++ b/psiflow/sampling/walker.py @@ -12,10 +12,9 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import Geometry, check_equality +from psiflow.geometry import Geometry +from psiflow.data.utils import check_equality from psiflow.hamiltonians import Hamiltonian, Zero, combine_hamiltonians - -# from psiflow.order_parameters import OrderParameter from psiflow.sampling.metadynamics import Metadynamics from psiflow.utils.apps import copy_app_future diff --git a/psiflow/utils/apps.py b/psiflow/utils/apps.py index d4e7937..134ad2a 100644 --- a/psiflow/utils/apps.py +++ b/psiflow/utils/apps.py @@ -1,6 +1,7 @@ import shutil import textwrap from typing import Any, Union +from collections.abc import Sequence from pathlib import Path import numpy as np @@ -38,8 +39,8 @@ def _compute_sum(a, b): def _copy_data_future( pass_on_exist: bool = False, - inputs: list[File] = [], - outputs: list[File] = [], + inputs: Sequence[File] = (), + outputs: Sequence[File] = () ) -> None: assert len(inputs) == 1 assert len(outputs) == 1 @@ -49,7 +50,6 @@ def _copy_data_future( shutil.copyfile(inputs[0], outputs[0]) else: # no need to copy empty file pass - return copy_data_future = python_app(_copy_data_future, executors=["default_threads"]) diff --git a/pyproject.toml b/pyproject.toml index 98e13db..4b058d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "psiflow" -version = "4.0.4" +version = "4.0.5" description = "Library for developing interatomic potentials" readme = "README.md" requires-python = ">=3.10" @@ -14,7 +14,7 @@ dependencies = [ "pyyaml>=6.0", # "numpy>=1.22.3, <2", "parsl==2026.02.23", - "prettytable", + "prettytable", # TODO: what is this used for? "psutil", # "cp2k-input-tools @ git+https://github.com/cp2k/cp2k-input-tools.git@3b9929735dcb3c8c0620a548b1fe20efecbad077", # need 2024.1 "ipi @ git+https://github.com/i-pi/i-pi.git@v3.1.10", diff --git a/tests/conftest.py b/tests/conftest.py index 58d2068..6ae4928 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -96,7 +96,7 @@ def generate_emt_cu_data(nstates, amplitude, supercell=None) -> list[ase.Atoms]: return atoms_list -@pytest.fixture +@pytest.fixture # TODO: do we need to regenerate this every test? def dataset(context) -> Dataset: data = generate_emt_cu_data(20, 0.2) data += generate_emt_cu_data(5, 0.15, supercell=np.diag([1, 2, 1])) diff --git a/tests/test_data.py b/tests/test_data.py index 738993f..e5d8155 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,4 +1,3 @@ -import copy import os import numpy as np @@ -8,18 +7,14 @@ from parsl.app.futures import DataFuture from parsl.data_provider.files import File -import psiflow -from psiflow.data import Dataset, compute_rmse -from psiflow.data.utils import ( - _read_frames, - _write_frames, - get_index_element_mask, - read_frames, -) -from psiflow.geometry import Geometry, NullState, check_equality +from psiflow.geometry import Geometry, MISSING +from psiflow.data import Dataset +from psiflow.data.file import _read_frames, _write_frames +from psiflow.data.utils import insert def test_geometry(tmp_path): + # TODO: custom per_atom field geometry = Geometry.from_data( numbers=np.arange(1, 6), positions=np.random.uniform(0, 1, size=(5, 3)), @@ -29,9 +24,7 @@ def test_geometry(tmp_path): _ = np.array(geometry.per_atom.positions) atoms = Atoms( - numbers=np.arange(1, 6), - positions=geometry.per_atom.positions, - pbc=False, + numbers=np.arange(1, 6), positions=geometry.per_atom.positions, pbc=False ) geometry_ = Geometry.from_atoms(atoms) assert geometry == geometry_ @@ -46,108 +39,99 @@ def test_geometry(tmp_path): address = geometry.per_atom.positions.ctypes.data address_ = geometry_.per_atom.positions.ctypes.data assert address != address_ # should copy - assert np.allclose( - geometry.per_atom.positions, - geometry_.per_atom.positions, - ) + assert np.allclose(geometry.per_atom.positions, geometry_.per_atom.positions) geometry.align_axes() assert geometry.per_atom.positions.ctypes.data == address # should not copy - fake_data = [] + # create atoms dataset + atoms = [] for i in range(10): - atoms = Atoms( + at = Atoms( numbers=np.arange(i + 5), positions=np.random.uniform(-3, 3, size=(i + 5, 3)), pbc=False, ) - fake_data.append(atoms) - fake_data[2].info["energy"] = 1.0 - fake_data[3].arrays["forces"] = np.random.uniform(-3, 3, size=(8, 3)) - fake_data[1].info["stdout"] = "abcd" - fake_data[6].info["logprob"] = np.array([-23.1, 22.1]) - fake_data[7].cell = np.random.uniform(-3, 4, size=(3, 3)) - fake_data[7].pbc = True - - write(tmp_path / "test.xyz", fake_data) - - data = read_frames(inputs=[str(tmp_path / "test.xyz")]).result() + atoms.append(at) + atoms[2].info["energy"] = 1.0 + atoms[3].arrays["forces"] = np.random.uniform(-3, 3, size=(8, 3)) + atoms[1].info["stdout"] = "abcd" + atoms[6].info["logprob"] = np.array([-23.1, 22.1]) + atoms[7].cell = np.random.uniform(-3, 4, size=(3, 3)) + atoms[7].pbc = True + + write(tmp_path / "atoms.xyz", atoms) + + # check reading from extxyz + data = _read_frames(tmp_path / "atoms.xyz") assert data is not None # should return when no output file is given - assert len(data) == len(fake_data) - for i in range(10): - assert np.allclose( - fake_data[i].get_positions(), - data[i].per_atom.positions, - ) - assert np.allclose( - fake_data[i].numbers, - data[i].per_atom.numbers, - ) - assert data[i] == Geometry.from_atoms(fake_data[i]) + assert len(data) == len(atoms) + for at, geom in zip(atoms, data): + assert geom == Geometry.from_atoms(at) assert data[2].energy == 1.0 assert data[2].per_atom_energy == 1.0 / 7 - assert data[3].per_atom_energy is None - assert np.allclose( - data[3].per_atom.forces, - fake_data[3].arrays["forces"], - ) + assert data[3].per_atom_energy is MISSING + assert np.allclose(data[3].per_atom.forces, atoms[3].arrays["forces"]) assert data[1].stdout == "abcd" assert type(data[6].logprob) is np.ndarray - assert np.allclose( - data[6].logprob, - fake_data[6].info["logprob"], - ) + assert np.allclose(data[6].logprob, atoms[6].info["logprob"]) assert data[7].periodic - assert np.allclose( - data[7].cell, - np.array(fake_data[7].cell), - ) - assert np.allclose( - data[6].cell, - np.zeros((3, 3)), - ) - geometry = read_frames(indices=[3], inputs=[str(tmp_path / "test.xyz")]).result()[0] + assert np.allclose(data[7].cell, atoms[7].cell.array) + assert data[6].cell is None + + geometry = _read_frames(tmp_path / "atoms.xyz", indices=[3])[0] assert geometry == data[3] assert not geometry == data[4] - # check writing - data = read_frames( - inputs=[str(tmp_path / "test.xyz")], - outputs=[File(str(tmp_path / "test_.xyz"))], - ).result() - assert data is None - - data = read_frames(inputs=[File(str(tmp_path / "test.xyz"))]).result() - data_ = read_frames(inputs=[File(str(tmp_path / "test_.xyz"))]).result() - + # check writing to extxyz + geoms = [Geometry.from_atoms(at) for at in atoms] + _write_frames(geoms, outputs=[File(tmp_path / "geometries.xyz")]) + atoms_ = read(tmp_path / "geometries.xyz", ":") + for at, at_ in zip(atoms, atoms_): + assert np.allclose(at.cell, at_.cell) + data = at.arrays | at.info | (at.calc.results if at.calc else {}) + data_ = at_.arrays | at_.info | (at_.calc.results if at_.calc else {}) + for k in data: + if k == "cell": + pass + elif isinstance(data[k], np.ndarray): + assert np.allclose(data[k], data_[k]) + else: + assert data[k] == data_[k] + + data = _read_frames(tmp_path / "atoms.xyz") + data_ = _read_frames(tmp_path / "geometries.xyz") for state, state_ in zip(data, data_): assert state == state_ assert state.energy == state_.energy - if state.stress is not None: - assert np.allclose( - state.stress, - state_.stress, - ) - assert state.delta == state_.delta - assert state.phase == state_.phase - if state.logprob is not None: - assert np.allclose(state.logprob, state_.logprob) - assert state.stdout == state_.stdout - assert state.identifier == state.identifier + assert state.stress is MISSING or np.allclose(state.stress, state_.stress) + attrs = state.attributes() + attrs_ = state_.attributes() + for val, val_ in zip(attrs.values(), attrs_.values()): + if isinstance(val, np.ndarray): + assert np.allclose(val, val_) + else: + assert val == val_ state.save(tmp_path / "geo.xyz") - state.save(str(tmp_path / "geo.xyz")) assert state == Geometry.load(tmp_path / "geo.xyz") + # default attributes have type (and shape) requirements + with pytest.raises(TypeError): + state.energy = None + with pytest.raises(AssertionError): + state.cell = [[1, 2, 3]] * 3 + def test_readwrite_cycle(dataset, tmp_path): + tmp_file = File(tmp_path / "test.xyz") + data = dataset[:4].geometries().result() - data[2].order["test"] = 324 - _write_frames(*data, outputs=[str(tmp_path / "test.xyz")]) + data[2].test = 324 + _write_frames(*data, outputs=[tmp_file]) loaded = Dataset(data) - assert "test" in loaded[2].result().order - - states = _read_frames(inputs=[str(tmp_path / "test.xyz")]) - assert "test" in states[2].order + assert hasattr(loaded[2].result(), "test") + states = _read_frames(tmp_file) + assert hasattr(states[2], "test") s = """3 energy=3.0 phase=c7eq Properties=species:S:1:pos:R:3:momenta:R:3:forces:R:3 @@ -155,19 +139,15 @@ def test_readwrite_cycle(dataset, tmp_path): O 1 2 3 4 5 6 7 8 9 F 2 3 4 5 6 7 8 9 10 """ - geometry = Geometry.from_string(s, natoms=None) + geometry = Geometry.from_string(s) assert len(geometry) == 3 assert geometry.energy == 3.0 assert geometry.phase == "c7eq" assert not geometry.periodic - assert np.all(np.logical_not(np.isnan(geometry.per_atom.forces))) - assert np.allclose( - geometry.per_atom.numbers, - np.array([6, 8, 9]), - ) + assert not np.any(np.isnan(geometry.per_atom.forces)) + assert np.allclose(geometry.numbers, np.array([6, 8, 9])) assert np.allclose( - geometry.per_atom.forces, - np.array([[6,7,8], [7,8,9], [8,9,10]]), + geometry.per_atom.forces, np.array([[6, 7, 8], [7, 8, 9], [8, 9, 10]]) ) s = """7 @@ -182,33 +162,39 @@ def test_readwrite_cycle(dataset, tmp_path): geometry = Geometry.from_string(s) assert not geometry.periodic assert len(geometry) == 7 - assert geometry.energy is None + assert geometry.energy is MISSING - geometry = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None) - geometry.stress = np.array([np.nan] * 9).reshape(3, 3) - assert "nan" not in geometry.to_string() + geom = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None) + geom.stress = np.array([np.nan] * 9).reshape(3, 3) + string = geom.to_string() + assert "nan" in string # the stress field + geom_ = Geometry.from_string(string) + assert np.isnan(geom_.stress).all() -def test_dataset_empty(tmp_path): - dataset = Dataset([]) - assert dataset.length().result() == 0 - assert isinstance(dataset.extxyz, DataFuture) - path_xyz = tmp_path / "test.xyz" - dataset.save(path_xyz) # ensure the copy is executed before assert - psiflow.wait() - assert os.path.isfile(path_xyz) - with pytest.raises(ValueError): # cannot save outside cwd - dataset.save(path_xyz.parents[3] / "test.xyz") - - -def test_dataset_append(dataset): +def test_dataset(dataset, tmp_path): + """Verifying basic dataset functionality""" l = dataset.length().result() # noqa: E741 geometries = dataset.geometries().result() assert len(geometries) == l assert type(geometries) is list assert type(geometries[0]) is Geometry for i in range(l): - assert check_equality(geometries[i], dataset[i]).result() + assert geometries[i] == dataset[i].result() + + path_xyz = tmp_path / "data.xyz" + dataset.save(path_xyz).result() + assert os.path.isfile(path_xyz) + with pytest.raises(ValueError): # cannot save outside cwd + dataset.save(path_xyz.parents[3] / "test.xyz") + + dataset = Dataset([]) + assert dataset.length().result() == 0 + assert isinstance(dataset.extxyz, DataFuture) + + +def test_dataset_append(dataset): + l = dataset.length().result() empty = Dataset([]) # use [] instead of None empty += dataset assert l == empty.length().result() @@ -216,19 +202,16 @@ def test_dataset_append(dataset): assert added.length().result() == 2 * l assert dataset.length().result() == l # must not have changed + +def test_dataset_align(dataset): # test canonical transformation transformed = dataset.align_axes() - for i in range(dataset.length().result()): - geometry = dataset[i].result() - geometry.align_axes() - assert np.allclose( - geometry.per_atom.positions, - transformed[i].result().per_atom.positions, - ) - assert np.allclose( - geometry.cell, - transformed[i].result().cell, - ) + geoms = dataset.geometries().result() + geoms_ = transformed.geometries().result() + for geom, geom_ in zip(geoms, geoms_): + geom.align_axes() + assert np.allclose(geom.per_atom.positions, geom_.per_atom.positions) + assert np.allclose(geom.cell, geom_.cell) def test_dataset_slice(dataset): @@ -255,63 +238,6 @@ def test_dataset_slice(dataset): for i, geometry in enumerate(subset.geometries().result()): assert geometry == dataset[i].result() - -def test_dataset_from_xyz(tmp_path, dataset): - path_xyz = tmp_path / "data.xyz" - dataset.save(path_xyz) - psiflow.wait() - _ = Dataset.load(path_xyz) - atoms_list = read(path_xyz, index=":") - - for i in range(dataset.length().result()): - assert np.allclose( - dataset[i].result().per_atom.positions, - atoms_list[i].get_positions(), - ) - assert dataset[i].result().energy == atoms_list[i].calc.results["energy"] - - -def test_index_element_mask(): - numbers = np.array([1, 1, 1, 6, 6, 8, 8, 8]) - elements = ["H"] - indices = [0, 1, 4, 5] - assert np.allclose( - np.array([True, True, False, False, False, False, False, False]), - get_index_element_mask(numbers, indices, elements), - ) - elements = ["H", "O"] - indices = [0, 1, 4, 5] - assert np.allclose( - np.array([True, True, False, False, False, True, False, False]), - get_index_element_mask(numbers, indices, elements), - ) - elements = ["H", "O"] - indices = [3] - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([False] * len(numbers)), - ) - elements = ["H", "O"] - indices = None - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([True, True, True, False, False, True, True, True]), - ) - elements = None - indices = [0, 1, 2, 3, 5] - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([True, True, True, True, False, True, False, False]), - ) - elements = ["Cl"] # not present at all - indices = None - assert np.allclose( - get_index_element_mask(numbers, indices, elements), - np.array([False] * len(numbers)), - ) - - -def test_dataset_gather(dataset): indices = [0, 3, 2, 6, 1] gathered = dataset[indices] assert gathered.length().result() == len(indices) @@ -322,180 +248,244 @@ def test_dataset_gather(dataset): ) +def test_dataset_from_xyz(tmp_path, dataset): + path_xyz = tmp_path / "data.xyz" + dataset.save(path_xyz).result() + atoms_list = read(path_xyz, index=":") + geoms = dataset.geometries().result() + for geom, at in zip(geoms, atoms_list): + assert np.allclose(geom.per_atom.positions, at.get_positions()) + assert geom.energy == at.calc.results["energy"] + + +# def test_index_element_mask(): +# numbers = np.array([1, 1, 1, 6, 6, 8, 8, 8]) +# elements = ["H"] +# indices = [0, 1, 4, 5] +# assert np.allclose( +# np.array([True, True, False, False, False, False, False, False]), +# get_index_element_mask(numbers, indices, elements), +# ) +# elements = ["H", "O"] +# indices = [0, 1, 4, 5] +# assert np.allclose( +# np.array([True, True, False, False, False, True, False, False]), +# get_index_element_mask(numbers, indices, elements), +# ) +# elements = ["H", "O"] +# indices = [3] +# assert np.allclose( +# get_index_element_mask(numbers, indices, elements), +# np.array([False] * len(numbers)), +# ) +# elements = ["H", "O"] +# indices = None +# assert np.allclose( +# get_index_element_mask(numbers, indices, elements), +# np.array([True, True, True, False, False, True, True, True]), +# ) +# elements = None +# indices = [0, 1, 2, 3, 5] +# assert np.allclose( +# get_index_element_mask(numbers, indices, elements), +# np.array([True, True, True, True, False, True, False, False]), +# ) +# elements = ["Cl"] # not present at all +# indices = None +# assert np.allclose( +# get_index_element_mask(numbers, indices, elements), +# np.array([False] * len(numbers)), +# ) + + def test_data_elements(dataset): - assert "H" in dataset.elements().result() - assert "Cu" in dataset.elements().result() - assert len(dataset.elements().result()) == 2 + elements = dataset.elements().result() + assert "H" in elements + assert "Cu" in elements + assert len(elements) == 2 def test_data_reset(dataset): dataset = dataset.reset() - assert dataset[0].result().energy is None + geometries = dataset.geometries().result() + for geom in geometries: + assert geom.energy is MISSING + assert geom.per_atom.forces is MISSING def test_data_split(dataset): train, valid = dataset.split(0.9) - assert ( - train.length().result() + valid.length().result() == dataset.length().result() - ) + train_geoms = train.geometries().result() + valid_geoms = valid.geometries().result() + assert len(train_geoms) + len(valid_geoms) == dataset.length().result() + for geom_t in train_geoms: + for geom_v in valid_geoms: + assert geom_t != geom_v -def test_identifier(dataset): - data = dataset + Dataset([NullState, NullState, dataset[0]]) +def test_identifier(dataset, dataset_h2): + data = dataset + dataset_h2 identifier = data.assign_identifiers(0) - assert identifier.result() == dataset.length().result() + 1 - assert identifier.result() == data.not_null().length().result() - for i in range(data.length().result()): - if not data[i].result() == NullState: - assert data[i].result().identifier < identifier.result() - # assert data[i].result().reference_status - data = data.clean() # also removes identifier - for i in range(data.length().result()): - s = data[i].result() - if not s == NullState: - assert s.identifier is None - identifier = data.assign_identifiers(10) - assert identifier.result() == 10 + data.not_null().length().result() - identifier = dataset.assign_identifiers(10) - assert ( - dataset.assign_identifiers().result() - == 10 + dataset.not_null().length().result() - ) - for i in range(dataset.length().result()): - s = dataset[i].result() - if not s == NullState: - assert s.identifier >= 10 - - identifier = data.assign_identifiers() - data = data.clean() - assert ( - data.assign_identifiers(identifier).result() - == identifier.result() + data.not_null().length().result() - ) + assert identifier.result() == data.length().result() + + data = data.clean() # removes identifier + geoms = data.geometries().result() + for geom in geoms: + assert not hasattr(geom, "identifier") + + identifier = dataset.assign_identifiers(5).result() + geoms = dataset.geometries().result() + assert identifier == len(geoms) + 5 + for geom in geoms: + assert geom.identifier < identifier + + # set initial value + geoms = dataset_h2.geometries().result() + geoms[10].identifier = 42 + + # count from 0 + data = Dataset(geoms) + identifier = data.assign_identifiers(0).result() + geoms_ = data.geometries().result() + assert identifier == len(geoms) - 1 # skips over one geometry + assert geoms_[0].identifier == 0 + assert geoms_[10].identifier == 42 + assert geoms_[-1].identifier == identifier - 1 + + # count from highest + data = Dataset(geoms) + identifier = data.assign_identifiers().result() + geoms_ = data.geometries().result() + assert identifier == len(geoms) + 42 # skips over one geometry + assert geoms_[0].identifier == 43 + assert geoms_[10].identifier == 42 + assert geoms_[-1].identifier == identifier - 1 def test_data_offset(dataset): - atomic_energies = { - "H": 34, - "Cu": 12, - } - data = dataset.subtract_offset(**atomic_energies) - data_ = data.add_offset(**atomic_energies) - for i in range(dataset.length().result()): - natoms = len(dataset[i].result()) - offset = (natoms - 1) * atomic_energies["Cu"] + atomic_energies["H"] - assert np.allclose( - data[i].result().energy, - dataset[i].result().energy - offset, - ) - assert np.allclose( # unchanged - data[i].result().per_atom.forces, - dataset[i].result().per_atom.forces, - ) - assert np.allclose( - data_[i].result().energy, - dataset[i].result().energy, - ) + atomic_energies = {"H": 34, "Cu": 12} + data0 = dataset.subtract_offset(**atomic_energies) + data1 = data0.add_offset(**atomic_energies) + + geoms = dataset.geometries().result() + geoms0 = data0.geometries().result() + geoms1 = data1.geometries().result() + + for g, g0, g1 in zip(geoms, geoms0, geoms1): + offset = (len(g) - 1) * atomic_energies["Cu"] + atomic_energies["H"] + assert np.allclose(g0.energy, g.energy - offset) + assert np.allclose(g0.per_atom.forces, g.per_atom.forces) + assert np.allclose(g1.energy, g.energy) def test_data_extract(dataset): state = dataset[0].result() - state.energy = None + state.reset() state.identifier = 0 state.delta = 6 + # check Dataset.get data = dataset[:5] + dataset[-5:] + Dataset([state]) energy, forces, identifier = data.get("energy", "forces", "identifier") energy = energy.result() forces = forces.result() identifier = identifier.result() + assert energy[-1] is MISSING and forces[-1] is MISSING + assert identifier[-1] == 0 for i, geometry in enumerate(data.geometries().result()): if geometry.energy: assert np.allclose(geometry.energy, energy[i]) - n = len(geometry) - assert np.allclose( - geometry.per_atom.forces, - forces[i][:n, :], - ) - if identifier[i].item() != -1: + try: + assert np.allclose(geometry.per_atom.forces, forces[i]) + except TypeError: + assert geometry.per_atom.forces is MISSING and forces[i] is MISSING + if hasattr(geometry, "identifier"): assert geometry.identifier == identifier[i] - if i < 10: - assert identifier[i] == -1 - assert np.isnan(np.mean(energy)) - assert np.isnan(np.mean(forces)) - psiflow.wait() - - data = dataset[:2] + Dataset([NullState]) + dataset[3:5] - forces = data.get("forces", elements=["Cu"]) - reference = np.zeros((5, 4, 3)) - reference[2, :] = np.nan # ensure nan is in same place - reference[:, 0] = np.nan # ensure nan is in same place - value = compute_rmse(forces, reference) - - # last three atoms are Cu - forces = np.zeros((5, 4, 3)) - for i in range(5): - forces[i, :] = data[i].result().per_atom.forces - forces[:, 0] = np.nan - assert np.allclose( - value.result(), - compute_rmse(forces, forces * np.zeros_like(forces)).result(), - ) - unreduced = compute_rmse( - forces, forces * np.zeros_like(forces), reduce=False - ).result() - assert len(unreduced) == 5 - unreduced_ = unreduced[np.array([0, 1, 3, 4], dtype=int)] - assert np.allclose( - np.sqrt(np.mean(np.square(unreduced_))), - value.result(), - ) - # try weirdly specific indices - forces = np.zeros((5, 4, 3)) - for i in range(5): - g = data[i].result() - if len(g) >= 4: - forces[i, 3] = data[i].result().per_atom.forces[3] - forces[i, 1] = data[i].result().per_atom.forces[1] + # check extract-insert round-trip + geoms = data.clean().geometries().result() + data_dict = {"energy": energy, "forces": forces, "identifier": identifier} + insert(geoms, data_dict) + for geom, geom_ in zip(geoms, data.geometries().result()): + e, e_ = geom.energy, geom_.energy + f, f_ = geom.per_atom.forces, geom_.per_atom.forces + assert e == e_ + if f is MISSING: + assert f_ is MISSING else: - forces[i, :] = np.nan - forces_ = data.get("forces", atom_indices=[3, 1]).result() - mask = np.invert(np.isnan(forces_)) - assert np.allclose( - forces[mask], - forces_[mask], - ) + assert np.allclose(f, f_) + if hasattr(geom, "identifier"): + assert geom.identifier == geom_.identifier + + # check Dataset.get_per_atom + geoms = data.geometries().result() + forces = [geom.per_atom.forces for geom in geoms] + numbers = [geom.numbers for geom in geoms] + + forces_ = data.get_per_atom("forces", elements=["Cu"])[0].result() + for n, f, f_ in zip(numbers, forces, forces_): + # filter on Cu + if f is MISSING: + assert f_ is MISSING + else: + assert np.allclose(f[n == 29], f_) + + ids = [0, 2] + forces_ = data.get_per_atom("forces", atom_indices=ids)[0].result() + for f, f_ in zip(forces, forces_): + # filter on ids + if f is MISSING: + assert f_ is MISSING + else: + assert np.allclose(f[ids], f_) - # check order parameters + # check custom attributes s = Geometry.from_string( """ 3 -order_distance=2.3 energy=4 +distance=2.3 energy=4 O 0 0 0 H 1 1 1 H -1 1 1 """, ) - states = [copy.deepcopy(s) for _ in range(5)] - - states[1].order = {} - states[2].order = {"some": 4.2} - states[3].order["distance"] = 2.2 + states = [s.copy() for _ in range(5)] + states[1].order = {"answer": 42} + states[2].some = 4.2 + states[3].distance = 2.2 data = Dataset(states) - some, distance = data.get("some", "distance") + some, distance, energy, order = data.get("some", "distance", "energy", "order") some = some.result() distance = distance.result() - assert np.isnan(some[1]) - assert np.allclose(some[2], 4.2) - assert np.allclose(distance[0], 2.3) - assert np.allclose(distance[3], 2.2) - assert np.isnan(distance[2]) + energy = energy.result() + order = order.result() + assert some == [MISSING, MISSING, 4.2, MISSING, MISSING] + assert distance == [2.3, 2.3, 2.3, 2.2, 2.3] + assert energy == [4, 4, 4, 4, 4] + assert order == [MISSING, {"answer": 42}, MISSING, MISSING, MISSING] def test_filter(dataset, dataset_h2): - data = dataset + dataset_h2 + Dataset([NullState]) - data = data.shuffle() - assert data.filter("cell").length().result() == dataset.length().result() - assert data.filter("energy").length().result() == dataset.length().result() - assert data.filter("forces").length().result() == dataset.length().result() + geom = dataset[0].result() + geom.flagged = True + geom.reset() + data = dataset + dataset_h2 + Dataset([geom]) + + data0 = data.shuffle() + data1 = data.filter("cell") + data2 = data.filter("energy") + data3 = data.filter("forces") + data4 = data.filter("flagged") + + l = dataset.length().result() + l_ = dataset_h2.length().result() + l0 = data0.length().result() + l1 = data1.length().result() + l2 = data2.length().result() + l3 = data3.length().result() + l4 = data4.length().result() + + assert l0 == l + l_ + 1 # all geometries + assert l0 == l1 # cell always defined + assert l2 == l and l3 == l + assert l4 == 1 From 942ac3ca194bf6c4f7496888d89a2473deecfcb3 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 15 Apr 2026 15:22:00 +0200 Subject: [PATCH 08/16] Update compute.py refactored compute functionality - ComputeResult instance instead of tuple of stacked + padded arrays - compute signature tweaked - compare_results to compute metrics between ComputeResults/loose data - functions return dict[str, list] - Hamiltonian.compute returns future of ComputeResult --- psiflow/compute.py | 543 +++++++++++++++++++++------------------------ 1 file changed, 258 insertions(+), 285 deletions(-) diff --git a/psiflow/compute.py b/psiflow/compute.py index 22461ef..8774962 100644 --- a/psiflow/compute.py +++ b/psiflow/compute.py @@ -1,336 +1,309 @@ -import math -from typing import Callable, ClassVar, Optional, Union +from typing import Callable, ClassVar, Optional, Union, Type, Any, TypeAlias +from collections.abc import Sequence +from dataclasses import dataclass import numpy as np -import typeguard from parsl.app.app import join_app, python_app from parsl.dataflow.futures import AppFuture, DataFuture import psiflow -from psiflow.geometry import Geometry +from psiflow.geometry import Geometry, PER_ATOM_FIELDS, DEFAULT_PROPERTIES, MISSING from psiflow.data import Dataset -from psiflow.data.file import batch_frames +from psiflow.functions import Function +from psiflow.utils.apps import pack +ComputeInput: TypeAlias = ( + Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry +) -def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]: - """ - Concatenate multiple lists of arrays. - Args: - *args: Lists of numpy arrays to concatenate. +@dataclass +class ComputeResult: + """Container to hold and manipulate the results from compute/apply operations.""" - Returns: - list[np.ndarray]: List of concatenated arrays. + n_atoms: np.ndarray + data: dict[str, np.ndarray] - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ + def __post_init__(self): + self.cutoffs = self.n_atoms.cumsum() - def pad_arrays( - arrays: list[np.ndarray], - pad_dimension: int = 1, - ) -> list[np.ndarray]: - ndims = np.array([len(a.shape) for a in arrays]) - assert np.all(ndims == ndims[0]) - assert np.all(pad_dimension < ndims) - - pad_size = max([a.shape[pad_dimension] for a in arrays]) - for i in range(len(arrays)): - shape = list(arrays[i].shape) - shape[pad_dimension] = pad_size - shape[pad_dimension] - padding = np.zeros(tuple(shape)) + np.nan - arrays[i] = np.concatenate((arrays[i], padding), axis=pad_dimension) - return arrays - - narrays = len(args[0]) - for arg in args: - assert isinstance(arg, list) - assert all([len(a) == narrays for a in args]) - - concatenated = [] - for i in range(narrays): - arrays = [arg[i] for arg in args] - if len(arrays[0].shape) > 1: - pad_arrays(arrays) - concatenated.append(np.concatenate(tuple(arrays))) - return concatenated - - -concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"]) - - -def _aggregate_multiple( - *arrays_list, - coefficients: Optional[np.ndarray] = None, -) -> list[np.ndarray]: - """ - Aggregate multiple lists of arrays with optional coefficients. + def __getattr__(self, item): + """Enable parsl deferred_getitem on AppFuture""" + if item == "data": + raise AttributeError() # prevent RecursionError from pickle + return self.data[item] - Args: - *arrays_list: Lists of arrays to aggregate. - coefficients: Optional coefficients for weighted aggregation. + @property + def keys(self) -> set[str]: + return set(self.data.keys()) - Returns: - list[np.ndarray]: List of aggregated arrays. + def get(self, key: str, per_geom: bool = False) -> list | np.ndarray: + arr = self.data[key] + if not per_geom: + return arr - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ + if self.cutoffs[-1] == arr.shape[0]: + data_list = np.array_split(arr, self.cutoffs[:-1]) + else: + data_list = list(arr) + return data_list + + @classmethod + def from_data(cls, n_atoms: Sequence[int], data: dict[str, list]): + values = {} + for k, v in data.items(): + assert len(v) == n_atoms.size + if k in PER_ATOM_FIELDS: + values[k] = np.concatenate(v) + elif k in DEFAULT_PROPERTIES: + values[k] = np.stack(v) + else: + # TODO: what with custom properties? + values[k] = np.stack(v) + return cls(np.array(n_atoms), values) + + +def _apply( + states: Sequence[Geometry], + function_cls: Type[Function], + inputs: Sequence = (), + parsl_resource_specification: dict = {}, + **parameters, +) -> ComputeResult: + assert function_cls is not None + function = function_cls(**parameters) # psiflow.functions.Function subclass + output_dict = function.compute(states) + n_atoms = np.array([len(geom) for geom in states]) + return ComputeResult.from_data(n_atoms, output_dict) + + +@python_app(executors=["default_threads"]) +def concatenate_results(*results: ComputeResult) -> ComputeResult: + """""" + n_atoms_list = [result.n_atoms for result in results] + n_atoms = np.concatenate(n_atoms_list) + + data = {} + for k in results[0].keys: + values = [result.get(k) for result in results] + data[k] = np.concatenate(values) + + return ComputeResult(n_atoms, data) + + +@python_app(executors=["default_threads"]) +def aggregate_results( + *results: ComputeResult, coefficients: Optional[np.ndarray] = None +) -> ComputeResult: + """""" + # TODO: what with data fields that are not shared? if coefficients is None: - coefficients = np.ones(len(arrays_list)) - else: - assert len(coefficients) == len(arrays_list) + coefficients = np.ones(len(results)) + assert len(coefficients) == len(results) - results = [np.zeros(a.shape) for a in arrays_list[0]] - for i, arrays in enumerate(arrays_list): - for j, array in enumerate(arrays): - results[j] += coefficients[i] * array - return results + n_atoms = results[0].n_atoms + for result in results[1:]: + assert np.allclose(n_atoms, result.n_atoms) + data = {k: np.zeros_like(v) for k, v in results[0].data.items()} + for i, result in enumerate(results): + for k in data: + data[k] += result.get(k) * coefficients[i] -aggregate_multiple = python_app(_aggregate_multiple, executors=["default_threads"]) + return ComputeResult(n_atoms, data) @join_app def batch_apply( - apply_apps: tuple[Union[python_app, Callable]], - arg: Union[Dataset, list[Geometry]], + states: list[Geometry], + apply_apps: Sequence[Callable], batch_size: int, - length: int, - outputs: list = [], - reduce_func: Optional[python_app] = None, - **app_kwargs, + reduce_func: Callable, ) -> AppFuture: - """ - Apply a set of apps to batches of data. + """Apply a set of apps to batches of data""" + # TODO: holds everything in memory -- what with very large datasets? + n_batch = len(states) // batch_size + 1 * (len(states) % batch_size > 0) + batches = [arr.tolist() for arr in np.array_split(states, n_batch)] - Args: - apply_apps: Tuple of PythonApps or Callables to apply. - arg: Dataset or list of Geometries to process. - batch_size: Size of each batch. - length: Total number of items to process. - outputs: List of output files. - reduce_func: Optional function to reduce results. - **app_kwargs: Additional keyword arguments for the apps. - - Returns: - AppFuture: Future representing the result of batch application. - - Note: - This function is wrapped as a Parsl join_app. - """ - nbatches = math.ceil(length / batch_size) - batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)] - future = batch_frames(batch_size, inputs=[arg.extxyz], outputs=batches) - output_futures = [] - for i in range(nbatches): + output = [] + for batch in batches: futures = [] for app in apply_apps: - f = app( - None, - inputs=[future.outputs[i]], - **app_kwargs, - ) - futures.append(f) - reduced = reduce_func(*futures) - output_futures.append(reduced) - future = concatenate_multiple(*output_futures) - return future - - -@python_app(executors=["default_threads"]) -def get_length(arg): - """ - Get the length of the input argument. - - Args: - arg: Input to get the length of. - - Returns: - int: Length of the input. + future = app(batch) + futures.append(future) + future = reduce_func(*futures) + output.append(future) - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - if isinstance(arg, list): - return len(arg) - else: - return 1 + return concatenate_results(*output) def compute( - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *apply_apps: Union[python_app, Callable], - outputs_: Union[str, list[str], tuple[str, ...], None] = None, - reduce_func: Union[python_app, Callable] = aggregate_multiple, + arg: ComputeInput, + apply_apps: Sequence[Callable], + reduce_func: Union[python_app, Callable] = aggregate_results, batch_size: Optional[int] = None, -) -> Union[list[AppFuture], AppFuture]: +) -> AppFuture: """ Compute results by applying apps to the input data. Args: arg: Input data to compute on. - *apply_apps: Apps to apply to the data. - outputs_: Names of output quantities. + apply_apps: Apps to apply to the data. reduce_func: Function to reduce results. batch_size: Optional batch size for processing. - - Returns: - Union[list[AppFuture], AppFuture]: Future(s) representing computation results. """ - if type(outputs_) is str: - outputs_ = [outputs_] - if batch_size is not None: - if isinstance(arg, Dataset): - length = arg.length() - else: - length = get_length(arg) - # convert to Dataset for convenience - arg = Dataset(arg) - future = batch_apply( - apply_apps, - arg, - batch_size, - length, - outputs_=outputs_, - reduce_func=reduce_func, - ) - else: + states = input_to_geometries(arg) + if batch_size is None: futures = [] - if isinstance(arg, Dataset): - for app in apply_apps: - future = app( - None, - outputs_=outputs_, - inputs=[arg.extxyz], - ) - futures.append(future) - else: - for app in apply_apps: - future = app( - arg, - outputs_=outputs_, - inputs=[], - ) - futures.append(future) + for app in apply_apps: + future = app(states) + futures.append(future) future = reduce_func(*futures) - if len(outputs_) == 1: - return future[0] else: - return [future[i] for i in range(len(outputs_))] - - -class Computable: - """ - Base class for computable objects. - - Attributes: - outputs (ClassVar[tuple[str, ...]]): Names of output quantities. - batch_size (ClassVar[Optional[int]]): Default batch size for computation. - """ - - outputs: ClassVar[tuple[str, ...]] = () - batch_size: ClassVar[Optional[int]] = None - - def compute( - self, - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *outputs: Optional[str], - batch_size: Optional[int] = -1, # if -1: take class default - ) -> Union[list[AppFuture], AppFuture]: - """ - Compute results for the given input. - - Args: - arg: Input data to compute on. - outputs: Names of output quantities. - batch_size: Batch size for computation. - - Returns: - Union[list[AppFuture], AppFuture]: Future(s) representing computation results. - """ - raise NotImplementedError - - -@typeguard.typechecked -def _compute_rmse( - array0: np.ndarray, - array1: np.ndarray, - reduce: bool = True, -) -> Union[float, np.ndarray]: - """ - Compute the Root Mean Square Error (RMSE) between two arrays. + future = batch_apply(states, apply_apps, batch_size, reduce_func) - Args: - array0: First array. - array1: Second array. - reduce: Whether to reduce the result to a single value. - - Returns: - Union[float, np.ndarray]: RMSE value(s). - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert array0.shape == array1.shape - assert np.all(np.isnan(array0) == np.isnan(array1)) - - se = (array0 - array1) ** 2 - se = se.reshape(se.shape[0], -1) - - if reduce: # across both dimensions - mask = np.logical_not(np.isnan(se)) - return float(np.sqrt(np.mean(se[mask]))) - else: # retain first dimension - if se.ndim == 1: - return se - else: - values = np.empty(len(se)) - for i in range(len(se)): - if np.all(np.isnan(se[i])): - values[i] = np.nan - else: - mask = np.logical_not(np.isnan(se[i])) - value = np.sqrt(np.mean(se[i][mask])) - values[i] = value - return values + return future -compute_rmse = python_app(_compute_rmse, executors=["default_threads"]) +metric_func_map: dict[str, Callable] = { + "RMSE": lambda arr1, arr2: np.mean((arr1 - arr2) ** 2) ** 0.5, + "MAE": lambda arr1, arr2: np.abs(arr1 - arr2).mean(), +} -@typeguard.typechecked -def _compute_mae( - array0, - array1, +def _compare_results( + result1: ComputeResult, + result2: Optional[ComputeResult] = None, + metric: str = "RMSE", reduce: bool = True, -) -> Union[float, np.ndarray]: - """ - Compute the Mean Absolute Error (MAE) between two arrays. - - Args: - array0: First array. - array1: Second array. - reduce: Whether to reduce the result to a single value. - - Returns: - Union[float, np.ndarray]: MAE value(s). - - Note: - This function is wrapped as a Parsl app and executed using the default_threads executor. - """ - assert array0.shape == array1.shape - mask0 = np.logical_not(np.isnan(array0)) - mask1 = np.logical_not(np.isnan(array1)) - assert np.all(mask0 == mask1) - ae = np.abs(array0 - array1) - to_reduce = tuple(range(1, array0.ndim)) - mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce)) - ae = ae[mask0].reshape(np.sum(1 * mask), -1) - if reduce: # across both dimensions - return float(np.sqrt(np.mean(ae))) - else: # retain first dimension - return np.sqrt(np.mean(ae, axis=1)) - - -compute_mae = python_app(_compute_mae, executors=["default_threads"]) + **kwargs: list | np.ndarray, +) -> dict: + """""" + # TODO: what with missing values? + assert (result2 is None) != (len(kwargs) == 0) # xor + if kwargs: + result2 = ComputeResult.from_data(result1.n_atoms, kwargs) + elif not np.allclose(result1.cutoffs, result2.cutoffs): + raise ValueError("Results cannot be compared") + + metric_func = metric_func_map[metric] + keys = result1.keys.intersection(result2.keys) + + out = {} + for k in keys: + if reduce: + arr1, arr2 = result1.get(k), result2.get(k) + out[k] = metric_func(arr1, arr2) + else: + list1, list2 = result1.get(k, per_geom=True), result2.get(k, per_geom=True) + out[k] = [metric_func(v1, v2) for v1, v2 in zip(list1, list2)] + + return out + + +compare_results = python_app(_compare_results, executors=["default_threads"]) + + +# def _compute_rmse( +# array0: np.ndarray, +# array1: np.ndarray, +# reduce: bool = True, +# ) -> Union[float, np.ndarray]: +# """ +# Compute the Root Mean Square Error (RMSE) between two arrays. +# +# Args: +# array0: First array. +# array1: Second array. +# reduce: Whether to reduce the result to a single value. +# +# Returns: +# Union[float, np.ndarray]: RMSE value(s). +# +# Note: +# This function is wrapped as a Parsl app and executed using the default_threads executor. +# """ +# assert array0.shape == array1.shape +# assert np.all(np.isnan(array0) == np.isnan(array1)) +# +# se = (array0 - array1) ** 2 +# se = se.reshape(se.shape[0], -1) +# +# if reduce: # across both dimensions +# mask = np.logical_not(np.isnan(se)) +# return float(np.sqrt(np.mean(se[mask]))) +# else: # retain first dimension +# if se.ndim == 1: +# return se +# else: +# values = np.empty(len(se)) +# for i in range(len(se)): +# if np.all(np.isnan(se[i])): +# values[i] = np.nan +# else: +# mask = np.logical_not(np.isnan(se[i])) +# value = np.sqrt(np.mean(se[i][mask])) +# values[i] = value +# return values +# +# +# compute_rmse = python_app(_compute_rmse, executors=["default_threads"]) +# +# +# def _compute_mae( +# array0, +# array1, +# reduce: bool = True, +# ) -> Union[float, np.ndarray]: +# """ +# Compute the Mean Absolute Error (MAE) between two arrays. +# +# Args: +# array0: First array. +# array1: Second array. +# reduce: Whether to reduce the result to a single value. +# +# Returns: +# Union[float, np.ndarray]: MAE value(s). +# +# Note: +# This function is wrapped as a Parsl app and executed using the default_threads executor. +# """ +# assert array0.shape == array1.shape +# mask0 = np.logical_not(np.isnan(array0)) +# mask1 = np.logical_not(np.isnan(array1)) +# assert np.all(mask0 == mask1) +# ae = np.abs(array0 - array1) +# to_reduce = tuple(range(1, array0.ndim)) +# mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce)) +# ae = ae[mask0].reshape(np.sum(1 * mask), -1) +# if reduce: # across both dimensions +# return float(np.sqrt(np.mean(ae))) +# else: # retain first dimension +# return np.sqrt(np.mean(ae, axis=1)) +# +# +# compute_mae = python_app(_compute_mae, executors=["default_threads"]) + + +def input_to_geometries(data: ComputeInput) -> AppFuture: + """Convert ComputeInput into a sequence of geometries (as a future)""" + # Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry + + @python_app(executors=["default_threads"]) + def prep_input(data: Geometry | Sequence[Geometry]) -> list[Geometry]: + # make sure apply apps get consistent input + if isinstance(data, Geometry): + data = [data] + return data + + if isinstance(data, Dataset): + return data.geometries() + elif isinstance(data, Geometry): + return pack(data) + elif isinstance(data, AppFuture): + return prep_input(data) + elif isinstance(data, list): + return pack(*data) + else: + assert False From 54c4808e80b28e1b67927acb3933bfb411a9008a Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 15 Apr 2026 16:22:10 +0200 Subject: [PATCH 09/16] update Hamiltonian API - Hamiltonian.compute returns ComputeResult - Hamiltonian.evaluate labels a dataset - moving towards deprecating Dataset.evaluate --- psiflow/compute.py | 12 +++++++ psiflow/data/dataset.py | 1 + psiflow/functions.py | 50 +++++++---------------------- psiflow/hamiltonians.py | 71 +++++++++++++++++------------------------ 4 files changed, 53 insertions(+), 81 deletions(-) diff --git a/psiflow/compute.py b/psiflow/compute.py index 8774962..de26fb5 100644 --- a/psiflow/compute.py +++ b/psiflow/compute.py @@ -9,6 +9,7 @@ import psiflow from psiflow.geometry import Geometry, PER_ATOM_FIELDS, DEFAULT_PROPERTIES, MISSING from psiflow.data import Dataset +from psiflow.data.utils import insert from psiflow.functions import Function from psiflow.utils.apps import pack @@ -48,6 +49,9 @@ def get(self, key: str, per_geom: bool = False) -> list | np.ndarray: data_list = list(arr) return data_list + def to_dict(self) -> dict[str, list]: + return {k: self.get(k, per_geom=True) for k in self.keys} + @classmethod def from_data(cls, n_atoms: Sequence[int], data: dict[str, list]): values = {} @@ -204,6 +208,7 @@ def _compare_results( compare_results = python_app(_compare_results, executors=["default_threads"]) +# TODO: cleanup # def _compute_rmse( # array0: np.ndarray, # array1: np.ndarray, @@ -307,3 +312,10 @@ def prep_input(data: Geometry | Sequence[Geometry]) -> list[Geometry]: return pack(*data) else: assert False + + +@python_app(executors=["default_threads"]) +def insert_results(states: Sequence[Geometry], result: ComputeResult) -> Sequence[Geometry]: + """""" + insert(states, result.to_dict()) + return states diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index b5ad65c..1d0e8da 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -197,6 +197,7 @@ def get_per_atom( ) return tuple(future_dict[q] for q in quantities) + # TODO: cleanup? # def evaluate( # self, computable: "Computable", batch_size: Optional[int] = None # ) -> Dataset: diff --git a/psiflow/functions.py b/psiflow/functions.py index 13d3ebb..8d22978 100644 --- a/psiflow/functions.py +++ b/psiflow/functions.py @@ -4,7 +4,7 @@ import logging from dataclasses import dataclass, field from pathlib import Path -from typing import Optional, Type, Union, Any +from typing import Optional, Union from collections.abc import Sequence import numpy as np @@ -42,18 +42,15 @@ def __call__( def compute( self, - geometries: list[Geometry], - ) -> dict[str, float | np.ndarray]: + geometries: Sequence[Geometry], + ) -> dict[str, list[float | np.ndarray]]: """Evaluate multiple geometries and merge data into single arrays""" - value, grad_pos, grad_cell = create_outputs(self.outputs, geometries) + data = {k: [] for k in self.outputs} for i, geometry in enumerate(geometries): - if geometry == NullState: # TODO: remove - continue out = self(geometry) - value[i] = out["energy"] - grad_pos[i, : len(geometry)] = out["forces"] - grad_cell[i] = out["stress"] - return {"energy": value, "forces": grad_pos, "stress": grad_cell} + for k, v in out.items(): + data[k].append(v) + return data @dataclass(frozen=True) @@ -144,7 +141,7 @@ def __call__( @staticmethod def _geometry_to_key(geometry: Geometry) -> tuple: - return tuple([geometry.periodic]) + tuple(geometry.per_atom.numbers) + return tuple([geometry.periodic]) + tuple(geometry.numbers) @dataclass(frozen=True) @@ -153,7 +150,7 @@ def __call__( self, geometry: Geometry, ) -> dict[str, float | np.ndarray]: - return format_output(geometry, 0) + return format_output(geometry, 0.0) @dataclass(frozen=True) @@ -221,7 +218,7 @@ class DispersionFunction(Function): method: str damping: str = "d3bj" params_tweaks: Optional[dict[str, float]] = None - num_threads: int = 1 + num_threads: int = 4 calc: Calculator = field(init=False) @@ -247,32 +244,7 @@ def __call__( return format_output(geometry, **self.calc.results) -def _apply( - arg: Union[Geometry, list[Geometry], None], - outputs_: tuple[str, ...], - function_cls: Type[Function], - wait_for: Any = None, - inputs: Sequence = (), - parsl_resource_specification: dict = {}, - **parameters, -) -> Optional[list[np.ndarray]]: - from psiflow.data.utils import _read_frames - - # TODO: why pass geoms through arg or inputs? - # should we not have a single keyword for input structures? - # or at least reserve inputs for arbitrary futures (to wait on)? - - assert function_cls is not None - if arg is None: - states = _read_frames(inputs=[inputs[0]]) - elif not isinstance(arg, list): - states = [arg] - else: - states = arg - function = function_cls(**parameters) - output_dict = function.compute(states) - output_arrays = [output_dict[k] for k in outputs_] - return output_arrays + def function_from_json(path: Union[str, Path], **kwargs) -> Function: diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py index e337daa..b90e7ca 100644 --- a/psiflow/hamiltonians.py +++ b/psiflow/hamiltonians.py @@ -12,7 +12,15 @@ import psiflow from psiflow.data import Dataset -from psiflow.compute import Computable, aggregate_multiple, compute +from psiflow.data.file import write_frames +from psiflow.compute import ( + aggregate_results, + compute, + _apply, + ComputeInput, + ComputeResult, + insert_results, +) from psiflow.functions import ( EinsteinCrystalFunction, HarmonicFunction, @@ -20,18 +28,12 @@ PlumedFunction, ZeroFunction, DispersionFunction, - _apply, ) from psiflow.geometry import Geometry from psiflow.utils._plumed import remove_comments_printflush from psiflow.utils.apps import get_attribute from psiflow.utils.io import dump_json - -# TODO: comparison logic in __eq__ only works for hamiltonians without futures -# TODO: dataclasses automatically generate __eq__ - - logger = logging.getLogger(__name__) # logging per module @@ -40,23 +42,24 @@ apply_modelevaluation = python_app(_apply, executors=["ModelEvaluation"]) -# TODO: why have the Computable class? -class Hamiltonian(Computable): - batch_size = 1000 +# TODO: evaluate method for datasets? +class Hamiltonian: outputs: ClassVar[tuple] = ("energy", "forces", "stress") function_name: ClassVar[str] - def compute( - self, - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - *outputs: Optional[str], - batch_size: Optional[int] = -1, # if -1: take class default TODO: why? - ) -> Union[list[AppFuture], AppFuture]: - if len(outputs) == 0: - outputs = tuple(self.__class__.outputs) - if batch_size == -1: - batch_size = self.__class__.batch_size - return compute(arg, self.get_app(), outputs_=outputs, batch_size=batch_size) + def compute(self, arg: ComputeInput, batch_size: int = 1000) -> AppFuture: + """Evaluate hamiltonian function on input data and return ComputeResult""" + return compute(arg, [self.get_app()], batch_size=batch_size) + + def evaluate(self, arg: Dataset, batch_size: int = 1000) -> Dataset: + """Return new dataset labelled with hamiltonian function""" + geoms = arg.geometries() + result = self.compute(geoms, batch_size) + future = insert_results(geoms, result) + + file = psiflow.context().new_file("data_", ".xyz") + extxyz = write_frames(future, outputs=[file]).outputs[0] + return Dataset(extxyz=extxyz) def __eq__(self, other: "Hamiltonian") -> bool: raise NotImplementedError @@ -108,25 +111,12 @@ def __init__( self.hamiltonians = list(hamiltonians) self.coefficients = list(coefficients) - def compute( # override compute for efficient batching - self, - arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry], - outputs: Union[str, list[str], None] = None, - batch_size: Optional[int] = None, - ) -> Union[list[AppFuture], AppFuture]: - if outputs is None: - outputs = list(self.__class__.outputs) + def compute(self, arg: ComputeInput, batch_size: int = 1000) -> AppFuture: apply_apps = [h.get_app() for h in self.hamiltonians] reduce_func = partial( - aggregate_multiple, coefficients=np.array(self.coefficients) - ) - return compute( - arg, - *apply_apps, - outputs_=outputs, - reduce_func=reduce_func, - batch_size=batch_size, + aggregate_results, coefficients=np.array(self.coefficients) ) + return compute(arg, apply_apps, reduce_func=reduce_func, batch_size=batch_size) def __eq__(self, hamiltonian: Hamiltonian) -> bool: if type(hamiltonian) is not MixtureHamiltonian: @@ -224,9 +214,6 @@ def serialize(self, **kwargs) -> list[DataFuture]: class Zero(Hamiltonian): function_name: ClassVar[str] = "ZeroFunction" - def __init__(self): - pass - def get_app(self) -> Callable: return partial(apply_threads, function_cls=ZeroFunction) @@ -312,7 +299,7 @@ def get_app(self) -> Callable: return partial( apply_htex, function_cls=PlumedFunction, - wait_for=self.external, + inputs=[self.external], **self.parameters(), ) @@ -429,7 +416,7 @@ def get_app(self) -> Callable: return partial( apply_modelevaluation, function_cls=MACEFunction, - wait_for=self.external, + inputs=[self.external], parsl_resource_specification=evaluation.wq_resources(1), **self.parameters(include_env=True), ) From bb98a49e302830adb38b97cf1512f9d370213515 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 15 Apr 2026 16:23:15 +0200 Subject: [PATCH 10/16] Update test_function.py --- tests/test_function.py | 256 +++++++++++++++++++++++------------------ 1 file changed, 145 insertions(+), 111 deletions(-) diff --git a/tests/test_function.py b/tests/test_function.py index 39ec1ca..22b305e 100644 --- a/tests/test_function.py +++ b/tests/test_function.py @@ -5,11 +5,14 @@ from parsl.data_provider.files import File # type: ignore import psiflow +from psiflow.geometry import MISSING from psiflow.functions import ( EinsteinCrystalFunction, HarmonicFunction, MACEFunction, PlumedFunction, + DispersionFunction, + ZeroFunction, function_from_json, ) from psiflow.hamiltonians import ( @@ -18,6 +21,7 @@ MixtureHamiltonian, PlumedHamiltonian, D3Hamiltonian, + MACEHamiltonian, Zero, ) from psiflow.utils._plumed import remove_comments_printflush, set_path_in_plumed @@ -25,34 +29,39 @@ from psiflow.utils.io import dump_json from psiflow.utils.apps import get_attribute from psiflow.serialization import CLS_KEY, deserialize_hook +from psiflow.compute import compare_results def test_einstein_crystal(dataset): future_geom = dataset[0] + geom = future_geom.result() test_dataset = dataset[:4].reset() - geometries = test_dataset.geometries().result() + test_geoms = test_dataset.geometries().result() function = EinsteinCrystalFunction( - force_constant=1.0, centers=future_geom.result().per_atom.positions + force_constant=1.0, centers=geom.per_atom.positions, volume=geom.volume ) hamiltonian = EinsteinCrystal.from_geometry(future_geom, force_constant=1.0) - energy, forces, stress = function.compute(geometries).values() - forces1, stress1, energy1 = hamiltonian.compute( - test_dataset, "forces", "stress", "energy" - ) - forces2 = hamiltonian.compute(test_dataset, "forces", batch_size=3) - - assert np.all(energy >= 0) - assert energy[0] == 0 - assert geometries[0].energy is None - assert np.allclose( # forces point to centers - forces, - function.centers.reshape(1, -1, 3) - test_dataset.get("positions").result(), - ) - assert np.allclose(energy1.result(), energy) - assert np.allclose(forces1.result(), forces) - assert np.allclose(forces1.result(), forces2.result()) + out0 = function.compute(test_geoms) + assert all(e >= 0 for e in out0["energy"]) + assert out0["energy"][0] == 0 + assert test_geoms[0].energy is MISSING + positions = test_dataset.get("positions")[0].result() + centers = function.centers.reshape(1, -1, 3) + for f, p in zip(out0["forces"], positions): + assert np.allclose(f, centers - p) # forces point to centers + + out1 = hamiltonian.compute(test_dataset) + out2 = hamiltonian.compute(test_dataset, batch_size=3) + + error1 = compare_results(out1, **out0) + error2 = compare_results(out1, out2) + + for k, rmse in error1.result().items(): + assert np.isclose(rmse, 0.0) + for k, rmse in error2.result().items(): + assert np.isclose(rmse, 0.0) def test_einstein_force(dataset): @@ -72,7 +81,9 @@ def test_einstein_force(dataset): forces_[i, j] = (-1.0) * sign * force_constant * delta forces.append(forces_) - forces_ = einstein.compute(geometries, "forces").result() + # per atom arrays are concatenated - not stacked + forces_ = einstein.compute(geometries).result().forces + forces_ = forces_.reshape(-1, len(reference), 3) assert np.allclose(forces, forces_) @@ -91,9 +102,7 @@ def test_get_filename_hills(): plumed_input = remove_comments_printflush(plumed_input) plumed_input = set_path_in_plumed(plumed_input, "METAD", "/tmp/my_input") plumed_input = set_path_in_plumed(plumed_input, "METADD", "/tmp/my_input") - assert ( - plumed_input.strip() - == """ + assert plumed_input.strip() == """ RESTART UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: VOLUME @@ -101,12 +110,12 @@ def test_get_filename_hills(): METAD ARG=CV0 SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad FILE=/tmp/my_input sdld METADD ARG=CV SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad sdld FILE=/tmp/my_input """.strip() - ) def test_plumed_function(tmp_path, dataset, dataset_h2): data = dataset + dataset_h2 geometries = data.geometries().result() + f = 1 / (kJ / mol) * 10 # eV --> kJ/mol and nm --> A plumed_str = """ D1: DISTANCE ATOMS=1,2 NOPBC CV: BIASVALUE arg=D1 @@ -114,12 +123,13 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): function = PlumedFunction(plumed_str) outputs = function.compute(geometries) - f = 1 / (kJ / mol) * 10 # eV --> kJ/mol and nm --> A - positions = data.get("positions").result() - manual = np.linalg.norm(positions[:, 0, :] - positions[:, 1, :], axis=1) - assert np.allclose(outputs["energy"] * f, manual) - gradient = (positions[:, 0, :] - positions[:, 1, :]) / manual.reshape(-1, 1) - assert np.allclose(outputs["forces"][:, 0, :] * f, gradient * (-1.0)) + [positions] = data.get("positions") + manual = np.array([np.linalg.norm(p[0] - p[1]) for p in positions.result()]) + gradient = [(p[0] - p[1]) / d for p, d in zip(positions.result(), manual)] + assert np.allclose(np.array(outputs["energy"]) * f, manual) + for grad, force in zip(gradient, outputs["forces"]): + # only compare first force component + assert np.allclose(force[0] * f, grad * (-1.0)) # test volume bias geometries = dataset.geometries().result() @@ -132,12 +142,12 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): function = PlumedFunction(plumed_input) hamiltonian = PlumedHamiltonian(plumed_input) - energy, forces, stress = function.compute(geometries).values() - energy_, forces_, stress_ = hamiltonian.compute(dataset) + data = function.compute(geometries) + out = hamiltonian.compute(dataset).result() energy_manual = (volumes - 50) ** 2 * (kJ / mol) / 2 - assert np.allclose(energy, energy_manual) - assert np.allclose(energy, energy_.result()) - assert np.allclose(stress, stress_.result()) + assert np.allclose(data["energy"], energy_manual) + assert np.allclose(data["energy"], out.energy) + assert np.allclose(data["stress"], out.stress) # use external grid as bias, check that file is read test_set = dataset[:10] @@ -154,22 +164,19 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: DISTANCE ATOMS=1,2 NOPBC METAD ARG=CV PACE=1 SIGMA=3 HEIGHT=342 FILE={} -""".format( - path_hills - ) +""".format(path_hills) hamiltonian = PlumedHamiltonian(plumed_input, File(path_hills)) - energy = hamiltonian.compute(test_set, "energy") + energy = hamiltonian.compute(test_set).result().energy # compute bias energy manually - positions = test_set.get("positions").result() - vecs = positions[:, 0, :] - positions[:, 1, :] - distance = np.linalg.norm(vecs, axis=1).reshape(-1, 1) sigma = 2 * np.ones((1, 2)) height = np.array([70, 70]).reshape(1, -1) * (kJ / mol) # unit consistency center = np.array([2.5, 2.6]).reshape(1, -1) - energy_per_hill = height * np.exp((distance - center) ** 2 / (-2 * sigma**2)) + positions = test_set.get("positions")[0].result() + dists = np.array([np.linalg.norm(p[0] - p[1]) for p in positions]).reshape(-1, 1) + energy_per_hill = height * np.exp((dists - center) ** 2 / (-2 * sigma**2)) energy_ = np.sum(energy_per_hill, axis=1) - assert np.allclose(energy.result(), energy_, atol=1e-3) + assert np.allclose(energy, energy_, atol=1e-3) # check that hills file didn't change with open(path_hills, "r") as f: @@ -177,7 +184,7 @@ def test_plumed_function(tmp_path, dataset, dataset_h2): def test_harmonic_function(dataset): - test_set = dataset[:10] + test_set = dataset[:5] geometries = test_set.geometries().result() reference = geometries[0] hess = np.eye(3 * len(reference)) @@ -185,30 +192,59 @@ def test_harmonic_function(dataset): function = HarmonicFunction(reference.per_atom.positions, hess, reference.energy) einstein = EinsteinCrystalFunction(1.0, reference.per_atom.positions) harmonic = Harmonic.from_geometry(dataset[0], hess) + assert Harmonic.outputs == ("energy", "forces", "stress") - energy, forces, _ = function.compute(geometries).values() - energy1, forces1, _ = einstein.compute(geometries).values() - energy2, forces2, _ = harmonic.compute(test_set) + out0 = function.compute(geometries) + out1 = einstein.compute(geometries) + result = harmonic.compute(test_set) + err0 = compare_results(result, **out0).result() + err1 = compare_results(result, **out1).result() - assert Harmonic.outputs == ("energy", "forces", "stress") - assert np.allclose(energy - reference.energy, energy1) - assert np.allclose(forces, forces1) - assert np.allclose(energy2.result() - reference.energy, energy1, atol=1e-5) - assert np.allclose(forces2.result(), forces1) + assert np.allclose(np.array(out0["energy"]) - reference.energy, out1["energy"]) + assert np.isclose(err0["energy"], 0) + assert np.isclose(err1["energy"], reference.energy) + assert np.allclose(out0["forces"], out1["forces"]) + assert np.isclose(err0["forces"], 0) + assert np.isclose(err1["forces"], 0) def test_dispersion_function(dataset): + test_set = dataset[:3] + function = DispersionFunction(method="pbe", damping="d3bj") hamiltonian = D3Hamiltonian(method="pbe", damping="d3bj") - energy = hamiltonian.compute(dataset[-1], "energy").result() - assert energy is not None - assert energy < 0.0 # dispersion is attractive - subset = dataset[:3] - data = subset.evaluate(hamiltonian) - energy, forces = hamiltonian.compute(subset, "energy", "forces") + out = function.compute(test_set.geometries().result()) + result = hamiltonian.compute(test_set) + error = compare_results(result, **out) + for k, rmse in error.result().items(): + assert np.isclose(rmse, 0.0) + assert np.all(np.array(out["energy"]) < 0.0) # dispersion is attractive + + +def test_zero(dataset): + test_set = dataset[:10] + function = ZeroFunction() + hamiltonian = Zero() + + out = function.compute(test_set.geometries().result()) + result = hamiltonian.compute(test_set).result() + error = compare_results(result, **out) + for k, rmse in error.result().items(): + assert np.isclose(rmse, 0.0) + assert np.allclose(result.energy, 0.0) + assert np.allclose(result.forces, 0.0) - assert np.allclose(data.get("energy").result(), energy.result()) - assert len(forces.result().shape) == 3 + +def test_mace(dataset, mace_foundation): + test_set = dataset[:3] + function = MACEFunction(mace_foundation, 1, "cpu", "float32") + hamiltonian = MACEHamiltonian(mace_foundation) + + out = function.compute(test_set.geometries().result()) + result = hamiltonian.compute(test_set).result() + error = compare_results(result, **out) + for k, rmse in error.result().items(): + assert np.isclose(rmse, 0.0, atol=1e-6) # single precision def test_hamiltonian_arithmetic(dataset): @@ -234,11 +270,11 @@ def test_hamiltonian_arithmetic(dataset): zero = Zero() assert hamiltonian == hamiltonian + zero assert 2 * hamiltonian + zero == 2 * hamiltonian - scaled_m = 0.5 * hamiltonian + scaled_h = 0.5 * hamiltonian scaled_h1 = EinsteinCrystal.from_geometry(future, 0.5) - assert len(scaled_m) == 1 - assert scaled_m.get_coefficient(hamiltonian) == 0.5 - assert scaled_m.get_coefficient(scaled_h1) is None + assert len(scaled_h) == 1 + assert scaled_h.get_coefficient(hamiltonian) == 0.5 + assert scaled_h.get_coefficient(scaled_h1) is None scaled_h2 = EinsteinCrystal.from_geometry(future, 4.0) mixture = hamiltonian + scaled_h2 assert len(mixture) == 2 @@ -246,32 +282,26 @@ def test_hamiltonian_arithmetic(dataset): assert mixture.get_coefficients(mixture) == (1, 1) assert mixture.get_coefficients(hamiltonian + scaled_h1) is None - energy_m = scaled_m.compute(test_set, "energy") - energy_h1 = scaled_h1.compute(test_set, "energy") - energy, forces, _ = hamiltonian.compute(test_set) - energy_, forces_, _ = mixture.compute(test_set) - assert np.allclose(energy_m.result(), energy_h1.result()) - assert np.allclose(energy_.result(), 5 * energy.result()) - assert np.allclose(forces_.result(), 5 * forces.result()) + result_h = scaled_h.compute(test_set) + result_h1 = scaled_h1.compute(test_set) + error = compare_results(result_h, result_h1) + out = hamiltonian.compute(test_set) + out_ = mixture.compute(test_set) + error, out, out_ = error.result(), out.result(), out_.result() - energy, forces, stress = zero.compute(test_set) - assert np.allclose(energy.result(), 0.0) - geometries = [dataset[i].result() for i in [0, -1]] - natoms = [len(geometry) for geometry in geometries] - forces = zero.compute(geometries, "forces", batch_size=1).result() - assert np.all(forces[0, : natoms[0]] == 0.0) - assert np.all(forces[-1, : natoms[1]] == 0.0) + for k, v in error.items(): + assert np.isclose(v, 0) + assert np.allclose(out.energy * 5, out_.energy) + assert np.allclose(out.forces * 5, out_.forces) - -def test_subtract(dataset): - einstein = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0) + # check subtraction + einstein = EinsteinCrystal.from_geometry(future, force_constant=1.0) h = einstein - einstein assert isinstance(h, MixtureHamiltonian) assert np.allclose(h.coefficients, 0.0) def test_hamiltonian_serialize(dataset): - data = json.loads(psiflow.serialize(Zero()).result()) assert data[CLS_KEY] == "Zero" zero = psiflow.deserialize(json.dumps(data)).result() @@ -281,9 +311,7 @@ def test_hamiltonian_serialize(dataset): UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: VOLUME RESTRAINT ARG=CV AT={center} KAPPA={kappa} - """.format( - center=100, kappa=1 / (kJ / mol) - ) + """.format(center=100, kappa=1 / (kJ / mol)) plumed = PlumedHamiltonian(plumed_input) einstein = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0) @@ -308,27 +336,12 @@ def test_hamiltonian_serialize(dataset): assert h == h_ test_set = dataset[:10] - energy_e = einstein.compute(test_set, "energy") - energy_e_ = einstein_.compute(test_set, "energy") - energy_m = mixed.compute(test_set, "energy") - energy_m_ = mixed_.compute(test_set, "energy") - assert np.allclose(energy_e.result(), energy_e_.result()) - assert np.allclose(energy_m.result(), energy_m_.result()) - - -def test_evaluate(dataset): - hamiltonian = EinsteinCrystal.from_geometry(geometry=dataset[0], force_constant=1.0) - data = dataset[:10].reset() - evaluated = data.evaluate(hamiltonian, batch_size=None) - evaluated_ = data.evaluate(hamiltonian, batch_size=2) - energy = hamiltonian.compute(evaluated, "energy") - - geometries = evaluated.geometries().result() - geometries_ = evaluated_.geometries().result() - for geom, geom_, e in zip(geometries, geometries_, energy.result()): - assert not np.all(np.isnan(geom.per_atom.forces)) - assert geom == geom_ - assert geom.energy == e + out1 = einstein.compute(test_set) + out1_ = einstein_.compute(test_set) + out2 = mixed.compute(test_set) + out2_ = mixed_.compute(test_set) + assert np.allclose(out1.result().energy, out1_.result().energy) + assert np.allclose(out2.result().energy, out2_.result().energy) def test_json_dump(): @@ -368,15 +381,36 @@ def test_function_from_json(tmp_path, dataset): UNITS LENGTH=A ENERGY=kj/mol TIME=fs CV: DISTANCE ATOMS=1,2 NOPBC METAD ARG=CV PACE=1 SIGMA=3 HEIGHT=342 FILE={} -""".format( - path_hills - ) +""".format(path_hills) hamiltonian = PlumedHamiltonian(plumed_input, File(path_hills)) future = hamiltonian.serialize_function() future.result() # ensure file exists function = function_from_json(future.filepath) - energies = hamiltonian.compute(dataset, "energy") + energies = hamiltonian.compute(dataset).result().energy energies_ = function.compute(dataset.geometries().result())["energy"] - assert np.allclose(energies.result(), energies_) + assert np.allclose(energies, energies_) + + +def test_evaluate(dataset): + test_set = dataset[:10].reset() + hamiltonian = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0) + + out0 = hamiltonian.evaluate(test_set) + out1 = hamiltonian.evaluate(test_set, batch_size=None) + result = hamiltonian.compute(test_set) + + geoms0 = out0.geometries().result() + geoms1 = out1.geometries().result() + data = result.result().to_dict() + + for i in range(len(geoms0)): + geom0, geom1 = geoms0[i], geoms1[i] + assert np.isclose(geom0.energy, geom1.energy) + assert np.isclose(geom0.energy, data["energy"][i]) + assert np.allclose(geom0.per_atom.forces, geom1.per_atom.forces) + assert np.allclose(geom0.per_atom.forces, data["forces"][i]) + assert np.allclose(geom0.stress, geom1.stress) + assert np.allclose(geom0.stress, data["stress"][i]) + From 0e5ebdd6b8bcb6ab6d6e15809497799ad023ce30 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 15 Apr 2026 18:25:19 +0200 Subject: [PATCH 11/16] reference API update + tests Some streamlining of the Reference methods: - __call__ labels a single geometry - evaluate labels a Dataset (analogous to Hamiltonian) - compute was removed (for now) -similar functionality through Reference.evaluate(dataset).get('energy') In which scenario do you want loose energies/forces? --- psiflow/execution.py | 2 +- psiflow/geometry.py | 5 +- psiflow/hamiltonians.py | 7 +-- psiflow/reference/cp2k_.py | 15 ++--- psiflow/reference/reference.py | 106 ++++++++++++--------------------- tests/test_reference.py | 71 ++++++++++++---------- 6 files changed, 87 insertions(+), 119 deletions(-) diff --git a/psiflow/execution.py b/psiflow/execution.py index 26e7f3a..812a69e 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -345,7 +345,7 @@ def wrap_in_timeout(self, command: str) -> str: return command # noop # send SIGTERM after max_runtime, follow with SIGKILL 30s later - return f"timeout -k 30s {self.max_runtime}s {command}" + return f"timeout -v -k 30s {self.max_runtime}s {command}" # def wrap_in_srun(self, command: str) -> str: # # TODO: stub -- this does not work diff --git a/psiflow/geometry.py b/psiflow/geometry.py index 9c0cd29..d162095 100644 --- a/psiflow/geometry.py +++ b/psiflow/geometry.py @@ -224,9 +224,8 @@ def reset(self) -> None: Reset all computed properties of the geometry to their default values. """ self.per_atom.reset() - for k in ("energy", "stress"): - if hasattr(self, k): - delattr(self, k) + for k in {"energy", "stress"}.intersection(self.attributes()): + delattr(self, k) def clean(self) -> None: """ diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py index b90e7ca..dd65fc4 100644 --- a/psiflow/hamiltonians.py +++ b/psiflow/hamiltonians.py @@ -12,7 +12,6 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.file import write_frames from psiflow.compute import ( aggregate_results, compute, @@ -42,7 +41,6 @@ apply_modelevaluation = python_app(_apply, executors=["ModelEvaluation"]) -# TODO: evaluate method for datasets? class Hamiltonian: outputs: ClassVar[tuple] = ("energy", "forces", "stress") function_name: ClassVar[str] @@ -56,10 +54,7 @@ def evaluate(self, arg: Dataset, batch_size: int = 1000) -> Dataset: geoms = arg.geometries() result = self.compute(geoms, batch_size) future = insert_results(geoms, result) - - file = psiflow.context().new_file("data_", ".xyz") - extxyz = write_frames(future, outputs=[file]).outputs[0] - return Dataset(extxyz=extxyz) + return Dataset(future) def __eq__(self, other: "Hamiltonian") -> bool: raise NotImplementedError diff --git a/psiflow/reference/cp2k_.py b/psiflow/reference/cp2k_.py index 014a1e4..44412f6 100644 --- a/psiflow/reference/cp2k_.py +++ b/psiflow/reference/cp2k_.py @@ -4,8 +4,6 @@ from typing import Optional from collections.abc import Sequence -import numpy as np -from ase.data import chemical_symbols from ase.units import Bohr, Ha from cp2k_input_tools.generator import CP2KInputGenerator from cp2k_input_tools.parser import CP2KInputParserSimplified @@ -153,13 +151,12 @@ def create_input(self, geom: Geometry) -> tuple[bool, Optional[File]]: section_copy = copy.deepcopy(section) section.pop("topology", None) # remove topology section - # insert geometry and cell - symbols = np.array(chemical_symbols)[geom.per_atom.numbers] - coord = [ - f"{s:5} {p[0]:<15.8f} {p[1]:<15.8f} {p[2]:<15.8f}" - for s, p in zip(symbols, geom.per_atom.positions) - ] - section["coord"] = {"*": coord} + # insert coordinates + _, body = geom.per_atom.to_string() + coord_list = body.split("\n")[:-1] + section["coord"] = {"*": coord_list} + + # insert cell cell = {} for i, vector in enumerate(["A", "B", "C"]): cell[vector] = "{} {} {}".format(*geom.cell[i]) diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py index cfbfe10..68f92c2 100644 --- a/psiflow/reference/reference.py +++ b/psiflow/reference/reference.py @@ -1,6 +1,5 @@ -import warnings import logging -from typing import Optional, Union, Callable, Optional +from typing import Callable, Optional from collections.abc import Sequence from pathlib import Path from functools import partial @@ -14,12 +13,15 @@ import psiflow from psiflow.data import Dataset -from psiflow.data.utils import extract from psiflow.geometry import Geometry from psiflow.utils.apps import copy_app_future from psiflow.utils.parse import LineNotFoundError, get_task_name_id +# TODO: existing Reference instances do not react to changes in execution environment +# e.g., altering max_runtime is not retroactively updated + + logger = logging.getLogger(__name__) # logging per module @@ -31,13 +33,14 @@ class Status(Enum): def update_geometry(geom: Geometry, data: dict) -> Geometry: """""" + # TODO: is the metadata not more a logging thing? # data should contain 'stdout' and 'status' keys task_name, task_id = get_task_name_id(data["stdout"]) logger.info(f'Task "{task_name}" (ID {task_id}): {data["status"].name}') geom = geom.copy() geom.reset() - geom.order["status"], geom.order["task_id"] = data["status"].name, task_id + geom.metadata = {'status': data["status"].name, 'task_id': task_id} if data["status"] != Status.SUCCESS: return geom @@ -47,18 +50,18 @@ def update_geometry(geom: Geometry, data: dict) -> Geometry: shift = data["positions"][0] - geom.per_atom.positions[0] if not np.allclose(data["positions"], geom.per_atom.positions + shift, atol=1e-6): # output does not match geometry up to a translation - geom.order["status"] = Status.INCONSISTENT + geom.status = Status.INCONSISTENT return geom geom.energy = data["energy"] - geom.order["runtime"] = data.get("runtime") + geom.metadata["runtime"] = data.get("runtime") if "forces" in data: - geom.per_atom.forces[:] = data["forces"] + geom.per_atom.forces = data["forces"] return geom @join_app -def get_minimum_energy(element: str, **kwargs) -> AppFuture[float]: +def get_minimum_energy(element: str, **kwargs) -> AppFuture: energies = {m: state.energy or np.inf for m, state in kwargs.items()} energy = min(energies.values()) msg = [ @@ -84,18 +87,6 @@ def get_spin_multiplicities(element: str) -> list[int]: return mults -@join_app -def compute_dataset( - dataset: Dataset, - length: int, - reference: "Reference", -) -> list[AppFuture[Geometry]]: - logger.info(f"Performing {length} {reference.__class__.__name__} calculations.") - geometries = dataset.geometries() # read it once - evaluated = [reference.evaluate(geometries[i]) for i in range(length)] - return evaluated - - def _execute( bash_template: str, inputs: list[File], @@ -125,38 +116,15 @@ def __init__( if (n := self.n_cores) is not None: assert n <= definition.spec["cores"] - def compute( - self, - arg: Union[Dataset, Geometry, AppFuture, list], - *outputs: Optional[Union[str, tuple]], - ): - # TODO: deprecate? Reference evaluations are too costly - for output in outputs: - if output not in self.outputs: - raise ValueError("output {} not in {}".format(output, self.outputs)) - - # TODO: convert_to_dataset util? - if isinstance(arg, Dataset): - dataset = arg - elif isinstance(arg, list): - dataset = Dataset(arg) - elif isinstance(arg, AppFuture) or isinstance(arg, Geometry): - dataset = Dataset([arg]) - else: - raise TypeError - - # TODO: writing to dataset first is wasted overhead, - # but extract_quantities does not accept AppFuture[list[Geometry]] (yet?) - future_dataset = self.compute_dataset(dataset) - outputs = outputs or self.outputs - future_data = extract( - tuple(outputs), None, None, inputs=[future_dataset.extxyz] - ) - if len(outputs) == 1: - return future_data[0] - return [future_data[_] for _ in range(len(outputs))] + def __call__(self, geometry: Geometry | AppFuture) -> AppFuture: + return evaluate(geometry, self) + + def evaluate(self, dataset: Dataset) -> Dataset: + """Return a new labelled dataset""" + future = evaluate_geometries(dataset.geometries(), self) + return Dataset(future) - def compute_atomic_energy(self, element, box_size=None) -> AppFuture[float]: + def compute_atomic_energy(self, element, box_size=None) -> AppFuture: references = self.get_single_atom_references(element) state = Geometry.from_data( numbers=np.array([atomic_numbers[element]]), @@ -165,11 +133,10 @@ def compute_atomic_energy(self, element, box_size=None) -> AppFuture[float]: ) futures = {} for mult, reference in references.items(): - futures[str(mult)] = reference.evaluate(state) - return get_minimum_energy(element, **futures) + futures[str(mult)] = reference(state) + return get_minimum_energy(element, **futures) # float def get_execute_app(self) -> Callable: - # TODO: how often is this called? context = psiflow.context() definition = context.definitions[self.executor] return partial( @@ -179,13 +146,6 @@ def get_execute_app(self) -> Callable: label=self._execute_label, ) - def evaluate(self, geometry: Geometry | AppFuture) -> AppFuture: - return evaluate(geometry, self) - - def compute_dataset(self, dataset: Dataset) -> Dataset: - future = compute_dataset(dataset, dataset.length(), self) - return Dataset(future) - def get_single_atom_references(self, element: str) -> dict[int, "Reference"]: raise NotImplementedError @@ -218,21 +178,17 @@ def _process_output( def evaluate( geom: Geometry, reference: Reference, - execute_func: Optional[Callable] = None, + execute_app: Optional[Callable] = None, ) -> AppFuture: """""" - if geom == NullState: # TODO: remove this - warnings.warn("Skipping NullState..") - return copy_app_future(geom) - flag, *files = reference.create_input(geom=geom) if not flag: # TODO: should we reset geom? return copy_app_future(geom) # do the actual evaluation - if execute_func is None: - execute_func = reference.get_execute_app() - future = execute_func(inputs=files) + if execute_app is None: + execute_app = reference.get_execute_app() + future = execute_app(inputs=files) future = process_output( geom=geom, @@ -240,3 +196,15 @@ def evaluate( inputs=[future.stdout, future.stderr, future], # wait for future ) return future + + +@join_app +def evaluate_geometries( + states: Sequence[Geometry], + reference: "Reference", +) -> list[AppFuture]: + msg = f"Performing {len(states)} {reference.__class__.__name__} calculations." + logger.info(msg) + execute_app = reference.get_execute_app() + evaluated = [evaluate(geom, reference, execute_app) for geom in states] + return evaluated diff --git a/tests/test_reference.py b/tests/test_reference.py index 416974a..fcce746 100644 --- a/tests/test_reference.py +++ b/tests/test_reference.py @@ -5,7 +5,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import Geometry, NullState +from psiflow.geometry import Geometry, MISSING from psiflow.reference import CP2K, GPAW, ORCA, create_orca_input from psiflow.reference.reference import Status from psiflow.reference.cp2k_ import ( @@ -198,23 +198,19 @@ def test_cp2k_parse_output(): def test_cp2k_success(simple_cp2k_input, geom_h2_p): reference = CP2K(simple_cp2k_input) - future = reference.evaluate(geom_h2_p) - future_null = reference.evaluate(NullState) + future = reference(geom_h2_p) assert isinstance(future, AppFuture) geom_out = future.result() - geom_null = future_null.result() ref_energy = -1.167407360449355 * Ha ref_forces = np.array([[-0.00968014, 0.0, 0.0], [0.00967947, 0.0, 0.0]]) * Ha / Bohr - assert geom_out != NullState assert geom_out.energy is not None assert not np.any(np.isnan(geom_out.per_atom.forces)) assert np.allclose(ref_energy, geom_out.energy) assert np.allclose(ref_forces, geom_out.per_atom.forces, atol=1e-5) - assert geom_null == NullState # check whether NullState evaluates to NullState # check number of mpi processes - stdout, _ = get_task_logs(geom_out.order["task_id"]) + stdout, _ = get_task_logs(geom_out.metadata["task_id"]) lines = stdout.read_text().split("\n") for line in lines: if "Total number of message passing processes" in line: @@ -295,12 +291,12 @@ def test_cp2k_failure(geom_h2_p): &END FORCE_EVAL """ # incorrect input file reference = CP2K(cp2k_input) - future = reference.evaluate(geom_h2_p) + future = reference(geom_h2_p) assert isinstance(future, AppFuture) geom_out = future.result() - assert geom_out.energy is None - assert np.all(np.isnan(geom_out.per_atom.forces)) - stdout, _ = get_task_logs(geom_out.order["task_id"]) + assert geom_out.energy is MISSING + assert geom_out.per_atom.forces is MISSING + stdout, _ = get_task_logs(geom_out.metadata["task_id"]) assert "ABORT" in stdout.read_text() # verify error is captured @@ -311,25 +307,34 @@ def test_cp2k_memory(simple_cp2k_input): positions=np.random.uniform(0, 20, size=(4000, 3)), cell=20 * np.eye(3), # box way too large ) - energy, forces = reference.compute(geometry) - energy, forces = energy.result(), forces.result() - assert np.all(np.isnan(energy)) + geom_fail = reference(geometry).result() + assert geom_fail.energy is MISSING + assert geom_fail.per_atom.forces is MISSING + assert geom_fail.metadata["status"] == Status.FAILED.name @pytest.mark.filterwarnings("ignore:Original input file not found") def test_cp2k_timeout(simple_cp2k_input, geom_h2_p): + definition = psiflow.context().definitions["CP2K"] + runtime = definition.max_runtime + definition.max_runtime = 1 # seconds + reference = CP2K(simple_cp2k_input) geom_h2_p.cell = 20 * np.eye(3) # box way too large - energy, forces = reference.compute(Dataset([geom_h2_p])) - energy, forces = energy.result(), forces.result() - assert np.all(np.isnan(energy)) + + geom_fail = reference(geom_h2_p).result() + assert geom_fail.energy is MISSING + assert geom_fail.per_atom.forces is MISSING + assert geom_fail.metadata["status"] == Status.FAILED.name + + definition.max_runtime = runtime # restore value def test_cp2k_energy(simple_cp2k_input, geom_h2_p): reference = CP2K(simple_cp2k_input, outputs=("energy",)) - geom_out = reference.evaluate(geom_h2_p).result() + geom_out = reference(geom_h2_p).result() assert geom_out.energy is not None - assert np.all(np.isnan(geom_out.per_atom.forces)) + assert geom_out.per_atom.forces is MISSING @pytest.mark.filterwarnings("ignore:Original input file not found") @@ -337,6 +342,7 @@ def test_cp2k_atomic_energies(simple_cp2k_input): # use energy-only because why not reference = CP2K(simple_cp2k_input, outputs=("energy",)) energy = reference.compute_atomic_energy("H", box_size=4) + print(energy.result()) assert abs(energy.result() - (-13.6)) < 1 # reasonably close to exact value @@ -380,27 +386,30 @@ def test_cp2k_posthf(geom_h2_p): &END FORCE_EVAL """ reference = CP2K(cp2k_input_str, outputs=("energy",)) - assert reference.evaluate(geom_h2_p).result().energy is not None + assert reference(geom_h2_p).result().energy is not MISSING -def test_gpaw_single(dataset_h2): +def test_gpaw(dataset_h2): + # basic functionality parameters = dict(mode="fd", nbands=0, xc="LDA", h=0.3, minimal_box_multiple=2) gpaw = GPAW(parameters) future_in = dataset_h2[0] - future_out = gpaw.evaluate(future_in) - future_energy = gpaw.compute(dataset_h2[:1])[0] + future_out = gpaw(future_in) + dataset_out = gpaw.evaluate(dataset_h2[:2]) future_energy_zr = gpaw.compute_atomic_energy("Zr", box_size=9) - gpaw = GPAW({"askdfj": "asdfk"}) # invalid input - future_fail = gpaw.evaluate(future_in) - geom_in, geom_out = future_in.result(), future_out.result() - energy, energy_zr = future_energy.result(), future_energy_zr.result() + # invalid input + gpaw = GPAW({"askdfj": "asdfk"}) + future_fail = gpaw(future_in) - assert geom_out.energy is not None and geom_out.energy < 0.0 + geom_in, geom_out = future_in.result(), future_out.result() + geoms_out = dataset_out.geometries().result() + assert geom_out.energy is not MISSING and geom_out.energy < 0.0 assert np.allclose(geom_out.per_atom.positions, geom_in.per_atom.positions) - assert np.allclose(geom_out.energy, energy) - assert energy_zr == 0.0 - assert future_fail.result().energy is None + assert future_energy_zr.result() == 0.0 + for geom in geoms_out: + assert geom.energy is not MISSING and geom.energy < 0.0 + assert future_fail.result().energy is MISSING # TODO: enable once we have an ORCA container From 9523b2efd43301a5ca60856723a6fcb52fed619e Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 15 Apr 2026 22:56:06 +0200 Subject: [PATCH 12/16] make sampling work with new geometry and data modules make sampling work with new geometry and data modules --- install_local.sh | 8 +++++++- psiflow/data/dataset.py | 2 -- psiflow/geometry.py | 2 ++ psiflow/sampling/driver.py | 5 ++++- psiflow/sampling/optimize.py | 2 +- psiflow/sampling/output.py | 2 +- psiflow/sampling/sampling.py | 2 +- psiflow/sampling/server.py | 10 ++++------ psiflow/sampling/utils.py | 2 +- psiflow/sampling/walker.py | 2 +- 10 files changed, 22 insertions(+), 15 deletions(-) diff --git a/install_local.sh b/install_local.sh index 9f7d7df..c067cbe 100644 --- a/install_local.sh +++ b/install_local.sh @@ -10,11 +10,17 @@ micromamba install ndcctools -c conda-forge # install psiflow pip install -e psiflow[dev] -# install PyTorch and MACE +# install PyTorch and MACE -- CUDA pip install torch --index-url https://download.pytorch.org/whl/cu128 pip install mace-torch pip install cuequivariance cuequivariance-torch cuequivariance-ops-torch-cu12 +# install PyTorch and MACE -- ROCM +pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 +pip install mace-torch +pip install openequivariance +python -c "import openequivariance" # compile some binary, maybe + # install basic PLUMED and python API micromamba install plumed -c conda-forge pip install plumed diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index 1d0e8da..21d0459 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -26,8 +26,6 @@ assign_identifiers, ) -# TODO: do all operations need to generate a new file? - @psiflow.register_serializable class Dataset: diff --git a/psiflow/geometry.py b/psiflow/geometry.py index d162095..c23817a 100644 --- a/psiflow/geometry.py +++ b/psiflow/geometry.py @@ -16,6 +16,8 @@ # TODO: docstrings # TODO: custom attributes writing +# TODO: Reference and Sampling want to add attributes to Geometry (used to be 'order' dict) +# what to do? # these are always accessible for every geometry DEFAULT_PROPERTIES = "per_atom", "cell", "energy", "stress" diff --git a/psiflow/sampling/driver.py b/psiflow/sampling/driver.py index b56b852..463201f 100644 --- a/psiflow/sampling/driver.py +++ b/psiflow/sampling/driver.py @@ -74,7 +74,10 @@ def compute_structure(self, cell, pos): unit_to_internal("energy", "electronvolt", energy), np.float64 ) force_ipi = np.asarray(unit_to_internal("force", "ev/ang", forces), np.float64) - vir_calc = -stress * self.geometry.volume + if self.geometry.periodic: + vir_calc = -stress * self.geometry.volume + else: + vir_calc = np.zeros_like(stress) vir_ipi = np.array( unit_to_internal("energy", "electronvolt", vir_calc.T), dtype=np.float64 ) diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py index a5e97df..4d7cd0c 100644 --- a/psiflow/sampling/optimize.py +++ b/psiflow/sampling/optimize.py @@ -192,7 +192,7 @@ def optimize( parsl_resource_specification=definition.wq_resources(1), ) - final = Dataset(None, result.outputs[0]).evaluate(hamiltonian)[-1] + final = hamiltonian.evaluate(Dataset(extxyz=result.outputs[0]))[-1] if keep_trajectory: trajectory = Dataset(None, result.outputs[1]) return final, trajectory diff --git a/psiflow/sampling/output.py b/psiflow/sampling/output.py index 9daec95..7f08719 100644 --- a/psiflow/sampling/output.py +++ b/psiflow/sampling/output.py @@ -100,7 +100,7 @@ def _parse_simulation_data( task_id = get_task_name_id(simulation_stdout)[-1] logger.info(f"Simulation [ID {task_id}]: {status}") - return status, data, temperature, state.order.pop("time") + return status, data, temperature, state.time parse_simulation_data = python_app( diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index 8650f03..a28fb6d 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -635,7 +635,7 @@ def _sample( # process MD output simulation_outputs = [] - final_states = read_frames(inputs=result.outputs[:1]) + final_states = read_frames(result.outputs[0]) for idx, walker in enumerate(walkers): # TODO: order_parameter out of commission # if walker.order_parameter is not None: diff --git a/psiflow/sampling/server.py b/psiflow/sampling/server.py index d0bc4e5..c3d5856 100644 --- a/psiflow/sampling/server.py +++ b/psiflow/sampling/server.py @@ -46,7 +46,7 @@ def parse_checkpoint(file_xml: str | Path) -> list[Geometry]: # get current internal system time ensemble_data = sys.ensemble.fetch() time = ensemble_data.time * (_hbar / (_e * Ha)) * 1e12 - geometry.order["time"], geometry.order["name"] = time, name + geometry.time, geometry.name = time, name geometries.append(geometry) return geometries @@ -63,8 +63,6 @@ def insert_addresses(input_xml: ET.Element) -> None: address.text = str(Path.cwd() / address.text.strip()) - - def wait_for_clients(input_xml, timeout: int = 60) -> None: """Make sure clients have initialised successfully""" @@ -72,7 +70,7 @@ def wait_for_clients(input_xml, timeout: int = 60) -> None: sockets = [] xml_str = ET.tostring(input_xml, encoding="unicode") for line in xml_str.splitlines(): - if 'address' in line: + if "address" in line: sockets.append(line.split(">")[1].split("<")[0]) for _ in range(timeout): @@ -116,7 +114,7 @@ def run(start_xyz: str, input_xml: str): def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None: - from psiflow.data.utils import _write_frames + from psiflow.data.file import _write_frames print("Starting cleanup") with open(INPUT_XML, "r") as f: @@ -129,7 +127,7 @@ def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None: states = parse_checkpoint("output.checkpoint") for state in states: if np.allclose(state.cell, NONPERIODIC_CELL): - state.cell[:] = 0.0 + state.cell = None _write_frames(*states, outputs=[output_xyz]) print("Moved checkpoint geometries") diff --git a/psiflow/sampling/utils.py b/psiflow/sampling/utils.py index 72ffb36..37f94f1 100644 --- a/psiflow/sampling/utils.py +++ b/psiflow/sampling/utils.py @@ -38,7 +38,7 @@ def check_forces( if not np.sum(exceeded): return indices = np.arange(len(geometry))[exceeded] - numbers = geometry.per_atom.numbers[exceeded] + numbers = geometry.numbers[exceeded] symbols = [chemical_symbols[n] for n in numbers] raise ForceMagnitudeException( "\nforce exceeded {} eV/A for atoms {}" diff --git a/psiflow/sampling/walker.py b/psiflow/sampling/walker.py index 8d9ada5..1c22cad 100644 --- a/psiflow/sampling/walker.py +++ b/psiflow/sampling/walker.py @@ -185,7 +185,7 @@ def _get_minimum_energy_states( def quench(walkers: list[Walker], dataset: Dataset) -> None: """Assign the lowest energy geometry in dataset to every walker""" hamiltonians = combine_hamiltonians([w.hamiltonian for w in walkers]) - energies = [h.compute(dataset, "energy") for h in hamiltonians.hamiltonians] + energies = [h.compute(dataset).energy for h in hamiltonians.hamiltonians] coefficients = [ hamiltonians.get_coefficients(walker.hamiltonian * 1.0) for walker in walkers ] From 6e9d3696ee03b18956838c6ce5d68b31a9401845 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 15 Apr 2026 22:56:38 +0200 Subject: [PATCH 13/16] fix tests --- tests/test_free_energy.py | 5 +++-- tests/test_learning.py | 2 +- tests/test_sampling.py | 29 ++++++++++++----------------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/tests/test_free_energy.py b/tests/test_free_energy.py index ecacc3e..c1bc53a 100644 --- a/tests/test_free_energy.py +++ b/tests/test_free_energy.py @@ -1,15 +1,16 @@ import numpy as np from ase.units import _c, kB, second +import psiflow from psiflow.free_energy import ( Integration, compute_frequencies, compute_harmonic, harmonic_free_energy, ) -from psiflow.geometry import check_equality from psiflow.hamiltonians import EinsteinCrystal, Harmonic, MACEHamiltonian from psiflow.sampling.ase import optimize +from psiflow.data.utils import check_equality def test_integration_simple(dataset): @@ -39,7 +40,7 @@ def test_integration_simple(dataset): # manual computation of delta gradient delta = -0.1 * harmonic - energies = delta.compute(integration.outputs[i].trajectory, "energy") + energies = delta.compute(integration.outputs[i].trajectory).energy assert np.allclose( state.gradients["delta"].result(), np.mean(energies.result()) / (kB * state.temperature), diff --git a/tests/test_learning.py b/tests/test_learning.py index 430a6ca..416a09c 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -3,7 +3,7 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import new_nullstate +# from psiflow.geometry import new_nullstate from psiflow.hamiltonians import EinsteinCrystal from psiflow.learning import Learning, evaluate_outputs from psiflow.metrics import Metrics, _create_table, parse_walker_log, reconstruct_dtypes diff --git a/tests/test_sampling.py b/tests/test_sampling.py index 0f3ccf8..152968c 100644 --- a/tests/test_sampling.py +++ b/tests/test_sampling.py @@ -5,7 +5,6 @@ from ase.units import Bohr import psiflow -from psiflow.geometry import check_equality from psiflow.hamiltonians import EinsteinCrystal, PlumedHamiltonian, MACEHamiltonian from psiflow.sampling.optimize import ( optimize as optimize_ipi, @@ -29,6 +28,8 @@ ) from psiflow.sampling.output import Status +from psiflow.data.utils import check_equality + def test_walkers(dataset): future = dataset[0] @@ -120,11 +121,9 @@ def test_parse_checkpoint(checkpoint): file = psiflow.context().new_file("input_", ".xml") Path(file.filepath).write_text(checkpoint) states = parse_checkpoint(file.filepath) - assert "time" in states[0].order - assert np.allclose( - states[0].cell, - np.array([[1, 0.0, 0], [0.1, 2, 0], [0, 0, 3]]) * Bohr, - ) + cell = np.array([[1, 0.0, 0], [0.1, 2, 0], [0, 0, 3]]) * Bohr + assert hasattr(states[0], "time") + assert np.allclose(states[0].cell, cell) def test_sample(dataset, mace_foundation): @@ -143,13 +142,9 @@ def test_sample(dataset, mace_foundation): METAD ARG=CV PACE=5 SIGMA=0.05 HEIGHT=5 """ metadynamics = Metadynamics(plumed_str) + mixture = 0.9 * plumed + einstein walkers = [ - Walker( - start=future, - temperature=300, - metadynamics=metadynamics, - hamiltonian=0.9 * plumed + einstein, - ), + Walker(start=future, metadynamics=metadynamics, hamiltonian=mixture), Walker(start=future, temperature=600, hamiltonian=einstein), Walker(start=future, temperature=450, hamiltonian=einstein), Walker(start=future, temperature=600, hamiltonian=einstein), @@ -182,11 +177,11 @@ def test_sample(dataset, mace_foundation): outputs[1]["potential{electronvolt}"].result(), ] energies_ = [ - (0.9 * plumed + einstein).compute(outputs[0].trajectory, "energy"), - einstein.compute(outputs[1].trajectory, "energy"), + mixture.compute(outputs[0].trajectory).result().energy, + einstein.compute(outputs[1].trajectory).result().energy, ] - assert len(energies[0]) == len(energies_[0].result()) - assert np.allclose(energies[0], energies_[0].result()) + assert len(energies[0]) == len(energies_[0]) + assert np.allclose(energies[0], energies_[0]) time = outputs[0]["time{picosecond}"].result() assert np.allclose(time, np.arange(0, 51, 5) * 5e-4) @@ -237,7 +232,7 @@ def test_sample(dataset, mace_foundation): checkpoint_step=1, fix_com=True, # otherwise temperature won't match ) - energy = hamiltonian.compute(dataset[1], "energy") + energy = hamiltonian.compute(dataset[1]).energy energy_ = output["potential{electronvolt}"] assert np.allclose(energy.result()[0], energy_.result()[0], atol=1e-5) assert np.allclose(output.time.result(), 10 * 0.5 / 1000) From 9c0af275d41673d056f736ac52296ee6c08c00be Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 21 Apr 2026 16:12:54 +0200 Subject: [PATCH 14/16] cleanup + minor fixes --- psiflow/compute.py | 101 +++--------------------- psiflow/data/__init__.py | 1 + psiflow/data/dataset.py | 33 +------- psiflow/data/file.py | 35 --------- psiflow/execution.py | 8 +- psiflow/functions.py | 1 - psiflow/geometry.py | 135 ++++++++++++--------------------- psiflow/learning.py | 5 +- psiflow/models/mace.py | 4 +- psiflow/reference/cp2k_.py | 3 - psiflow/reference/reference.py | 11 +-- psiflow/sampling/_ase.py | 6 +- psiflow/sampling/sampling.py | 3 +- psiflow/sampling/server.py | 2 +- psiflow/utils/wq.py | 1 + tests/test_data.py | 74 ++++++++---------- 16 files changed, 108 insertions(+), 315 deletions(-) diff --git a/psiflow/compute.py b/psiflow/compute.py index de26fb5..836bbb9 100644 --- a/psiflow/compute.py +++ b/psiflow/compute.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Callable, ClassVar, Optional, Union, Type, Any, TypeAlias from collections.abc import Sequence from dataclasses import dataclass @@ -50,19 +51,19 @@ def get(self, key: str, per_geom: bool = False) -> list | np.ndarray: return data_list def to_dict(self) -> dict[str, list]: + """Convert to a dict that is extract/insert compliant for geometries""" return {k: self.get(k, per_geom=True) for k in self.keys} @classmethod - def from_data(cls, n_atoms: Sequence[int], data: dict[str, list]): + def from_data(cls, n_atoms: np.ndarray, data: dict[str, list]): values = {} for k, v in data.items(): assert len(v) == n_atoms.size - if k in PER_ATOM_FIELDS: - values[k] = np.concatenate(v) - elif k in DEFAULT_PROPERTIES: - values[k] = np.stack(v) + if not np.iterable(v[0]): + values[k] = np.array(v) + elif len(v[0]) == n_atoms[0] and len(v[-1]) == n_atoms[-1]: + values[k] = np.concatenate(v) # assume per-atom property else: - # TODO: what with custom properties? values[k] = np.stack(v) return cls(np.array(n_atoms), values) @@ -100,7 +101,6 @@ def aggregate_results( *results: ComputeResult, coefficients: Optional[np.ndarray] = None ) -> ComputeResult: """""" - # TODO: what with data fields that are not shared? if coefficients is None: coefficients = np.ones(len(results)) assert len(coefficients) == len(results) @@ -208,89 +208,6 @@ def _compare_results( compare_results = python_app(_compare_results, executors=["default_threads"]) -# TODO: cleanup -# def _compute_rmse( -# array0: np.ndarray, -# array1: np.ndarray, -# reduce: bool = True, -# ) -> Union[float, np.ndarray]: -# """ -# Compute the Root Mean Square Error (RMSE) between two arrays. -# -# Args: -# array0: First array. -# array1: Second array. -# reduce: Whether to reduce the result to a single value. -# -# Returns: -# Union[float, np.ndarray]: RMSE value(s). -# -# Note: -# This function is wrapped as a Parsl app and executed using the default_threads executor. -# """ -# assert array0.shape == array1.shape -# assert np.all(np.isnan(array0) == np.isnan(array1)) -# -# se = (array0 - array1) ** 2 -# se = se.reshape(se.shape[0], -1) -# -# if reduce: # across both dimensions -# mask = np.logical_not(np.isnan(se)) -# return float(np.sqrt(np.mean(se[mask]))) -# else: # retain first dimension -# if se.ndim == 1: -# return se -# else: -# values = np.empty(len(se)) -# for i in range(len(se)): -# if np.all(np.isnan(se[i])): -# values[i] = np.nan -# else: -# mask = np.logical_not(np.isnan(se[i])) -# value = np.sqrt(np.mean(se[i][mask])) -# values[i] = value -# return values -# -# -# compute_rmse = python_app(_compute_rmse, executors=["default_threads"]) -# -# -# def _compute_mae( -# array0, -# array1, -# reduce: bool = True, -# ) -> Union[float, np.ndarray]: -# """ -# Compute the Mean Absolute Error (MAE) between two arrays. -# -# Args: -# array0: First array. -# array1: Second array. -# reduce: Whether to reduce the result to a single value. -# -# Returns: -# Union[float, np.ndarray]: MAE value(s). -# -# Note: -# This function is wrapped as a Parsl app and executed using the default_threads executor. -# """ -# assert array0.shape == array1.shape -# mask0 = np.logical_not(np.isnan(array0)) -# mask1 = np.logical_not(np.isnan(array1)) -# assert np.all(mask0 == mask1) -# ae = np.abs(array0 - array1) -# to_reduce = tuple(range(1, array0.ndim)) -# mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce)) -# ae = ae[mask0].reshape(np.sum(1 * mask), -1) -# if reduce: # across both dimensions -# return float(np.sqrt(np.mean(ae))) -# else: # retain first dimension -# return np.sqrt(np.mean(ae, axis=1)) -# -# -# compute_mae = python_app(_compute_mae, executors=["default_threads"]) - - def input_to_geometries(data: ComputeInput) -> AppFuture: """Convert ComputeInput into a sequence of geometries (as a future)""" # Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry @@ -315,7 +232,9 @@ def prep_input(data: Geometry | Sequence[Geometry]) -> list[Geometry]: @python_app(executors=["default_threads"]) -def insert_results(states: Sequence[Geometry], result: ComputeResult) -> Sequence[Geometry]: +def insert_results( + states: Sequence[Geometry], result: ComputeResult +) -> Sequence[Geometry]: """""" insert(states, result.to_dict()) return states diff --git a/psiflow/data/__init__.py b/psiflow/data/__init__.py index 13a9804..7ade023 100644 --- a/psiflow/data/__init__.py +++ b/psiflow/data/__init__.py @@ -1 +1,2 @@ from .dataset import Dataset +from .file import read_frames, count_frames \ No newline at end of file diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index 21d0459..52bbf00 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -195,37 +195,6 @@ def get_per_atom( ) return tuple(future_dict[q] for q in quantities) - # TODO: cleanup? - # def evaluate( - # self, computable: "Computable", batch_size: Optional[int] = None - # ) -> Dataset: - # """ - # Evaluate a Computable on the dataset. - # """ - # # TODO: remove this functionality? - # # or this should be the (only) way to label a dataset? - # from psiflow.hamiltonians import Hamiltonian - # - # if not isinstance(computable, Hamiltonian): - # # avoid extracting and inserting the same quantities - # return computable.compute_dataset(self) - # - # # use Hamiltonian.compute method - # if batch_size is not None: - # outputs = computable.compute(self, batch_size=batch_size) - # else: - # outputs = computable.compute(self) # use default from computable - # if not isinstance(outputs, list): # compute unpacks for only one property - # outputs = [outputs] - # data = {k: v for k, v in zip(computable.outputs, outputs)} - # - # file = psiflow.context().new_file("data_", ".xyz") - # - # future = read_frames(self.extxyz) - # future = insert_quantities(future, data) - # extxyz = write_frames(future, outputs=[file]).outputs[0] - # return Dataset(extxyz=extxyz) - def filter(self, quantity: str) -> Dataset: """ Filter the dataset based on a specified quantity. @@ -272,7 +241,7 @@ def assign_identifiers( AppFuture: Future representing the next available identifier. """ # TODO: what is the use case for this? - # TODO: this is the only method that changes inplace instead of returning a new Dataset + # this is the only method that changes inplace instead of returning a new Dataset future = assign_identifiers(self.extxyz, identifier) future_states, future_id = future[0], future[1] file = psiflow.context().new_file("data_", ".xyz") diff --git a/psiflow/data/file.py b/psiflow/data/file.py index 21bc507..f86093b 100644 --- a/psiflow/data/file.py +++ b/psiflow/data/file.py @@ -20,8 +20,6 @@ assign_ids, ) -# TODO: some operations have side effects, which could be unexpected - FileLike: TypeAlias = str | Path | File @@ -155,45 +153,12 @@ def _split_frames( _write_frames(val, outputs=outputs[1:]) -def _batch_frames( - file: FileLike, batch_size: int, outputs: Sequence[File] = () -) -> list[list[Geometry]]: - """ - Split frames into batches. - - Args: - file: DataFuture representing the input file path containing the geometry data. - batch_size: Number of frames per batch. - outputs: List of Parsl futures. Each element should be a DataFuture - representing an output file path for each batch. - """ - # TODO: why pass batch size? - # TODO: will not be great for large files - frames = _read_frames(file) - - batches, batch = [], [] - for i, geom in enumerate(frames): - batch.append(geom) - if (i + 1) % batch_size == 0: - batches.append(batch) - batch = [] - - if not outputs: - return batches - - assert len(outputs) == len(batches) - for file, batch in zip(outputs, batches): - _write_frames(batch, outputs=[file]) - return batches - - write_frames = python_app(_write_frames, executors=["default_threads"]) read_frames = python_app(_read_frames, executors=["default_threads"]) join_frames = python_app(_join_frames, executors=["default_threads"]) count_frames = python_app(_count_frames, executors=["default_threads"]) get_elements = python_app(_get_elements, executors=["default_threads"]) split_frames = python_app(_split_frames, executors=["default_threads"]) -batch_frames = python_app(_batch_frames, executors=["default_threads"]) def _read_write_wrapper( diff --git a/psiflow/execution.py b/psiflow/execution.py index 812a69e..8b47ff7 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -347,12 +347,6 @@ def wrap_in_timeout(self, command: str) -> str: # send SIGTERM after max_runtime, follow with SIGKILL 30s later return f"timeout -v -k 30s {self.max_runtime}s {command}" - # def wrap_in_srun(self, command: str) -> str: - # # TODO: stub -- this does not work - # if self.provider is None: - # return command # noop - # return f"srun -t 1 -c $CORES {command}" - def _create_threadpool(self, path: Path) -> ThreadPoolExecutor: max_threads = self.kwargs["max_threads"] return ThreadPoolExecutor(self.name, max_threads, working_dir=str(path)) @@ -464,7 +458,7 @@ def __init__( ): super().__init__(**kwargs) - if self.use_gpu and self.kwargs["gpus_per_task"] > 1: + if self.use_gpu and self.kwargs.get("gpus_per_task", 1) > 1: raise ConfigurationError("No Hamiltonian can do multi-GPU evaluation") # i-Pi will kill client connections after no response for timeout seconds diff --git a/psiflow/functions.py b/psiflow/functions.py index 8d22978..11b64e7 100644 --- a/psiflow/functions.py +++ b/psiflow/functions.py @@ -173,7 +173,6 @@ def __call__( @dataclass(frozen=True) class MACEFunction(Function): - # TODO: why are some arguments separated and others 'calc_kwargs'? model_path: str | Path ncores: int device: str diff --git a/psiflow/geometry.py b/psiflow/geometry.py index c23817a..357b5f8 100644 --- a/psiflow/geometry.py +++ b/psiflow/geometry.py @@ -2,8 +2,7 @@ import pickle import copy from pathlib import Path -from typing import Optional, Union, Any, Final -from collections.abc import Sequence +from typing import Optional, Any, Final import ase.calculators.calculator import numpy as np @@ -14,10 +13,9 @@ import psiflow -# TODO: docstrings -# TODO: custom attributes writing -# TODO: Reference and Sampling want to add attributes to Geometry (used to be 'order' dict) -# what to do? + +# TODO: all per-atom attributes are cast to float when writing, making read-write not a noop + # these are always accessible for every geometry DEFAULT_PROPERTIES = "per_atom", "cell", "energy", "stress" @@ -39,8 +37,9 @@ def __bool__(self): class PerAtom: """ - Holds 'per atom' arrays like positions and forces - For every attribute, the first array dimension should be n_atoms + Holds 'per atom' arrays like positions and forces. + Must contain 'numbers' and 'positions', can contain anything else. + All arrays should be 2-dimensional with shape [n_atoms, *]. """ numbers: npt.NDArray[np.uint8] @@ -53,16 +52,20 @@ def __init__( positions: npt.NDArray[np.float64], **kwargs: npt.NDArray, ): - self.numbers = numbers.astype(int).reshape(-1, 1) + self.numbers = numbers self.positions = positions for k, v in kwargs.items(): setattr(self, k, v) - def __setattr__(self, name, value): + def __setattr__(self, name: str, value: np.ndarray) -> None: if value is MISSING: return # do not set MISSING values - elif not self._check(value): + elif name == "numbers": + value = value.astype(int) + elif len(value) != len(self): raise ValueError(f"Field '{name}' is not a per-atom property..") + if value.ndim != 2: + value = value.reshape(-1, 1) # 1D arrays super().__setattr__(name, value) def __getattr__(self, name): @@ -79,23 +82,22 @@ def __len__(self) -> int: def __str__(self) -> str: return f"PerAtom[{len(self)}]{set(vars(self))}" - def _check(self, arr: npt.NDArray) -> bool: - try: - return (arr.shape[0] == len(self)) and arr.ndim == 2 - except AttributeError: - return arr.ndim == 2 # numbers not yet defined + def attributes(self) -> dict[str, Any]: + """Return all optional per-atom attributes""" + return { + k: v for k, v in vars(self).items() if k not in ("numbers", "positions") + } def reset(self): """Wipes optional fields""" - attrs = [k for k in vars(self) if k not in ("numbers", "positions")] - for attr in attrs: + for attr in self.attributes(): delattr(self, attr) def to_string(self) -> tuple[str, str]: + """Creates extxyz header and body""" symbols = [chemical_symbols[i] for i in self.numbers.flatten()] data = {"species": np.array(symbols).reshape(-1, 1), "pos": self.positions} - if not self.forces is MISSING: - data["forces"] = self.forces + data |= self.attributes() # create extxyz header header = "Properties=species:S:1:pos:R:3" @@ -131,6 +133,7 @@ def to_string(self) -> tuple[str, str]: @classmethod def from_string(cls, s: str, props: Optional[str] = None): + """Reverses to_string""" props = (props or "species:S:1:pos:R:3") + ":" dtype_map = {"R": "f8", "S": "i8"} @@ -165,10 +168,10 @@ class Geometry: and various physical properties such as energy and forces. Attributes: - per_atom (PerAtom): Record array containing per-atom properties. - cell (np.ndarray): 3x3 array representing the unit cell vectors. + per_atom (PerAtom): Container with per-atom properties. + cell (Optional[np.ndarray]): 3x3 array representing the unit cell vectors (or None for isolated molecules). energy (Optional[float]): Total energy of the system. - stress (Optional[np.ndarray]): Stress tensor of the system. + stress (Optional[np.ndarray]): Stress tensor of the system (3x3 array). """ per_atom: PerAtom @@ -182,10 +185,7 @@ def __init__( cell: Optional[np.ndarray] = None, **kwargs: Any, ): - """ - Initialize a Geometry instance, though the preferred way of instantiating - proceeds via the class methods - """ + """Initialize a Geometry instance. Preferably instantiate through class methods.""" self.per_atom = per_atom self.cell = cell @@ -193,7 +193,7 @@ def __init__( for k, v in kwargs.items(): setattr(self, k, v) - def __setattr__(self, name, value): + def __setattr__(self, name: str, value: Any) -> None: if value is MISSING: return # do not set MISSING values @@ -204,6 +204,8 @@ def __setattr__(self, name, value): value = float(value) # assert isinstance(value, float) elif name == "stress": + if self.cell is None: + raise ValueError("Cannot assign stress to molecules..") assert isinstance(value, np.ndarray) and value.shape == (3, 3) super().__setattr__(name, value) @@ -218,21 +220,16 @@ def __getattr__(self, name): def attributes(self) -> dict[str, Any]: """Return all non-MISSING instance attributes""" - return {k: v for k, v in vars(self).items() if k != "per_atom"} def reset(self) -> None: - """ - Reset all computed properties of the geometry to their default values. - """ + """Reset all computed properties to their default values""" self.per_atom.reset() for k in {"energy", "stress"}.intersection(self.attributes()): delattr(self, k) def clean(self) -> None: - """ - Clean the geometry by resetting properties and removing additional information. - """ + """Remove all non-structural information ('numbers', 'positions', 'cell')""" self.per_atom.reset() for k in self.attributes(): if k == "cell": @@ -240,9 +237,7 @@ def clean(self) -> None: delattr(self, k) def align_axes(self) -> None: - """ - Align the axes of the unit cell to a canonical representation for periodic systems. - """ + """Align the cell axes to a canonical representation for periodic systems""" if not self.periodic: return positions = self.per_atom.positions @@ -251,38 +246,28 @@ def align_axes(self) -> None: reduce_box_vectors(cell) def copy(self) -> Geometry: - """ - Create a deep copy of the Geometry instance. - """ + """Create a deep copy of the Geometry instance""" return pickle.loads(pickle.dumps(self)) def to_string(self) -> str: - """ - Convert the Geometry instance to a string representation in extended XYZ format. - """ + """Convert the Geometry instance to a string representation in extended XYZ format""" data = self.attributes() data.pop("cell") # cell needs special treatment if self.periodic: data["Lattice"] = self.cell.T # ase does Fortran ordering data["pbc"] = "T T T" - else: - data["pbc"] = "F F F" info_str = key_val_dict_to_str(data) properties, txt = self.per_atom.to_string() header = " ".join([properties, info_str]) return "\n".join([str(len(self)), header, txt]) def save(self, path_xyz: Path | str) -> None: - """ - Save the Geometry instance to an XYZ file. - """ + """Save the Geometry instance to an XYZ file""" path_xyz = psiflow.resolve_and_check(Path(path_xyz)) path_xyz.write_text(self.to_string()) def to_atoms(self, structural_only: bool = False) -> Atoms: - """ - Convert the Geometry instance to an Atoms object. - """ + """Convert the Geometry instance to an Atoms object""" if structural_only: return Atoms( positions=self.per_atom.positions, @@ -296,9 +281,7 @@ def to_atoms(self, structural_only: bool = False) -> Atoms: return next(read_xyz(io.StringIO(s), index=0)) def __eq__(self, other: "Geometry") -> bool: - """ - Check if two Geometry instances are structurally equal. - """ + """Check if two Geometry instances are structurally equal""" if ( isinstance(other, Geometry) and len(self) == len(other) @@ -311,16 +294,12 @@ def __eq__(self, other: "Geometry") -> bool: return False def __len__(self) -> int: - """ - Get the number of atoms in the geometry. - """ + """Get the number of atoms in the geometry""" return len(self.per_atom) @classmethod def from_string(cls, s: str) -> Geometry: - """ - Create a Geometry instance from a string representation in extended XYZ format. - """ + """Create a Geometry instance from a string representation in extended XYZ format""" n_atoms, header, body = s.strip().split("\n", 2) data = key_val_str_to_dict(header) data.pop("pbc", None) # geometry derives pbc from cell @@ -335,9 +314,7 @@ def from_string(cls, s: str) -> Geometry: @classmethod def load(cls, path_xyz: Path | str) -> "Geometry": - """ - Load a Geometry instance from an XYZ file. - """ + """Load a Geometry instance from an XYZ file""" path_xyz = psiflow.resolve_and_check(Path(path_xyz)) content = path_xyz.read_text() return cls.from_string(content) @@ -346,17 +323,13 @@ def load(cls, path_xyz: Path | str) -> "Geometry": def from_data( cls, numbers: np.ndarray, positions: np.ndarray, cell: Optional[np.ndarray] ) -> Geometry: - """ - Create a Geometry instance from atomic numbers, positions, and cell data. - """ + """Create a Geometry instance from atomic numbers, positions, and cell data""" per_atom = PerAtom(numbers.copy(), positions.copy()) return Geometry(per_atom, cell=copy.copy(cell)) @classmethod def from_atoms(cls, atoms: Atoms) -> Geometry: - """ - Create a Geometry instance from an ASE Atoms object. - """ + """Create a Geometry instance from an ASE Atoms object""" per_atom = PerAtom(**atoms.arrays) data = atoms.info if all(atoms.pbc): @@ -373,41 +346,31 @@ def from_atoms(cls, atoms: Atoms) -> Geometry: @property def periodic(self) -> bool: - """ - Check if the geometry is periodic. - """ + """Check if the geometry is periodic""" return not self.cell is None @property def per_atom_energy(self) -> float | MISSING: - """ - Calculate the energy per atom. - """ + """Calculate the per-atom energy""" if self.energy is MISSING: return MISSING return self.energy / len(self) @property def volume(self) -> Optional[float]: - """ - Calculate the volume of the unit cell. - """ + """Calculate the volume of the unit cell""" if not self.periodic: return None return np.linalg.det(self.cell) @property def atomic_masses(self) -> npt.NDArray: - """ - Get the atomic masses of the atoms in the geometry. - """ + """Get the atomic masses of the atoms in the geometry""" return np.array([atomic_masses[n] for n in self.numbers]) @property def numbers(self) -> npt.NDArray: - """ - Get a flattened version of the per_atom.numbers array. - """ + """Get a flattened version of per_atom.numbers""" return self.per_atom.numbers.flatten() @@ -556,7 +519,7 @@ def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> float: - """Compute the total atomic energy based on provided single atom energies.""" + """Compute the total atomic energy based on single atom energies""" total = 0 numbers, counts = np.unique(geometry.numbers, return_counts=True) for number, count in zip(numbers, counts): diff --git a/psiflow/learning.py b/psiflow/learning.py index a1193c9..af2dea3 100644 --- a/psiflow/learning.py +++ b/psiflow/learning.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) # logging per module +# TODO: remove -> merge into one app @typeguard.typechecked def _compute_error( state0: Geometry, @@ -45,7 +46,7 @@ def _compute_error( compute_error = python_app(_compute_error, executors=["default_threads"]) - +# TODO: remove -> merge into one app @typeguard.typechecked def _exceeds_error( errors: np.ndarray, @@ -56,7 +57,7 @@ def _exceeds_error( exceeds_error = python_app(_exceeds_error, executors=["default_threads"]) - +# TODO: remove -> merge into one app @typeguard.typechecked def evaluate_outputs( outputs: list[SimulationOutput], diff --git a/psiflow/models/mace.py b/psiflow/models/mace.py index cd8c41b..6d2c8d7 100644 --- a/psiflow/models/mace.py +++ b/psiflow/models/mace.py @@ -22,7 +22,6 @@ # TODO: when changing the training dataset, the computed avg_num_neighbors will also change, # making old checkpoints inconsistent -# TODO: sanitise_config consistently called? # TODO: training fails when batch_size > dataset (see github issue) # TODO: restart_latest functionality to continue training?? @@ -59,8 +58,7 @@ def sanitise_config(config: dict) -> dict: def format_E0s(atomic_energies: dict) -> dict | str: - """""" - # TODO: MACE fails when atomic energy is not defined for every element.. + """MACE will fail if atomic energy is not defined for all elements""" if atomic_energies: return {atomic_numbers[k]: v for k, v in atomic_energies.items()} else: diff --git a/psiflow/reference/cp2k_.py b/psiflow/reference/cp2k_.py index 44412f6..6000624 100644 --- a/psiflow/reference/cp2k_.py +++ b/psiflow/reference/cp2k_.py @@ -128,9 +128,6 @@ def get_single_atom_references(self, element: str) -> dict[int, Reference]: else: input_section["scf"] = {"ot": {"minimizer": "CG"}} - # necessary for oxygen calculation, at least in 2024.1 - # TODO: not an allowed keyword in 2025.2 - # input_section["scf"]["ignore_convergence_failure"] = "TRUE" input_str = dict_to_str(input_dict) references = {} diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py index 68f92c2..38ef513 100644 --- a/psiflow/reference/reference.py +++ b/psiflow/reference/reference.py @@ -60,7 +60,7 @@ def update_geometry(geom: Geometry, data: dict) -> Geometry: return geom -@join_app +@python_app(executors=['default_threads']) def get_minimum_energy(element: str, **kwargs) -> AppFuture: energies = {m: state.energy or np.inf for m, state in kwargs.items()} energy = min(energies.values()) @@ -70,11 +70,11 @@ def get_minimum_energy(element: str, **kwargs) -> AppFuture: ] logger.info('\n'.join(msg)) assert not np.isinf(energy), f"Atomic energy calculation of '{element}' failed" - return copy_app_future(energy) # TODO: why? + return energy def get_spin_multiplicities(element: str) -> list[int]: - """TODO: rethink this""" + """TODO: redo this""" # max S = N * 1/2, max mult = 2 * S + 1 from ase.symbols import atomic_numbers @@ -166,7 +166,7 @@ def _process_output( try: data = reference.parse_output(stdout) except LineNotFoundError: - data = {"status": Status.FAILED} # TODO: find out what went wrong? + data = {"status": Status.FAILED} data |= {"stdout": Path(inputs[0]), "stderr": Path(inputs[1])} return update_geometry(geom, data) @@ -182,7 +182,8 @@ def evaluate( ) -> AppFuture: """""" flag, *files = reference.create_input(geom=geom) - if not flag: # TODO: should we reset geom? + if not flag: + geom.reset() # remove fields to indicate no evaluation happened return copy_app_future(geom) # do the actual evaluation diff --git a/psiflow/sampling/_ase.py b/psiflow/sampling/_ase.py index e66ae3d..cb23c1a 100644 --- a/psiflow/sampling/_ase.py +++ b/psiflow/sampling/_ase.py @@ -1,6 +1,5 @@ """ Structure optimisation through ASE -TODO: do we need to check for very large forces? """ import os @@ -169,9 +168,10 @@ def main(args: SimpleNamespace): Path(args.output_xyz).touch() return - atoms.set_constraint() # usually noop if not any(atoms.pbc): - atoms.cell = None # remove meaningless cell + # remove meaningless cell and stress + atoms.cell = None + atoms.calc.results.pop("stress", None) ase.io.write(FILE_OUT, atoms, format="extxyz") shutil.copy(FILE_OUT, args.output_xyz) diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index a28fb6d..433623a 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -192,7 +192,6 @@ def setup_ensemble( bias_weights_list = [] # add hamiltonian components that are not shared between all walkers as bias - # TODO: do we always want to do this? for comp in components: if not comp.shared: force = ET.Element("force", forcefield=comp.name) @@ -429,7 +428,6 @@ def make_driver_commands( driver_kwargs: list[dict], file_xyz: File, files_hamiltonian: list[File] ) -> list[str]: """""" - # TODO: what if 'file_xyz' contains multiple geometries of different size? assert len(driver_kwargs) >= len(files_hamiltonian) default = f'i-pi-driver-py -u -S "" -m custom -P {PATH_DRIVER} -a {{address}} -o {{options}} &' @@ -538,6 +536,7 @@ def _sample( ) smotion = setup_smotion(coupling, plumed_list) + # TODO: validate all arguments together + consistently # make sure at least one checkpoint is being written if checkpoint_step is None: # default to every 5% of simulation progress if step is None: diff --git a/psiflow/sampling/server.py b/psiflow/sampling/server.py index c3d5856..b1eaae2 100644 --- a/psiflow/sampling/server.py +++ b/psiflow/sampling/server.py @@ -138,7 +138,7 @@ def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None: out = subprocess.run( "i-pi-remdsort input.xml", shell=True, capture_output=True, text=True ) - assert out.returncode == 0 # TODO: what if it isn't? + assert out.returncode == 0 print("REMDSORT") output_props = _.split(",") if (_ := output_props) else [] diff --git a/psiflow/utils/wq.py b/psiflow/utils/wq.py index 4ecc01d..1059ae5 100644 --- a/psiflow/utils/wq.py +++ b/psiflow/utils/wq.py @@ -1,4 +1,5 @@ # TODO: this probably does not work in a nested way +# or when join apps submit tasks at some point in the future (after exiting the context) import logging diff --git a/tests/test_data.py b/tests/test_data.py index e5d8155..d0fd49a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -14,7 +14,6 @@ def test_geometry(tmp_path): - # TODO: custom per_atom field geometry = Geometry.from_data( numbers=np.arange(1, 6), positions=np.random.uniform(0, 1, size=(5, 3)), @@ -122,7 +121,32 @@ def test_geometry(tmp_path): state.cell = [[1, 2, 3]] * 3 +def test_custom_attributes(dataset): + """Do geometries behave for non-default attributes?""" + geom = dataset[0].result() + + geom.custom1 = False + geom.custom2 = [42, "42"] + geom.per_atom.custom1 = np.ones(len(geom), dtype=int) * 42 + geom.per_atom.custom2 = np.random.random(len(geom)).astype(bool) + + string = geom.to_string() # casts per-atom properties to float.. + geom_ = Geometry.from_string(string) + atoms = geom.to_atoms() + + assert geom.custom1 == geom_.custom1 + assert geom.custom2 == geom_.custom2 + assert np.allclose(geom.per_atom.custom1, geom_.per_atom.custom1) + assert np.allclose(geom.per_atom.custom2, geom_.per_atom.custom2) + + assert geom.custom1 == atoms.info["custom1"] + assert geom.custom2 == atoms.info["custom2"] + assert np.allclose(geom.per_atom.custom1, atoms.arrays["custom1"]) + assert np.allclose(geom.per_atom.custom2, atoms.arrays["custom2"]) + + def test_readwrite_cycle(dataset, tmp_path): + # TODO: much overlap with test_geometry tmp_file = File(tmp_path / "test.xyz") data = dataset[:4].geometries().result() @@ -165,11 +189,13 @@ def test_readwrite_cycle(dataset, tmp_path): assert geometry.energy is MISSING geom = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None) - geom.stress = np.array([np.nan] * 9).reshape(3, 3) + with pytest.raises(ValueError): # cannot assign stress to molecules + geom.stress = np.array([np.nan] * 9).reshape(3, 3) string = geom.to_string() - assert "nan" in string # the stress field + assert "stress" not in string + assert "pbc" not in string geom_ = Geometry.from_string(string) - assert np.isnan(geom_.stress).all() + assert geom_.stress is MISSING def test_dataset(dataset, tmp_path): @@ -258,46 +284,6 @@ def test_dataset_from_xyz(tmp_path, dataset): assert geom.energy == at.calc.results["energy"] -# def test_index_element_mask(): -# numbers = np.array([1, 1, 1, 6, 6, 8, 8, 8]) -# elements = ["H"] -# indices = [0, 1, 4, 5] -# assert np.allclose( -# np.array([True, True, False, False, False, False, False, False]), -# get_index_element_mask(numbers, indices, elements), -# ) -# elements = ["H", "O"] -# indices = [0, 1, 4, 5] -# assert np.allclose( -# np.array([True, True, False, False, False, True, False, False]), -# get_index_element_mask(numbers, indices, elements), -# ) -# elements = ["H", "O"] -# indices = [3] -# assert np.allclose( -# get_index_element_mask(numbers, indices, elements), -# np.array([False] * len(numbers)), -# ) -# elements = ["H", "O"] -# indices = None -# assert np.allclose( -# get_index_element_mask(numbers, indices, elements), -# np.array([True, True, True, False, False, True, True, True]), -# ) -# elements = None -# indices = [0, 1, 2, 3, 5] -# assert np.allclose( -# get_index_element_mask(numbers, indices, elements), -# np.array([True, True, True, True, False, True, False, False]), -# ) -# elements = ["Cl"] # not present at all -# indices = None -# assert np.allclose( -# get_index_element_mask(numbers, indices, elements), -# np.array([False] * len(numbers)), -# ) - - def test_data_elements(dataset): elements = dataset.elements().result() assert "H" in elements From 756a3fec301388fb85d55176cce13494893c66d0 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 21 Apr 2026 18:05:03 +0200 Subject: [PATCH 15/16] add analysis comments before refactor - fully separate wandb - probably going to redo metrics as well --- psiflow/learning.py | 54 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/psiflow/learning.py b/psiflow/learning.py index af2dea3..5dca750 100644 --- a/psiflow/learning.py +++ b/psiflow/learning.py @@ -8,13 +8,14 @@ import numpy as np import typeguard from parsl.app.app import python_app -from parsl.dataflow.futures import AppFuture +from parsl.dataflow.futures import AppFuture, Future import psiflow from psiflow.data import Dataset from psiflow.geometry import Geometry#, NullState, assign_identifier from psiflow.hamiltonians import Hamiltonian, Zero from psiflow.metrics import Metrics +from psiflow.models import MACE # from psiflow.models import Model from psiflow.reference import Reference from psiflow.sampling import SimulationOutput, Walker, sample @@ -24,6 +25,23 @@ logger = logging.getLogger(__name__) # logging per module +# TODO: reset_model method to remove existing MLP + train/val split +# TODO: some plugin for data discarding + walker reset +# TODO: some plugin for data selection / uncertainty thing +# TODO: some plugin for wandb +# TODO: add_data method after init? +# TODO: clean_data method to throw away ridiculous structures +# TODO: remove psiflow.wait calls -- but still have blocking functionality + +# TODO: PASSIVE LEARNING +# 1. walk MD + store structures + track which structure belongs to which walker +# 2. some data selection / cleanup step +# 3. reference single point + hamiltonian eval +# 4. compare geometries + filter + discard + keep track of walkers to reset +# 5. update train/val and retrain + + + # TODO: remove -> merge into one app @typeguard.typechecked def _compute_error( @@ -100,18 +118,24 @@ def evaluate_outputs( return identifier, data, resets -@typeguard.typechecked -# @psiflow.serializable +class Thresholds: + """Container for hyperparams""" + pass + +@psiflow.register_serializable class Learning: + root: Path + model: MACE reference: Reference - path_output: str - identifier: Union[AppFuture, int] train_valid_split: float - error_thresholds_for_reset: list[Optional[float]] - error_thresholds_for_discard: list[Optional[float]] - metrics: Metrics + identifier: Union[AppFuture, int] + error_tresholds: Optional[Thresholds] + # error_thresholds_for_reset: list[Optional[float]] + # error_thresholds_for_discard: list[Optional[float]] + # metrics: Metrics iteration: int - data: Dataset + wait_for: Future + # data: Dataset def __init__( self, @@ -124,6 +148,8 @@ def __init__( wandb_project: Optional[str] = None, initial_data: Optional[Dataset] = None, ): + # TODO: accept model argument + # TODO: differentiate between fresh start <> restart self.reference = reference self.path_output = str(path_output) Path(self.path_output).mkdir(exist_ok=True, parents=True) @@ -146,6 +172,7 @@ def __init__( self.iteration = -1 def update(self, learning: Learning) -> None: + # TODO: does this make sense? assert self.path_output == learning.path_output self.train_valid_split = learning.train_valid_split self.error_thresholds_for_reset = learning.error_thresholds_for_reset @@ -157,6 +184,7 @@ def update(self, learning: Learning) -> None: self.iteration = learning.iteration def skip(self, name: str) -> bool: + # TODO: upon restart, should we not immediately find how far we got and what the next action should be? if not (Path(self.path_output) / name).exists(): return False else: # check that the iteration has completed, or delete @@ -169,6 +197,7 @@ def skip(self, name: str) -> bool: return True def save(self, name: str, model: Model, walkers: list[Walker]) -> None: + # TODO: fully revamp this method model.save(Path(self.path_output) / name) self.data.save(Path(self.path_output) / name / "data.xyz") @@ -217,6 +246,7 @@ def passive_learning( steps: int, **sampling_kwargs, ) -> tuple[Model, list[Walker]]: + # TODO: remove model argument + return only walkers self.iteration += 1 name = "{}_{}".format(self.iteration, "passive_learning") if self.skip(name): @@ -232,6 +262,9 @@ def passive_learning( step = steps nevaluations = steps // step for _i in range(nevaluations): + # TODO: splits into multiple simulations and does data evaluation + validation in steps + # -- do not think we want this.. The instruction is to collect multiple geometries per walker + # -- because we should handle all data validation in one go (errors / uncertainty ...) outputs = sample( walkers, steps=step, @@ -239,8 +272,10 @@ def passive_learning( **sampling_kwargs, ) # only apply thresholds to forces, not to energies since large offset can exist + # TODO: give users the choice? threshold_reset = [None, self.error_thresholds_for_reset[1]] threshold_discard = [None, self.error_thresholds_for_discard[1]] + # TODO: do single point evaluations here identifier, data, _ = evaluate_outputs( # ignore resets outputs, hamiltonian, @@ -253,6 +288,7 @@ def passive_learning( self.identifier = identifier self.data += data # automatically sorted based on identifier + # TODO: keep consistent train/val splits and do not reset model train, valid = self.data.split(self.train_valid_split) # also shuffles model.reset() model.initialize(train) From e021709be7ffaea95738c13422657d86fcda4c57 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 22 Apr 2026 10:43:46 +0200 Subject: [PATCH 16/16] fix test_models (again) whoops --- psiflow/compute.py | 14 ++++++++++- tests/test_models.py | 58 ++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/psiflow/compute.py b/psiflow/compute.py index 836bbb9..a6d1802 100644 --- a/psiflow/compute.py +++ b/psiflow/compute.py @@ -14,6 +14,9 @@ from psiflow.functions import Function from psiflow.utils.apps import pack +# TODO: what with missing values in arrays? + + ComputeInput: TypeAlias = ( Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry ) @@ -183,7 +186,6 @@ def _compare_results( **kwargs: list | np.ndarray, ) -> dict: """""" - # TODO: what with missing values? assert (result2 is None) != (len(kwargs) == 0) # xor if kwargs: result2 = ComputeResult.from_data(result1.n_atoms, kwargs) @@ -208,6 +210,16 @@ def _compare_results( compare_results = python_app(_compare_results, executors=["default_threads"]) +def _compare_arrays(arr1: Sequence, arr2: Sequence, metric: str = "RMSE") -> float: + """""" + metric_func = metric_func_map[metric] + arr1, arr2 = np.array(arr1), np.array(arr2) + return metric_func(arr1, arr2) + + +compare_arrays = python_app(_compare_arrays, executors=["default_threads"]) + + def input_to_geometries(data: ComputeInput) -> AppFuture: """Convert ComputeInput into a sequence of geometries (as a future)""" # Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry diff --git a/tests/test_models.py b/tests/test_models.py index 9476975..7da4e80 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -4,7 +4,7 @@ from parsl.app.futures import DataFuture import psiflow -from psiflow.data import compute_rmse +from psiflow.compute import compare_arrays from psiflow.hamiltonians import MACEHamiltonian from psiflow.models import MACE from psiflow.models.mace import KEY_ATOMIC_ENERGIES, KEY_ITERATION, MODEL_DIRS @@ -59,19 +59,16 @@ def test_mace_train(gpu, mace_config, dataset, tmp_path): validation = dataset[-5:] path = tmp_path / "mace" model = MACE.create(path, mace_config) + [data] = validation.get(key) model.train(training, validation) hamiltonian = model.create_hamiltonian() - rmse0 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation0 = hamiltonian.evaluate(validation) + [data0] = validation0.get(key) future_train = model.train(training, validation) hamiltonian = model.create_hamiltonian() - rmse1 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation1 = hamiltonian.evaluate(validation) + [data1] = validation1.get(key) future_train.result() # wait for second training run with pytest.raises(AssertionError): @@ -81,25 +78,25 @@ def test_mace_train(gpu, mace_config, dataset, tmp_path): # train from load model_ = MACE.load(path) hamiltonian = model_.create_hamiltonian() - rmse2 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation2 = hamiltonian.evaluate(validation) + [data2] = validation2.get(key) model_.train(training, validation) hamiltonian = model_.create_hamiltonian() - rmse3 = compute_rmse( - validation.get(key), - validation.evaluate(hamiltonian).get(key), - ) + validation3 = hamiltonian.evaluate(validation) + [data3] = validation3.get(key) - print(rmse0.result()) - print(rmse1.result()) - print(rmse2.result()) - print(rmse3.result()) + rmse0 = compare_arrays(data, data0).result() + rmse1 = compare_arrays(data, data1).result() + rmse2 = compare_arrays(data, data2).result() + rmse3 = compare_arrays(data, data3).result() - assert rmse0.result() > rmse1.result() - assert np.isclose(rmse1.result(), rmse2.result()) - assert rmse2.result() > rmse3.result() + print(rmse0) + print(rmse1) + print(rmse2) + print(rmse3) + assert rmse0 > rmse1 + assert np.isclose(rmse1, rmse2) + assert rmse2 > rmse3 def test_mace_hamiltonian(dataset, mace_foundation): @@ -114,8 +111,11 @@ def test_mace_hamiltonian(dataset, mace_foundation): assert hamiltonian1 == hamiltonian2 hamiltonian2.update_kwargs(enable_cueq=True) - e0 = hamiltonian0.compute(dataset, "energy") - e1 = hamiltonian1.compute(dataset, "energy") - e2 = hamiltonian2.compute(dataset, "energy") - assert np.allclose(e0.result(), e1.result()) - assert np.allclose(e0.result(), e2.result()) + out0 = hamiltonian0.compute(dataset) + out1 = hamiltonian1.compute(dataset) + out2 = hamiltonian2.compute(dataset) + out0, out1, out2 = out0.result(), out1.result(), out2.result() + for key in out0.keys: + # single precision + assert np.allclose(out0.get(key), out1.get(key), atol=1e-6) + assert np.allclose(out0.get(key), out2.get(key), atol=1e-6)