From 482b16c8cac13ab09c62475965499d49e2fd458e Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 1 Apr 2026 20:13:55 +0200
Subject: [PATCH 01/16] Update geometry.py

- per_atom quantities are now stored in a custom Container class,  rather than a np.recarray
- cleaned up geometry attributes
- new to_atoms() function
- redid a lot of internals
- start removing NullState
---
 psiflow/geometry.py | 827 ++++++++++++++++++++------------------------
 1 file changed, 373 insertions(+), 454 deletions(-)

diff --git a/psiflow/geometry.py b/psiflow/geometry.py
index c9400c4..a2fa7e5 100644
--- a/psiflow/geometry.py
+++ b/psiflow/geometry.py
@@ -1,463 +1,379 @@
-from __future__ import annotations  # necessary for type-guarding class methods
-
+import io
+import pickle
 from pathlib import Path
-from typing import Optional, Union
+from types import SimpleNamespace
+from typing import Optional, Union, Any
+from collections.abc import Sequence
 
+import ase.calculators.calculator
 import numpy as np
-import typeguard
+import numpy.typing as npt
 from ase import Atoms
-from ase.data import atomic_masses, chemical_symbols
-from ase.io.extxyz import key_val_dict_to_str, key_val_str_to_dict_regex
+from ase.data import atomic_masses, chemical_symbols, atomic_numbers
+from ase.io.extxyz import key_val_dict_to_str, read_xyz, key_val_str_to_dict
 from parsl.app.app import python_app
 
 import psiflow
 
-per_atom_dtype = np.dtype(
-    [
-        ("numbers", np.uint8),
-        ("positions", np.float64, (3,)),
-        ("forces", np.float64, (3,)),
-    ]
-)
-
-QUANTITIES = [
-    "positions",
-    "cell",
-    "numbers",
-    "energy",
-    "per_atom_energy",
-    "forces",
-    "stress",
-    "delta",
-    "logprob",
-    "phase",
-    "identifier",
-]
-
-
-@typeguard.typechecked
+# TODO: docstrings
+# TODO: custom attributes
+
+
+class Container:
+    """
+    Holds 'per atom' arrays like positions and forces
+    For every attribute, the first array dimension should be n_atoms
+    """
+
+    def __init__(
+        self,
+        numbers: npt.NDArray[np.uint8],
+        positions: npt.NDArray[np.float64],
+        **kwargs: npt.NDArray,
+    ):
+        self.numbers = numbers.astype(int).reshape(-1, 1)
+        self.positions = positions
+        self._check(positions)
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+
+    def __setattr__(self, name, value):
+        if not self._check(value):
+            raise ValueError(f"Field '{name}' is not a per-atom property..")
+        super().__setattr__(name, value)
+
+    def __getattr__(self, name):
+        # only runs if the attribute isn't found normally
+        raise AttributeError(
+            f"'{type(self).__name__}' instance has no attribute '{name}'"
+        )
+
+    def __len__(self) -> int:
+        return self.numbers.shape[0]
+
+    def __str__(self) -> str:
+        return f"Container[{len(self)}](keys={set(vars(self))})"
+
+    def _check(self, arr: npt.NDArray) -> bool:
+        try:
+            return (arr.shape[0] == len(self)) and arr.ndim == 2
+        except AttributeError:
+            return arr.ndim == 2  # numbers not yet defined
+
+    def reset(self):
+        """Wipes optional fields"""
+        attrs = [k for k in vars(self) if k not in ("numbers", "positions")]
+        for attr in attrs:
+            delattr(self, attr)
+
+    def to_string(self) -> tuple[str, str]:
+        symbols = [chemical_symbols[i] for i in self.numbers.flatten()]
+        data = {"species": np.array(symbols).reshape(-1, 1), "pos": self.positions}
+        if hasattr(self, "forces"):
+            data["forces"] = self.forces
+
+        # create extxyz header
+        header = "Properties=species:S:1:pos:R:3"
+        for k, arr in data.items():
+            if k in header:
+                continue
+            header += f":{k}:R:{arr.shape[1]}"
+
+        # create a structured array with all fields
+        dtypes = []
+        for k, arr in data.items():
+            dtypes += [arr.dtype] * arr.shape[1]
+        dtype = [(str(i), t) for i, t in enumerate(dtypes)]
+
+        structured = np.zeros(len(self), dtype=dtype)
+        i = 0
+        for k, arr in data.items():
+            for j in range(arr.shape[1]):
+                structured[str(i)] = arr[:, j]
+                i += 1
+
+        # write to string
+        formats = []
+        format_map = {"species": "%-2s"}
+        for k, arr in data.items():
+            dformat = format_map.get(k, "%16.8f")
+            formats += [dformat] * arr.shape[1]
+        buffer = io.StringIO()
+        np.savetxt(buffer, structured, fmt=formats)
+        txt = buffer.getvalue()
+
+        return header, txt
+
+    @classmethod
+    def from_string(cls, s: str, props: Optional[str] = None):
+        props = (props or "species:S:1:pos:R:3") + ":"
+        dtype_map = {"R": "f8", "S": "i8"}
+
+        # figure out what columns belong to which property
+        keys, dtypes = [], []
+        while props:
+            k, t, n, props = props.split(":", 3)
+            keys += [k] * int(n)
+            dtypes += [dtype_map[t]] * int(n)
+        dtype = [(str(i), t) for i, t in enumerate(dtypes)]
+
+        # convert species to numbers automatically
+        conv = {0: lambda s: atomic_numbers[s]}
+        structured = np.loadtxt(io.StringIO(s), dtype=dtype, converters=conv)
+
+        # group subarrays per property
+        data = {k: [] for k in set(keys)}
+        for i, k in enumerate(structured.dtype.names):
+            arr = structured[k]
+            data[keys[i]].append(arr)
+        arrays = {k: np.stack(v, axis=-1) for k, v in data.items()}
+
+        arrays["numbers"] = arrays.pop("species")
+        arrays["positions"] = arrays.pop("pos")
+        return cls(**arrays)
+
+
 class Geometry:
     """
     Represents an atomic structure with associated properties.
-
     This class encapsulates the atomic structure, including atom positions, cell parameters,
     and various physical properties such as energy and forces.
 
     Attributes:
-        per_atom (np.recarray): Record array containing per-atom properties.
+        per_atom (Container): Record array containing per-atom properties.
         cell (np.ndarray): 3x3 array representing the unit cell vectors.
-        order (dict): Dictionary to store custom ordering information.
         energy (Optional[float]): Total energy of the system.
         stress (Optional[np.ndarray]): Stress tensor of the system.
-        delta (Optional[float]): Delta value, if applicable.
-        phase (Optional[str]): Phase information, if applicable.
-        logprob (Optional[np.ndarray]): Log probability values, if applicable.
-        stdout (Optional[str]): Standard output information, if applicable.
-        identifier (Optional[int]): Unique identifier for the geometry.
     """
 
-    per_atom: np.recarray
-    cell: np.ndarray
-    order: dict
+    per_atom: Container
+    cell: npt.NDArray
     energy: Optional[float]
     stress: Optional[np.ndarray]
-    delta: Optional[float]
-    phase: Optional[str]
-    logprob: Optional[np.ndarray]
-    stdout: Optional[str]
-    identifier: Optional[int]
+    meta: SimpleNamespace
 
     def __init__(
         self,
-        per_atom: np.recarray,
-        cell: np.ndarray,
-        order: Optional[dict] = None,
+        per_atom: Container,
         energy: Optional[float] = None,
+        cell: Optional[np.ndarray] = None,
         stress: Optional[np.ndarray] = None,
-        delta: Optional[float] = None,
-        phase: Optional[str] = None,
-        logprob: Optional[np.ndarray] = None,
-        stdout: Optional[str] = None,
-        identifier: Optional[int] = None,
+        **kwargs: Any,
     ):
         """
         Initialize a Geometry instance, though the preferred way of instantiating
-        proceeds via the `from_data` or `from_atoms` class methods
-
-        Args:
-            per_atom (np.recarray): Record array containing per-atom properties.
-            cell (np.ndarray): 3x3 array representing the unit cell vectors.
-            order (Optional[dict], optional): Custom ordering information. Defaults to None.
-            energy (Optional[float], optional): Total energy of the system. Defaults to None.
-            stress (Optional[np.ndarray], optional): Stress tensor of the system. Defaults to None.
-            delta (Optional[float], optional): Delta value. Defaults to None.
-            phase (Optional[str], optional): Phase information. Defaults to None.
-            logprob (Optional[np.ndarray], optional): Log probability values. Defaults to None.
-            stdout (Optional[str], optional): Standard output information. Defaults to None.
-            identifier (Optional[int], optional): Unique identifier for the geometry. Defaults to None.
-        """
-        self.per_atom = per_atom.astype(per_atom_dtype)  # copies data
-        self.cell = cell.astype(np.float64)
-        assert self.cell.shape == (3, 3)
-        if order is None:
-            order = {}
-        self.order = order
+        proceeds via the class methods
+        """
+        self.per_atom = per_atom
         self.energy = energy
+        if cell is not None:
+            assert cell.shape == (3, 3)
+        self.cell = cell
+        if stress is not None:
+            assert stress.shape == (3, 3)
         self.stress = stress
-        self.delta = delta
-        self.phase = phase
-        self.logprob = logprob
-        self.stdout = stdout
-        self.identifier = identifier
+        self.meta = SimpleNamespace(**kwargs)
 
-    def reset(self):
+    def reset(self) -> None:
         """
         Reset all computed properties of the geometry to their default values.
         """
+        self.per_atom.reset()
         self.energy = None
         self.stress = None
-        self.delta = None
-        self.phase = None
-        self.logprob = None
-        self.per_atom.forces[:] = np.nan
 
-    def clean(self):
+    def clean(self) -> None:
         """
         Clean the geometry by resetting properties and removing additional information.
         """
         self.reset()
-        self.order = {}
-        self.stdout = None
-        self.identifier = None
+        self.meta = SimpleNamespace()
 
-    def __eq__(self, other) -> bool:
-        """
-        Check if two Geometry instances are equal.
-
-        Args:
-            other: The other object to compare with.
-
-        Returns:
-            bool: True if the geometries are equal, False otherwise.
-        """
-        if not isinstance(other, Geometry):
-            return False
-        # have to check separately for np.allclose due to different dtypes
-        equal = True
-        equal = equal and (len(self) == len(other))
-        equal = equal and (self.periodic == other.periodic)
-        if not equal:
-            return False
-        equal = equal and np.allclose(self.per_atom.numbers, other.per_atom.numbers)
-        equal = equal and np.allclose(self.per_atom.positions, other.per_atom.positions)
-        equal = equal and np.allclose(self.cell, other.cell)
-        return bool(equal)
-
-    def align_axes(self):
+    def align_axes(self) -> None:
         """
         Align the axes of the unit cell to a canonical representation for periodic systems.
         """
-        if self.periodic:  # only do something if periodic:
-            positions = self.per_atom.positions
-            cell = self.cell
-            transform_lower_triangular(positions, cell, reorder=False)
-            reduce_box_vectors(cell)
+        if not self.periodic:
+            return
+        positions = self.per_atom.positions
+        cell = self.cell
+        transform_lower_triangular(positions, cell, reorder=False)
+        reduce_box_vectors(cell)
 
-    def __len__(self):
+    def copy(self) -> Geometry:
         """
-        Get the number of atoms in the geometry.
-
-        Returns:
-            int: The number of atoms.
+        Create a deep copy of the Geometry instance.
         """
-        return len(self.per_atom)
+        return pickle.loads(pickle.dumps(self))
 
     def to_string(self) -> str:
         """
         Convert the Geometry instance to a string representation in extended XYZ format.
-
-        Returns:
-            str: String representation of the geometry.
         """
+        data = {k: getattr(self, k) for k in ("energy", "stress")}
         if self.periodic:
-            comment = 'Lattice="'
-            comment += " ".join([str(x) for x in np.reshape(self.cell.T, 9, order="F")])
-            comment += '" pbc="T T T" '
+            data["Lattice"] = self.cell.T  # ase does Fortran ordering
+            data["pbc"] = "T T T"
         else:
-            comment = 'pbc="F F F" '
-
-        write_forces = not np.any(np.isnan(self.per_atom.forces))
-        comment += "Properties=species:S:1:pos:R:3"
-        if write_forces:
-            comment += ":forces:R:3"
-        comment += " "
-
-        keys = [
-            "energy",
-            "stress",
-            "delta",
-            "phase",
-            "logprob",
-            "stdout",
-            "identifier",
-        ]
-        values_dict = {}
-        for key in keys:
-            value = getattr(self, key)
-            if value is None:
-                continue
-            if type(value) is np.ndarray:
-                if np.all(np.isnan(value)):
-                    continue
-            values_dict[key] = value
-        for key, value in self.order.items():
-            values_dict["order_" + key] = value
-        comment += key_val_dict_to_str(values_dict)
-        lines = ["{}".format(len(self))]
-        lines.append("{}".format(comment))
-        fmt = " ".join(["%2s"] + 3 * ["%16.8f"]) + " "
-        if write_forces:
-            fmt += " ".join(3 * ["%16.8f"])
-        for i in range(len(self)):
-            entry = (chemical_symbols[self.per_atom.numbers[i]],)
-            entry = entry + tuple(self.per_atom.positions[i])
-            if write_forces:
-                entry = entry + tuple(self.per_atom.forces[i])
-            lines.append(fmt % entry)
-        return "\n".join(lines)
-
-    def save(self, path_xyz: Union[Path, str]):
+            data["pbc"] = "F F F"
+        data |= vars(self.meta)
+        data = {k: v for k, v in data.items() if v is not None}
+        info_str = key_val_dict_to_str(data)
+        properties, txt = self.per_atom.to_string()
+        header = " ".join([properties, info_str])
+        return "\n".join([str(len(self)), header, txt])
+
+    def save(self, path_xyz: Path | str) -> None:
         """
         Save the Geometry instance to an XYZ file.
-
-        Args:
-            path_xyz (Union[Path, str]): Path to save the XYZ file.
         """
         path_xyz = psiflow.resolve_and_check(Path(path_xyz))
-        with open(path_xyz, "w") as f:
-            f.write(self.to_string())
+        path_xyz.write_text(self.to_string())
 
-    def copy(self) -> Geometry:
+    def to_atoms(self, structural_only: bool = False) -> Atoms:
         """
-        Create a deep copy of the Geometry instance.
+        Convert the Geometry instance to an Atoms object.
+        """
+        if structural_only:
+            return Atoms(
+                positions=self.per_atom.positions,
+                numbers=self.numbers,
+                cell=self.cell,
+                pbc=self.periodic,
+            )
+
+        # use the ASE extxyz reader
+        s = self.to_string()
+        return next(read_xyz(io.StringIO(s), index=0))
+
+    def __eq__(self, other: "Geometry") -> bool:
+        """
+        Check if two Geometry instances are structurally equal.
+        """
+        if (
+            isinstance(other, Geometry)
+            and len(self) == len(other)
+            and self.periodic == other.periodic
+            and (not self.periodic or np.allclose(self.cell, other.cell))
+            and np.allclose(self.per_atom.numbers, other.per_atom.numbers)
+            and np.allclose(self.per_atom.positions, other.per_atom.positions)
+        ):
+            return True
+        return False
 
-        Returns:
-            Geometry: A new Geometry instance with the same data.
+    def __len__(self) -> int:
         """
-        return Geometry.from_string(self.to_string())
+        Get the number of atoms in the geometry.
+        """
+        return len(self.per_atom)
 
     @classmethod
-    def from_string(cls, s: str, natoms: Optional[int] = None) -> Optional[Geometry]:
+    def from_string(cls, s: str) -> Geometry:
         """
         Create a Geometry instance from a string representation in extended XYZ format.
-
-        Args:
-            s (str): String representation of the geometry.
-            natoms (Optional[int], optional): Number of atoms (if known). Defaults to None.
-
-        Returns:
-            Optional[Geometry]: A new Geometry instance, or None if the string is empty.
         """
-        if len(s) == 0:
-            return None
-        if not natoms:  # natoms in s
-            lines = s.strip().split("\n")
-            natoms = int(lines[0])
-            lines = lines[1:]
+        n_atoms, header, body = s.strip().split("\n", 2)
+        data = key_val_str_to_dict(header)
+        data.pop("pbc", None)
+        if "Lattice" in data:
+            data["cell"] = data.pop("Lattice").T  # ase does Fortran ordering
         else:
-            lines = s.rstrip().split(
-                "\n"
-            )  # i-PI nonperiodic starts with empty -> rstrip!
-        assert len(lines) == natoms + 1
-        comment = lines[0]
-        comment_dict = key_val_str_to_dict_regex(comment)
-
-        # read and format per_atom data
-        column_indices = {}
-        if "Properties" in comment_dict:
-            properties = comment_dict["Properties"].split(":")
-            count = 0
-            for i in range(len(properties) // 3):
-                name = properties[3 * i]
-                ncolumns = int(properties[3 * i + 2])
-                column_indices[name] = count
-                count += ncolumns
-            assert "pos" in column_indices  # positions need to be there
-
-        per_atom = np.recarray(natoms, dtype=per_atom_dtype)
-        per_atom.forces[:] = np.nan
-        POS_INDEX = column_indices.get("pos", 1)
-        FORCES_INDEX = column_indices.get("forces", None)
-        for i in range(natoms):
-            values = lines[i + 1].split()
-            per_atom.numbers[i] = chemical_symbols.index(values[0])
-            per_atom.positions[i, :] = [
-                float(_) for _ in values[POS_INDEX : POS_INDEX + 3]
-            ]
-            if FORCES_INDEX is not None:
-                per_atom.forces[i, :] = [
-                    float(_) for _ in values[FORCES_INDEX : FORCES_INDEX + 3]
-                ]
-
-        order = {}
-        for key, value in comment_dict.items():
-            if key.startswith("order_"):
-                order[key.replace("order_", "")] = value
-
-        # TODO: this needs a better solution
-        cell = comment_dict.pop("Lattice", np.zeros((3, 3)))
-        if isinstance(cell, str):
-            # most likely an array of NaNs
-            cell = np.array(cell.split()).reshape((3, 3))
-        cell = cell.T # transposed!
-        pbc = sum(comment_dict.pop("pbc", [])) == 3
-        geometry = cls(
-            per_atom=per_atom,
-            cell=cell if pbc else np.zeros((3, 3)),
-            energy=comment_dict.pop("energy", None),
-            stress=comment_dict.pop("stress", None),
-            delta=comment_dict.pop("delta", None),
-            phase=comment_dict.pop("phase", None),
-            logprob=comment_dict.pop("logprob", None),
-            stdout=comment_dict.pop("stdout", None),
-            identifier=comment_dict.pop("identifier", None),
-            order=order,
-        )
-        return geometry
+            data["cell"] = None
+        per_atom = Container.from_string(body, data.pop("Properties", None))
+
+        assert len(per_atom) == int(n_atoms)
+        return Geometry(per_atom, **data)
 
     @classmethod
-    def load(cls, path_xyz: Union[Path, str]) -> Geometry:
+    def load(cls, path_xyz: Path | str) -> "Geometry":
         """
         Load a Geometry instance from an XYZ file.
-
-        Args:
-            path_xyz (Union[Path, str]): Path to the XYZ file.
-
-        Returns:
-            Geometry: A new Geometry instance loaded from the file.
         """
         path_xyz = psiflow.resolve_and_check(Path(path_xyz))
-        assert path_xyz.exists()
-        with open(path_xyz, "r") as f:
-            content = f.read()
+        content = path_xyz.read_text()
         return cls.from_string(content)
 
+    @classmethod
+    def from_data(
+        cls, numbers: np.ndarray, positions: np.ndarray, cell: Optional[np.ndarray]
+    ) -> Geometry:
+        """
+        Create a Geometry instance from atomic numbers, positions, and cell data.
+        """
+        per_atom = Container(numbers.copy(), positions.copy())
+        cell = cell if cell is None else cell.copy()
+        return Geometry(per_atom, cell=cell)
+
+    @classmethod
+    def from_atoms(cls, atoms: Atoms) -> Geometry:
+        """
+        Create a Geometry instance from an ASE Atoms object.
+        """
+        per_atom = Container(**atoms.arrays)
+        data = atoms.info
+        if all(atoms.pbc):
+            data["cell"] = atoms.cell.array
+        if atoms.calc is not None:
+            # ASE does stupid calc things
+            try:
+                data["energy"] = atoms.get_potential_energy()
+                per_atom.forces = atoms.get_forces()
+            except ase.calculators.calculator.PropertyNotImplementedError:
+                pass  # property was not stored in calc
+        return cls(per_atom, **data)
+
     @property
-    def periodic(self):
+    def periodic(self) -> bool:
         """
         Check if the geometry is periodic.
-
-        Returns:
-            bool: True if the geometry is periodic, False otherwise.
         """
         return np.any(self.cell)
 
     @property
-    def per_atom_energy(self):
+    def per_atom_energy(self) -> Optional[float]:
         """
         Calculate the energy per atom.
-
-        Returns:
-            Optional[float]: Energy per atom if total energy is available, None otherwise.
         """
         if self.energy is None:
             return None
-        else:
-            return self.energy / len(self)
+        return self.energy / len(self)
 
     @property
-    def volume(self):
+    def volume(self) -> Optional[float]:
         """
         Calculate the volume of the unit cell.
-
-        Returns:
-            float: Volume of the unit cell for periodic systems, np.nan for non-periodic systems.
         """
         if not self.periodic:
-            return np.nan
-        else:
-            return np.linalg.det(self.cell)
-        
-    @property
-    def atomic_masses(self):
-         """
-         Get the atomic masses of the atoms in the geometry.
- 
-         Returns:
-             np.ndarray: Array of atomic masses.
-         """
-         return np.array([atomic_masses[n] for n in self.per_atom.numbers])
+            return None
+        return np.linalg.det(self.cell)
 
-    @classmethod
-    def from_data(
-        cls,
-        numbers: np.ndarray,
-        positions: np.ndarray,
-        cell: Optional[np.ndarray],
-    ) -> Geometry:
+    @property
+    def atomic_masses(self) -> npt.NDArray:
         """
-        Create a Geometry instance from atomic numbers, positions, and cell data.
-
-        Args:
-            numbers (np.ndarray): Array of atomic numbers.
-            positions (np.ndarray): Array of atomic positions.
-            cell (Optional[np.ndarray]): Unit cell vectors (or None for non-periodic systems).
-
-        Returns:
-            Geometry: A new Geometry instance.
+        Get the atomic masses of the atoms in the geometry.
         """
-        per_atom = np.recarray(len(numbers), dtype=per_atom_dtype)
-        per_atom.numbers[:] = numbers
-        per_atom.positions[:] = positions
-        per_atom.forces[:] = np.nan
-        if cell is not None:
-            cell = cell.copy()
-        else:
-            cell = np.zeros((3, 3))
-        return Geometry(per_atom, cell)
+        return np.array([atomic_masses[n] for n in self.per_atom.numbers]).flatten()
 
-    @classmethod
-    def from_atoms(cls, atoms: Atoms) -> Geometry:
+    @property
+    def numbers(self) -> npt.NDArray:
         """
-        Create a Geometry instance from an ASE Atoms object.
-
-        Args:
-            atoms (Atoms): ASE Atoms object.
-
-        Returns:
-            Geometry: A new Geometry instance.
+        Get a flattened version of the per_atom.numbers array.
         """
-        per_atom = np.recarray(len(atoms), dtype=per_atom_dtype)
-        per_atom.numbers[:] = atoms.numbers.astype(np.uint8)
-        per_atom.positions[:] = atoms.get_positions()
-        per_atom.forces[:] = atoms.arrays.get("forces", np.nan)  # TODO: ASE stores forces in calc now
-        if np.any(atoms.pbc):
-            cell = np.array(atoms.cell)
-        else:
-            cell = np.zeros((3, 3))
-        geometry = cls(per_atom, cell)
-        geometry.energy = atoms.info.get("energy", None)
-        geometry.stress = atoms.info.get("stress", None)
-        geometry.delta = atoms.info.get("delta", None)
-        geometry.phase = atoms.info.get("phase", None)
-        geometry.logprob = atoms.info.get("logprob", None)
-        geometry.stdout = atoms.info.get("stdout", None)
-        geometry.identifier = atoms.info.get("identifier", None)
-        return geometry
-
-
-def new_nullstate():
-    """
-    Create a new null state Geometry.
+        return self.per_atom.numbers.flatten()
 
-    Returns:
-        Geometry: A Geometry instance representing a null state.
-    """
-    return Geometry.from_data(np.zeros(1), np.zeros((1, 3)), None)
+
+# def new_nullstate():
+#     """
+#     Create a new null state Geometry.
+#
+#     Returns:
+#         Geometry: A Geometry instance representing a null state.
+#     """
+#     return Geometry.from_data(np.zeros(1), np.zeros((1, 3)), None)
 
 
 # use universal dummy state
-NullState = new_nullstate()
+# NullState = new_nullstate()
 
 
 def is_lower_triangular(cell: np.ndarray) -> bool:
@@ -554,7 +470,6 @@ def reduce_box_vectors(cell: np.ndarray):
     cell[1, :] = cell[1, :] - cell[0, :] * np.round(cell[1, 0] / cell[0, 0])
 
 
-@typeguard.typechecked
 def get_mass_matrix(geometry: Geometry) -> np.ndarray:
     """
     Compute the mass matrix for a given geometry.
@@ -573,7 +488,6 @@ def get_mass_matrix(geometry: Geometry) -> np.ndarray:
     return np.outer(sqrt_inv, sqrt_inv)
 
 
-@typeguard.typechecked
 def mass_weight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray:
     """
     Apply mass-weighting to a Hessian matrix.
@@ -590,7 +504,6 @@ def mass_weight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray:
     return hessian * get_mass_matrix(geometry)
 
 
-@typeguard.typechecked
 def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray:
     """
     Remove mass-weighting from a Hessian matrix.
@@ -607,104 +520,110 @@ def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray:
     return hessian / get_mass_matrix(geometry)
 
 
-def create_outputs(quantities: list[str], data: list[Geometry]) -> list[np.ndarray]:
-    """
-    Create output arrays for specified quantities from a list of Geometry instances.
-
-    Args:
-        quantities (list[str]): List of quantity names to extract.
-        data (list[Geometry]): List of Geometry instances.
-
-    Returns:
-        list[np.ndarray]: List of arrays containing the requested quantities.
-    """
-    order_names = list(set([k for g in data for k in g.order]))
-    assert all([q in QUANTITIES + order_names for q in quantities])
-    natoms = np.array([len(geometry) for geometry in data], dtype=int)
-    max_natoms = np.max(natoms)
-    nframes = len(data)
-    nprob = 0
-    max_phase = 0
-    for state in data:
-        if state.logprob is not None:
-            nprob = max(len(state.logprob), nprob)
-        if state.phase is not None:
-            max_phase = max(len(state.phase), max_phase)
-
-    arrays = []
-    for quantity in quantities:
-        if quantity in ["positions", "forces"]:
-            array = np.empty((nframes, max_natoms, 3), dtype=np.float64)
-            array[:] = np.nan
-        elif quantity in ["cell", "stress"]:
-            array = np.empty((nframes, 3, 3), dtype=np.float64)
-            array[:] = np.nan
-        elif quantity in ["numbers"]:
-            array = np.empty((nframes, max_natoms), dtype=np.uint8)
-            array[:] = 0
-        elif quantity in ["energy", "delta", "per_atom_energy"]:
-            array = np.empty((nframes,), dtype=np.float64)
-            array[:] = np.nan
-        elif quantity in ["phase"]:
-            array = np.empty((nframes,), dtype=(np.unicode_, max_phase))
-            array[:] = ""
-        elif quantity in ["logprob"]:
-            array = np.empty((nframes, nprob), dtype=np.float64)
-            array[:] = np.nan
-        elif quantity in ["identifier"]:
-            array = np.empty((nframes,), dtype=np.int64)
-            array[:] = -1
-        elif quantity in order_names:
-            array = np.empty((nframes,), dtype=np.float64)
-            array[:] = np.nan
-        else:
-            raise AssertionError("missing quantity in if/else")
-        arrays.append(array)
-    return arrays
-
-
-def _assign_identifier(
-    state: Geometry,
-    identifier: int,
-    discard: bool = False,
-) -> tuple[Geometry, int]:
-    """
-    Assign an identifier to a Geometry instance.
-
-    Args:
-        state (Geometry): Input Geometry instance.
-        identifier (int): Identifier to assign.
-        discard (bool, optional): Whether to discard the state. Defaults to False.
-
-    Returns:
-        tuple[Geometry, int]: Updated Geometry and next available identifier.
-    """
-    if (state == NullState) or discard:
-        return state, identifier
-    else:
-        assert state.identifier is None
-        state.identifier = identifier
-        return state, identifier + 1
-
-
-assign_identifier = python_app(_assign_identifier, executors=["default_threads"])
-
-
-@typeguard.typechecked
-def _check_equality(
-    state0: Geometry,
-    state1: Geometry,
-) -> bool:
-    """
-    Check if two Geometry instances are equal.
-
-    Args:
-        state0 (Geometry): First Geometry instance.
-        state1 (Geometry): Second Geometry instance.
-
-    Returns:
-        bool: True if the Geometry instances are equal, False otherwise.
-    """
+def get_unique_numbers(states: Sequence[Geometry]) -> set[int]:
+    """Returns a set of unique atom numbers found across all states."""
+    numbers = set(el for geom in states for el in np.unique(geom.per_atom.numbers))
+    return {int(n) for n in numbers}
+
+
+def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> float:
+    """Compute the total atomic energy based on provided single atom energies."""
+    total = 0
+    numbers, counts = np.unique(geometry.numbers, return_counts=True)
+    for number, count in zip(numbers, counts):
+        symbol = chemical_symbols[number]
+        try:
+            total += count * atomic_energies[symbol]
+        except KeyError:
+            raise KeyError(f"No atomic energy value for symbol '{symbol}'..")
+    return float(total)
+
+
+# def create_outputs(quantities: list[str], data: list[Geometry]) -> list[np.ndarray]:
+#     """
+#     Create output arrays for specified quantities from a list of Geometry instances.
+#
+#     Args:
+#         quantities (list[str]): List of quantity names to extract.
+#         data (list[Geometry]): List of Geometry instances.
+#
+#     Returns:
+#         list[np.ndarray]: List of arrays containing the requested quantities.
+#     """
+#     order_names = list(set([k for g in data for k in g.order]))
+#     assert all([q in QUANTITIES + order_names for q in quantities])
+#     natoms = np.array([len(geometry) for geometry in data], dtype=int)
+#     max_natoms = np.max(natoms)
+#     nframes = len(data)
+#     nprob = 0
+#     max_phase = 0
+#     for state in data:
+#         if state.logprob is not None:
+#             nprob = max(len(state.logprob), nprob)
+#         if state.phase is not None:
+#             max_phase = max(len(state.phase), max_phase)
+#
+#     arrays = []
+#     for quantity in quantities:
+#         if quantity in ["positions", "forces"]:
+#             array = np.empty((nframes, max_natoms, 3), dtype=np.float64)
+#             array[:] = np.nan
+#         elif quantity in ["cell", "stress"]:
+#             array = np.empty((nframes, 3, 3), dtype=np.float64)
+#             array[:] = np.nan
+#         elif quantity in ["numbers"]:
+#             array = np.empty((nframes, max_natoms), dtype=np.uint8)
+#             array[:] = 0
+#         elif quantity in ["energy", "delta", "per_atom_energy"]:
+#             array = np.empty((nframes,), dtype=np.float64)
+#             array[:] = np.nan
+#         elif quantity in ["phase"]:
+#             array = np.empty((nframes,), dtype=(np.unicode_, max_phase))
+#             array[:] = ""
+#         elif quantity in ["logprob"]:
+#             array = np.empty((nframes, nprob), dtype=np.float64)
+#             array[:] = np.nan
+#         elif quantity in ["identifier"]:
+#             array = np.empty((nframes,), dtype=np.int64)
+#             array[:] = -1
+#         elif quantity in order_names:
+#             array = np.empty((nframes,), dtype=np.float64)
+#             array[:] = np.nan
+#         else:
+#             raise AssertionError("missing quantity in if/else")
+#         arrays.append(array)
+#     return arrays
+
+
+# def _assign_identifier(
+#     state: Geometry,
+#     identifier: int,
+#     discard: bool = False,
+# ) -> tuple[Geometry, int]:
+#     """
+#     Assign an identifier to a Geometry instance.
+#
+#     Args:
+#         state (Geometry): Input Geometry instance.
+#         identifier (int): Identifier to assign.
+#         discard (bool, optional): Whether to discard the state. Defaults to False.
+#
+#     Returns:
+#         tuple[Geometry, int]: Updated Geometry and next available identifier.
+#     """
+#     if (state == NullState) or discard:
+#         return state, identifier
+#     else:
+#         assert state.identifier is None
+#         state.identifier = identifier
+#         return state, identifier + 1
+
+
+# assign_identifier = python_app(_assign_identifier, executors=["default_threads"])
+
+
+def _check_equality(state0: Geometry, state1: Geometry) -> bool:
+    """    Check if two Geometry instances are equal"""
     return state0 == state1
 
 

From 43fde16269cf375d33674bd18ba5153a8d5dccfe Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 1 Apr 2026 20:15:59 +0200
Subject: [PATCH 02/16] updating data.utils

- API updates
- some rewrites
- functionaility unchanged
---
 psiflow/data/dataset.py | 192 +++++++------------------
 psiflow/data/utils.py   | 307 ++++++++++++++--------------------------
 2 files changed, 158 insertions(+), 341 deletions(-)

diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index 4a86f39..b67003a 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -1,18 +1,15 @@
-from __future__ import annotations  # necessary for type-guarding class methods
-
 import math
 from pathlib import Path
 from typing import Callable, ClassVar, Optional, Union
 
 import numpy as np
-import typeguard
 from parsl.app.app import join_app, python_app
 from parsl.app.python import PythonApp
 from parsl.data_provider.files import File
 from parsl.dataflow.futures import AppFuture, DataFuture
 
 import psiflow
-from psiflow.geometry import QUANTITIES, Geometry
+from psiflow.geometry import Geometry
 from psiflow.utils.apps import copy_data_future, pack
 
 from .utils import (
@@ -48,7 +45,7 @@ class Dataset:
 
     def __init__(
         self,
-        states: Optional[list[Union[AppFuture, Geometry]], AppFuture],
+        states: Optional[list[AppFuture | Geometry] | AppFuture] = None,
         extxyz: Optional[psiflow._DataFuture] = None,
     ):
         """
@@ -56,26 +53,15 @@ def __init__(
 
         Args:
             states: List of Geometry instances or AppFutures representing geometries.
-            extxyz: Optional DataFuture representing an existing extxyz file.
-
-        Note:
-            Either states or extxyz should be provided, not both.
         """
-        if extxyz is not None:
-            assert states is None
+        if extxyz is not None:  # takes precedence over states
             self.extxyz = extxyz
-        else:
-            assert states is not None
-            if not isinstance(states, list):  # AppFuture[list, geometry]
-                extra_states = states
-                states = []
-            else:
-                extra_states = None
-            self.extxyz = write_frames(
-                *states,
-                extra_states=extra_states,
-                outputs=[psiflow.context().new_file("data_", ".xyz")],
-            ).outputs[0]
+            return
+
+        if not isinstance(states, list):  # AppFuture[list, geometry]
+            states = [states]
+        file = psiflow.context().new_file("data_", ".xyz")
+        self.extxyz = write_frames(*states, outputs=[file]).outputs[0]
 
     def length(self) -> AppFuture:
         """
@@ -84,25 +70,19 @@ def length(self) -> AppFuture:
         Returns:
             AppFuture: Future representing the number of structures.
         """
-        return count_frames(inputs=[self.extxyz])
+        return count_frames(self.extxyz)
 
-    def shuffle(self):
+    def shuffle(self) -> "Dataset":
         """
         Shuffle the order of structures in the dataset.
-
-        Returns:
-            Dataset: A new Dataset with shuffled structures.
         """
-        extxyz = shuffle(
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+        file = psiflow.context().new_file("data_", ".xyz")
+        extxyz = shuffle(self.extxyz, outputs=[file]).outputs[0]
+        return Dataset(extxyz=extxyz)
 
     def __getitem__(
-        self,
-        index: Union[int, slice, list[int], AppFuture],
-    ) -> Union[Dataset, AppFuture]:
+        self, index: int | slice | list[int] | AppFuture
+    ) -> Dataset | AppFuture:
         """
         Get a subset of the dataset or a single structure.
 
@@ -112,36 +92,23 @@ def __getitem__(
         Returns:
             Union[Dataset, AppFuture]: A new Dataset or an AppFuture of a single Geometry.
         """
+        future_frames = read_frames(self.extxyz, indices=index)
         if isinstance(index, int):
-            future = read_frames(
-                [index],
-                inputs=[self.extxyz],
-                outputs=[],  # will return Geometry as Future
-            )
-            return future[0]
-        else:  # slice, list, AppFuture
-            extxyz = read_frames(
-                index,
-                inputs=[self.extxyz],
-                outputs=[psiflow.context().new_file("data_", ".xyz")],
-            ).outputs[0]
-            return Dataset(None, extxyz)
+            return future_frames[0]  # will return Geometry as Future
 
-    def save(self, path: Union[Path, str]) -> DataFuture:
+        # slice, list, AppFuture
+        file = psiflow.context().new_file("data_", ".xyz")
+        extxyz = write_frames(future_frames, outputs=[file]).outputs[0]
+        return Dataset(extxyz=extxyz)
+
+    def save(self, path: Path | str) -> DataFuture:
         """
         Save the dataset to a file.
-
-        Args:
-            path: Path to save the dataset.
-
         Returns:
             DataFuture: Future representing the file to which will be saved.
         """
         path = psiflow.resolve_and_check(Path(path))
-        future = copy_data_future(
-            inputs=[self.extxyz],
-            outputs=[File(str(path))],
-        )
+        future = copy_data_future(inputs=[self.extxyz], outputs=[File(path)])
         return future.outputs[0]
 
     def geometries(self) -> AppFuture:
@@ -151,96 +118,63 @@ def geometries(self) -> AppFuture:
         Returns:
             AppFuture: Future representing a list of Geometry instances.
         """
-        return read_frames(inputs=[self.extxyz])
+        return read_frames(self.extxyz)
 
     def __add__(self, dataset: Dataset) -> Dataset:
         """
         Concatenate two datasets.
-
-        Args:
-            dataset: Another Dataset to add to this one.
-
-        Returns:
-            Dataset: A new Dataset containing structures from both datasets.
         """
-        extxyz = join_frames(
-            inputs=[self.extxyz, dataset.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+        file = psiflow.context().new_file("data_", ".xyz")
+        future = join_frames(inputs=[self.extxyz, dataset.extxyz], outputs=[file])
+        return Dataset(extxyz=future.outputs[0])
 
-    def subtract_offset(self, **atomic_energies: Union[float, AppFuture]) -> Dataset:
+    def subtract_offset(self, **atomic_energies: float | AppFuture) -> Dataset:
         """
         Subtract atomic energy offsets from the dataset.
-
-        Args:
-            **atomic_energies: Atomic energies for each element.
-
-        Returns:
-            Dataset: A new Dataset with adjusted energies.
         """
         assert len(atomic_energies) > 0
-        extxyz = apply_offset(
-            True,
-            **atomic_energies,
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+        file = psiflow.context().new_file("data_", ".xyz")
+        future = apply_offset(
+            self.extxyz, subtract=True, **atomic_energies, outputs=[file]
+        )
+        return Dataset(extxyz=future.outputs[0])
 
     def add_offset(self, **atomic_energies) -> Dataset:
         """
         Add atomic energy offsets to the dataset.
-
-        Args:
-            **atomic_energies: Atomic energies for each element.
-
-        Returns:
-            Dataset: A new Dataset with adjusted energies.
         """
         assert len(atomic_energies) > 0
-        extxyz = apply_offset(
-            False,
-            **atomic_energies,
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+        file = psiflow.context().new_file("data_", ".xyz")
+        future = apply_offset(
+            self.extxyz, subtract=False, **atomic_energies, outputs=[file]
+        )
+        return Dataset(extxyz=future.outputs[0])
 
-    def elements(self):
+    def elements(self) -> AppFuture:
         """
         Get the set of elements present in the dataset.
 
         Returns:
             AppFuture: Future representing a set of element symbols.
         """
-        return get_elements(inputs=[self.extxyz])
+        return get_elements(self.extxyz)
 
-    def reset(self):
+    def reset(self) -> Dataset:
         """
         Reset all structures in the dataset.
-
-        Returns:
-            Dataset: A new Dataset with reset structures.
         """
-        extxyz = reset_frames(
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+        file = psiflow.context().new_file("data_", ".xyz")
+        future = reset_frames(self.extxyz, outputs=[file])
+        return Dataset(extxyz=future.outputs[0])
 
-    def clean(self):
+    def clean(self) -> Dataset:
         """
         Clean all structures in the dataset.
-
-        Returns:
-            Dataset: A new Dataset with cleaned structures.
         """
-        extxyz = clean_frames(
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+
+        file = psiflow.context().new_file("data_", ".xyz")
+        future = clean_frames(self.extxyz, outputs=[file])
+        return Dataset(extxyz=future.outputs[0])
 
     def get(
         self,
@@ -335,10 +269,8 @@ def not_null(self) -> Dataset:
         Returns:
             Dataset: A new Dataset without null states.
         """
-        extxyz = not_null(
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
+        file = psiflow.context().new_file("data_", ".xyz")
+        extxyz = not_null(inputs=[self.extxyz], outputs=[file]).outputs[0]
         return Dataset(None, extxyz)
 
     def align_axes(self):
@@ -393,25 +325,15 @@ def assign_identifiers(
         return result
 
     @classmethod
-    def load(
-        cls,
-        path_xyz: Union[Path, str],
-    ) -> Dataset:
+    def load(cls, path_xyz: Union[Path, str]) -> Dataset:
         """
         Load a dataset from a file.
-
-        Args:
-            path_xyz: Path to the XYZ file.
-
-        Returns:
-            Dataset: Loaded dataset.
         """
         path_xyz = psiflow.resolve_and_check(Path(path_xyz))
         assert path_xyz.exists()  # needs to be locally accessible
-        return cls(None, extxyz=File(str(path_xyz)))
+        return cls(extxyz=File(path_xyz))
 
 
-@typeguard.typechecked
 def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]:
     """
     Concatenate multiple lists of arrays.
@@ -459,7 +381,6 @@ def pad_arrays(
 concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"])
 
 
-@typeguard.typechecked
 def _aggregate_multiple(
     *arrays_list,
     coefficients: Optional[np.ndarray] = None,
@@ -493,7 +414,6 @@ def _aggregate_multiple(
 
 
 @join_app
-@typeguard.typechecked
 def batch_apply(
     apply_apps: tuple[Union[PythonApp, Callable]],
     arg: Union[Dataset, list[Geometry]],
@@ -560,7 +480,6 @@ def get_length(arg):
         return 1
 
 
-@typeguard.typechecked
 def compute(
     arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
     *apply_apps: Union[PythonApp, Callable],
@@ -623,7 +542,6 @@ def compute(
         return [future[i] for i in range(len(outputs_))]
 
 
-@typeguard.typechecked
 class Computable:
     """
     Base class for computable objects.
diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py
index 0f54936..1ca0e03 100644
--- a/psiflow/data/utils.py
+++ b/psiflow/data/utils.py
@@ -1,119 +1,94 @@
 import re
 import shutil
-from typing import Optional, Union, Sequence
+import subprocess
+from pathlib import Path
+from typing import Optional, Union, Sequence, Generator, TypeAlias
 
 import numpy as np
 import typeguard
 from ase.data import atomic_numbers, chemical_symbols
-from parsl.app.app import python_app
+from parsl import python_app, File
 from parsl.dataflow.futures import AppFuture
 
-from psiflow.geometry import Geometry, NullState, _assign_identifier, create_outputs
+from psiflow.geometry import Geometry, get_atomic_energy, get_unique_numbers
+
+# TODO: Note: This function is wrapped as a Parsl app and executed using the default_threads executor.
+# TODO: loosen type hints?
+# TODO: inputs/outputs for apps sometimes behave weird
+
+
+FileLike: TypeAlias = str | Path | File
+
+def iter_read_frames(file: FileLike) -> Generator[list[str]]:
+    """Yields text data to instantiate geometries"""
+    frame_regex = re.compile(r"^\d+$")
+    with open(file, "r") as f:
+        for line in f:
+            if frame_regex.match(_ := line.strip()):
+                n = int(_)
+                yield [line] + [f.readline() for _i in range(n + 1)]
 
 
-@typeguard.typechecked
 def _write_frames(
-    *states: Geometry,
-    extra_states: Union[Geometry, list[Geometry], None] = None,
-    outputs: list = [],
+    *states: Geometry | list[Geometry], outputs: Sequence[File] = ()
 ) -> None:
     """
     Write Geometry instances to a file.
 
     Args:
-        *states: Variable number of Geometry instances to write.
-        extra_states: Additional Geometry instance(s) to write.
+        states: Variable number of (lists of) Geometry instances to write.
         outputs: List of Parsl futures. The first element should be a DataFuture
                  representing the output file path.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    all_states = list(states)
-    if extra_states is not None:
-        if isinstance(extra_states, list):
-            all_states += extra_states
-        else:  # single geometry
-            all_states.append(extra_states)
+    assert len(outputs) == 1
+    data = []
+    for d in states:
+        if isinstance(d, list):
+            data.extend(d)
+        else:
+            data.append(d)
     with open(outputs[0], "w") as f:
-        for state in all_states:  # avoid double newline by using strip!
-            f.write(state.to_string().strip() + "\n")
+        f.write("".join([geom.to_string() for geom in data]))
 
 
 write_frames = python_app(_write_frames, executors=["default_threads"])
 
 
-@typeguard.typechecked
 def _read_frames(
-    indices: Union[None, slice, list[int], int] = None,
-    inputs: list = [],
-    outputs: list = [],
-) -> Optional[list[Geometry]]:
+    file: FileLike, indices: Optional[slice | list[int] | int] = None
+) -> list[Geometry]:
     """
     Read Geometry instances from a file.
 
     Args:
+        file: DataFuture representing the input file path containing the geometry data.
         indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer.
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing the geometry data.
-        outputs: List of Parsl futures. If provided, the first element should be
-                 a DataFuture representing the output file path where the selected
-                 geometries will be written.
 
-    Returns:
-        Optional[list[Geometry]]: List of read Geometry instances if no output
-                                  is specified, otherwise None.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    frame_index = 0
-    frame_regex = re.compile(r"^\d+$")
-    length = _count_frames(inputs=inputs)
-    _all = range(length)
+    if indices is None:
+        # read everything
+        data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)]
+        return [Geometry.from_string(s) for s in data]
+
+    # figure out what frames to read
+    # TODO: reads twice + indices always wrap modulo nframes which might be unexpected
+    length = _count_frames(file)
     if isinstance(indices, slice):
-        indices = list(_all[indices])
+        indices = list(range(length)[indices])
     elif isinstance(indices, int):
-        indices = [list(_all)[indices]]
-
-    if isinstance(indices, list):
-        if length > 0:
-            indices = [i % length for i in indices]  # for negative indices and wrapping
-        indices_ = set(indices)  # for *much* faster 'i in indices'
-    else:
-        assert indices is None
-        indices_ = None
-
-    data = []
-    with open(inputs[0], "r") as f:
-        while True:
-            line = f.readline()
-            if not line:
-                break
-            if frame_regex.match(line.strip()):
-                natoms = int(line.strip())
+        indices = [indices]
 
-                # currently at position frame_index, check if to be read
-                _ = [f.readline() for _i in range(natoms + 1)]
-                if indices_ is None or frame_index in indices_:
-                    data.append("".join([line] + _))
-                else:
-                    data.append(None)
-                frame_index += 1
+    if length > 0:
+        indices = [i % length for i in indices]  # for negative indices and wrapping
+    indices_ = set(indices)  # for *much* faster 'i in indices'
 
-    if indices is not None:  # sort states accordingly
-        data = [data[i] for i in indices]
+    data = {}
+    for i, geom_str_list in enumerate(iter_read_frames(file)):
+        if i in indices_:
+            data[i] = Geometry.from_string("".join(geom_str_list))
 
-    if len(outputs) > 0:
-        with open(outputs[0], "w") as f:
-            f.write("\n".join([d.strip() for d in data if d is not None]))
-            f.write("\n")
-    else:
-        geometries = [Geometry.from_string(s) for s in data if s is not None]
-        return geometries
+    # return in original order
+    return [data[i] for i in indices]
 
 
 read_frames = python_app(_read_frames, executors=["default_threads"])
@@ -365,11 +340,7 @@ def _assign_identifiers(
 assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _join_frames(
-    inputs: list = [],
-    outputs: list = [],
-):
+def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None:
     """
     Join multiple frame files into a single file.
 
@@ -378,12 +349,6 @@ def _join_frames(
                 representing an input file path containing geometry data.
         outputs: List of Parsl futures. The first element should be a DataFuture
                  representing the output file path where joined frames will be written.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
     assert len(outputs) == 1
 
@@ -396,157 +361,101 @@ def _join_frames(
 join_frames = python_app(_join_frames, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _count_frames(inputs: list = []) -> int:
-    """
-    Count the number of frames in a file.
+def _count_frames(file: FileLike) -> int:
+    """Count the number of frames in a file."""
+    path = Path(file)
+    threshold = 10 * 1024 * 1024  # 10 MB in bytes
+    file_size = path.stat().st_size
 
-    Args:
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
-
-    Returns:
-        int: Number of frames in the file.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    nframes = 0
-    frame_regex = re.compile(r"^\d+$")
-    with open(inputs[0], "r") as f:
-        for line in f:
-            if frame_regex.match(line.strip()):
-                nframes += 1
-                natoms = int(line.strip())
-                _ = [f.readline() for _i in range(natoms + 1)]
-    return nframes
+    if file_size > threshold:  # use grep
+        cmd = f"grep -Fc Properties {path}"
+        result = subprocess.check_output(cmd.split())
+        return int(result.strip())
+    return len([_ for _ in iter_read_frames(path)])
 
 
 count_frames = python_app(_count_frames, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _reset_frames(inputs: list = [], outputs: list = []) -> None:
+def _reset_frames(file: FileLike, outputs: Sequence[File] = ()) -> None:
     """
     Reset all frames in a file.
 
     Args:
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
+        file: DataFuture representing the input file path containing the geometry data.
         outputs: List of Parsl futures. The first element should be a DataFuture
                  representing the output file path where reset frames will be written.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    data = _read_frames(inputs=[inputs[0]])
+    assert len(outputs) == 1
+    data = _read_frames(file)
     for geometry in data:
         geometry.reset()
-    _write_frames(*data, outputs=[outputs[0]])
+    _write_frames(*data, outputs=outputs)
 
 
 reset_frames = python_app(_reset_frames, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _clean_frames(inputs: list = [], outputs: list = []) -> None:
+def _clean_frames(file: FileLike, outputs: Sequence[File] = ()) -> None:
     """
     Clean all frames in a file.
 
     Args:
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
+        file: DataFuture representing the input file path containing the geometry data.
         outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where cleaned frames will be written.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
+                 representing the output file path where reset frames will be written.
     """
-    data = _read_frames(inputs=[inputs[0]])
+    assert len(outputs) == 1
+    data = _read_frames(file)
     for geometry in data:
         geometry.clean()
-    _write_frames(*data, outputs=[outputs[0]])
+    _write_frames(*data, outputs=outputs)
 
 
 clean_frames = python_app(_clean_frames, executors=["default_threads"])
 
 
-@typeguard.typechecked
 def _apply_offset(
-    subtract: bool,
-    inputs: list = [],
-    outputs: list = [],
-    **atomic_energies: float,
+    file: FileLike, subtract: bool, outputs: Sequence[File] = (), **atomic_energies: float
 ) -> None:
     """
     Apply an energy offset to all frames in a file.
 
     Args:
+        file: DataFuture representing the input file path containing the geometry data.
         subtract: Whether to subtract or add the offset.
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
         outputs: List of Parsl futures. The first element should be a DataFuture
                  representing the output file path where updated frames will be written.
         **atomic_energies: Atomic energies for each element.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    assert len(inputs) == 1
     assert len(outputs) == 1
-    data = _read_frames(inputs=[inputs[0]])
-    numbers = [atomic_numbers[e] for e in atomic_energies.keys()]
-    all_numbers = [n for geometry in data for n in set(geometry.per_atom.numbers)]
-    for n in all_numbers:
-        if n != 0:  # from NullState
-            assert n in numbers
-    for geometry in data:
-        if geometry == NullState:
-            continue
-        natoms = len(geometry)
-        energy = geometry.energy
-        for number in numbers:
-            natoms_per_number = np.sum(geometry.per_atom.numbers == number)
-            if natoms_per_number == 0:
-                continue
-            element = chemical_symbols[number]
-            multiplier = -1 if subtract else 1
-            energy += multiplier * natoms_per_number * atomic_energies[element]
-            natoms -= natoms_per_number
-        assert natoms == 0  # all atoms accounted for
-        geometry.energy = energy
-    _write_frames(*data, outputs=[outputs[0]])
+    frames = _read_frames(file)
+    numbers_data = get_unique_numbers(frames)
+    numbers_kwargs = {atomic_numbers[e] for e in atomic_energies.keys()}
+    assert numbers_data == numbers_kwargs, "Provide atomic energies for all elements.."
+
+    for geom in frames:
+        energy = get_atomic_energy(geom, atomic_energies)
+        if subtract:
+            geom.energy -= energy
+        else:
+            geom.energy += energy
+
+    _write_frames(frames, outputs=outputs)
 
 
 apply_offset = python_app(_apply_offset, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _get_elements(inputs: list = []) -> set[str]:
+def _get_elements(*files: File) -> set[str]:
     """
-    Get the set of elements present in all frames of a file.
+    Get the set of elements present in all frames of a sequence of file.
 
     Args:
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
-
-    Returns:
-        set[str]: Set of element symbols.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
+        inputs: List of Parsl DataFuture
     """
-    data = _read_frames(inputs=[inputs[0]])
-    return set([chemical_symbols[n] for g in data for n in g.per_atom.numbers])
+    frames = [geom for file in files for geom in _read_frames(file)]
+    return {chemical_symbols[i] for i in get_unique_numbers(frames)}
 
 
 get_elements = python_app(_get_elements, executors=["default_threads"])
@@ -569,7 +478,7 @@ def _align_axes(inputs: list = [], outputs: list = []) -> None:
     Note:
         This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    data = _read_frames(inputs=[inputs[0]])
+    data = _read_frames(inputs[0])
     for geometry in data:
         geometry.align_axes()
     _write_frames(*data, outputs=[outputs[0]])
@@ -676,32 +585,22 @@ def _app_filter(
 )  # filter is protected
 
 
-@typeguard.typechecked
 def _shuffle(
-    inputs: list = [],
-    outputs: list = [],
+    file: FileLike,
+    outputs: Sequence[File] = (),
 ) -> None:
     """
     Shuffle the order of frames in a file.
 
     Args:
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
+        file: DataFuture representing the input file path containing the geometry data.
         outputs: List of Parsl futures. The first element should be a DataFuture
                  representing the output file path where shuffled frames will be written.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    data = _read_frames(inputs=[inputs[0]])
-    indices = np.arange(len(data))
-    np.random.shuffle(indices)
-
-    shuffled = [data[int(i)] for i in indices]
-    _write_frames(*shuffled, outputs=[outputs[0]])
+    assert len(outputs) == 1
+    frames = _read_frames(file)
+    np.random.shuffle(frames)
+    _write_frames(frames, outputs=outputs)
 
 
 shuffle = python_app(_shuffle, executors=["default_threads"])

From e825ea655cb94e09bd3250d53d34f48b9d834609 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 1 Apr 2026 23:20:00 +0200
Subject: [PATCH 03/16] dataset update v1

- Dataset.get simplified - cannot filter non per-atom attributes
- Dataset.get_per_atom now does filtering bases on ids or element
- Dataset.evaluate has got to go, I think
- Dataset.not_null removed

reimplemented _extract_quantities, _insert_quantities:
- arbitrary quantities
- no longer forces everything into arrays
- use MISSING vaue to indicate missing data

more changes in data.utils, but not different functionality
---
 psiflow/data/dataset.py | 101 +++------
 psiflow/data/utils.py   | 438 ++++++++++++++--------------------------
 2 files changed, 187 insertions(+), 352 deletions(-)

diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index b67003a..9df0142 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -25,11 +25,11 @@
     get_train_valid_indices,
     insert_quantities,
     join_frames,
-    not_null,
     read_frames,
     reset_frames,
     shuffle,
     write_frames,
+    extract_quantities_per_atom,
 )
 
 
@@ -171,55 +171,46 @@ def clean(self) -> Dataset:
         """
         Clean all structures in the dataset.
         """
-
         file = psiflow.context().new_file("data_", ".xyz")
         future = clean_frames(self.extxyz, outputs=[file])
         return Dataset(extxyz=future.outputs[0])
 
-    def get(
+    def get(self, *quantities: str) -> tuple[AppFuture, ...]:
+        """
+        Extract specified quantities from the dataset.
+        """
+        future_states = read_frames(self.extxyz)
+        future_dict = extract_quantities(future_states, quantities)
+        return tuple(future_dict[q] for q in quantities)
+
+    def get_per_atom(
         self,
         *quantities: str,
         atom_indices: Optional[list[int]] = None,
         elements: Optional[list[str]] = None,
-    ):
+    ) -> tuple[AppFuture, ...]:
         """
-        Extract specified quantities from the dataset.
+        Extract specified per atom quantities from the dataset.
 
         Args:
             *quantities: Names of quantities to extract.
             atom_indices: Optional list of atom indices to consider.
             elements: Optional list of element symbols to consider.
-
-        Returns:
-            Union[AppFuture, tuple[AppFuture, ...]]: Future(s) representing the extracted quantities.
         """
-        result = extract_quantities(
-            quantities,
-            atom_indices,
-            elements,
-            inputs=[self.extxyz],
+        future_states = read_frames(self.extxyz)
+        future_dict = extract_quantities_per_atom(
+            future_states, quantities, atom_indices, elements
         )
-        if len(quantities) == 1:
-            return result[0]
-        else:
-            return tuple([result[i] for i in range(len(quantities))])
+        return tuple(future_dict[q] for q in quantities)
 
     def evaluate(
-        self,
-        computable: Computable,
-        batch_size: Optional[int] = None,
+        self, computable: Computable, batch_size: Optional[int] = None
     ) -> Dataset:
         """
         Evaluate a Computable on the dataset.
-
-        Args:
-            computable: Computable object to evaluate.
-            batch_size: Optional batch size for evaluation.
-
-        Returns:
-            Dataset: A new Dataset with evaluation results.
         """
         # TODO: remove this functionality?
+        #  or this should be the (only) way to label a dataset?
         from psiflow.hamiltonians import Hamiltonian
 
         if not isinstance(computable, Hamiltonian):
@@ -233,58 +224,30 @@ def evaluate(
             outputs = computable.compute(self)  # use default from computable
         if not isinstance(outputs, list):  # compute unpacks for only one property
             outputs = [outputs]
-        future = insert_quantities(
-            quantities=tuple(computable.outputs),
-            arrays=pack(*outputs),
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        )
-        return Dataset(None, future.outputs[0])
+        data = {k: v for k, v in zip(computable.outputs, outputs)}
 
-    def filter(
-        self,
-        quantity: str,
-    ) -> Dataset:
-        """
-        Filter the dataset based on a specified quantity.
-
-        Args:
-            quantity: The quantity to filter on.
+        file = psiflow.context().new_file("data_", ".xyz")
 
-        Returns:
-            Dataset: A new Dataset containing only structures that pass the filter.
-        """
-        assert quantity in QUANTITIES
-        extxyz = app_filter(
-            quantity,
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+        future = read_frames(self.extxyz)
+        future = insert_quantities(future, data)
+        extxyz = write_frames(future, outputs=[file]).outputs[0]
+        return Dataset(extxyz=extxyz)
 
-    def not_null(self) -> Dataset:
+    def filter(self, quantity: str) -> Dataset:
         """
-        Remove null states from the dataset.
-
-        Returns:
-            Dataset: A new Dataset without null states.
+        Filter the dataset based on a specified quantity.
         """
         file = psiflow.context().new_file("data_", ".xyz")
-        extxyz = not_null(inputs=[self.extxyz], outputs=[file]).outputs[0]
-        return Dataset(None, extxyz)
+        future = app_filter(self.extxyz, quantity, outputs=[file])
+        return Dataset(extxyz=future.outputs[0])
 
-    def align_axes(self):
+    def align_axes(self) -> Dataset:
         """
         Adopt a canonical orientation for all (periodic) structures in the dataset.
-
-        Returns:
-            Dataset: A new Dataset with aligned structures.
         """
-        extxyz = align_axes(
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        ).outputs[0]
-        return Dataset(None, extxyz)
+        file = psiflow.context().new_file("data_", ".xyz")
+        future = align_axes(self.extxyz, outputs=[file])
+        return Dataset(extxyz=future.outputs[0])
 
     def split(self, fraction, shuffle=True):  # auto-shuffles
         """
diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py
index 1ca0e03..3f77308 100644
--- a/psiflow/data/utils.py
+++ b/psiflow/data/utils.py
@@ -19,6 +19,20 @@
 
 FileLike: TypeAlias = str | Path | File
 
+
+class MissingType:
+    """Placeholder sentinel for missing data fields"""
+
+    def __repr__(self):
+        return "<MISSING>"
+
+    def __bool__(self):
+        return False
+
+
+MISSING = MissingType()
+
+
 def iter_read_frames(file: FileLike) -> Generator[list[str]]:
     """Yields text data to instantiate geometries"""
     frame_regex = re.compile(r"^\d+$")
@@ -94,166 +108,96 @@ def _read_frames(
 read_frames = python_app(_read_frames, executors=["default_threads"])
 
 
-@typeguard.typechecked
 def _extract_quantities(
-    quantities: tuple[str, ...],
-    atom_indices: Optional[list[int]],
-    elements: Optional[list[str]],
-    *extra_data: Geometry,
-    inputs: list = [],
-) -> tuple[np.ndarray, ...]:
+    states: Sequence[Geometry], quantities: Sequence[str]
+) -> dict[str, list]:
     """
     Extract specified quantities from Geometry instances.
-
-    Args:
-        quantities: Tuple of quantity names to extract.
-        atom_indices: List of atom indices to consider.
-        elements: List of element symbols to consider.
-        *extra_data: Additional Geometry instances.
-        inputs: List of Parsl futures. If provided, the first element should be a DataFuture
-                representing the input file path containing geometry data.
-
-    Returns:
-        tuple[np.ndarray, ...]: Tuple of arrays containing extracted quantities.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    if not len(extra_data):
-        assert len(inputs) == 1
-        data = _read_frames(inputs=inputs)
-    else:
-        assert len(inputs) == 0
-        data = list(extra_data)
-    order_names = list(set([k for g in data for k in g.order]))
-    natoms = np.array([len(geometry) for geometry in data], dtype=int)
-    max_natoms = np.max(natoms)
-
-    arrays = create_outputs(quantities, data)
-    for i, geometry in enumerate(data):
-        mask = get_index_element_mask(
-            geometry.per_atom.numbers,
-            atom_indices,
-            elements,
-            natoms_padded=int(max_natoms),
-        )
-        natoms = len(geometry)
-        for j, quantity in enumerate(quantities):
-            if quantity == "positions":
-                arrays[j][i, mask, :] = geometry.per_atom.positions[mask[:natoms]]
-            elif quantity == "forces":
-                arrays[j][i, mask, :] = geometry.per_atom.forces[mask[:natoms]]
-            elif quantity == "cell":
-                arrays[j][i, :, :] = geometry.cell
-            elif quantity == "stress":
-                if geometry.stress is not None:
-                    arrays[j][i, :, :] = geometry.stress
-            elif quantity == "numbers":
-                arrays[j][i, :] = geometry.numbers
-            elif quantity == "energy":
-                if geometry.energy is not None:
-                    arrays[j][i] = geometry.energy
-            elif quantity == "delta":
-                if geometry.delta is not None:
-                    arrays[j][i] = geometry.delta
-            elif quantity == "per_atom_energy":
-                if geometry.energy is not None:
-                    arrays[j][i] = geometry.per_atom_energy
-            elif quantity == "phase":
-                if geometry.phase is not None:
-                    arrays[j][i] = geometry.phase
-            elif quantity == "logprob":
-                if geometry.logprob is not None:
-                    arrays[j][i, :] = geometry.logprob
-            elif quantity == "identifier":
-                if geometry.identifier is not None:
-                    arrays[j][i] = geometry.identifier
-            elif quantity in order_names:
-                if quantity in geometry.order:
-                    arrays[j][i] = geometry.order[quantity]
-    return tuple(arrays)
+    data = {k: [] for k in quantities}
+    for k in quantities:
+        if k in ("cell", "energy", "stress"):
+            for geom in states:
+                value = getattr(geom, k, MISSING)
+                data[k].append(value)
+            continue
+        for geom in states:
+            value = getattr(geom.per_atom, k, MISSING)
+            if value is MISSING:
+                value = getattr(geom.meta, k, MISSING)
+            data[k].append(value)
+    return data
 
 
 extract_quantities = python_app(_extract_quantities, executors=["default_threads"])
 
 
-@typeguard.typechecked
 def _insert_quantities(
-    quantities: tuple[str, ...],
-    arrays: Sequence[np.ndarray],
-    data: Optional[list[Geometry]] = None,
-    inputs: list = [],
-    outputs: list = [],
-) -> None:
+    states: Sequence[Geometry], data: dict[str, list]
+) -> Sequence[Geometry]:
     """
-    Insert quantities into Geometry instances.
-
-    Args:
-        quantities: Tuple of quantity names to insert.
-        arrays: List of arrays containing the quantities to insert.
-        data: List of Geometry instances to update.
-        inputs: List of Parsl futures. If provided, the first element should be a DataFuture
-                representing the input file path containing geometry data.
-        outputs: List of Parsl futures. If provided, the first element should be a DataFuture
-                 representing the output file path where updated geometries will be written.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
+    Insert quantities from data into Geometry instances.
     """
-    if data is None:
-        assert len(inputs) == 1
-        data = _read_frames(inputs=inputs)
-    else:
-        assert len(inputs) == 0
-    max_natoms = max([len(geometry) for geometry in data])
+    for q, values in data.items():
+        assert len(states) == len(values)
 
-    for i, geometry in enumerate(data):
-        if geometry == NullState:
+        if q in ("cell", "energy", "stress"):
+            for geom, v in zip(states, values):
+                setattr(geom, q, v)
             continue
-        mask = get_index_element_mask(
-            geometry.per_atom.numbers,
-            None,
-            None,
-            natoms_padded=int(max_natoms),
-        )
-        natoms = len(geometry)
-        for j, quantity in enumerate(quantities):
-            if quantity == "positions":
-                geometry.per_atom.positions[:natoms, :] = arrays[j][i, mask, :]
-            elif quantity == "forces":
-                geometry.per_atom.forces[mask[:natoms]] = arrays[j][i, mask, :]
-            elif quantity == "cell":
-                geometry.cell = arrays[j][i, :, :]
-            elif quantity == "stress":
-                geometry.stress = arrays[j][i, :, :]
-            elif quantity == "numbers":
-                geometry.numbers = arrays[j][i, :]
-            elif quantity == "energy":
-                geometry.energy = arrays[j][i]
-            elif quantity == "delta":
-                geometry.delta = arrays[j][i]
-            elif quantity == "per_atom_energy":
-                geometry.per_atom_energy = arrays[j][i]
-            elif quantity == "phase":
-                geometry.phase = arrays[j][i]
-            elif quantity == "logprob":
-                geometry.logprob = arrays[j][i, :]
-            elif quantity == "identifier":
-                geometry.identifier = arrays[j][i]
-            elif quantity in geometry.order:
-                geometry.order[quantity] = arrays[j][i]
+
+        for geom, v in zip(states, values):
+            if isinstance(v, np.ndarray) and len(geom) == len(v):
+                setattr(geom.per_atom, q, v)
             else:
-                raise ValueError("unknown quantity {}".format(quantity))
-    if len(outputs) > 0:
-        _write_frames(*data, outputs=[outputs[0]])
+                setattr(geom.meta, q, v)
+
+    return states
 
 
 insert_quantities = python_app(_insert_quantities, executors=["default_threads"])
 
 
+def _extract_quantities_per_atom(
+    states: Sequence[Geometry],
+    quantities: Sequence[str],
+    atom_indices: Optional[Sequence[int]] = None,
+    elements: Optional[Sequence[str]] = None,
+) -> dict[str, list]:
+    """
+    Extract per atom quantities from Geometry instances, filtering based on atom_indices or elements.
+    """
+    if atom_indices is None and elements is None:
+        # no filtering
+        data = {}
+        for k in quantities:
+            data[k] = [getattr(geom.per_atom, k, MISSING) for geom in states]
+        return data
+
+    data = {k: [] for k in quantities}
+    numbers = {atomic_numbers[s] for s in elements or ()}
+    for geom in states:
+        # mask out unwanted rows
+        mask = np.zeros(len(geom), dtype=bool)
+        for n in numbers:
+            mask[geom.numbers == n] = True
+        if atom_indices is not None:
+            mask[atom_indices] = True
+
+        for k in quantities:
+            value = getattr(geom.per_atom, k, MISSING)
+            if not value is MISSING:
+                value = value[mask]
+            data[k].append(value)
+
+    return data
+
+
+extract_quantities_per_atom = python_app(
+    _extract_quantities_per_atom, executors=["default_threads"]
+)
+
+
 @typeguard.typechecked
 def _check_distances(state: Geometry, threshold: float) -> Geometry:
     """
@@ -416,7 +360,10 @@ def _clean_frames(file: FileLike, outputs: Sequence[File] = ()) -> None:
 
 
 def _apply_offset(
-    file: FileLike, subtract: bool, outputs: Sequence[File] = (), **atomic_energies: float
+    file: FileLike,
+    subtract: bool,
+    outputs: Sequence[File] = (),
+    **atomic_energies: float,
 ) -> None:
     """
     Apply an energy offset to all frames in a file.
@@ -461,128 +408,52 @@ def _get_elements(*files: File) -> set[str]:
 get_elements = python_app(_get_elements, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _align_axes(inputs: list = [], outputs: list = []) -> None:
+def _align_axes(file: FileLike, outputs: Sequence[File] = ()) -> None:
     """
     Align axes for all frames in a file.
 
     Args:
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
+        file: DataFuture representing the input file path containing the geometry data.
         outputs: List of Parsl futures. The first element should be a DataFuture
                  representing the output file path where aligned frames will be written.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    data = _read_frames(inputs[0])
+    assert len(outputs) == 1
+    data = _read_frames(file)
     for geometry in data:
         geometry.align_axes()
-    _write_frames(*data, outputs=[outputs[0]])
+    _write_frames(*data, outputs=outputs)
 
 
 align_axes = python_app(_align_axes, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _not_null(inputs: list = [], outputs: list = []) -> list[bool]:
-    """
-    Check which frames in a file are not null states.
-
-    Args:
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
-        outputs: List of Parsl futures. If provided, the first element should be a DataFuture
-                 representing the output file path where non-null frames will be written.
-
-    Returns:
-        list[bool]: List of boolean values indicating non-null states.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
+def _app_filter(file: FileLike, quantity: str, outputs: Sequence[File] = ()) -> None:
     """
-    frame_regex = re.compile(r"^\d+$")
-
-    data = []
-    mask = []
-    with open(inputs[0], "r") as f:
-        while True:
-            line = f.readline()
-            if not line:
-                break
-            if frame_regex.match(line.strip()):
-                natoms = int(line.strip())
-                _ = [f.readline() for _i in range(natoms + 1)]
-                if natoms == 1:
-                    if _[-1].strip()[0] == "X":  # check if element(-1) == X
-                        mask.append(False)
-                        continue
-                data.append("".join([line] + _))
-                mask.append(True)
-    if len(outputs) > 0:
-        with open(outputs[0], "w") as f:
-            f.write("\n".join(data))
-    return mask
-
-
-not_null = python_app(_not_null, executors=["default_threads"])
-
-
-@typeguard.typechecked
-def _app_filter(
-    quantity: str,
-    inputs: list = [],
-    outputs: list = [],
-) -> None:
+    Filter frames based on a specified quantity and writes to first output.
     """
-    Filter frames based on a specified quantity.
-
-    Args:
-        quantity: The quantity to filter on.
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where filtered frames will be written.
-
-    Returns:
-        None
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    data = _read_frames(inputs=[inputs[0]])
-    i = 0
-    while i < len(data):
-        if quantity == "forces":
-            if np.all(np.invert(np.isnan(data[i].per_atom.forces))):
-                retain = True
-            else:
-                retain = False
-        elif quantity == "cell":
-            if not np.allclose(data[i].cell, 0.0):
-                retain = True
-            else:
-                retain = False
-        elif hasattr(data[i], quantity) and getattr(data[i], quantity) is not None:
-            retain = True
-        else:
-            retain = False
+    # TODO: where is this used?
+    data = _read_frames(file)
+    out = []
+
+    # None does not count
+    for geom in data:
+        if (quantity in ("cell", "energy", "stress")) and getattr(
+            geom, quantity, None
+        ) is not None:
+            out.append(geom)
+            continue
 
-        # pop if necessary:
-        if retain:
-            i += 1
-        else:
-            data.pop(i)
+        value = getattr(geom.per_atom, quantity, None)
+        if value is None:
+            value = getattr(geom.meta, quantity, None)
+        if value is not None:
+            out.append(geom)
 
-    _write_frames(*data, outputs=[outputs[0]])
+    _write_frames(out, outputs=outputs)
 
 
-app_filter = python_app(
-    _app_filter, executors=["default_threads"]
-)  # filter is protected
+# filter is protected
+app_filter = python_app(_app_filter, executors=["default_threads"])
 
 
 def _shuffle(
@@ -662,46 +533,47 @@ def get_train_valid_indices(
     return future[0], future[1]
 
 
-@typeguard.typechecked
-def get_index_element_mask(
-    numbers: np.ndarray,
-    atom_indices: Optional[list[int]],
-    elements: Optional[list[str]],
-    natoms_padded: Optional[int] = None,
-) -> np.ndarray:
-    """
-    Generate a mask for atom indices and elements.
-
-    Args:
-        numbers: Array of atomic numbers.
-        atom_indices: List of atom indices to include.
-        elements: List of element symbols to include.
-        natoms_padded: Total number of atoms including padding.
-
-    Returns:
-        np.ndarray: Boolean mask array.
-    """
-    mask = np.array([True] * len(numbers))
-
-    if elements is not None:
-        numbers_to_include = [atomic_numbers[e] for e in elements]
-        mask_elements = np.array([False] * len(numbers))
-        for number in numbers_to_include:
-            mask_elements = np.logical_or(mask_elements, (numbers == number))
-        mask = np.logical_and(mask, mask_elements)
-
-    if natoms_padded is not None:
-        assert natoms_padded >= len(numbers)
-        padding = natoms_padded - len(numbers)
-        mask = np.concatenate((mask, np.array([False] * padding)), axis=0).astype(bool)
-
-    if atom_indices is not None:  # below padding
-        mask_indices = np.array([False] * len(mask))
-        mask_indices[np.array(atom_indices)] = True
-        mask = np.logical_and(mask, mask_indices)
-
-    return mask
-
+# @typeguard.typechecked
+# def get_index_element_mask(
+#     numbers: np.ndarray,
+#     atom_indices: Optional[list[int]],
+#     elements: Optional[list[str]],
+#     natoms_padded: Optional[int] = None,
+# ) -> np.ndarray:
+#     """
+#     Generate a mask for atom indices and elements.
+#
+#     Args:
+#         numbers: Array of atomic numbers.
+#         atom_indices: List of atom indices to include.
+#         elements: List of element symbols to include.
+#         natoms_padded: Total number of atoms including padding.
+#
+#     Returns:
+#         np.ndarray: Boolean mask array.
+#     """
+#     mask = np.array([True] * len(numbers))
+#
+#     if elements is not None:
+#         numbers_to_include = [atomic_numbers[e] for e in elements]
+#         mask_elements = np.array([False] * len(numbers))
+#         for number in numbers_to_include:
+#             mask_elements = np.logical_or(mask_elements, (numbers == number))
+#         mask = np.logical_and(mask, mask_elements)
+#
+#     if natoms_padded is not None:
+#         assert natoms_padded >= len(numbers)
+#         padding = natoms_padded - len(numbers)
+#         mask = np.concatenate((mask, np.array([False] * padding)), axis=0).astype(bool)
+#
+#     if atom_indices is not None:  # below padding
+#         mask_indices = np.array([False] * len(mask))
+#         mask_indices[np.array(atom_indices)] = True
+#         mask = np.logical_and(mask, mask_indices)
+#
+#     return mask
+
+# TODO: these do not belong here
 
 @typeguard.typechecked
 def _compute_rmse(

From 40fd431e2d07be0b236c445685122898af86adde Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Thu, 2 Apr 2026 11:37:03 +0200
Subject: [PATCH 04/16] some restructuring

---
 psiflow/abandonware.py   |  51 +++++
 psiflow/compute.py       | 338 ++++++++++++++++++++++++++++
 psiflow/data/__init__.py |   3 +-
 psiflow/data/dataset.py  | 270 +---------------------
 psiflow/data/utils.py    | 472 +++++++++++++--------------------------
 psiflow/geometry.py      | 123 +---------
 6 files changed, 563 insertions(+), 694 deletions(-)
 create mode 100644 psiflow/abandonware.py
 create mode 100644 psiflow/compute.py

diff --git a/psiflow/abandonware.py b/psiflow/abandonware.py
new file mode 100644
index 0000000..5d9f9b0
--- /dev/null
+++ b/psiflow/abandonware.py
@@ -0,0 +1,51 @@
+"""
+TODO: these functions seem to not be used anywhere in the codebase
+ probably a good idea to remove these eventually
+"""
+
+
+def _check_equality(state0: Geometry, state1: Geometry) -> bool:
+    """Check if two Geometry instances are equal"""
+    return state0 == state1
+
+
+check_equality = python_app(_check_equality, executors=["default_threads"])
+
+
+@typeguard.typechecked
+def _check_distances(state: Geometry, threshold: float) -> Geometry:
+    """
+    Check if all interatomic distances in a Geometry are above a threshold.
+
+    Args:
+        state: Geometry instance to check.
+        threshold: Minimum allowed interatomic distance.
+
+    Returns:
+        Geometry: The input Geometry if all distances are above the threshold, otherwise NullState.
+
+    Note:
+        This function is wrapped as a Parsl app and executed using the default_htex executor.
+    """
+    from ase.geometry.geometry import find_mic
+
+    if state == NullState:
+        return NullState
+    nrows = int(len(state) * (len(state) - 1) / 2)
+    deltas = np.zeros((nrows, 3))
+    count = 0
+    for i in range(len(state) - 1):
+        for j in range(i + 1, len(state)):
+            deltas[count] = state.per_atom.positions[i] - state.per_atom.positions[j]
+            count += 1
+    assert count == nrows
+    if state.periodic:
+        deltas, _ = find_mic(deltas, state.cell)
+    check = np.all(np.linalg.norm(deltas, axis=1) > threshold)
+    if check:
+        return state
+    else:
+        return NullState
+
+
+check_distances = python_app(_check_distances, executors=["default_htex"])
diff --git a/psiflow/compute.py b/psiflow/compute.py
new file mode 100644
index 0000000..110ab33
--- /dev/null
+++ b/psiflow/compute.py
@@ -0,0 +1,338 @@
+import math
+from typing import Callable, ClassVar, Optional, Union
+
+import numpy as np
+import typeguard
+from parsl import python_app
+from parsl.app.app import join_app, python_app
+from parsl.app.python import PythonApp
+from parsl.dataflow.futures import AppFuture, DataFuture
+
+import psiflow
+from psiflow.geometry import Geometry
+from psiflow.data import Dataset
+from psiflow.data.utils import batch_frames
+
+
+def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]:
+    """
+    Concatenate multiple lists of arrays.
+
+    Args:
+        *args: Lists of numpy arrays to concatenate.
+
+    Returns:
+        list[np.ndarray]: List of concatenated arrays.
+
+    Note:
+        This function is wrapped as a Parsl app and executed using the default_threads executor.
+    """
+
+    def pad_arrays(
+        arrays: list[np.ndarray],
+        pad_dimension: int = 1,
+    ) -> list[np.ndarray]:
+        ndims = np.array([len(a.shape) for a in arrays])
+        assert np.all(ndims == ndims[0])
+        assert np.all(pad_dimension < ndims)
+
+        pad_size = max([a.shape[pad_dimension] for a in arrays])
+        for i in range(len(arrays)):
+            shape = list(arrays[i].shape)
+            shape[pad_dimension] = pad_size - shape[pad_dimension]
+            padding = np.zeros(tuple(shape)) + np.nan
+            arrays[i] = np.concatenate((arrays[i], padding), axis=pad_dimension)
+        return arrays
+
+    narrays = len(args[0])
+    for arg in args:
+        assert isinstance(arg, list)
+    assert all([len(a) == narrays for a in args])
+
+    concatenated = []
+    for i in range(narrays):
+        arrays = [arg[i] for arg in args]
+        if len(arrays[0].shape) > 1:
+            pad_arrays(arrays)
+        concatenated.append(np.concatenate(tuple(arrays)))
+    return concatenated
+
+
+concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"])
+
+
+def _aggregate_multiple(
+    *arrays_list,
+    coefficients: Optional[np.ndarray] = None,
+) -> list[np.ndarray]:
+    """
+    Aggregate multiple lists of arrays with optional coefficients.
+
+    Args:
+        *arrays_list: Lists of arrays to aggregate.
+        coefficients: Optional coefficients for weighted aggregation.
+
+    Returns:
+        list[np.ndarray]: List of aggregated arrays.
+
+    Note:
+        This function is wrapped as a Parsl app and executed using the default_threads executor.
+    """
+    if coefficients is None:
+        coefficients = np.ones(len(arrays_list))
+    else:
+        assert len(coefficients) == len(arrays_list)
+
+    results = [np.zeros(a.shape) for a in arrays_list[0]]
+    for i, arrays in enumerate(arrays_list):
+        for j, array in enumerate(arrays):
+            results[j] += coefficients[i] * array
+    return results
+
+
+aggregate_multiple = python_app(_aggregate_multiple, executors=["default_threads"])
+
+
+@join_app
+def batch_apply(
+    apply_apps: tuple[Union[PythonApp, Callable]],
+    arg: Union[Dataset, list[Geometry]],
+    batch_size: int,
+    length: int,
+    outputs: list = [],
+    reduce_func: Optional[PythonApp] = None,
+    **app_kwargs,
+) -> AppFuture:
+    """
+    Apply a set of apps to batches of data.
+
+    Args:
+        apply_apps: Tuple of PythonApps or Callables to apply.
+        arg: Dataset or list of Geometries to process.
+        batch_size: Size of each batch.
+        length: Total number of items to process.
+        outputs: List of output files.
+        reduce_func: Optional function to reduce results.
+        **app_kwargs: Additional keyword arguments for the apps.
+
+    Returns:
+        AppFuture: Future representing the result of batch application.
+
+    Note:
+        This function is wrapped as a Parsl join_app.
+    """
+    nbatches = math.ceil(length / batch_size)
+    batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)]
+    future = batch_frames(batch_size, inputs=[arg.extxyz], outputs=batches)
+    output_futures = []
+    for i in range(nbatches):
+        futures = []
+        for app in apply_apps:
+            f = app(
+                None,
+                inputs=[future.outputs[i]],
+                **app_kwargs,
+            )
+            futures.append(f)
+        reduced = reduce_func(*futures)
+        output_futures.append(reduced)
+    future = concatenate_multiple(*output_futures)
+    return future
+
+
+@python_app(executors=["default_threads"])
+def get_length(arg):
+    """
+    Get the length of the input argument.
+
+    Args:
+        arg: Input to get the length of.
+
+    Returns:
+        int: Length of the input.
+
+    Note:
+        This function is wrapped as a Parsl app and executed using the default_threads executor.
+    """
+    if isinstance(arg, list):
+        return len(arg)
+    else:
+        return 1
+
+
+def compute(
+    arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
+    *apply_apps: Union[PythonApp, Callable],
+    outputs_: Union[str, list[str], tuple[str, ...], None] = None,
+    reduce_func: Union[PythonApp, Callable] = aggregate_multiple,
+    batch_size: Optional[int] = None,
+) -> Union[list[AppFuture], AppFuture]:
+    """
+    Compute results by applying apps to the input data.
+
+    Args:
+        arg: Input data to compute on.
+        *apply_apps: Apps to apply to the data.
+        outputs_: Names of output quantities.
+        reduce_func: Function to reduce results.
+        batch_size: Optional batch size for processing.
+
+    Returns:
+        Union[list[AppFuture], AppFuture]: Future(s) representing computation results.
+    """
+    if type(outputs_) is str:
+        outputs_ = [outputs_]
+    if batch_size is not None:
+        if isinstance(arg, Dataset):
+            length = arg.length()
+        else:
+            length = get_length(arg)
+            # convert to Dataset for convenience
+            arg = Dataset(arg)
+        future = batch_apply(
+            apply_apps,
+            arg,
+            batch_size,
+            length,
+            outputs_=outputs_,
+            reduce_func=reduce_func,
+        )
+    else:
+        futures = []
+        if isinstance(arg, Dataset):
+            for app in apply_apps:
+                future = app(
+                    None,
+                    outputs_=outputs_,
+                    inputs=[arg.extxyz],
+                )
+                futures.append(future)
+        else:
+            for app in apply_apps:
+                future = app(
+                    arg,
+                    outputs_=outputs_,
+                    inputs=[],
+                )
+                futures.append(future)
+        future = reduce_func(*futures)
+    if len(outputs_) == 1:
+        return future[0]
+    else:
+        return [future[i] for i in range(len(outputs_))]
+
+
+class Computable:
+    """
+    Base class for computable objects.
+
+    Attributes:
+        outputs (ClassVar[tuple[str, ...]]): Names of output quantities.
+        batch_size (ClassVar[Optional[int]]): Default batch size for computation.
+    """
+
+    outputs: ClassVar[tuple[str, ...]] = ()
+    batch_size: ClassVar[Optional[int]] = None
+
+    def compute(
+        self,
+        arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
+        *outputs: Optional[str],
+        batch_size: Optional[int] = -1,  # if -1: take class default
+    ) -> Union[list[AppFuture], AppFuture]:
+        """
+        Compute results for the given input.
+
+        Args:
+            arg: Input data to compute on.
+            outputs: Names of output quantities.
+            batch_size: Batch size for computation.
+
+        Returns:
+            Union[list[AppFuture], AppFuture]: Future(s) representing computation results.
+        """
+        raise NotImplementedError
+
+
+@typeguard.typechecked
+def _compute_rmse(
+    array0: np.ndarray,
+    array1: np.ndarray,
+    reduce: bool = True,
+) -> Union[float, np.ndarray]:
+    """
+    Compute the Root Mean Square Error (RMSE) between two arrays.
+
+    Args:
+        array0: First array.
+        array1: Second array.
+        reduce: Whether to reduce the result to a single value.
+
+    Returns:
+        Union[float, np.ndarray]: RMSE value(s).
+
+    Note:
+        This function is wrapped as a Parsl app and executed using the default_threads executor.
+    """
+    assert array0.shape == array1.shape
+    assert np.all(np.isnan(array0) == np.isnan(array1))
+
+    se = (array0 - array1) ** 2
+    se = se.reshape(se.shape[0], -1)
+
+    if reduce:  # across both dimensions
+        mask = np.logical_not(np.isnan(se))
+        return float(np.sqrt(np.mean(se[mask])))
+    else:  # retain first dimension
+        if se.ndim == 1:
+            return se
+        else:
+            values = np.empty(len(se))
+            for i in range(len(se)):
+                if np.all(np.isnan(se[i])):
+                    values[i] = np.nan
+                else:
+                    mask = np.logical_not(np.isnan(se[i]))
+                    value = np.sqrt(np.mean(se[i][mask]))
+                    values[i] = value
+            return values
+
+
+compute_rmse = python_app(_compute_rmse, executors=["default_threads"])
+
+
+@typeguard.typechecked
+def _compute_mae(
+    array0,
+    array1,
+    reduce: bool = True,
+) -> Union[float, np.ndarray]:
+    """
+    Compute the Mean Absolute Error (MAE) between two arrays.
+
+    Args:
+        array0: First array.
+        array1: Second array.
+        reduce: Whether to reduce the result to a single value.
+
+    Returns:
+        Union[float, np.ndarray]: MAE value(s).
+
+    Note:
+        This function is wrapped as a Parsl app and executed using the default_threads executor.
+    """
+    assert array0.shape == array1.shape
+    mask0 = np.logical_not(np.isnan(array0))
+    mask1 = np.logical_not(np.isnan(array1))
+    assert np.all(mask0 == mask1)
+    ae = np.abs(array0 - array1)
+    to_reduce = tuple(range(1, array0.ndim))
+    mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce))
+    ae = ae[mask0].reshape(np.sum(1 * mask), -1)
+    if reduce:  # across both dimensions
+        return float(np.sqrt(np.mean(ae)))
+    else:  # retain first dimension
+        return np.sqrt(np.mean(ae, axis=1))
+
+
+compute_mae = python_app(_compute_mae, executors=["default_threads"])
diff --git a/psiflow/data/__init__.py b/psiflow/data/__init__.py
index b91798a..13a9804 100644
--- a/psiflow/data/__init__.py
+++ b/psiflow/data/__init__.py
@@ -1,2 +1 @@
-from .dataset import Computable, Dataset, aggregate_multiple, compute  # noqa: F401
-from .utils import compute_mae, compute_rmse  # noqa: F401
+from .dataset import Dataset
diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index 9df0142..1dafd24 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -1,28 +1,22 @@
-import math
 from pathlib import Path
-from typing import Callable, ClassVar, Optional, Union
+from typing import Optional, Union
 
-import numpy as np
-from parsl.app.app import join_app, python_app
-from parsl.app.python import PythonApp
 from parsl.data_provider.files import File
 from parsl.dataflow.futures import AppFuture, DataFuture
 
 import psiflow
 from psiflow.geometry import Geometry
-from psiflow.utils.apps import copy_data_future, pack
+from psiflow.utils.apps import copy_data_future
 
 from .utils import (
     align_axes,
     app_filter,
     apply_offset,
     assign_identifiers,
-    batch_frames,
     clean_frames,
     count_frames,
     extract_quantities,
     get_elements,
-    get_train_valid_indices,
     insert_quantities,
     join_frames,
     read_frames,
@@ -30,6 +24,7 @@
     shuffle,
     write_frames,
     extract_quantities_per_atom,
+    split_frames,
 )
 
 
@@ -204,7 +199,7 @@ def get_per_atom(
         return tuple(future_dict[q] for q in quantities)
 
     def evaluate(
-        self, computable: Computable, batch_size: Optional[int] = None
+        self, computable: "Computable", batch_size: Optional[int] = None
     ) -> Dataset:
         """
         Evaluate a Computable on the dataset.
@@ -249,23 +244,21 @@ def align_axes(self) -> Dataset:
         future = align_axes(self.extxyz, outputs=[file])
         return Dataset(extxyz=future.outputs[0])
 
-    def split(self, fraction, shuffle=True):  # auto-shuffles
+    def split(self, fraction: float, shuffle: bool = True) -> tuple[Dataset, Dataset]:
         """
         Split the dataset into training and validation sets.
 
         Args:
             fraction: Fraction of data to use for training.
             shuffle: Whether to shuffle before splitting.
-
-        Returns:
-            tuple[Dataset, Dataset]: Training and validation datasets.
         """
-        train, valid = get_train_valid_indices(
-            self.length(),
-            fraction,
-            shuffle,
+        assert 0 <= fraction <= 1
+        file_train = psiflow.context().new_file("data_", ".xyz")
+        file_val = psiflow.context().new_file("data_", ".xyz")
+        future = split_frames(
+            self.extxyz, fraction, shuffle, outputs=[file_train, file_val]
         )
-        return self.__getitem__(train), self.__getitem__(valid)
+        return Dataset(extxyz=future.outputs[0]), Dataset(extxyz=future.outputs[1])
 
     def assign_identifiers(
         self, identifier: Union[int, AppFuture, None] = None
@@ -279,6 +272,7 @@ def assign_identifiers(
         Returns:
             AppFuture: Future representing the next available identifier.
         """
+        # TODO: what is the use case for this?
         result = assign_identifiers(
             identifier,
             inputs=[self.extxyz],
@@ -295,243 +289,3 @@ def load(cls, path_xyz: Union[Path, str]) -> Dataset:
         path_xyz = psiflow.resolve_and_check(Path(path_xyz))
         assert path_xyz.exists()  # needs to be locally accessible
         return cls(extxyz=File(path_xyz))
-
-
-def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]:
-    """
-    Concatenate multiple lists of arrays.
-
-    Args:
-        *args: Lists of numpy arrays to concatenate.
-
-    Returns:
-        list[np.ndarray]: List of concatenated arrays.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-
-    def pad_arrays(
-        arrays: list[np.ndarray],
-        pad_dimension: int = 1,
-    ) -> list[np.ndarray]:
-        ndims = np.array([len(a.shape) for a in arrays])
-        assert np.all(ndims == ndims[0])
-        assert np.all(pad_dimension < ndims)
-
-        pad_size = max([a.shape[pad_dimension] for a in arrays])
-        for i in range(len(arrays)):
-            shape = list(arrays[i].shape)
-            shape[pad_dimension] = pad_size - shape[pad_dimension]
-            padding = np.zeros(tuple(shape)) + np.nan
-            arrays[i] = np.concatenate((arrays[i], padding), axis=pad_dimension)
-        return arrays
-
-    narrays = len(args[0])
-    for arg in args:
-        assert isinstance(arg, list)
-    assert all([len(a) == narrays for a in args])
-
-    concatenated = []
-    for i in range(narrays):
-        arrays = [arg[i] for arg in args]
-        if len(arrays[0].shape) > 1:
-            pad_arrays(arrays)
-        concatenated.append(np.concatenate(tuple(arrays)))
-    return concatenated
-
-
-concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"])
-
-
-def _aggregate_multiple(
-    *arrays_list,
-    coefficients: Optional[np.ndarray] = None,
-) -> list[np.ndarray]:
-    """
-    Aggregate multiple lists of arrays with optional coefficients.
-
-    Args:
-        *arrays_list: Lists of arrays to aggregate.
-        coefficients: Optional coefficients for weighted aggregation.
-
-    Returns:
-        list[np.ndarray]: List of aggregated arrays.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    if coefficients is None:
-        coefficients = np.ones(len(arrays_list))
-    else:
-        assert len(coefficients) == len(arrays_list)
-
-    results = [np.zeros(a.shape) for a in arrays_list[0]]
-    for i, arrays in enumerate(arrays_list):
-        for j, array in enumerate(arrays):
-            results[j] += coefficients[i] * array
-    return results
-
-
-aggregate_multiple = python_app(_aggregate_multiple, executors=["default_threads"])
-
-
-@join_app
-def batch_apply(
-    apply_apps: tuple[Union[PythonApp, Callable]],
-    arg: Union[Dataset, list[Geometry]],
-    batch_size: int,
-    length: int,
-    outputs: list = [],
-    reduce_func: Optional[PythonApp] = None,
-    **app_kwargs,
-) -> AppFuture:
-    """
-    Apply a set of apps to batches of data.
-
-    Args:
-        apply_apps: Tuple of PythonApps or Callables to apply.
-        arg: Dataset or list of Geometries to process.
-        batch_size: Size of each batch.
-        length: Total number of items to process.
-        outputs: List of output files.
-        reduce_func: Optional function to reduce results.
-        **app_kwargs: Additional keyword arguments for the apps.
-
-    Returns:
-        AppFuture: Future representing the result of batch application.
-
-    Note:
-        This function is wrapped as a Parsl join_app.
-    """
-    nbatches = math.ceil(length / batch_size)
-    batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)]
-    future = batch_frames(batch_size, inputs=[arg.extxyz], outputs=batches)
-    output_futures = []
-    for i in range(nbatches):
-        futures = []
-        for app in apply_apps:
-            f = app(
-                None,
-                inputs=[future.outputs[i]],
-                **app_kwargs,
-            )
-            futures.append(f)
-        reduced = reduce_func(*futures)
-        output_futures.append(reduced)
-    future = concatenate_multiple(*output_futures)
-    return future
-
-
-@python_app(executors=["default_threads"])
-def get_length(arg):
-    """
-    Get the length of the input argument.
-
-    Args:
-        arg: Input to get the length of.
-
-    Returns:
-        int: Length of the input.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    if isinstance(arg, list):
-        return len(arg)
-    else:
-        return 1
-
-
-def compute(
-    arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
-    *apply_apps: Union[PythonApp, Callable],
-    outputs_: Union[str, list[str], tuple[str, ...], None] = None,
-    reduce_func: Union[PythonApp, Callable] = aggregate_multiple,
-    batch_size: Optional[int] = None,
-) -> Union[list[AppFuture], AppFuture]:
-    """
-    Compute results by applying apps to the input data.
-
-    Args:
-        arg: Input data to compute on.
-        *apply_apps: Apps to apply to the data.
-        outputs_: Names of output quantities.
-        reduce_func: Function to reduce results.
-        batch_size: Optional batch size for processing.
-
-    Returns:
-        Union[list[AppFuture], AppFuture]: Future(s) representing computation results.
-    """
-    if type(outputs_) is str:
-        outputs_ = [outputs_]
-    if batch_size is not None:
-        if isinstance(arg, Dataset):
-            length = arg.length()
-        else:
-            length = get_length(arg)
-            # convert to Dataset for convenience
-            arg = Dataset(arg)
-        future = batch_apply(
-            apply_apps,
-            arg,
-            batch_size,
-            length,
-            outputs_=outputs_,
-            reduce_func=reduce_func,
-        )
-    else:
-        futures = []
-        if isinstance(arg, Dataset):
-            for app in apply_apps:
-                future = app(
-                    None,
-                    outputs_=outputs_,
-                    inputs=[arg.extxyz],
-                )
-                futures.append(future)
-        else:
-            for app in apply_apps:
-                future = app(
-                    arg,
-                    outputs_=outputs_,
-                    inputs=[],
-                )
-                futures.append(future)
-        future = reduce_func(*futures)
-    if len(outputs_) == 1:
-        return future[0]
-    else:
-        return [future[i] for i in range(len(outputs_))]
-
-
-class Computable:
-    """
-    Base class for computable objects.
-
-    Attributes:
-        outputs (ClassVar[tuple[str, ...]]): Names of output quantities.
-        batch_size (ClassVar[Optional[int]]): Default batch size for computation.
-    """
-
-    outputs: ClassVar[tuple[str, ...]] = ()
-    batch_size: ClassVar[Optional[int]] = None
-
-    def compute(
-        self,
-        arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
-        *outputs: Optional[str],
-        batch_size: Optional[int] = -1,  # if -1: take class default
-    ) -> Union[list[AppFuture], AppFuture]:
-        """
-        Compute results for the given input.
-
-        Args:
-            arg: Input data to compute on.
-            outputs: Names of output quantities.
-            batch_size: Batch size for computation.
-
-        Returns:
-            Union[list[AppFuture], AppFuture]: Future(s) representing computation results.
-        """
-        raise NotImplementedError
diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py
index 3f77308..b9c6b94 100644
--- a/psiflow/data/utils.py
+++ b/psiflow/data/utils.py
@@ -2,18 +2,16 @@
 import shutil
 import subprocess
 from pathlib import Path
-from typing import Optional, Union, Sequence, Generator, TypeAlias
+from typing import Optional, Sequence, Generator, TypeAlias
 
 import numpy as np
 import typeguard
 from ase.data import atomic_numbers, chemical_symbols
 from parsl import python_app, File
-from parsl.dataflow.futures import AppFuture
 
 from psiflow.geometry import Geometry, get_atomic_energy, get_unique_numbers
 
 # TODO: Note: This function is wrapped as a Parsl app and executed using the default_threads executor.
-# TODO: loosen type hints?
 # TODO: inputs/outputs for apps sometimes behave weird
 
 
@@ -33,81 +31,6 @@ def __bool__(self):
 MISSING = MissingType()
 
 
-def iter_read_frames(file: FileLike) -> Generator[list[str]]:
-    """Yields text data to instantiate geometries"""
-    frame_regex = re.compile(r"^\d+$")
-    with open(file, "r") as f:
-        for line in f:
-            if frame_regex.match(_ := line.strip()):
-                n = int(_)
-                yield [line] + [f.readline() for _i in range(n + 1)]
-
-
-def _write_frames(
-    *states: Geometry | list[Geometry], outputs: Sequence[File] = ()
-) -> None:
-    """
-    Write Geometry instances to a file.
-
-    Args:
-        states: Variable number of (lists of) Geometry instances to write.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path.
-    """
-    assert len(outputs) == 1
-    data = []
-    for d in states:
-        if isinstance(d, list):
-            data.extend(d)
-        else:
-            data.append(d)
-    with open(outputs[0], "w") as f:
-        f.write("".join([geom.to_string() for geom in data]))
-
-
-write_frames = python_app(_write_frames, executors=["default_threads"])
-
-
-def _read_frames(
-    file: FileLike, indices: Optional[slice | list[int] | int] = None
-) -> list[Geometry]:
-    """
-    Read Geometry instances from a file.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer.
-
-    """
-    if indices is None:
-        # read everything
-        data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)]
-        return [Geometry.from_string(s) for s in data]
-
-    # figure out what frames to read
-    # TODO: reads twice + indices always wrap modulo nframes which might be unexpected
-    length = _count_frames(file)
-    if isinstance(indices, slice):
-        indices = list(range(length)[indices])
-    elif isinstance(indices, int):
-        indices = [indices]
-
-    if length > 0:
-        indices = [i % length for i in indices]  # for negative indices and wrapping
-    indices_ = set(indices)  # for *much* faster 'i in indices'
-
-    data = {}
-    for i, geom_str_list in enumerate(iter_read_frames(file)):
-        if i in indices_:
-            data[i] = Geometry.from_string("".join(geom_str_list))
-
-    # return in original order
-    return [data[i] for i in indices]
-
-
-read_frames = python_app(_read_frames, executors=["default_threads"])
-
-
 def _extract_quantities(
     states: Sequence[Geometry], quantities: Sequence[str]
 ) -> dict[str, list]:
@@ -198,90 +121,79 @@ def _extract_quantities_per_atom(
 )
 
 
-@typeguard.typechecked
-def _check_distances(state: Geometry, threshold: float) -> Geometry:
-    """
-    Check if all interatomic distances in a Geometry are above a threshold.
+def iter_read_frames(file: FileLike) -> Generator[list[str]]:
+    """Yields text data to instantiate geometries"""
+    frame_regex = re.compile(r"^\d+$")
+    with open(file, "r") as f:
+        for line in f:
+            if frame_regex.match(_ := line.strip()):
+                n = int(_)
+                yield [line] + [f.readline() for _i in range(n + 1)]
 
-    Args:
-        state: Geometry instance to check.
-        threshold: Minimum allowed interatomic distance.
 
-    Returns:
-        Geometry: The input Geometry if all distances are above the threshold, otherwise NullState.
+def _write_frames(
+    *states: Geometry | list[Geometry], outputs: Sequence[File] = ()
+) -> None:
+    """
+    Write Geometry instances to a file.
 
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_htex executor.
+    Args:
+        states: Variable number of (lists of) Geometry instances to write.
+        outputs: List of Parsl futures. The first element should be a DataFuture
+                 representing the output file path.
     """
-    from ase.geometry.geometry import find_mic
-
-    if state == NullState:
-        return NullState
-    nrows = int(len(state) * (len(state) - 1) / 2)
-    deltas = np.zeros((nrows, 3))
-    count = 0
-    for i in range(len(state) - 1):
-        for j in range(i + 1, len(state)):
-            deltas[count] = state.per_atom.positions[i] - state.per_atom.positions[j]
-            count += 1
-    assert count == nrows
-    if state.periodic:
-        deltas, _ = find_mic(deltas, state.cell)
-    check = np.all(np.linalg.norm(deltas, axis=1) > threshold)
-    if check:
-        return state
-    else:
-        return NullState
+    assert len(outputs) == 1
+    data = []
+    for d in states:
+        if isinstance(d, list):
+            data.extend(d)
+        else:
+            data.append(d)
+    with open(outputs[0], "w") as f:
+        f.write("".join([geom.to_string() for geom in data]))
 
 
-check_distances = python_app(_check_distances, executors=["default_htex"])
+write_frames = python_app(_write_frames, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _assign_identifiers(
-    identifier: Optional[int],
-    inputs: list = [],
-    outputs: list = [],
-) -> int:
+def _read_frames(
+    file: FileLike, indices: Optional[slice | list[int] | int] = None
+) -> list[Geometry]:
     """
-    Assign identifiers to Geometry instances in a file.
+    Read Geometry instances from a file.
 
     Args:
-        identifier: Starting identifier value.
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where updated geometries will be written.
-
-    Returns:
-        int: Next available identifier.
+        file: DataFuture representing the input file path containing the geometry data.
+        indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer.
 
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
     """
-    data = _read_frames(slice(None), inputs=[inputs[0]])
-    states = []
-    if identifier is None:  # do not assign but look for max
-        identifier = -1
-        for geometry in data:
-            if geometry.identifier is not None:
-                identifier = max(identifier, geometry.identifier)
-        identifier += 1
-        for geometry in data:  # assign those which were not yet assigned
-            if geometry.identifier is None:
-                geometry, identifier = _assign_identifier(geometry, identifier)
-            states.append(geometry)
-        _write_frames(*states, outputs=[outputs[0]])
-        return identifier
-    else:
-        for geometry in data:
-            geometry, identifier = _assign_identifier(geometry, identifier)
-            states.append(geometry)
-        _write_frames(*states, outputs=[outputs[0]])
-        return identifier
+    if indices is None:
+        # read everything
+        data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)]
+        return [Geometry.from_string(s) for s in data]
 
+    # figure out what frames to read
+    # TODO: reads twice + indices always wrap modulo nframes which might be unexpected
+    length = _count_frames(file)
+    if isinstance(indices, slice):
+        indices = list(range(length)[indices])
+    elif isinstance(indices, int):
+        indices = [indices]
 
-assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"])
+    if length > 0:
+        indices = [i % length for i in indices]  # for negative indices and wrapping
+    indices_ = set(indices)  # for *much* faster 'i in indices'
+
+    data = {}
+    for i, geom_str_list in enumerate(iter_read_frames(file)):
+        if i in indices_:
+            data[i] = Geometry.from_string("".join(geom_str_list))
+
+    # return in original order
+    return [data[i] for i in indices]
+
+
+read_frames = python_app(_read_frames, executors=["default_threads"])
 
 
 def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None:
@@ -477,186 +389,33 @@ def _shuffle(
 shuffle = python_app(_shuffle, executors=["default_threads"])
 
 
-@typeguard.typechecked
-def _train_valid_indices(
-    effective_nstates: int,
-    train_valid_split: float,
-    shuffle: bool,
-) -> tuple[list[int], list[int]]:
-    """
-    Generate indices for train and validation splits.
-
-    Args:
-        effective_nstates: Total number of states.
-        train_valid_split: Fraction of states to use for training.
-        shuffle: Whether to shuffle the indices.
-
-    Returns:
-        tuple[list[int], list[int]]: Lists of indices for training and validation sets.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    ntrain = int(np.floor(effective_nstates * train_valid_split))
-    nvalid = effective_nstates - ntrain
-    assert ntrain > 0
-    assert nvalid > 0
-    order = np.arange(ntrain + nvalid, dtype=int)
+def _split_frames(
+    file: FileLike, fraction: float, shuffle: bool, outputs: Sequence[File] = ()
+) -> None:
+    """Split into training and validation sets"""
+    assert len(outputs) == 2
+    assert 0 <= fraction <= 1
+    frames = _read_frames(file)
+    order = np.arange(len(frames))
     if shuffle:
         np.random.shuffle(order)
-    train_list = list(order[:ntrain])
-    valid_list = list(order[ntrain : (ntrain + nvalid)])
-    return [int(i) for i in train_list], [int(i) for i in valid_list]
-
-
-train_valid_indices = python_app(_train_valid_indices, executors=["default_threads"])
-
-
-@typeguard.typechecked
-def get_train_valid_indices(
-    effective_nstates: AppFuture,
-    train_valid_split: float,
-    shuffle: bool,
-) -> tuple[AppFuture, AppFuture]:
-    """
-    Get futures for train and validation indices.
-
-    Args:
-        effective_nstates: Future representing the total number of states.
-        train_valid_split: Fraction of states to use for training.
-        shuffle: Whether to shuffle the indices.
-
-    Returns:
-        tuple[AppFuture, AppFuture]: Futures for training and validation indices.
-    """
-    future = train_valid_indices(effective_nstates, train_valid_split, shuffle)
-    return future[0], future[1]
-
-
-# @typeguard.typechecked
-# def get_index_element_mask(
-#     numbers: np.ndarray,
-#     atom_indices: Optional[list[int]],
-#     elements: Optional[list[str]],
-#     natoms_padded: Optional[int] = None,
-# ) -> np.ndarray:
-#     """
-#     Generate a mask for atom indices and elements.
-#
-#     Args:
-#         numbers: Array of atomic numbers.
-#         atom_indices: List of atom indices to include.
-#         elements: List of element symbols to include.
-#         natoms_padded: Total number of atoms including padding.
-#
-#     Returns:
-#         np.ndarray: Boolean mask array.
-#     """
-#     mask = np.array([True] * len(numbers))
-#
-#     if elements is not None:
-#         numbers_to_include = [atomic_numbers[e] for e in elements]
-#         mask_elements = np.array([False] * len(numbers))
-#         for number in numbers_to_include:
-#             mask_elements = np.logical_or(mask_elements, (numbers == number))
-#         mask = np.logical_and(mask, mask_elements)
-#
-#     if natoms_padded is not None:
-#         assert natoms_padded >= len(numbers)
-#         padding = natoms_padded - len(numbers)
-#         mask = np.concatenate((mask, np.array([False] * padding)), axis=0).astype(bool)
-#
-#     if atom_indices is not None:  # below padding
-#         mask_indices = np.array([False] * len(mask))
-#         mask_indices[np.array(atom_indices)] = True
-#         mask = np.logical_and(mask, mask_indices)
-#
-#     return mask
-
-# TODO: these do not belong here
-
-@typeguard.typechecked
-def _compute_rmse(
-    array0: np.ndarray,
-    array1: np.ndarray,
-    reduce: bool = True,
-) -> Union[float, np.ndarray]:
-    """
-    Compute the Root Mean Square Error (RMSE) between two arrays.
-
-    Args:
-        array0: First array.
-        array1: Second array.
-        reduce: Whether to reduce the result to a single value.
-
-    Returns:
-        Union[float, np.ndarray]: RMSE value(s).
 
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    assert array0.shape == array1.shape
-    assert np.all(np.isnan(array0) == np.isnan(array1))
-
-    se = (array0 - array1) ** 2
-    se = se.reshape(se.shape[0], -1)
-
-    if reduce:  # across both dimensions
-        mask = np.logical_not(np.isnan(se))
-        return float(np.sqrt(np.mean(se[mask])))
-    else:  # retain first dimension
-        if se.ndim == 1:
-            return se
-        else:
-            values = np.empty(len(se))
-            for i in range(len(se)):
-                if np.all(np.isnan(se[i])):
-                    values[i] = np.nan
-                else:
-                    mask = np.logical_not(np.isnan(se[i]))
-                    value = np.sqrt(np.mean(se[i][mask]))
-                    values[i] = value
-            return values
+    n_train = int(len(frames) * fraction)
+    train_ids = order[:n_train]
+    val_ids = order[n_train:]
 
+    train = [frames[i] for i in train_ids]
+    val = [frames[i] for i in val_ids]
+    _write_frames(train, outputs=outputs[:1])
+    _write_frames(val, outputs=outputs[1:])
 
-compute_rmse = python_app(_compute_rmse, executors=["default_threads"])
 
+split_frames = python_app(_split_frames, executors=["default_threads"])
 
-@typeguard.typechecked
-def _compute_mae(
-    array0,
-    array1,
-    reduce: bool = True,
-) -> Union[float, np.ndarray]:
-    """
-    Compute the Mean Absolute Error (MAE) between two arrays.
 
-    Args:
-        array0: First array.
-        array1: Second array.
-        reduce: Whether to reduce the result to a single value.
-
-    Returns:
-        Union[float, np.ndarray]: MAE value(s).
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    assert array0.shape == array1.shape
-    mask0 = np.logical_not(np.isnan(array0))
-    mask1 = np.logical_not(np.isnan(array1))
-    assert np.all(mask0 == mask1)
-    ae = np.abs(array0 - array1)
-    to_reduce = tuple(range(1, array0.ndim))
-    mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce))
-    ae = ae[mask0].reshape(np.sum(1 * mask), -1)
-    if reduce:  # across both dimensions
-        return float(np.sqrt(np.mean(ae)))
-    else:  # retain first dimension
-        return np.sqrt(np.mean(ae, axis=1))
-
-
-compute_mae = python_app(_compute_mae, executors=["default_threads"])
+# TODO
+# TODO: think about where the parts below should live
+# TODO
 
 
 @typeguard.typechecked
@@ -710,3 +469,78 @@ def _batch_frames(
 
 
 batch_frames = python_app(_batch_frames, executors=["default_threads"])
+
+
+# def _assign_identifier(
+#     state: Geometry,
+#     identifier: int,
+#     discard: bool = False,
+# ) -> tuple[Geometry, int]:
+#     """
+#     Assign an identifier to a Geometry instance.
+#
+#     Args:
+#         state (Geometry): Input Geometry instance.
+#         identifier (int): Identifier to assign.
+#         discard (bool, optional): Whether to discard the state. Defaults to False.
+#
+#     Returns:
+#         tuple[Geometry, int]: Updated Geometry and next available identifier.
+#     """
+#     if (state == NullState) or discard:
+#         return state, identifier
+#     else:
+#         assert state.identifier is None
+#         state.identifier = identifier
+#         return state, identifier + 1
+#
+#
+# assign_identifier = python_app(_assign_identifier, executors=["default_threads"])
+#
+
+
+@typeguard.typechecked
+def _assign_identifiers(
+    identifier: Optional[int],
+    inputs: list = [],
+    outputs: list = [],
+) -> int:
+    """
+    Assign identifiers to Geometry instances in a file.
+
+    Args:
+        identifier: Starting identifier value.
+        inputs: List of Parsl futures. The first element should be a DataFuture
+                representing the input file path containing geometry data.
+        outputs: List of Parsl futures. The first element should be a DataFuture
+                 representing the output file path where updated geometries will be written.
+
+    Returns:
+        int: Next available identifier.
+
+    Note:
+        This function is wrapped as a Parsl app and executed using the default_threads executor.
+    """
+    data = _read_frames(slice(None), inputs=[inputs[0]])
+    states = []
+    if identifier is None:  # do not assign but look for max
+        identifier = -1
+        for geometry in data:
+            if geometry.identifier is not None:
+                identifier = max(identifier, geometry.identifier)
+        identifier += 1
+        for geometry in data:  # assign those which were not yet assigned
+            if geometry.identifier is None:
+                geometry, identifier = _assign_identifier(geometry, identifier)
+            states.append(geometry)
+        _write_frames(*states, outputs=[outputs[0]])
+        return identifier
+    else:
+        for geometry in data:
+            geometry, identifier = _assign_identifier(geometry, identifier)
+            states.append(geometry)
+        _write_frames(*states, outputs=[outputs[0]])
+        return identifier
+
+
+assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"])
diff --git a/psiflow/geometry.py b/psiflow/geometry.py
index a2fa7e5..abac90e 100644
--- a/psiflow/geometry.py
+++ b/psiflow/geometry.py
@@ -11,7 +11,6 @@
 from ase import Atoms
 from ase.data import atomic_masses, chemical_symbols, atomic_numbers
 from ase.io.extxyz import key_val_dict_to_str, read_xyz, key_val_str_to_dict
-from parsl.app.app import python_app
 
 import psiflow
 
@@ -19,7 +18,7 @@
 # TODO: custom attributes
 
 
-class Container:
+class PerAtom:
     """
     Holds 'per atom' arrays like positions and forces
     For every attribute, the first array dimension should be n_atoms
@@ -52,7 +51,7 @@ def __len__(self) -> int:
         return self.numbers.shape[0]
 
     def __str__(self) -> str:
-        return f"Container[{len(self)}](keys={set(vars(self))})"
+        return f"PerAtom[{len(self)}](keys={set(vars(self))})"
 
     def _check(self, arr: npt.NDArray) -> bool:
         try:
@@ -140,21 +139,20 @@ class Geometry:
     and various physical properties such as energy and forces.
 
     Attributes:
-        per_atom (Container): Record array containing per-atom properties.
+        per_atom (PerAtom): Record array containing per-atom properties.
         cell (np.ndarray): 3x3 array representing the unit cell vectors.
         energy (Optional[float]): Total energy of the system.
         stress (Optional[np.ndarray]): Stress tensor of the system.
     """
 
-    per_atom: Container
+    per_atom: PerAtom
     cell: npt.NDArray
     energy: Optional[float]
     stress: Optional[np.ndarray]
-    meta: SimpleNamespace
 
     def __init__(
         self,
-        per_atom: Container,
+        per_atom: PerAtom,
         energy: Optional[float] = None,
         cell: Optional[np.ndarray] = None,
         stress: Optional[np.ndarray] = None,
@@ -279,7 +277,7 @@ def from_string(cls, s: str) -> Geometry:
             data["cell"] = data.pop("Lattice").T  # ase does Fortran ordering
         else:
             data["cell"] = None
-        per_atom = Container.from_string(body, data.pop("Properties", None))
+        per_atom = PerAtom.from_string(body, data.pop("Properties", None))
 
         assert len(per_atom) == int(n_atoms)
         return Geometry(per_atom, **data)
@@ -300,7 +298,7 @@ def from_data(
         """
         Create a Geometry instance from atomic numbers, positions, and cell data.
         """
-        per_atom = Container(numbers.copy(), positions.copy())
+        per_atom = PerAtom(numbers.copy(), positions.copy())
         cell = cell if cell is None else cell.copy()
         return Geometry(per_atom, cell=cell)
 
@@ -309,7 +307,7 @@ def from_atoms(cls, atoms: Atoms) -> Geometry:
         """
         Create a Geometry instance from an ASE Atoms object.
         """
-        per_atom = Container(**atoms.arrays)
+        per_atom = PerAtom(**atoms.arrays)
         data = atoms.info
         if all(atoms.pbc):
             data["cell"] = atoms.cell.array
@@ -362,20 +360,6 @@ def numbers(self) -> npt.NDArray:
         return self.per_atom.numbers.flatten()
 
 
-# def new_nullstate():
-#     """
-#     Create a new null state Geometry.
-#
-#     Returns:
-#         Geometry: A Geometry instance representing a null state.
-#     """
-#     return Geometry.from_data(np.zeros(1), np.zeros((1, 3)), None)
-
-
-# use universal dummy state
-# NullState = new_nullstate()
-
-
 def is_lower_triangular(cell: np.ndarray) -> bool:
     """
     Check if a cell matrix is lower triangular.
@@ -537,94 +521,3 @@ def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) ->
         except KeyError:
             raise KeyError(f"No atomic energy value for symbol '{symbol}'..")
     return float(total)
-
-
-# def create_outputs(quantities: list[str], data: list[Geometry]) -> list[np.ndarray]:
-#     """
-#     Create output arrays for specified quantities from a list of Geometry instances.
-#
-#     Args:
-#         quantities (list[str]): List of quantity names to extract.
-#         data (list[Geometry]): List of Geometry instances.
-#
-#     Returns:
-#         list[np.ndarray]: List of arrays containing the requested quantities.
-#     """
-#     order_names = list(set([k for g in data for k in g.order]))
-#     assert all([q in QUANTITIES + order_names for q in quantities])
-#     natoms = np.array([len(geometry) for geometry in data], dtype=int)
-#     max_natoms = np.max(natoms)
-#     nframes = len(data)
-#     nprob = 0
-#     max_phase = 0
-#     for state in data:
-#         if state.logprob is not None:
-#             nprob = max(len(state.logprob), nprob)
-#         if state.phase is not None:
-#             max_phase = max(len(state.phase), max_phase)
-#
-#     arrays = []
-#     for quantity in quantities:
-#         if quantity in ["positions", "forces"]:
-#             array = np.empty((nframes, max_natoms, 3), dtype=np.float64)
-#             array[:] = np.nan
-#         elif quantity in ["cell", "stress"]:
-#             array = np.empty((nframes, 3, 3), dtype=np.float64)
-#             array[:] = np.nan
-#         elif quantity in ["numbers"]:
-#             array = np.empty((nframes, max_natoms), dtype=np.uint8)
-#             array[:] = 0
-#         elif quantity in ["energy", "delta", "per_atom_energy"]:
-#             array = np.empty((nframes,), dtype=np.float64)
-#             array[:] = np.nan
-#         elif quantity in ["phase"]:
-#             array = np.empty((nframes,), dtype=(np.unicode_, max_phase))
-#             array[:] = ""
-#         elif quantity in ["logprob"]:
-#             array = np.empty((nframes, nprob), dtype=np.float64)
-#             array[:] = np.nan
-#         elif quantity in ["identifier"]:
-#             array = np.empty((nframes,), dtype=np.int64)
-#             array[:] = -1
-#         elif quantity in order_names:
-#             array = np.empty((nframes,), dtype=np.float64)
-#             array[:] = np.nan
-#         else:
-#             raise AssertionError("missing quantity in if/else")
-#         arrays.append(array)
-#     return arrays
-
-
-# def _assign_identifier(
-#     state: Geometry,
-#     identifier: int,
-#     discard: bool = False,
-# ) -> tuple[Geometry, int]:
-#     """
-#     Assign an identifier to a Geometry instance.
-#
-#     Args:
-#         state (Geometry): Input Geometry instance.
-#         identifier (int): Identifier to assign.
-#         discard (bool, optional): Whether to discard the state. Defaults to False.
-#
-#     Returns:
-#         tuple[Geometry, int]: Updated Geometry and next available identifier.
-#     """
-#     if (state == NullState) or discard:
-#         return state, identifier
-#     else:
-#         assert state.identifier is None
-#         state.identifier = identifier
-#         return state, identifier + 1
-
-
-# assign_identifier = python_app(_assign_identifier, executors=["default_threads"])
-
-
-def _check_equality(state0: Geometry, state1: Geometry) -> bool:
-    """    Check if two Geometry instances are equal"""
-    return state0 == state1
-
-
-check_equality = python_app(_check_equality, executors=["default_threads"])

From 4978a83367e212dc86fffed05ce316d1881b7796 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Fri, 10 Apr 2026 11:09:22 +0200
Subject: [PATCH 05/16] more restructuring

separated functions/apps for geometries from functions/apps for datasets
---
 psiflow/compute.py      |  12 +-
 psiflow/data/dataset.py | 108 ++++----
 psiflow/data/file.py    | 273 +++++++++++++++++++++
 psiflow/data/utils.py   | 529 +++++-----------------------------------
 4 files changed, 399 insertions(+), 523 deletions(-)
 create mode 100644 psiflow/data/file.py

diff --git a/psiflow/compute.py b/psiflow/compute.py
index 110ab33..22461ef 100644
--- a/psiflow/compute.py
+++ b/psiflow/compute.py
@@ -3,15 +3,13 @@
 
 import numpy as np
 import typeguard
-from parsl import python_app
 from parsl.app.app import join_app, python_app
-from parsl.app.python import PythonApp
 from parsl.dataflow.futures import AppFuture, DataFuture
 
 import psiflow
 from psiflow.geometry import Geometry
 from psiflow.data import Dataset
-from psiflow.data.utils import batch_frames
+from psiflow.data.file import batch_frames
 
 
 def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]:
@@ -95,12 +93,12 @@ def _aggregate_multiple(
 
 @join_app
 def batch_apply(
-    apply_apps: tuple[Union[PythonApp, Callable]],
+    apply_apps: tuple[Union[python_app, Callable]],
     arg: Union[Dataset, list[Geometry]],
     batch_size: int,
     length: int,
     outputs: list = [],
-    reduce_func: Optional[PythonApp] = None,
+    reduce_func: Optional[python_app] = None,
     **app_kwargs,
 ) -> AppFuture:
     """
@@ -162,9 +160,9 @@ def get_length(arg):
 
 def compute(
     arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
-    *apply_apps: Union[PythonApp, Callable],
+    *apply_apps: Union[python_app, Callable],
     outputs_: Union[str, list[str], tuple[str, ...], None] = None,
-    reduce_func: Union[PythonApp, Callable] = aggregate_multiple,
+    reduce_func: Union[python_app, Callable] = aggregate_multiple,
     batch_size: Optional[int] = None,
 ) -> Union[list[AppFuture], AppFuture]:
     """
diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index 1dafd24..df79073 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -8,26 +8,28 @@
 from psiflow.geometry import Geometry
 from psiflow.utils.apps import copy_data_future
 
-from .utils import (
-    align_axes,
-    app_filter,
-    apply_offset,
-    assign_identifiers,
-    clean_frames,
+from psiflow.data.file import (
     count_frames,
-    extract_quantities,
     get_elements,
-    insert_quantities,
     join_frames,
     read_frames,
-    reset_frames,
-    shuffle,
     write_frames,
-    extract_quantities_per_atom,
     split_frames,
+    shuffle_frames,
+    apply_offset,
+    reset_frames,
+    clean_frames,
+    align_axes,
+    filter_frames,
+    extract_quantities,
+    extract_quantities_per_atom,
+    assign_identifiers,
 )
 
 
+# TODO: do all operations need to generate a new file?
+
+
 @psiflow.register_serializable
 class Dataset:
     """
@@ -44,7 +46,7 @@ def __init__(
         extxyz: Optional[psiflow._DataFuture] = None,
     ):
         """
-        Initialize a Dataset.
+        Initialize a Dataset. Provide either states or extxyz.
 
         Args:
             states: List of Geometry instances or AppFutures representing geometries.
@@ -72,7 +74,7 @@ def shuffle(self) -> "Dataset":
         Shuffle the order of structures in the dataset.
         """
         file = psiflow.context().new_file("data_", ".xyz")
-        extxyz = shuffle(self.extxyz, outputs=[file]).outputs[0]
+        extxyz = shuffle_frames(self.extxyz, outputs=[file]).outputs[0]
         return Dataset(extxyz=extxyz)
 
     def __getitem__(
@@ -174,8 +176,7 @@ def get(self, *quantities: str) -> tuple[AppFuture, ...]:
         """
         Extract specified quantities from the dataset.
         """
-        future_states = read_frames(self.extxyz)
-        future_dict = extract_quantities(future_states, quantities)
+        future_dict = extract_quantities(self.extxyz, quantities)
         return tuple(future_dict[q] for q in quantities)
 
     def get_per_atom(
@@ -192,48 +193,48 @@ def get_per_atom(
             atom_indices: Optional list of atom indices to consider.
             elements: Optional list of element symbols to consider.
         """
-        future_states = read_frames(self.extxyz)
         future_dict = extract_quantities_per_atom(
-            future_states, quantities, atom_indices, elements
+            self.extxyz, quantities, atom_indices, elements
         )
         return tuple(future_dict[q] for q in quantities)
 
-    def evaluate(
-        self, computable: "Computable", batch_size: Optional[int] = None
-    ) -> Dataset:
-        """
-        Evaluate a Computable on the dataset.
-        """
-        # TODO: remove this functionality?
-        #  or this should be the (only) way to label a dataset?
-        from psiflow.hamiltonians import Hamiltonian
-
-        if not isinstance(computable, Hamiltonian):
-            # avoid extracting and inserting the same quantities
-            return computable.compute_dataset(self)
-
-        # use Hamiltonian.compute method
-        if batch_size is not None:
-            outputs = computable.compute(self, batch_size=batch_size)
-        else:
-            outputs = computable.compute(self)  # use default from computable
-        if not isinstance(outputs, list):  # compute unpacks for only one property
-            outputs = [outputs]
-        data = {k: v for k, v in zip(computable.outputs, outputs)}
-
-        file = psiflow.context().new_file("data_", ".xyz")
-
-        future = read_frames(self.extxyz)
-        future = insert_quantities(future, data)
-        extxyz = write_frames(future, outputs=[file]).outputs[0]
-        return Dataset(extxyz=extxyz)
+    # def evaluate(
+    #     self, computable: "Computable", batch_size: Optional[int] = None
+    # ) -> Dataset:
+    #     """
+    #     Evaluate a Computable on the dataset.
+    #     """
+    #     # TODO: remove this functionality?
+    #     #  or this should be the (only) way to label a dataset?
+    #     from psiflow.hamiltonians import Hamiltonian
+    #
+    #     if not isinstance(computable, Hamiltonian):
+    #         # avoid extracting and inserting the same quantities
+    #         return computable.compute_dataset(self)
+    #
+    #     # use Hamiltonian.compute method
+    #     if batch_size is not None:
+    #         outputs = computable.compute(self, batch_size=batch_size)
+    #     else:
+    #         outputs = computable.compute(self)  # use default from computable
+    #     if not isinstance(outputs, list):  # compute unpacks for only one property
+    #         outputs = [outputs]
+    #     data = {k: v for k, v in zip(computable.outputs, outputs)}
+    #
+    #     file = psiflow.context().new_file("data_", ".xyz")
+    #
+    #     future = read_frames(self.extxyz)
+    #     future = insert_quantities(future, data)
+    #     extxyz = write_frames(future, outputs=[file]).outputs[0]
+    #     return Dataset(extxyz=extxyz)
 
     def filter(self, quantity: str) -> Dataset:
         """
         Filter the dataset based on a specified quantity.
         """
+        # TODO: where is this used?
         file = psiflow.context().new_file("data_", ".xyz")
-        future = app_filter(self.extxyz, quantity, outputs=[file])
+        future = filter_frames(self.extxyz, quantity, outputs=[file])
         return Dataset(extxyz=future.outputs[0])
 
     def align_axes(self) -> Dataset:
@@ -273,13 +274,12 @@ def assign_identifiers(
             AppFuture: Future representing the next available identifier.
         """
         # TODO: what is the use case for this?
-        result = assign_identifiers(
-            identifier,
-            inputs=[self.extxyz],
-            outputs=[psiflow.context().new_file("data_", ".xyz")],
-        )
-        self.extxyz = result.outputs[0]
-        return result
+        # TODO: this is the only method that changes inplace instead of returning a new Dataset
+        future = assign_identifiers(self.extxyz, identifier)
+        future_states, future_id = future[0], future[1]
+        file = psiflow.context().new_file("data_", ".xyz")
+        self.extxyz = write_frames(future_states, outputs=[file]).outputs[0]
+        return future_id
 
     @classmethod
     def load(cls, path_xyz: Union[Path, str]) -> Dataset:
diff --git a/psiflow/data/file.py b/psiflow/data/file.py
new file mode 100644
index 0000000..b2c3902
--- /dev/null
+++ b/psiflow/data/file.py
@@ -0,0 +1,273 @@
+import re
+import shutil
+import subprocess
+from pathlib import Path
+from functools import partial
+from typing import Optional, Generator, TypeAlias, Any, Callable
+from collections.abc import Sequence
+
+import numpy as np
+from ase.data import chemical_symbols
+from parsl import python_app, File
+
+from psiflow.geometry import Geometry
+from psiflow.data.utils import (
+    get_unique_numbers,
+    apply_energy_offset,
+    filter_quantity,
+    extract,
+    extract_per_atom,
+    assign_ids,
+)
+
+# TODO: some operations have side effects, which could be unexpected
+
+
+FileLike: TypeAlias = str | Path | File
+
+
+def iter_read_frames(file: FileLike) -> Generator[list[str]]:
+    """Yields text data to instantiate geometries"""
+    frame_regex = re.compile(r"^\d+$")
+    with open(file, "r") as f:
+        for line in f:
+            if frame_regex.match(_ := line.strip()):
+                n = int(_)
+                yield [line] + [f.readline() for _i in range(n + 1)]
+
+
+def _write_frames(
+    *states: Geometry | list[Geometry], outputs: Sequence[File] = ()
+) -> None:
+    """
+    Write Geometry instances to a file.
+
+    Args:
+        states: Variable number of (lists of) Geometry instances to write.
+        outputs: List of Parsl futures. The first element should be a DataFuture
+                 representing the output file path.
+    """
+    assert len(outputs) == 1
+    data = []
+    for d in states:
+        if isinstance(d, list):
+            data.extend(d)
+        else:
+            data.append(d)
+    with open(outputs[0], "w") as f:
+        f.write("".join([geom.to_string() for geom in data]))
+
+
+def _read_frames(
+    file: FileLike, indices: Optional[slice | list[int] | int] = None
+) -> list[Geometry]:
+    """
+    Read Geometry instances from a file.
+
+    Args:
+        file: DataFuture representing the input file path containing the geometry data.
+        indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer.
+
+    """
+    if indices is None:
+        # read everything
+        data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)]
+        return [Geometry.from_string(s) for s in data]
+
+    # figure out what frames to read
+    # TODO: reads twice + indices always wrap modulo nframes which might be unexpected
+    length = _count_frames(file)
+    if isinstance(indices, slice):
+        indices = list(range(length)[indices])
+    elif isinstance(indices, int):
+        indices = [indices]
+
+    if length > 0:
+        indices = [i % length for i in indices]  # for negative indices and wrapping
+    indices_ = set(indices)  # for *much* faster 'i in indices'
+
+    data = {}
+    for i, geom_str_list in enumerate(iter_read_frames(file)):
+        if i in indices_:
+            data[i] = Geometry.from_string("".join(geom_str_list))
+
+    # return in original order
+    return [data[i] for i in indices]
+
+
+def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None:
+    """
+    Join multiple frame files into a single file.
+
+    Args:
+        inputs: List of Parsl futures. Each element should be a DataFuture
+                representing an input file path containing geometry data.
+        outputs: List of Parsl futures. The first element should be a DataFuture
+                 representing the output file path where joined frames will be written.
+    """
+    assert len(outputs) == 1
+
+    with open(outputs[0], "wb") as destination:
+        for input_file in inputs:
+            with open(input_file, "rb") as source:
+                shutil.copyfileobj(source, destination)
+
+
+def _count_frames(file: FileLike) -> int:
+    """Count the number of frames in a file."""
+    path = Path(file)
+    threshold = 10 * 1024 * 1024  # 10 MB in bytes
+    file_size = path.stat().st_size
+
+    if file_size > threshold:  # use grep
+        cmd = f"grep -Fc Properties {path}"
+        result = subprocess.check_output(cmd.split())
+        return int(result.strip())
+    return len([_ for _ in iter_read_frames(path)])
+
+
+def _get_elements(*files: File) -> set[str]:
+    """
+    Get the set of elements present in all frames of a sequence of file.
+    """
+    frames = [geom for file in files for geom in _read_frames(file)]
+    return {chemical_symbols[i] for i in get_unique_numbers(frames)}
+
+
+def _split_frames(
+    file: FileLike, fraction: float, shuffle: bool, outputs: Sequence[File] = ()
+) -> None:
+    """Split into training and validation sets"""
+    assert len(outputs) == 2
+    assert 0 <= fraction <= 1
+    frames = _read_frames(file)
+    order = np.arange(len(frames))
+    if shuffle:
+        np.random.shuffle(order)
+
+    n_train = int(len(frames) * fraction)
+    train_ids = order[:n_train]
+    val_ids = order[n_train:]
+
+    train = [frames[i] for i in train_ids]
+    val = [frames[i] for i in val_ids]
+    _write_frames(train, outputs=outputs[:1])
+    _write_frames(val, outputs=outputs[1:])
+
+
+def _batch_frames(
+    file: FileLike, batch_size: int, outputs: Sequence[File] = ()
+) -> list[list[Geometry]]:
+    """
+    Split frames into batches.
+
+    Args:
+        file: DataFuture representing the input file path containing the geometry data.
+        batch_size: Number of frames per batch.
+        outputs: List of Parsl futures. Each element should be a DataFuture
+                 representing an output file path for each batch.
+    """
+    # TODO: why pass batch size?
+    # TODO: will not be great for large files
+    frames = _read_frames(file)
+
+    batches, batch = [], []
+    for i, geom in enumerate(frames):
+        batch.append(geom)
+        if (i + 1) % batch_size == 0:
+            batches.append(batch)
+            batch = []
+
+    if not outputs:
+        return batches
+
+    assert len(outputs) == len(batches)
+    for file, batch in zip(outputs, batches):
+        _write_frames(batch, outputs=[file])
+    return batches
+
+
+write_frames = python_app(_write_frames, executors=["default_threads"])
+read_frames = python_app(_read_frames, executors=["default_threads"])
+join_frames = python_app(_join_frames, executors=["default_threads"])
+count_frames = python_app(_count_frames, executors=["default_threads"])
+get_elements = python_app(_get_elements, executors=["default_threads"])
+split_frames = python_app(_split_frames, executors=["default_threads"])
+batch_frames = python_app(_batch_frames, executors=["default_threads"])
+
+
+def _read_write_wrapper(
+    execute: Callable,  # Parsl throws if this is named 'func'
+    file: FileLike,
+    *args: Any,
+    outputs: Sequence[File] = (),
+    **kwargs: Any,
+) -> None:
+    """
+    Wrapper function to make simple Geometry functions work for Dataset instances.
+    Essentially does (i) read (ii) modify (iii) write.
+
+    Args:
+        file: DataFuture representing the input file path containing the geometry data.
+        execute: Function that will get executed for the geometries in file
+        outputs: List of Parsl futures. The first element should be a DataFuture
+                 representing the output file path where reset frames will be written.
+    """
+    assert len(outputs) == 1
+    frames_in = _read_frames(file)
+    frames_out = execute(frames_in, *args, **kwargs)
+    if frames_out is None:  # execute can alter in-place
+        frames_out = frames_in
+    _write_frames(frames_out, outputs=outputs)
+
+
+def _reset_frames(states: Sequence[Geometry]) -> None:
+    for geom in states:
+        geom.reset()
+
+
+def _clean_frames(states: Sequence[Geometry]) -> None:
+    for geom in states:
+        geom.clean()
+
+
+def _align_axes(states: Sequence[Geometry]) -> None:
+    for geom in states:
+        geom.align_axes()
+
+
+def _shuffle_frames(states: Sequence[Geometry]) -> None:
+    np.random.shuffle(states)
+
+
+read_write_app = python_app(_read_write_wrapper, executors=["default_threads"])
+reset_frames = partial(read_write_app, _reset_frames)
+clean_frames = partial(read_write_app, _clean_frames)
+align_axes = partial(read_write_app, _align_axes)
+shuffle_frames = partial(read_write_app, _shuffle_frames)
+
+apply_offset = partial(read_write_app, apply_energy_offset)
+filter_frames = partial(read_write_app, filter_quantity)
+
+
+def _read_wrapper(
+    execute: Callable,  # Parsl throws if this is named 'func'
+    file: FileLike,
+    *args: Any,
+    **kwargs: Any,
+) -> Any:
+    """
+    Wrapper function to make simple Geometry functions work for Dataset instances.
+    Essentially does (i) read (ii) modify (iii) return.
+
+    Args:
+        file: DataFuture representing the input file path containing the geometry data.
+        execute: Function that will get executed for the geometries in file
+    """
+    frames_in = _read_frames(file)
+    return execute(frames_in, *args, **kwargs)
+
+read_app = python_app(_read_wrapper, executors=["default_threads"])
+extract_quantities = partial(read_app, extract)
+extract_quantities_per_atom = partial(read_app, extract_per_atom)
+assign_identifiers = partial(read_app, assign_ids)
diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py
index b9c6b94..8cfa5a1 100644
--- a/psiflow/data/utils.py
+++ b/psiflow/data/utils.py
@@ -1,87 +1,68 @@
-import re
-import shutil
-import subprocess
-from pathlib import Path
-from typing import Optional, Sequence, Generator, TypeAlias
+from typing import Optional
+from collections.abc import Sequence
 
 import numpy as np
-import typeguard
-from ase.data import atomic_numbers, chemical_symbols
-from parsl import python_app, File
-
-from psiflow.geometry import Geometry, get_atomic_energy, get_unique_numbers
-
-# TODO: Note: This function is wrapped as a Parsl app and executed using the default_threads executor.
-# TODO: inputs/outputs for apps sometimes behave weird
-
-
-FileLike: TypeAlias = str | Path | File
-
-
-class MissingType:
-    """Placeholder sentinel for missing data fields"""
-
-    def __repr__(self):
-        return "<MISSING>"
-
-    def __bool__(self):
-        return False
+from ase.data import atomic_numbers
+
+from psiflow.geometry import (
+    Geometry,
+    get_atomic_energy,
+    DEFAULT_PROPERTIES,
+    PER_ATOM_FIELDS,
+    MISSING,
+)
 
 
-MISSING = MissingType()
+# TODO: some of these have side effects..
 
 
-def _extract_quantities(
-    states: Sequence[Geometry], quantities: Sequence[str]
-) -> dict[str, list]:
+def extract(states: Sequence[Geometry], quantities: Sequence[str]) -> dict[str, list]:
     """
     Extract specified quantities from Geometry instances.
     """
-    data = {k: [] for k in quantities}
-    for k in quantities:
-        if k in ("cell", "energy", "stress"):
-            for geom in states:
-                value = getattr(geom, k, MISSING)
-                data[k].append(value)
-            continue
+    data = {}
+
+    # figure out where to find quantities
+    per_atom, per_structure, unknown = [], [], []
+    for q in set(quantities):
+        if q in PER_ATOM_FIELDS:
+            per_atom.append(q)
+        elif q in DEFAULT_PROPERTIES:
+            per_structure.append(q)
+        else:
+            unknown.append(q)
+
+    for q in per_atom:
+        data[q] = [getattr(geom.per_atom, q, MISSING) for geom in states]
+    for q in per_structure:
+        data[q] = [getattr(geom, q, MISSING) for geom in states]
+    for q in unknown:
+        # try both options
+        data[q] = []
         for geom in states:
-            value = getattr(geom.per_atom, k, MISSING)
+            value = getattr(geom, q, MISSING)
             if value is MISSING:
-                value = getattr(geom.meta, k, MISSING)
-            data[k].append(value)
-    return data
+                value = getattr(geom.per_atom, q, MISSING)
+            data[q].append(value)
 
-
-extract_quantities = python_app(_extract_quantities, executors=["default_threads"])
+    return data
 
 
-def _insert_quantities(
-    states: Sequence[Geometry], data: dict[str, list]
-) -> Sequence[Geometry]:
+def insert(states: Sequence[Geometry], data: dict[str, list]) -> Sequence[Geometry]:
     """
     Insert quantities from data into Geometry instances.
     """
     for q, values in data.items():
         assert len(states) == len(values)
-
-        if q in ("cell", "energy", "stress"):
-            for geom, v in zip(states, values):
-                setattr(geom, q, v)
-            continue
-
         for geom, v in zip(states, values):
             if isinstance(v, np.ndarray) and len(geom) == len(v):
                 setattr(geom.per_atom, q, v)
             else:
-                setattr(geom.meta, q, v)
-
+                setattr(geom, q, v)
     return states
 
 
-insert_quantities = python_app(_insert_quantities, executors=["default_threads"])
-
-
-def _extract_quantities_per_atom(
+def extract_per_atom(
     states: Sequence[Geometry],
     quantities: Sequence[str],
     atom_indices: Optional[Sequence[int]] = None,
@@ -116,431 +97,55 @@ def _extract_quantities_per_atom(
     return data
 
 
-extract_quantities_per_atom = python_app(
-    _extract_quantities_per_atom, executors=["default_threads"]
-)
-
-
-def iter_read_frames(file: FileLike) -> Generator[list[str]]:
-    """Yields text data to instantiate geometries"""
-    frame_regex = re.compile(r"^\d+$")
-    with open(file, "r") as f:
-        for line in f:
-            if frame_regex.match(_ := line.strip()):
-                n = int(_)
-                yield [line] + [f.readline() for _i in range(n + 1)]
-
-
-def _write_frames(
-    *states: Geometry | list[Geometry], outputs: Sequence[File] = ()
-) -> None:
-    """
-    Write Geometry instances to a file.
-
-    Args:
-        states: Variable number of (lists of) Geometry instances to write.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path.
-    """
-    assert len(outputs) == 1
-    data = []
-    for d in states:
-        if isinstance(d, list):
-            data.extend(d)
-        else:
-            data.append(d)
-    with open(outputs[0], "w") as f:
-        f.write("".join([geom.to_string() for geom in data]))
-
-
-write_frames = python_app(_write_frames, executors=["default_threads"])
-
-
-def _read_frames(
-    file: FileLike, indices: Optional[slice | list[int] | int] = None
+def filter_quantity(
+    states: Sequence[Geometry],
+    quantity: str,
 ) -> list[Geometry]:
     """
-    Read Geometry instances from a file.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        indices: Indices of frames to read. Can be None (read all), a slice, a list of integers, or a single integer.
-
-    """
-    if indices is None:
-        # read everything
-        data = ["".join(geom_str_list) for geom_str_list in iter_read_frames(file)]
-        return [Geometry.from_string(s) for s in data]
-
-    # figure out what frames to read
-    # TODO: reads twice + indices always wrap modulo nframes which might be unexpected
-    length = _count_frames(file)
-    if isinstance(indices, slice):
-        indices = list(range(length)[indices])
-    elif isinstance(indices, int):
-        indices = [indices]
-
-    if length > 0:
-        indices = [i % length for i in indices]  # for negative indices and wrapping
-    indices_ = set(indices)  # for *much* faster 'i in indices'
-
-    data = {}
-    for i, geom_str_list in enumerate(iter_read_frames(file)):
-        if i in indices_:
-            data[i] = Geometry.from_string("".join(geom_str_list))
-
-    # return in original order
-    return [data[i] for i in indices]
-
-
-read_frames = python_app(_read_frames, executors=["default_threads"])
-
-
-def _join_frames(inputs: Sequence[File] = (), outputs: Sequence[File] = ()) -> None:
-    """
-    Join multiple frame files into a single file.
-
-    Args:
-        inputs: List of Parsl futures. Each element should be a DataFuture
-                representing an input file path containing geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where joined frames will be written.
-    """
-    assert len(outputs) == 1
-
-    with open(outputs[0], "wb") as destination:
-        for input_file in inputs:
-            with open(input_file, "rb") as source:
-                shutil.copyfileobj(source, destination)
-
-
-join_frames = python_app(_join_frames, executors=["default_threads"])
-
-
-def _count_frames(file: FileLike) -> int:
-    """Count the number of frames in a file."""
-    path = Path(file)
-    threshold = 10 * 1024 * 1024  # 10 MB in bytes
-    file_size = path.stat().st_size
-
-    if file_size > threshold:  # use grep
-        cmd = f"grep -Fc Properties {path}"
-        result = subprocess.check_output(cmd.split())
-        return int(result.strip())
-    return len([_ for _ in iter_read_frames(path)])
-
-
-count_frames = python_app(_count_frames, executors=["default_threads"])
-
-
-def _reset_frames(file: FileLike, outputs: Sequence[File] = ()) -> None:
-    """
-    Reset all frames in a file.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where reset frames will be written.
-    """
-    assert len(outputs) == 1
-    data = _read_frames(file)
-    for geometry in data:
-        geometry.reset()
-    _write_frames(*data, outputs=outputs)
-
-
-reset_frames = python_app(_reset_frames, executors=["default_threads"])
-
-
-def _clean_frames(file: FileLike, outputs: Sequence[File] = ()) -> None:
+    Filter frames based on a specified quantity.
     """
-    Clean all frames in a file.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where reset frames will be written.
-    """
-    assert len(outputs) == 1
-    data = _read_frames(file)
-    for geometry in data:
-        geometry.clean()
-    _write_frames(*data, outputs=outputs)
+    data = extract(states, [quantity])[quantity]
+    return [geom for i, geom in enumerate(states) if not data[i] is MISSING]
 
 
-clean_frames = python_app(_clean_frames, executors=["default_threads"])
+def get_unique_numbers(states: Sequence[Geometry]) -> set[int]:
+    """Returns a set of unique atom numbers found across all states."""
+    numbers = set(el for geom in states for el in np.unique(geom.per_atom.numbers))
+    return {int(n) for n in numbers}
 
 
-def _apply_offset(
-    file: FileLike,
-    subtract: bool,
-    outputs: Sequence[File] = (),
-    **atomic_energies: float,
+def apply_energy_offset(
+    states: Sequence[Geometry], subtract: bool, **atomic_energies: float
 ) -> None:
-    """
-    Apply an energy offset to all frames in a file.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        subtract: Whether to subtract or add the offset.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where updated frames will be written.
-        **atomic_energies: Atomic energies for each element.
-    """
-    assert len(outputs) == 1
-    frames = _read_frames(file)
-    numbers_data = get_unique_numbers(frames)
+    """Apply an energy offset to all geometries"""
+    numbers_data = get_unique_numbers(states)
     numbers_kwargs = {atomic_numbers[e] for e in atomic_energies.keys()}
     assert numbers_data == numbers_kwargs, "Provide atomic energies for all elements.."
 
-    for geom in frames:
+    for geom in states:
         energy = get_atomic_energy(geom, atomic_energies)
         if subtract:
             geom.energy -= energy
         else:
             geom.energy += energy
 
-    _write_frames(frames, outputs=outputs)
-
-
-apply_offset = python_app(_apply_offset, executors=["default_threads"])
-
 
-def _get_elements(*files: File) -> set[str]:
+def assign_ids(
+    states: Sequence[Geometry], identifier: Optional[int] = None
+) -> tuple[Sequence[Geometry], int]:
     """
-    Get the set of elements present in all frames of a sequence of file.
-
-    Args:
-        inputs: List of Parsl DataFuture
+    Assign unique identifiers to Geometry instances, starting at provided identifier.
     """
-    frames = [geom for file in files for geom in _read_frames(file)]
-    return {chemical_symbols[i] for i in get_unique_numbers(frames)}
-
-
-get_elements = python_app(_get_elements, executors=["default_threads"])
-
+    # TODO: when would we want to supply the starting identifier?
+    if identifier is None:
+        # find largest existing value and add one
+        ids = extract(states, ["identifier"])["identifier"]
+        identifier = max([i for i in ids if i is not MISSING], default=0) + 1
 
-def _align_axes(file: FileLike, outputs: Sequence[File] = ()) -> None:
-    """
-    Align axes for all frames in a file.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where aligned frames will be written.
-    """
-    assert len(outputs) == 1
-    data = _read_frames(file)
-    for geometry in data:
-        geometry.align_axes()
-    _write_frames(*data, outputs=outputs)
-
-
-align_axes = python_app(_align_axes, executors=["default_threads"])
-
-
-def _app_filter(file: FileLike, quantity: str, outputs: Sequence[File] = ()) -> None:
-    """
-    Filter frames based on a specified quantity and writes to first output.
-    """
-    # TODO: where is this used?
-    data = _read_frames(file)
-    out = []
-
-    # None does not count
-    for geom in data:
-        if (quantity in ("cell", "energy", "stress")) and getattr(
-            geom, quantity, None
-        ) is not None:
-            out.append(geom)
+    for geom in states:
+        if hasattr(geom, "identifier"):
             continue
-
-        value = getattr(geom.per_atom, quantity, None)
-        if value is None:
-            value = getattr(geom.meta, quantity, None)
-        if value is not None:
-            out.append(geom)
-
-    _write_frames(out, outputs=outputs)
-
-
-# filter is protected
-app_filter = python_app(_app_filter, executors=["default_threads"])
-
-
-def _shuffle(
-    file: FileLike,
-    outputs: Sequence[File] = (),
-) -> None:
-    """
-    Shuffle the order of frames in a file.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where shuffled frames will be written.
-    """
-    assert len(outputs) == 1
-    frames = _read_frames(file)
-    np.random.shuffle(frames)
-    _write_frames(frames, outputs=outputs)
-
-
-shuffle = python_app(_shuffle, executors=["default_threads"])
-
-
-def _split_frames(
-    file: FileLike, fraction: float, shuffle: bool, outputs: Sequence[File] = ()
-) -> None:
-    """Split into training and validation sets"""
-    assert len(outputs) == 2
-    assert 0 <= fraction <= 1
-    frames = _read_frames(file)
-    order = np.arange(len(frames))
-    if shuffle:
-        np.random.shuffle(order)
-
-    n_train = int(len(frames) * fraction)
-    train_ids = order[:n_train]
-    val_ids = order[n_train:]
-
-    train = [frames[i] for i in train_ids]
-    val = [frames[i] for i in val_ids]
-    _write_frames(train, outputs=outputs[:1])
-    _write_frames(val, outputs=outputs[1:])
-
-
-split_frames = python_app(_split_frames, executors=["default_threads"])
-
-
-# TODO
-# TODO: think about where the parts below should live
-# TODO
-
-
-@typeguard.typechecked
-def _batch_frames(
-    batch_size: int,
-    inputs: list = [],
-    outputs: list = [],
-) -> Optional[list[Geometry]]:
-    """
-    Split frames into batches.
-
-    Args:
-        batch_size: Number of frames per batch.
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
-        outputs: List of Parsl futures. Each element should be a DataFuture
-                 representing an output file path for each batch.
-
-    Returns:
-        Optional[list[Geometry]]: List of Geometry instances if no outputs are specified.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    frame_regex = re.compile(r"^\d+$")
-
-    data = []
-    batch_index = 0
-    with open(inputs[0], "r") as f:
-        while True:
-            line = f.readline()
-            if not line:
-                break
-            if frame_regex.match(line.strip()):
-                natoms = int(line.strip())
-                _ = [f.readline() for _i in range(natoms + 1)]
-                data.append("".join([line] + _))
-                if len(data) == batch_size:
-                    with open(outputs[batch_index], "w") as g:
-                        g.write("\n".join(data))
-                    data = []
-                    batch_index += 1
-                else:  # write and clear
-                    pass
-    if len(data) > 0:
-        with open(outputs[batch_index], "w") as f:
-            f.write("\n".join([d.strip() for d in data if d is not None]))
-            f.write("\n")
-        batch_index += 1
-    assert batch_index == len(outputs)
-
-
-batch_frames = python_app(_batch_frames, executors=["default_threads"])
-
-
-# def _assign_identifier(
-#     state: Geometry,
-#     identifier: int,
-#     discard: bool = False,
-# ) -> tuple[Geometry, int]:
-#     """
-#     Assign an identifier to a Geometry instance.
-#
-#     Args:
-#         state (Geometry): Input Geometry instance.
-#         identifier (int): Identifier to assign.
-#         discard (bool, optional): Whether to discard the state. Defaults to False.
-#
-#     Returns:
-#         tuple[Geometry, int]: Updated Geometry and next available identifier.
-#     """
-#     if (state == NullState) or discard:
-#         return state, identifier
-#     else:
-#         assert state.identifier is None
-#         state.identifier = identifier
-#         return state, identifier + 1
-#
-#
-# assign_identifier = python_app(_assign_identifier, executors=["default_threads"])
-#
-
-
-@typeguard.typechecked
-def _assign_identifiers(
-    identifier: Optional[int],
-    inputs: list = [],
-    outputs: list = [],
-) -> int:
-    """
-    Assign identifiers to Geometry instances in a file.
-
-    Args:
-        identifier: Starting identifier value.
-        inputs: List of Parsl futures. The first element should be a DataFuture
-                representing the input file path containing geometry data.
-        outputs: List of Parsl futures. The first element should be a DataFuture
-                 representing the output file path where updated geometries will be written.
-
-    Returns:
-        int: Next available identifier.
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    data = _read_frames(slice(None), inputs=[inputs[0]])
-    states = []
-    if identifier is None:  # do not assign but look for max
-        identifier = -1
-        for geometry in data:
-            if geometry.identifier is not None:
-                identifier = max(identifier, geometry.identifier)
+        geom.identifier = identifier
         identifier += 1
-        for geometry in data:  # assign those which were not yet assigned
-            if geometry.identifier is None:
-                geometry, identifier = _assign_identifier(geometry, identifier)
-            states.append(geometry)
-        _write_frames(*states, outputs=[outputs[0]])
-        return identifier
-    else:
-        for geometry in data:
-            geometry, identifier = _assign_identifier(geometry, identifier)
-            states.append(geometry)
-        _write_frames(*states, outputs=[outputs[0]])
-        return identifier
-
 
-assign_identifiers = python_app(_assign_identifiers, executors=["default_threads"])
+    return states, identifier

From 7a50537f2549aa5606b8b8b19bb7356adc25a122 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Fri, 10 Apr 2026 14:34:07 +0200
Subject: [PATCH 06/16] Update geometry.py

- MISSING replaces None for missing values
- Geometry always has numbers, positions, cell
- Geometry returns MISSING or value for forces, energy, stress
- other missing attributes raise
---
 psiflow/geometry.py | 124 ++++++++++++++++++++++++++++++--------------
 1 file changed, 84 insertions(+), 40 deletions(-)

diff --git a/psiflow/geometry.py b/psiflow/geometry.py
index abac90e..9c0cd29 100644
--- a/psiflow/geometry.py
+++ b/psiflow/geometry.py
@@ -1,8 +1,8 @@
 import io
 import pickle
+import copy
 from pathlib import Path
-from types import SimpleNamespace
-from typing import Optional, Union, Any
+from typing import Optional, Union, Any, Final
 from collections.abc import Sequence
 
 import ase.calculators.calculator
@@ -15,7 +15,24 @@
 import psiflow
 
 # TODO: docstrings
-# TODO: custom attributes
+# TODO: custom attributes writing
+
+# these are always accessible for every geometry
+DEFAULT_PROPERTIES = "per_atom", "cell", "energy", "stress"
+PER_ATOM_FIELDS = "numbers", "positions", "forces"
+
+
+class MissingType:
+    """Placeholder sentinel for missing data fields - replaces None"""
+
+    def __repr__(self):
+        return "<MISSING>"
+
+    def __bool__(self):
+        return False
+
+
+MISSING: Final = MissingType()
 
 
 class PerAtom:
@@ -24,6 +41,10 @@ class PerAtom:
     For every attribute, the first array dimension should be n_atoms
     """
 
+    numbers: npt.NDArray[np.uint8]
+    positions: npt.NDArray[np.float64]
+    forces: npt.NDArray[np.float64]
+
     def __init__(
         self,
         numbers: npt.NDArray[np.uint8],
@@ -32,17 +53,20 @@ def __init__(
     ):
         self.numbers = numbers.astype(int).reshape(-1, 1)
         self.positions = positions
-        self._check(positions)
         for k, v in kwargs.items():
             setattr(self, k, v)
 
     def __setattr__(self, name, value):
-        if not self._check(value):
+        if value is MISSING:
+            return  # do not set MISSING values
+        elif not self._check(value):
             raise ValueError(f"Field '{name}' is not a per-atom property..")
         super().__setattr__(name, value)
 
     def __getattr__(self, name):
         # only runs if the attribute isn't found normally
+        if name == "forces":
+            return MISSING  # forces should always be accessible
         raise AttributeError(
             f"'{type(self).__name__}' instance has no attribute '{name}'"
         )
@@ -51,7 +75,7 @@ def __len__(self) -> int:
         return self.numbers.shape[0]
 
     def __str__(self) -> str:
-        return f"PerAtom[{len(self)}](keys={set(vars(self))})"
+        return f"PerAtom[{len(self)}]{set(vars(self))}"
 
     def _check(self, arr: npt.NDArray) -> bool:
         try:
@@ -68,7 +92,7 @@ def reset(self):
     def to_string(self) -> tuple[str, str]:
         symbols = [chemical_symbols[i] for i in self.numbers.flatten()]
         data = {"species": np.array(symbols).reshape(-1, 1), "pos": self.positions}
-        if hasattr(self, "forces"):
+        if not self.forces is MISSING:
             data["forces"] = self.forces
 
         # create extxyz header
@@ -146,16 +170,14 @@ class Geometry:
     """
 
     per_atom: PerAtom
-    cell: npt.NDArray
-    energy: Optional[float]
-    stress: Optional[np.ndarray]
+    cell: Optional[np.ndarray]
+    energy: float
+    stress: np.ndarray
 
     def __init__(
         self,
         per_atom: PerAtom,
-        energy: Optional[float] = None,
         cell: Optional[np.ndarray] = None,
-        stress: Optional[np.ndarray] = None,
         **kwargs: Any,
     ):
         """
@@ -163,29 +185,58 @@ def __init__(
         proceeds via the class methods
         """
         self.per_atom = per_atom
-        self.energy = energy
-        if cell is not None:
-            assert cell.shape == (3, 3)
         self.cell = cell
-        if stress is not None:
-            assert stress.shape == (3, 3)
-        self.stress = stress
-        self.meta = SimpleNamespace(**kwargs)
+
+        # set optional attributes
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+
+    def __setattr__(self, name, value):
+        if value is MISSING:
+            return  # do not set MISSING values
+
+        # enforce format for DEFAULT_PROPERTIES
+        if name == "cell" and value is not None:
+            assert isinstance(value, np.ndarray) and value.shape == (3, 3)
+        elif name == "energy":
+            value = float(value)
+            # assert isinstance(value, float)
+        elif name == "stress":
+            assert isinstance(value, np.ndarray) and value.shape == (3, 3)
+
+        super().__setattr__(name, value)
+
+    def __getattr__(self, name):
+        # only runs if the attribute isn't found normally
+        if name in ("energy", "stress"):
+            return MISSING  # should always be accessible
+        raise AttributeError(
+            f"'{type(self).__name__}' instance has no attribute '{name}'"
+        )
+
+    def attributes(self) -> dict[str, Any]:
+        """Return all non-MISSING instance attributes"""
+
+        return {k: v for k, v in vars(self).items() if k != "per_atom"}
 
     def reset(self) -> None:
         """
         Reset all computed properties of the geometry to their default values.
         """
         self.per_atom.reset()
-        self.energy = None
-        self.stress = None
+        for k in ("energy", "stress"):
+            if hasattr(self, k):
+                delattr(self, k)
 
     def clean(self) -> None:
         """
         Clean the geometry by resetting properties and removing additional information.
         """
-        self.reset()
-        self.meta = SimpleNamespace()
+        self.per_atom.reset()
+        for k in self.attributes():
+            if k == "cell":
+                continue
+            delattr(self, k)
 
     def align_axes(self) -> None:
         """
@@ -208,14 +259,13 @@ def to_string(self) -> str:
         """
         Convert the Geometry instance to a string representation in extended XYZ format.
         """
-        data = {k: getattr(self, k) for k in ("energy", "stress")}
+        data = self.attributes()
+        data.pop("cell")  # cell needs special treatment
         if self.periodic:
             data["Lattice"] = self.cell.T  # ase does Fortran ordering
             data["pbc"] = "T T T"
         else:
             data["pbc"] = "F F F"
-        data |= vars(self.meta)
-        data = {k: v for k, v in data.items() if v is not None}
         info_str = key_val_dict_to_str(data)
         properties, txt = self.per_atom.to_string()
         header = " ".join([properties, info_str])
@@ -272,7 +322,7 @@ def from_string(cls, s: str) -> Geometry:
         """
         n_atoms, header, body = s.strip().split("\n", 2)
         data = key_val_str_to_dict(header)
-        data.pop("pbc", None)
+        data.pop("pbc", None)  # geometry derives pbc from cell
         if "Lattice" in data:
             data["cell"] = data.pop("Lattice").T  # ase does Fortran ordering
         else:
@@ -299,8 +349,7 @@ def from_data(
         Create a Geometry instance from atomic numbers, positions, and cell data.
         """
         per_atom = PerAtom(numbers.copy(), positions.copy())
-        cell = cell if cell is None else cell.copy()
-        return Geometry(per_atom, cell=cell)
+        return Geometry(per_atom, cell=copy.copy(cell))
 
     @classmethod
     def from_atoms(cls, atoms: Atoms) -> Geometry:
@@ -316,6 +365,7 @@ def from_atoms(cls, atoms: Atoms) -> Geometry:
             try:
                 data["energy"] = atoms.get_potential_energy()
                 per_atom.forces = atoms.get_forces()
+                data["stress"] = atoms.get_stress(voigt=False)
             except ase.calculators.calculator.PropertyNotImplementedError:
                 pass  # property was not stored in calc
         return cls(per_atom, **data)
@@ -325,15 +375,15 @@ def periodic(self) -> bool:
         """
         Check if the geometry is periodic.
         """
-        return np.any(self.cell)
+        return not self.cell is None
 
     @property
-    def per_atom_energy(self) -> Optional[float]:
+    def per_atom_energy(self) -> float | MISSING:
         """
         Calculate the energy per atom.
         """
-        if self.energy is None:
-            return None
+        if self.energy is MISSING:
+            return MISSING
         return self.energy / len(self)
 
     @property
@@ -350,7 +400,7 @@ def atomic_masses(self) -> npt.NDArray:
         """
         Get the atomic masses of the atoms in the geometry.
         """
-        return np.array([atomic_masses[n] for n in self.per_atom.numbers]).flatten()
+        return np.array([atomic_masses[n] for n in self.numbers])
 
     @property
     def numbers(self) -> npt.NDArray:
@@ -504,12 +554,6 @@ def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray:
     return hessian / get_mass_matrix(geometry)
 
 
-def get_unique_numbers(states: Sequence[Geometry]) -> set[int]:
-    """Returns a set of unique atom numbers found across all states."""
-    numbers = set(el for geom in states for el in np.unique(geom.per_atom.numbers))
-    return {int(n) for n in numbers}
-
-
 def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> float:
     """Compute the total atomic energy based on provided single atom energies."""
     total = 0

From 04e93b0fd3681706c51155679e385a375c44966d Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Fri, 10 Apr 2026 15:02:30 +0200
Subject: [PATCH 07/16] update imports + test_data

---
 psiflow/abandonware.py         |  38 +-
 psiflow/data/dataset.py        |   1 -
 psiflow/data/file.py           |   1 +
 psiflow/data/utils.py          |  11 +-
 psiflow/functions.py           |  17 +-
 psiflow/hamiltonians.py        |   3 +-
 psiflow/learning.py            |   2 +-
 psiflow/metrics.py             |   3 +-
 psiflow/reference/reference.py |   6 +-
 psiflow/sampling/ase.py        |   2 +-
 psiflow/sampling/optimize.py   |   2 +-
 psiflow/sampling/sampling.py   |   2 +-
 psiflow/sampling/walker.py     |   5 +-
 psiflow/utils/apps.py          |   6 +-
 pyproject.toml                 |   4 +-
 tests/conftest.py              |   2 +-
 tests/test_data.py             | 610 ++++++++++++++++-----------------
 17 files changed, 362 insertions(+), 353 deletions(-)

diff --git a/psiflow/abandonware.py b/psiflow/abandonware.py
index 5d9f9b0..91e4982 100644
--- a/psiflow/abandonware.py
+++ b/psiflow/abandonware.py
@@ -4,13 +4,6 @@
 """
 
 
-def _check_equality(state0: Geometry, state1: Geometry) -> bool:
-    """Check if two Geometry instances are equal"""
-    return state0 == state1
-
-
-check_equality = python_app(_check_equality, executors=["default_threads"])
-
 
 @typeguard.typechecked
 def _check_distances(state: Geometry, threshold: float) -> Geometry:
@@ -49,3 +42,34 @@ def _check_distances(state: Geometry, threshold: float) -> Geometry:
 
 
 check_distances = python_app(_check_distances, executors=["default_htex"])
+
+
+
+def test_metrics(dataset):
+    # TODO: extracted from test_data.test_data_extract
+
+    data = dataset[:2] + Dataset([NullState]) + dataset[3:5]
+    forces = data.get("forces", elements=["Cu"])
+    reference = np.zeros((5, 4, 3))
+    reference[2, :] = np.nan  # ensure nan is in same place
+    reference[:, 0] = np.nan  # ensure nan is in same place
+    value = compute_rmse(forces, reference)
+
+    # last three atoms are Cu
+    forces = np.zeros((5, 4, 3))
+    for i in range(5):
+        forces[i, :] = data[i].result().per_atom.forces
+    forces[:, 0] = np.nan
+    assert np.allclose(
+        value.result(),
+        compute_rmse(forces, forces * np.zeros_like(forces)).result(),
+    )
+    unreduced = compute_rmse(
+        forces, forces * np.zeros_like(forces), reduce=False
+    ).result()
+    assert len(unreduced) == 5
+    unreduced_ = unreduced[np.array([0, 1, 3, 4], dtype=int)]
+    assert np.allclose(
+        np.sqrt(np.mean(np.square(unreduced_))),
+        value.result(),
+    )
\ No newline at end of file
diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index df79073..b5ad65c 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -26,7 +26,6 @@
     assign_identifiers,
 )
 
-
 # TODO: do all operations need to generate a new file?
 
 
diff --git a/psiflow/data/file.py b/psiflow/data/file.py
index b2c3902..21bc507 100644
--- a/psiflow/data/file.py
+++ b/psiflow/data/file.py
@@ -267,6 +267,7 @@ def _read_wrapper(
     frames_in = _read_frames(file)
     return execute(frames_in, *args, **kwargs)
 
+
 read_app = python_app(_read_wrapper, executors=["default_threads"])
 extract_quantities = partial(read_app, extract)
 extract_quantities_per_atom = partial(read_app, extract_per_atom)
diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py
index 8cfa5a1..d9bb37a 100644
--- a/psiflow/data/utils.py
+++ b/psiflow/data/utils.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from ase.data import atomic_numbers
+from parsl import python_app
 
 from psiflow.geometry import (
     Geometry,
@@ -12,7 +13,6 @@
     MISSING,
 )
 
-
 # TODO: some of these have side effects..
 
 
@@ -48,7 +48,7 @@ def extract(states: Sequence[Geometry], quantities: Sequence[str]) -> dict[str,
     return data
 
 
-def insert(states: Sequence[Geometry], data: dict[str, list]) -> Sequence[Geometry]:
+def insert(states: Sequence[Geometry], data: dict[str, list]) -> None:
     """
     Insert quantities from data into Geometry instances.
     """
@@ -59,7 +59,6 @@ def insert(states: Sequence[Geometry], data: dict[str, list]) -> Sequence[Geomet
                 setattr(geom.per_atom, q, v)
             else:
                 setattr(geom, q, v)
-    return states
 
 
 def extract_per_atom(
@@ -149,3 +148,9 @@ def assign_ids(
         identifier += 1
 
     return states, identifier
+
+
+@python_app(executors=["default_threads"])
+def check_equality(state0: Geometry, state1: Geometry) -> bool:
+    """Check if two Geometry instances are equal"""
+    return state0 == state1
diff --git a/psiflow/functions.py b/psiflow/functions.py
index a461e7d..13d3ebb 100644
--- a/psiflow/functions.py
+++ b/psiflow/functions.py
@@ -8,13 +8,12 @@
 from collections.abc import Sequence
 
 import numpy as np
-from ase import Atoms
 from ase.calculators.calculator import Calculator
 from ase.data import atomic_masses
 from ase.units import fs, kJ, mol, nm
 from ase.stress import voigt_6_to_full_3x3_stress
 
-from psiflow.geometry import Geometry, NullState, create_outputs
+from psiflow.geometry import Geometry
 
 
 def format_output(
@@ -212,7 +211,7 @@ def __call__(
         self,
         geometry: Geometry,
     ) -> dict[str, float | np.ndarray]:
-        atoms = geometry_to_atoms(geometry)
+        atoms = geometry.to_atoms(structural_only=True)
         self.calc.calculate(atoms)
         return format_output(geometry, **self.calc.results)
 
@@ -243,7 +242,7 @@ def __call__(
         self,
         geometry: Geometry,
     ) -> dict[str, float | np.ndarray]:
-        atoms = geometry_to_atoms(geometry)
+        atoms = geometry.to_atoms(structural_only=True)
         self.calc.calculate(atoms)
         return format_output(geometry, **self.calc.results)
 
@@ -296,13 +295,3 @@ def function_from_json(path: Union[str, Path], **kwargs) -> Function:
             data[key] = value
     function = function_cls(**data)
     return function
-
-
-def geometry_to_atoms(geom: Geometry) -> Atoms:
-    """Only structural information"""
-    return Atoms(
-        positions=geom.per_atom.positions,
-        numbers=geom.per_atom.numbers,
-        cell=np.copy(geom.cell) if geom.periodic else None,
-        pbc=geom.periodic,
-    )
diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py
index f101188..e337daa 100644
--- a/psiflow/hamiltonians.py
+++ b/psiflow/hamiltonians.py
@@ -11,7 +11,8 @@
 from parsl.dataflow.futures import AppFuture, Future
 
 import psiflow
-from psiflow.data import Computable, Dataset, aggregate_multiple, compute
+from psiflow.data import Dataset
+from psiflow.compute import Computable, aggregate_multiple, compute
 from psiflow.functions import (
     EinsteinCrystalFunction,
     HarmonicFunction,
diff --git a/psiflow/learning.py b/psiflow/learning.py
index 11cf8d0..a1193c9 100644
--- a/psiflow/learning.py
+++ b/psiflow/learning.py
@@ -12,7 +12,7 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.geometry import Geometry, NullState, assign_identifier
+from psiflow.geometry import Geometry#, NullState, assign_identifier
 from psiflow.hamiltonians import Hamiltonian, Zero
 from psiflow.metrics import Metrics
 # from psiflow.models import Model
diff --git a/psiflow/metrics.py b/psiflow/metrics.py
index 7b7e351..60cb99c 100644
--- a/psiflow/metrics.py
+++ b/psiflow/metrics.py
@@ -386,7 +386,8 @@ def _update_logs(
     inputs: list = [],
     outputs: list = [],
 ):
-    from psiflow.data.utils import _compute_rmse, _extract_quantities, _read_frames
+    from psiflow.data.utils import _extract_quantities, _read_frames
+    from psiflow.compute import _compute_rmse
     from psiflow.utils.io import _load_metrics, _save_metrics
 
     data0 = _read_frames(inputs=[inputs[1]])
diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py
index 1a6d6d7..cfbfe10 100644
--- a/psiflow/reference/reference.py
+++ b/psiflow/reference/reference.py
@@ -14,8 +14,8 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.data.utils import extract_quantities
-from psiflow.geometry import Geometry, NullState
+from psiflow.data.utils import extract
+from psiflow.geometry import Geometry
 from psiflow.utils.apps import copy_app_future
 from psiflow.utils.parse import LineNotFoundError, get_task_name_id
 
@@ -149,7 +149,7 @@ def compute(
         #  but extract_quantities does not accept AppFuture[list[Geometry]] (yet?)
         future_dataset = self.compute_dataset(dataset)
         outputs = outputs or self.outputs
-        future_data = extract_quantities(
+        future_data = extract(
             tuple(outputs), None, None, inputs=[future_dataset.extxyz]
         )
         if len(outputs) == 1:
diff --git a/psiflow/sampling/ase.py b/psiflow/sampling/ase.py
index 69dde69..8a7f9e2 100644
--- a/psiflow/sampling/ase.py
+++ b/psiflow/sampling/ase.py
@@ -7,7 +7,7 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.data.utils import write_frames
+from psiflow.data.file import write_frames
 from psiflow.geometry import Geometry
 from psiflow.hamiltonians import Hamiltonian
 from psiflow.utils.io import _dump_json
diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py
index 3d09880..a5e97df 100644
--- a/psiflow/sampling/optimize.py
+++ b/psiflow/sampling/optimize.py
@@ -9,7 +9,7 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.data.utils import write_frames
+from psiflow.data.file import write_frames
 from psiflow.geometry import Geometry
 from psiflow.hamiltonians import Hamiltonian, MACEHamiltonian
 from psiflow.sampling.sampling import (
diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py
index 117f8b7..8650f03 100644
--- a/psiflow/sampling/sampling.py
+++ b/psiflow/sampling/sampling.py
@@ -12,7 +12,7 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.data.utils import read_frames
+from psiflow.data.file import read_frames
 from psiflow.hamiltonians import Hamiltonian, MixtureHamiltonian, Zero, MACEHamiltonian
 from psiflow.sampling.output import (
     DEFAULT_OBSERVABLES,
diff --git a/psiflow/sampling/walker.py b/psiflow/sampling/walker.py
index d26d243..8d9ada5 100644
--- a/psiflow/sampling/walker.py
+++ b/psiflow/sampling/walker.py
@@ -12,10 +12,9 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.geometry import Geometry, check_equality
+from psiflow.geometry import Geometry
+from psiflow.data.utils import check_equality
 from psiflow.hamiltonians import Hamiltonian, Zero, combine_hamiltonians
-
-# from psiflow.order_parameters import OrderParameter
 from psiflow.sampling.metadynamics import Metadynamics
 from psiflow.utils.apps import copy_app_future
 
diff --git a/psiflow/utils/apps.py b/psiflow/utils/apps.py
index d4e7937..134ad2a 100644
--- a/psiflow/utils/apps.py
+++ b/psiflow/utils/apps.py
@@ -1,6 +1,7 @@
 import shutil
 import textwrap
 from typing import Any, Union
+from collections.abc import Sequence
 from pathlib import Path
 
 import numpy as np
@@ -38,8 +39,8 @@ def _compute_sum(a, b):
 
 def _copy_data_future(
     pass_on_exist: bool = False,
-    inputs: list[File] = [],
-    outputs: list[File] = [],
+    inputs: Sequence[File] = (),
+    outputs: Sequence[File] = ()
 ) -> None:
     assert len(inputs) == 1
     assert len(outputs) == 1
@@ -49,7 +50,6 @@ def _copy_data_future(
         shutil.copyfile(inputs[0], outputs[0])
     else:  # no need to copy empty file
         pass
-    return
 
 
 copy_data_future = python_app(_copy_data_future, executors=["default_threads"])
diff --git a/pyproject.toml b/pyproject.toml
index 98e13db..4b058d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "psiflow"
-version = "4.0.4"
+version = "4.0.5"
 description = "Library for developing interatomic potentials"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -14,7 +14,7 @@ dependencies = [
     "pyyaml>=6.0",
 #    "numpy>=1.22.3, <2",
     "parsl==2026.02.23",
-    "prettytable",
+    "prettytable",  # TODO: what is this used for?
     "psutil",
 #    "cp2k-input-tools @ git+https://github.com/cp2k/cp2k-input-tools.git@3b9929735dcb3c8c0620a548b1fe20efecbad077",  # need 2024.1
     "ipi @ git+https://github.com/i-pi/i-pi.git@v3.1.10",
diff --git a/tests/conftest.py b/tests/conftest.py
index 58d2068..6ae4928 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -96,7 +96,7 @@ def generate_emt_cu_data(nstates, amplitude, supercell=None) -> list[ase.Atoms]:
     return atoms_list
 
 
-@pytest.fixture
+@pytest.fixture  # TODO: do we need to regenerate this every test?
 def dataset(context) -> Dataset:
     data = generate_emt_cu_data(20, 0.2)
     data += generate_emt_cu_data(5, 0.15, supercell=np.diag([1, 2, 1]))
diff --git a/tests/test_data.py b/tests/test_data.py
index 738993f..e5d8155 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -1,4 +1,3 @@
-import copy
 import os
 
 import numpy as np
@@ -8,18 +7,14 @@
 from parsl.app.futures import DataFuture
 from parsl.data_provider.files import File
 
-import psiflow
-from psiflow.data import Dataset, compute_rmse
-from psiflow.data.utils import (
-    _read_frames,
-    _write_frames,
-    get_index_element_mask,
-    read_frames,
-)
-from psiflow.geometry import Geometry, NullState, check_equality
+from psiflow.geometry import Geometry, MISSING
+from psiflow.data import Dataset
+from psiflow.data.file import _read_frames, _write_frames
+from psiflow.data.utils import insert
 
 
 def test_geometry(tmp_path):
+    # TODO: custom per_atom field
     geometry = Geometry.from_data(
         numbers=np.arange(1, 6),
         positions=np.random.uniform(0, 1, size=(5, 3)),
@@ -29,9 +24,7 @@ def test_geometry(tmp_path):
 
     _ = np.array(geometry.per_atom.positions)
     atoms = Atoms(
-        numbers=np.arange(1, 6),
-        positions=geometry.per_atom.positions,
-        pbc=False,
+        numbers=np.arange(1, 6), positions=geometry.per_atom.positions, pbc=False
     )
     geometry_ = Geometry.from_atoms(atoms)
     assert geometry == geometry_
@@ -46,108 +39,99 @@ def test_geometry(tmp_path):
     address = geometry.per_atom.positions.ctypes.data
     address_ = geometry_.per_atom.positions.ctypes.data
     assert address != address_  # should copy
-    assert np.allclose(
-        geometry.per_atom.positions,
-        geometry_.per_atom.positions,
-    )
+    assert np.allclose(geometry.per_atom.positions, geometry_.per_atom.positions)
     geometry.align_axes()
     assert geometry.per_atom.positions.ctypes.data == address  # should not copy
 
-    fake_data = []
+    # create atoms dataset
+    atoms = []
     for i in range(10):
-        atoms = Atoms(
+        at = Atoms(
             numbers=np.arange(i + 5),
             positions=np.random.uniform(-3, 3, size=(i + 5, 3)),
             pbc=False,
         )
-        fake_data.append(atoms)
-    fake_data[2].info["energy"] = 1.0
-    fake_data[3].arrays["forces"] = np.random.uniform(-3, 3, size=(8, 3))
-    fake_data[1].info["stdout"] = "abcd"
-    fake_data[6].info["logprob"] = np.array([-23.1, 22.1])
-    fake_data[7].cell = np.random.uniform(-3, 4, size=(3, 3))
-    fake_data[7].pbc = True
-
-    write(tmp_path / "test.xyz", fake_data)
-
-    data = read_frames(inputs=[str(tmp_path / "test.xyz")]).result()
+        atoms.append(at)
+    atoms[2].info["energy"] = 1.0
+    atoms[3].arrays["forces"] = np.random.uniform(-3, 3, size=(8, 3))
+    atoms[1].info["stdout"] = "abcd"
+    atoms[6].info["logprob"] = np.array([-23.1, 22.1])
+    atoms[7].cell = np.random.uniform(-3, 4, size=(3, 3))
+    atoms[7].pbc = True
+
+    write(tmp_path / "atoms.xyz", atoms)
+
+    # check reading from extxyz
+    data = _read_frames(tmp_path / "atoms.xyz")
     assert data is not None  # should return when no output file is given
-    assert len(data) == len(fake_data)
-    for i in range(10):
-        assert np.allclose(
-            fake_data[i].get_positions(),
-            data[i].per_atom.positions,
-        )
-        assert np.allclose(
-            fake_data[i].numbers,
-            data[i].per_atom.numbers,
-        )
-        assert data[i] == Geometry.from_atoms(fake_data[i])
+    assert len(data) == len(atoms)
+    for at, geom in zip(atoms, data):
+        assert geom == Geometry.from_atoms(at)
     assert data[2].energy == 1.0
     assert data[2].per_atom_energy == 1.0 / 7
-    assert data[3].per_atom_energy is None
-    assert np.allclose(
-        data[3].per_atom.forces,
-        fake_data[3].arrays["forces"],
-    )
+    assert data[3].per_atom_energy is MISSING
+    assert np.allclose(data[3].per_atom.forces, atoms[3].arrays["forces"])
     assert data[1].stdout == "abcd"
     assert type(data[6].logprob) is np.ndarray
-    assert np.allclose(
-        data[6].logprob,
-        fake_data[6].info["logprob"],
-    )
+    assert np.allclose(data[6].logprob, atoms[6].info["logprob"])
     assert data[7].periodic
-    assert np.allclose(
-        data[7].cell,
-        np.array(fake_data[7].cell),
-    )
-    assert np.allclose(
-        data[6].cell,
-        np.zeros((3, 3)),
-    )
-    geometry = read_frames(indices=[3], inputs=[str(tmp_path / "test.xyz")]).result()[0]
+    assert np.allclose(data[7].cell, atoms[7].cell.array)
+    assert data[6].cell is None
+
+    geometry = _read_frames(tmp_path / "atoms.xyz", indices=[3])[0]
     assert geometry == data[3]
     assert not geometry == data[4]
 
-    # check writing
-    data = read_frames(
-        inputs=[str(tmp_path / "test.xyz")],
-        outputs=[File(str(tmp_path / "test_.xyz"))],
-    ).result()
-    assert data is None
-
-    data = read_frames(inputs=[File(str(tmp_path / "test.xyz"))]).result()
-    data_ = read_frames(inputs=[File(str(tmp_path / "test_.xyz"))]).result()
-
+    # check writing to extxyz
+    geoms = [Geometry.from_atoms(at) for at in atoms]
+    _write_frames(geoms, outputs=[File(tmp_path / "geometries.xyz")])
+    atoms_ = read(tmp_path / "geometries.xyz", ":")
+    for at, at_ in zip(atoms, atoms_):
+        assert np.allclose(at.cell, at_.cell)
+        data = at.arrays | at.info | (at.calc.results if at.calc else {})
+        data_ = at_.arrays | at_.info | (at_.calc.results if at_.calc else {})
+        for k in data:
+            if k == "cell":
+                pass
+            elif isinstance(data[k], np.ndarray):
+                assert np.allclose(data[k], data_[k])
+            else:
+                assert data[k] == data_[k]
+
+    data = _read_frames(tmp_path / "atoms.xyz")
+    data_ = _read_frames(tmp_path / "geometries.xyz")
     for state, state_ in zip(data, data_):
         assert state == state_
         assert state.energy == state_.energy
-        if state.stress is not None:
-            assert np.allclose(
-                state.stress,
-                state_.stress,
-            )
-        assert state.delta == state_.delta
-        assert state.phase == state_.phase
-        if state.logprob is not None:
-            assert np.allclose(state.logprob, state_.logprob)
-        assert state.stdout == state_.stdout
-        assert state.identifier == state.identifier
+        assert state.stress is MISSING or np.allclose(state.stress, state_.stress)
+        attrs = state.attributes()
+        attrs_ = state_.attributes()
+        for val, val_ in zip(attrs.values(), attrs_.values()):
+            if isinstance(val, np.ndarray):
+                assert np.allclose(val, val_)
+            else:
+                assert val == val_
 
     state.save(tmp_path / "geo.xyz")
-    state.save(str(tmp_path / "geo.xyz"))
     assert state == Geometry.load(tmp_path / "geo.xyz")
 
+    # default attributes have type (and shape) requirements
+    with pytest.raises(TypeError):
+        state.energy = None
+    with pytest.raises(AssertionError):
+        state.cell = [[1, 2, 3]] * 3
+
 
 def test_readwrite_cycle(dataset, tmp_path):
+    tmp_file = File(tmp_path / "test.xyz")
+
     data = dataset[:4].geometries().result()
-    data[2].order["test"] = 324
-    _write_frames(*data, outputs=[str(tmp_path / "test.xyz")])
+    data[2].test = 324
+    _write_frames(*data, outputs=[tmp_file])
     loaded = Dataset(data)
-    assert "test" in loaded[2].result().order
-
-    states = _read_frames(inputs=[str(tmp_path / "test.xyz")])
-    assert "test" in states[2].order
+    assert hasattr(loaded[2].result(), "test")
+    states = _read_frames(tmp_file)
+    assert hasattr(states[2], "test")
 
     s = """3
 energy=3.0 phase=c7eq Properties=species:S:1:pos:R:3:momenta:R:3:forces:R:3
@@ -155,19 +139,15 @@ def test_readwrite_cycle(dataset, tmp_path):
 O 1 2 3 4 5 6 7 8 9
 F 2 3 4 5 6 7 8 9 10
 """
-    geometry = Geometry.from_string(s, natoms=None)
+    geometry = Geometry.from_string(s)
     assert len(geometry) == 3
     assert geometry.energy == 3.0
     assert geometry.phase == "c7eq"
     assert not geometry.periodic
-    assert np.all(np.logical_not(np.isnan(geometry.per_atom.forces)))
-    assert np.allclose(
-        geometry.per_atom.numbers,
-        np.array([6, 8, 9]),
-    )
+    assert not np.any(np.isnan(geometry.per_atom.forces))
+    assert np.allclose(geometry.numbers, np.array([6, 8, 9]))
     assert np.allclose(
-        geometry.per_atom.forces,
-        np.array([[6,7,8], [7,8,9], [8,9,10]]),
+        geometry.per_atom.forces, np.array([[6, 7, 8], [7, 8, 9], [8, 9, 10]])
     )
     s = """7
 
@@ -182,33 +162,39 @@ def test_readwrite_cycle(dataset, tmp_path):
     geometry = Geometry.from_string(s)
     assert not geometry.periodic
     assert len(geometry) == 7
-    assert geometry.energy is None
+    assert geometry.energy is MISSING
 
-    geometry = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None)
-    geometry.stress = np.array([np.nan] * 9).reshape(3, 3)
-    assert "nan" not in geometry.to_string()
+    geom = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None)
+    geom.stress = np.array([np.nan] * 9).reshape(3, 3)
+    string = geom.to_string()
+    assert "nan" in string  # the stress field
+    geom_ = Geometry.from_string(string)
+    assert np.isnan(geom_.stress).all()
 
 
-def test_dataset_empty(tmp_path):
-    dataset = Dataset([])
-    assert dataset.length().result() == 0
-    assert isinstance(dataset.extxyz, DataFuture)
-    path_xyz = tmp_path / "test.xyz"
-    dataset.save(path_xyz)  # ensure the copy is executed before assert
-    psiflow.wait()
-    assert os.path.isfile(path_xyz)
-    with pytest.raises(ValueError):  # cannot save outside cwd
-        dataset.save(path_xyz.parents[3] / "test.xyz")
-
-
-def test_dataset_append(dataset):
+def test_dataset(dataset, tmp_path):
+    """Verifying basic dataset functionality"""
     l = dataset.length().result()  # noqa: E741
     geometries = dataset.geometries().result()
     assert len(geometries) == l
     assert type(geometries) is list
     assert type(geometries[0]) is Geometry
     for i in range(l):
-        assert check_equality(geometries[i], dataset[i]).result()
+        assert geometries[i] == dataset[i].result()
+
+    path_xyz = tmp_path / "data.xyz"
+    dataset.save(path_xyz).result()
+    assert os.path.isfile(path_xyz)
+    with pytest.raises(ValueError):  # cannot save outside cwd
+        dataset.save(path_xyz.parents[3] / "test.xyz")
+
+    dataset = Dataset([])
+    assert dataset.length().result() == 0
+    assert isinstance(dataset.extxyz, DataFuture)
+
+
+def test_dataset_append(dataset):
+    l = dataset.length().result()
     empty = Dataset([])  # use [] instead of None
     empty += dataset
     assert l == empty.length().result()
@@ -216,19 +202,16 @@ def test_dataset_append(dataset):
     assert added.length().result() == 2 * l
     assert dataset.length().result() == l  # must not have changed
 
+
+def test_dataset_align(dataset):
     # test canonical transformation
     transformed = dataset.align_axes()
-    for i in range(dataset.length().result()):
-        geometry = dataset[i].result()
-        geometry.align_axes()
-        assert np.allclose(
-            geometry.per_atom.positions,
-            transformed[i].result().per_atom.positions,
-        )
-        assert np.allclose(
-            geometry.cell,
-            transformed[i].result().cell,
-        )
+    geoms = dataset.geometries().result()
+    geoms_ = transformed.geometries().result()
+    for geom, geom_ in zip(geoms, geoms_):
+        geom.align_axes()
+        assert np.allclose(geom.per_atom.positions, geom_.per_atom.positions)
+        assert np.allclose(geom.cell, geom_.cell)
 
 
 def test_dataset_slice(dataset):
@@ -255,63 +238,6 @@ def test_dataset_slice(dataset):
     for i, geometry in enumerate(subset.geometries().result()):
         assert geometry == dataset[i].result()
 
-
-def test_dataset_from_xyz(tmp_path, dataset):
-    path_xyz = tmp_path / "data.xyz"
-    dataset.save(path_xyz)
-    psiflow.wait()
-    _ = Dataset.load(path_xyz)
-    atoms_list = read(path_xyz, index=":")
-
-    for i in range(dataset.length().result()):
-        assert np.allclose(
-            dataset[i].result().per_atom.positions,
-            atoms_list[i].get_positions(),
-        )
-        assert dataset[i].result().energy == atoms_list[i].calc.results["energy"]
-
-
-def test_index_element_mask():
-    numbers = np.array([1, 1, 1, 6, 6, 8, 8, 8])
-    elements = ["H"]
-    indices = [0, 1, 4, 5]
-    assert np.allclose(
-        np.array([True, True, False, False, False, False, False, False]),
-        get_index_element_mask(numbers, indices, elements),
-    )
-    elements = ["H", "O"]
-    indices = [0, 1, 4, 5]
-    assert np.allclose(
-        np.array([True, True, False, False, False, True, False, False]),
-        get_index_element_mask(numbers, indices, elements),
-    )
-    elements = ["H", "O"]
-    indices = [3]
-    assert np.allclose(
-        get_index_element_mask(numbers, indices, elements),
-        np.array([False] * len(numbers)),
-    )
-    elements = ["H", "O"]
-    indices = None
-    assert np.allclose(
-        get_index_element_mask(numbers, indices, elements),
-        np.array([True, True, True, False, False, True, True, True]),
-    )
-    elements = None
-    indices = [0, 1, 2, 3, 5]
-    assert np.allclose(
-        get_index_element_mask(numbers, indices, elements),
-        np.array([True, True, True, True, False, True, False, False]),
-    )
-    elements = ["Cl"]  # not present at all
-    indices = None
-    assert np.allclose(
-        get_index_element_mask(numbers, indices, elements),
-        np.array([False] * len(numbers)),
-    )
-
-
-def test_dataset_gather(dataset):
     indices = [0, 3, 2, 6, 1]
     gathered = dataset[indices]
     assert gathered.length().result() == len(indices)
@@ -322,180 +248,244 @@ def test_dataset_gather(dataset):
         )
 
 
+def test_dataset_from_xyz(tmp_path, dataset):
+    path_xyz = tmp_path / "data.xyz"
+    dataset.save(path_xyz).result()
+    atoms_list = read(path_xyz, index=":")
+    geoms = dataset.geometries().result()
+    for geom, at in zip(geoms, atoms_list):
+        assert np.allclose(geom.per_atom.positions, at.get_positions())
+        assert geom.energy == at.calc.results["energy"]
+
+
+# def test_index_element_mask():
+#     numbers = np.array([1, 1, 1, 6, 6, 8, 8, 8])
+#     elements = ["H"]
+#     indices = [0, 1, 4, 5]
+#     assert np.allclose(
+#         np.array([True, True, False, False, False, False, False, False]),
+#         get_index_element_mask(numbers, indices, elements),
+#     )
+#     elements = ["H", "O"]
+#     indices = [0, 1, 4, 5]
+#     assert np.allclose(
+#         np.array([True, True, False, False, False, True, False, False]),
+#         get_index_element_mask(numbers, indices, elements),
+#     )
+#     elements = ["H", "O"]
+#     indices = [3]
+#     assert np.allclose(
+#         get_index_element_mask(numbers, indices, elements),
+#         np.array([False] * len(numbers)),
+#     )
+#     elements = ["H", "O"]
+#     indices = None
+#     assert np.allclose(
+#         get_index_element_mask(numbers, indices, elements),
+#         np.array([True, True, True, False, False, True, True, True]),
+#     )
+#     elements = None
+#     indices = [0, 1, 2, 3, 5]
+#     assert np.allclose(
+#         get_index_element_mask(numbers, indices, elements),
+#         np.array([True, True, True, True, False, True, False, False]),
+#     )
+#     elements = ["Cl"]  # not present at all
+#     indices = None
+#     assert np.allclose(
+#         get_index_element_mask(numbers, indices, elements),
+#         np.array([False] * len(numbers)),
+#     )
+
+
 def test_data_elements(dataset):
-    assert "H" in dataset.elements().result()
-    assert "Cu" in dataset.elements().result()
-    assert len(dataset.elements().result()) == 2
+    elements = dataset.elements().result()
+    assert "H" in elements
+    assert "Cu" in elements
+    assert len(elements) == 2
 
 
 def test_data_reset(dataset):
     dataset = dataset.reset()
-    assert dataset[0].result().energy is None
+    geometries = dataset.geometries().result()
+    for geom in geometries:
+        assert geom.energy is MISSING
+        assert geom.per_atom.forces is MISSING
 
 
 def test_data_split(dataset):
     train, valid = dataset.split(0.9)
-    assert (
-        train.length().result() + valid.length().result() == dataset.length().result()
-    )
+    train_geoms = train.geometries().result()
+    valid_geoms = valid.geometries().result()
+    assert len(train_geoms) + len(valid_geoms) == dataset.length().result()
+    for geom_t in train_geoms:
+        for geom_v in valid_geoms:
+            assert geom_t != geom_v
 
 
-def test_identifier(dataset):
-    data = dataset + Dataset([NullState, NullState, dataset[0]])
+def test_identifier(dataset, dataset_h2):
+    data = dataset + dataset_h2
     identifier = data.assign_identifiers(0)
-    assert identifier.result() == dataset.length().result() + 1
-    assert identifier.result() == data.not_null().length().result()
-    for i in range(data.length().result()):
-        if not data[i].result() == NullState:
-            assert data[i].result().identifier < identifier.result()
-            # assert data[i].result().reference_status
-    data = data.clean()  # also removes identifier
-    for i in range(data.length().result()):
-        s = data[i].result()
-        if not s == NullState:
-            assert s.identifier is None
-    identifier = data.assign_identifiers(10)
-    assert identifier.result() == 10 + data.not_null().length().result()
-    identifier = dataset.assign_identifiers(10)
-    assert (
-        dataset.assign_identifiers().result()
-        == 10 + dataset.not_null().length().result()
-    )
-    for i in range(dataset.length().result()):
-        s = dataset[i].result()
-        if not s == NullState:
-            assert s.identifier >= 10
-
-    identifier = data.assign_identifiers()
-    data = data.clean()
-    assert (
-        data.assign_identifiers(identifier).result()
-        == identifier.result() + data.not_null().length().result()
-    )
+    assert identifier.result() == data.length().result()
+
+    data = data.clean()  # removes identifier
+    geoms = data.geometries().result()
+    for geom in geoms:
+        assert not hasattr(geom, "identifier")
+
+    identifier = dataset.assign_identifiers(5).result()
+    geoms = dataset.geometries().result()
+    assert identifier == len(geoms) + 5
+    for geom in geoms:
+        assert geom.identifier < identifier
+
+    # set initial value
+    geoms = dataset_h2.geometries().result()
+    geoms[10].identifier = 42
+
+    # count from 0
+    data = Dataset(geoms)
+    identifier = data.assign_identifiers(0).result()
+    geoms_ = data.geometries().result()
+    assert identifier == len(geoms) - 1  # skips over one geometry
+    assert geoms_[0].identifier == 0
+    assert geoms_[10].identifier == 42
+    assert geoms_[-1].identifier == identifier - 1
+
+    # count from highest
+    data = Dataset(geoms)
+    identifier = data.assign_identifiers().result()
+    geoms_ = data.geometries().result()
+    assert identifier == len(geoms) + 42  # skips over one geometry
+    assert geoms_[0].identifier == 43
+    assert geoms_[10].identifier == 42
+    assert geoms_[-1].identifier == identifier - 1
 
 
 def test_data_offset(dataset):
-    atomic_energies = {
-        "H": 34,
-        "Cu": 12,
-    }
-    data = dataset.subtract_offset(**atomic_energies)
-    data_ = data.add_offset(**atomic_energies)
-    for i in range(dataset.length().result()):
-        natoms = len(dataset[i].result())
-        offset = (natoms - 1) * atomic_energies["Cu"] + atomic_energies["H"]
-        assert np.allclose(
-            data[i].result().energy,
-            dataset[i].result().energy - offset,
-        )
-        assert np.allclose(  # unchanged
-            data[i].result().per_atom.forces,
-            dataset[i].result().per_atom.forces,
-        )
-        assert np.allclose(
-            data_[i].result().energy,
-            dataset[i].result().energy,
-        )
+    atomic_energies = {"H": 34, "Cu": 12}
+    data0 = dataset.subtract_offset(**atomic_energies)
+    data1 = data0.add_offset(**atomic_energies)
+
+    geoms = dataset.geometries().result()
+    geoms0 = data0.geometries().result()
+    geoms1 = data1.geometries().result()
+
+    for g, g0, g1 in zip(geoms, geoms0, geoms1):
+        offset = (len(g) - 1) * atomic_energies["Cu"] + atomic_energies["H"]
+        assert np.allclose(g0.energy, g.energy - offset)
+        assert np.allclose(g0.per_atom.forces, g.per_atom.forces)
+        assert np.allclose(g1.energy, g.energy)
 
 
 def test_data_extract(dataset):
     state = dataset[0].result()
-    state.energy = None
+    state.reset()
     state.identifier = 0
     state.delta = 6
 
+    # check Dataset.get
     data = dataset[:5] + dataset[-5:] + Dataset([state])
     energy, forces, identifier = data.get("energy", "forces", "identifier")
     energy = energy.result()
     forces = forces.result()
     identifier = identifier.result()
+    assert energy[-1] is MISSING and forces[-1] is MISSING
+    assert identifier[-1] == 0
     for i, geometry in enumerate(data.geometries().result()):
         if geometry.energy:
             assert np.allclose(geometry.energy, energy[i])
-        n = len(geometry)
-        assert np.allclose(
-            geometry.per_atom.forces,
-            forces[i][:n, :],
-        )
-        if identifier[i].item() != -1:
+        try:
+            assert np.allclose(geometry.per_atom.forces, forces[i])
+        except TypeError:
+            assert geometry.per_atom.forces is MISSING and forces[i] is MISSING
+        if hasattr(geometry, "identifier"):
             assert geometry.identifier == identifier[i]
-        if i < 10:
-            assert identifier[i] == -1
-    assert np.isnan(np.mean(energy))
-    assert np.isnan(np.mean(forces))
-    psiflow.wait()
-
-    data = dataset[:2] + Dataset([NullState]) + dataset[3:5]
-    forces = data.get("forces", elements=["Cu"])
-    reference = np.zeros((5, 4, 3))
-    reference[2, :] = np.nan  # ensure nan is in same place
-    reference[:, 0] = np.nan  # ensure nan is in same place
-    value = compute_rmse(forces, reference)
-
-    # last three atoms are Cu
-    forces = np.zeros((5, 4, 3))
-    for i in range(5):
-        forces[i, :] = data[i].result().per_atom.forces
-    forces[:, 0] = np.nan
-    assert np.allclose(
-        value.result(),
-        compute_rmse(forces, forces * np.zeros_like(forces)).result(),
-    )
-    unreduced = compute_rmse(
-        forces, forces * np.zeros_like(forces), reduce=False
-    ).result()
-    assert len(unreduced) == 5
-    unreduced_ = unreduced[np.array([0, 1, 3, 4], dtype=int)]
-    assert np.allclose(
-        np.sqrt(np.mean(np.square(unreduced_))),
-        value.result(),
-    )
 
-    # try weirdly specific indices
-    forces = np.zeros((5, 4, 3))
-    for i in range(5):
-        g = data[i].result()
-        if len(g) >= 4:
-            forces[i, 3] = data[i].result().per_atom.forces[3]
-            forces[i, 1] = data[i].result().per_atom.forces[1]
+    # check extract-insert round-trip
+    geoms = data.clean().geometries().result()
+    data_dict = {"energy": energy, "forces": forces, "identifier": identifier}
+    insert(geoms, data_dict)
+    for geom, geom_ in zip(geoms, data.geometries().result()):
+        e, e_ = geom.energy, geom_.energy
+        f, f_ = geom.per_atom.forces, geom_.per_atom.forces
+        assert e == e_
+        if f is MISSING:
+            assert f_ is MISSING
         else:
-            forces[i, :] = np.nan
-    forces_ = data.get("forces", atom_indices=[3, 1]).result()
-    mask = np.invert(np.isnan(forces_))
-    assert np.allclose(
-        forces[mask],
-        forces_[mask],
-    )
+            assert np.allclose(f, f_)
+        if hasattr(geom, "identifier"):
+            assert geom.identifier == geom_.identifier
+
+    # check Dataset.get_per_atom
+    geoms = data.geometries().result()
+    forces = [geom.per_atom.forces for geom in geoms]
+    numbers = [geom.numbers for geom in geoms]
+
+    forces_ = data.get_per_atom("forces", elements=["Cu"])[0].result()
+    for n, f, f_ in zip(numbers, forces, forces_):
+        # filter on Cu
+        if f is MISSING:
+            assert f_ is MISSING
+        else:
+            assert np.allclose(f[n == 29], f_)
+
+    ids = [0, 2]
+    forces_ = data.get_per_atom("forces", atom_indices=ids)[0].result()
+    for f, f_ in zip(forces, forces_):
+        # filter on ids
+        if f is MISSING:
+            assert f_ is MISSING
+        else:
+            assert np.allclose(f[ids], f_)
 
-    # check order parameters
+    # check custom attributes
     s = Geometry.from_string(
         """
 3
-order_distance=2.3 energy=4
+distance=2.3 energy=4
 O 0 0 0
 H 1 1 1
 H -1 1 1
 """,
     )
-    states = [copy.deepcopy(s) for _ in range(5)]
-
-    states[1].order = {}
-    states[2].order = {"some": 4.2}
-    states[3].order["distance"] = 2.2
+    states = [s.copy() for _ in range(5)]
+    states[1].order = {"answer": 42}
+    states[2].some = 4.2
+    states[3].distance = 2.2
     data = Dataset(states)
-    some, distance = data.get("some", "distance")
+    some, distance, energy, order = data.get("some", "distance", "energy", "order")
     some = some.result()
     distance = distance.result()
-    assert np.isnan(some[1])
-    assert np.allclose(some[2], 4.2)
-    assert np.allclose(distance[0], 2.3)
-    assert np.allclose(distance[3], 2.2)
-    assert np.isnan(distance[2])
+    energy = energy.result()
+    order = order.result()
+    assert some == [MISSING, MISSING, 4.2, MISSING, MISSING]
+    assert distance == [2.3, 2.3, 2.3, 2.2, 2.3]
+    assert energy == [4, 4, 4, 4, 4]
+    assert order == [MISSING, {"answer": 42}, MISSING, MISSING, MISSING]
 
 
 def test_filter(dataset, dataset_h2):
-    data = dataset + dataset_h2 + Dataset([NullState])
-    data = data.shuffle()
-    assert data.filter("cell").length().result() == dataset.length().result()
-    assert data.filter("energy").length().result() == dataset.length().result()
-    assert data.filter("forces").length().result() == dataset.length().result()
+    geom = dataset[0].result()
+    geom.flagged = True
+    geom.reset()
+    data = dataset + dataset_h2 + Dataset([geom])
+
+    data0 = data.shuffle()
+    data1 = data.filter("cell")
+    data2 = data.filter("energy")
+    data3 = data.filter("forces")
+    data4 = data.filter("flagged")
+
+    l = dataset.length().result()
+    l_ = dataset_h2.length().result()
+    l0 = data0.length().result()
+    l1 = data1.length().result()
+    l2 = data2.length().result()
+    l3 = data3.length().result()
+    l4 = data4.length().result()
+
+    assert l0 == l + l_ + 1  # all geometries
+    assert l0 == l1  # cell always defined
+    assert l2 == l and l3 == l
+    assert l4 == 1

From 942ac3ca194bf6c4f7496888d89a2473deecfcb3 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 15 Apr 2026 15:22:00 +0200
Subject: [PATCH 08/16] Update compute.py

refactored compute functionality
- ComputeResult instance instead of tuple of stacked + padded arrays
- compute signature tweaked
- compare_results to compute metrics between ComputeResults/loose data

- functions return dict[str, list]
- Hamiltonian.compute returns future of ComputeResult
---
 psiflow/compute.py | 543 +++++++++++++++++++++------------------------
 1 file changed, 258 insertions(+), 285 deletions(-)

diff --git a/psiflow/compute.py b/psiflow/compute.py
index 22461ef..8774962 100644
--- a/psiflow/compute.py
+++ b/psiflow/compute.py
@@ -1,336 +1,309 @@
-import math
-from typing import Callable, ClassVar, Optional, Union
+from typing import Callable, ClassVar, Optional, Union, Type, Any, TypeAlias
+from collections.abc import Sequence
+from dataclasses import dataclass
 
 import numpy as np
-import typeguard
 from parsl.app.app import join_app, python_app
 from parsl.dataflow.futures import AppFuture, DataFuture
 
 import psiflow
-from psiflow.geometry import Geometry
+from psiflow.geometry import Geometry, PER_ATOM_FIELDS, DEFAULT_PROPERTIES, MISSING
 from psiflow.data import Dataset
-from psiflow.data.file import batch_frames
+from psiflow.functions import Function
+from psiflow.utils.apps import pack
 
+ComputeInput: TypeAlias = (
+    Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry
+)
 
-def _concatenate_multiple(*args: list[np.ndarray]) -> list[np.ndarray]:
-    """
-    Concatenate multiple lists of arrays.
 
-    Args:
-        *args: Lists of numpy arrays to concatenate.
+@dataclass
+class ComputeResult:
+    """Container to hold and manipulate the results from compute/apply operations."""
 
-    Returns:
-        list[np.ndarray]: List of concatenated arrays.
+    n_atoms: np.ndarray
+    data: dict[str, np.ndarray]
 
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
+    def __post_init__(self):
+        self.cutoffs = self.n_atoms.cumsum()
 
-    def pad_arrays(
-        arrays: list[np.ndarray],
-        pad_dimension: int = 1,
-    ) -> list[np.ndarray]:
-        ndims = np.array([len(a.shape) for a in arrays])
-        assert np.all(ndims == ndims[0])
-        assert np.all(pad_dimension < ndims)
-
-        pad_size = max([a.shape[pad_dimension] for a in arrays])
-        for i in range(len(arrays)):
-            shape = list(arrays[i].shape)
-            shape[pad_dimension] = pad_size - shape[pad_dimension]
-            padding = np.zeros(tuple(shape)) + np.nan
-            arrays[i] = np.concatenate((arrays[i], padding), axis=pad_dimension)
-        return arrays
-
-    narrays = len(args[0])
-    for arg in args:
-        assert isinstance(arg, list)
-    assert all([len(a) == narrays for a in args])
-
-    concatenated = []
-    for i in range(narrays):
-        arrays = [arg[i] for arg in args]
-        if len(arrays[0].shape) > 1:
-            pad_arrays(arrays)
-        concatenated.append(np.concatenate(tuple(arrays)))
-    return concatenated
-
-
-concatenate_multiple = python_app(_concatenate_multiple, executors=["default_threads"])
-
-
-def _aggregate_multiple(
-    *arrays_list,
-    coefficients: Optional[np.ndarray] = None,
-) -> list[np.ndarray]:
-    """
-    Aggregate multiple lists of arrays with optional coefficients.
+    def __getattr__(self, item):
+        """Enable parsl deferred_getitem on AppFuture"""
+        if item == "data":
+            raise AttributeError()  # prevent RecursionError from pickle
+        return self.data[item]
 
-    Args:
-        *arrays_list: Lists of arrays to aggregate.
-        coefficients: Optional coefficients for weighted aggregation.
+    @property
+    def keys(self) -> set[str]:
+        return set(self.data.keys())
 
-    Returns:
-        list[np.ndarray]: List of aggregated arrays.
+    def get(self, key: str, per_geom: bool = False) -> list | np.ndarray:
+        arr = self.data[key]
+        if not per_geom:
+            return arr
 
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
+        if self.cutoffs[-1] == arr.shape[0]:
+            data_list = np.array_split(arr, self.cutoffs[:-1])
+        else:
+            data_list = list(arr)
+        return data_list
+
+    @classmethod
+    def from_data(cls, n_atoms: Sequence[int], data: dict[str, list]):
+        values = {}
+        for k, v in data.items():
+            assert len(v) == n_atoms.size
+            if k in PER_ATOM_FIELDS:
+                values[k] = np.concatenate(v)
+            elif k in DEFAULT_PROPERTIES:
+                values[k] = np.stack(v)
+            else:
+                # TODO: what with custom properties?
+                values[k] = np.stack(v)
+        return cls(np.array(n_atoms), values)
+
+
+def _apply(
+    states: Sequence[Geometry],
+    function_cls: Type[Function],
+    inputs: Sequence = (),
+    parsl_resource_specification: dict = {},
+    **parameters,
+) -> ComputeResult:
+    assert function_cls is not None
+    function = function_cls(**parameters)  # psiflow.functions.Function subclass
+    output_dict = function.compute(states)
+    n_atoms = np.array([len(geom) for geom in states])
+    return ComputeResult.from_data(n_atoms, output_dict)
+
+
+@python_app(executors=["default_threads"])
+def concatenate_results(*results: ComputeResult) -> ComputeResult:
+    """"""
+    n_atoms_list = [result.n_atoms for result in results]
+    n_atoms = np.concatenate(n_atoms_list)
+
+    data = {}
+    for k in results[0].keys:
+        values = [result.get(k) for result in results]
+        data[k] = np.concatenate(values)
+
+    return ComputeResult(n_atoms, data)
+
+
+@python_app(executors=["default_threads"])
+def aggregate_results(
+    *results: ComputeResult, coefficients: Optional[np.ndarray] = None
+) -> ComputeResult:
+    """"""
+    # TODO: what with data fields that are not shared?
     if coefficients is None:
-        coefficients = np.ones(len(arrays_list))
-    else:
-        assert len(coefficients) == len(arrays_list)
+        coefficients = np.ones(len(results))
+    assert len(coefficients) == len(results)
 
-    results = [np.zeros(a.shape) for a in arrays_list[0]]
-    for i, arrays in enumerate(arrays_list):
-        for j, array in enumerate(arrays):
-            results[j] += coefficients[i] * array
-    return results
+    n_atoms = results[0].n_atoms
+    for result in results[1:]:
+        assert np.allclose(n_atoms, result.n_atoms)
 
+    data = {k: np.zeros_like(v) for k, v in results[0].data.items()}
+    for i, result in enumerate(results):
+        for k in data:
+            data[k] += result.get(k) * coefficients[i]
 
-aggregate_multiple = python_app(_aggregate_multiple, executors=["default_threads"])
+    return ComputeResult(n_atoms, data)
 
 
 @join_app
 def batch_apply(
-    apply_apps: tuple[Union[python_app, Callable]],
-    arg: Union[Dataset, list[Geometry]],
+    states: list[Geometry],
+    apply_apps: Sequence[Callable],
     batch_size: int,
-    length: int,
-    outputs: list = [],
-    reduce_func: Optional[python_app] = None,
-    **app_kwargs,
+    reduce_func: Callable,
 ) -> AppFuture:
-    """
-    Apply a set of apps to batches of data.
+    """Apply a set of apps to batches of data"""
+    # TODO: holds everything in memory -- what with very large datasets?
+    n_batch = len(states) // batch_size + 1 * (len(states) % batch_size > 0)
+    batches = [arr.tolist() for arr in np.array_split(states, n_batch)]
 
-    Args:
-        apply_apps: Tuple of PythonApps or Callables to apply.
-        arg: Dataset or list of Geometries to process.
-        batch_size: Size of each batch.
-        length: Total number of items to process.
-        outputs: List of output files.
-        reduce_func: Optional function to reduce results.
-        **app_kwargs: Additional keyword arguments for the apps.
-
-    Returns:
-        AppFuture: Future representing the result of batch application.
-
-    Note:
-        This function is wrapped as a Parsl join_app.
-    """
-    nbatches = math.ceil(length / batch_size)
-    batches = [psiflow.context().new_file("data_", ".xyz") for _ in range(nbatches)]
-    future = batch_frames(batch_size, inputs=[arg.extxyz], outputs=batches)
-    output_futures = []
-    for i in range(nbatches):
+    output = []
+    for batch in batches:
         futures = []
         for app in apply_apps:
-            f = app(
-                None,
-                inputs=[future.outputs[i]],
-                **app_kwargs,
-            )
-            futures.append(f)
-        reduced = reduce_func(*futures)
-        output_futures.append(reduced)
-    future = concatenate_multiple(*output_futures)
-    return future
-
-
-@python_app(executors=["default_threads"])
-def get_length(arg):
-    """
-    Get the length of the input argument.
-
-    Args:
-        arg: Input to get the length of.
-
-    Returns:
-        int: Length of the input.
+            future = app(batch)
+            futures.append(future)
+        future = reduce_func(*futures)
+        output.append(future)
 
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    if isinstance(arg, list):
-        return len(arg)
-    else:
-        return 1
+    return concatenate_results(*output)
 
 
 def compute(
-    arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
-    *apply_apps: Union[python_app, Callable],
-    outputs_: Union[str, list[str], tuple[str, ...], None] = None,
-    reduce_func: Union[python_app, Callable] = aggregate_multiple,
+    arg: ComputeInput,
+    apply_apps: Sequence[Callable],
+    reduce_func: Union[python_app, Callable] = aggregate_results,
     batch_size: Optional[int] = None,
-) -> Union[list[AppFuture], AppFuture]:
+) -> AppFuture:
     """
     Compute results by applying apps to the input data.
 
     Args:
         arg: Input data to compute on.
-        *apply_apps: Apps to apply to the data.
-        outputs_: Names of output quantities.
+        apply_apps: Apps to apply to the data.
         reduce_func: Function to reduce results.
         batch_size: Optional batch size for processing.
-
-    Returns:
-        Union[list[AppFuture], AppFuture]: Future(s) representing computation results.
     """
-    if type(outputs_) is str:
-        outputs_ = [outputs_]
-    if batch_size is not None:
-        if isinstance(arg, Dataset):
-            length = arg.length()
-        else:
-            length = get_length(arg)
-            # convert to Dataset for convenience
-            arg = Dataset(arg)
-        future = batch_apply(
-            apply_apps,
-            arg,
-            batch_size,
-            length,
-            outputs_=outputs_,
-            reduce_func=reduce_func,
-        )
-    else:
+    states = input_to_geometries(arg)
+    if batch_size is None:
         futures = []
-        if isinstance(arg, Dataset):
-            for app in apply_apps:
-                future = app(
-                    None,
-                    outputs_=outputs_,
-                    inputs=[arg.extxyz],
-                )
-                futures.append(future)
-        else:
-            for app in apply_apps:
-                future = app(
-                    arg,
-                    outputs_=outputs_,
-                    inputs=[],
-                )
-                futures.append(future)
+        for app in apply_apps:
+            future = app(states)
+            futures.append(future)
         future = reduce_func(*futures)
-    if len(outputs_) == 1:
-        return future[0]
     else:
-        return [future[i] for i in range(len(outputs_))]
-
-
-class Computable:
-    """
-    Base class for computable objects.
-
-    Attributes:
-        outputs (ClassVar[tuple[str, ...]]): Names of output quantities.
-        batch_size (ClassVar[Optional[int]]): Default batch size for computation.
-    """
-
-    outputs: ClassVar[tuple[str, ...]] = ()
-    batch_size: ClassVar[Optional[int]] = None
-
-    def compute(
-        self,
-        arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
-        *outputs: Optional[str],
-        batch_size: Optional[int] = -1,  # if -1: take class default
-    ) -> Union[list[AppFuture], AppFuture]:
-        """
-        Compute results for the given input.
-
-        Args:
-            arg: Input data to compute on.
-            outputs: Names of output quantities.
-            batch_size: Batch size for computation.
-
-        Returns:
-            Union[list[AppFuture], AppFuture]: Future(s) representing computation results.
-        """
-        raise NotImplementedError
-
-
-@typeguard.typechecked
-def _compute_rmse(
-    array0: np.ndarray,
-    array1: np.ndarray,
-    reduce: bool = True,
-) -> Union[float, np.ndarray]:
-    """
-    Compute the Root Mean Square Error (RMSE) between two arrays.
+        future = batch_apply(states, apply_apps, batch_size, reduce_func)
 
-    Args:
-        array0: First array.
-        array1: Second array.
-        reduce: Whether to reduce the result to a single value.
-
-    Returns:
-        Union[float, np.ndarray]: RMSE value(s).
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    assert array0.shape == array1.shape
-    assert np.all(np.isnan(array0) == np.isnan(array1))
-
-    se = (array0 - array1) ** 2
-    se = se.reshape(se.shape[0], -1)
-
-    if reduce:  # across both dimensions
-        mask = np.logical_not(np.isnan(se))
-        return float(np.sqrt(np.mean(se[mask])))
-    else:  # retain first dimension
-        if se.ndim == 1:
-            return se
-        else:
-            values = np.empty(len(se))
-            for i in range(len(se)):
-                if np.all(np.isnan(se[i])):
-                    values[i] = np.nan
-                else:
-                    mask = np.logical_not(np.isnan(se[i]))
-                    value = np.sqrt(np.mean(se[i][mask]))
-                    values[i] = value
-            return values
+    return future
 
 
-compute_rmse = python_app(_compute_rmse, executors=["default_threads"])
+metric_func_map: dict[str, Callable] = {
+    "RMSE": lambda arr1, arr2: np.mean((arr1 - arr2) ** 2) ** 0.5,
+    "MAE": lambda arr1, arr2: np.abs(arr1 - arr2).mean(),
+}
 
 
-@typeguard.typechecked
-def _compute_mae(
-    array0,
-    array1,
+def _compare_results(
+    result1: ComputeResult,
+    result2: Optional[ComputeResult] = None,
+    metric: str = "RMSE",
     reduce: bool = True,
-) -> Union[float, np.ndarray]:
-    """
-    Compute the Mean Absolute Error (MAE) between two arrays.
-
-    Args:
-        array0: First array.
-        array1: Second array.
-        reduce: Whether to reduce the result to a single value.
-
-    Returns:
-        Union[float, np.ndarray]: MAE value(s).
-
-    Note:
-        This function is wrapped as a Parsl app and executed using the default_threads executor.
-    """
-    assert array0.shape == array1.shape
-    mask0 = np.logical_not(np.isnan(array0))
-    mask1 = np.logical_not(np.isnan(array1))
-    assert np.all(mask0 == mask1)
-    ae = np.abs(array0 - array1)
-    to_reduce = tuple(range(1, array0.ndim))
-    mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce))
-    ae = ae[mask0].reshape(np.sum(1 * mask), -1)
-    if reduce:  # across both dimensions
-        return float(np.sqrt(np.mean(ae)))
-    else:  # retain first dimension
-        return np.sqrt(np.mean(ae, axis=1))
-
-
-compute_mae = python_app(_compute_mae, executors=["default_threads"])
+    **kwargs: list | np.ndarray,
+) -> dict:
+    """"""
+    # TODO: what with missing values?
+    assert (result2 is None) != (len(kwargs) == 0)  # xor
+    if kwargs:
+        result2 = ComputeResult.from_data(result1.n_atoms, kwargs)
+    elif not np.allclose(result1.cutoffs, result2.cutoffs):
+        raise ValueError("Results cannot be compared")
+
+    metric_func = metric_func_map[metric]
+    keys = result1.keys.intersection(result2.keys)
+
+    out = {}
+    for k in keys:
+        if reduce:
+            arr1, arr2 = result1.get(k), result2.get(k)
+            out[k] = metric_func(arr1, arr2)
+        else:
+            list1, list2 = result1.get(k, per_geom=True), result2.get(k, per_geom=True)
+            out[k] = [metric_func(v1, v2) for v1, v2 in zip(list1, list2)]
+
+    return out
+
+
+compare_results = python_app(_compare_results, executors=["default_threads"])
+
+
+# def _compute_rmse(
+#     array0: np.ndarray,
+#     array1: np.ndarray,
+#     reduce: bool = True,
+# ) -> Union[float, np.ndarray]:
+#     """
+#     Compute the Root Mean Square Error (RMSE) between two arrays.
+#
+#     Args:
+#         array0: First array.
+#         array1: Second array.
+#         reduce: Whether to reduce the result to a single value.
+#
+#     Returns:
+#         Union[float, np.ndarray]: RMSE value(s).
+#
+#     Note:
+#         This function is wrapped as a Parsl app and executed using the default_threads executor.
+#     """
+#     assert array0.shape == array1.shape
+#     assert np.all(np.isnan(array0) == np.isnan(array1))
+#
+#     se = (array0 - array1) ** 2
+#     se = se.reshape(se.shape[0], -1)
+#
+#     if reduce:  # across both dimensions
+#         mask = np.logical_not(np.isnan(se))
+#         return float(np.sqrt(np.mean(se[mask])))
+#     else:  # retain first dimension
+#         if se.ndim == 1:
+#             return se
+#         else:
+#             values = np.empty(len(se))
+#             for i in range(len(se)):
+#                 if np.all(np.isnan(se[i])):
+#                     values[i] = np.nan
+#                 else:
+#                     mask = np.logical_not(np.isnan(se[i]))
+#                     value = np.sqrt(np.mean(se[i][mask]))
+#                     values[i] = value
+#             return values
+#
+#
+# compute_rmse = python_app(_compute_rmse, executors=["default_threads"])
+#
+#
+# def _compute_mae(
+#     array0,
+#     array1,
+#     reduce: bool = True,
+# ) -> Union[float, np.ndarray]:
+#     """
+#     Compute the Mean Absolute Error (MAE) between two arrays.
+#
+#     Args:
+#         array0: First array.
+#         array1: Second array.
+#         reduce: Whether to reduce the result to a single value.
+#
+#     Returns:
+#         Union[float, np.ndarray]: MAE value(s).
+#
+#     Note:
+#         This function is wrapped as a Parsl app and executed using the default_threads executor.
+#     """
+#     assert array0.shape == array1.shape
+#     mask0 = np.logical_not(np.isnan(array0))
+#     mask1 = np.logical_not(np.isnan(array1))
+#     assert np.all(mask0 == mask1)
+#     ae = np.abs(array0 - array1)
+#     to_reduce = tuple(range(1, array0.ndim))
+#     mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce))
+#     ae = ae[mask0].reshape(np.sum(1 * mask), -1)
+#     if reduce:  # across both dimensions
+#         return float(np.sqrt(np.mean(ae)))
+#     else:  # retain first dimension
+#         return np.sqrt(np.mean(ae, axis=1))
+#
+#
+# compute_mae = python_app(_compute_mae, executors=["default_threads"])
+
+
+def input_to_geometries(data: ComputeInput) -> AppFuture:
+    """Convert ComputeInput into a sequence of geometries (as a future)"""
+    # Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry
+
+    @python_app(executors=["default_threads"])
+    def prep_input(data: Geometry | Sequence[Geometry]) -> list[Geometry]:
+        # make sure apply apps get consistent input
+        if isinstance(data, Geometry):
+            data = [data]
+        return data
+
+    if isinstance(data, Dataset):
+        return data.geometries()
+    elif isinstance(data, Geometry):
+        return pack(data)
+    elif isinstance(data, AppFuture):
+        return prep_input(data)
+    elif isinstance(data, list):
+        return pack(*data)
+    else:
+        assert False

From 54c4808e80b28e1b67927acb3933bfb411a9008a Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 15 Apr 2026 16:22:10 +0200
Subject: [PATCH 09/16] update Hamiltonian API

- Hamiltonian.compute returns ComputeResult
- Hamiltonian.evaluate labels a dataset
- moving towards deprecating Dataset.evaluate
---
 psiflow/compute.py      | 12 +++++++
 psiflow/data/dataset.py |  1 +
 psiflow/functions.py    | 50 +++++++----------------------
 psiflow/hamiltonians.py | 71 +++++++++++++++++------------------------
 4 files changed, 53 insertions(+), 81 deletions(-)

diff --git a/psiflow/compute.py b/psiflow/compute.py
index 8774962..de26fb5 100644
--- a/psiflow/compute.py
+++ b/psiflow/compute.py
@@ -9,6 +9,7 @@
 import psiflow
 from psiflow.geometry import Geometry, PER_ATOM_FIELDS, DEFAULT_PROPERTIES, MISSING
 from psiflow.data import Dataset
+from psiflow.data.utils import insert
 from psiflow.functions import Function
 from psiflow.utils.apps import pack
 
@@ -48,6 +49,9 @@ def get(self, key: str, per_geom: bool = False) -> list | np.ndarray:
             data_list = list(arr)
         return data_list
 
+    def to_dict(self) -> dict[str, list]:
+        return {k: self.get(k, per_geom=True) for k in self.keys}
+
     @classmethod
     def from_data(cls, n_atoms: Sequence[int], data: dict[str, list]):
         values = {}
@@ -204,6 +208,7 @@ def _compare_results(
 compare_results = python_app(_compare_results, executors=["default_threads"])
 
 
+# TODO: cleanup
 # def _compute_rmse(
 #     array0: np.ndarray,
 #     array1: np.ndarray,
@@ -307,3 +312,10 @@ def prep_input(data: Geometry | Sequence[Geometry]) -> list[Geometry]:
         return pack(*data)
     else:
         assert False
+
+
+@python_app(executors=["default_threads"])
+def insert_results(states: Sequence[Geometry], result: ComputeResult) -> Sequence[Geometry]:
+    """"""
+    insert(states, result.to_dict())
+    return states
diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index b5ad65c..1d0e8da 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -197,6 +197,7 @@ def get_per_atom(
         )
         return tuple(future_dict[q] for q in quantities)
 
+    # TODO: cleanup?
     # def evaluate(
     #     self, computable: "Computable", batch_size: Optional[int] = None
     # ) -> Dataset:
diff --git a/psiflow/functions.py b/psiflow/functions.py
index 13d3ebb..8d22978 100644
--- a/psiflow/functions.py
+++ b/psiflow/functions.py
@@ -4,7 +4,7 @@
 import logging
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional, Type, Union, Any
+from typing import Optional, Union
 from collections.abc import Sequence
 
 import numpy as np
@@ -42,18 +42,15 @@ def __call__(
 
     def compute(
         self,
-        geometries: list[Geometry],
-    ) -> dict[str, float | np.ndarray]:
+        geometries: Sequence[Geometry],
+    ) -> dict[str, list[float | np.ndarray]]:
         """Evaluate multiple geometries and merge data into single arrays"""
-        value, grad_pos, grad_cell = create_outputs(self.outputs, geometries)
+        data = {k: [] for k in self.outputs}
         for i, geometry in enumerate(geometries):
-            if geometry == NullState:  # TODO: remove
-                continue
             out = self(geometry)
-            value[i] = out["energy"]
-            grad_pos[i, : len(geometry)] = out["forces"]
-            grad_cell[i] = out["stress"]
-        return {"energy": value, "forces": grad_pos, "stress": grad_cell}
+            for k, v in out.items():
+                data[k].append(v)
+        return data
 
 
 @dataclass(frozen=True)
@@ -144,7 +141,7 @@ def __call__(
 
     @staticmethod
     def _geometry_to_key(geometry: Geometry) -> tuple:
-        return tuple([geometry.periodic]) + tuple(geometry.per_atom.numbers)
+        return tuple([geometry.periodic]) + tuple(geometry.numbers)
 
 
 @dataclass(frozen=True)
@@ -153,7 +150,7 @@ def __call__(
         self,
         geometry: Geometry,
     ) -> dict[str, float | np.ndarray]:
-        return format_output(geometry, 0)
+        return format_output(geometry, 0.0)
 
 
 @dataclass(frozen=True)
@@ -221,7 +218,7 @@ class DispersionFunction(Function):
     method: str
     damping: str = "d3bj"
     params_tweaks: Optional[dict[str, float]] = None
-    num_threads: int = 1
+    num_threads: int = 4
 
     calc: Calculator = field(init=False)
 
@@ -247,32 +244,7 @@ def __call__(
         return format_output(geometry, **self.calc.results)
 
 
-def _apply(
-    arg: Union[Geometry, list[Geometry], None],
-    outputs_: tuple[str, ...],
-    function_cls: Type[Function],
-    wait_for: Any = None,
-    inputs: Sequence = (),
-    parsl_resource_specification: dict = {},
-    **parameters,
-) -> Optional[list[np.ndarray]]:
-    from psiflow.data.utils import _read_frames
-
-    # TODO: why pass geoms through arg or inputs?
-    #  should we not have a single keyword for input structures?
-    #  or at least reserve inputs for arbitrary futures (to wait on)?
-
-    assert function_cls is not None
-    if arg is None:
-        states = _read_frames(inputs=[inputs[0]])
-    elif not isinstance(arg, list):
-        states = [arg]
-    else:
-        states = arg
-    function = function_cls(**parameters)
-    output_dict = function.compute(states)
-    output_arrays = [output_dict[k] for k in outputs_]
-    return output_arrays
+
 
 
 def function_from_json(path: Union[str, Path], **kwargs) -> Function:
diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py
index e337daa..b90e7ca 100644
--- a/psiflow/hamiltonians.py
+++ b/psiflow/hamiltonians.py
@@ -12,7 +12,15 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.compute import Computable, aggregate_multiple, compute
+from psiflow.data.file import write_frames
+from psiflow.compute import (
+    aggregate_results,
+    compute,
+    _apply,
+    ComputeInput,
+    ComputeResult,
+    insert_results,
+)
 from psiflow.functions import (
     EinsteinCrystalFunction,
     HarmonicFunction,
@@ -20,18 +28,12 @@
     PlumedFunction,
     ZeroFunction,
     DispersionFunction,
-    _apply,
 )
 from psiflow.geometry import Geometry
 from psiflow.utils._plumed import remove_comments_printflush
 from psiflow.utils.apps import get_attribute
 from psiflow.utils.io import dump_json
 
-
-# TODO: comparison logic in __eq__ only works for hamiltonians without futures
-# TODO: dataclasses automatically generate __eq__
-
-
 logger = logging.getLogger(__name__)  # logging per module
 
 
@@ -40,23 +42,24 @@
 apply_modelevaluation = python_app(_apply, executors=["ModelEvaluation"])
 
 
-# TODO: why have the Computable class?
-class Hamiltonian(Computable):
-    batch_size = 1000
+# TODO: evaluate method for datasets?
+class Hamiltonian:
     outputs: ClassVar[tuple] = ("energy", "forces", "stress")
     function_name: ClassVar[str]
 
-    def compute(
-        self,
-        arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
-        *outputs: Optional[str],
-        batch_size: Optional[int] = -1,  # if -1: take class default TODO: why?
-    ) -> Union[list[AppFuture], AppFuture]:
-        if len(outputs) == 0:
-            outputs = tuple(self.__class__.outputs)
-        if batch_size == -1:
-            batch_size = self.__class__.batch_size
-        return compute(arg, self.get_app(), outputs_=outputs, batch_size=batch_size)
+    def compute(self, arg: ComputeInput, batch_size: int = 1000) -> AppFuture:
+        """Evaluate hamiltonian function on input data and return ComputeResult"""
+        return compute(arg, [self.get_app()], batch_size=batch_size)
+
+    def evaluate(self, arg: Dataset, batch_size: int = 1000) -> Dataset:
+        """Return new dataset labelled with hamiltonian function"""
+        geoms = arg.geometries()
+        result = self.compute(geoms, batch_size)
+        future = insert_results(geoms, result)
+
+        file = psiflow.context().new_file("data_", ".xyz")
+        extxyz = write_frames(future, outputs=[file]).outputs[0]
+        return Dataset(extxyz=extxyz)
 
     def __eq__(self, other: "Hamiltonian") -> bool:
         raise NotImplementedError
@@ -108,25 +111,12 @@ def __init__(
         self.hamiltonians = list(hamiltonians)
         self.coefficients = list(coefficients)
 
-    def compute(  # override compute for efficient batching
-        self,
-        arg: Union[Dataset, AppFuture[list], list, AppFuture, Geometry],
-        outputs: Union[str, list[str], None] = None,
-        batch_size: Optional[int] = None,
-    ) -> Union[list[AppFuture], AppFuture]:
-        if outputs is None:
-            outputs = list(self.__class__.outputs)
+    def compute(self, arg: ComputeInput, batch_size: int = 1000) -> AppFuture:
         apply_apps = [h.get_app() for h in self.hamiltonians]
         reduce_func = partial(
-            aggregate_multiple, coefficients=np.array(self.coefficients)
-        )
-        return compute(
-            arg,
-            *apply_apps,
-            outputs_=outputs,
-            reduce_func=reduce_func,
-            batch_size=batch_size,
+            aggregate_results, coefficients=np.array(self.coefficients)
         )
+        return compute(arg, apply_apps, reduce_func=reduce_func, batch_size=batch_size)
 
     def __eq__(self, hamiltonian: Hamiltonian) -> bool:
         if type(hamiltonian) is not MixtureHamiltonian:
@@ -224,9 +214,6 @@ def serialize(self, **kwargs) -> list[DataFuture]:
 class Zero(Hamiltonian):
     function_name: ClassVar[str] = "ZeroFunction"
 
-    def __init__(self):
-        pass
-
     def get_app(self) -> Callable:
         return partial(apply_threads, function_cls=ZeroFunction)
 
@@ -312,7 +299,7 @@ def get_app(self) -> Callable:
         return partial(
             apply_htex,
             function_cls=PlumedFunction,
-            wait_for=self.external,
+            inputs=[self.external],
             **self.parameters(),
         )
 
@@ -429,7 +416,7 @@ def get_app(self) -> Callable:
         return partial(
             apply_modelevaluation,
             function_cls=MACEFunction,
-            wait_for=self.external,
+            inputs=[self.external],
             parsl_resource_specification=evaluation.wq_resources(1),
             **self.parameters(include_env=True),
         )

From bb98a49e302830adb38b97cf1512f9d370213515 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 15 Apr 2026 16:23:15 +0200
Subject: [PATCH 10/16] Update test_function.py

---
 tests/test_function.py | 256 +++++++++++++++++++++++------------------
 1 file changed, 145 insertions(+), 111 deletions(-)

diff --git a/tests/test_function.py b/tests/test_function.py
index 39ec1ca..22b305e 100644
--- a/tests/test_function.py
+++ b/tests/test_function.py
@@ -5,11 +5,14 @@
 from parsl.data_provider.files import File  # type: ignore
 
 import psiflow
+from psiflow.geometry import MISSING
 from psiflow.functions import (
     EinsteinCrystalFunction,
     HarmonicFunction,
     MACEFunction,
     PlumedFunction,
+    DispersionFunction,
+    ZeroFunction,
     function_from_json,
 )
 from psiflow.hamiltonians import (
@@ -18,6 +21,7 @@
     MixtureHamiltonian,
     PlumedHamiltonian,
     D3Hamiltonian,
+    MACEHamiltonian,
     Zero,
 )
 from psiflow.utils._plumed import remove_comments_printflush, set_path_in_plumed
@@ -25,34 +29,39 @@
 from psiflow.utils.io import dump_json
 from psiflow.utils.apps import get_attribute
 from psiflow.serialization import CLS_KEY, deserialize_hook
+from psiflow.compute import compare_results
 
 
 def test_einstein_crystal(dataset):
     future_geom = dataset[0]
+    geom = future_geom.result()
     test_dataset = dataset[:4].reset()
-    geometries = test_dataset.geometries().result()
+    test_geoms = test_dataset.geometries().result()
 
     function = EinsteinCrystalFunction(
-        force_constant=1.0, centers=future_geom.result().per_atom.positions
+        force_constant=1.0, centers=geom.per_atom.positions, volume=geom.volume
     )
     hamiltonian = EinsteinCrystal.from_geometry(future_geom, force_constant=1.0)
 
-    energy, forces, stress = function.compute(geometries).values()
-    forces1, stress1, energy1 = hamiltonian.compute(
-        test_dataset, "forces", "stress", "energy"
-    )
-    forces2 = hamiltonian.compute(test_dataset, "forces", batch_size=3)
-
-    assert np.all(energy >= 0)
-    assert energy[0] == 0
-    assert geometries[0].energy is None
-    assert np.allclose(  # forces point to centers
-        forces,
-        function.centers.reshape(1, -1, 3) - test_dataset.get("positions").result(),
-    )
-    assert np.allclose(energy1.result(), energy)
-    assert np.allclose(forces1.result(), forces)
-    assert np.allclose(forces1.result(), forces2.result())
+    out0 = function.compute(test_geoms)
+    assert all(e >= 0 for e in out0["energy"])
+    assert out0["energy"][0] == 0
+    assert test_geoms[0].energy is MISSING
+    positions = test_dataset.get("positions")[0].result()
+    centers = function.centers.reshape(1, -1, 3)
+    for f, p in zip(out0["forces"], positions):
+        assert np.allclose(f, centers - p)  # forces point to centers
+
+    out1 = hamiltonian.compute(test_dataset)
+    out2 = hamiltonian.compute(test_dataset, batch_size=3)
+
+    error1 = compare_results(out1, **out0)
+    error2 = compare_results(out1, out2)
+
+    for k, rmse in error1.result().items():
+        assert np.isclose(rmse, 0.0)
+    for k, rmse in error2.result().items():
+        assert np.isclose(rmse, 0.0)
 
 
 def test_einstein_force(dataset):
@@ -72,7 +81,9 @@ def test_einstein_force(dataset):
                 forces_[i, j] = (-1.0) * sign * force_constant * delta
                 forces.append(forces_)
 
-    forces_ = einstein.compute(geometries, "forces").result()
+    # per atom arrays are concatenated - not stacked
+    forces_ = einstein.compute(geometries).result().forces
+    forces_ = forces_.reshape(-1, len(reference), 3)
     assert np.allclose(forces, forces_)
 
 
@@ -91,9 +102,7 @@ def test_get_filename_hills():
     plumed_input = remove_comments_printflush(plumed_input)
     plumed_input = set_path_in_plumed(plumed_input, "METAD", "/tmp/my_input")
     plumed_input = set_path_in_plumed(plumed_input, "METADD", "/tmp/my_input")
-    assert (
-        plumed_input.strip()
-        == """
+    assert plumed_input.strip() == """
 RESTART
 UNITS LENGTH=A ENERGY=kj/mol TIME=fs
 CV: VOLUME
@@ -101,12 +110,12 @@ def test_get_filename_hills():
 METAD ARG=CV0 SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad FILE=/tmp/my_input sdld
 METADD ARG=CV SIGMA=100 HEIGHT=2 PACE=50 LABEL=metad sdld FILE=/tmp/my_input
 """.strip()
-    )
 
 
 def test_plumed_function(tmp_path, dataset, dataset_h2):
     data = dataset + dataset_h2
     geometries = data.geometries().result()
+    f = 1 / (kJ / mol) * 10  # eV --> kJ/mol and nm --> A
     plumed_str = """
 D1: DISTANCE ATOMS=1,2 NOPBC
 CV: BIASVALUE arg=D1
@@ -114,12 +123,13 @@ def test_plumed_function(tmp_path, dataset, dataset_h2):
     function = PlumedFunction(plumed_str)
     outputs = function.compute(geometries)
 
-    f = 1 / (kJ / mol) * 10  # eV --> kJ/mol and nm --> A
-    positions = data.get("positions").result()
-    manual = np.linalg.norm(positions[:, 0, :] - positions[:, 1, :], axis=1)
-    assert np.allclose(outputs["energy"] * f, manual)
-    gradient = (positions[:, 0, :] - positions[:, 1, :]) / manual.reshape(-1, 1)
-    assert np.allclose(outputs["forces"][:, 0, :] * f, gradient * (-1.0))
+    [positions] = data.get("positions")
+    manual = np.array([np.linalg.norm(p[0] - p[1]) for p in positions.result()])
+    gradient = [(p[0] - p[1]) / d for p, d in zip(positions.result(), manual)]
+    assert np.allclose(np.array(outputs["energy"]) * f, manual)
+    for grad, force in zip(gradient, outputs["forces"]):
+        # only compare first force component
+        assert np.allclose(force[0] * f, grad * (-1.0))
 
     # test volume bias
     geometries = dataset.geometries().result()
@@ -132,12 +142,12 @@ def test_plumed_function(tmp_path, dataset, dataset_h2):
     function = PlumedFunction(plumed_input)
     hamiltonian = PlumedHamiltonian(plumed_input)
 
-    energy, forces, stress = function.compute(geometries).values()
-    energy_, forces_, stress_ = hamiltonian.compute(dataset)
+    data = function.compute(geometries)
+    out = hamiltonian.compute(dataset).result()
     energy_manual = (volumes - 50) ** 2 * (kJ / mol) / 2
-    assert np.allclose(energy, energy_manual)
-    assert np.allclose(energy, energy_.result())
-    assert np.allclose(stress, stress_.result())
+    assert np.allclose(data["energy"], energy_manual)
+    assert np.allclose(data["energy"], out.energy)
+    assert np.allclose(data["stress"], out.stress)
 
     # use external grid as bias, check that file is read
     test_set = dataset[:10]
@@ -154,22 +164,19 @@ def test_plumed_function(tmp_path, dataset, dataset_h2):
 UNITS LENGTH=A ENERGY=kj/mol TIME=fs
 CV: DISTANCE ATOMS=1,2 NOPBC
 METAD ARG=CV PACE=1 SIGMA=3 HEIGHT=342 FILE={}
-""".format(
-        path_hills
-    )
+""".format(path_hills)
     hamiltonian = PlumedHamiltonian(plumed_input, File(path_hills))
-    energy = hamiltonian.compute(test_set, "energy")
+    energy = hamiltonian.compute(test_set).result().energy
 
     # compute bias energy manually
-    positions = test_set.get("positions").result()
-    vecs = positions[:, 0, :] - positions[:, 1, :]
-    distance = np.linalg.norm(vecs, axis=1).reshape(-1, 1)
     sigma = 2 * np.ones((1, 2))
     height = np.array([70, 70]).reshape(1, -1) * (kJ / mol)  # unit consistency
     center = np.array([2.5, 2.6]).reshape(1, -1)
-    energy_per_hill = height * np.exp((distance - center) ** 2 / (-2 * sigma**2))
+    positions = test_set.get("positions")[0].result()
+    dists = np.array([np.linalg.norm(p[0] - p[1]) for p in positions]).reshape(-1, 1)
+    energy_per_hill = height * np.exp((dists - center) ** 2 / (-2 * sigma**2))
     energy_ = np.sum(energy_per_hill, axis=1)
-    assert np.allclose(energy.result(), energy_, atol=1e-3)
+    assert np.allclose(energy, energy_, atol=1e-3)
 
     # check that hills file didn't change
     with open(path_hills, "r") as f:
@@ -177,7 +184,7 @@ def test_plumed_function(tmp_path, dataset, dataset_h2):
 
 
 def test_harmonic_function(dataset):
-    test_set = dataset[:10]
+    test_set = dataset[:5]
     geometries = test_set.geometries().result()
     reference = geometries[0]
     hess = np.eye(3 * len(reference))
@@ -185,30 +192,59 @@ def test_harmonic_function(dataset):
     function = HarmonicFunction(reference.per_atom.positions, hess, reference.energy)
     einstein = EinsteinCrystalFunction(1.0, reference.per_atom.positions)
     harmonic = Harmonic.from_geometry(dataset[0], hess)
+    assert Harmonic.outputs == ("energy", "forces", "stress")
 
-    energy, forces, _ = function.compute(geometries).values()
-    energy1, forces1, _ = einstein.compute(geometries).values()
-    energy2, forces2, _ = harmonic.compute(test_set)
+    out0 = function.compute(geometries)
+    out1 = einstein.compute(geometries)
+    result = harmonic.compute(test_set)
+    err0 = compare_results(result, **out0).result()
+    err1 = compare_results(result, **out1).result()
 
-    assert Harmonic.outputs == ("energy", "forces", "stress")
-    assert np.allclose(energy - reference.energy, energy1)
-    assert np.allclose(forces, forces1)
-    assert np.allclose(energy2.result() - reference.energy, energy1, atol=1e-5)
-    assert np.allclose(forces2.result(), forces1)
+    assert np.allclose(np.array(out0["energy"]) - reference.energy, out1["energy"])
+    assert np.isclose(err0["energy"], 0)
+    assert np.isclose(err1["energy"], reference.energy)
+    assert np.allclose(out0["forces"], out1["forces"])
+    assert np.isclose(err0["forces"], 0)
+    assert np.isclose(err1["forces"], 0)
 
 
 def test_dispersion_function(dataset):
+    test_set = dataset[:3]
+    function = DispersionFunction(method="pbe", damping="d3bj")
     hamiltonian = D3Hamiltonian(method="pbe", damping="d3bj")
-    energy = hamiltonian.compute(dataset[-1], "energy").result()
-    assert energy is not None
-    assert energy < 0.0  # dispersion is attractive
 
-    subset = dataset[:3]
-    data = subset.evaluate(hamiltonian)
-    energy, forces = hamiltonian.compute(subset, "energy", "forces")
+    out = function.compute(test_set.geometries().result())
+    result = hamiltonian.compute(test_set)
+    error = compare_results(result, **out)
+    for k, rmse in error.result().items():
+        assert np.isclose(rmse, 0.0)
+    assert np.all(np.array(out["energy"]) < 0.0)  # dispersion is attractive
+
+
+def test_zero(dataset):
+    test_set = dataset[:10]
+    function = ZeroFunction()
+    hamiltonian = Zero()
+
+    out = function.compute(test_set.geometries().result())
+    result = hamiltonian.compute(test_set).result()
+    error = compare_results(result, **out)
+    for k, rmse in error.result().items():
+        assert np.isclose(rmse, 0.0)
+    assert np.allclose(result.energy, 0.0)
+    assert np.allclose(result.forces, 0.0)
 
-    assert np.allclose(data.get("energy").result(), energy.result())
-    assert len(forces.result().shape) == 3
+
+def test_mace(dataset, mace_foundation):
+    test_set = dataset[:3]
+    function = MACEFunction(mace_foundation, 1, "cpu", "float32")
+    hamiltonian = MACEHamiltonian(mace_foundation)
+
+    out = function.compute(test_set.geometries().result())
+    result = hamiltonian.compute(test_set).result()
+    error = compare_results(result, **out)
+    for k, rmse in error.result().items():
+        assert np.isclose(rmse, 0.0, atol=1e-6)  # single precision
 
 
 def test_hamiltonian_arithmetic(dataset):
@@ -234,11 +270,11 @@ def test_hamiltonian_arithmetic(dataset):
     zero = Zero()
     assert hamiltonian == hamiltonian + zero
     assert 2 * hamiltonian + zero == 2 * hamiltonian
-    scaled_m = 0.5 * hamiltonian
+    scaled_h = 0.5 * hamiltonian
     scaled_h1 = EinsteinCrystal.from_geometry(future, 0.5)
-    assert len(scaled_m) == 1
-    assert scaled_m.get_coefficient(hamiltonian) == 0.5
-    assert scaled_m.get_coefficient(scaled_h1) is None
+    assert len(scaled_h) == 1
+    assert scaled_h.get_coefficient(hamiltonian) == 0.5
+    assert scaled_h.get_coefficient(scaled_h1) is None
     scaled_h2 = EinsteinCrystal.from_geometry(future, 4.0)
     mixture = hamiltonian + scaled_h2
     assert len(mixture) == 2
@@ -246,32 +282,26 @@ def test_hamiltonian_arithmetic(dataset):
     assert mixture.get_coefficients(mixture) == (1, 1)
     assert mixture.get_coefficients(hamiltonian + scaled_h1) is None
 
-    energy_m = scaled_m.compute(test_set, "energy")
-    energy_h1 = scaled_h1.compute(test_set, "energy")
-    energy, forces, _ = hamiltonian.compute(test_set)
-    energy_, forces_, _ = mixture.compute(test_set)
-    assert np.allclose(energy_m.result(), energy_h1.result())
-    assert np.allclose(energy_.result(), 5 * energy.result())
-    assert np.allclose(forces_.result(), 5 * forces.result())
+    result_h = scaled_h.compute(test_set)
+    result_h1 = scaled_h1.compute(test_set)
+    error = compare_results(result_h, result_h1)
+    out = hamiltonian.compute(test_set)
+    out_ = mixture.compute(test_set)
+    error, out, out_ = error.result(), out.result(), out_.result()
 
-    energy, forces, stress = zero.compute(test_set)
-    assert np.allclose(energy.result(), 0.0)
-    geometries = [dataset[i].result() for i in [0, -1]]
-    natoms = [len(geometry) for geometry in geometries]
-    forces = zero.compute(geometries, "forces", batch_size=1).result()
-    assert np.all(forces[0, : natoms[0]] == 0.0)
-    assert np.all(forces[-1, : natoms[1]] == 0.0)
+    for k, v in error.items():
+        assert np.isclose(v, 0)
+    assert np.allclose(out.energy * 5, out_.energy)
+    assert np.allclose(out.forces * 5, out_.forces)
 
-
-def test_subtract(dataset):
-    einstein = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0)
+    # check subtraction
+    einstein = EinsteinCrystal.from_geometry(future, force_constant=1.0)
     h = einstein - einstein
     assert isinstance(h, MixtureHamiltonian)
     assert np.allclose(h.coefficients, 0.0)
 
 
 def test_hamiltonian_serialize(dataset):
-
     data = json.loads(psiflow.serialize(Zero()).result())
     assert data[CLS_KEY] == "Zero"
     zero = psiflow.deserialize(json.dumps(data)).result()
@@ -281,9 +311,7 @@ def test_hamiltonian_serialize(dataset):
     UNITS LENGTH=A ENERGY=kj/mol TIME=fs
     CV: VOLUME
     RESTRAINT ARG=CV AT={center} KAPPA={kappa}
-    """.format(
-        center=100, kappa=1 / (kJ / mol)
-    )
+    """.format(center=100, kappa=1 / (kJ / mol))
     plumed = PlumedHamiltonian(plumed_input)
     einstein = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0)
 
@@ -308,27 +336,12 @@ def test_hamiltonian_serialize(dataset):
             assert h == h_
 
     test_set = dataset[:10]
-    energy_e = einstein.compute(test_set, "energy")
-    energy_e_ = einstein_.compute(test_set, "energy")
-    energy_m = mixed.compute(test_set, "energy")
-    energy_m_ = mixed_.compute(test_set, "energy")
-    assert np.allclose(energy_e.result(), energy_e_.result())
-    assert np.allclose(energy_m.result(), energy_m_.result())
-
-
-def test_evaluate(dataset):
-    hamiltonian = EinsteinCrystal.from_geometry(geometry=dataset[0], force_constant=1.0)
-    data = dataset[:10].reset()
-    evaluated = data.evaluate(hamiltonian, batch_size=None)
-    evaluated_ = data.evaluate(hamiltonian, batch_size=2)
-    energy = hamiltonian.compute(evaluated, "energy")
-
-    geometries = evaluated.geometries().result()
-    geometries_ = evaluated_.geometries().result()
-    for geom, geom_, e in zip(geometries, geometries_, energy.result()):
-        assert not np.all(np.isnan(geom.per_atom.forces))
-        assert geom == geom_
-        assert geom.energy == e
+    out1 = einstein.compute(test_set)
+    out1_ = einstein_.compute(test_set)
+    out2 = mixed.compute(test_set)
+    out2_ = mixed_.compute(test_set)
+    assert np.allclose(out1.result().energy, out1_.result().energy)
+    assert np.allclose(out2.result().energy, out2_.result().energy)
 
 
 def test_json_dump():
@@ -368,15 +381,36 @@ def test_function_from_json(tmp_path, dataset):
 UNITS LENGTH=A ENERGY=kj/mol TIME=fs
 CV: DISTANCE ATOMS=1,2 NOPBC
 METAD ARG=CV PACE=1 SIGMA=3 HEIGHT=342 FILE={}
-""".format(
-        path_hills
-    )
+""".format(path_hills)
     hamiltonian = PlumedHamiltonian(plumed_input, File(path_hills))
 
     future = hamiltonian.serialize_function()
     future.result()  # ensure file exists
     function = function_from_json(future.filepath)
 
-    energies = hamiltonian.compute(dataset, "energy")
+    energies = hamiltonian.compute(dataset).result().energy
     energies_ = function.compute(dataset.geometries().result())["energy"]
-    assert np.allclose(energies.result(), energies_)
+    assert np.allclose(energies, energies_)
+
+
+def test_evaluate(dataset):
+    test_set = dataset[:10].reset()
+    hamiltonian = EinsteinCrystal.from_geometry(dataset[0], force_constant=1.0)
+
+    out0 = hamiltonian.evaluate(test_set)
+    out1 = hamiltonian.evaluate(test_set, batch_size=None)
+    result = hamiltonian.compute(test_set)
+
+    geoms0 = out0.geometries().result()
+    geoms1 = out1.geometries().result()
+    data = result.result().to_dict()
+
+    for i in range(len(geoms0)):
+        geom0, geom1 = geoms0[i], geoms1[i]
+        assert np.isclose(geom0.energy, geom1.energy)
+        assert np.isclose(geom0.energy, data["energy"][i])
+        assert np.allclose(geom0.per_atom.forces, geom1.per_atom.forces)
+        assert np.allclose(geom0.per_atom.forces, data["forces"][i])
+        assert np.allclose(geom0.stress, geom1.stress)
+        assert np.allclose(geom0.stress, data["stress"][i])
+

From 0e5ebdd6b8bcb6ab6d6e15809497799ad023ce30 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 15 Apr 2026 18:25:19 +0200
Subject: [PATCH 11/16] reference API update + tests

Some streamlining of the Reference methods:
- __call__ labels a single geometry
- evaluate labels a Dataset (analogous to Hamiltonian)
- compute was removed (for now) -similar functionality through Reference.evaluate(dataset).get('energy')

In which scenario do you want loose energies/forces?
---
 psiflow/execution.py           |   2 +-
 psiflow/geometry.py            |   5 +-
 psiflow/hamiltonians.py        |   7 +--
 psiflow/reference/cp2k_.py     |  15 ++---
 psiflow/reference/reference.py | 106 ++++++++++++---------------------
 tests/test_reference.py        |  71 ++++++++++++----------
 6 files changed, 87 insertions(+), 119 deletions(-)

diff --git a/psiflow/execution.py b/psiflow/execution.py
index 26e7f3a..812a69e 100644
--- a/psiflow/execution.py
+++ b/psiflow/execution.py
@@ -345,7 +345,7 @@ def wrap_in_timeout(self, command: str) -> str:
             return command  # noop
 
         # send SIGTERM after max_runtime, follow with SIGKILL 30s later
-        return f"timeout -k 30s {self.max_runtime}s {command}"
+        return f"timeout -v -k 30s {self.max_runtime}s {command}"
 
     # def wrap_in_srun(self, command: str) -> str:
     #     # TODO: stub -- this does not work
diff --git a/psiflow/geometry.py b/psiflow/geometry.py
index 9c0cd29..d162095 100644
--- a/psiflow/geometry.py
+++ b/psiflow/geometry.py
@@ -224,9 +224,8 @@ def reset(self) -> None:
         Reset all computed properties of the geometry to their default values.
         """
         self.per_atom.reset()
-        for k in ("energy", "stress"):
-            if hasattr(self, k):
-                delattr(self, k)
+        for k in {"energy", "stress"}.intersection(self.attributes()):
+            delattr(self, k)
 
     def clean(self) -> None:
         """
diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py
index b90e7ca..dd65fc4 100644
--- a/psiflow/hamiltonians.py
+++ b/psiflow/hamiltonians.py
@@ -12,7 +12,6 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.data.file import write_frames
 from psiflow.compute import (
     aggregate_results,
     compute,
@@ -42,7 +41,6 @@
 apply_modelevaluation = python_app(_apply, executors=["ModelEvaluation"])
 
 
-# TODO: evaluate method for datasets?
 class Hamiltonian:
     outputs: ClassVar[tuple] = ("energy", "forces", "stress")
     function_name: ClassVar[str]
@@ -56,10 +54,7 @@ def evaluate(self, arg: Dataset, batch_size: int = 1000) -> Dataset:
         geoms = arg.geometries()
         result = self.compute(geoms, batch_size)
         future = insert_results(geoms, result)
-
-        file = psiflow.context().new_file("data_", ".xyz")
-        extxyz = write_frames(future, outputs=[file]).outputs[0]
-        return Dataset(extxyz=extxyz)
+        return Dataset(future)
 
     def __eq__(self, other: "Hamiltonian") -> bool:
         raise NotImplementedError
diff --git a/psiflow/reference/cp2k_.py b/psiflow/reference/cp2k_.py
index 014a1e4..44412f6 100644
--- a/psiflow/reference/cp2k_.py
+++ b/psiflow/reference/cp2k_.py
@@ -4,8 +4,6 @@
 from typing import Optional
 from collections.abc import Sequence
 
-import numpy as np
-from ase.data import chemical_symbols
 from ase.units import Bohr, Ha
 from cp2k_input_tools.generator import CP2KInputGenerator
 from cp2k_input_tools.parser import CP2KInputParserSimplified
@@ -153,13 +151,12 @@ def create_input(self, geom: Geometry) -> tuple[bool, Optional[File]]:
         section_copy = copy.deepcopy(section)
         section.pop("topology", None)  # remove topology section
 
-        # insert geometry and cell
-        symbols = np.array(chemical_symbols)[geom.per_atom.numbers]
-        coord = [
-            f"{s:5} {p[0]:<15.8f} {p[1]:<15.8f} {p[2]:<15.8f}"
-            for s, p in zip(symbols, geom.per_atom.positions)
-        ]
-        section["coord"] = {"*": coord}
+        # insert coordinates
+        _, body = geom.per_atom.to_string()
+        coord_list = body.split("\n")[:-1]
+        section["coord"] = {"*": coord_list}
+
+        # insert cell
         cell = {}
         for i, vector in enumerate(["A", "B", "C"]):
             cell[vector] = "{} {} {}".format(*geom.cell[i])
diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py
index cfbfe10..68f92c2 100644
--- a/psiflow/reference/reference.py
+++ b/psiflow/reference/reference.py
@@ -1,6 +1,5 @@
-import warnings
 import logging
-from typing import Optional, Union, Callable, Optional
+from typing import Callable, Optional
 from collections.abc import Sequence
 from pathlib import Path
 from functools import partial
@@ -14,12 +13,15 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.data.utils import extract
 from psiflow.geometry import Geometry
 from psiflow.utils.apps import copy_app_future
 from psiflow.utils.parse import LineNotFoundError, get_task_name_id
 
 
+# TODO: existing Reference instances do not react to changes in execution environment
+#  e.g., altering max_runtime is not retroactively updated
+
+
 logger = logging.getLogger(__name__)  # logging per module
 
 
@@ -31,13 +33,14 @@ class Status(Enum):
 
 def update_geometry(geom: Geometry, data: dict) -> Geometry:
     """"""
+    # TODO: is the metadata not more a logging thing?
     # data should contain 'stdout' and 'status' keys
     task_name, task_id = get_task_name_id(data["stdout"])
     logger.info(f'Task "{task_name}" (ID {task_id}): {data["status"].name}')
 
     geom = geom.copy()
     geom.reset()
-    geom.order["status"], geom.order["task_id"] = data["status"].name, task_id
+    geom.metadata = {'status': data["status"].name, 'task_id': task_id}
     if data["status"] != Status.SUCCESS:
         return geom
 
@@ -47,18 +50,18 @@ def update_geometry(geom: Geometry, data: dict) -> Geometry:
     shift = data["positions"][0] - geom.per_atom.positions[0]
     if not np.allclose(data["positions"], geom.per_atom.positions + shift, atol=1e-6):
         # output does not match geometry up to a translation
-        geom.order["status"] = Status.INCONSISTENT
+        geom.status = Status.INCONSISTENT
         return geom
 
     geom.energy = data["energy"]
-    geom.order["runtime"] = data.get("runtime")
+    geom.metadata["runtime"] = data.get("runtime")
     if "forces" in data:
-        geom.per_atom.forces[:] = data["forces"]
+        geom.per_atom.forces = data["forces"]
     return geom
 
 
 @join_app
-def get_minimum_energy(element: str, **kwargs) -> AppFuture[float]:
+def get_minimum_energy(element: str, **kwargs) -> AppFuture:
     energies = {m: state.energy or np.inf for m, state in kwargs.items()}
     energy = min(energies.values())
     msg = [
@@ -84,18 +87,6 @@ def get_spin_multiplicities(element: str) -> list[int]:
     return mults
 
 
-@join_app
-def compute_dataset(
-    dataset: Dataset,
-    length: int,
-    reference: "Reference",
-) -> list[AppFuture[Geometry]]:
-    logger.info(f"Performing {length} {reference.__class__.__name__} calculations.")
-    geometries = dataset.geometries()  # read it once
-    evaluated = [reference.evaluate(geometries[i]) for i in range(length)]
-    return evaluated
-
-
 def _execute(
     bash_template: str,
     inputs: list[File],
@@ -125,38 +116,15 @@ def __init__(
         if (n := self.n_cores) is not None:
             assert n <= definition.spec["cores"]
 
-    def compute(
-        self,
-        arg: Union[Dataset, Geometry, AppFuture, list],
-        *outputs: Optional[Union[str, tuple]],
-    ):
-        # TODO: deprecate? Reference evaluations are too costly
-        for output in outputs:
-            if output not in self.outputs:
-                raise ValueError("output {} not in {}".format(output, self.outputs))
-
-        # TODO: convert_to_dataset util?
-        if isinstance(arg, Dataset):
-            dataset = arg
-        elif isinstance(arg, list):
-            dataset = Dataset(arg)
-        elif isinstance(arg, AppFuture) or isinstance(arg, Geometry):
-            dataset = Dataset([arg])
-        else:
-            raise TypeError
-
-        # TODO: writing to dataset first is wasted overhead,
-        #  but extract_quantities does not accept AppFuture[list[Geometry]] (yet?)
-        future_dataset = self.compute_dataset(dataset)
-        outputs = outputs or self.outputs
-        future_data = extract(
-            tuple(outputs), None, None, inputs=[future_dataset.extxyz]
-        )
-        if len(outputs) == 1:
-            return future_data[0]
-        return [future_data[_] for _ in range(len(outputs))]
+    def __call__(self, geometry: Geometry | AppFuture) -> AppFuture:
+        return evaluate(geometry, self)
+
+    def evaluate(self, dataset: Dataset) -> Dataset:
+        """Return a new labelled dataset"""
+        future = evaluate_geometries(dataset.geometries(), self)
+        return Dataset(future)
 
-    def compute_atomic_energy(self, element, box_size=None) -> AppFuture[float]:
+    def compute_atomic_energy(self, element, box_size=None) -> AppFuture:
         references = self.get_single_atom_references(element)
         state = Geometry.from_data(
             numbers=np.array([atomic_numbers[element]]),
@@ -165,11 +133,10 @@ def compute_atomic_energy(self, element, box_size=None) -> AppFuture[float]:
         )
         futures = {}
         for mult, reference in references.items():
-            futures[str(mult)] = reference.evaluate(state)
-        return get_minimum_energy(element, **futures)
+            futures[str(mult)] = reference(state)
+        return get_minimum_energy(element, **futures)  # float
 
     def get_execute_app(self) -> Callable:
-        # TODO: how often is this called?
         context = psiflow.context()
         definition = context.definitions[self.executor]
         return partial(
@@ -179,13 +146,6 @@ def get_execute_app(self) -> Callable:
             label=self._execute_label,
         )
 
-    def evaluate(self, geometry: Geometry | AppFuture) -> AppFuture:
-        return evaluate(geometry, self)
-
-    def compute_dataset(self, dataset: Dataset) -> Dataset:
-        future = compute_dataset(dataset, dataset.length(), self)
-        return Dataset(future)
-
     def get_single_atom_references(self, element: str) -> dict[int, "Reference"]:
         raise NotImplementedError
 
@@ -218,21 +178,17 @@ def _process_output(
 def evaluate(
     geom: Geometry,
     reference: Reference,
-    execute_func: Optional[Callable] = None,
+    execute_app: Optional[Callable] = None,
 ) -> AppFuture:
     """"""
-    if geom == NullState:  # TODO: remove this
-        warnings.warn("Skipping NullState..")
-        return copy_app_future(geom)
-
     flag, *files = reference.create_input(geom=geom)
     if not flag:  # TODO: should we reset geom?
         return copy_app_future(geom)
 
     # do the actual evaluation
-    if execute_func is None:
-        execute_func = reference.get_execute_app()
-    future = execute_func(inputs=files)
+    if execute_app is None:
+        execute_app = reference.get_execute_app()
+    future = execute_app(inputs=files)
 
     future = process_output(
         geom=geom,
@@ -240,3 +196,15 @@ def evaluate(
         inputs=[future.stdout, future.stderr, future],  # wait for future
     )
     return future
+
+
+@join_app
+def evaluate_geometries(
+    states: Sequence[Geometry],
+    reference: "Reference",
+) -> list[AppFuture]:
+    msg = f"Performing {len(states)} {reference.__class__.__name__} calculations."
+    logger.info(msg)
+    execute_app = reference.get_execute_app()
+    evaluated = [evaluate(geom, reference, execute_app) for geom in states]
+    return evaluated
diff --git a/tests/test_reference.py b/tests/test_reference.py
index 416974a..fcce746 100644
--- a/tests/test_reference.py
+++ b/tests/test_reference.py
@@ -5,7 +5,7 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.geometry import Geometry, NullState
+from psiflow.geometry import Geometry, MISSING
 from psiflow.reference import CP2K, GPAW, ORCA, create_orca_input
 from psiflow.reference.reference import Status
 from psiflow.reference.cp2k_ import (
@@ -198,23 +198,19 @@ def test_cp2k_parse_output():
 def test_cp2k_success(simple_cp2k_input, geom_h2_p):
     reference = CP2K(simple_cp2k_input)
 
-    future = reference.evaluate(geom_h2_p)
-    future_null = reference.evaluate(NullState)
+    future = reference(geom_h2_p)
     assert isinstance(future, AppFuture)
     geom_out = future.result()
-    geom_null = future_null.result()
 
     ref_energy = -1.167407360449355 * Ha
     ref_forces = np.array([[-0.00968014, 0.0, 0.0], [0.00967947, 0.0, 0.0]]) * Ha / Bohr
-    assert geom_out != NullState
     assert geom_out.energy is not None
     assert not np.any(np.isnan(geom_out.per_atom.forces))
     assert np.allclose(ref_energy, geom_out.energy)
     assert np.allclose(ref_forces, geom_out.per_atom.forces, atol=1e-5)
-    assert geom_null == NullState  # check whether NullState evaluates to NullState
 
     # check number of mpi processes
-    stdout, _ = get_task_logs(geom_out.order["task_id"])
+    stdout, _ = get_task_logs(geom_out.metadata["task_id"])
     lines = stdout.read_text().split("\n")
     for line in lines:
         if "Total number of message passing processes" in line:
@@ -295,12 +291,12 @@ def test_cp2k_failure(geom_h2_p):
 &END FORCE_EVAL
 """  # incorrect input file
     reference = CP2K(cp2k_input)
-    future = reference.evaluate(geom_h2_p)
+    future = reference(geom_h2_p)
     assert isinstance(future, AppFuture)
     geom_out = future.result()
-    assert geom_out.energy is None
-    assert np.all(np.isnan(geom_out.per_atom.forces))
-    stdout, _ = get_task_logs(geom_out.order["task_id"])
+    assert geom_out.energy is MISSING
+    assert geom_out.per_atom.forces is MISSING
+    stdout, _ = get_task_logs(geom_out.metadata["task_id"])
     assert "ABORT" in stdout.read_text()  # verify error is captured
 
 
@@ -311,25 +307,34 @@ def test_cp2k_memory(simple_cp2k_input):
         positions=np.random.uniform(0, 20, size=(4000, 3)),
         cell=20 * np.eye(3),  # box way too large
     )
-    energy, forces = reference.compute(geometry)
-    energy, forces = energy.result(), forces.result()
-    assert np.all(np.isnan(energy))
+    geom_fail = reference(geometry).result()
+    assert geom_fail.energy is MISSING
+    assert geom_fail.per_atom.forces is MISSING
+    assert geom_fail.metadata["status"] == Status.FAILED.name
 
 
 @pytest.mark.filterwarnings("ignore:Original input file not found")
 def test_cp2k_timeout(simple_cp2k_input, geom_h2_p):
+    definition = psiflow.context().definitions["CP2K"]
+    runtime = definition.max_runtime
+    definition.max_runtime = 1  # seconds
+
     reference = CP2K(simple_cp2k_input)
     geom_h2_p.cell = 20 * np.eye(3)  # box way too large
-    energy, forces = reference.compute(Dataset([geom_h2_p]))
-    energy, forces = energy.result(), forces.result()
-    assert np.all(np.isnan(energy))
+
+    geom_fail = reference(geom_h2_p).result()
+    assert geom_fail.energy is MISSING
+    assert geom_fail.per_atom.forces is MISSING
+    assert geom_fail.metadata["status"] == Status.FAILED.name
+
+    definition.max_runtime = runtime  # restore value
 
 
 def test_cp2k_energy(simple_cp2k_input, geom_h2_p):
     reference = CP2K(simple_cp2k_input, outputs=("energy",))
-    geom_out = reference.evaluate(geom_h2_p).result()
+    geom_out = reference(geom_h2_p).result()
     assert geom_out.energy is not None
-    assert np.all(np.isnan(geom_out.per_atom.forces))
+    assert geom_out.per_atom.forces is MISSING
 
 
 @pytest.mark.filterwarnings("ignore:Original input file not found")
@@ -337,6 +342,7 @@ def test_cp2k_atomic_energies(simple_cp2k_input):
     # use energy-only because why not
     reference = CP2K(simple_cp2k_input, outputs=("energy",))
     energy = reference.compute_atomic_energy("H", box_size=4)
+    print(energy.result())
     assert abs(energy.result() - (-13.6)) < 1  # reasonably close to exact value
 
 
@@ -380,27 +386,30 @@ def test_cp2k_posthf(geom_h2_p):
 &END FORCE_EVAL
 """
     reference = CP2K(cp2k_input_str, outputs=("energy",))
-    assert reference.evaluate(geom_h2_p).result().energy is not None
+    assert reference(geom_h2_p).result().energy is not MISSING
 
 
-def test_gpaw_single(dataset_h2):
+def test_gpaw(dataset_h2):
+    # basic functionality
     parameters = dict(mode="fd", nbands=0, xc="LDA", h=0.3, minimal_box_multiple=2)
     gpaw = GPAW(parameters)
     future_in = dataset_h2[0]
-    future_out = gpaw.evaluate(future_in)
-    future_energy = gpaw.compute(dataset_h2[:1])[0]
+    future_out = gpaw(future_in)
+    dataset_out = gpaw.evaluate(dataset_h2[:2])
     future_energy_zr = gpaw.compute_atomic_energy("Zr", box_size=9)
-    gpaw = GPAW({"askdfj": "asdfk"})  # invalid input
-    future_fail = gpaw.evaluate(future_in)
 
-    geom_in, geom_out = future_in.result(), future_out.result()
-    energy, energy_zr = future_energy.result(), future_energy_zr.result()
+    # invalid input
+    gpaw = GPAW({"askdfj": "asdfk"})
+    future_fail = gpaw(future_in)
 
-    assert geom_out.energy is not None and geom_out.energy < 0.0
+    geom_in, geom_out = future_in.result(), future_out.result()
+    geoms_out = dataset_out.geometries().result()
+    assert geom_out.energy is not MISSING and geom_out.energy < 0.0
     assert np.allclose(geom_out.per_atom.positions, geom_in.per_atom.positions)
-    assert np.allclose(geom_out.energy, energy)
-    assert energy_zr == 0.0
-    assert future_fail.result().energy is None
+    assert future_energy_zr.result() == 0.0
+    for geom in geoms_out:
+        assert geom.energy is not MISSING and geom.energy < 0.0
+    assert future_fail.result().energy is MISSING
 
 
 # TODO: enable once we have an ORCA container

From 9523b2efd43301a5ca60856723a6fcb52fed619e Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 15 Apr 2026 22:56:06 +0200
Subject: [PATCH 12/16] make sampling work with new geometry and data modules

make sampling work with new geometry and data modules
---
 install_local.sh             |  8 +++++++-
 psiflow/data/dataset.py      |  2 --
 psiflow/geometry.py          |  2 ++
 psiflow/sampling/driver.py   |  5 ++++-
 psiflow/sampling/optimize.py |  2 +-
 psiflow/sampling/output.py   |  2 +-
 psiflow/sampling/sampling.py |  2 +-
 psiflow/sampling/server.py   | 10 ++++------
 psiflow/sampling/utils.py    |  2 +-
 psiflow/sampling/walker.py   |  2 +-
 10 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/install_local.sh b/install_local.sh
index 9f7d7df..c067cbe 100644
--- a/install_local.sh
+++ b/install_local.sh
@@ -10,11 +10,17 @@ micromamba install ndcctools -c conda-forge
 # install psiflow
 pip install -e psiflow[dev]
 
-# install PyTorch and MACE
+# install PyTorch and MACE -- CUDA
 pip install torch --index-url https://download.pytorch.org/whl/cu128
 pip install mace-torch
 pip install cuequivariance cuequivariance-torch cuequivariance-ops-torch-cu12
 
+# install PyTorch and MACE -- ROCM
+pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
+pip install mace-torch
+pip install openequivariance
+python -c "import openequivariance"  # compile some binary, maybe
+
 # install basic PLUMED and python API
 micromamba install plumed -c conda-forge
 pip install plumed
diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index 1d0e8da..21d0459 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -26,8 +26,6 @@
     assign_identifiers,
 )
 
-# TODO: do all operations need to generate a new file?
-
 
 @psiflow.register_serializable
 class Dataset:
diff --git a/psiflow/geometry.py b/psiflow/geometry.py
index d162095..c23817a 100644
--- a/psiflow/geometry.py
+++ b/psiflow/geometry.py
@@ -16,6 +16,8 @@
 
 # TODO: docstrings
 # TODO: custom attributes writing
+# TODO: Reference and Sampling want to add attributes to Geometry (used to be 'order' dict)
+#  what to do?
 
 # these are always accessible for every geometry
 DEFAULT_PROPERTIES = "per_atom", "cell", "energy", "stress"
diff --git a/psiflow/sampling/driver.py b/psiflow/sampling/driver.py
index b56b852..463201f 100644
--- a/psiflow/sampling/driver.py
+++ b/psiflow/sampling/driver.py
@@ -74,7 +74,10 @@ def compute_structure(self, cell, pos):
             unit_to_internal("energy", "electronvolt", energy), np.float64
         )
         force_ipi = np.asarray(unit_to_internal("force", "ev/ang", forces), np.float64)
-        vir_calc = -stress * self.geometry.volume
+        if self.geometry.periodic:
+            vir_calc = -stress * self.geometry.volume
+        else:
+            vir_calc = np.zeros_like(stress)
         vir_ipi = np.array(
             unit_to_internal("energy", "electronvolt", vir_calc.T), dtype=np.float64
         )
diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py
index a5e97df..4d7cd0c 100644
--- a/psiflow/sampling/optimize.py
+++ b/psiflow/sampling/optimize.py
@@ -192,7 +192,7 @@ def optimize(
         parsl_resource_specification=definition.wq_resources(1),
     )
 
-    final = Dataset(None, result.outputs[0]).evaluate(hamiltonian)[-1]
+    final = hamiltonian.evaluate(Dataset(extxyz=result.outputs[0]))[-1]
     if keep_trajectory:
         trajectory = Dataset(None, result.outputs[1])
         return final, trajectory
diff --git a/psiflow/sampling/output.py b/psiflow/sampling/output.py
index 9daec95..7f08719 100644
--- a/psiflow/sampling/output.py
+++ b/psiflow/sampling/output.py
@@ -100,7 +100,7 @@ def _parse_simulation_data(
 
     task_id = get_task_name_id(simulation_stdout)[-1]
     logger.info(f"Simulation [ID {task_id}]: {status}")
-    return status, data, temperature, state.order.pop("time")
+    return status, data, temperature, state.time
 
 
 parse_simulation_data = python_app(
diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py
index 8650f03..a28fb6d 100644
--- a/psiflow/sampling/sampling.py
+++ b/psiflow/sampling/sampling.py
@@ -635,7 +635,7 @@ def _sample(
 
     # process MD output
     simulation_outputs = []
-    final_states = read_frames(inputs=result.outputs[:1])
+    final_states = read_frames(result.outputs[0])
     for idx, walker in enumerate(walkers):
         # TODO: order_parameter out of commission
         # if walker.order_parameter is not None:
diff --git a/psiflow/sampling/server.py b/psiflow/sampling/server.py
index d0bc4e5..c3d5856 100644
--- a/psiflow/sampling/server.py
+++ b/psiflow/sampling/server.py
@@ -46,7 +46,7 @@ def parse_checkpoint(file_xml: str | Path) -> list[Geometry]:
         # get current internal system time
         ensemble_data = sys.ensemble.fetch()
         time = ensemble_data.time * (_hbar / (_e * Ha)) * 1e12
-        geometry.order["time"], geometry.order["name"] = time, name
+        geometry.time, geometry.name = time, name
         geometries.append(geometry)
 
     return geometries
@@ -63,8 +63,6 @@ def insert_addresses(input_xml: ET.Element) -> None:
             address.text = str(Path.cwd() / address.text.strip())
 
 
-
-
 def wait_for_clients(input_xml, timeout: int = 60) -> None:
     """Make sure clients have initialised successfully"""
 
@@ -72,7 +70,7 @@ def wait_for_clients(input_xml, timeout: int = 60) -> None:
     sockets = []
     xml_str = ET.tostring(input_xml, encoding="unicode")
     for line in xml_str.splitlines():
-        if 'address' in line:
+        if "address" in line:
             sockets.append(line.split(">")[1].split("<")[0])
 
     for _ in range(timeout):
@@ -116,7 +114,7 @@ def run(start_xyz: str, input_xml: str):
 
 
 def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None:
-    from psiflow.data.utils import _write_frames
+    from psiflow.data.file import _write_frames
 
     print("Starting cleanup")
     with open(INPUT_XML, "r") as f:
@@ -129,7 +127,7 @@ def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None:
     states = parse_checkpoint("output.checkpoint")
     for state in states:
         if np.allclose(state.cell, NONPERIODIC_CELL):
-            state.cell[:] = 0.0
+            state.cell = None
     _write_frames(*states, outputs=[output_xyz])
     print("Moved checkpoint geometries")
 
diff --git a/psiflow/sampling/utils.py b/psiflow/sampling/utils.py
index 72ffb36..37f94f1 100644
--- a/psiflow/sampling/utils.py
+++ b/psiflow/sampling/utils.py
@@ -38,7 +38,7 @@ def check_forces(
     if not np.sum(exceeded):
         return
     indices = np.arange(len(geometry))[exceeded]
-    numbers = geometry.per_atom.numbers[exceeded]
+    numbers = geometry.numbers[exceeded]
     symbols = [chemical_symbols[n] for n in numbers]
     raise ForceMagnitudeException(
         "\nforce exceeded {} eV/A for atoms {}"
diff --git a/psiflow/sampling/walker.py b/psiflow/sampling/walker.py
index 8d9ada5..1c22cad 100644
--- a/psiflow/sampling/walker.py
+++ b/psiflow/sampling/walker.py
@@ -185,7 +185,7 @@ def _get_minimum_energy_states(
 def quench(walkers: list[Walker], dataset: Dataset) -> None:
     """Assign the lowest energy geometry in dataset to every walker"""
     hamiltonians = combine_hamiltonians([w.hamiltonian for w in walkers])
-    energies = [h.compute(dataset, "energy") for h in hamiltonians.hamiltonians]
+    energies = [h.compute(dataset).energy for h in hamiltonians.hamiltonians]
     coefficients = [
         hamiltonians.get_coefficients(walker.hamiltonian * 1.0) for walker in walkers
     ]

From 6e9d3696ee03b18956838c6ce5d68b31a9401845 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 15 Apr 2026 22:56:38 +0200
Subject: [PATCH 13/16] fix tests

---
 tests/test_free_energy.py |  5 +++--
 tests/test_learning.py    |  2 +-
 tests/test_sampling.py    | 29 ++++++++++++-----------------
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/tests/test_free_energy.py b/tests/test_free_energy.py
index ecacc3e..c1bc53a 100644
--- a/tests/test_free_energy.py
+++ b/tests/test_free_energy.py
@@ -1,15 +1,16 @@
 import numpy as np
 from ase.units import _c, kB, second
 
+import psiflow
 from psiflow.free_energy import (
     Integration,
     compute_frequencies,
     compute_harmonic,
     harmonic_free_energy,
 )
-from psiflow.geometry import check_equality
 from psiflow.hamiltonians import EinsteinCrystal, Harmonic, MACEHamiltonian
 from psiflow.sampling.ase import optimize
+from psiflow.data.utils import check_equality
 
 
 def test_integration_simple(dataset):
@@ -39,7 +40,7 @@ def test_integration_simple(dataset):
 
         # manual computation of delta gradient
         delta = -0.1 * harmonic
-        energies = delta.compute(integration.outputs[i].trajectory, "energy")
+        energies = delta.compute(integration.outputs[i].trajectory).energy
         assert np.allclose(
             state.gradients["delta"].result(),
             np.mean(energies.result()) / (kB * state.temperature),
diff --git a/tests/test_learning.py b/tests/test_learning.py
index 430a6ca..416a09c 100644
--- a/tests/test_learning.py
+++ b/tests/test_learning.py
@@ -3,7 +3,7 @@
 
 import psiflow
 from psiflow.data import Dataset
-from psiflow.geometry import new_nullstate
+# from psiflow.geometry import new_nullstate
 from psiflow.hamiltonians import EinsteinCrystal
 from psiflow.learning import Learning, evaluate_outputs
 from psiflow.metrics import Metrics, _create_table, parse_walker_log, reconstruct_dtypes
diff --git a/tests/test_sampling.py b/tests/test_sampling.py
index 0f3ccf8..152968c 100644
--- a/tests/test_sampling.py
+++ b/tests/test_sampling.py
@@ -5,7 +5,6 @@
 from ase.units import Bohr
 
 import psiflow
-from psiflow.geometry import check_equality
 from psiflow.hamiltonians import EinsteinCrystal, PlumedHamiltonian, MACEHamiltonian
 from psiflow.sampling.optimize import (
     optimize as optimize_ipi,
@@ -29,6 +28,8 @@
 )
 from psiflow.sampling.output import Status
 
+from psiflow.data.utils import check_equality
+
 
 def test_walkers(dataset):
     future = dataset[0]
@@ -120,11 +121,9 @@ def test_parse_checkpoint(checkpoint):
     file = psiflow.context().new_file("input_", ".xml")
     Path(file.filepath).write_text(checkpoint)
     states = parse_checkpoint(file.filepath)
-    assert "time" in states[0].order
-    assert np.allclose(
-        states[0].cell,
-        np.array([[1, 0.0, 0], [0.1, 2, 0], [0, 0, 3]]) * Bohr,
-    )
+    cell = np.array([[1, 0.0, 0], [0.1, 2, 0], [0, 0, 3]]) * Bohr
+    assert hasattr(states[0], "time")
+    assert np.allclose(states[0].cell, cell)
 
 
 def test_sample(dataset, mace_foundation):
@@ -143,13 +142,9 @@ def test_sample(dataset, mace_foundation):
 METAD ARG=CV PACE=5 SIGMA=0.05 HEIGHT=5
 """
     metadynamics = Metadynamics(plumed_str)
+    mixture = 0.9 * plumed + einstein
     walkers = [
-        Walker(
-            start=future,
-            temperature=300,
-            metadynamics=metadynamics,
-            hamiltonian=0.9 * plumed + einstein,
-        ),
+        Walker(start=future, metadynamics=metadynamics, hamiltonian=mixture),
         Walker(start=future, temperature=600, hamiltonian=einstein),
         Walker(start=future, temperature=450, hamiltonian=einstein),
         Walker(start=future, temperature=600, hamiltonian=einstein),
@@ -182,11 +177,11 @@ def test_sample(dataset, mace_foundation):
         outputs[1]["potential{electronvolt}"].result(),
     ]
     energies_ = [
-        (0.9 * plumed + einstein).compute(outputs[0].trajectory, "energy"),
-        einstein.compute(outputs[1].trajectory, "energy"),
+        mixture.compute(outputs[0].trajectory).result().energy,
+        einstein.compute(outputs[1].trajectory).result().energy,
     ]
-    assert len(energies[0]) == len(energies_[0].result())
-    assert np.allclose(energies[0], energies_[0].result())
+    assert len(energies[0]) == len(energies_[0])
+    assert np.allclose(energies[0], energies_[0])
     time = outputs[0]["time{picosecond}"].result()
     assert np.allclose(time, np.arange(0, 51, 5) * 5e-4)
 
@@ -237,7 +232,7 @@ def test_sample(dataset, mace_foundation):
         checkpoint_step=1,
         fix_com=True,  # otherwise temperature won't match
     )
-    energy = hamiltonian.compute(dataset[1], "energy")
+    energy = hamiltonian.compute(dataset[1]).energy
     energy_ = output["potential{electronvolt}"]
     assert np.allclose(energy.result()[0], energy_.result()[0], atol=1e-5)
     assert np.allclose(output.time.result(), 10 * 0.5 / 1000)

From 9c0af275d41673d056f736ac52296ee6c08c00be Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Tue, 21 Apr 2026 16:12:54 +0200
Subject: [PATCH 14/16] cleanup + minor fixes

---
 psiflow/compute.py             | 101 +++---------------------
 psiflow/data/__init__.py       |   1 +
 psiflow/data/dataset.py        |  33 +-------
 psiflow/data/file.py           |  35 ---------
 psiflow/execution.py           |   8 +-
 psiflow/functions.py           |   1 -
 psiflow/geometry.py            | 135 ++++++++++++---------------------
 psiflow/learning.py            |   5 +-
 psiflow/models/mace.py         |   4 +-
 psiflow/reference/cp2k_.py     |   3 -
 psiflow/reference/reference.py |  11 +--
 psiflow/sampling/_ase.py       |   6 +-
 psiflow/sampling/sampling.py   |   3 +-
 psiflow/sampling/server.py     |   2 +-
 psiflow/utils/wq.py            |   1 +
 tests/test_data.py             |  74 ++++++++----------
 16 files changed, 108 insertions(+), 315 deletions(-)

diff --git a/psiflow/compute.py b/psiflow/compute.py
index de26fb5..836bbb9 100644
--- a/psiflow/compute.py
+++ b/psiflow/compute.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Callable, ClassVar, Optional, Union, Type, Any, TypeAlias
 from collections.abc import Sequence
 from dataclasses import dataclass
@@ -50,19 +51,19 @@ def get(self, key: str, per_geom: bool = False) -> list | np.ndarray:
         return data_list
 
     def to_dict(self) -> dict[str, list]:
+        """Convert to a dict that is extract/insert compliant for geometries"""
         return {k: self.get(k, per_geom=True) for k in self.keys}
 
     @classmethod
-    def from_data(cls, n_atoms: Sequence[int], data: dict[str, list]):
+    def from_data(cls, n_atoms: np.ndarray, data: dict[str, list]):
         values = {}
         for k, v in data.items():
             assert len(v) == n_atoms.size
-            if k in PER_ATOM_FIELDS:
-                values[k] = np.concatenate(v)
-            elif k in DEFAULT_PROPERTIES:
-                values[k] = np.stack(v)
+            if not np.iterable(v[0]):
+                values[k] = np.array(v)
+            elif len(v[0]) == n_atoms[0] and len(v[-1]) == n_atoms[-1]:
+                values[k] = np.concatenate(v)  # assume per-atom property
             else:
-                # TODO: what with custom properties?
                 values[k] = np.stack(v)
         return cls(np.array(n_atoms), values)
 
@@ -100,7 +101,6 @@ def aggregate_results(
     *results: ComputeResult, coefficients: Optional[np.ndarray] = None
 ) -> ComputeResult:
     """"""
-    # TODO: what with data fields that are not shared?
     if coefficients is None:
         coefficients = np.ones(len(results))
     assert len(coefficients) == len(results)
@@ -208,89 +208,6 @@ def _compare_results(
 compare_results = python_app(_compare_results, executors=["default_threads"])
 
 
-# TODO: cleanup
-# def _compute_rmse(
-#     array0: np.ndarray,
-#     array1: np.ndarray,
-#     reduce: bool = True,
-# ) -> Union[float, np.ndarray]:
-#     """
-#     Compute the Root Mean Square Error (RMSE) between two arrays.
-#
-#     Args:
-#         array0: First array.
-#         array1: Second array.
-#         reduce: Whether to reduce the result to a single value.
-#
-#     Returns:
-#         Union[float, np.ndarray]: RMSE value(s).
-#
-#     Note:
-#         This function is wrapped as a Parsl app and executed using the default_threads executor.
-#     """
-#     assert array0.shape == array1.shape
-#     assert np.all(np.isnan(array0) == np.isnan(array1))
-#
-#     se = (array0 - array1) ** 2
-#     se = se.reshape(se.shape[0], -1)
-#
-#     if reduce:  # across both dimensions
-#         mask = np.logical_not(np.isnan(se))
-#         return float(np.sqrt(np.mean(se[mask])))
-#     else:  # retain first dimension
-#         if se.ndim == 1:
-#             return se
-#         else:
-#             values = np.empty(len(se))
-#             for i in range(len(se)):
-#                 if np.all(np.isnan(se[i])):
-#                     values[i] = np.nan
-#                 else:
-#                     mask = np.logical_not(np.isnan(se[i]))
-#                     value = np.sqrt(np.mean(se[i][mask]))
-#                     values[i] = value
-#             return values
-#
-#
-# compute_rmse = python_app(_compute_rmse, executors=["default_threads"])
-#
-#
-# def _compute_mae(
-#     array0,
-#     array1,
-#     reduce: bool = True,
-# ) -> Union[float, np.ndarray]:
-#     """
-#     Compute the Mean Absolute Error (MAE) between two arrays.
-#
-#     Args:
-#         array0: First array.
-#         array1: Second array.
-#         reduce: Whether to reduce the result to a single value.
-#
-#     Returns:
-#         Union[float, np.ndarray]: MAE value(s).
-#
-#     Note:
-#         This function is wrapped as a Parsl app and executed using the default_threads executor.
-#     """
-#     assert array0.shape == array1.shape
-#     mask0 = np.logical_not(np.isnan(array0))
-#     mask1 = np.logical_not(np.isnan(array1))
-#     assert np.all(mask0 == mask1)
-#     ae = np.abs(array0 - array1)
-#     to_reduce = tuple(range(1, array0.ndim))
-#     mask = np.logical_not(np.all(np.isnan(ae), axis=to_reduce))
-#     ae = ae[mask0].reshape(np.sum(1 * mask), -1)
-#     if reduce:  # across both dimensions
-#         return float(np.sqrt(np.mean(ae)))
-#     else:  # retain first dimension
-#         return np.sqrt(np.mean(ae, axis=1))
-#
-#
-# compute_mae = python_app(_compute_mae, executors=["default_threads"])
-
-
 def input_to_geometries(data: ComputeInput) -> AppFuture:
     """Convert ComputeInput into a sequence of geometries (as a future)"""
     # Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry
@@ -315,7 +232,9 @@ def prep_input(data: Geometry | Sequence[Geometry]) -> list[Geometry]:
 
 
 @python_app(executors=["default_threads"])
-def insert_results(states: Sequence[Geometry], result: ComputeResult) -> Sequence[Geometry]:
+def insert_results(
+    states: Sequence[Geometry], result: ComputeResult
+) -> Sequence[Geometry]:
     """"""
     insert(states, result.to_dict())
     return states
diff --git a/psiflow/data/__init__.py b/psiflow/data/__init__.py
index 13a9804..7ade023 100644
--- a/psiflow/data/__init__.py
+++ b/psiflow/data/__init__.py
@@ -1 +1,2 @@
 from .dataset import Dataset
+from .file import read_frames, count_frames
\ No newline at end of file
diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py
index 21d0459..52bbf00 100644
--- a/psiflow/data/dataset.py
+++ b/psiflow/data/dataset.py
@@ -195,37 +195,6 @@ def get_per_atom(
         )
         return tuple(future_dict[q] for q in quantities)
 
-    # TODO: cleanup?
-    # def evaluate(
-    #     self, computable: "Computable", batch_size: Optional[int] = None
-    # ) -> Dataset:
-    #     """
-    #     Evaluate a Computable on the dataset.
-    #     """
-    #     # TODO: remove this functionality?
-    #     #  or this should be the (only) way to label a dataset?
-    #     from psiflow.hamiltonians import Hamiltonian
-    #
-    #     if not isinstance(computable, Hamiltonian):
-    #         # avoid extracting and inserting the same quantities
-    #         return computable.compute_dataset(self)
-    #
-    #     # use Hamiltonian.compute method
-    #     if batch_size is not None:
-    #         outputs = computable.compute(self, batch_size=batch_size)
-    #     else:
-    #         outputs = computable.compute(self)  # use default from computable
-    #     if not isinstance(outputs, list):  # compute unpacks for only one property
-    #         outputs = [outputs]
-    #     data = {k: v for k, v in zip(computable.outputs, outputs)}
-    #
-    #     file = psiflow.context().new_file("data_", ".xyz")
-    #
-    #     future = read_frames(self.extxyz)
-    #     future = insert_quantities(future, data)
-    #     extxyz = write_frames(future, outputs=[file]).outputs[0]
-    #     return Dataset(extxyz=extxyz)
-
     def filter(self, quantity: str) -> Dataset:
         """
         Filter the dataset based on a specified quantity.
@@ -272,7 +241,7 @@ def assign_identifiers(
             AppFuture: Future representing the next available identifier.
         """
         # TODO: what is the use case for this?
-        # TODO: this is the only method that changes inplace instead of returning a new Dataset
+        #  this is the only method that changes inplace instead of returning a new Dataset
         future = assign_identifiers(self.extxyz, identifier)
         future_states, future_id = future[0], future[1]
         file = psiflow.context().new_file("data_", ".xyz")
diff --git a/psiflow/data/file.py b/psiflow/data/file.py
index 21bc507..f86093b 100644
--- a/psiflow/data/file.py
+++ b/psiflow/data/file.py
@@ -20,8 +20,6 @@
     assign_ids,
 )
 
-# TODO: some operations have side effects, which could be unexpected
-
 
 FileLike: TypeAlias = str | Path | File
 
@@ -155,45 +153,12 @@ def _split_frames(
     _write_frames(val, outputs=outputs[1:])
 
 
-def _batch_frames(
-    file: FileLike, batch_size: int, outputs: Sequence[File] = ()
-) -> list[list[Geometry]]:
-    """
-    Split frames into batches.
-
-    Args:
-        file: DataFuture representing the input file path containing the geometry data.
-        batch_size: Number of frames per batch.
-        outputs: List of Parsl futures. Each element should be a DataFuture
-                 representing an output file path for each batch.
-    """
-    # TODO: why pass batch size?
-    # TODO: will not be great for large files
-    frames = _read_frames(file)
-
-    batches, batch = [], []
-    for i, geom in enumerate(frames):
-        batch.append(geom)
-        if (i + 1) % batch_size == 0:
-            batches.append(batch)
-            batch = []
-
-    if not outputs:
-        return batches
-
-    assert len(outputs) == len(batches)
-    for file, batch in zip(outputs, batches):
-        _write_frames(batch, outputs=[file])
-    return batches
-
-
 write_frames = python_app(_write_frames, executors=["default_threads"])
 read_frames = python_app(_read_frames, executors=["default_threads"])
 join_frames = python_app(_join_frames, executors=["default_threads"])
 count_frames = python_app(_count_frames, executors=["default_threads"])
 get_elements = python_app(_get_elements, executors=["default_threads"])
 split_frames = python_app(_split_frames, executors=["default_threads"])
-batch_frames = python_app(_batch_frames, executors=["default_threads"])
 
 
 def _read_write_wrapper(
diff --git a/psiflow/execution.py b/psiflow/execution.py
index 812a69e..8b47ff7 100644
--- a/psiflow/execution.py
+++ b/psiflow/execution.py
@@ -347,12 +347,6 @@ def wrap_in_timeout(self, command: str) -> str:
         # send SIGTERM after max_runtime, follow with SIGKILL 30s later
         return f"timeout -v -k 30s {self.max_runtime}s {command}"
 
-    # def wrap_in_srun(self, command: str) -> str:
-    #     # TODO: stub -- this does not work
-    #     if self.provider is None:
-    #         return command  # noop
-    # return f"srun -t 1 -c $CORES {command}"
-
     def _create_threadpool(self, path: Path) -> ThreadPoolExecutor:
         max_threads = self.kwargs["max_threads"]
         return ThreadPoolExecutor(self.name, max_threads, working_dir=str(path))
@@ -464,7 +458,7 @@ def __init__(
     ):
         super().__init__(**kwargs)
 
-        if self.use_gpu and self.kwargs["gpus_per_task"] > 1:
+        if self.use_gpu and self.kwargs.get("gpus_per_task", 1) > 1:
             raise ConfigurationError("No Hamiltonian can do multi-GPU evaluation")
 
         # i-Pi will kill client connections after no response for timeout seconds
diff --git a/psiflow/functions.py b/psiflow/functions.py
index 8d22978..11b64e7 100644
--- a/psiflow/functions.py
+++ b/psiflow/functions.py
@@ -173,7 +173,6 @@ def __call__(
 
 @dataclass(frozen=True)
 class MACEFunction(Function):
-    # TODO: why are some arguments separated and others 'calc_kwargs'?
     model_path: str | Path
     ncores: int
     device: str
diff --git a/psiflow/geometry.py b/psiflow/geometry.py
index c23817a..357b5f8 100644
--- a/psiflow/geometry.py
+++ b/psiflow/geometry.py
@@ -2,8 +2,7 @@
 import pickle
 import copy
 from pathlib import Path
-from typing import Optional, Union, Any, Final
-from collections.abc import Sequence
+from typing import Optional, Any, Final
 
 import ase.calculators.calculator
 import numpy as np
@@ -14,10 +13,9 @@
 
 import psiflow
 
-# TODO: docstrings
-# TODO: custom attributes writing
-# TODO: Reference and Sampling want to add attributes to Geometry (used to be 'order' dict)
-#  what to do?
+
+# TODO: all per-atom attributes are cast to float when writing, making read-write not a noop
+
 
 # these are always accessible for every geometry
 DEFAULT_PROPERTIES = "per_atom", "cell", "energy", "stress"
@@ -39,8 +37,9 @@ def __bool__(self):
 
 class PerAtom:
     """
-    Holds 'per atom' arrays like positions and forces
-    For every attribute, the first array dimension should be n_atoms
+    Holds 'per atom' arrays like positions and forces.
+    Must contain 'numbers' and 'positions', can contain anything else.
+    All arrays should be 2-dimensional with shape [n_atoms, *].
     """
 
     numbers: npt.NDArray[np.uint8]
@@ -53,16 +52,20 @@ def __init__(
         positions: npt.NDArray[np.float64],
         **kwargs: npt.NDArray,
     ):
-        self.numbers = numbers.astype(int).reshape(-1, 1)
+        self.numbers = numbers
         self.positions = positions
         for k, v in kwargs.items():
             setattr(self, k, v)
 
-    def __setattr__(self, name, value):
+    def __setattr__(self, name: str, value: np.ndarray) -> None:
         if value is MISSING:
             return  # do not set MISSING values
-        elif not self._check(value):
+        elif name == "numbers":
+            value = value.astype(int)
+        elif len(value) != len(self):
             raise ValueError(f"Field '{name}' is not a per-atom property..")
+        if value.ndim != 2:
+            value = value.reshape(-1, 1)  # 1D arrays
         super().__setattr__(name, value)
 
     def __getattr__(self, name):
@@ -79,23 +82,22 @@ def __len__(self) -> int:
     def __str__(self) -> str:
         return f"PerAtom[{len(self)}]{set(vars(self))}"
 
-    def _check(self, arr: npt.NDArray) -> bool:
-        try:
-            return (arr.shape[0] == len(self)) and arr.ndim == 2
-        except AttributeError:
-            return arr.ndim == 2  # numbers not yet defined
+    def attributes(self) -> dict[str, Any]:
+        """Return all optional per-atom attributes"""
+        return {
+            k: v for k, v in vars(self).items() if k not in ("numbers", "positions")
+        }
 
     def reset(self):
         """Wipes optional fields"""
-        attrs = [k for k in vars(self) if k not in ("numbers", "positions")]
-        for attr in attrs:
+        for attr in self.attributes():
             delattr(self, attr)
 
     def to_string(self) -> tuple[str, str]:
+        """Creates extxyz header and body"""
         symbols = [chemical_symbols[i] for i in self.numbers.flatten()]
         data = {"species": np.array(symbols).reshape(-1, 1), "pos": self.positions}
-        if not self.forces is MISSING:
-            data["forces"] = self.forces
+        data |= self.attributes()
 
         # create extxyz header
         header = "Properties=species:S:1:pos:R:3"
@@ -131,6 +133,7 @@ def to_string(self) -> tuple[str, str]:
 
     @classmethod
     def from_string(cls, s: str, props: Optional[str] = None):
+        """Reverses to_string"""
         props = (props or "species:S:1:pos:R:3") + ":"
         dtype_map = {"R": "f8", "S": "i8"}
 
@@ -165,10 +168,10 @@ class Geometry:
     and various physical properties such as energy and forces.
 
     Attributes:
-        per_atom (PerAtom): Record array containing per-atom properties.
-        cell (np.ndarray): 3x3 array representing the unit cell vectors.
+        per_atom (PerAtom): Container with per-atom properties.
+        cell (Optional[np.ndarray]): 3x3 array representing the unit cell vectors (or None for isolated molecules).
         energy (Optional[float]): Total energy of the system.
-        stress (Optional[np.ndarray]): Stress tensor of the system.
+        stress (Optional[np.ndarray]): Stress tensor of the system (3x3 array).
     """
 
     per_atom: PerAtom
@@ -182,10 +185,7 @@ def __init__(
         cell: Optional[np.ndarray] = None,
         **kwargs: Any,
     ):
-        """
-        Initialize a Geometry instance, though the preferred way of instantiating
-        proceeds via the class methods
-        """
+        """Initialize a Geometry instance. Preferably instantiate through class methods."""
         self.per_atom = per_atom
         self.cell = cell
 
@@ -193,7 +193,7 @@ def __init__(
         for k, v in kwargs.items():
             setattr(self, k, v)
 
-    def __setattr__(self, name, value):
+    def __setattr__(self, name: str, value: Any) -> None:
         if value is MISSING:
             return  # do not set MISSING values
 
@@ -204,6 +204,8 @@ def __setattr__(self, name, value):
             value = float(value)
             # assert isinstance(value, float)
         elif name == "stress":
+            if self.cell is None:
+                raise ValueError("Cannot assign stress to molecules..")
             assert isinstance(value, np.ndarray) and value.shape == (3, 3)
 
         super().__setattr__(name, value)
@@ -218,21 +220,16 @@ def __getattr__(self, name):
 
     def attributes(self) -> dict[str, Any]:
         """Return all non-MISSING instance attributes"""
-
         return {k: v for k, v in vars(self).items() if k != "per_atom"}
 
     def reset(self) -> None:
-        """
-        Reset all computed properties of the geometry to their default values.
-        """
+        """Reset all computed properties to their default values"""
         self.per_atom.reset()
         for k in {"energy", "stress"}.intersection(self.attributes()):
             delattr(self, k)
 
     def clean(self) -> None:
-        """
-        Clean the geometry by resetting properties and removing additional information.
-        """
+        """Remove all non-structural information ('numbers', 'positions', 'cell')"""
         self.per_atom.reset()
         for k in self.attributes():
             if k == "cell":
@@ -240,9 +237,7 @@ def clean(self) -> None:
             delattr(self, k)
 
     def align_axes(self) -> None:
-        """
-        Align the axes of the unit cell to a canonical representation for periodic systems.
-        """
+        """Align the cell axes to a canonical representation for periodic systems"""
         if not self.periodic:
             return
         positions = self.per_atom.positions
@@ -251,38 +246,28 @@ def align_axes(self) -> None:
         reduce_box_vectors(cell)
 
     def copy(self) -> Geometry:
-        """
-        Create a deep copy of the Geometry instance.
-        """
+        """Create a deep copy of the Geometry instance"""
         return pickle.loads(pickle.dumps(self))
 
     def to_string(self) -> str:
-        """
-        Convert the Geometry instance to a string representation in extended XYZ format.
-        """
+        """Convert the Geometry instance to a string representation in extended XYZ format"""
         data = self.attributes()
         data.pop("cell")  # cell needs special treatment
         if self.periodic:
             data["Lattice"] = self.cell.T  # ase does Fortran ordering
             data["pbc"] = "T T T"
-        else:
-            data["pbc"] = "F F F"
         info_str = key_val_dict_to_str(data)
         properties, txt = self.per_atom.to_string()
         header = " ".join([properties, info_str])
         return "\n".join([str(len(self)), header, txt])
 
     def save(self, path_xyz: Path | str) -> None:
-        """
-        Save the Geometry instance to an XYZ file.
-        """
+        """Save the Geometry instance to an XYZ file"""
         path_xyz = psiflow.resolve_and_check(Path(path_xyz))
         path_xyz.write_text(self.to_string())
 
     def to_atoms(self, structural_only: bool = False) -> Atoms:
-        """
-        Convert the Geometry instance to an Atoms object.
-        """
+        """Convert the Geometry instance to an Atoms object"""
         if structural_only:
             return Atoms(
                 positions=self.per_atom.positions,
@@ -296,9 +281,7 @@ def to_atoms(self, structural_only: bool = False) -> Atoms:
         return next(read_xyz(io.StringIO(s), index=0))
 
     def __eq__(self, other: "Geometry") -> bool:
-        """
-        Check if two Geometry instances are structurally equal.
-        """
+        """Check if two Geometry instances are structurally equal"""
         if (
             isinstance(other, Geometry)
             and len(self) == len(other)
@@ -311,16 +294,12 @@ def __eq__(self, other: "Geometry") -> bool:
         return False
 
     def __len__(self) -> int:
-        """
-        Get the number of atoms in the geometry.
-        """
+        """Get the number of atoms in the geometry"""
         return len(self.per_atom)
 
     @classmethod
     def from_string(cls, s: str) -> Geometry:
-        """
-        Create a Geometry instance from a string representation in extended XYZ format.
-        """
+        """Create a Geometry instance from a string representation in extended XYZ format"""
         n_atoms, header, body = s.strip().split("\n", 2)
         data = key_val_str_to_dict(header)
         data.pop("pbc", None)  # geometry derives pbc from cell
@@ -335,9 +314,7 @@ def from_string(cls, s: str) -> Geometry:
 
     @classmethod
     def load(cls, path_xyz: Path | str) -> "Geometry":
-        """
-        Load a Geometry instance from an XYZ file.
-        """
+        """Load a Geometry instance from an XYZ file"""
         path_xyz = psiflow.resolve_and_check(Path(path_xyz))
         content = path_xyz.read_text()
         return cls.from_string(content)
@@ -346,17 +323,13 @@ def load(cls, path_xyz: Path | str) -> "Geometry":
     def from_data(
         cls, numbers: np.ndarray, positions: np.ndarray, cell: Optional[np.ndarray]
     ) -> Geometry:
-        """
-        Create a Geometry instance from atomic numbers, positions, and cell data.
-        """
+        """Create a Geometry instance from atomic numbers, positions, and cell data"""
         per_atom = PerAtom(numbers.copy(), positions.copy())
         return Geometry(per_atom, cell=copy.copy(cell))
 
     @classmethod
     def from_atoms(cls, atoms: Atoms) -> Geometry:
-        """
-        Create a Geometry instance from an ASE Atoms object.
-        """
+        """Create a Geometry instance from an ASE Atoms object"""
         per_atom = PerAtom(**atoms.arrays)
         data = atoms.info
         if all(atoms.pbc):
@@ -373,41 +346,31 @@ def from_atoms(cls, atoms: Atoms) -> Geometry:
 
     @property
     def periodic(self) -> bool:
-        """
-        Check if the geometry is periodic.
-        """
+        """Check if the geometry is periodic"""
         return not self.cell is None
 
     @property
     def per_atom_energy(self) -> float | MISSING:
-        """
-        Calculate the energy per atom.
-        """
+        """Calculate the per-atom energy"""
         if self.energy is MISSING:
             return MISSING
         return self.energy / len(self)
 
     @property
     def volume(self) -> Optional[float]:
-        """
-        Calculate the volume of the unit cell.
-        """
+        """Calculate the volume of the unit cell"""
         if not self.periodic:
             return None
         return np.linalg.det(self.cell)
 
     @property
     def atomic_masses(self) -> npt.NDArray:
-        """
-        Get the atomic masses of the atoms in the geometry.
-        """
+        """Get the atomic masses of the atoms in the geometry"""
         return np.array([atomic_masses[n] for n in self.numbers])
 
     @property
     def numbers(self) -> npt.NDArray:
-        """
-        Get a flattened version of the per_atom.numbers array.
-        """
+        """Get a flattened version of per_atom.numbers"""
         return self.per_atom.numbers.flatten()
 
 
@@ -556,7 +519,7 @@ def mass_unweight(hessian: np.ndarray, geometry: Geometry) -> np.ndarray:
 
 
 def get_atomic_energy(geometry: Geometry, atomic_energies: dict[str, float]) -> float:
-    """Compute the total atomic energy based on provided single atom energies."""
+    """Compute the total atomic energy based on single atom energies"""
     total = 0
     numbers, counts = np.unique(geometry.numbers, return_counts=True)
     for number, count in zip(numbers, counts):
diff --git a/psiflow/learning.py b/psiflow/learning.py
index a1193c9..af2dea3 100644
--- a/psiflow/learning.py
+++ b/psiflow/learning.py
@@ -24,6 +24,7 @@
 logger = logging.getLogger(__name__)  # logging per module
 
 
+# TODO: remove -> merge into one app
 @typeguard.typechecked
 def _compute_error(
     state0: Geometry,
@@ -45,7 +46,7 @@ def _compute_error(
 
 compute_error = python_app(_compute_error, executors=["default_threads"])
 
-
+# TODO: remove -> merge into one app
 @typeguard.typechecked
 def _exceeds_error(
     errors: np.ndarray,
@@ -56,7 +57,7 @@ def _exceeds_error(
 
 exceeds_error = python_app(_exceeds_error, executors=["default_threads"])
 
-
+# TODO: remove -> merge into one app
 @typeguard.typechecked
 def evaluate_outputs(
     outputs: list[SimulationOutput],
diff --git a/psiflow/models/mace.py b/psiflow/models/mace.py
index cd8c41b..6d2c8d7 100644
--- a/psiflow/models/mace.py
+++ b/psiflow/models/mace.py
@@ -22,7 +22,6 @@
 
 # TODO: when changing the training dataset, the computed avg_num_neighbors will also change,
 #  making old checkpoints inconsistent
-# TODO: sanitise_config consistently called?
 # TODO: training fails when batch_size > dataset (see github issue)
 # TODO: restart_latest functionality to continue training??
 
@@ -59,8 +58,7 @@ def sanitise_config(config: dict) -> dict:
 
 
 def format_E0s(atomic_energies: dict) -> dict | str:
-    """"""
-    # TODO: MACE fails when atomic energy is not defined for every element..
+    """MACE will fail if atomic energy is not defined for all elements"""
     if atomic_energies:
         return {atomic_numbers[k]: v for k, v in atomic_energies.items()}
     else:
diff --git a/psiflow/reference/cp2k_.py b/psiflow/reference/cp2k_.py
index 44412f6..6000624 100644
--- a/psiflow/reference/cp2k_.py
+++ b/psiflow/reference/cp2k_.py
@@ -128,9 +128,6 @@ def get_single_atom_references(self, element: str) -> dict[int, Reference]:
         else:
             input_section["scf"] = {"ot": {"minimizer": "CG"}}
 
-        # necessary for oxygen calculation, at least in 2024.1
-        # TODO: not an allowed keyword in 2025.2
-        # input_section["scf"]["ignore_convergence_failure"] = "TRUE"
         input_str = dict_to_str(input_dict)
 
         references = {}
diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py
index 68f92c2..38ef513 100644
--- a/psiflow/reference/reference.py
+++ b/psiflow/reference/reference.py
@@ -60,7 +60,7 @@ def update_geometry(geom: Geometry, data: dict) -> Geometry:
     return geom
 
 
-@join_app
+@python_app(executors=['default_threads'])
 def get_minimum_energy(element: str, **kwargs) -> AppFuture:
     energies = {m: state.energy or np.inf for m, state in kwargs.items()}
     energy = min(energies.values())
@@ -70,11 +70,11 @@ def get_minimum_energy(element: str, **kwargs) -> AppFuture:
     ]
     logger.info('\n'.join(msg))
     assert not np.isinf(energy), f"Atomic energy calculation of '{element}' failed"
-    return copy_app_future(energy)  # TODO: why?
+    return energy
 
 
 def get_spin_multiplicities(element: str) -> list[int]:
-    """TODO: rethink this"""
+    """TODO: redo this"""
     # max S = N * 1/2, max mult = 2 * S + 1
     from ase.symbols import atomic_numbers
 
@@ -166,7 +166,7 @@ def _process_output(
     try:
         data = reference.parse_output(stdout)
     except LineNotFoundError:
-        data = {"status": Status.FAILED}  # TODO: find out what went wrong?
+        data = {"status": Status.FAILED}
     data |= {"stdout": Path(inputs[0]), "stderr": Path(inputs[1])}
     return update_geometry(geom, data)
 
@@ -182,7 +182,8 @@ def evaluate(
 ) -> AppFuture:
     """"""
     flag, *files = reference.create_input(geom=geom)
-    if not flag:  # TODO: should we reset geom?
+    if not flag:
+        geom.reset()  # remove fields to indicate no evaluation happened
         return copy_app_future(geom)
 
     # do the actual evaluation
diff --git a/psiflow/sampling/_ase.py b/psiflow/sampling/_ase.py
index e66ae3d..cb23c1a 100644
--- a/psiflow/sampling/_ase.py
+++ b/psiflow/sampling/_ase.py
@@ -1,6 +1,5 @@
 """
 Structure optimisation through ASE
-TODO: do we need to check for very large forces?
 """
 
 import os
@@ -169,9 +168,10 @@ def main(args: SimpleNamespace):
         Path(args.output_xyz).touch()
         return
 
-    atoms.set_constraint()  # usually noop
     if not any(atoms.pbc):
-        atoms.cell = None  # remove meaningless cell
+        # remove meaningless cell and stress
+        atoms.cell = None
+        atoms.calc.results.pop("stress", None)
     ase.io.write(FILE_OUT, atoms, format="extxyz")
 
     shutil.copy(FILE_OUT, args.output_xyz)
diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py
index a28fb6d..433623a 100644
--- a/psiflow/sampling/sampling.py
+++ b/psiflow/sampling/sampling.py
@@ -192,7 +192,6 @@ def setup_ensemble(
     bias_weights_list = []
 
     # add hamiltonian components that are not shared between all walkers as bias
-    # TODO: do we always want to do this?
     for comp in components:
         if not comp.shared:
             force = ET.Element("force", forcefield=comp.name)
@@ -429,7 +428,6 @@ def make_driver_commands(
     driver_kwargs: list[dict], file_xyz: File, files_hamiltonian: list[File]
 ) -> list[str]:
     """"""
-    # TODO: what if 'file_xyz' contains multiple geometries of different size?
     assert len(driver_kwargs) >= len(files_hamiltonian)
     default = f'i-pi-driver-py -u -S "" -m custom -P {PATH_DRIVER} -a {{address}} -o {{options}} &'
 
@@ -538,6 +536,7 @@ def _sample(
     )
     smotion = setup_smotion(coupling, plumed_list)
 
+    # TODO: validate all arguments together + consistently
     # make sure at least one checkpoint is being written
     if checkpoint_step is None:  # default to every 5% of simulation progress
         if step is None:
diff --git a/psiflow/sampling/server.py b/psiflow/sampling/server.py
index c3d5856..b1eaae2 100644
--- a/psiflow/sampling/server.py
+++ b/psiflow/sampling/server.py
@@ -138,7 +138,7 @@ def cleanup(output_xyz: str, output_props: str, output_trajs: str) -> None:
         out = subprocess.run(
             "i-pi-remdsort input.xml", shell=True, capture_output=True, text=True
         )
-        assert out.returncode == 0  # TODO: what if it isn't?
+        assert out.returncode == 0
         print("REMDSORT")
 
     output_props = _.split(",") if (_ := output_props) else []
diff --git a/psiflow/utils/wq.py b/psiflow/utils/wq.py
index 4ecc01d..1059ae5 100644
--- a/psiflow/utils/wq.py
+++ b/psiflow/utils/wq.py
@@ -1,4 +1,5 @@
 # TODO: this probably does not work in a nested way
+#  or when join apps submit tasks at some point in the future (after exiting the context)
 
 import logging
 
diff --git a/tests/test_data.py b/tests/test_data.py
index e5d8155..d0fd49a 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -14,7 +14,6 @@
 
 
 def test_geometry(tmp_path):
-    # TODO: custom per_atom field
     geometry = Geometry.from_data(
         numbers=np.arange(1, 6),
         positions=np.random.uniform(0, 1, size=(5, 3)),
@@ -122,7 +121,32 @@ def test_geometry(tmp_path):
         state.cell = [[1, 2, 3]] * 3
 
 
+def test_custom_attributes(dataset):
+    """Do geometries behave for non-default attributes?"""
+    geom = dataset[0].result()
+
+    geom.custom1 = False
+    geom.custom2 = [42, "42"]
+    geom.per_atom.custom1 = np.ones(len(geom), dtype=int) * 42
+    geom.per_atom.custom2 = np.random.random(len(geom)).astype(bool)
+
+    string = geom.to_string()  # casts per-atom properties to float..
+    geom_ = Geometry.from_string(string)
+    atoms = geom.to_atoms()
+
+    assert geom.custom1 == geom_.custom1
+    assert geom.custom2 == geom_.custom2
+    assert np.allclose(geom.per_atom.custom1, geom_.per_atom.custom1)
+    assert np.allclose(geom.per_atom.custom2, geom_.per_atom.custom2)
+
+    assert geom.custom1 == atoms.info["custom1"]
+    assert geom.custom2 == atoms.info["custom2"]
+    assert np.allclose(geom.per_atom.custom1, atoms.arrays["custom1"])
+    assert np.allclose(geom.per_atom.custom2, atoms.arrays["custom2"])
+
+
 def test_readwrite_cycle(dataset, tmp_path):
+    # TODO: much overlap with test_geometry
     tmp_file = File(tmp_path / "test.xyz")
 
     data = dataset[:4].geometries().result()
@@ -165,11 +189,13 @@ def test_readwrite_cycle(dataset, tmp_path):
     assert geometry.energy is MISSING
 
     geom = Geometry.from_data(np.ones(2), np.zeros((2, 3)), cell=None)
-    geom.stress = np.array([np.nan] * 9).reshape(3, 3)
+    with pytest.raises(ValueError):  # cannot assign stress to molecules
+        geom.stress = np.array([np.nan] * 9).reshape(3, 3)
     string = geom.to_string()
-    assert "nan" in string  # the stress field
+    assert "stress" not in string
+    assert "pbc" not in string
     geom_ = Geometry.from_string(string)
-    assert np.isnan(geom_.stress).all()
+    assert geom_.stress is MISSING
 
 
 def test_dataset(dataset, tmp_path):
@@ -258,46 +284,6 @@ def test_dataset_from_xyz(tmp_path, dataset):
         assert geom.energy == at.calc.results["energy"]
 
 
-# def test_index_element_mask():
-#     numbers = np.array([1, 1, 1, 6, 6, 8, 8, 8])
-#     elements = ["H"]
-#     indices = [0, 1, 4, 5]
-#     assert np.allclose(
-#         np.array([True, True, False, False, False, False, False, False]),
-#         get_index_element_mask(numbers, indices, elements),
-#     )
-#     elements = ["H", "O"]
-#     indices = [0, 1, 4, 5]
-#     assert np.allclose(
-#         np.array([True, True, False, False, False, True, False, False]),
-#         get_index_element_mask(numbers, indices, elements),
-#     )
-#     elements = ["H", "O"]
-#     indices = [3]
-#     assert np.allclose(
-#         get_index_element_mask(numbers, indices, elements),
-#         np.array([False] * len(numbers)),
-#     )
-#     elements = ["H", "O"]
-#     indices = None
-#     assert np.allclose(
-#         get_index_element_mask(numbers, indices, elements),
-#         np.array([True, True, True, False, False, True, True, True]),
-#     )
-#     elements = None
-#     indices = [0, 1, 2, 3, 5]
-#     assert np.allclose(
-#         get_index_element_mask(numbers, indices, elements),
-#         np.array([True, True, True, True, False, True, False, False]),
-#     )
-#     elements = ["Cl"]  # not present at all
-#     indices = None
-#     assert np.allclose(
-#         get_index_element_mask(numbers, indices, elements),
-#         np.array([False] * len(numbers)),
-#     )
-
-
 def test_data_elements(dataset):
     elements = dataset.elements().result()
     assert "H" in elements

From 756a3fec301388fb85d55176cce13494893c66d0 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Tue, 21 Apr 2026 18:05:03 +0200
Subject: [PATCH 15/16] add analysis comments before refactor

- fully separate wandb
- probably going to redo metrics as well
---
 psiflow/learning.py | 54 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/psiflow/learning.py b/psiflow/learning.py
index af2dea3..5dca750 100644
--- a/psiflow/learning.py
+++ b/psiflow/learning.py
@@ -8,13 +8,14 @@
 import numpy as np
 import typeguard
 from parsl.app.app import python_app
-from parsl.dataflow.futures import AppFuture
+from parsl.dataflow.futures import AppFuture, Future
 
 import psiflow
 from psiflow.data import Dataset
 from psiflow.geometry import Geometry#, NullState, assign_identifier
 from psiflow.hamiltonians import Hamiltonian, Zero
 from psiflow.metrics import Metrics
+from psiflow.models import MACE
 # from psiflow.models import Model
 from psiflow.reference import Reference
 from psiflow.sampling import SimulationOutput, Walker, sample
@@ -24,6 +25,23 @@
 logger = logging.getLogger(__name__)  # logging per module
 
 
+# TODO: reset_model method to remove existing MLP + train/val split
+# TODO: some plugin for data discarding + walker reset
+# TODO: some plugin for data selection / uncertainty thing
+# TODO: some plugin for wandb
+# TODO: add_data method after init?
+# TODO: clean_data method to throw away ridiculous structures
+# TODO: remove psiflow.wait calls -- but still have blocking functionality
+
+# TODO: PASSIVE LEARNING
+#  1. walk MD + store structures + track which structure belongs to which walker
+#  2. some data selection / cleanup step
+#  3. reference single point + hamiltonian eval
+#  4. compare geometries + filter + discard + keep track of walkers to reset
+#  5. update train/val and retrain
+
+
+
 # TODO: remove -> merge into one app
 @typeguard.typechecked
 def _compute_error(
@@ -100,18 +118,24 @@ def evaluate_outputs(
     return identifier, data, resets
 
 
-@typeguard.typechecked
-# @psiflow.serializable
+class Thresholds:
+    """Container for hyperparams"""
+    pass
+
+@psiflow.register_serializable
 class Learning:
+    root: Path
+    model: MACE
     reference: Reference
-    path_output: str
-    identifier: Union[AppFuture, int]
     train_valid_split: float
-    error_thresholds_for_reset: list[Optional[float]]
-    error_thresholds_for_discard: list[Optional[float]]
-    metrics: Metrics
+    identifier: Union[AppFuture, int]
+    error_tresholds: Optional[Thresholds]
+    # error_thresholds_for_reset: list[Optional[float]]
+    # error_thresholds_for_discard: list[Optional[float]]
+    # metrics: Metrics
     iteration: int
-    data: Dataset
+    wait_for: Future
+    # data: Dataset
 
     def __init__(
         self,
@@ -124,6 +148,8 @@ def __init__(
         wandb_project: Optional[str] = None,
         initial_data: Optional[Dataset] = None,
     ):
+        # TODO: accept model argument
+        # TODO: differentiate between fresh start <> restart
         self.reference = reference
         self.path_output = str(path_output)
         Path(self.path_output).mkdir(exist_ok=True, parents=True)
@@ -146,6 +172,7 @@ def __init__(
         self.iteration = -1
 
     def update(self, learning: Learning) -> None:
+        # TODO: does this make sense?
         assert self.path_output == learning.path_output
         self.train_valid_split = learning.train_valid_split
         self.error_thresholds_for_reset = learning.error_thresholds_for_reset
@@ -157,6 +184,7 @@ def update(self, learning: Learning) -> None:
         self.iteration = learning.iteration
 
     def skip(self, name: str) -> bool:
+        # TODO: upon restart, should we not immediately find how far we got and what the next action should be?
         if not (Path(self.path_output) / name).exists():
             return False
         else:  # check that the iteration has completed, or delete
@@ -169,6 +197,7 @@ def skip(self, name: str) -> bool:
                 return True
 
     def save(self, name: str, model: Model, walkers: list[Walker]) -> None:
+        # TODO: fully revamp this method
         model.save(Path(self.path_output) / name)
         self.data.save(Path(self.path_output) / name / "data.xyz")
 
@@ -217,6 +246,7 @@ def passive_learning(
         steps: int,
         **sampling_kwargs,
     ) -> tuple[Model, list[Walker]]:
+        # TODO: remove model argument + return only walkers
         self.iteration += 1
         name = "{}_{}".format(self.iteration, "passive_learning")
         if self.skip(name):
@@ -232,6 +262,9 @@ def passive_learning(
                 step = steps
             nevaluations = steps // step
             for _i in range(nevaluations):
+                # TODO: splits into multiple simulations and does data evaluation + validation in steps
+                #  -- do not think we want this.. The instruction is to collect multiple geometries per walker
+                #  -- because we should handle all data validation in one go (errors / uncertainty ...)
                 outputs = sample(
                     walkers,
                     steps=step,
@@ -239,8 +272,10 @@ def passive_learning(
                     **sampling_kwargs,
                 )
                 # only apply thresholds to forces, not to energies since large offset can exist
+                # TODO: give users the choice?
                 threshold_reset = [None, self.error_thresholds_for_reset[1]]
                 threshold_discard = [None, self.error_thresholds_for_discard[1]]
+                # TODO: do single point evaluations here
                 identifier, data, _ = evaluate_outputs(  # ignore resets
                     outputs,
                     hamiltonian,
@@ -253,6 +288,7 @@ def passive_learning(
                 self.identifier = identifier
                 self.data += data  # automatically sorted based on identifier
 
+            # TODO: keep consistent train/val splits and do not reset model
             train, valid = self.data.split(self.train_valid_split)  # also shuffles
             model.reset()
             model.initialize(train)

From e021709be7ffaea95738c13422657d86fcda4c57 Mon Sep 17 00:00:00 2001
From: pdobbelaere <pidobbel.Dobbelaere@UGent.be>
Date: Wed, 22 Apr 2026 10:43:46 +0200
Subject: [PATCH 16/16] fix test_models (again) whoops

---
 psiflow/compute.py   | 14 ++++++++++-
 tests/test_models.py | 58 ++++++++++++++++++++++----------------------
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/psiflow/compute.py b/psiflow/compute.py
index 836bbb9..a6d1802 100644
--- a/psiflow/compute.py
+++ b/psiflow/compute.py
@@ -14,6 +14,9 @@
 from psiflow.functions import Function
 from psiflow.utils.apps import pack
 
+# TODO: what with missing values in arrays?
+
+
 ComputeInput: TypeAlias = (
     Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry
 )
@@ -183,7 +186,6 @@ def _compare_results(
     **kwargs: list | np.ndarray,
 ) -> dict:
     """"""
-    # TODO: what with missing values?
     assert (result2 is None) != (len(kwargs) == 0)  # xor
     if kwargs:
         result2 = ComputeResult.from_data(result1.n_atoms, kwargs)
@@ -208,6 +210,16 @@ def _compare_results(
 compare_results = python_app(_compare_results, executors=["default_threads"])
 
 
+def _compare_arrays(arr1: Sequence, arr2: Sequence, metric: str = "RMSE") -> float:
+    """"""
+    metric_func = metric_func_map[metric]
+    arr1, arr2 = np.array(arr1), np.array(arr2)
+    return metric_func(arr1, arr2)
+
+
+compare_arrays = python_app(_compare_arrays, executors=["default_threads"])
+
+
 def input_to_geometries(data: ComputeInput) -> AppFuture:
     """Convert ComputeInput into a sequence of geometries (as a future)"""
     # Dataset | list[Geometry] | list[AppFuture] | AppFuture | Geometry
diff --git a/tests/test_models.py b/tests/test_models.py
index 9476975..7da4e80 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -4,7 +4,7 @@
 from parsl.app.futures import DataFuture
 
 import psiflow
-from psiflow.data import compute_rmse
+from psiflow.compute import compare_arrays
 from psiflow.hamiltonians import MACEHamiltonian
 from psiflow.models import MACE
 from psiflow.models.mace import KEY_ATOMIC_ENERGIES, KEY_ITERATION, MODEL_DIRS
@@ -59,19 +59,16 @@ def test_mace_train(gpu, mace_config, dataset, tmp_path):
     validation = dataset[-5:]
     path = tmp_path / "mace"
     model = MACE.create(path, mace_config)
+    [data] = validation.get(key)
 
     model.train(training, validation)
     hamiltonian = model.create_hamiltonian()
-    rmse0 = compute_rmse(
-        validation.get(key),
-        validation.evaluate(hamiltonian).get(key),
-    )
+    validation0 = hamiltonian.evaluate(validation)
+    [data0] = validation0.get(key)
     future_train = model.train(training, validation)
     hamiltonian = model.create_hamiltonian()
-    rmse1 = compute_rmse(
-        validation.get(key),
-        validation.evaluate(hamiltonian).get(key),
-    )
+    validation1 = hamiltonian.evaluate(validation)
+    [data1] = validation1.get(key)
 
     future_train.result()  # wait for second training run
     with pytest.raises(AssertionError):
@@ -81,25 +78,25 @@ def test_mace_train(gpu, mace_config, dataset, tmp_path):
     # train from load
     model_ = MACE.load(path)
     hamiltonian = model_.create_hamiltonian()
-    rmse2 = compute_rmse(
-        validation.get(key),
-        validation.evaluate(hamiltonian).get(key),
-    )
+    validation2 = hamiltonian.evaluate(validation)
+    [data2] = validation2.get(key)
     model_.train(training, validation)
     hamiltonian = model_.create_hamiltonian()
-    rmse3 = compute_rmse(
-        validation.get(key),
-        validation.evaluate(hamiltonian).get(key),
-    )
+    validation3 = hamiltonian.evaluate(validation)
+    [data3] = validation3.get(key)
 
-    print(rmse0.result())
-    print(rmse1.result())
-    print(rmse2.result())
-    print(rmse3.result())
+    rmse0 = compare_arrays(data, data0).result()
+    rmse1 = compare_arrays(data, data1).result()
+    rmse2 = compare_arrays(data, data2).result()
+    rmse3 = compare_arrays(data, data3).result()
 
-    assert rmse0.result() > rmse1.result()
-    assert np.isclose(rmse1.result(), rmse2.result())
-    assert rmse2.result() > rmse3.result()
+    print(rmse0)
+    print(rmse1)
+    print(rmse2)
+    print(rmse3)
+    assert rmse0 > rmse1
+    assert np.isclose(rmse1, rmse2)
+    assert rmse2 > rmse3
 
 
 def test_mace_hamiltonian(dataset, mace_foundation):
@@ -114,8 +111,11 @@ def test_mace_hamiltonian(dataset, mace_foundation):
     assert hamiltonian1 == hamiltonian2
     hamiltonian2.update_kwargs(enable_cueq=True)
 
-    e0 = hamiltonian0.compute(dataset, "energy")
-    e1 = hamiltonian1.compute(dataset, "energy")
-    e2 = hamiltonian2.compute(dataset, "energy")
-    assert np.allclose(e0.result(), e1.result())
-    assert np.allclose(e0.result(), e2.result())
+    out0 = hamiltonian0.compute(dataset)
+    out1 = hamiltonian1.compute(dataset)
+    out2 = hamiltonian2.compute(dataset)
+    out0, out1, out2 = out0.result(), out1.result(), out2.result()
+    for key in out0.keys:
+        # single precision
+        assert np.allclose(out0.get(key), out1.get(key), atol=1e-6)
+        assert np.allclose(out0.get(key), out2.get(key), atol=1e-6)