Skip to content

Commit 59f6fb2

Browse files
committed
Move load_group function to utils.py, add get_path_by_uuid to Logger, add get_group_subpath method.
1 parent d5c2d08 commit 59f6fb2

File tree

6 files changed

+135
-89
lines changed

6 files changed

+135
-89
lines changed

src/aiida/cmdline/commands/cmd_profile.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -433,18 +433,26 @@ def profile_mirror(
433433
if incremental:
434434
msg = 'Incremental mirroring selected. Will update directory.'
435435
echo.echo_report(msg)
436+
else:
437+
msg = 'Overwriting selected. Will clean directory first.'
438+
# TODO: Maybe add y/n confirmation here?
439+
echo.echo_report(msg)
436440

437441
if num_processes_to_dump == 0:
438-
echo.echo_success('No processes to dump.')
442+
msg = 'No processes to dump.'
443+
echo.echo_success(msg)
439444
else:
440445
profile_dumper.dump_processes()
441-
echo.echo_success(f'Dumped {num_processes_to_dump} new nodes.')
446+
msg = f'Dumped {num_processes_to_dump} new nodes.'
447+
echo.echo_success(msg)
442448

443449
if delete_missing:
450+
# breakpoint()
444451
if num_processes_to_delete == 0:
445452
echo.echo_success('No processes to delete.')
446453
else:
447454
profile_dumper.delete_processes()
455+
448456
echo.echo_success(f'Deleted {num_processes_to_delete} node directories.')
449457

450458
# Append the current dump time to dumping safeguard file

src/aiida/tools/dumping/collection.py

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
NodeDumpKeyMapper,
2626
ProcessesToDumpContainer,
2727
filter_nodes_last_dump_time,
28+
load_given_group,
2829
)
2930

3031
logger = AIIDA_LOGGER.getChild('tools.dumping')
@@ -62,7 +63,7 @@ def __init__(
6263
raise Exception(msg)
6364

6465
elif group is not None:
65-
self.group = _validate_group(group)
66+
self.group = load_given_group(group)
6667
if self.group:
6768
self._collection_nodes = [n.uuid for n in self.group.nodes]
6869

@@ -85,9 +86,11 @@ def __init__(
8586

8687
@property
8788
def collection_nodes(self) -> list[str]:
88-
"""Return collection nodes.
89+
"""Property to hold the collection nodes.
8990
90-
:return: List of collection node identifiers.
91+
Takes care of respecting the ``incremental`` attribute, and filtering by ``last_dump_time``.
92+
93+
:return: List of collection node UUIDs.
9194
"""
9295
if self.incremental and self.last_dump_time:
9396
self._collection_nodes = filter_nodes_last_dump_time(
@@ -100,7 +103,9 @@ def collection_nodes(self) -> list[str]:
100103
def processes_to_dump(self) -> ProcessesToDumpContainer:
101104
"""Get the processes to dump from the collection of nodes.
102105
103-
:return: Instance of the ``ProcessesToDump`` class containing the selected calculations and workflows.
106+
Only re-evaluates the processes, if not already set.
107+
108+
:return: Instance of a ``ProcessesToDumpContainer``, that holds the selected calculations and workflows.
104109
"""
105110
if not self._processes_to_dump:
106111
self._processes_to_dump = self._get_processes_to_dump()
@@ -109,16 +114,13 @@ def processes_to_dump(self) -> ProcessesToDumpContainer:
109114
def _get_processes_to_dump(self) -> ProcessesToDumpContainer:
110115
"""Retrieve the processeses from the collection nodes.
111116
112-
If deduplication is selected, this method takes care of only dumping top-level workflows and only dump
113-
calculations in their own designated directories if they are not part of a workflow.
117+
Depending on the attributes of the ``CollectionDumper``, this method takes care of only selecting top-level
118+
workflows and calculations if they are not part of a workflow. This requires to use the actual ORM entities,
119+
rather than UUIDs, as the ``.caller``s have to be checked. In addition, sub-calculations
114120
115-
:return: Instance of the ``ProcessesToDump`` class containing the selected calculations and workflows.
121+
:return: Instance of a ``ProcessesToDumpContainer``, that holds the selected calculations and workflows.
116122
"""
117123

118-
# Deduplication is already handled in the ``get_processes`` method, where PKs/UUIDs are used, rather than AiiDA
119-
# ORM entities as here. Specifically, calculations that are part of a workflow are not dumpid in their own,
120-
# dedicated directory if they are part of a workflow.
121-
122124
if not self.collection_nodes:
123125
return ProcessesToDumpContainer(calculations=[], workflows=[])
124126

@@ -136,8 +138,8 @@ def _get_processes_to_dump(self) -> ProcessesToDumpContainer:
136138
else:
137139
calculations = [node for node in nodes_orm if isinstance(node, orm.CalculationNode)]
138140

139-
# Get sub-calculations that were called by workflows of the group, and which are not
140-
# contained in the group.nodes directly
141+
# Get sub-calculations that were called by workflows but which might themselves not be directly contained in
142+
# the collection
141143
called_calculations = []
142144
for workflow in workflows:
143145
called_calculations += [
@@ -147,13 +149,17 @@ def _get_processes_to_dump(self) -> ProcessesToDumpContainer:
147149
# Convert to set to avoid duplicates
148150
calculations = list(set(calculations + called_calculations))
149151

152+
# Use this small helper class rather than returning a dictionary for access via dot-notation
150153
return ProcessesToDumpContainer(
151154
calculations=calculations,
152155
workflows=workflows,
153156
)
154157

155158
def _dump_processes(self, processes: list[orm.CalculationNode] | list[orm.WorkflowNode]) -> None:
156-
"""Dump a collection of processes."""
159+
"""Dump a list of AiiDA calculations or workflows to disk.
160+
161+
:param processes: List of AiiDA calculations or workflows from the ``ProcessesToDumpContainer``.
162+
"""
157163

158164
if len(list(processes)) == 0:
159165
return
@@ -164,13 +170,12 @@ def _dump_processes(self, processes: list[orm.CalculationNode] | list[orm.Workfl
164170
sub_path.mkdir(exist_ok=True, parents=True)
165171

166172
logger_attr = NodeDumpKeyMapper.get_key_from_node(node=next(iter(processes)))
167-
# ! `getattr` gives a reference to the object, thus I can update the store directly
173+
# ! `getattr` gives a reference to the actual object, thus I can update the store directly
168174
current_store = getattr(self.dump_logger.log, logger_attr)
169175

170-
# breakpoint()
176+
process_dumper = self.process_dumper
171177

172178
for process in processes:
173-
process_dumper = self.process_dumper
174179

175180
process_dump_path = sub_path / process_dumper._generate_default_dump_path(process_node=process, prefix=None)
176181

@@ -225,24 +230,3 @@ def dump(self, output_path: Path | None = None) -> None:
225230
self._dump_processes(processes=collection_processes.workflows)
226231
if len(collection_processes.calculations) > 0:
227232
self._dump_processes(processes=collection_processes.calculations)
228-
229-
230-
def _validate_group(group: orm.Group | str) -> orm.Group | None:
231-
"""Validate the given group identifier.
232-
233-
:param group: The group identifier to validate.
234-
:return: Insance of ``orm.Group``.
235-
:raises NotExistent: If no ``orm.Group`` can be loaded for a given label.
236-
"""
237-
238-
if isinstance(group, str):
239-
try:
240-
return orm.load_group(group)
241-
# `load_group` raises the corresponding errors
242-
except NotExistent:
243-
raise
244-
except:
245-
raise
246-
247-
elif isinstance(group, orm.Group):
248-
return group

src/aiida/tools/dumping/logger.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from pathlib import Path
1414
from typing import Collection
1515

16+
from aiida.common.exceptions import NotExistent
17+
1618

1719
@dataclass
1820
class DumpLog:
@@ -195,11 +197,31 @@ def deserialize_logs(category_data: dict) -> DumpLogStore:
195197

196198
return instance
197199

198-
def find_store_by_uuid(self, uuid: str) -> DumpLogStore | None:
200+
def get_store_by_uuid(self, uuid: str) -> DumpLogStore:
199201
"""Find the store that contains the given UUID."""
200202
# Iterate over the fields of the DumpLogStoreCollection dataclass for generality
203+
# TODO: Add error handling for wrong UUID
201204
for field_ in fields(self.log):
202205
store = getattr(self.log, field_.name)
203206
if uuid in store.entries:
204207
return store
205-
return None
208+
209+
msg = f"No corresponding `DumpLogStore` found for UUID: `{uuid}`."
210+
raise NotExistent(msg)
211+
212+
def get_path_by_uuid(self, uuid: str) -> Path | None:
213+
"""Find the store that contains the given UUID."""
214+
# Delegate the store lookup to ``get_store_by_uuid``, then resolve the entry's path
215+
216+
try:
217+
current_store = self.get_store_by_uuid(uuid=uuid)
218+
path = current_store.entries[uuid].path
219+
return path
220+
except NotExistent as exc:
221+
raise NotExistent(exc.args[0]) from exc
222+
except KeyError as exc:
223+
msg = f"UUID: `{uuid}` not contained in store `{current_store}`."
224+
raise KeyError(msg) from exc
225+
except:
226+
# For debugging
227+
raise

src/aiida/tools/dumping/process.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(
4848

4949
super().__init__(base_dump_config=self.base_dump_config, dump_logger=dump_logger)
5050

51-
# Unpack arguments for ProcessDumper for easier access
51+
# Unpack arguments for easier access
5252
self.include_inputs = self.process_dump_config.include_inputs
5353
self.include_outputs = self.process_dump_config.include_outputs
5454
self.include_attributes = self.process_dump_config.include_attributes

0 commit comments

Comments
 (0)