This repository was archived by the owner on Dec 20, 2024. It is now read-only.

Commit a847f1a

Merge branch 'develop' into feature/transformer_sequence_sharding

2 parents: e727e3c + fd2bcf1
20 files changed: +319 -77 lines

Diff for: .pre-commit-config.yaml (+3 -8)

@@ -27,7 +27,7 @@ repos:
   - id: python-check-blanket-noqa # Check for # noqa: all
   - id: python-no-log-warn # Check for log.warn
 - repo: https://github.com/psf/black-pre-commit-mirror
-  rev: 24.8.0
+  rev: 24.10.0
   hooks:
   - id: black
     args: [--line-length=120]
@@ -40,7 +40,7 @@ repos:
     - --force-single-line-imports
    - --profile black
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.9
+  rev: v0.7.2
   hooks:
   - id: ruff
     args:
@@ -59,13 +59,8 @@ repos:
   hooks:
   - id: rstfmt
     exclude: 'cli/.*' # Because we use argparse
-- repo: https://github.com/b8raoult/pre-commit-docconvert
-  rev: "0.1.5"
-  hooks:
-  - id: docconvert
-    args: ["numpy"]
 - repo: https://github.com/tox-dev/pyproject-fmt
-  rev: "2.2.4"
+  rev: "v2.5.0"
   hooks:
   - id: pyproject-fmt
 - repo: https://github.com/jshwi/docsig # Check docstrings against function sig

Diff for: CHANGELOG.md (+3)

@@ -22,6 +22,8 @@ Keep it human-readable, your future self will thank you!
 - configurabilty of the dropout probability in the the MultiHeadSelfAttention module
 - Variable Bounding as configurable model layers [#13](https://github.com/ecmwf/anemoi-models/issues/13)
 - GraphTransformerMapperBlock chunking to reduce memory usage during inference [#46](https://github.com/ecmwf/anemoi-models/pull/46)
+- Mask NaN values in training loss function [#271](https://github.com/ecmwf-lab/aifs-mono/issues/271)
+- New `NamedNodesAttributes` class to handle node attributes in a more flexible way [#64](https://github.com/ecmwf/anemoi-models/pull/64)
 - Contributors file [#69](https://github.com/ecmwf/anemoi-models/pull/69)

 ### Changed
@@ -33,6 +35,7 @@ Keep it human-readable, your future self will thank you!
 - ci: extened python versions to include 3.11 and 3.12 [#66](https://github.com/ecmwf/anemoi-models/pull/66)
 - Update copyright notice
 - Fix `__version__` import in init
+- Fix missing copyrights [#71](https://github.com/ecmwf/anemoi-models/pull/71)

 ### Removed

Diff for: docs/conf.py (+2 -2)

@@ -29,15 +29,15 @@

 project = "Anemoi Models"

-author = "ECMWF"
+author = "Anemoi contributors"

 year = datetime.datetime.now().year
 if year == 2024:
     years = "2024"
 else:
     years = "2024-%s" % (year,)

-copyright = "%s, ECMWF" % (years,)
+copyright = "%s, Anemoi contributors" % (years,)

 try:
     from anemoi.models._version import __version__

Diff for: pyproject.toml (+3 -3)

@@ -1,13 +1,12 @@
-# (C) Copyright 2024 ECMWF.
+# (C) Copyright 2024 Anemoi contributors.
 #
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.

-# https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
-
 [build-system]
 build-backend = "setuptools.build_meta"

@@ -36,6 +35,7 @@ classifiers = [
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]

Diff for: src/anemoi/models/__init__.py (+3 -1)

@@ -1,6 +1,8 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.

Diff for: src/anemoi/models/__main__.py (+2 -3)

@@ -1,12 +1,11 @@
-#!/usr/bin/env python
-# (C) Copyright 2024 ECMWF.
+# (C) Copyright 2024 Anemoi contributors.
 #
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
-#

 from anemoi.utils.cli import cli_main
 from anemoi.utils.cli import make_parser

Diff for: src/anemoi/models/commands/__init__.py (+2 -3)

@@ -1,12 +1,11 @@
-#!/usr/bin/env python
-# (C) Copyright 2024 ECMWF.
+# (C) Copyright 2024 Anemoi contributors.
 #
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
-#

 import os

Diff for: src/anemoi/models/data_indices/__init__.py (+8)

@@ -0,0 +1,8 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.

Diff for: src/anemoi/models/distributed/__init__.py (+8)

@@ -0,0 +1,8 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.

Diff for: src/anemoi/models/interface/__init__.py (+2 -2)

@@ -1,11 +1,11 @@
-# (C) Copyright 2024 ECMWF.
+# (C) Copyright 2024 Anemoi contributors.
 #
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
-#

 import uuid

Diff for: src/anemoi/models/layers/__init__.py (+8)

@@ -0,0 +1,8 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.

Diff for: src/anemoi/models/layers/graph.py (+71 -1)

@@ -12,6 +12,7 @@
 import torch
 from torch import Tensor
 from torch import nn
+from torch_geometric.data import HeteroData


 class TrainableTensor(nn.Module):
@@ -36,8 +37,77 @@ def __init__(self, tensor_size: int, trainable_size: int) -> None:
     def forward(self, x: Tensor, batch_size: int) -> Tensor:
         latent = [einops.repeat(x, "e f -> (repeat e) f", repeat=batch_size)]
         if self.trainable is not None:
-            latent.append(einops.repeat(self.trainable, "e f -> (repeat e) f", repeat=batch_size))
+            latent.append(einops.repeat(self.trainable.to(x.device), "e f -> (repeat e) f", repeat=batch_size))
         return torch.cat(
             latent,
             dim=-1,  # feature dimension
         )
+
+
+class NamedNodesAttributes(nn.Module):
+    """Named Nodes Attributes information.
+
+    Attributes
+    ----------
+    num_nodes : dict[str, int]
+        Number of nodes for each group of nodes.
+    attr_ndims : dict[str, int]
+        Total dimension of node attributes (non-trainable + trainable) for each group of nodes.
+    trainable_tensors : nn.ModuleDict
+        Dictionary of trainable tensors for each group of nodes.
+
+    Methods
+    -------
+    get_coordinates(self, name: str) -> Tensor
+        Get the coordinates of a set of nodes.
+    forward( self, name: str, batch_size: int) -> Tensor
+        Get the node attributes to be passed trough the graph neural network.
+    """
+
+    num_nodes: dict[str, int]
+    attr_ndims: dict[str, int]
+    trainable_tensors: dict[str, TrainableTensor]
+
+    def __init__(self, num_trainable_params: int, graph_data: HeteroData) -> None:
+        """Initialize NamedNodesAttributes."""
+        super().__init__()
+
+        self.define_fixed_attributes(graph_data, num_trainable_params)
+
+        self.trainable_tensors = nn.ModuleDict()
+        for nodes_name, nodes in graph_data.node_items():
+            self.register_coordinates(nodes_name, nodes.x)
+            self.register_tensor(nodes_name, num_trainable_params)
+
+    def define_fixed_attributes(self, graph_data: HeteroData, num_trainable_params: int) -> None:
+        """Define fixed attributes."""
+        nodes_names = list(graph_data.node_types)
+        self.num_nodes = {nodes_name: graph_data[nodes_name].num_nodes for nodes_name in nodes_names}
+        self.attr_ndims = {
+            nodes_name: 2 * graph_data[nodes_name].x.shape[1] + num_trainable_params for nodes_name in nodes_names
+        }
+
+    def register_coordinates(self, name: str, node_coords: Tensor) -> None:
+        """Register coordinates."""
+        sin_cos_coords = torch.cat([torch.sin(node_coords), torch.cos(node_coords)], dim=-1)
+        self.register_buffer(f"latlons_{name}", sin_cos_coords, persistent=True)
+
+    def get_coordinates(self, name: str) -> Tensor:
+        """Return original coordinates."""
+        sin_cos_coords = getattr(self, f"latlons_{name}")
+        ndim = sin_cos_coords.shape[1] // 2
+        sin_values = sin_cos_coords[:, :ndim]
+        cos_values = sin_cos_coords[:, ndim:]
+        return torch.atan2(sin_values, cos_values)
+
+    def register_tensor(self, name: str, num_trainable_params: int) -> None:
+        """Register a trainable tensor."""
+        self.trainable_tensors[name] = TrainableTensor(self.num_nodes[name], num_trainable_params)
+
+    def forward(self, name: str, batch_size: int) -> Tensor:
+        """Returns the node attributes to be passed trough the graph neural network.
+
+        It includes both the coordinates and the trainable parameters.
+        """
+        latlons = getattr(self, f"latlons_{name}")
+        return self.trainable_tensors[name](latlons, batch_size)
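Two things change in this file: TrainableTensor.forward now moves the trainable parameter onto the input's device before concatenation, and a new NamedNodesAttributes module centralises per-node-set coordinates and trainable tensors. Below is a minimal usage sketch of the new class, not part of the commit: the import path is the one introduced in this diff, while the toy graph, the node-set names ("data", "hidden") and all sizes are illustrative assumptions.

    # Hypothetical example: build a tiny two-node-set graph and query its attributes.
    import torch
    from torch_geometric.data import HeteroData

    from anemoi.models.layers.graph import NamedNodesAttributes  # module added in this commit

    graph = HeteroData()
    graph["data"].x = torch.rand(40, 2) - 0.5    # toy lat/lon coordinates in radians
    graph["hidden"].x = torch.rand(10, 2) - 0.5

    node_attributes = NamedNodesAttributes(num_trainable_params=8, graph_data=graph)

    # attr_ndims[name] = 2 * coord_dim + num_trainable_params = 2 * 2 + 8 = 12
    print(node_attributes.attr_ndims)   # {'data': 12, 'hidden': 12}
    print(node_attributes.num_nodes)    # {'data': 40, 'hidden': 10}

    # forward(name, batch_size) repeats the sin/cos coordinates over the batch and
    # concatenates the per-node trainable tensor along the feature dimension.
    x_data = node_attributes("data", batch_size=4)
    print(x_data.shape)                 # torch.Size([160, 12])

    # get_coordinates recovers the original coordinates via atan2(sin, cos).
    print(node_attributes.get_coordinates("data").shape)   # torch.Size([40, 2])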

Diff for: src/anemoi/models/models/__init__.py (+8)

@@ -0,0 +1,8 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.

Diff for: src/anemoi/models/models/encoder_processor_decoder.py (+14 -51)

@@ -22,7 +22,7 @@
 from torch_geometric.data import HeteroData

 from anemoi.models.distributed.shapes import get_shape_shards
-from anemoi.models.layers.graph import TrainableTensor
+from anemoi.models.layers.graph import NamedNodesAttributes

 LOGGER = logging.getLogger(__name__)

@@ -56,42 +56,33 @@ def __init__(

         self._calculate_shapes_and_indices(data_indices)
         self._assert_matching_indices(data_indices)
-
-        self.multi_step = model_config.training.multistep_input
-
-        self._define_tensor_sizes(model_config)
-
-        # Create trainable tensors
-        self._create_trainable_attributes()
-
-        # Register lat/lon of nodes
-        self._register_latlon("data", self._graph_name_data)
-        self._register_latlon("hidden", self._graph_name_hidden)
-
         self.data_indices = data_indices

+        self.multi_step = model_config.training.multistep_input
         self.num_channels = model_config.model.num_channels

-        input_dim = self.multi_step * self.num_input_channels + self.latlons_data.shape[1] + self.trainable_data_size
+        self.node_attributes = NamedNodesAttributes(model_config.model.trainable_parameters.hidden, self._graph_data)
+
+        input_dim = self.multi_step * self.num_input_channels + self.node_attributes.attr_ndims[self._graph_name_data]

         # Encoder data -> hidden
         self.encoder = instantiate(
             model_config.model.encoder,
             in_channels_src=input_dim,
-            in_channels_dst=self.latlons_hidden.shape[1] + self.trainable_hidden_size,
+            in_channels_dst=self.node_attributes.attr_ndims[self._graph_name_hidden],
             hidden_dim=self.num_channels,
             sub_graph=self._graph_data[(self._graph_name_data, "to", self._graph_name_hidden)],
-            src_grid_size=self._data_grid_size,
-            dst_grid_size=self._hidden_grid_size,
+            src_grid_size=self.node_attributes.num_nodes[self._graph_name_data],
+            dst_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
         )

         # Processor hidden -> hidden
         self.processor = instantiate(
             model_config.model.processor,
             num_channels=self.num_channels,
             sub_graph=self._graph_data[(self._graph_name_hidden, "to", self._graph_name_hidden)],
-            src_grid_size=self._hidden_grid_size,
-            dst_grid_size=self._hidden_grid_size,
+            src_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
+            dst_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
         )

         # Decoder hidden -> data
@@ -102,8 +93,8 @@ def __init__(
             hidden_dim=self.num_channels,
             out_channels_dst=self.num_output_channels,
             sub_graph=self._graph_data[(self._graph_name_hidden, "to", self._graph_name_data)],
-            src_grid_size=self._hidden_grid_size,
-            dst_grid_size=self._data_grid_size,
+            src_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
+            dst_grid_size=self.node_attributes.num_nodes[self._graph_name_data],
         )

         # Instantiation of model output bounding functions (e.g., to ensure outputs like TP are positive definite)
@@ -133,34 +124,6 @@ def _assert_matching_indices(self, data_indices: dict) -> None:
             self._internal_output_idx,
         ), f"Internal model indices must match {self._internal_input_idx} != {self._internal_output_idx}"

-    def _define_tensor_sizes(self, config: DotDict) -> None:
-        self._data_grid_size = self._graph_data[self._graph_name_data].num_nodes
-        self._hidden_grid_size = self._graph_data[self._graph_name_hidden].num_nodes
-
-        self.trainable_data_size = config.model.trainable_parameters.data
-        self.trainable_hidden_size = config.model.trainable_parameters.hidden
-
-    def _register_latlon(self, name: str, nodes: str) -> None:
-        """Register lat/lon buffers.
-
-        Parameters
-        ----------
-        name : str
-            Name to store the lat-lon coordinates of the nodes.
-        nodes : str
-            Name of nodes to map
-        """
-        coords = self._graph_data[nodes].x
-        sin_cos_coords = torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
-        self.register_buffer(f"latlons_{name}", sin_cos_coords, persistent=True)
-
-    def _create_trainable_attributes(self) -> None:
-        """Create all trainable attributes."""
-        self.trainable_data = TrainableTensor(trainable_size=self.trainable_data_size, tensor_size=self._data_grid_size)
-        self.trainable_hidden = TrainableTensor(
-            trainable_size=self.trainable_hidden_size, tensor_size=self._hidden_grid_size
-        )
-
     def _run_mapper(
         self,
         mapper: nn.Module,
@@ -210,12 +173,12 @@ def forward(self, x: Tensor, model_comm_group: Optional[ProcessGroup] = None) ->
         x_data_latent = torch.cat(
             (
                 einops.rearrange(x, "batch time ensemble grid vars -> (batch ensemble grid) (time vars)"),
-                self.trainable_data(self.latlons_data, batch_size=batch_size),
+                self.node_attributes(self._graph_name_data, batch_size=batch_size),
             ),
             dim=-1,  # feature dimension
         )

-        x_hidden_latent = self.trainable_hidden(self.latlons_hidden, batch_size=batch_size)
+        x_hidden_latent = self.node_attributes(self._graph_name_hidden, batch_size=batch_size)

         # get shard shapes
         shard_shapes_data = get_shape_shards(x_data_latent, 0, model_comm_group)
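This refactor replaces the model's own _define_tensor_sizes, _register_latlon and _create_trainable_attributes helpers with the single NamedNodesAttributes instance. Note that it is constructed from model_config.model.trainable_parameters.hidden alone, so one trainable size is now used for every node set, whereas the removed code read separate data and hidden sizes. A worked sketch of the resulting dimension bookkeeping follows; all numbers are hypothetical and do not come from the commit.

    # Hypothetical configuration values, for illustration only.
    multi_step = 2             # model_config.training.multistep_input
    num_input_channels = 90    # input channels per time step
    coord_dim = 2              # lat/lon; the sin/cos encoding doubles this
    num_trainable_params = 8   # model_config.model.trainable_parameters.hidden

    # NamedNodesAttributes.attr_ndims[name] = 2 * coord_dim + num_trainable_params
    attr_ndims = 2 * coord_dim + num_trainable_params           # 12, same for every node set

    # Encoder source channels (data nodes), as in the refactored __init__:
    input_dim = multi_step * num_input_channels + attr_ndims    # 2 * 90 + 12 = 192
    # Encoder destination channels (hidden nodes):
    in_channels_dst = attr_ndims                                 # 12

    # Grid sizes now come from node_attributes.num_nodes[...] rather than the
    # removed self._data_grid_size / self._hidden_grid_size attributes.
    print(input_dim, in_channels_dst)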
