From 2a88ece93397dea03a532b7f90e92b129f8c8111 Mon Sep 17 00:00:00 2001
From: b8raoult <53792887+b8raoult@users.noreply.github.com>
Date: Fri, 11 Oct 2024 14:50:21 +0100
Subject: [PATCH 1/8] add link to transform (#82)

---
 CHANGELOG.md   | 4 ++++
 docs/conf.py   | 4 ++++
 docs/index.rst | 1 +
 3 files changed, 9 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dfe32421..7eff57d2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@ Keep it human-readable, your future self will thank you!
 
 ## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...HEAD)
 
+### Added
+
+- Add anemoi-transform link to documentation
+
 ## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09
 
 ## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04
diff --git a/docs/conf.py b/docs/conf.py
index b14bb99a..790061fb 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -107,6 +107,10 @@
         "https://anemoi-registry.readthedocs.io/en/latest/",
         ("../../anemoi-registry/docs/_build/html/objects.inv", None),
     ),
+    "anemoi-transform": (
+        "https://anemoi-transform.readthedocs.io/en/latest/",
+        ("../../anemoi-transform/docs/_build/html/objects.inv", None),
+    ),
 }
 
 
diff --git a/docs/index.rst b/docs/index.rst
index fbddc874..b7e7abdc 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -121,6 +121,7 @@ datasets <building-introduction>`.
 *****************
 
 -  :ref:`anemoi-utils <anemoi-utils:index-page>`
+-  :ref:`anemoi-transform <anemoi-transform:index-page>`
 -  :ref:`anemoi-datasets <anemoi-datasets:index-page>`
 -  :ref:`anemoi-models <anemoi-models:index-page>`
 -  :ref:`anemoi-graphs <anemoi-graphs:index-page>`

From 65ad2dd92f2865ab9e7c19b9be8e46206d05ac7f Mon Sep 17 00:00:00 2001
From: b8raoult <53792887+b8raoult@users.noreply.github.com>
Date: Fri, 11 Oct 2024 14:51:10 +0100
Subject: [PATCH 2/8] Fill missing dates by interpolating data (#81)

* add interpolate and closest
---
 CHANGELOG.md                             |   8 +-
 docs/using/code/fill_missing_dates1_.py  |   1 +
 docs/using/code/fill_missing_dates2_.py  |   1 +
 docs/using/code/missing_dates_.py        |   1 -
 docs/using/code/set_missing_dates_.py    |   1 +
 docs/using/missing.rst                   |  23 +++-
 pyproject.toml                           |  33 +----
 src/anemoi/datasets/data/dataset.py      |  26 ++--
 src/anemoi/datasets/data/fill_missing.py | 162 +++++++++++++++++++++++
 9 files changed, 213 insertions(+), 43 deletions(-)
 create mode 100644 docs/using/code/fill_missing_dates1_.py
 create mode 100644 docs/using/code/fill_missing_dates2_.py
 delete mode 100644 docs/using/code/missing_dates_.py
 create mode 100644 docs/using/code/set_missing_dates_.py
 create mode 100644 src/anemoi/datasets/data/fill_missing.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7eff57d2..daa40d14 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,14 +16,16 @@ Keep it human-readable, your future self will thank you!
 
 ## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09
 
-## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04
+### Changed
 
-- Update documentation
+- Add support to fill missing dates
+
+## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04
 
-- Update documentation
 ### Changed
 
 - Add `variables_metadata` entry in the dataset metadata
+- Update documentation
 
 ### Changed
 
diff --git a/docs/using/code/fill_missing_dates1_.py b/docs/using/code/fill_missing_dates1_.py
new file mode 100644
index 00000000..93d78ccf
--- /dev/null
+++ b/docs/using/code/fill_missing_dates1_.py
@@ -0,0 +1 @@
+ds = open_dataset(dataset, fill_missing_dates="interpolate")
diff --git a/docs/using/code/fill_missing_dates2_.py b/docs/using/code/fill_missing_dates2_.py
new file mode 100644
index 00000000..6567aaf4
--- /dev/null
+++ b/docs/using/code/fill_missing_dates2_.py
@@ -0,0 +1 @@
+ds = open_dataset(dataset, fill_missing_dates="closest")
diff --git a/docs/using/code/missing_dates_.py b/docs/using/code/missing_dates_.py
deleted file mode 100644
index 5c2da45d..00000000
--- a/docs/using/code/missing_dates_.py
+++ /dev/null
@@ -1 +0,0 @@
-ds = open_dataset(dataset, missing_dates=["2010-01-01T12:00:00", "2010-02-01T12:00:00"])
diff --git a/docs/using/code/set_missing_dates_.py b/docs/using/code/set_missing_dates_.py
new file mode 100644
index 00000000..35f35ed6
--- /dev/null
+++ b/docs/using/code/set_missing_dates_.py
@@ -0,0 +1 @@
+ds = open_dataset(dataset, set_missing_dates=["2010-01-01T12:00:00", "2010-02-01T12:00:00"])
diff --git a/docs/using/missing.rst b/docs/using/missing.rst
index 3388a444..e7c6b9d4 100644
--- a/docs/using/missing.rst
+++ b/docs/using/missing.rst
@@ -4,6 +4,25 @@
  Managing missing dates
 ########################
 
+**************************************************
+ Filling the missing dates with artificial values
+**************************************************
+
+When you have missing dates in a dataset, you can fill them with
+artificial values. You can either fill them with values that are the
+result of a linear interpolation between the two closest dates:
+
+.. literalinclude:: code/fill_missing_dates1_.py
+
+Or you can copy the value of the closest date:
+
+.. literalinclude:: code/fill_missing_dates2_.py
+
+If the missing date is exactly in the middle of two dates, the library
+will choose the value of the later date. You can change this behavior
+by setting the ``closest`` parameter to ``'down'`` or ``'up'``
+explicitly.
+
 ************************************************
  Skipping missing when iterating over a dataset
 ************************************************
@@ -72,7 +91,7 @@ the datasets to make the dates contiguous.
  Debugging
 ***********
 
-You can set missing dates using the ``missing_dates`` option. This
+You can set missing dates using the ``set_missing_dates`` option. This
 option is for debugging purposes only.
 
-.. literalinclude:: code/missing_dates_.py
+.. literalinclude:: code/set_missing_dates_.py
diff --git a/pyproject.toml b/pyproject.toml
index 5210d87c..797cfd0b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dynamic = [
   "version",
 ]
 dependencies = [
-  "anemoi-utils[provenance]>=0.3.15",
+  "anemoi-utils[provenance]>=0.3.18",
   "cfunits",
   "numpy",
   "pyyaml",
@@ -60,43 +60,20 @@ dependencies = [
 ]
 
 optional-dependencies.all = [
-  "boto3",
-  "earthkit-data[mars]>=0.9",
-  "earthkit-geo>=0.2",
-  "earthkit-meteo",
-  "ecmwflibs>=0.6.3",
-  "entrypoints",
-  "gcsfs",
-  "kerchunk",
-  "pyproj",
-  "requests",
+  "anemoi-datasets[create,remote,xarray]",
 ]
 
 optional-dependencies.create = [
-  "earthkit-data[mars]>=0.9",
+  "earthkit-data[mars]>=0.10.7",
   "earthkit-geo>=0.2",
   "earthkit-meteo",
-  "ecmwflibs>=0.6.3",
+  "eccodes>=2.38.1",
   "entrypoints",
   "pyproj",
 ]
 
 optional-dependencies.dev = [
-  "boto3",
-  "earthkit-data[mars]>=0.9",
-  "earthkit-geo>=0.2",
-  "earthkit-meteo",
-  "ecmwflibs>=0.6.3",
-  "entrypoints",
-  "gcsfs",
-  "kerchunk",
-  "nbsphinx",
-  "pandoc",
-  "pyproj",
-  "pytest",
-  "requests",
-  "sphinx",
-  "sphinx-rtd-theme",
+  "anemoi-datasets[all,docs,tests]",
 ]
 
 optional-dependencies.docs = [
diff --git a/src/anemoi/datasets/data/dataset.py b/src/anemoi/datasets/data/dataset.py
index 2f6af1b1..56275d1e 100644
--- a/src/anemoi/datasets/data/dataset.py
+++ b/src/anemoi/datasets/data/dataset.py
@@ -41,6 +41,14 @@ def _subset(self, **kwargs):
         if not kwargs:
             return self.mutate()
 
+        # This one must be first
+        if "fill_missing_dates" in kwargs:
+            from .fill_missing import fill_missing_dates_factory
+
+            fill_missing_dates = kwargs.pop("fill_missing_dates")
+            ds = fill_missing_dates_factory(self, fill_missing_dates, kwargs)
+            return ds._subset(**kwargs).mutate()
+
         if "start" in kwargs or "end" in kwargs:
             start = kwargs.pop("start", None)
             end = kwargs.pop("end", None)
@@ -64,12 +72,6 @@ def _subset(self, **kwargs):
                 .mutate()
             )
 
-        if "interpolate_frequency" in kwargs:
-            from .interpolate import InterpolateFrequency
-
-            interpolate_frequency = kwargs.pop("interpolate_frequency")
-            return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate()
-
         if "select" in kwargs:
             from .select import Select
 
@@ -121,11 +123,11 @@ def _subset(self, **kwargs):
             bbox = kwargs.pop("area")
             return Cropping(self, bbox)._subset(**kwargs).mutate()
 
-        if "missing_dates" in kwargs:
+        if "set_missing_dates" in kwargs:
             from .missing import MissingDates
 
-            missing_dates = kwargs.pop("missing_dates")
-            return MissingDates(self, missing_dates)._subset(**kwargs).mutate()
+            set_missing_dates = kwargs.pop("set_missing_dates")
+            return MissingDates(self, set_missing_dates)._subset(**kwargs).mutate()
 
         if "skip_missing_dates" in kwargs:
             from .missing import SkipMissingDates
@@ -139,6 +141,12 @@ def _subset(self, **kwargs):
             if skip_missing_dates:
                 return SkipMissingDates(self, expected_access)._subset(**kwargs).mutate()
 
+        if "interpolate_frequency" in kwargs:
+            from .interpolate import InterpolateFrequency
+
+            interpolate_frequency = kwargs.pop("interpolate_frequency")
+            return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate()
+
         # Keep last
         if "shuffle" in kwargs:
             from .subset import Subset
diff --git a/src/anemoi/datasets/data/fill_missing.py b/src/anemoi/datasets/data/fill_missing.py
new file mode 100644
index 00000000..ca16fd65
--- /dev/null
+++ b/src/anemoi/datasets/data/fill_missing.py
@@ -0,0 +1,162 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import logging
+
+import numpy as np
+
+from anemoi.datasets.data import MissingDateError
+
+from .debug import Node
+from .debug import debug_indexing
+from .forwards import Forwards
+from .indexing import apply_index_to_slices_changes
+from .indexing import expand_list_indexing
+from .indexing import index_to_slices
+from .indexing import update_tuple
+
+LOG = logging.getLogger(__name__)
+
+
+class MissingDatesFill(Forwards):
+    def __init__(self, dataset):
+        super().__init__(dataset)
+        self._missing = set(dataset.missing)
+        self._warnings = set()
+
+    @debug_indexing
+    @expand_list_indexing
+    def _get_tuple(self, index):
+        index, changes = index_to_slices(index, self.shape)
+        index, previous = update_tuple(index, 0, slice(None))
+        result = self._get_slice(previous)
+        return apply_index_to_slices_changes(result[index], changes)
+
+    def _get_slice(self, s):
+        return np.stack([self[i] for i in range(*s.indices(self._len))])
+
+    @property
+    def missing(self):
+        return set()
+
+    @debug_indexing
+    def __getitem__(self, n):
+        """Return item `n`, filling it via `_fill_missing` when its date is missing."""
+        try:
+            return self.forward[n]
+        except MissingDateError:
+            pass
+        # The wrapped dataset raised: fall back to per-date access for tuples/slices.
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
+
+        if isinstance(n, slice):
+            return self._get_slice(n)
+
+        if n < 0:
+            n += self._len
+        # `a`: closest non-missing index at or before `n` (None if there is none).
+        a = None
+        i = n
+        while a is None and i >= 0:
+            if i in self._missing:
+                i -= 1
+            else:
+                a = i
+        # `b`: closest non-missing index at or after `n` (None if there is none).
+        size = self._len
+        b = None
+        i = n
+        while b is None and i < size:
+            if i in self._missing:
+                i += 1
+            else:
+                b = i
+
+        return self._fill_missing(n, a, b)
+
+
+class MissingDatesClosest(MissingDatesFill):
+
+    def __init__(self, dataset, closest):
+        super().__init__(dataset)
+        self.closest = closest
+        self._closest = {}
+
+    def _fill_missing(self, n, a, b):
+        """Return the data at the valid index (`a` or `b`) closest to missing index `n`."""
+        if n not in self._warnings:
+            LOG.warning(f"Missing date at index {n} ({self.dates[n]})")
+            if a is None and b is None:
+                raise MissingDateError(f"Cannot fill index {n} ({self.dates[n]}): no valid date found")
+            if a is None or b is None:
+                # Only one side has a valid date (first or last dates missing): use it.
+                u = b if a is None else a
+            elif abs(n - a) == abs(b - n):
+                # Tie: `closest="up"` picks the later date, anything else the earlier one.
+                u = b if self.closest == "up" else a
+            else:
+                u = a if abs(n - a) < abs(b - n) else b
+            LOG.warning(f"Using closest date {u} ({self.dates[u]})")
+
+            self._closest[n] = u
+            self._warnings.add(n)
+
+        return self.forward[self._closest[n]]
+
+    def subclass_metadata_specific(self):
+        return {"closest": self.closest}
+
+    def tree(self):
+        return Node(self, [self.forward.tree()], closest=self.closest)
+
+
+class MissingDatesInterpolate(MissingDatesFill):
+    def __init__(self, dataset):
+        super().__init__(dataset)
+        self._alpha = {}
+
+    def _fill_missing(self, n, a, b):
+        if n not in self._warnings:
+            LOG.warning(f"Missing date at index {n} ({self.dates[n]})")
+
+            if a is None or b is None:
+                raise MissingDateError(
+                    f"Cannot interpolate at index {n} ({self.dates[n]}). Are the first or last date missing?"
+                )
+
+            assert a < n < b, (a, n, b)
+
+            alpha = (n - a) / (b - a)
+            assert 0 < alpha < 1, alpha
+
+            LOG.warning(f"Interpolating between index {a} ({self.dates[a]}) and {b} ({self.dates[b]})")
+            LOG.warning(f"Interpolation {1 - alpha:g} * ({self.dates[a]}) + {alpha:g} * ({self.dates[b]})")
+
+            self._alpha[n] = alpha
+
+            self._warnings.add(n)
+
+        alpha = self._alpha[n]
+        return self.forward[a] * (1 - alpha) + self.forward[b] * alpha
+
+    def subclass_metadata_specific(self):
+        return {}
+
+    def tree(self):
+        return Node(self, [self.forward.tree()])
+
+
+def fill_missing_dates_factory(dataset, method, kwargs):
+    if method == "closest":
+        closest = kwargs.get("closest", "up")
+        return MissingDatesClosest(dataset, closest=closest)
+
+    if method == "interpolate":
+        return MissingDatesInterpolate(dataset)
+
+    raise ValueError(f"Invalid `fill_missing_dates` method '{method}'")

From 4507a06e6f4cfed04e102d1a1c2dbf9d88c6b3c2 Mon Sep 17 00:00:00 2001
From: Helen Theissen <helen.theissen@ecmwf.int>
Date: Fri, 11 Oct 2024 17:25:44 +0100
Subject: [PATCH 3/8] Fix/remove upstream deps from ci (#83)

* chore(deps): remove upstream deps from ci

* chore: use branch of downstream ci

* fix: revert back to main on downstream-ci

* fix: add anemoi-utils back in

* docs: update changelog
---
 .github/ci-config.yml     | 6 ------
 .github/ci-hpc-config.yml | 9 ---------
 CHANGELOG.md              | 5 +++++
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/.github/ci-config.yml b/.github/ci-config.yml
index 6138e636..f712f26f 100644
--- a/.github/ci-config.yml
+++ b/.github/ci-config.yml
@@ -1,9 +1,3 @@
-dependencies: |
-  ecmwf/ecbuild
-  MathisRosenhauer/libaec@master
-  ecmwf/eccodes
-  ecmwf/eckit
-  ecmwf/odc
 dependency_branch: develop
 parallelism_factor: 8
 self_build: false # Only for python packages
diff --git a/.github/ci-hpc-config.yml b/.github/ci-hpc-config.yml
index b6e65e42..ab70a21c 100644
--- a/.github/ci-hpc-config.yml
+++ b/.github/ci-hpc-config.yml
@@ -2,17 +2,8 @@ build:
   python: '3.10'
   modules:
     - ninja
-  dependencies:
-    - ecmwf/ecbuild@develop
-    - ecmwf/eccodes@develop
-    - ecmwf/eckit@develop
-    - ecmwf/odc@develop
   python_dependencies:
     - ecmwf/anemoi-utils@develop
-    - ecmwf/earthkit-data@develop
-    - ecmwf/earthkit-meteo@develop
-    - ecmwf/earthkit-geo@develop
   parallel: 64
-
   pytest_cmd: |
     python -m pytest -vv -m 'not notebook and not no_cache_init' --cov=. --cov-report=xml
diff --git a/CHANGELOG.md b/CHANGELOG.md
index daa40d14..485d61c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,10 +10,15 @@ Keep it human-readable, your future self will thank you!
 
 ## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...HEAD)
 
+
 ### Added
 
 - Add anemoi-transform link to documentation
 
+### Changed
+
+- Remove upstream dependencies from downstream-ci workflow (temporary) (#83)
+
 ## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09
 
 ### Changed

From 90ccc406bb45f60804dc2f26bc65f278d2b3fb61 Mon Sep 17 00:00:00 2001
From: b8raoult <53792887+b8raoult@users.noreply.github.com>
Date: Sun, 13 Oct 2024 19:08:44 +0100
Subject: [PATCH 4/8] Feature/params metadata (#86)

* add `variables_metadata` to metadata
---
 CHANGELOG.md                        | 4 ++++
 src/anemoi/datasets/data/dataset.py | 1 +
 2 files changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 485d61c8..3afcf7ee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,10 @@ Keep it human-readable, your future self will thank you!
 
 - Add `variables_metadata` entry in the dataset metadata
 
+### Changed
+
+- Add `variables_metadata` entry in the dataset metadata
+
 ## [0.5.5](https://github.com/ecmwf/anemoi-datasets/compare/0.5.4...0.5.5) - 2024-10-04
 
 ### Changed
diff --git a/src/anemoi/datasets/data/dataset.py b/src/anemoi/datasets/data/dataset.py
index 56275d1e..35851991 100644
--- a/src/anemoi/datasets/data/dataset.py
+++ b/src/anemoi/datasets/data/dataset.py
@@ -284,6 +284,7 @@ def dataset_metadata(self):
             specific=self.metadata_specific(),
             frequency=self.frequency,
             variables=self.variables,
+            variables_metadata=self.variables_metadata,
             shape=self.shape,
             start_date=self.start_date.astype(str),
             end_date=self.end_date.astype(str),

From 6cccf2e19f3633ca3889ab90dd4c4200f0abdc29 Mon Sep 17 00:00:00 2001
From: b8raoult <53792887+b8raoult@users.noreply.github.com>
Date: Mon, 14 Oct 2024 10:51:34 +0100
Subject: [PATCH 5/8] Feature/xy zip (#85)

Update xy/zip
---
 CHANGELOG.md                       |   2 +
 src/anemoi/datasets/data/concat.py |   1 +
 src/anemoi/datasets/data/merge.py  | 154 +++++++++++++++++++++++++++++
 src/anemoi/datasets/data/misc.py   |   6 ++
 src/anemoi/datasets/data/xy.py     |  20 +++-
 5 files changed, 179 insertions(+), 4 deletions(-)
 create mode 100644 src/anemoi/datasets/data/merge.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3afcf7ee..eca03ad8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@ Keep it human-readable, your future self will thank you!
 ### Added
 
 - Add anemoi-transform link to documentation
+- Control compatibility check in xy/zip
+- Add `merge` feature
 
 ### Changed
 
diff --git a/src/anemoi/datasets/data/concat.py b/src/anemoi/datasets/data/concat.py
index 53e6bfbc..55b2bc11 100644
--- a/src/anemoi/datasets/data/concat.py
+++ b/src/anemoi/datasets/data/concat.py
@@ -148,6 +148,7 @@ def concat_factory(args, kwargs):
 
     datasets = kwargs.pop("concat")
     fill_missing_gaps = kwargs.pop("fill_missing_gaps", False)
+
     assert isinstance(datasets, (list, tuple))
     assert len(args) == 0
 
diff --git a/src/anemoi/datasets/data/merge.py b/src/anemoi/datasets/data/merge.py
new file mode 100644
index 00000000..fc9a22ea
--- /dev/null
+++ b/src/anemoi/datasets/data/merge.py
@@ -0,0 +1,154 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import logging
+from functools import cached_property
+
+import numpy as np
+
+from . import MissingDateError
+from .debug import Node
+from .debug import debug_indexing
+from .forwards import Combined
+from .indexing import apply_index_to_slices_changes
+from .indexing import expand_list_indexing
+from .indexing import index_to_slices
+from .indexing import update_tuple
+from .misc import _auto_adjust
+from .misc import _open
+
+LOG = logging.getLogger(__name__)
+
+
+class Merge(Combined):
+    def __init__(self, datasets, allow_gaps_in_dates=False):
+        super().__init__(datasets)
+
+        self.allow_gaps_in_dates = allow_gaps_in_dates
+
+        dates = dict()
+
+        for i, d in enumerate(datasets):
+            for j, date in enumerate(d.dates):
+                date = date.astype(object)
+                if date in dates:
+                    d1 = datasets[dates[date][0]]
+                    d2 = datasets[i]
+                    raise ValueError(f"Duplicate date {date} found in datasets {d1} and {d2}")
+                dates[date] = (i, j)
+
+        all_dates = sorted(dates)
+        start = all_dates[0]
+        end = all_dates[-1]
+
+        frequency = min(d2 - d1 for d1, d2 in zip(all_dates[:-1], all_dates[1:]))
+
+        date = start
+        indices = []
+        _dates = []
+
+        self._missing_index = len(datasets)
+
+        while date <= end:
+            if date not in dates:
+                if self.allow_gaps_in_dates:
+                    dates[date] = (self._missing_index, -1)
+                else:
+                    raise ValueError(
+                        f"merge: date {date} not covered by dataset. Start={start}, end={end}, frequency={frequency}"
+                    )
+
+            indices.append(dates[date])
+            _dates.append(date)
+            date += frequency
+
+        self._dates = np.array(_dates, dtype="datetime64[s]")
+        self._indices = np.array(indices)
+        self._frequency = frequency
+
+    @property
+    def dates(self):
+        return self._dates
+
+    @property
+    def frequency(self):
+        return self._frequency
+
+    @cached_property
+    def missing(self):
+        # TODO: optimize
+        result = set()
+
+        for i, (dataset, row) in enumerate(self._indices):
+            if dataset == self._missing_index:
+                result.add(i)
+                continue
+
+            if row in self.datasets[dataset].missing:
+                result.add(i)
+
+        return result
+
+    def check_same_lengths(self, d1, d2):
+        # Turned off because we are concatenating along the first axis
+        pass
+
+    def check_same_dates(self, d1, d2):
+        # Turned off because we are concatenating along the dates axis
+        pass
+
+    def check_compatibility(self, d1, d2):
+        super().check_compatibility(d1, d2)
+        self.check_same_sub_shapes(d1, d2, drop_axis=0)
+
+    def tree(self):
+        return Node(self, [d.tree() for d in self.datasets], allow_gaps_in_dates=self.allow_gaps_in_dates)
+
+    @debug_indexing
+    def __getitem__(self, n):
+        if isinstance(n, tuple):
+            return self._get_tuple(n)
+
+        if isinstance(n, slice):
+            return self._get_slice(n)
+
+        dataset, row = self._indices[n]
+
+        if dataset == self._missing_index:
+            raise MissingDateError(f"Date {self.dates[n]} is missing (index={n})")
+
+        return self.datasets[dataset][int(row)]
+
+    @debug_indexing
+    @expand_list_indexing
+    def _get_tuple(self, index):
+        index, changes = index_to_slices(index, self.shape)
+        index, previous = update_tuple(index, 0, slice(None))
+        result = self._get_slice(previous)
+        return apply_index_to_slices_changes(result[index], changes)
+
+    def _get_slice(self, s):
+        return np.stack([self[i] for i in range(*s.indices(self._len))])
+
+
+def merge_factory(args, kwargs):
+
+    datasets = kwargs.pop("merge")
+
+    assert isinstance(datasets, (list, tuple))
+    assert len(args) == 0
+
+    datasets = [_open(e) for e in datasets]
+
+    if len(datasets) == 1:
+        return datasets[0]._subset(**kwargs)
+
+    datasets, kwargs = _auto_adjust(datasets, kwargs)
+
+    allow_gaps_in_dates = kwargs.pop("allow_gaps_in_dates", False)
+
+    return Merge(datasets, allow_gaps_in_dates=allow_gaps_in_dates)._subset(**kwargs)
diff --git a/src/anemoi/datasets/data/misc.py b/src/anemoi/datasets/data/misc.py
index 94478e28..8c7cfc04 100644
--- a/src/anemoi/datasets/data/misc.py
+++ b/src/anemoi/datasets/data/misc.py
@@ -302,6 +302,12 @@ def _open_dataset(*args, **kwargs):
         assert not sets, sets
         return concat_factory(args, kwargs).mutate()
 
+    if "merge" in kwargs:
+        from .merge import merge_factory
+
+        assert not sets, sets
+        return merge_factory(args, kwargs).mutate()
+
     if "ensemble" in kwargs:
         from .ensemble import ensemble_factory
 
diff --git a/src/anemoi/datasets/data/xy.py b/src/anemoi/datasets/data/xy.py
index 74b27a96..1c326f15 100644
--- a/src/anemoi/datasets/data/xy.py
+++ b/src/anemoi/datasets/data/xy.py
@@ -18,15 +18,19 @@
 
 class ZipBase(Combined):
 
+    def __init__(self, datasets, check_compatibility=True):
+        self._check_compatibility = check_compatibility
+        super().__init__(datasets)
+
     def swap_with_parent(self, parent):
         new_parents = [parent.clone(ds) for ds in self.datasets]
         return self.clone(new_parents)
 
     def clone(self, datasets):
-        return self.__class__(datasets)
+        return self.__class__(datasets, check_compatibility=self._check_compatibility)
 
     def tree(self):
-        return Node(self, [d.tree() for d in self.datasets])
+        return Node(self, [d.tree() for d in self.datasets], check_compatibility=self._check_compatibility)
 
     def __len__(self):
         return min(len(d) for d in self.datasets)
@@ -86,6 +90,10 @@ def resolution(self):
     def name_to_index(self):
         return tuple(d.name_to_index for d in self.datasets)
 
+    def check_compatibility(self, d1, d2):
+        if self._check_compatibility:
+            super().check_compatibility(d1, d2)
+
 
 class Zip(ZipBase):
     pass
@@ -110,7 +118,9 @@ def xy_factory(args, kwargs):
 
     assert len(datasets) == 2
 
-    return XY(datasets)._subset(**kwargs)
+    check_compatibility = kwargs.pop("check_compatibility", True)
+
+    return XY(datasets, check_compatibility=check_compatibility)._subset(**kwargs)
 
 
 def zip_factory(args, kwargs):
@@ -122,4 +132,6 @@ def zip_factory(args, kwargs):
     datasets = [_open(e) for e in zip]
     datasets, kwargs = _auto_adjust(datasets, kwargs)
 
-    return Zip(datasets)._subset(**kwargs)
+    check_compatibility = kwargs.pop("check_compatibility", True)
+
+    return Zip(datasets, check_compatibility=check_compatibility)._subset(**kwargs)

From bd1244ce1f4e406b264d5b27cce846eafa6fa482 Mon Sep 17 00:00:00 2001
From: b8raoult <53792887+b8raoult@users.noreply.github.com>
Date: Mon, 14 Oct 2024 18:28:27 +0100
Subject: [PATCH 6/8] Bugfix/various (#84)

* Various bug fixes

* support ensembles

* fix rr accumulations
---
 CHANGELOG.md                                  |  1 +
 src/anemoi/datasets/create/__init__.py        | 19 +++++++++-
 .../create/functions/filters/rename.py        | 13 ++++---
 .../create/functions/sources/accumulations.py |  5 ++-
 src/anemoi/datasets/create/input/__init__.py  | 16 +-------
 src/anemoi/datasets/create/input/result.py    | 38 +++++++++++++++++--
 src/anemoi/datasets/create/input/step.py      |  1 +
 7 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eca03ad8..b9f3e9c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ Keep it human-readable, your future self will thank you!
 ### Added
 
 - Add anemoi-transform link to documentation
+- Various bug fixes
 - Control compatibility check in xy/zip
 - Add `merge` feature
 
diff --git a/src/anemoi/datasets/create/__init__.py b/src/anemoi/datasets/create/__init__.py
index 14cf5240..85abac76 100644
--- a/src/anemoi/datasets/create/__init__.py
+++ b/src/anemoi/datasets/create/__init__.py
@@ -412,7 +412,24 @@ def _run(self):
         metadata.update(self.main_config.get("add_metadata", {}))
 
         metadata["_create_yaml_config"] = self.main_config.get_serialisable_dict()
-        metadata["recipe"] = sanitise(self.main_config.get_serialisable_dict())
+
+        recipe = sanitise(self.main_config.get_serialisable_dict())
+
+        # Remove stuff added by prepml
+        for k in [
+            "build_dataset",
+            "config_format_version",
+            "config_path",
+            "dataset_status",
+            "ecflow",
+            "metadata",
+            "platform",
+            "reading_chunks",
+            "upload",
+        ]:
+            recipe.pop(k, None)
+
+        metadata["recipe"] = recipe
 
         metadata["description"] = self.main_config.description
         metadata["licence"] = self.main_config["licence"]
diff --git a/src/anemoi/datasets/create/functions/filters/rename.py b/src/anemoi/datasets/create/functions/filters/rename.py
index b82fd8ca..666e085d 100644
--- a/src/anemoi/datasets/create/functions/filters/rename.py
+++ b/src/anemoi/datasets/create/functions/filters/rename.py
@@ -56,11 +56,14 @@ def __init__(self, field, format):
         self.format = format
         self.bits = re.findall(r"{(\w+)}", format)
 
-    def metadata(self, key, **kwargs):
-        value = self.field.metadata(key, **kwargs)
-        if "{" + key + "}" in self.format:
-            bits = {b: self.field.metadata(b, **kwargs) for b in self.bits}
-            return self.format.format(**bits)
+    def metadata(self, *args, **kwargs):
+        value = self.field.metadata(*args, **kwargs)
+        if args:
+            assert len(args) == 1
+            key = args[0]
+            if "{" + key + "}" in self.format:
+                bits = {b: self.field.metadata(b, **kwargs) for b in self.bits}
+                return self.format.format(**bits)
         return value
 
     def __getattr__(self, name):
diff --git a/src/anemoi/datasets/create/functions/sources/accumulations.py b/src/anemoi/datasets/create/functions/sources/accumulations.py
index b74eb33f..bfc5eba7 100644
--- a/src/anemoi/datasets/create/functions/sources/accumulations.py
+++ b/src/anemoi/datasets/create/functions/sources/accumulations.py
@@ -370,12 +370,15 @@ def accumulations(context, dates, **request):
 
     user_accumulation_period = request.pop("accumulation_period", 6)
 
+    # If `data_accumulation_period` is not set, this means that the accumulations are from the start
+    # of the forecast.
+
     KWARGS = {
         ("od", "oper"): dict(patch=_scda),
         ("od", "elda"): dict(base_times=(6, 18)),
         ("ea", "oper"): dict(data_accumulation_period=1, base_times=(6, 18)),
         ("ea", "enda"): dict(data_accumulation_period=3, base_times=(6, 18)),
-        ("rr", "oper"): dict(data_accumulation_period=3, base_times=(0, 3, 6, 9, 12, 15, 18, 21)),
+        ("rr", "oper"): dict(base_times=(0, 3, 6, 9, 12, 15, 18, 21)),
         ("l5", "oper"): dict(data_accumulation_period=1, base_times=(0,)),
     }
 
diff --git a/src/anemoi/datasets/create/input/__init__.py b/src/anemoi/datasets/create/input/__init__.py
index d23f038d..92e93669 100644
--- a/src/anemoi/datasets/create/input/__init__.py
+++ b/src/anemoi/datasets/create/input/__init__.py
@@ -6,23 +6,9 @@
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 #
-import datetime
-import itertools
+
 import logging
-import math
-import time
-from collections import defaultdict
 from copy import deepcopy
-from functools import cached_property
-from functools import wraps
-
-import numpy as np
-from anemoi.utils.dates import as_datetime as as_datetime
-from anemoi.utils.dates import frequency_to_timedelta as frequency_to_timedelta
-
-from anemoi.datasets.dates import DatesProvider as DatesProvider
-from anemoi.datasets.fields import FieldArray as FieldArray
-from anemoi.datasets.fields import NewValidDateTimeField as NewValidDateTimeField
 
 from .trace import trace_select
 
diff --git a/src/anemoi/datasets/create/input/result.py b/src/anemoi/datasets/create/input/result.py
index 7fab9acf..f708ef9c 100644
--- a/src/anemoi/datasets/create/input/result.py
+++ b/src/anemoi/datasets/create/input/result.py
@@ -33,9 +33,38 @@
 def _fields_metatata(variables, cube):
     assert isinstance(variables, tuple), variables
 
+    def _merge(md1, md2):
+        assert set(md1.keys()) == set(md2.keys()), (set(md1.keys()), set(md2.keys()))
+        result = {}
+        for k in md1.keys():
+            v1 = md1[k]
+            v2 = md2[k]
+
+            if v1 == v2:
+                result[k] = v1
+                continue
+
+            if isinstance(v1, list):
+                assert v2 not in v1, (v1, v2)
+                result[k] = sorted(v1 + [v2])
+                continue
+
+            if isinstance(v2, list):
+                assert v1 not in v2, (v1, v2)
+                result[k] = sorted(v2 + [v1])
+                continue
+
+            result[k] = sorted([v1, v2])
+
+        return result
+
     result = {}
-    for i, c in enumerate(cube.iterate_cubelets()):
-        assert c._coords_names[1] == variables[i], (c._coords_names[1], variables[i])
+    i = -1
+    for c in cube.iterate_cubelets():
+
+        if i == -1 or c._coords_names[1] != variables[i]:
+            i += 1
+
         f = cube[c.coords]
         md = f.metadata(namespace="mars")
         if not md:
@@ -49,7 +78,10 @@ def _fields_metatata(variables, cube):
             md["param"] = str(f.metadata("paramId", default="unknown"))
             # assert md['param'] != 'unknown', (md, f.metadata('param'))
 
-        result[variables[i]] = md
+        if variables[i] in result:
+            result[variables[i]] = _merge(md, result[variables[i]])
+        else:
+            result[variables[i]] = md
 
     assert i + 1 == len(variables), (i + 1, len(variables))
     return result
diff --git a/src/anemoi/datasets/create/input/step.py b/src/anemoi/datasets/create/input/step.py
index 3eb2917c..daca578b 100644
--- a/src/anemoi/datasets/create/input/step.py
+++ b/src/anemoi/datasets/create/input/step.py
@@ -59,6 +59,7 @@ def select(self, group_of_dates):
         )
 
     def __repr__(self):
+        # raise NotImplementedError(f"Not implemented in {self.__class__.__name__}")
         return super().__repr__(self.previous_step, _inline_=str(self.kwargs))
 
 

From 4f949a5ed18b0d7a88dbb3c6ff3014ecb3131dc0 Mon Sep 17 00:00:00 2001
From: Florian Pinault <Florian.Pinault@ecmwf.int>
Date: Tue, 15 Oct 2024 12:33:26 +0000
Subject: [PATCH 7/8] clean bug template

---
 .github/ISSUE_TEMPLATE/bug_report.md | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 16a27120..53e6c790 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -9,12 +9,15 @@ assignees: ''
 **Describe the bug**
 A clear and concise description of what the bug is.
 
+**Version number**
+I am using the following versions/branch/sha1 of the anemoi packages
+(alternatively the output of `pip freeze`)
+
 **To Reproduce**
 Steps to reproduce the behavior:
 1. Go to '...'
-2. Click on '....'
-3. Scroll down to '....'
-4. See error
+2. Run this '....'
+3. See error
 
 **URL to sample input data**
 Provide a URL to a sample input data, or attach a file to that report if it is small enough.
@@ -25,16 +28,5 @@ A clear and concise description of what you expected to happen.
 **Screenshots**
 If applicable, add screenshots to help explain your problem.
 
-**Desktop (please complete the following information):**
- - OS: [e.g. iOS]
- - Browser [e.g. chrome, safari]
- - Version [e.g. 22]
-
-**Smartphone (please complete the following information):**
- - Device: [e.g. iPhone6]
- - OS: [e.g. iOS8.1]
- - Browser [e.g. stock browser, safari]
- - Version [e.g. 22]
-
 **Additional context**
 Add any other context about the problem here.

From 1d96021c21d196afadee1af5bf048fb750913889 Mon Sep 17 00:00:00 2001
From: b8raoult <53792887+b8raoult@users.noreply.github.com>
Date: Tue, 15 Oct 2024 16:00:58 +0100
Subject: [PATCH 8/8] Extend CODEOWNERS (#89)

---
 .github/CODEOWNERS | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 0211a4e3..74bdac0a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,6 +1,6 @@
 # CODEOWNERS file
 
 # Protect workflow files
-/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb
-/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb
-/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb
+/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
+/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
+/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry