# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

import logging
from functools import cached_property

import numpy as np

from . import MissingDateError
from .debug import Node
from .debug import debug_indexing
from .forwards import Combined
from .indexing import apply_index_to_slices_changes
from .indexing import expand_list_indexing
from .indexing import index_to_slices
from .indexing import update_tuple
from .misc import _auto_adjust
from .misc import _open

LOG = logging.getLogger(__name__)


class Merge(Combined):
    """Combine several datasets along the dates dimension, remembering which
    dataset and row provides each date."""

    def __init__(self, datasets, allow_gaps_in_dates=False):
        super().__init__(datasets)

        self.allow_gaps_in_dates = allow_gaps_in_dates

        # Map each date to the (dataset index, row index) that provides it.
        dates = dict()

        for i, d in enumerate(datasets):
            for j, date in enumerate(d.dates):
                date = date.astype(object)
                if date in dates:
                    d1 = datasets[dates[date][0]]
                    d2 = datasets[i]
                    raise ValueError(f"Duplicate date {date} found in datasets {d1} and {d2}")
                dates[date] = (i, j)

        all_dates = sorted(dates)
        start = all_dates[0]
        end = all_dates[-1]

        # The merged frequency is the smallest gap between two consecutive dates
        # (e.g. merging a 6-hourly and a 12-hourly dataset gives a 6-hour frequency).
        frequency = min(d2 - d1 for d1, d2 in zip(all_dates[:-1], all_dates[1:]))

        date = start
        indices = []
        _dates = []

        # Sentinel dataset index used for dates that no input dataset provides.
        self._missing_index = len(datasets)

        while date <= end:
            if date not in dates:
                if self.allow_gaps_in_dates:
                    dates[date] = (self._missing_index, -1)
                else:
                    raise ValueError(
                        f"merge: date {date} not covered by any dataset. Start={start}, end={end}, frequency={frequency}"
                    )

            indices.append(dates[date])
            _dates.append(date)
            date += frequency

        self._dates = np.array(_dates, dtype="datetime64[s]")
        self._indices = np.array(indices)
        self._frequency = frequency
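        # Illustrative sketch only (hypothetical inputs, not part of the implementation):
        # merging a dataset covering 2020-01-01 and 2020-01-02 with one covering
        # 2020-01-04, with allow_gaps_in_dates=True, gives a daily frequency and
        #     self._indices == [[0, 0], [0, 1], [2, -1], [1, 0]]
        # where 2 is self._missing_index, so 2020-01-03 is recorded as missing.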

    @property
    def dates(self):
        return self._dates

    @property
    def frequency(self):
        return self._frequency

    @cached_property
    def missing(self):
        # TODO: optimize
        result = set()

        for i, (dataset, row) in enumerate(self._indices):
            if dataset == self._missing_index:
                # Dates filled in because allow_gaps_in_dates=True are always missing.
                result.add(i)
                continue

            if row in self.datasets[dataset].missing:
                result.add(i)

        return result

    def check_same_lengths(self, d1, d2):
        # Turned off because we are concatenating along the first axis
        pass

    def check_same_dates(self, d1, d2):
        # Turned off because we are concatenating along the dates axis
        pass

    def check_compatibility(self, d1, d2):
        super().check_compatibility(d1, d2)
        self.check_same_sub_shapes(d1, d2, drop_axis=0)

    def tree(self):
        return Node(self, [d.tree() for d in self.datasets], allow_gaps_in_dates=self.allow_gaps_in_dates)

    @debug_indexing
    def __getitem__(self, n):
        if isinstance(n, tuple):
            return self._get_tuple(n)

        if isinstance(n, slice):
            return self._get_slice(n)

        # A plain integer index: look up which dataset and row provide this date.
        dataset, row = self._indices[n]

        if dataset == self._missing_index:
            raise MissingDateError(f"Date {self.dates[n]} is missing (index={n})")

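        # With the hypothetical sketch from __init__ above, merged[3] would return
        # datasets[1][0] and merged[2] would raise MissingDateError (2020-01-03).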
        return self.datasets[dataset][int(row)]

    @debug_indexing
    @expand_list_indexing
    def _get_tuple(self, index):
        # Materialise the requested rows first, then apply the remaining axes of the index.
        index, changes = index_to_slices(index, self.shape)
        index, previous = update_tuple(index, 0, slice(None))
        result = self._get_slice(previous)
        return apply_index_to_slices_changes(result[index], changes)

    def _get_slice(self, s):
        return np.stack([self[i] for i in range(*s.indices(self._len))])


def merge_factory(args, kwargs):

    datasets = kwargs.pop("merge")

    assert isinstance(datasets, (list, tuple))
    assert len(args) == 0

    datasets = [_open(e) for e in datasets]

    # Merging a single dataset is a no-op apart from the remaining subsetting options.
    if len(datasets) == 1:
        return datasets[0]._subset(**kwargs)

    datasets, kwargs = _auto_adjust(datasets, kwargs)

    allow_gaps_in_dates = kwargs.pop("allow_gaps_in_dates", False)

    return Merge(datasets, allow_gaps_in_dates=allow_gaps_in_dates)._subset(**kwargs)
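
# Usage sketch (hypothetical names; assumes a top-level entry point such as
# open_dataset routes the "merge" keyword to merge_factory):
#
#     ds = open_dataset(merge=["dataset-2020", "dataset-2021"], allow_gaps_in_dates=True)
#     ds.dates      # regular datetime64[s] axis from the earliest to the latest input date
#     ds.missing    # indices whose date is absent from every input (or missing in its source)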