From 2a88ece93397dea03a532b7f90e92b129f8c8111 Mon Sep 17 00:00:00 2001 From: b8raoult <53792887+b8raoult@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:50:21 +0100 Subject: [PATCH 1/8] add link to transform (#82) --- CHANGELOG.md | 4 ++++ docs/conf.py | 4 ++++ docs/index.rst | 1 + 3 files changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dfe32421..7eff57d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ Keep it human-readable, your future self will thank you! ## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...HEAD) +### Added + +- Add anemoi-transform link to documentation + ## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09 ## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04 diff --git a/docs/conf.py b/docs/conf.py index b14bb99a..790061fb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -107,6 +107,10 @@ "https://anemoi-registry.readthedocs.io/en/latest/", ("../../anemoi-registry/docs/_build/html/objects.inv", None), ), + "anemoi-transform": ( + "https://anemoi-transform.readthedocs.io/en/latest/", + ("../../anemoi-transform/docs/_build/html/objects.inv", None), + ), } diff --git a/docs/index.rst b/docs/index.rst index fbddc874..b7e7abdc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -121,6 +121,7 @@ datasets `. 
***************** - :ref:`anemoi-utils ` +- :ref:`anemoi-transform ` - :ref:`anemoi-datasets ` - :ref:`anemoi-models ` - :ref:`anemoi-graphs ` From 65ad2dd92f2865ab9e7c19b9be8e46206d05ac7f Mon Sep 17 00:00:00 2001 From: b8raoult <53792887+b8raoult@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:51:10 +0100 Subject: [PATCH 2/8] Fill missing dates by interpolating data (#81) * add interpolate and closest --- CHANGELOG.md | 8 +- docs/using/code/fill_missing_dates1_.py | 1 + docs/using/code/fill_missing_dates2_.py | 1 + docs/using/code/missing_dates_.py | 1 - docs/using/code/set_missing_dates_.py | 1 + docs/using/missing.rst | 23 +++- pyproject.toml | 33 +---- src/anemoi/datasets/data/dataset.py | 26 ++-- src/anemoi/datasets/data/fill_missing.py | 162 +++++++++++++++++++++++ 9 files changed, 213 insertions(+), 43 deletions(-) create mode 100644 docs/using/code/fill_missing_dates1_.py create mode 100644 docs/using/code/fill_missing_dates2_.py delete mode 100644 docs/using/code/missing_dates_.py create mode 100644 docs/using/code/set_missing_dates_.py create mode 100644 src/anemoi/datasets/data/fill_missing.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7eff57d2..daa40d14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,14 +16,16 @@ Keep it human-readable, your future self will thank you! 
## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09 -## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04 +### Changed -- Update documentation +- Add support to fill missing dates + +## [Allow for unknown CF coordinates](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...0.5.6) - 2024-10-04 -- Update documentation ### Changed - Add `variables_metadata` entry in the dataset metadata +- Update documentation ### Changed diff --git a/docs/using/code/fill_missing_dates1_.py b/docs/using/code/fill_missing_dates1_.py new file mode 100644 index 00000000..93d78ccf --- /dev/null +++ b/docs/using/code/fill_missing_dates1_.py @@ -0,0 +1 @@ +ds = open_dataset(dataset, fill_missing_dates="interpolate") diff --git a/docs/using/code/fill_missing_dates2_.py b/docs/using/code/fill_missing_dates2_.py new file mode 100644 index 00000000..6567aaf4 --- /dev/null +++ b/docs/using/code/fill_missing_dates2_.py @@ -0,0 +1 @@ +ds = open_dataset(dataset, fill_missing_dates="closest") diff --git a/docs/using/code/missing_dates_.py b/docs/using/code/missing_dates_.py deleted file mode 100644 index 5c2da45d..00000000 --- a/docs/using/code/missing_dates_.py +++ /dev/null @@ -1 +0,0 @@ -ds = open_dataset(dataset, missing_dates=["2010-01-01T12:00:00", "2010-02-01T12:00:00"]) diff --git a/docs/using/code/set_missing_dates_.py b/docs/using/code/set_missing_dates_.py new file mode 100644 index 00000000..35f35ed6 --- /dev/null +++ b/docs/using/code/set_missing_dates_.py @@ -0,0 +1 @@ +ds = open_dataset(dataset, set_missing_dates=["2010-01-01T12:00:00", "2010-02-01T12:00:00"]) diff --git a/docs/using/missing.rst b/docs/using/missing.rst index 3388a444..e7c6b9d4 100644 --- a/docs/using/missing.rst +++ b/docs/using/missing.rst @@ -4,6 +4,25 @@ Managing missing dates ######################## +************************************************** + Filling the missing dates with artificial values 
+************************************************** + +When you have missing dates in a dataset, you can fill them with +artificial values. You can either fill them with values that are the +result of a linear interpolation between the two closest dates: + +.. literalinclude:: code/fill_missing_dates1_.py + +Or you can copy the value of the closest date: + +.. literalinclude:: code/fill_missing_dates2_.py + +If the missing date is exactly in the middle of two dates, the library +will choose the value of the later date. You can change this behavior +by setting the ``closest`` parameter to ``'down'`` or ``'up'`` +explicitly. + ************************************************ Skipping missing when iterating over a dataset ************************************************ @@ -72,7 +91,7 @@ the datasets to make the dates contiguous. Debugging *********** -You can set missing dates using the ``missing_dates`` option. This +You can set missing dates using the ``set_missing_dates`` option. This option is for debugging purposes only. -.. literalinclude:: code/missing_dates_.py +.. 
literalinclude:: code/set_missing_dates_.py diff --git a/pyproject.toml b/pyproject.toml index 5210d87c..797cfd0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dynamic = [ "version", ] dependencies = [ - "anemoi-utils[provenance]>=0.3.15", + "anemoi-utils[provenance]>=0.3.18", "cfunits", "numpy", "pyyaml", @@ -60,43 +60,20 @@ dependencies = [ ] optional-dependencies.all = [ - "boto3", - "earthkit-data[mars]>=0.9", - "earthkit-geo>=0.2", - "earthkit-meteo", - "ecmwflibs>=0.6.3", - "entrypoints", - "gcsfs", - "kerchunk", - "pyproj", - "requests", + "anemoi-datasets[create,remote,xarray]", ] optional-dependencies.create = [ - "earthkit-data[mars]>=0.9", + "earthkit-data[mars]>=0.10.7", "earthkit-geo>=0.2", "earthkit-meteo", - "ecmwflibs>=0.6.3", + "eccodes>=2.38.1", "entrypoints", "pyproj", ] optional-dependencies.dev = [ - "boto3", - "earthkit-data[mars]>=0.9", - "earthkit-geo>=0.2", - "earthkit-meteo", - "ecmwflibs>=0.6.3", - "entrypoints", - "gcsfs", - "kerchunk", - "nbsphinx", - "pandoc", - "pyproj", - "pytest", - "requests", - "sphinx", - "sphinx-rtd-theme", + "anemoi-datasets[all,docs,tests]", ] optional-dependencies.docs = [ diff --git a/src/anemoi/datasets/data/dataset.py b/src/anemoi/datasets/data/dataset.py index 2f6af1b1..56275d1e 100644 --- a/src/anemoi/datasets/data/dataset.py +++ b/src/anemoi/datasets/data/dataset.py @@ -41,6 +41,14 @@ def _subset(self, **kwargs): if not kwargs: return self.mutate() + # This one must be first + if "fill_missing_dates" in kwargs: + from .fill_missing import fill_missing_dates_factory + + fill_missing_dates = kwargs.pop("fill_missing_dates") + ds = fill_missing_dates_factory(self, fill_missing_dates, kwargs) + return ds._subset(**kwargs).mutate() + if "start" in kwargs or "end" in kwargs: start = kwargs.pop("start", None) end = kwargs.pop("end", None) @@ -64,12 +72,6 @@ def _subset(self, **kwargs): .mutate() ) - if "interpolate_frequency" in kwargs: - from .interpolate import InterpolateFrequency - - 
interpolate_frequency = kwargs.pop("interpolate_frequency") - return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate() - if "select" in kwargs: from .select import Select @@ -121,11 +123,11 @@ def _subset(self, **kwargs): bbox = kwargs.pop("area") return Cropping(self, bbox)._subset(**kwargs).mutate() - if "missing_dates" in kwargs: + if "set_missing_dates" in kwargs: from .missing import MissingDates - missing_dates = kwargs.pop("missing_dates") - return MissingDates(self, missing_dates)._subset(**kwargs).mutate() + set_missing_dates = kwargs.pop("set_missing_dates") + return MissingDates(self, set_missing_dates)._subset(**kwargs).mutate() if "skip_missing_dates" in kwargs: from .missing import SkipMissingDates @@ -139,6 +141,12 @@ def _subset(self, **kwargs): if skip_missing_dates: return SkipMissingDates(self, expected_access)._subset(**kwargs).mutate() + if "interpolate_frequency" in kwargs: + from .interpolate import InterpolateFrequency + + interpolate_frequency = kwargs.pop("interpolate_frequency") + return InterpolateFrequency(self, interpolate_frequency)._subset(**kwargs).mutate() + # Keep last if "shuffle" in kwargs: from .subset import Subset diff --git a/src/anemoi/datasets/data/fill_missing.py b/src/anemoi/datasets/data/fill_missing.py new file mode 100644 index 00000000..ca16fd65 --- /dev/null +++ b/src/anemoi/datasets/data/fill_missing.py @@ -0,0 +1,162 @@ +# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. 
+ +import logging + +import numpy as np + +from anemoi.datasets.data import MissingDateError + +from .debug import Node +from .debug import debug_indexing +from .forwards import Forwards +from .indexing import apply_index_to_slices_changes +from .indexing import expand_list_indexing +from .indexing import index_to_slices +from .indexing import update_tuple + +LOG = logging.getLogger(__name__) + + +class MissingDatesFill(Forwards): + def __init__(self, dataset): + super().__init__(dataset) + self._missing = set(dataset.missing) + self._warnings = set() + + @debug_indexing + @expand_list_indexing + def _get_tuple(self, index): + index, changes = index_to_slices(index, self.shape) + index, previous = update_tuple(index, 0, slice(None)) + result = self._get_slice(previous) + return apply_index_to_slices_changes(result[index], changes) + + def _get_slice(self, s): + return np.stack([self[i] for i in range(*s.indices(self._len))]) + + @property + def missing(self): + return set() + + @debug_indexing + def __getitem__(self, n): + + try: + return self.forward[n] + except MissingDateError: + pass + + if isinstance(n, tuple): + return self._get_tuple(n) + + if isinstance(n, slice): + return self._get_slice(n) + + if n < 0: + n += self._len + + a = None + i = n + while a is None and i >= 0: + if i in self._missing: + i -= 1 + else: + a = i + + len = self._len + b = None + i = n + while b is None and n < len: + if i in self._missing: + i += 1 + else: + b = i + + return self._fill_missing(n, a, b) + + +class MissingDatesClosest(MissingDatesFill): + + def __init__(self, dataset, closest): + super().__init__(dataset) + self.closest = closest + self._closest = {} + + def _fill_missing(self, n, a, b): + + if n not in self._warnings: + LOG.warning(f"Missing date at index {n} ({self.dates[n]})") + if abs(n - a) == abs(b - n): + if self.closest == "up": + u = b + else: + u = a + else: + if abs(n - a) < abs(b - n): + u = a + else: + u = b + LOG.warning(f"Using closest date {u} 
({self.dates[u]})") + + self._closest[n] = u + self._warnings.add(n) + + return self.forward[self._closest[n]] + + def subclass_metadata_specific(self): + return {"closest": self.closest} + + def tree(self): + return Node(self, [self.forward.tree()], closest=self.closest) + + +class MissingDatesInterpolate(MissingDatesFill): + def __init__(self, dataset): + super().__init__(dataset) + self._alpha = {} + + def _fill_missing(self, n, a, b): + if n not in self._warnings: + LOG.warning(f"Missing date at index {n} ({self.dates[n]})") + + if a is None or b is None: + raise MissingDateError( + f"Cannot interpolate at index {n} ({self.dates[n]}). Are the first or last date missing?" + ) + + assert a < n < b, (a, n, b) + + alpha = (n - a) / (b - a) + assert 0 < alpha < 1, alpha + + LOG.warning(f"Interpolating between index {a} ({self.dates[a]}) and {b} ({self.dates[b]})") + LOG.warning(f"Interpolation {1 - alpha:g} * ({self.dates[a]}) + {alpha:g} * ({self.dates[b]})") + + self._alpha[n] = alpha + + self._warnings.add(n) + + alpha = self._alpha[n] + return self.forward[a] * (1 - alpha) + self.forward[b] * alpha + + def subclass_metadata_specific(self): + return {} + + def tree(self): + return Node(self, [self.forward.tree()]) + + +def fill_missing_dates_factory(dataset, method, kwargs): + if method == "closest": + closest = kwargs.get("closest", "up") + return MissingDatesClosest(dataset, closest=closest) + + if method == "interpolate": + return MissingDatesInterpolate(dataset) + + raise ValueError(f"Invalid `fill_missing_dates` method '{method}'") From 4507a06e6f4cfed04e102d1a1c2dbf9d88c6b3c2 Mon Sep 17 00:00:00 2001 From: Helen Theissen Date: Fri, 11 Oct 2024 17:25:44 +0100 Subject: [PATCH 3/8] Fix/remove upstream deps from ci (#83) * chore(deps): remove upstream deps from ci * chore: use branch of downstream ci * fix: revert back to main on downstream-ci * fix: add anemoi-utils back in * docs: update changelog --- .github/ci-config.yml | 6 ------ .github/ci-hpc-config.yml 
| 9 --------- CHANGELOG.md | 5 +++++ 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/.github/ci-config.yml b/.github/ci-config.yml index 6138e636..f712f26f 100644 --- a/.github/ci-config.yml +++ b/.github/ci-config.yml @@ -1,9 +1,3 @@ -dependencies: | - ecmwf/ecbuild - MathisRosenhauer/libaec@master - ecmwf/eccodes - ecmwf/eckit - ecmwf/odc dependency_branch: develop parallelism_factor: 8 self_build: false # Only for python packages diff --git a/.github/ci-hpc-config.yml b/.github/ci-hpc-config.yml index b6e65e42..ab70a21c 100644 --- a/.github/ci-hpc-config.yml +++ b/.github/ci-hpc-config.yml @@ -2,17 +2,8 @@ build: python: '3.10' modules: - ninja - dependencies: - - ecmwf/ecbuild@develop - - ecmwf/eccodes@develop - - ecmwf/eckit@develop - - ecmwf/odc@develop python_dependencies: - ecmwf/anemoi-utils@develop - - ecmwf/earthkit-data@develop - - ecmwf/earthkit-meteo@develop - - ecmwf/earthkit-geo@develop parallel: 64 - pytest_cmd: | python -m pytest -vv -m 'not notebook and not no_cache_init' --cov=. --cov-report=xml diff --git a/CHANGELOG.md b/CHANGELOG.md index daa40d14..485d61c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,15 @@ Keep it human-readable, your future self will thank you! 
## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.7...HEAD) + ### Added - Add anemoi-transform link to documentation +### Changed + +- Remove upstream dependencies from downstream-ci workflow (temporary) (#83) + ## [0.5.7](https://github.com/ecmwf/anemoi-datasets/compare/0.5.6...0.5.7) - 2024-10-09 ### Changed From 90ccc406bb45f60804dc2f26bc65f278d2b3fb61 Mon Sep 17 00:00:00 2001 From: b8raoult <53792887+b8raoult@users.noreply.github.com> Date: Sun, 13 Oct 2024 19:08:44 +0100 Subject: [PATCH 4/8] Feature/params metadata (#86) * add `variables_metadata` to metadata --- CHANGELOG.md | 4 ++++ src/anemoi/datasets/data/dataset.py | 1 + 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 485d61c8..3afcf7ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,10 @@ Keep it human-readable, your future self will thank you! - Add `variables_metadata` entry in the dataset metadata +### Changed + +- Add `variables_metadata` entry in the dataset metadata + ## [0.5.5](https://github.com/ecmwf/anemoi-datasets/compare/0.5.4...0.5.5) - 2024-10-04 ### Changed diff --git a/src/anemoi/datasets/data/dataset.py b/src/anemoi/datasets/data/dataset.py index 56275d1e..35851991 100644 --- a/src/anemoi/datasets/data/dataset.py +++ b/src/anemoi/datasets/data/dataset.py @@ -284,6 +284,7 @@ def dataset_metadata(self): specific=self.metadata_specific(), frequency=self.frequency, variables=self.variables, + variables_metadata=self.variables_metadata, shape=self.shape, start_date=self.start_date.astype(str), end_date=self.end_date.astype(str), From 6cccf2e19f3633ca3889ab90dd4c4200f0abdc29 Mon Sep 17 00:00:00 2001 From: b8raoult <53792887+b8raoult@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:51:34 +0100 Subject: [PATCH 5/8] Feature/xy zip (#85) Update xy/zip --- CHANGELOG.md | 2 + src/anemoi/datasets/data/concat.py | 1 + src/anemoi/datasets/data/merge.py | 154 +++++++++++++++++++++++++++++ src/anemoi/datasets/data/misc.py | 6 ++ 
src/anemoi/datasets/data/xy.py | 20 +++- 5 files changed, 179 insertions(+), 4 deletions(-) create mode 100644 src/anemoi/datasets/data/merge.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3afcf7ee..eca03ad8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ Keep it human-readable, your future self will thank you! ### Added - Add anemoi-transform link to documentation +- Control compatibility check in xy/zip +- Add `merge` feature ### Changed diff --git a/src/anemoi/datasets/data/concat.py b/src/anemoi/datasets/data/concat.py index 53e6bfbc..55b2bc11 100644 --- a/src/anemoi/datasets/data/concat.py +++ b/src/anemoi/datasets/data/concat.py @@ -148,6 +148,7 @@ def concat_factory(args, kwargs): datasets = kwargs.pop("concat") fill_missing_gaps = kwargs.pop("fill_missing_gaps", False) + assert isinstance(datasets, (list, tuple)) assert len(args) == 0 diff --git a/src/anemoi/datasets/data/merge.py b/src/anemoi/datasets/data/merge.py new file mode 100644 index 00000000..fc9a22ea --- /dev/null +++ b/src/anemoi/datasets/data/merge.py @@ -0,0 +1,154 @@ +# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +import logging +from functools import cached_property + +import numpy as np + +from . 
import MissingDateError +from .debug import Node +from .debug import debug_indexing +from .forwards import Combined +from .indexing import apply_index_to_slices_changes +from .indexing import expand_list_indexing +from .indexing import index_to_slices +from .indexing import update_tuple +from .misc import _auto_adjust +from .misc import _open + +LOG = logging.getLogger(__name__) + + +class Merge(Combined): + def __init__(self, datasets, allow_gaps_in_dates=False): + super().__init__(datasets) + + self.allow_gaps_in_dates = allow_gaps_in_dates + + dates = dict() + + for i, d in enumerate(datasets): + for j, date in enumerate(d.dates): + date = date.astype(object) + if date in dates: + d1 = datasets[dates[date][0]] + d2 = datasets[i] + raise ValueError(f"Duplicate date {date} found in datasets {d1} and {d2}") + dates[date] = (i, j) + + all_dates = sorted(dates) + start = all_dates[0] + end = all_dates[-1] + + frequency = min(d2 - d1 for d1, d2 in zip(all_dates[:-1], all_dates[1:])) + + date = start + indices = [] + _dates = [] + + self._missing_index = len(datasets) + + while date <= end: + if date not in dates: + if self.allow_gaps_in_dates: + dates[date] = (self._missing_index, -1) + else: + raise ValueError( + f"merge: date {date} not covered by dataset. 
Start={start}, end={end}, frequency={frequency}" + ) + + indices.append(dates[date]) + _dates.append(date) + date += frequency + + self._dates = np.array(_dates, dtype="datetime64[s]") + self._indices = np.array(indices) + self._frequency = frequency + + @property + def dates(self): + return self._dates + + @property + def frequency(self): + return self._frequency + + @cached_property + def missing(self): + # TODO: optimize + result = set() + + for i, (dataset, row) in enumerate(self._indices): + if dataset == self._missing_index: + result.add(i) + continue + + if row in self.datasets[dataset].missing: + result.add(i) + + return result + + def check_same_lengths(self, d1, d2): + # Turned off because we are concatenating along the first axis + pass + + def check_same_dates(self, d1, d2): + # Turned off because we are concatenating along the dates axis + pass + + def check_compatibility(self, d1, d2): + super().check_compatibility(d1, d2) + self.check_same_sub_shapes(d1, d2, drop_axis=0) + + def tree(self): + return Node(self, [d.tree() for d in self.datasets], allow_gaps_in_dates=self.allow_gaps_in_dates) + + @debug_indexing + def __getitem__(self, n): + if isinstance(n, tuple): + return self._get_tuple(n) + + if isinstance(n, slice): + return self._get_slice(n) + + dataset, row = self._indices[n] + + if dataset == self._missing_index: + raise MissingDateError(f"Date {self.dates[n]} is missing (index={n})") + + return self.datasets[dataset][int(row)] + + @debug_indexing + @expand_list_indexing + def _get_tuple(self, index): + index, changes = index_to_slices(index, self.shape) + index, previous = update_tuple(index, 0, slice(None)) + result = self._get_slice(previous) + return apply_index_to_slices_changes(result[index], changes) + + def _get_slice(self, s): + return np.stack([self[i] for i in range(*s.indices(self._len))]) + + +def merge_factory(args, kwargs): + + datasets = kwargs.pop("merge") + + assert isinstance(datasets, (list, tuple)) + assert len(args) == 0 
+ + datasets = [_open(e) for e in datasets] + + if len(datasets) == 1: + return datasets[0]._subset(**kwargs) + + datasets, kwargs = _auto_adjust(datasets, kwargs) + + allow_gaps_in_dates = kwargs.pop("allow_gaps_in_dates", False) + + return Merge(datasets, allow_gaps_in_dates=allow_gaps_in_dates)._subset(**kwargs) diff --git a/src/anemoi/datasets/data/misc.py b/src/anemoi/datasets/data/misc.py index 94478e28..8c7cfc04 100644 --- a/src/anemoi/datasets/data/misc.py +++ b/src/anemoi/datasets/data/misc.py @@ -302,6 +302,12 @@ def _open_dataset(*args, **kwargs): assert not sets, sets return concat_factory(args, kwargs).mutate() + if "merge" in kwargs: + from .merge import merge_factory + + assert not sets, sets + return merge_factory(args, kwargs).mutate() + if "ensemble" in kwargs: from .ensemble import ensemble_factory diff --git a/src/anemoi/datasets/data/xy.py b/src/anemoi/datasets/data/xy.py index 74b27a96..1c326f15 100644 --- a/src/anemoi/datasets/data/xy.py +++ b/src/anemoi/datasets/data/xy.py @@ -18,15 +18,19 @@ class ZipBase(Combined): + def __init__(self, datasets, check_compatibility=True): + self._check_compatibility = check_compatibility + super().__init__(datasets) + def swap_with_parent(self, parent): new_parents = [parent.clone(ds) for ds in self.datasets] return self.clone(new_parents) def clone(self, datasets): - return self.__class__(datasets) + return self.__class__(datasets, check_compatibility=self._check_compatibility) def tree(self): - return Node(self, [d.tree() for d in self.datasets]) + return Node(self, [d.tree() for d in self.datasets], check_compatibility=self._check_compatibility) def __len__(self): return min(len(d) for d in self.datasets) @@ -86,6 +90,10 @@ def resolution(self): def name_to_index(self): return tuple(d.name_to_index for d in self.datasets) + def check_compatibility(self, d1, d2): + if self._check_compatibility: + super().check_compatibility(d1, d2) + class Zip(ZipBase): pass @@ -110,7 +118,9 @@ def xy_factory(args, 
kwargs): assert len(datasets) == 2 - return XY(datasets)._subset(**kwargs) + check_compatibility = kwargs.pop("check_compatibility", True) + + return XY(datasets, check_compatibility=check_compatibility)._subset(**kwargs) def zip_factory(args, kwargs): @@ -122,4 +132,6 @@ def zip_factory(args, kwargs): datasets = [_open(e) for e in zip] datasets, kwargs = _auto_adjust(datasets, kwargs) - return Zip(datasets)._subset(**kwargs) + check_compatibility = kwargs.pop("check_compatibility", True) + + return Zip(datasets, check_compatibility=check_compatibility)._subset(**kwargs) From bd1244ce1f4e406b264d5b27cce846eafa6fa482 Mon Sep 17 00:00:00 2001 From: b8raoult <53792887+b8raoult@users.noreply.github.com> Date: Mon, 14 Oct 2024 18:28:27 +0100 Subject: [PATCH 6/8] Bugfix/various (#84) * Various bug fixes * support ensembles * fix rr accumulations --- CHANGELOG.md | 1 + src/anemoi/datasets/create/__init__.py | 19 +++++++++- .../create/functions/filters/rename.py | 13 ++++--- .../create/functions/sources/accumulations.py | 5 ++- src/anemoi/datasets/create/input/__init__.py | 16 +------- src/anemoi/datasets/create/input/result.py | 38 +++++++++++++++++-- src/anemoi/datasets/create/input/step.py | 1 + 7 files changed, 68 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eca03ad8..b9f3e9c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Keep it human-readable, your future self will thank you! 
### Added - Add anemoi-transform link to documentation +- Various bug fixes - Control compatibility check in xy/zip - Add `merge` feature diff --git a/src/anemoi/datasets/create/__init__.py b/src/anemoi/datasets/create/__init__.py index 14cf5240..85abac76 100644 --- a/src/anemoi/datasets/create/__init__.py +++ b/src/anemoi/datasets/create/__init__.py @@ -412,7 +412,24 @@ def _run(self): metadata.update(self.main_config.get("add_metadata", {})) metadata["_create_yaml_config"] = self.main_config.get_serialisable_dict() - metadata["recipe"] = sanitise(self.main_config.get_serialisable_dict()) + + recipe = sanitise(self.main_config.get_serialisable_dict()) + + # Remove stuff added by prepml + for k in [ + "build_dataset", + "config_format_version", + "config_path", + "dataset_status", + "ecflow", + "metadata", + "platform", + "reading_chunks", + "upload", + ]: + recipe.pop(k, None) + + metadata["recipe"] = recipe metadata["description"] = self.main_config.description metadata["licence"] = self.main_config["licence"] diff --git a/src/anemoi/datasets/create/functions/filters/rename.py b/src/anemoi/datasets/create/functions/filters/rename.py index b82fd8ca..666e085d 100644 --- a/src/anemoi/datasets/create/functions/filters/rename.py +++ b/src/anemoi/datasets/create/functions/filters/rename.py @@ -56,11 +56,14 @@ def __init__(self, field, format): self.format = format self.bits = re.findall(r"{(\w+)}", format) - def metadata(self, key, **kwargs): - value = self.field.metadata(key, **kwargs) - if "{" + key + "}" in self.format: - bits = {b: self.field.metadata(b, **kwargs) for b in self.bits} - return self.format.format(**bits) + def metadata(self, *args, **kwargs): + value = self.field.metadata(*args, **kwargs) + if args: + assert len(args) == 1 + key = args[0] + if "{" + key + "}" in self.format: + bits = {b: self.field.metadata(b, **kwargs) for b in self.bits} + return self.format.format(**bits) return value def __getattr__(self, name): diff --git 
a/src/anemoi/datasets/create/functions/sources/accumulations.py b/src/anemoi/datasets/create/functions/sources/accumulations.py index b74eb33f..bfc5eba7 100644 --- a/src/anemoi/datasets/create/functions/sources/accumulations.py +++ b/src/anemoi/datasets/create/functions/sources/accumulations.py @@ -370,12 +370,15 @@ def accumulations(context, dates, **request): user_accumulation_period = request.pop("accumulation_period", 6) + # If `data_accumulation_period` is not set, this means that the accumulations are from the start + # of the forecast. + KWARGS = { ("od", "oper"): dict(patch=_scda), ("od", "elda"): dict(base_times=(6, 18)), ("ea", "oper"): dict(data_accumulation_period=1, base_times=(6, 18)), ("ea", "enda"): dict(data_accumulation_period=3, base_times=(6, 18)), - ("rr", "oper"): dict(data_accumulation_period=3, base_times=(0, 3, 6, 9, 12, 15, 18, 21)), + ("rr", "oper"): dict(base_times=(0, 3, 6, 9, 12, 15, 18, 21)), ("l5", "oper"): dict(data_accumulation_period=1, base_times=(0,)), } diff --git a/src/anemoi/datasets/create/input/__init__.py b/src/anemoi/datasets/create/input/__init__.py index d23f038d..92e93669 100644 --- a/src/anemoi/datasets/create/input/__init__.py +++ b/src/anemoi/datasets/create/input/__init__.py @@ -6,23 +6,9 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. 
# -import datetime -import itertools + import logging -import math -import time -from collections import defaultdict from copy import deepcopy -from functools import cached_property -from functools import wraps - -import numpy as np -from anemoi.utils.dates import as_datetime as as_datetime -from anemoi.utils.dates import frequency_to_timedelta as frequency_to_timedelta - -from anemoi.datasets.dates import DatesProvider as DatesProvider -from anemoi.datasets.fields import FieldArray as FieldArray -from anemoi.datasets.fields import NewValidDateTimeField as NewValidDateTimeField from .trace import trace_select diff --git a/src/anemoi/datasets/create/input/result.py b/src/anemoi/datasets/create/input/result.py index 7fab9acf..f708ef9c 100644 --- a/src/anemoi/datasets/create/input/result.py +++ b/src/anemoi/datasets/create/input/result.py @@ -33,9 +33,38 @@ def _fields_metatata(variables, cube): assert isinstance(variables, tuple), variables + def _merge(md1, md2): + assert set(md1.keys()) == set(md2.keys()), (set(md1.keys()), set(md2.keys())) + result = {} + for k in md1.keys(): + v1 = md1[k] + v2 = md2[k] + + if v1 == v2: + result[k] = v1 + continue + + if isinstance(v1, list): + assert v2 not in v1, (v1, v2) + result[k] = sorted(v1 + [v2]) + continue + + if isinstance(v2, list): + assert v1 not in v2, (v1, v2) + result[k] = sorted(v2 + [v1]) + continue + + result[k] = sorted([v1, v2]) + + return result + result = {} - for i, c in enumerate(cube.iterate_cubelets()): - assert c._coords_names[1] == variables[i], (c._coords_names[1], variables[i]) + i = -1 + for c in cube.iterate_cubelets(): + + if i == -1 or c._coords_names[1] != variables[i]: + i += 1 + f = cube[c.coords] md = f.metadata(namespace="mars") if not md: @@ -49,7 +78,10 @@ def _fields_metatata(variables, cube): md["param"] = str(f.metadata("paramId", default="unknown")) # assert md['param'] != 'unknown', (md, f.metadata('param')) - result[variables[i]] = md + if variables[i] in result: + 
result[variables[i]] = _merge(md, result[variables[i]]) + else: + result[variables[i]] = md assert i + 1 == len(variables), (i + 1, len(variables)) return result diff --git a/src/anemoi/datasets/create/input/step.py b/src/anemoi/datasets/create/input/step.py index 3eb2917c..daca578b 100644 --- a/src/anemoi/datasets/create/input/step.py +++ b/src/anemoi/datasets/create/input/step.py @@ -59,6 +59,7 @@ def select(self, group_of_dates): ) def __repr__(self): + # raise NotImplementedError(f"Not implemented in {self.__class__.__name__}") return super().__repr__(self.previous_step, _inline_=str(self.kwargs)) From 4f949a5ed18b0d7a88dbb3c6ff3014ecb3131dc0 Mon Sep 17 00:00:00 2001 From: Florian Pinault Date: Tue, 15 Oct 2024 12:33:26 +0000 Subject: [PATCH 7/8] clean bug template --- .github/ISSUE_TEMPLATE/bug_report.md | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 16a27120..53e6c790 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -9,12 +9,15 @@ assignees: '' **Describe the bug** A clear and concise description of what the bug is. +**Version number** +I am using the following versions/branch/sha1 of the anemoi packages +(alternatively the output of `pip freeze`) + **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' -2. Click on '....' -3. Scroll down to '....' -4. See error +2. Run this '....' +3. See error **URL to sample input data** Provide a URL to a sample input data, or attach a file to that report if it is small enough. @@ -25,16 +28,5 @@ A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 
22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - **Additional context** Add any other context about the problem here. From 1d96021c21d196afadee1af5bf048fb750913889 Mon Sep 17 00:00:00 2001 From: b8raoult <53792887+b8raoult@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:00:58 +0100 Subject: [PATCH 8/8] Extend CODEOWNERS (#89) --- .github/CODEOWNERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0211a4e3..74bdac0a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,6 +1,6 @@ # CODEOWNERS file # Protect workflow files -/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb -/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb -/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb +/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry +/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry +/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry