diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 05fc7d10..7003d22e 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -15,9 +15,10 @@ from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN from .utils import ( EmptyFileError, - _AccessNCFileInfo, + GenericTimeParser, + GfdlTimeParser, + _NCFileInfo, _VarInfo, - get_timeinfo, ) # Frequency translations @@ -56,8 +57,9 @@ class BaseBuilder(Builder): This builds on the ecgtools.Builder class. """ - # Base class carries an empty set + # Base class carries an empty set, and a GenericParser PATTERNS: list = [] + TIME_PARSER = GenericTimeParser def __init__( self, @@ -222,7 +224,7 @@ def parser(file): raise NotImplementedError @classmethod - def parse_access_filename( + def parse_filename( cls, filename: str, patterns: list[str] | None = None, @@ -285,11 +287,9 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile( - cls, file: str, time_dim: str = "time" - ) -> _AccessNCFileInfo: + def parse_ncfile(cls, file: str, time_dim: str = "time") -> _NCFileInfo: """ - Get Intake-ESM datastore entry info from an ACCESS netcdf file + Get Intake-ESM datastore entry info from a netcdf file Parameters ---------- @@ -300,7 +300,7 @@ def parse_access_ncfile( Returns ------- - output_nc_info: _AccessNCFileInfo + output_nc_info: _NCFileInfo A dataclass containing the information parsed from the file Raises @@ -310,7 +310,7 @@ def parse_access_ncfile( file_path = Path(file) - file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( + file_id, filename_timestamp, filename_frequency = cls.parse_filename( file_path.stem ) @@ -327,14 +327,14 @@ def parse_access_ncfile( attrs = ds[var].attrs dvars.append_attrs(var, attrs) # type: ignore - start_date, end_date, frequency = get_timeinfo( + start_date, end_date, frequency = cls.TIME_PARSER( ds, filename_frequency, 
time_dim - ) + )() if not dvars.variable_list: raise EmptyFileError("This file contains no variables") - output_ncfile = _AccessNCFileInfo( + output_ncfile = _NCFileInfo( filename=file_path.name, path=file, file_id=file_id, @@ -399,7 +399,7 @@ def parser(cls, file) -> dict: if realm == "ice": realm = "seaIce" - nc_info = cls.parse_access_ncfile(file) + nc_info = cls.parse_ncfile(file) ncinfo_dict = nc_info.to_dict() ncinfo_dict["realm"] = realm @@ -457,7 +457,7 @@ def __init__(self, path): @classmethod def parser(cls, file) -> dict: try: - output_nc_info = cls.parse_access_ncfile(file) + output_nc_info = cls.parse_ncfile(file) ncinfo_dict = output_nc_info.to_dict() if "mom6" in ncinfo_dict["filename"]: @@ -487,6 +487,7 @@ class Mom6Builder(BaseBuilder): rf"[^\.]*({PATTERNS_HELPERS['ymd-ns']})\.{PATTERNS_HELPERS['mom6_components']}.*{PATTERNS_HELPERS['mom6_added_timestamp']}.*$", # Daily snapshot naming rf"[^\.]*({PATTERNS_HELPERS['ymd-ns']})\.{PATTERNS_HELPERS['mom6_components']}.*$", # Basic naming ] + TIME_PARSER = GfdlTimeParser def __init__(self, path): """ @@ -529,7 +530,7 @@ def __init__(self, path): @classmethod def parser(cls, file): try: - output_nc_info = cls.parse_access_ncfile(file) + output_nc_info = cls.parse_ncfile(file) ncinfo_dict = output_nc_info.to_dict() if "ocean" in ncinfo_dict["filename"]: @@ -605,7 +606,7 @@ def parser(cls, file): realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"} - nc_info = cls.parse_access_ncfile(file) + nc_info = cls.parse_ncfile(file) ncinfo_dict = nc_info.to_dict() # Remove exp_id from file id so that members can be part of the same dataset diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index cc0b6905..fe494acf 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -19,7 +19,7 @@ class EmptyFileError(Exception): @dataclass -class _AccessNCFileInfo: +class _NCFileInfo: """ Holds information about a NetCDF file that is 
used to create an intake-esm catalog entry. @@ -250,3 +250,309 @@ def _todate(t): frequency = frequency[1] return start_date, end_date, frequency + + +class GenericTimeParser: + """ + Generic time parser + """ + + def __init__(self, ds: xr.Dataset, filename_frequency: str | None, time_dim: str): + """ + Parameters + ---------- + ds: :py:class:`xarray.Dataset` + The dataset to parse the time info from + filename_frequency: str + Frequency as determined from the filename + time_dim: str + The name of the time dimension + """ + self.ds = ds + self.filename_frequency = filename_frequency + self.time_dim = time_dim + + @staticmethod + def _add_month_start(time, n: int): + """Add months to cftime datetime and truncate to start""" + year = time.year + ((time.month + n - 1) // 12) + month = (time.month + n - 1) % 12 + 1 + return time.replace( + year=year, month=month, day=1, hour=0, minute=0, second=0, microsecond=0 + ) + + @staticmethod + def _add_year_start(time, n: int): + """Add years to cftime datetime and truncate to start""" + return time.replace( + year=time.year + n, + month=1, + day=1, + hour=0, + minute=0, + second=0, + microsecond=0, + ) + + @staticmethod + def _guess_start_end_dates(ts, te, frequency): + """Guess the start and end bounded times for a given frequency""" + warnings.warn( + "Time coordinate does not include bounds information. Guessing " + "start and end times." 
+ ) + num, unit = frequency + if unit == "yr": + step_back = -int(num / 2) + step_fwd = num + step_back + ts = GenericTimeParser._add_year_start(ts, step_back) + te = GenericTimeParser._add_year_start(te, step_fwd) + elif unit == "mon": + step_back = -int(num / 2) + step_fwd = num + step_back + ts = GenericTimeParser._add_month_start(ts, step_back) + te = GenericTimeParser._add_month_start(te, step_fwd) + elif unit == "day": + dt = timedelta(days=num) / 2 + ts = ts - dt + te = te + dt + elif unit == "hr": + dt = timedelta(hours=num) / 2 + ts = ts - dt + te = te + dt + else: + warnings.warn("Cannot infer start and end times for subhourly frequencies.") + return ts, te + + def _get_timeinfo(self) -> tuple[str, str, str]: + """ + Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the + cosima cookbook, see + https://github.com/COSIMA/cosima-cookbook/blob/master/cosima_cookbook/database.py#L565 + + Parameters + ---------- + ds: :py:class:`xarray.Dataset` + The dataset to parse the time info from + filename_frequency: str + Frequency as determined from the filename + time_dim: str + The name of the time dimension + + Returns + ------- + start_date: str + The start date of the dataset + end_date: str + The end date of the dataset + frequency: str + The frequency of the dataset + + Raises + ------ + EmptyFileError + If the dataset has a valid unlimited dimension, but no data + """ + + ds = self.ds + filename_frequency = self.filename_frequency + time_dim = self.time_dim + + def _todate(t): + return cftime.num2date(t, time_var.units, calendar=time_var.calendar) + + time_format = "%Y-%m-%d, %H:%M:%S" + ts = None + te = None + frequency: str | tuple[int | None, str] = FREQUENCY_STATIC + has_time = time_dim in ds + + if has_time: + time_var = ds[time_dim] + + if len(time_var) == 0: + raise EmptyFileError( + "This file has a valid unlimited dimension, but no data" + ) + + has_bounds = hasattr(time_var, "bounds") and time_var.bounds in 
ds.variables + if has_bounds: + bounds_var = ds.variables[time_var.bounds] + ts = _todate(bounds_var[0, 0]) + te = _todate(bounds_var[-1, 1]) + else: + ts = _todate(time_var[0]) + te = _todate(time_var[-1]) + + if len(time_var) > 1 or has_bounds: + if has_bounds: + t1 = _todate(bounds_var[0, 1]) + else: + t1 = _todate(time_var[1]) + + dt = t1 - ts + # TODO: This is not a very good way to get the frequency + if dt.days >= 365: + years = round(dt.days / 365) + frequency = (years, "yr") + elif dt.days >= 28: + months = round(dt.days / 30) + frequency = (months, "mon") + elif dt.days >= 1: + frequency = (dt.days, "day") + elif dt.seconds >= 3600: + hours = round(dt.seconds / 3600) + frequency = (hours, "hr") + else: + frequency = (None, "subhr") + + if filename_frequency: + if filename_frequency != frequency: + msg = ( + f"The frequency '{filename_frequency}' determined from filename does not " + f"match the frequency '{frequency}' determined from the file contents." + ) + if frequency == FREQUENCY_STATIC: + frequency = filename_frequency + warnings.warn(f"{msg} Using '{frequency}'.") + + if has_time & (frequency != FREQUENCY_STATIC): + if not has_bounds: + ts, te = GenericTimeParser._guess_start_end_dates(ts, te, frequency) + + if ts is None: + start_date = "none" + else: + start_date = ts.strftime(time_format) + + if te is None: + end_date = "none" + else: + end_date = te.strftime(time_format) + + if frequency[0]: + frequency = f"{str(frequency[0])}{frequency[1]}" + else: + frequency = frequency[1] + + return start_date, end_date, frequency + + def __call__(self) -> tuple[str, str, str]: + return self._get_timeinfo() + + +class AccessTimeParser(GenericTimeParser): + pass + + +class GfdlTimeParser(GenericTimeParser): + def __init__(self, ds: xr.Dataset, filename_frequency: str | None, time_dim: str): + self.ds = ds + self.filename_frequency = filename_frequency + self.time_dim = time_dim + + def _get_timeinfo(self) -> tuple[str, str, str]: + """ + Get start date, end 
date and frequency of a xarray dataset. Stolen and adapted from the + cosima cookbook, see + https://github.com/COSIMA/cosima-cookbook/blob/master/cosima_cookbook/database.py#L565 + + Parameters + ---------- + ds: :py:class:`xarray.Dataset` + The dataset to parse the time info from + filename_frequency: str + Frequency as determined from the filename + time_dim: str + The name of the time dimension + + Returns + ------- + start_date: str + The start date of the dataset + end_date: str + The end date of the dataset + frequency: str + The frequency of the dataset + + Raises + ------ + EmptyFileError + If the dataset has a valid unlimited dimension, but no data + """ + + ds = self.ds + filename_frequency = self.filename_frequency + time_dim = self.time_dim + + def _todate(t): + return cftime.num2date(t, time_var.units, calendar=time_var.calendar) + + time_format = "%Y-%m-%d, %H:%M:%S" + ts = None + te = None + frequency: str | tuple[int | None, str] = FREQUENCY_STATIC + has_time = time_dim in ds + + if has_time: + time_var = ds[time_dim] + + if len(time_var) == 0: + raise EmptyFileError( + "This file has a valid unlimited dimension, but no data" + ) + + ts = _todate(time_var[0]) + te = _todate(time_var[-1]) + + if len(time_var) > 1: + t1 = _todate(time_var[1]) + + dt = t1 - ts + # TODO: This is not a very good way to get the frequency + if dt.days >= 365: + years = round(dt.days / 365) + frequency = (years, "yr") + elif dt.days >= 28: + months = round(dt.days / 30) + frequency = (months, "mon") + elif dt.days >= 1: + frequency = (dt.days, "day") + elif dt.seconds >= 3600: + hours = round(dt.seconds / 3600) + frequency = (hours, "hr") + else: + frequency = (None, "subhr") + + if filename_frequency: + if filename_frequency != frequency: + msg = ( + f"The frequency '{filename_frequency}' determined from filename does not " + f"match the frequency '{frequency}' determined from the file contents." 
+ ) + if frequency == FREQUENCY_STATIC: + frequency = filename_frequency + warnings.warn(f"{msg} Using '{frequency}'.") + + if has_time & (frequency != FREQUENCY_STATIC): + ts, te = GenericTimeParser._guess_start_end_dates(ts, te, frequency) + + if ts is None: + start_date = "none" + else: + start_date = ts.strftime(time_format) + + if te is None: + end_date = "none" + else: + end_date = te.strftime(time_format) + + if frequency[0]: + frequency = f"{str(frequency[0])}{frequency[1]}" + else: + frequency = frequency[1] + + return start_date, end_date, frequency + + def __call__(self) -> tuple[str, str, str]: + return self._get_timeinfo() diff --git a/tests/test_builders.py b/tests/test_builders.py index 0ed3c5d4..ea820e63 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -13,7 +13,7 @@ from intake_esm.utils import OPTIONS from access_nri_intake.source import CORE_COLUMNS, builders -from access_nri_intake.source.utils import _AccessNCFileInfo +from access_nri_intake.source.utils import _NCFileInfo @pytest.mark.parametrize( @@ -224,7 +224,7 @@ def test_builder_parser(test_data, filename, builder, realm, member, file_id): assert info["file_id"] == file_id -@mock.patch("access_nri_intake.source.utils._AccessNCFileInfo.to_dict") +@mock.patch("access_nri_intake.source.utils._NCFileInfo.to_dict") @pytest.mark.parametrize( "filename", [ @@ -759,8 +759,8 @@ def test_builder_columns_with_iterables(test_data): ), ], ) -def test_parse_access_filename(builder, filename, expected): - assert builder.parse_access_filename(filename) == expected +def test_parse_filename(builder, filename, expected): + assert builder.parse_filename(filename) == expected @pytest.mark.parametrize( @@ -776,7 +776,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean_grid.nc", file_id="ocean_grid", @@ -799,7 +799,7 @@ def 
test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean.nc", file_id="ocean", @@ -849,7 +849,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean_month.nc", file_id="ocean_month", @@ -888,7 +888,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean_month_inst_nobounds.nc", file_id="ocean_month_inst_nobounds", @@ -921,7 +921,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="iceh.1900-01.nc", file_id="iceh_XXXX_XX", @@ -953,7 +953,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="by578a.pd201501_dai.nc", file_id="by578a_pdXXXXXX_dai", @@ -971,7 +971,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="iceh_d.2015-01.nc", file_id="iceh_d_XXXX_XX", @@ -1003,7 +1003,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean_daily.nc-20150630", file_id="ocean_daily", @@ -1035,7 +1035,7 @@ def 
test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean_scalar.nc-20150630", file_id="ocean_scalar", @@ -1077,7 +1077,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="HI-C-05-r1.pa-185001_mon.nc", file_id="HI_C_05_r1_pa_XXXXXX_mon", @@ -1095,7 +1095,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="iceh.1850-01.nc", file_id="iceh_XXXX_XX", @@ -1127,7 +1127,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean_bgc_ann.nc-18501231", file_id="ocean_bgc_ann", @@ -1166,7 +1166,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="ocean_bgc.nc-18501231", file_id="ocean_bgc", @@ -1208,7 +1208,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.native_1900_01.nc", file_id="GMOM_JRA_WD_mom6_h_native_XXXX_XX", @@ -1281,7 +1281,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore 
filename="GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", file_id="GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", @@ -1349,7 +1349,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.static.nc", file_id="GMOM_JRA_WD_mom6_h_static", @@ -1377,7 +1377,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.z_1900_01.nc", file_id="GMOM_JRA_WD_mom6_h_z_XXXX_XX", @@ -1450,7 +1450,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="GMOM_JRA_WD.cice.h.1900-01-01.nc", file_id="GMOM_JRA_WD_cice_h_XXXX_XX_XX", @@ -1482,7 +1482,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", file_id="GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", @@ -1500,14 +1500,14 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output000/19000101.ice_daily.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="19000101.ice_daily.nc", file_id="XXXXXXXX_ice_daily", filename_timestamp="19000101", - frequency="subhr", + frequency="1day", start_date="1900-01-01, 00:00:00", - end_date="1900-01-01, 00:00:00", + end_date="1901-01-01, 00:00:00", variable=[ "xT", "xTe", @@ -1583,14 +1583,14 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output000/19000101.ocean_annual_z.nc", - 
_AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="19000101.ocean_annual_z.nc", file_id="XXXXXXXX_ocean_annual_z", filename_timestamp="19000101", - frequency="subhr", + frequency="1yr", start_date="1900-01-01, 00:00:00", - end_date="1900-01-01, 00:00:00", + end_date="1901-01-01, 00:00:00", variable=[ "xh", "yh", @@ -1746,14 +1746,14 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output000/19000101.ocean_month_rho2.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="19000101.ocean_month_rho2.nc", file_id="XXXXXXXX_ocean_month_rho2", filename_timestamp="19000101", - frequency="subhr", + frequency="1mon", start_date="1900-01-01, 00:00:00", - end_date="1900-01-01, 00:00:00", + end_date="1901-01-01, 00:00:00", variable=[ "xh", "yh", @@ -1839,14 +1839,14 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output000/19000101.ocean_scalar_annual.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="19000101.ocean_scalar_annual.nc", file_id="XXXXXXXX_ocean_scalar_annual", filename_timestamp="19000101", - frequency="subhr", + frequency="1yr", start_date="1900-01-01, 00:00:00", - end_date="1900-01-01, 00:00:00", + end_date="1901-01-01, 00:00:00", variable=[ "scalar_axis", "time", @@ -1922,7 +1922,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output000/19000101.ocean_static.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="19000101.ocean_static.nc", file_id="XXXXXXXX_ocean_static", @@ -2100,14 +2100,14 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output053/20051101.ocean_daily_2005_360.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="20051101.ocean_daily_2005_360.nc", file_id="XXXXXXXX_ocean_daily_XXXX_XXX", filename_timestamp="20051101", - frequency="subhr", - 
start_date="1991-01-01, 00:00:00", - end_date="1991-01-01, 00:00:00", + frequency="1day", + start_date="2005-12-26, 00:00:00", + end_date="2005-12-27, 00:00:00", variable=[ "xh", "yh", @@ -2233,14 +2233,14 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output053/20051101.ocean_daily_rho2_2005_360.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="20051101.ocean_daily_rho2_2005_360.nc", file_id="XXXXXXXX_ocean_daily_rho2_XXXX_XXX", filename_timestamp="20051101", - frequency="subhr", - start_date="1991-01-01, 00:00:00", - end_date="1991-01-01, 00:00:00", + frequency="1day", + start_date="2005-12-26, 00:00:00", + end_date="2005-12-27, 00:00:00", variable=[ "xh", "yh", @@ -2331,14 +2331,14 @@ def test_parse_access_filename(builder, filename, expected): ( builders.Mom6Builder, "mom6/output053/20051101.ocean_daily_z_2005_360.nc", - _AccessNCFileInfo( + _NCFileInfo( path=None, # type: ignore filename="20051101.ocean_daily_z_2005_360.nc", file_id="XXXXXXXX_ocean_daily_z_XXXX_XXX", filename_timestamp="20051101", - frequency="subhr", - start_date="1991-01-01, 00:00:00", - end_date="1991-01-01, 00:00:00", + frequency="1day", + start_date="2005-12-26, 00:00:00", + end_date="2005-12-27, 00:00:00", variable=[ "xh", "yh", @@ -2456,7 +2456,7 @@ def test_parse_access_ncfile(test_data, builder, filename, expected, compare_fil # Set the path to the test data directory expected.path = file - assert builder.parse_access_ncfile(file) == expected + assert builder.parse_ncfile(file) == expected if not compare_files: return None diff --git a/tests/test_source_utils.py b/tests/test_source_utils.py index 419a9f44..f7628f49 100644 --- a/tests/test_source_utils.py +++ b/tests/test_source_utils.py @@ -4,7 +4,13 @@ import pytest import xarray as xr -from access_nri_intake.source.utils import get_timeinfo +from access_nri_intake.source.utils import ( + AccessTimeParser, + EmptyFileError, + GenericTimeParser, + 
GfdlTimeParser, + get_timeinfo, +) @pytest.mark.parametrize( @@ -142,3 +148,276 @@ def test_get_timeinfo(times, bounds, ffreq, expected): ) assert get_timeinfo(ds, filename_frequency=ffreq, time_dim="time") == expected + + +@pytest.mark.parametrize( + "times, bounds, ffreq, expected", + [ + ( + [365 / 2], + False, + (1, "yr"), + ("1900-01-01, 00:00:00", "1901-01-01, 00:00:00", "1yr"), + ), + ( + [31 / 2], + False, + (1, "mon"), + ("1900-01-01, 00:00:00", "1900-02-01, 00:00:00", "1mon"), + ), + ( + [1.5 / 24], + False, + (3, "hr"), + ("1900-01-01, 00:00:00", "1900-01-01, 03:00:00", "3hr"), + ), + ( + [0.0, 9 / 60 / 24], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 00:09:00", "subhr"), + ), + ( + [0.0, 3 / 24], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 03:00:00", "3hr"), + ), + ( + [0.0, 6 / 24], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 06:00:00", "6hr"), + ), + ( + [0.0, 1.0], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-02, 00:00:00", "1day"), + ), + ( + [0.0, 31.0], + True, + None, + ("1900-01-01, 00:00:00", "1900-02-01, 00:00:00", "1mon"), + ), + ( + [0.0, 90.0], + True, + None, + ("1900-01-01, 00:00:00", "1900-04-01, 00:00:00", "3mon"), + ), + ( + [0.0, 365.0], + True, + None, + ("1900-01-01, 00:00:00", "1901-01-01, 00:00:00", "1yr"), + ), + ( + [0.0, 730.0], + True, + None, + ("1900-01-01, 00:00:00", "1902-01-01, 00:00:00", "2yr"), + ), + ( + [1.5 / 24, 4.5 / 24], + False, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 06:00:00", "3hr"), + ), + ( + [3 / 24, 9 / 24], + False, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 12:00:00", "6hr"), + ), + ( + [0.5, 1.5], + False, + None, + ("1900-01-01, 00:00:00", "1900-01-03, 00:00:00", "1day"), + ), + ( + [31 / 2, 45], + False, + None, + ("1900-01-01, 00:00:00", "1900-03-01, 00:00:00", "1mon"), + ), + ( + [45, 135.5], + False, + None, + ("1900-01-01, 00:00:00", "1900-07-01, 00:00:00", "3mon"), + ), + ( + [365 / 2, 365 + 365 / 2], + False, + None, + ("1900-01-01, 
00:00:00", "1902-01-01, 00:00:00", "1yr"), + ), + ( + [365, 3 * 365], + False, + None, + ("1900-01-01, 00:00:00", "1904-01-01, 00:00:00", "2yr"), + ), + ], +) +@pytest.mark.parametrize( + "parser", + [AccessTimeParser, GenericTimeParser], +) +def test_generic_time_parser(times, bounds, ffreq, expected, parser): + if bounds: + time = (times[0] + times[1]) / 2 + ds = xr.Dataset( + data_vars={ + "dummy": ("time", [0]), + "time_bounds": (("time", "nv"), [(times[0], times[1])]), + }, + coords={"time": [time]}, + ) + ds["time"].attrs = dict(bounds="time_bounds") + else: + ds = xr.Dataset( + data_vars={"dummy": ("time", [0] * len(times))}, + coords={"time": times}, + ) + + ds["time"].attrs |= dict( + units="days since 1900-01-01 00:00:00", calendar="GREGORIAN" + ) + + assert parser(ds, filename_frequency=ffreq, time_dim="time")() == expected + + +@pytest.mark.parametrize( + "parser", + [AccessTimeParser, GenericTimeParser], +) +def test_generic_time_parser_warnings(parser): + times = [1.5 / 24 / 60] + ffreq = (3, "s") + + ds = xr.Dataset( + data_vars={"dummy": ("time", [0] * len(times))}, + coords={"time": times}, + ) + + ds["time"].attrs |= dict( + units="days since 1900-01-01 00:00:00", calendar="GREGORIAN" + ) + + with pytest.warns( + match="Cannot infer start and end times for subhourly frequencies." 
+ ): + parser(ds, filename_frequency=ffreq, time_dim="time")._guess_start_end_dates( + 0, 1, (1, "s") + ) + + +@pytest.mark.parametrize( + "parser", + [AccessTimeParser, GenericTimeParser, GfdlTimeParser], +) +def test_generic_empty_file_error(parser): + times = [] + ffreq = (3, "hr") + + ds = xr.Dataset( + data_vars={"dummy": ("time", [])}, + coords={"time": times}, + ) + + ds["time"].attrs |= dict( + units="days since 1900-01-01 00:00:00", calendar="GREGORIAN" + ) + + with pytest.raises(EmptyFileError): + parser(ds, filename_frequency=ffreq, time_dim="time")() + + +@pytest.mark.parametrize( + "times, ffreq, expected", + [ + ( + [365 / 2], + (1, "yr"), + ("1900-01-01, 00:00:00", "1901-01-01, 00:00:00", "1yr"), + ), + ( + [31 / 2], + (1, "mon"), + ("1900-01-01, 00:00:00", "1900-02-01, 00:00:00", "1mon"), + ), + ( + [1.5 / 24], + (3, "hr"), + ("1900-01-01, 00:00:00", "1900-01-01, 03:00:00", "3hr"), + ), + ( + [1.5 / 24, 4.5 / 24], + None, + ("1900-01-01, 00:00:00", "1900-01-01, 06:00:00", "3hr"), + ), + ( + [3 / 24, 9 / 24], + None, + ("1900-01-01, 00:00:00", "1900-01-01, 12:00:00", "6hr"), + ), + ( + [0.5, 1.5], + None, + ("1900-01-01, 00:00:00", "1900-01-03, 00:00:00", "1day"), + ), + ( + [31 / 2, 45], + None, + ("1900-01-01, 00:00:00", "1900-03-01, 00:00:00", "1mon"), + ), + ( + [45, 135.5], + None, + ("1900-01-01, 00:00:00", "1900-07-01, 00:00:00", "3mon"), + ), + ( + [365 / 2, 365 + 365 / 2], + None, + ("1900-01-01, 00:00:00", "1902-01-01, 00:00:00", "1yr"), + ), + ( + [365, 3 * 365], + None, + ("1900-01-01, 00:00:00", "1904-01-01, 00:00:00", "2yr"), + ), + ( + [365 / 86400 / 720, 365 / 86400 / 360], # 1/2 second, 1 second + None, + ("1900-01-01, 00:00:00", "1900-01-01, 00:00:01", "subhr"), + ), + ], +) +def test_gfdl_time_parser(times, ffreq, expected): + ds = xr.Dataset( + data_vars={"dummy": ("time", [0] * len(times))}, + coords={"time": times}, + ) + + ds["time"].attrs |= dict( + units="days since 1900-01-01 00:00:00", calendar="GREGORIAN" + ) + + assert 
GfdlTimeParser(ds, filename_frequency=ffreq, time_dim="time")() == expected + + +def test_gfdl_parser_notime(): + ds = xr.Dataset( + data_vars={"dummy": ("latitude", [0])}, + coords={"latitude": [0]}, + ) + + assert GfdlTimeParser(ds, filename_frequency=None, time_dim="time")() == ( + "none", + "none", + "fx", + ) diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..18a7bbf4 --- /dev/null +++ b/tox.ini @@ -0,0 +1,48 @@ +[tox] +envlist = + py{310,311,312,313} + +[testenv] +setenv = + PYTHONPATH = {toxinidir} + +deps = + pytest + +commands = + pip install pytest-random-order + pip install . + pytest --random-order tests + +basepython = + py310: python3.10 + py311: python3.11 + py312: python3.12 + py313: python3.13 + +[testenv:py310] +conda_env = ci/environment-3.10.yml + +[testenv:py311] +conda_env = ci/environment-3.11.yml + +[testenv:py312] +conda_env = ci/environment-3.12.yml + +[testenv:py313] +conda_env = ci/environment-3.13.yml + + +; 310: All Passed + +; 311: 3 failed +; FAILED tests/test_builders.py::test_builder_build[basedirs4-Mom6Builder-kwargs4-27-27-15] - AssertionError: assert 26 == 27 +; FAILED tests/test_builders.py::test_builder_parser[mom6/output000/19000101.ice_daily.nc-Mom6Builder-seaIce-None-XXXXXXXX_ice_daily] - KeyError: 'realm' +; FAILED tests/test_builders.py::test_builder_parser[mom6/output000/19000101.ocean_daily.nc-Mom6Builder-ocean-None-XXXXXXXX_ocean_daily] - KeyError: 'realm' +; 3 failed + +; 312: 1 failed +; FAILED tests/test_builders.py::test_parse_access_ncfile[Mom6Builder-mom6/output000/19000101.ocean_annual_z.nc-expected20-False] - AssertionError: assert _AccessNCFile...ays', 'days']) == _AccessNCFile...ays', 'days']) +; 1 failed + +; 313: All Passed