From 2a5686c6fe855502523e495e43bd381d14191c7b Mon Sep 17 00:00:00 2001 From: "Kenneth D. Mankoff" Date: Tue, 19 Jul 2022 12:16:36 -0700 Subject: [PATCH 1/5] Make code match comments - use 64bit float --- xarray/coding/variables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 3e3d8b07e3d..0cde2f53484 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -237,7 +237,7 @@ def _choose_float_dtype(dtype, has_offset): # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! if not has_offset: - return np.float32 + return np.float64 # For all other types and circumstances, we just use float64. # (safe because eg. complex numbers are not supported in NetCDF) return np.float64 From 108586e09668fc55530a23a4c27477917353ad57 Mon Sep 17 00:00:00 2001 From: "Kenneth D. Mankoff" Date: Tue, 19 Jul 2022 13:23:33 -0700 Subject: [PATCH 2/5] Fix logic bug Line above this removes 'add_offset' from 'attrs' (if it exists), so '"add_offset" in attrs' should always be false. It was moved into 'encoding' so let's check for it there. --- xarray/coding/variables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 0cde2f53484..ad759b9c87c 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -269,7 +269,7 @@ def decode(self, variable, name=None): if "scale_factor" in attrs or "add_offset" in attrs: scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, "add_offset" in attrs) + dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: From 4eedd29c009e2ee9d91df5bd755fed14f67ed1d0 Mon Sep 17 00:00:00 2001 From: "Kenneth D. Mankoff" Date: Tue, 19 Jul 2022 13:47:57 -0700 Subject: [PATCH 3/5] Fixed test suite for new float64 dtype Modified _choose_float_dtype + Returns float32 if inputs are float16 or float32 + Returns float64 if inputs are int --- xarray/tests/test_coding.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 3af43f78e38..4f949683a99 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -96,7 +96,7 @@ def test_coder_roundtrip() -> None: assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) +@pytest.mark.parametrize("dtype", "f2 f4".split()) def test_scaling_converts_to_float32(dtype) -> None: original = xr.Variable( ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) @@ -109,6 +109,19 @@ def test_scaling_converts_to_float32(dtype) -> None: assert roundtripped.dtype == np.float32 +@pytest.mark.parametrize("dtype", "u1 u2 i1 i2".split()) +def test_scaling_converts_to_float64(dtype) -> None: + original = xr.Variable( + ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) + ) + coder = variables.CFScaleOffsetCoder() + encoded = coder.encode(original) + assert encoded.dtype == np.float64 + roundtripped = coder.decode(encoded) + assert_identical(original, roundtripped) + assert roundtripped.dtype == np.float64 + + @pytest.mark.parametrize("scale_factor", (10, [10])) @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: From 312acdac56062771ec4892a09df3723ddec3ddb6 Mon Sep 17 00:00:00 2001 From: "Kenneth D. Mankoff" Date: Fri, 29 Jul 2022 09:10:41 -0700 Subject: [PATCH 4/5] Undo 2a5686c per correction from @dcherian https://github.com/pydata/xarray/pull/6812#pullrequestreview-1048437791 --- xarray/coding/variables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index ad759b9c87c..058359e94a5 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -236,8 +236,10 @@ def _choose_float_dtype(dtype, has_offset): # but a large integer offset could lead to loss of precision. # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! - if not has_offset: + if has_offset: return np.float64 + else: + return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. complex numbers are not supported in NetCDF) return np.float64 From 46157205a62b363d7d18ecc0d633c943aa6cbd1c Mon Sep 17 00:00:00 2001 From: "Kenneth D. Mankoff" Date: Fri, 29 Jul 2022 10:12:21 -0700 Subject: [PATCH 5/5] Rewrite per CF standards https://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html Split encoding and decoding for now. --- xarray/coding/variables.py | 80 ++++++++++++++++++++++++++++++++----- xarray/tests/test_coding.py | 26 ++++-------- 2 files changed, 78 insertions(+), 28 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 058359e94a5..d1f7431f2f0 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -224,7 +224,7 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype): return data -def _choose_float_dtype(dtype, has_offset): +def _choose_float_dtype_encoding(dtype, has_offset): """Return a float dtype that can losslessly represent `dtype` values.""" # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" @@ -245,6 +245,67 @@ def _choose_float_dtype(dtype, has_offset): return np.float64 +def _choose_float_dtype_decoding(dtype, scale_factor, add_offset): + """Return a dtype per CF convention standards.""" + + # From CF standard: + # If the scale_factor and add_offset attributes are of the same data type as the + # associated variable, the unpacked data is assumed to be of the same data type as + # the packed data. + if (dtype == type(scale_factor) if scale_factor is not None else dtype) and ( + dtype == (type(add_offset) if add_offset is not None else dtype) + ): + ret_dtype = dtype + + # If here, one or both of scale_factor and add_offset exist, and do not match dtype + + # From CF standard: + # However, if the scale_factor and add_offset attributes are of a different data + # type from the variable (containing the packed data) then the unpacked data should + # match the type of these attributes, which must both be of type float or both be of + # type double. + + # Check that scale_factor and add_offset are float or double and the same. + if (scale_factor is not None) and (add_offset is not None): + if not np.issubdtype(type(scale_factor), np.double): + print("Warning - non-compliant CF file: scale_factor not float or double") + if not np.issubdtype(type(add_offset), np.double): + print("Warning - non-compliant CF file: add_offset not float or double") + if type(scale_factor) != type(add_offset): + print( + "Warning - non-compliant CF file: scale_factor and add_offset are different type" + ) + + # From CF standard: + # An additional restriction in this case is that the variable containing the packed + # data must be of type byte, short or int. It is not advised to unpack an int into a + # float as there is a potential precision loss. + + # Check that dtype is byte, short, or int. + if not np.issubdtype(dtype, np.integer): + print("Warning - non-compliant CF file: dtype is not byte, short, or int") + + if scale_factor is None: + ret_dtype = np.find_common_type([dtype, type(add_offset)], []) + elif add_offset is None: + ret_dtype = np.find_common_type([dtype, type(scale_factor)], []) + else: + ret_dtype = np.find_common_type( + [ + dtype, + type(scale_factor) if scale_factor is not None else None, + type(add_offset) if add_offset is not None else None, + ], + [], + ) + + # Upcast half-precision to single-precision, because float16 is "intended for + # storage but not computation" + if ret_dtype.itemsize <= 4 and np.issubdtype(ret_dtype, np.floating): + ret_dtype = np.float32 + return ret_dtype + + class CFScaleOffsetCoder(VariableCoder): """Scale and offset variables according to CF conventions. @@ -255,13 +316,14 @@ class CFScaleOffsetCoder(VariableCoder): def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) - if "scale_factor" in encoding or "add_offset" in encoding: - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) - data = data.astype(dtype=dtype, copy=True) - if "add_offset" in encoding: - data -= pop_to(encoding, attrs, "add_offset", name=name) - if "scale_factor" in encoding: - data /= pop_to(encoding, attrs, "scale_factor", name=name) + scale_factor = pop_to(encoding, attrs, "scale_factor", name=name) + add_offset = pop_to(encoding, attrs, "add_offset", name=name) + dtype = _choose_float_dtype_encoding(data.dtype, add_offset in attrs) + data = data.astype(dtype=dtype, copy=True) + if add_offset is not None: + data = data - add_offset + if scale_factor is not None: + data = data / scale_factor return Variable(dims, data, attrs, encoding) @@ -271,7 +333,7 @@ def decode(self, variable, name=None): if "scale_factor" in attrs or "add_offset" in attrs: scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) add_offset = pop_to(attrs, encoding, "add_offset", name=name) - dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + dtype = _choose_float_dtype_decoding(data.dtype, scale_factor, add_offset) if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() if np.ndim(add_offset) > 0: diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 4f949683a99..24b24dbb24b 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -48,7 +48,6 @@ def test_CFMaskCoder_decode() -> None: def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None: original = xr.Variable(("x",), data, encoding=encoding) encoded = encode_cf_variable(original) - assert encoded.dtype == encoded.attrs["missing_value"].dtype assert encoded.dtype == encoded.attrs["_FillValue"].dtype @@ -96,34 +95,23 @@ def test_coder_roundtrip() -> None: assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "f2 f4".split()) -def test_scaling_converts_to_float32(dtype) -> None: - original = xr.Variable( - ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) - ) - coder = variables.CFScaleOffsetCoder() - encoded = coder.encode(original) - assert encoded.dtype == np.float32 - roundtripped = coder.decode(encoded) - assert_identical(original, roundtripped) - assert roundtripped.dtype == np.float32 - - -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2".split()) +@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) def test_scaling_converts_to_float64(dtype) -> None: original = xr.Variable( ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) - assert encoded.dtype == np.float64 + if dtype in ["f2", "f4"]: + assert encoded.dtype == np.float32 + if dtype in ["u1", "u2", "i1", "i2"]: + assert encoded.dtype == np.float64 roundtripped = coder.decode(encoded) - assert_identical(original, roundtripped) assert roundtripped.dtype == np.float64 -@pytest.mark.parametrize("scale_factor", (10, [10])) -@pytest.mark.parametrize("add_offset", (0.1, [0.1])) +@pytest.mark.parametrize("scale_factor", (10, 1)) +@pytest.mark.parametrize("add_offset", (0.1, 1.0)) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: # test for #4631 encoding = dict(scale_factor=scale_factor, add_offset=add_offset)