From 2a5686c6fe855502523e495e43bd381d14191c7b Mon Sep 17 00:00:00 2001
From: "Kenneth D. Mankoff" <mankoff@gmail.com>
Date: Tue, 19 Jul 2022 12:16:36 -0700
Subject: [PATCH 1/5] Make code match comments - use 64bit float

---
 xarray/coding/variables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 3e3d8b07e3d..0cde2f53484 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -237,7 +237,7 @@ def _choose_float_dtype(dtype, has_offset):
         # Sensitivity analysis can be tricky, so we just use a float64
         # if there's any offset at all - better unoptimised than wrong!
         if not has_offset:
-            return np.float32
+            return np.float64
     # For all other types and circumstances, we just use float64.
     # (safe because eg. complex numbers are not supported in NetCDF)
     return np.float64

From 108586e09668fc55530a23a4c27477917353ad57 Mon Sep 17 00:00:00 2001
From: "Kenneth D. Mankoff" <mankoff@gmail.com>
Date: Tue, 19 Jul 2022 13:23:33 -0700
Subject: [PATCH 2/5] Fix logic bug

Line above this removes 'add_offset' from 'attrs' (if it exists), so
'"add_offset" in attrs' should always be false. It was moved into
'encoding' so let's check for it there.
---
 xarray/coding/variables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 0cde2f53484..ad759b9c87c 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -269,7 +269,7 @@ def decode(self, variable, name=None):
         if "scale_factor" in attrs or "add_offset" in attrs:
             scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
             add_offset = pop_to(attrs, encoding, "add_offset", name=name)
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in attrs)
+            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
             if np.ndim(scale_factor) > 0:
                 scale_factor = np.asarray(scale_factor).item()
             if np.ndim(add_offset) > 0:

From 4eedd29c009e2ee9d91df5bd755fed14f67ed1d0 Mon Sep 17 00:00:00 2001
From: "Kenneth D. Mankoff" <mankoff@gmail.com>
Date: Tue, 19 Jul 2022 13:47:57 -0700
Subject: [PATCH 3/5] Fixed test suite for new float64 dtype

Modified _choose_float_dtype
+ Returns float32 if inputs are float16 or float32
+ Returns float64 if inputs are int
---
 xarray/tests/test_coding.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 3af43f78e38..4f949683a99 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -96,7 +96,7 @@ def test_coder_roundtrip() -> None:
     assert_identical(original, roundtripped)
 
 
-@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split())
+@pytest.mark.parametrize("dtype", "f2 f4".split())
 def test_scaling_converts_to_float32(dtype) -> None:
     original = xr.Variable(
         ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10)
@@ -109,6 +109,19 @@ def test_scaling_converts_to_float32(dtype) -> None:
     assert roundtripped.dtype == np.float32
 
 
+@pytest.mark.parametrize("dtype", "u1 u2 i1 i2".split())
+def test_scaling_converts_to_float64(dtype) -> None:
+    original = xr.Variable(
+        ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10)
+    )
+    coder = variables.CFScaleOffsetCoder()
+    encoded = coder.encode(original)
+    assert encoded.dtype == np.float64
+    roundtripped = coder.decode(encoded)
+    assert_identical(original, roundtripped)
+    assert roundtripped.dtype == np.float64
+
+
 @pytest.mark.parametrize("scale_factor", (10, [10]))
 @pytest.mark.parametrize("add_offset", (0.1, [0.1]))
 def test_scaling_offset_as_list(scale_factor, add_offset) -> None:

From 312acdac56062771ec4892a09df3723ddec3ddb6 Mon Sep 17 00:00:00 2001
From: "Kenneth D. Mankoff" <mankoff@gmail.com>
Date: Fri, 29 Jul 2022 09:10:41 -0700
Subject: [PATCH 4/5] Undo 2a5686c per correction from @dcherian

https://github.com/pydata/xarray/pull/6812#pullrequestreview-1048437791
---
 xarray/coding/variables.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index ad759b9c87c..058359e94a5 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -236,8 +236,10 @@ def _choose_float_dtype(dtype, has_offset):
         # but a large integer offset could lead to loss of precision.
         # Sensitivity analysis can be tricky, so we just use a float64
         # if there's any offset at all - better unoptimised than wrong!
-        if not has_offset:
+        if has_offset:
             return np.float64
+        else:
+            return np.float32
     # For all other types and circumstances, we just use float64.
     # (safe because eg. complex numbers are not supported in NetCDF)
     return np.float64

From 46157205a62b363d7d18ecc0d633c943aa6cbd1c Mon Sep 17 00:00:00 2001
From: "Kenneth D. Mankoff" <mankoff@gmail.com>
Date: Fri, 29 Jul 2022 10:12:21 -0700
Subject: [PATCH 5/5] Rewrite per CF standards

https://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html

Split encoding and decoding for now.
---
 xarray/coding/variables.py  | 80 ++++++++++++++++++++++++++++++++-----
 xarray/tests/test_coding.py | 26 ++++--------
 2 files changed, 78 insertions(+), 28 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 058359e94a5..d1f7431f2f0 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -224,7 +224,7 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
     return data
 
 
-def _choose_float_dtype(dtype, has_offset):
+def _choose_float_dtype_encoding(dtype, has_offset):
     """Return a float dtype that can losslessly represent `dtype` values."""
     # Keep float32 as-is.  Upcast half-precision to single-precision,
     # because float16 is "intended for storage but not computation"
@@ -245,6 +245,67 @@ def _choose_float_dtype(dtype, has_offset):
     return np.float64
 
 
+def _choose_float_dtype_decoding(dtype, scale_factor, add_offset):
+    """Return a dtype per CF convention standards."""
+
+    # From CF standard:
+    # If the scale_factor and add_offset attributes are of the same data type as the
+    # associated variable, the unpacked data is assumed to be of the same data type as
+    # the packed data.
+    if (dtype == type(scale_factor) if scale_factor is not None else dtype) and (
+        dtype == (type(add_offset) if add_offset is not None else dtype)
+    ):
+        ret_dtype = dtype
+
+    # If here, one or both of scale_factor and add_offset exist, and do not match dtype
+
+    # From CF standard:
+    # However, if the scale_factor and add_offset attributes are of a different data
+    # type from the variable (containing the packed data) then the unpacked data should
+    # match the type of these attributes, which must both be of type float or both be of
+    # type double.
+
+    # Check that scale_factor and add_offset are float or double and the same.
+    if (scale_factor is not None) and (add_offset is not None):
+        if not np.issubdtype(type(scale_factor), np.double):
+            print("Warning - non-compliant CF file: scale_factor not float or double")
+        if not np.issubdtype(type(add_offset), np.double):
+            print("Warning - non-compliant CF file: add_offset not float or double")
+        if type(scale_factor) != type(add_offset):
+            print(
+                "Warning - non-compliant CF file: scale_factor and add_offset are different type"
+            )
+
+    # From CF standard:
+    # An additional restriction in this case is that the variable containing the packed
+    # data must be of type byte, short or int. It is not advised to unpack an int into a
+    # float as there is a potential precision loss.
+
+    # Check that dtype is byte, short, or int.
+    if not np.issubdtype(dtype, np.integer):
+        print("Warning - non-compliant CF file: dtype is not byte, short, or int")
+
+    if scale_factor is None:
+        ret_dtype = np.find_common_type([dtype, type(add_offset)], [])
+    elif add_offset is None:
+        ret_dtype = np.find_common_type([dtype, type(scale_factor)], [])
+    else:
+        ret_dtype = np.find_common_type(
+            [
+                dtype,
+                type(scale_factor) if scale_factor is not None else None,
+                type(add_offset) if add_offset is not None else None,
+            ],
+            [],
+        )
+
+    # Upcast half-precision to single-precision, because float16 is "intended for
+    # storage but not computation"
+    if ret_dtype.itemsize <= 4 and np.issubdtype(ret_dtype, np.floating):
+        ret_dtype = np.float32
+    return ret_dtype
+
+
 class CFScaleOffsetCoder(VariableCoder):
     """Scale and offset variables according to CF conventions.
 
@@ -255,13 +316,14 @@ class CFScaleOffsetCoder(VariableCoder):
     def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
-        if "scale_factor" in encoding or "add_offset" in encoding:
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
-            data = data.astype(dtype=dtype, copy=True)
-        if "add_offset" in encoding:
-            data -= pop_to(encoding, attrs, "add_offset", name=name)
-        if "scale_factor" in encoding:
-            data /= pop_to(encoding, attrs, "scale_factor", name=name)
+        scale_factor = pop_to(encoding, attrs, "scale_factor", name=name)
+        add_offset = pop_to(encoding, attrs, "add_offset", name=name)
+        dtype = _choose_float_dtype_encoding(data.dtype, add_offset in attrs)
+        data = data.astype(dtype=dtype, copy=True)
+        if add_offset is not None:
+            data = data - add_offset
+        if scale_factor is not None:
+            data = data / scale_factor
 
         return Variable(dims, data, attrs, encoding)
 
@@ -271,7 +333,7 @@ def decode(self, variable, name=None):
         if "scale_factor" in attrs or "add_offset" in attrs:
             scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
             add_offset = pop_to(attrs, encoding, "add_offset", name=name)
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
+            dtype = _choose_float_dtype_decoding(data.dtype, scale_factor, add_offset)
             if np.ndim(scale_factor) > 0:
                 scale_factor = np.asarray(scale_factor).item()
             if np.ndim(add_offset) > 0:
diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 4f949683a99..24b24dbb24b 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -48,7 +48,6 @@ def test_CFMaskCoder_decode() -> None:
 def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None:
     original = xr.Variable(("x",), data, encoding=encoding)
     encoded = encode_cf_variable(original)
-
     assert encoded.dtype == encoded.attrs["missing_value"].dtype
     assert encoded.dtype == encoded.attrs["_FillValue"].dtype
 
@@ -96,34 +95,23 @@ def test_coder_roundtrip() -> None:
     assert_identical(original, roundtripped)
 
 
-@pytest.mark.parametrize("dtype", "f2 f4".split())
-def test_scaling_converts_to_float32(dtype) -> None:
-    original = xr.Variable(
-        ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10)
-    )
-    coder = variables.CFScaleOffsetCoder()
-    encoded = coder.encode(original)
-    assert encoded.dtype == np.float32
-    roundtripped = coder.decode(encoded)
-    assert_identical(original, roundtripped)
-    assert roundtripped.dtype == np.float32
-
-
-@pytest.mark.parametrize("dtype", "u1 u2 i1 i2".split())
+@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split())
 def test_scaling_converts_to_float64(dtype) -> None:
     original = xr.Variable(
         ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10)
     )
     coder = variables.CFScaleOffsetCoder()
     encoded = coder.encode(original)
-    assert encoded.dtype == np.float64
+    if dtype in ["f2", "f4"]:
+        assert encoded.dtype == np.float32
+    if dtype in ["u1", "u2", "i1", "i2"]:
+        assert encoded.dtype == np.float64
     roundtripped = coder.decode(encoded)
-    assert_identical(original, roundtripped)
     assert roundtripped.dtype == np.float64
 
 
-@pytest.mark.parametrize("scale_factor", (10, [10]))
-@pytest.mark.parametrize("add_offset", (0.1, [0.1]))
+@pytest.mark.parametrize("scale_factor", (10, 1))
+@pytest.mark.parametrize("add_offset", (0.1, 1.0))
 def test_scaling_offset_as_list(scale_factor, add_offset) -> None:
     # test for #4631
     encoding = dict(scale_factor=scale_factor, add_offset=add_offset)