Automate interpretation of _Unsigned attribute (#1453)

deeplycloudy · shoyer · commit e3e6db5b1bf9 · 2017-07-28T10:39:04.000-07:00
* Add support for _Unsigned attribute

* Update docstrings with new unsigned behavior

* Cast to unsigned with copy instead of view, fixing infinite recursion. Move _Unsigned between attributes and encoding

* Fix default argument for is_unsigned

* Separate test for unsigned roundtrip

* Move unsigned support out of mask_and_scale, update whats-new

* Fix what's new date and add issue

* Putting enhancement in correct section of whats-new

* Turn off _FillValue support provided by PyNIO. Let xarray handle it.

* Convert _FillValue when _Unsigned is present

* PEP8

* No need to convert unsigned fill value if there is no fill value

* PEP8

* yet more PEP8

* Be more careful with _Unsigned attribute check

* Better fencing for attribute checks. Style fixes. Test for dtypes.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -24,6 +24,10 @@ Enhancements
 - More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
   raster files are opened with :py:func:`~xarray.open_rasterio`.
   By `Greg Brener <https://github.com/gbrener>`_
+- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that
+  a signed integer data type should be interpreted as unsigned bytes
+  (:issue:`1444`).
+  By `Eric Bruning <https://github.com/deeplycloudy>`_.
 
 - Speed-up (x 100) of :py:func:`~xarray.conventions.decode_cf_datetime`.
   By `Christian Chwala <https://github.com/cchwala>`_.
diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py
@@ -42,6 +42,9 @@ def __init__(self, filename, mode='r', autoclose=False):
         import Nio
         opener = functools.partial(Nio.open_file, filename, mode=mode)
         self.ds = opener()
+        # xarray provides its own support for _FillValue,
+        # so turn off PyNIO's support for the same.
+        self.ds.set_option('MaskedArrayMode', 'MaskedNever')
         self._autoclose = autoclose
         self._isopen = True
         self._opener = opener
diff --git a/xarray/conventions.py b/xarray/conventions.py
@@ -552,6 +552,34 @@ def __getitem__(self, key):
         return np.asarray(self.array[key], dtype=self.dtype)
 
 
+class UnsignedIntTypeArray(utils.NDArrayMixin):
+    """Decode arrays on the fly from signed integer to unsigned
+    integer. Typically used when _Unsigned is set as a netCDF
+    attribute on a signed integer variable.
+
+    >>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
+
+    >>> sb.dtype
+    dtype('int8')
+
+    >>> UnsignedIntTypeArray(sb).dtype
+    dtype('uint8')
+
+    >>> UnsignedIntTypeArray(sb)[:]
+    array([  0,   1, 127, 128, 255], dtype=uint8)
+    """
+    def __init__(self, array):
+        self.array = array
+        self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize)
+
+    @property
+    def dtype(self):
+        return self.unsigned_dtype
+
+    def __getitem__(self, key):
+        return np.asarray(self.array[key], dtype=self.dtype)
+
+
 def string_to_char(arr):
     """Like netCDF4.stringtochar, but faster and more flexible.
     """
@@ -655,6 +683,14 @@ def maybe_encode_dtype(var, name=None):
                                   'any _FillValue to use for NaNs' % name,
                                   RuntimeWarning, stacklevel=3)
                 data = duck_array_ops.around(data)[...]
+                if encoding.get('_Unsigned', False):
+                    signed_dtype = 'i%s' % dtype.itemsize
+                    if '_FillValue' in var.attrs:
+                        old_fill = np.asarray(attrs['_FillValue'])
+                        new_fill = old_fill.astype(signed_dtype)
+                        attrs['_FillValue'] = new_fill
+                    data = data.astype(signed_dtype)
+                    pop_to(encoding, attrs, '_Unsigned')
             if dtype == 'S1' and data.dtype != 'S1':
                 data = string_to_char(np.asarray(data, 'S'))
                 dims = dims + ('string%s' % data.shape[-1],)
@@ -779,7 +815,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
         example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
     mask_and_scale: bool
         Lazily scale (using scale_factor and add_offset) and mask
-        (using _FillValue).
+        (using _FillValue). If the _Unsigned attribute is present
+        treat integer arrays as unsigned.
     decode_times : bool
         Decode cf times ('hours since 2000-01-01') to np.datetime64.
     decode_endianness : bool
@@ -804,6 +841,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
             dimensions = dimensions[:-1]
             data = CharToStringArray(data)
 
+    pop_to(attributes, encoding, '_Unsigned')
+    is_unsigned = encoding.get('_Unsigned', False)
+    if is_unsigned and mask_and_scale:
+        if data.dtype.kind == 'i':
+            data = UnsignedIntTypeArray(data)
+        else:
+            warnings.warn("variable has _Unsigned attribute but is not "
+                          "of integer type. Ignoring attribute.",
+                          RuntimeWarning, stacklevel=3)
+
     if mask_and_scale:
         if 'missing_value' in attributes:
             # missing_value is deprecated, but we still want to support it as
@@ -818,20 +865,26 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
                                  "and decoding explicitly using "
                                  "xarray.conventions.decode_cf(ds)")
             attributes['_FillValue'] = attributes.pop('missing_value')
-
         fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
         if fill_value.size > 1:
             warnings.warn("variable has multiple fill values {0}, decoding "
                           "all values to NaN.".format(str(fill_value)),
                           RuntimeWarning, stacklevel=3)
         scale_factor = pop_to(attributes, encoding, 'scale_factor')
         add_offset = pop_to(attributes, encoding, 'add_offset')
-        if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
-                scale_factor is not None or add_offset is not None):
+        has_fill = (fill_value is not None and
+                    not np.any(pd.isnull(fill_value)))
+        if (has_fill or scale_factor is not None or add_offset is not None):
             if fill_value.dtype.kind in ['U', 'S']:
                 dtype = object
             else:
                 dtype = float
+            # According to the CF spec, the fill value is of the same
+            # type as its variable, i.e. its storage format on disk.
+            # This handles the case where the fill_value also needs to be
+            # converted to its unsigned value.
+            if has_fill:
+                fill_value = np.asarray(fill_value, dtype=data.dtype)
             data = MaskedAndScaledArray(data, fill_value, scale_factor,
                                         add_offset, dtype)
 
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -62,6 +62,23 @@ def create_encoded_masked_and_scaled_data():
     return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)})
 
 
+def create_unsigned_masked_scaled_data():
+    encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1',
+                'add_offset': 10, 'scale_factor': np.float32(0.1)}
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
+    return Dataset({'x': ('t', x, {}, encoding)})
+
+
+def create_encoded_unsigned_masked_scaled_data():
+    # These are values as written to the file: the _FillValue will
+    # be represented in the signed form.
+    attributes = {'_FillValue': -1, '_Unsigned': 'true',
+                  'add_offset': 10, 'scale_factor': np.float32(0.1)}
+    # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
+    sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
+    return Dataset({'x': ('t', sb, attributes)})
+
+
 def create_boolean_data():
     attributes = {'units': '-'}
     return Dataset({'x': ('t', [True, False, False, True], attributes)})
@@ -360,24 +377,63 @@ def test_roundtrip_strings_with_fill_value(self):
         with self.roundtrip(original) as actual:
             self.assertDatasetIdentical(expected, actual)
 
+    def test_unsigned_roundtrip_mask_and_scale(self):
+        decoded = create_unsigned_masked_scaled_data()
+        encoded = create_encoded_unsigned_masked_scaled_data()
+        with self.roundtrip(decoded) as actual:
+            for k in decoded.variables:
+                self.assertEqual(decoded.variables[k].dtype,
+                                 actual.variables[k].dtype)
+            self.assertDatasetAllClose(decoded, actual)
+        with self.roundtrip(decoded,
+                            open_kwargs=dict(decode_cf=False)) as actual:
+            for k in encoded.variables:
+                self.assertEqual(encoded.variables[k].dtype,
+                                 actual.variables[k].dtype)
+            self.assertDatasetAllClose(encoded, actual)
+        with self.roundtrip(encoded,
+                            open_kwargs=dict(decode_cf=False)) as actual:
+            for k in encoded.variables:
+                self.assertEqual(encoded.variables[k].dtype,
+                                 actual.variables[k].dtype)
+            self.assertDatasetAllClose(encoded, actual)
+        # make sure roundtrip encoding didn't change the
+        # original dataset.
+        self.assertDatasetIdentical(
+            encoded, create_encoded_unsigned_masked_scaled_data())
+        with self.roundtrip(encoded) as actual:
+            for k in decoded.variables:
+                self.assertEqual(decoded.variables[k].dtype,
+                                 actual.variables[k].dtype)
+            self.assertDatasetAllClose(decoded, actual)
+        with self.roundtrip(encoded,
+                            open_kwargs=dict(decode_cf=False)) as actual:
+            for k in encoded.variables:
+                self.assertEqual(encoded.variables[k].dtype,
+                                 actual.variables[k].dtype)
+            self.assertDatasetAllClose(encoded, actual)
+
     def test_roundtrip_mask_and_scale(self):
         decoded = create_masked_and_scaled_data()
         encoded = create_encoded_masked_and_scaled_data()
         with self.roundtrip(decoded) as actual:
             self.assertDatasetAllClose(decoded, actual)
-        with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
+        with self.roundtrip(decoded,
+                            open_kwargs=dict(decode_cf=False)) as actual:
             # TODO: this assumes that all roundtrips will first
             # encode.  Is that something we want to test for?
             self.assertDatasetAllClose(encoded, actual)
-        with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
+        with self.roundtrip(encoded,
+                            open_kwargs=dict(decode_cf=False)) as actual:
             self.assertDatasetAllClose(encoded, actual)
         # make sure roundtrip encoding didn't change the
         # original dataset.
         self.assertDatasetIdentical(encoded,
                                     create_encoded_masked_and_scaled_data())
         with self.roundtrip(encoded) as actual:
             self.assertDatasetAllClose(decoded, actual)
-        with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
+        with self.roundtrip(encoded,
+                            open_kwargs=dict(decode_cf=False)) as actual:
             self.assertDatasetAllClose(encoded, actual)
 
     def test_coordinates_encoding(self):
diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py
@@ -108,6 +108,15 @@ def test_string_to_char(self):
         self.assertArrayEqual(actual, expected)
 
 
+class TestUnsignedIntTypeArray(TestCase):
+    def test_unsignedinttype_array(self):
+        sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
+        ub = conventions.UnsignedIntTypeArray(sb)
+        self.assertEqual(ub.dtype, np.dtype('u1'))
+        self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255],
+                                           dtype=np.dtype('u1')))
+
+
 class TestBoolTypeArray(TestCase):
     def test_booltype_array(self):
         x = np.array([1, 0, 1, 1, 0], dtype='i1')