Skip to content

Commit e3e6db5

Browse files
deeplycloudy authored and shoyer committed
Automate interpretation of _Unsigned attribute (#1453)
* Add support for _Unsigned attribute * Update docstrings with new unsigned behavior * Cast to unsigned with copy instead of view, fixing infinite recursion. Move _Unsigned between attributes and encoding * Fix default argument for is_unsigned * Separate test for unsigned roundtrip * Move unsigned support out of mask_and_scale, update whats-new * Fix what's new date and add issue * Putting enhancement in correct section of whats-new * Turn off _FillValue support provided by PyNIO. Let xarray handle it. * Convert _FillValue when _Unsigned is present * PEP8 * No need to convert unsigned fill value if there is not fill value * PEP8 * yet more PEP8 * Be more careful with _Unsigned attribute check * Better fencing for attribute checks. Style fixes. Test for dtypes.
1 parent dbf9307 commit e3e6db5

File tree

5 files changed

+132
-7
lines changed

5 files changed

+132
-7
lines changed

doc/whats-new.rst

+4
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ Enhancements
2424
- More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
2525
raster files are opened with :py:func:`~xarray.open_rasterio`.
2626
By `Greg Brener <https://github.com/gbrener>`_
27+
- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a
28+
signed integer data type should be interpreted as unsigned bytes
29+
(:issue:`1444`).
30+
By `Eric Bruning <https://github.com/deeplycloudy>`_.
2731

2832
- Speed-up (x 100) of :py:func:`~xarray.conventions.decode_cf_datetime`.
2933
By `Christian Chwala <https://github.com/cchwala>`_.

xarray/backends/pynio_.py

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ def __init__(self, filename, mode='r', autoclose=False):
4242
import Nio
4343
opener = functools.partial(Nio.open_file, filename, mode=mode)
4444
self.ds = opener()
45+
# xarray provides its own support for FillValue,
46+
# so turn off PyNIO's support for the same.
47+
self.ds.set_option('MaskedArrayMode', 'MaskedNever')
4548
self._autoclose = autoclose
4649
self._isopen = True
4750
self._opener = opener

xarray/conventions.py

+57-4
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,34 @@ def __getitem__(self, key):
552552
return np.asarray(self.array[key], dtype=self.dtype)
553553

554554

555+
class UnsignedIntTypeArray(utils.NDArrayMixin):
556+
"""Decode arrays on the fly from signed integer to unsigned
557+
integer. Typically used when _Unsigned is set as a netCDF
558+
attribute on a signed integer variable.
559+
560+
>>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
561+
562+
>>> sb.dtype
563+
dtype('int8')
564+
565+
>>> UnsignedIntTypeArray(sb).dtype
566+
dtype('uint8')
567+
568+
>>> UnsignedIntTypeArray(sb)[:]
569+
array([ 0, 1, 127, 128, 255], dtype=uint8)
570+
"""
571+
def __init__(self, array):
572+
self.array = array
573+
self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize)
574+
575+
@property
576+
def dtype(self):
577+
return self.unsigned_dtype
578+
579+
def __getitem__(self, key):
580+
return np.asarray(self.array[key], dtype=self.dtype)
581+
582+
555583
def string_to_char(arr):
556584
"""Like netCDF4.stringtochar, but faster and more flexible.
557585
"""
@@ -655,6 +683,14 @@ def maybe_encode_dtype(var, name=None):
655683
'any _FillValue to use for NaNs' % name,
656684
RuntimeWarning, stacklevel=3)
657685
data = duck_array_ops.around(data)[...]
686+
if encoding.get('_Unsigned', False):
687+
signed_dtype = 'i%s' % dtype.itemsize
688+
if '_FillValue' in var.attrs:
689+
old_fill = np.asarray(attrs['_FillValue'])
690+
new_fill = old_fill.astype(signed_dtype)
691+
attrs['_FillValue'] = new_fill
692+
data = data.astype(signed_dtype)
693+
pop_to(encoding, attrs, '_Unsigned')
658694
if dtype == 'S1' and data.dtype != 'S1':
659695
data = string_to_char(np.asarray(data, 'S'))
660696
dims = dims + ('string%s' % data.shape[-1],)
@@ -779,7 +815,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
779815
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
780816
mask_and_scale: bool
781817
Lazily scale (using scale_factor and add_offset) and mask
782-
(using _FillValue).
818+
(using _FillValue). If the _Unsigned attribute is present,
819+
treat integer arrays as unsigned.
783820
decode_times : bool
784821
Decode cf times ('hours since 2000-01-01') to np.datetime64.
785822
decode_endianness : bool
@@ -804,6 +841,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
804841
dimensions = dimensions[:-1]
805842
data = CharToStringArray(data)
806843

844+
pop_to(attributes, encoding, '_Unsigned')
845+
is_unsigned = encoding.get('_Unsigned', False)
846+
if is_unsigned and mask_and_scale:
847+
if data.dtype.kind == 'i':
848+
data = UnsignedIntTypeArray(data)
849+
else:
850+
warnings.warn("variable has _Unsigned attribute but is not "
851+
"of integer type. Ignoring attribute.",
852+
RuntimeWarning, stacklevel=3)
853+
807854
if mask_and_scale:
808855
if 'missing_value' in attributes:
809856
# missing_value is deprecated, but we still want to support it as
@@ -818,20 +865,26 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
818865
"and decoding explicitly using "
819866
"xarray.conventions.decode_cf(ds)")
820867
attributes['_FillValue'] = attributes.pop('missing_value')
821-
822868
fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
823869
if fill_value.size > 1:
824870
warnings.warn("variable has multiple fill values {0}, decoding "
825871
"all values to NaN.".format(str(fill_value)),
826872
RuntimeWarning, stacklevel=3)
827873
scale_factor = pop_to(attributes, encoding, 'scale_factor')
828874
add_offset = pop_to(attributes, encoding, 'add_offset')
829-
if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
830-
scale_factor is not None or add_offset is not None):
875+
has_fill = (fill_value is not None and
876+
not np.any(pd.isnull(fill_value)))
877+
if (has_fill or scale_factor is not None or add_offset is not None):
831878
if fill_value.dtype.kind in ['U', 'S']:
832879
dtype = object
833880
else:
834881
dtype = float
882+
# According to the CF spec, the fill value is of the same
883+
# type as its variable, i.e. its storage format on disk.
884+
# This handles the case where the fill_value also needs to be
885+
# converted to its unsigned value.
886+
if has_fill:
887+
fill_value = np.asarray(fill_value, dtype=data.dtype)
835888
data = MaskedAndScaledArray(data, fill_value, scale_factor,
836889
add_offset, dtype)
837890

xarray/tests/test_backends.py

+59-3
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,23 @@ def create_encoded_masked_and_scaled_data():
6262
return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)})
6363

6464

65+
def create_unsigned_masked_scaled_data():
66+
encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1',
67+
'add_offset': 10, 'scale_factor': np.float32(0.1)}
68+
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
69+
return Dataset({'x': ('t', x, {}, encoding)})
70+
71+
72+
def create_encoded_unsigned_masked_scaled_data():
73+
# These are values as written to the file: the _FillValue will
74+
# be represented in the signed form.
75+
attributes = {'_FillValue': -1, '_Unsigned': 'true',
76+
'add_offset': 10, 'scale_factor': np.float32(0.1)}
77+
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
78+
sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
79+
return Dataset({'x': ('t', sb, attributes)})
80+
81+
6582
def create_boolean_data():
6683
attributes = {'units': '-'}
6784
return Dataset({'x': ('t', [True, False, False, True], attributes)})
@@ -360,24 +377,63 @@ def test_roundtrip_strings_with_fill_value(self):
360377
with self.roundtrip(original) as actual:
361378
self.assertDatasetIdentical(expected, actual)
362379

380+
def test_unsigned_roundtrip_mask_and_scale(self):
381+
decoded = create_unsigned_masked_scaled_data()
382+
encoded = create_encoded_unsigned_masked_scaled_data()
383+
with self.roundtrip(decoded) as actual:
384+
for k in decoded.variables:
385+
self.assertEqual(decoded.variables[k].dtype,
386+
actual.variables[k].dtype)
387+
self.assertDatasetAllClose(decoded, actual)
388+
with self.roundtrip(decoded,
389+
open_kwargs=dict(decode_cf=False)) as actual:
390+
for k in encoded.variables:
391+
self.assertEqual(encoded.variables[k].dtype,
392+
actual.variables[k].dtype)
393+
self.assertDatasetAllClose(encoded, actual)
394+
with self.roundtrip(encoded,
395+
open_kwargs=dict(decode_cf=False)) as actual:
396+
for k in encoded.variables:
397+
self.assertEqual(encoded.variables[k].dtype,
398+
actual.variables[k].dtype)
399+
self.assertDatasetAllClose(encoded, actual)
400+
# make sure roundtrip encoding didn't change the
401+
# original dataset.
402+
self.assertDatasetIdentical(
403+
encoded, create_encoded_unsigned_masked_scaled_data())
404+
with self.roundtrip(encoded) as actual:
405+
for k in decoded.variables:
406+
self.assertEqual(decoded.variables[k].dtype,
407+
actual.variables[k].dtype)
408+
self.assertDatasetAllClose(decoded, actual)
409+
with self.roundtrip(encoded,
410+
open_kwargs=dict(decode_cf=False)) as actual:
411+
for k in encoded.variables:
412+
self.assertEqual(encoded.variables[k].dtype,
413+
actual.variables[k].dtype)
414+
self.assertDatasetAllClose(encoded, actual)
415+
363416
def test_roundtrip_mask_and_scale(self):
364417
decoded = create_masked_and_scaled_data()
365418
encoded = create_encoded_masked_and_scaled_data()
366419
with self.roundtrip(decoded) as actual:
367420
self.assertDatasetAllClose(decoded, actual)
368-
with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
421+
with self.roundtrip(decoded,
422+
open_kwargs=dict(decode_cf=False)) as actual:
369423
# TODO: this assumes that all roundtrips will first
370424
# encode. Is that something we want to test for?
371425
self.assertDatasetAllClose(encoded, actual)
372-
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
426+
with self.roundtrip(encoded,
427+
open_kwargs=dict(decode_cf=False)) as actual:
373428
self.assertDatasetAllClose(encoded, actual)
374429
# make sure roundtrip encoding didn't change the
375430
# original dataset.
376431
self.assertDatasetIdentical(encoded,
377432
create_encoded_masked_and_scaled_data())
378433
with self.roundtrip(encoded) as actual:
379434
self.assertDatasetAllClose(decoded, actual)
380-
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
435+
with self.roundtrip(encoded,
436+
open_kwargs=dict(decode_cf=False)) as actual:
381437
self.assertDatasetAllClose(encoded, actual)
382438

383439
def test_coordinates_encoding(self):

xarray/tests/test_conventions.py

+9
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,15 @@ def test_string_to_char(self):
108108
self.assertArrayEqual(actual, expected)
109109

110110

111+
class TestUnsignedIntTypeArray(TestCase):
112+
def test_unsignedinttype_array(self):
113+
sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
114+
ub = conventions.UnsignedIntTypeArray(sb)
115+
self.assertEqual(ub.dtype, np.dtype('u1'))
116+
self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255],
117+
dtype=np.dtype('u1')))
118+
119+
111120
class TestBoolTypeArray(TestCase):
112121
def test_booltype_array(self):
113122
x = np.array([1, 0, 1, 1, 0], dtype='i1')

0 commit comments

Comments
 (0)