Merge pull request #562 from pauldmccarthy/indexed_gzip_usage

effigies · web-flow · commit 8c1d0bc4bd9c · 2017-10-06T15:13:09.000-04:00
Only use `indexed_gzip` when explicitly requested
diff --git a/nibabel/arrayproxy.py b/nibabel/arrayproxy.py
@@ -265,10 +265,10 @@ def _get_fileobj(self):
         """
         if self._keep_file_open:
             if not hasattr(self, '_opener'):
-                self._opener = ImageOpener(self.file_like)
+                self._opener = ImageOpener(self.file_like, keep_open=True)
             yield self._opener
         else:
-            with ImageOpener(self.file_like) as opener:
+            with ImageOpener(self.file_like, keep_open=False) as opener:
                 yield opener
 
     def get_unscaled(self):
diff --git a/nibabel/benchmarks/bench_array_to_file.py b/nibabel/benchmarks/bench_array_to_file.py
@@ -19,11 +19,12 @@
 
 import numpy as np
 
-
 from .butils import print_git_title
 
 from numpy.testing import measure
 
+from nibabel.volumeutils import array_to_file  # NOQA
+
 
 def bench_array_to_file():
     rng = np.random.RandomState(20111001)
diff --git a/nibabel/benchmarks/bench_arrayproxy_slicing.py b/nibabel/benchmarks/bench_arrayproxy_slicing.py
@@ -0,0 +1,202 @@
+"""Benchmarks for ArrayProxy slicing of gzipped and non-gzipped files
+
+Run benchmarks with::
+
+    import nibabel as nib
+    nib.bench()
+
+If you have doctests enabled by default in nose (with a noserc file or
+environment variable), and you have a numpy version <= 1.6.1, this will also
+run the doctests, let's hope they pass.
+
+Run this benchmark with::
+
+    nosetests -s --match '(?:^|[\\b_\\.//-])[Bb]ench' /path/to/bench_arrayproxy_slicing.py
+"""
+
+from timeit import timeit
+import contextlib
+import gc
+import itertools as it
+import numpy as np
+import mock
+
+import nibabel as nib
+from nibabel.tmpdirs import InTemporaryDirectory
+from nibabel.openers import HAVE_INDEXED_GZIP
+
+from .butils import print_git_title
+from ..rstutils import rst_table
+
+# if memory_profiler is installed, we get memory usage results
+try:
+    from memory_profiler import memory_usage
+except ImportError:
+    memory_usage = None
+
+
+# Each test involves loading an image of shape SHAPE, and then slicing it
+# NITERS times
+NITERS = 50
+SHAPE = (100, 100, 100, 100)
+
+# One test is run per combination of SLICEOBJS, KEEP_OPENS, and HAVE_IGZIP,
+# except indexed_gzip without keep_open (dropped in bench_arrayproxy_slicing).
+# In each SLICEOBJS entry, ':' gets replaced with slice(None),
+# '?' gets replaced with a random index into the relevant axis, and
+# numbers (assumed to be between 0 and 1) get scaled to the axis shape
+SLICEOBJS = [
+    ('?', ':', ':', ':'),
+    (':', ':', ':', '?'),
+    ('?', '?', '?', ':'),
+]
+
+KEEP_OPENS = [False, True]
+# only benchmark indexed_gzip if nibabel.openers reports it as available
+if HAVE_INDEXED_GZIP:
+    HAVE_IGZIP = [False, True]
+else:
+    HAVE_IGZIP = [False]
+
+
+@contextlib.contextmanager
+def patch_indexed_gzip(have_igzip):
+    """Temporarily set both HAVE_INDEXED_GZIP flags to ``have_igzip``."""
+    openers_flag = 'nibabel.openers.HAVE_INDEXED_GZIP'
+    proxy_flag = 'nibabel.arrayproxy.HAVE_INDEXED_GZIP'
+    with mock.patch(openers_flag, have_igzip):
+        with mock.patch(proxy_flag, have_igzip):
+            yield
+
+
+def bench_arrayproxy_slicing():
+    """Benchmark ArrayProxy slicing of gzipped vs. uncompressed images."""
+    print_git_title('\nArrayProxy gzip slicing')
+
+    # each test is a tuple containing
+    # (HAVE_INDEXED_GZIP, keep_file_open, sliceobj)
+    tests = list(it.product(HAVE_IGZIP, KEEP_OPENS, SLICEOBJS))
+
+    # remove tests where HAVE_INDEXED_GZIP is True and keep_file_open is False,
+    # because if keep_file_open is False, HAVE_INDEXED_GZIP has no effect
+    tests = [t for t in tests if not (t[0] and not t[1])]
+
+    testfile = 'testfile.nii'
+    testfilegz = 'test.nii.gz'
+
+    def get_test_label(test):
+        """Label a test 'indexed_gzip' or 'gzip' from its first two flags."""
+        have_igzip, keep_open = test[:2]
+        return 'indexed_gzip' if have_igzip and keep_open else 'gzip'
+
+    def fix_sliceobj(sliceobj):
+        """Resolve ':' / '?' / fractional entries into a concrete index."""
+        new_sliceobj = []
+        for i, s in enumerate(sliceobj):
+            if s == ':':
+                new_sliceobj.append(slice(None))
+            elif s == '?':
+                new_sliceobj.append(np.random.randint(0, SHAPE[i]))
+            else:
+                new_sliceobj.append(int(s * SHAPE[i]))
+        return tuple(new_sliceobj)
+
+    def fmt_sliceobj(sliceobj):
+        """Format a symbolic slice object for display, e.g. '[?, :, :, :]'."""
+        slcstr = []
+        for i, s in enumerate(sliceobj):
+            # not ``s in ':?'``: that raises TypeError for numeric entries
+            if s in (':', '?'):
+                slcstr.append(s)
+            else:
+                slcstr.append(str(int(s * SHAPE[i])))
+        return '[{}]'.format(', '.join(slcstr))
+
+    with InTemporaryDirectory():
+
+        print('Generating test data... ({} MB)'.format(
+            int(round(np.prod(SHAPE) * 4 / 1048576.))))
+
+        data = np.array(np.random.random(SHAPE), dtype=np.float32)
+
+        # zero out ~90% of voxels so gzip has something to compress
+        mask = np.random.random(SHAPE[:3]) > 0.1
+        if len(SHAPE) > 3:
+            data[mask, :] = 0
+        else:
+            data[mask] = 0
+
+        # save uncompressed and compressed versions of the image
+        img = nib.nifti1.Nifti1Image(data, np.eye(4))
+        nib.save(img, testfilegz)
+        nib.save(img, testfile)
+
+        # each result is a tuple containing
+        # (label, keep_open, sliceobj, testtime, basetime, testmem, basemem)
+        #
+        # where "basetime" is the time taken to load and slice a memmapped
+        # (uncompressed) image, and "basemem" is memory usage for the same
+        results = []
+
+        # We use the same random seed for each slice object
+        seeds = [np.random.randint(0, 2 ** 32) for s in SLICEOBJS]
+
+        for ti, test in enumerate(tests):
+
+            label = get_test_label(test)
+            have_igzip, keep_open, sliceobj = test
+            seed = seeds[SLICEOBJS.index(sliceobj)]
+
+            print('Running test {} of {} ({})...'.format(
+                ti + 1, len(tests), label))
+
+            # load uncompressed and compressed versions of the image
+            img = nib.load(testfile, keep_file_open=keep_open)
+
+            with patch_indexed_gzip(have_igzip):
+                imggz = nib.load(testfilegz, keep_file_open=keep_open)
+
+            def basefunc():
+                img.dataobj[fix_sliceobj(sliceobj)]
+
+            def testfunc():
+                with patch_indexed_gzip(have_igzip):
+                    imggz.dataobj[fix_sliceobj(sliceobj)]
+
+            # make sure nothing is floating around from the previous test
+            # iteration, so memory profiling is (hopefully) more accurate
+            gc.collect()
+
+            if memory_usage is not None:
+                membaseline = max(memory_usage(lambda: None))
+                testmem = max(memory_usage(testfunc)) - membaseline
+                basemem = max(memory_usage(basefunc)) - membaseline
+            else:
+                testmem = np.nan
+                basemem = np.nan
+
+            # reset the random number generator, so test and baseline use the
+            # same slices
+            np.random.seed(seed)
+            testtime = float(timeit(testfunc, number=NITERS)) / float(NITERS)
+            np.random.seed(seed)
+            basetime = float(timeit(basefunc, number=NITERS)) / float(NITERS)
+
+            results.append((label, keep_open, sliceobj, testtime, basetime,
+                            testmem, basemem))
+
+    data = np.zeros((len(results), 4))
+    data[:, 0] = [r[3] for r in results]
+    data[:, 1] = [r[4] for r in results]
+    try:
+        data[:, 2] = [r[3] / r[4] for r in results]
+    except ZeroDivisionError:
+        # a baseline time of exactly zero leaves the ratio undefined
+        data[:, 2] = np.nan
+    data[:, 3] = [r[5] - r[6] for r in results]
+
+    rowlbls = ['Type {}, keep_open {}, slice {}'.format(
+        r[0], r[1], fmt_sliceobj(r[2])) for r in results]
+    collbls = ['Time', 'Baseline time', 'Time ratio', 'Memory deviation']
+
+    print(rst_table(data, rowlbls, collbls))
diff --git a/nibabel/benchmarks/bench_finite_range.py b/nibabel/benchmarks/bench_finite_range.py
@@ -24,6 +24,8 @@
 
 from numpy.testing import measure
 
+from nibabel.volumeutils import finite_range  # NOQA
+
 
 def bench_finite_range():
     rng = np.random.RandomState(20111001)
diff --git a/nibabel/openers.py b/nibabel/openers.py
@@ -12,12 +12,23 @@
 import bz2
 import gzip
 import sys
+import warnings
 from os.path import splitext
+from distutils.version import StrictVersion
 
-# is indexed_gzip present?
+# is indexed_gzip present and modern?
 try:
-    from indexed_gzip import SafeIndexedGzipFile
+    from indexed_gzip import SafeIndexedGzipFile, __version__ as version
+    # NOTE(review): StrictVersion raises ValueError on non-strict versions
     HAVE_INDEXED_GZIP = True
+    # releases older than 0.6.0 are treated the same as a missing module
+    if StrictVersion(version) < StrictVersion('0.6.0'):
+        warnings.warn('indexed_gzip is present, but too old '
+                      '(>= 0.6.0 required): {}'.format(version))
+        HAVE_INDEXED_GZIP = False
+
+    del version
+
 except ImportError:
     HAVE_INDEXED_GZIP = False
 
@@ -67,10 +78,10 @@ def readinto(self, buf):
             return n_read
 
 
-def _gzip_open(filename, mode='rb', compresslevel=9):
+def _gzip_open(filename, mode='rb', compresslevel=9, keep_open=False):
 
     # use indexed_gzip if possible for faster read access
-    if mode == 'rb' and HAVE_INDEXED_GZIP:
+    if keep_open and mode == 'rb' and HAVE_INDEXED_GZIP:
         gzip_file = SafeIndexedGzipFile(filename)
 
     # Fall-back to built-in GzipFile (wrapped with the BufferedGzipFile class
@@ -101,12 +112,13 @@ class Opener(object):
     \*args : positional arguments
         passed to opening method when `fileish` is str.  ``mode``, if not
         specified, is `rb`.  ``compresslevel``, if relevant, and not specified,
-        is set from class variable ``default_compresslevel``
+        is set from class variable ``default_compresslevel``. ``keep_open``, if
+        relevant, and not specified, is ``False``.
     \*\*kwargs : keyword arguments
         passed to opening method when `fileish` is str.  Change of defaults as
         for \*args
     """
-    gz_def = (_gzip_open, ('mode', 'compresslevel'))
+    gz_def = (_gzip_open, ('mode', 'compresslevel', 'keep_open'))
     bz2_def = (bz2.BZ2File, ('mode', 'buffering', 'compresslevel'))
     compress_ext_map = {
         '.gz': gz_def,
@@ -132,8 +144,15 @@ def __init__(self, fileish, *args, **kwargs):
         # Set default mode
         if 'mode' not in full_kwargs:
             kwargs['mode'] = 'rb'
+        # Default compression level
         if 'compresslevel' in arg_names and 'compresslevel' not in kwargs:
             kwargs['compresslevel'] = self.default_compresslevel
+        # Default keep_open hint
+        if 'keep_open' in arg_names:
+            kwargs.setdefault('keep_open', False)
+        # Clear keep_open hint if it is not relevant for the file type
+        else:
+            kwargs.pop('keep_open', None)
         self.fobj = opener(fileish, *args, **kwargs)
         self._name = fileish
         self.me_opened = True
diff --git a/nibabel/pkg_info.py b/nibabel/pkg_info.py
@@ -2,9 +2,9 @@
 import sys
 import subprocess
 try:
-    from ConfigParser import ConfigParser
+    from ConfigParser import RawConfigParser as ConfigParser
 except ImportError:
-    from configparser import ConfigParser  # python 3
+    from configparser import RawConfigParser as ConfigParser  # python 3
 
 COMMIT_INFO_FNAME = 'COMMIT_INFO.txt'
 
diff --git a/nibabel/tests/test_arrayproxy.py b/nibabel/tests/test_arrayproxy.py
@@ -33,6 +33,7 @@
 from nibabel.testing import VIRAL_MEMMAP
 
 from .test_fileslice import slicer_samples
+from .test_openers import patch_indexed_gzip
 
 
 class FunkyHeader(object):
@@ -412,21 +413,6 @@ def test_keep_file_open_true_false_invalid():
             ArrayProxy(fname, ((10, 10, 10), dtype), keep_file_open='cauto')
 
 
-@contextlib.contextmanager
-def patch_indexed_gzip(state):
-    # Make it look like we do (state==True) or do not (state==False) have
-    # the indexed gzip module.
-    if state:
-        values = (True, True, gzip.GzipFile)
-    else:
-        values = (False, False, None)
-    with mock.patch('nibabel.openers.HAVE_INDEXED_GZIP', values[0]), \
-         mock.patch('nibabel.arrayproxy.HAVE_INDEXED_GZIP', values[1]), \
-         mock.patch('nibabel.openers.SafeIndexedGzipFile', values[2],
-                    create=True):
-        yield
-
-
 @contextlib.contextmanager
 def patch_keep_file_open_default(value):
     # Patch arrayproxy.KEEP_FILE_OPEN_DEFAULT with the given value
diff --git a/nibabel/tests/test_openers.py b/nibabel/tests/test_openers.py