Skip to content

blosc_set_nthreads #25

@ax3l

Description

@ax3l

Hi,

is there a registered filter option to set the number of threads used by blosc from HDF5? Something like the blosc_set_nthreads call?

I cannot find it among the cd_values in blosc_filter.c.

#!/usr/bin/env python
#
from PIL import Image
import numpy as np
import h5py as h5
from profilehooks import profile
import blosc
import os


@profile
def benchmark(data, compression=None, repeat=100, **kwds):
    """Write ``data`` into 'data.hdf5' ``repeat`` times under the profiler.

    Args:
        data: Array handed to ``create_dataset`` (or, when ``parallel_blosc``
            is set, to ``blosc.pack_array`` first).
        compression: Forwarded as ``compression=`` to ``create_dataset``.
        repeat: Number of datasets to write; they are named
            ``data_00000``, ``data_00001``, ...
        **kwds: Extra keyword arguments forwarded to ``create_dataset``.
            The special key ``parallel_blosc`` is consumed here: when truthy,
            each image is pre-compressed with python-blosc (4 threads, LZ4,
            bitshuffle, level 9) and stored as an opaque ``np.void`` blob.
    """
    parallel_blosc = kwds.pop('parallel_blosc', False)
    if parallel_blosc:
        blosc.set_nthreads(4)
        blosc.set_releasegil(True)
    else:
        c_data = data

    # The context manager closes the file deterministically, even if a write
    # raises, instead of relying on `del f` triggering finalisation.
    with h5.File('data.hdf5', 'w') as f:
        for i in range(repeat):
            if parallel_blosc:
                # Re-compress on every iteration on purpose: the compression
                # cost is part of what is being measured.
                c_data = np.void(blosc.pack_array(data, clevel=9, shuffle=blosc.BITSHUFFLE, cname='lz4'))

            f.create_dataset("data_{:05}".format(i), data=c_data,
                             compression=compression,
                             **kwds)

    os.sync()  # flush out of kernel memory caches


def blosc_opts(complevel=9, complib='blosc:lz4', shuffle=1):
    """Build the ``create_dataset`` keyword arguments for the Blosc HDF5 filter.

    Args:
        complevel: Compression level placed into the filter options.
        complib: Compressor name in the form ``'blosc:<name>'`` where
            ``<name>`` is one of blosclz, lz4, lhz4hc, snappy, zlib, zstd
            (see the list below for the exact spellings).
        shuffle: ``'bit'`` or ``2`` selects bitshuffle, any other truthy
            value selects byte shuffle, a falsy value disables shuffling.

    Returns:
        A dict with the keys ``compression``, ``compression_opts`` and
        ``shuffle``, meant to be splatted into ``create_dataset``.

    Raises:
        ValueError: If ``complib`` is not one of the known compressors.
    """
    # Accept the numeric code 2 as well as 'bit'; previously 2 was treated as
    # merely "truthy" and silently downgraded to byte shuffle (1).
    if shuffle in ('bit', 2):
        shuffle = 2
    else:
        shuffle = 1 if shuffle else 0
    compressors = ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd']
    # The filter takes the compressor as its position in this list.
    complib = ['blosc:' + c for c in compressors].index(complib)
    args = {
        'compression': 32001,  # filter ID used for Blosc
        # Leading four slots are left at 0; the last three carry the options.
        'compression_opts': (0, 0, 0, 0, complevel, shuffle, complib)
    }
    # Always disable the dataset-level shuffle option; shuffling is requested
    # from Blosc itself through compression_opts above.
    args['shuffle'] = False
    return args


if __name__ == "__main__":
    # Input image used for every run below.
    # some_large.png: PNG image data, about 2000 x 2000, 16-bit grayscale, non-interlaced
    data = np.array(Image.open("some_large.png")).astype("uint16")

    # Each call below is one benchmark configuration; the trailing comment
    # records the resulting file size and the time per image. Enable one
    # configuration at a time.
    benchmark(data)  # raw:                                           966MB,   ~9.1ms/image

    #benchmark(data, "gzip", compression_opts=0)  # "zlib" ("off"):    973MB,  ~18  ms/image
    #benchmark(data, "gzip", compression_opts=1)  # "zlib" ("fast"):   288MB, ~110  ms/image
    #benchmark(data, "gzip", compression_opts=5)  # "zlib" ("medium"): 242MB, ~296  ms/image
    #benchmark(data, "gzip", compression_opts=9)  # "zlib" ("strong"): 219MB, 5030  ms/image
    #benchmark(data, "lzf")  # "LZF" (has no options):                 442MB,   ~40ms/image

    # these use the HDF5 filter but do not thread!
    #benchmark(data, **blosc_opts(1, 'blosc:lz4', 2))  # "LZ4" ("fast") 556MB   27ms/image
    #benchmark(data, **blosc_opts(2, 'blosc:lz4', 2))  # "LZ4" ("fast") 555MB   30ms/image
    #benchmark(data, **blosc_opts(9, 'blosc:lz4', 2))  # "LZ4" ("strong")

    # these pre-compress with blosc to a void and do thread!
    #benchmark(data, parallel_blosc=True)  # LZ4, 4 threads, bitshuffle  339MB  ~9.5ms/image

I saw there is the BLOSC_NTHREADS environment variable for c-blosc in 1.9.0+, but it does not seem to have an effect here... (I am using this through h5py.) I guess I need something like

blosc.set_nthreads(4)
blosc.set_releasegil(True)

of the python-blosc bindings... on top...

Thanks,
Axel

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions