|
1 | 1 | from cStringIO import StringIO
|
2 | 2 | import gzip
|
| 3 | +import struct |
| 4 | + |
# Field values of the xerial v1 stream header, in wire order: the marker byte
# (-126), the six characters of the magic string 'SNAPPY', a zero pad byte,
# then the format version (1) and the minimum compatible version (1).
_XERIAL_V1_HEADER = (-126, 'S', 'N', 'A', 'P', 'P', 'Y', 0, 1, 1)
# struct format codes matching the header fields one-to-one: a signed byte,
# six single chars, an unsigned byte and two 32-bit ints (16 bytes in total
# when packed with the '!' network-order prefix).
_XERIAL_V1_FORMAT = 'bccccccBii'
3 | 7 |
|
4 | 8 | try:
|
5 | 9 | import snappy
|
@@ -36,13 +40,101 @@ def gzip_decode(payload):
|
36 | 40 | return result
|
37 | 41 |
|
38 | 42 |
|
39 |
| -def snappy_encode(payload): |
def snappy_encode(payload, xerial_compatible=False, xerial_blocksize=32 * 1024):
    """Encode the given data with snappy.

    If xerial_compatible is set then the stream is encoded in a fashion
    compatible with the xerial snappy library; otherwise the payload is
    compressed as a single raw snappy block.

    The block size (xerial_blocksize) controls how frequently the blocking
    occurs; 32k is the default in the xerial library. It is only consulted
    when xerial_compatible is set.

    The format winds up being
        +-------------+------------+--------------+------------+--------------+
        |   Header    | Block1 len | Block1 data  | Blockn len | Blockn data  |
        |-------------+------------+--------------+------------+--------------|
        |  16 bytes   |  BE int32  | snappy bytes |  BE int32  | snappy bytes |
        +-------------+------------+--------------+------------+--------------+

    It is important to note that the blocksize is the amount of uncompressed
    data presented to snappy at each block, whereas the blocklen is the
    number of compressed bytes that will be present in the stream, so the
    length is normally <= blocksize.

    Raises NotImplementedError if the snappy module is not available, and
    ValueError if xerial_compatible is set with a non-positive
    xerial_blocksize.
    """
    if not _has_snappy:
        raise NotImplementedError("Snappy codec is not available")

    if not xerial_compatible:
        return snappy.compress(payload)

    # A zero step makes xrange fail with an unrelated message and a negative
    # step yields no chunks at all, which would silently drop the payload.
    if xerial_blocksize <= 0:
        raise ValueError("xerial_blocksize must be a positive integer")

    out = StringIO()
    # '!' is network (big-endian) order without padding, so the whole header
    # can be packed in a single call.
    out.write(struct.pack('!' + _XERIAL_V1_FORMAT, *_XERIAL_V1_HEADER))

    for start in xrange(0, len(payload), xerial_blocksize):
        block = snappy.compress(payload[start:start + xerial_blocksize])
        # Each block is prefixed with its compressed length as a BE int32.
        out.write(struct.pack('!i', len(block)))
        out.write(block)

    return out.getvalue()
| 88 | + |
| 89 | + |
def _detect_xerial_stream(payload):
    """Detect whether the data might have been encoded with the blocking
    mode of the xerial snappy library.

    This mode writes a magic header of the format:
        +--------+--------------+------------+---------+--------+
        | Marker | Magic String | Null / Pad | Version | Compat |
        |--------+--------------+------------+---------+--------|
        |  byte  |   c-string   |    byte    |  int32  | int32  |
        |--------+--------------+------------+---------+--------|
        |  -126  |   'SNAPPY'   |    0x00    |         |        |
        +--------+--------------+------------+---------+--------+

    The pad appears to be there to ensure that SNAPPY is a valid cstring.
    The version is the version of this format as written by xerial;
    in the wild this is currently 1, as such we only support v1.

    Compat is there to claim the minimum supported version that
    can read a xerial block stream; presently in the wild this is 1.

    Returns True when the first 16 bytes of payload equal the v1 header,
    False otherwise (including when payload is shorter than 16 bytes).
    """
    # A stream holding only the 16-byte header (no blocks) is still a valid
    # xerial stream, so exactly 16 bytes must be accepted.
    if len(payload) < 16:
        return False

    # Slice before converting so only the header bytes are copied.
    header = struct.unpack('!' + _XERIAL_V1_FORMAT, bytes(payload[:16]))
    return header == _XERIAL_V1_HEADER
43 | 116 |
|
44 | 117 |
|
def snappy_decode(payload):
    """Decode snappy-compressed data.

    If the payload starts with the xerial block-stream header, the header is
    skipped and every length-prefixed block that follows is decompressed and
    concatenated. Otherwise the payload is decompressed as a single raw
    snappy block.

    Raises NotImplementedError if the snappy module is not available.
    """
    if not _has_snappy:
        raise NotImplementedError("Snappy codec is not available")

    if not _detect_xerial_stream(payload):
        return snappy.decompress(payload)

    # TODO ? Should become a fileobj ?
    out = StringIO()
    # Everything after the 16-byte header is a sequence of blocks.
    byt = buffer(payload[16:])
    length = len(byt)
    cursor = 0

    while cursor < length:
        # Read the BE int32 block length in place; passing the offset avoids
        # copying the remainder of the stream on every iteration.
        block_size = struct.unpack_from('!i', byt, cursor)[0]
        # Skip the block size
        cursor += 4
        end = cursor + block_size
        out.write(snappy.decompress(byt[cursor:end]))
        cursor = end

    return out.getvalue()
0 commit comments