@@ -134,27 +134,27 @@ struct LZSSMatch
134
134
};
135
135
136
136
/* *
137
- @return The number of bytes written into 'dst' or a negative value if this function fails.
138
- @param capacity ... Byte size of a destination buffer 'dst'. This is at least more than the size which is reterned by the compressBound.
139
- @param dst ... A destination buffer.
140
- @param size ... Byte size of a source data. The maximum supported value for this is MAX_BLOCK_SIZE.
141
- @param src ... A source data.
142
- */
137
+ @return The number of bytes written into 'dst' or a negative value if this function fails.
138
+ @param capacity ... Byte size of a destination buffer 'dst'. This must be at least the size which is returned by compressBound.
139
+ @param dst ... A destination buffer.
140
+ @param size ... Byte size of a source data. The maximum supported value for this is MAX_BLOCK_SIZE.
141
+ @param src ... A source data.
142
+ */
143
143
s32 compress (SLZ4Context& context, u32 capacity, u8* dst, u32 size, const u8* src);
144
144
145
145
/* *
146
- @return The maximum number of bytes that compression may write into a destination buffer.
147
- @param size ... Byte size of a source data. The maximum supported value for this is MAX_BLOCK_SIZE.
148
- */
146
+ @return The maximum number of bytes that compression may write into a destination buffer.
147
+ @param size ... Byte size of a source data. The maximum supported value for this is MAX_BLOCK_SIZE.
148
+ */
149
149
s32 compressBound (u32 size);
150
150
151
151
/* *
152
- @return The number of bytes written into 'dst' or a negative value if this function fails.
153
- @param capacity ... Byte size of a destination buffer 'dst'. The maximum supported value for this is MAX_BLOCK_SIZE.
154
- @param dst ... A destination buffer.
155
- @param size ... Byte size of a compressed data 'src'.
156
- @param src ... A source compressed data.
157
- */
152
+ @return The number of bytes written into 'dst' or a negative value if this function fails.
153
+ @param capacity ... Byte size of a destination buffer 'dst'. The maximum supported value for this is MAX_BLOCK_SIZE.
154
+ @param dst ... A destination buffer.
155
+ @param size ... Byte size of a compressed data 'src'.
156
+ @param src ... A source compressed data.
157
+ */
158
158
s32 decompress (u32 capacity, u8* dst, u32 size, const u8* src);
159
159
160
160
} // namespace slz4
@@ -165,8 +165,8 @@ s32 decompress(u32 capacity, u8* dst, u32 size, const u8* src);
165
165
#include < immintrin.h>
166
166
167
167
#define XXH_INLINE_ALL
168
- #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
169
- #define XXH_IMPLEMENTATION /* access definitions */
168
+ #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
169
+ #define XXH_IMPLEMENTATION /* access definitions */
170
170
#include < xxhash.h>
171
171
172
172
namespace slz4
@@ -187,7 +187,7 @@ namespace
187
187
188
188
/**
 @return 'code' shifted right by one byte, with 'u' shifted into bits 24 and above.
 @param code ... The current 32-bit packed code.
 @param u ... The value placed into the top byte. Bits of 'u' above the lowest 8 are shifted out.
 */
u32 push(u32 code, u32 u)
{
    const u32 lower = code >> 8; // drop the lowest byte of the old code
    const u32 upper = u << 24;   // move the new value into the highest byte
    return lower | upper;
}
192
192
193
193
u32 hash (u32 code)
@@ -365,27 +365,42 @@ namespace
365
365
}
366
366
367
367
// -------------------------------------------------------------------
368
- void copy (u8* dst, const u8* src , u32 size)
368
+ void set (u8* dst, s32 value , u32 size)
369
369
{
370
- u32 s = size>>6 ;
370
+ u32 s = size>>7 ;
371
+ SLZ4_ASSERT (size == (s<<7 ));
372
+ __m256i x = _mm256_set1_epi32 (value);
371
373
for (u32 i=0 ; i<s; ++i){
374
+ _mm256_store_si256 (reinterpret_cast <__m256i*>(dst), x);
375
+ _mm256_store_si256 (reinterpret_cast <__m256i*>(dst+32 ), x);
376
+ _mm256_store_si256 (reinterpret_cast <__m256i*>(dst+64 ), x);
377
+ _mm256_store_si256 (reinterpret_cast <__m256i*>(dst+96 ), x);
378
+ dst += 128 ;
379
+ }
380
+ }
381
+
382
+ // -------------------------------------------------------------------
383
+ void copy (u8* dst, const u8* src, u32 size)
384
+ {
385
+ u32 s = size >> 6 ;
386
+ for (u32 i = 0 ; i < s; ++i) {
372
387
_mm_storeu_ps (reinterpret_cast <f32*>(dst), _mm_loadu_ps (reinterpret_cast <const f32*>(src)));
373
- _mm_storeu_ps (reinterpret_cast <f32*>(dst+ 16 ), _mm_loadu_ps (reinterpret_cast <const f32*>(src+ 16 )));
374
- _mm_storeu_ps (reinterpret_cast <f32*>(dst+ 32 ), _mm_loadu_ps (reinterpret_cast <const f32*>(src+ 32 )));
375
- _mm_storeu_ps (reinterpret_cast <f32*>(dst+ 48 ), _mm_loadu_ps (reinterpret_cast <const f32*>(src+ 48 )));
388
+ _mm_storeu_ps (reinterpret_cast <f32*>(dst + 16 ), _mm_loadu_ps (reinterpret_cast <const f32*>(src + 16 )));
389
+ _mm_storeu_ps (reinterpret_cast <f32*>(dst + 32 ), _mm_loadu_ps (reinterpret_cast <const f32*>(src + 32 )));
390
+ _mm_storeu_ps (reinterpret_cast <f32*>(dst + 48 ), _mm_loadu_ps (reinterpret_cast <const f32*>(src + 48 )));
376
391
dst += 64 ;
377
392
src += 64 ;
378
393
}
379
- size -= (s<< 6 );
380
- s = size>> 4 ;
381
- for (u32 i= 0 ; i< s; ++i){
394
+ size -= (s << 6 );
395
+ s = size >> 4 ;
396
+ for (u32 i = 0 ; i < s; ++i) {
382
397
_mm_storeu_ps (reinterpret_cast <f32*>(dst), _mm_loadu_ps (reinterpret_cast <const f32*>(src)));
383
398
dst += 16 ;
384
399
src += 16 ;
385
400
}
386
401
387
- s = size - (s<< 4 );
388
- for (u32 i= 0 ; i< s; ++i){
402
+ s = size - (s << 4 );
403
+ for (u32 i = 0 ; i < s; ++i) {
389
404
dst[i] = src[i];
390
405
}
391
406
}
@@ -406,7 +421,8 @@ s32 compress(SLZ4Context& context, u32 capacity, u8* dst, u32 size, const u8* sr
406
421
: -1 ;
407
422
}
408
423
409
- memset (context.entries_ , -1 , sizeof (s32) * DICTIONARY_SIZE);
424
+ // memset(context.entries_, -1, sizeof(s32) * DICTIONARY_SIZE);
425
+ set (reinterpret_cast <u8*>(context.entries_ ), -1 , sizeof (s32)*DICTIONARY_SIZE);
410
426
{ // Add the first code to our dictionary
411
427
u32 code = pack (src);
412
428
u32 index = hash (code) & (DICTIONARY_SIZE - 1 );
@@ -418,7 +434,7 @@ s32 compress(SLZ4Context& context, u32 capacity, u8* dst, u32 size, const u8* sr
418
434
u32 pending = 0 ;
419
435
u32 position = 4 ;
420
436
while (position < endMatch) {
421
- u32 code = pack (src+ position);
437
+ u32 code = pack (src + position);
422
438
LZSSMatch match = findLongestMatch (context, code, src, position, end);
423
439
424
440
if (MIN_MATCH_LENGTH <= match.length_ ) {
@@ -454,7 +470,7 @@ s32 decompress(u32 capacity, u8* dst, u32 size, const u8* src)
454
470
{
455
471
SLZ4_ASSERT (0 <= size);
456
472
SLZ4_ASSERT (0 <= capacity && capacity <= MAX_BLOCK_SIZE);
457
- if (MAX_BLOCK_SIZE< capacity){
473
+ if (MAX_BLOCK_SIZE < capacity) {
458
474
return -1 ;
459
475
}
460
476
const u8* current = src;
@@ -464,10 +480,11 @@ s32 decompress(u32 capacity, u8* dst, u32 size, const u8* src)
464
480
u8* d = dst;
465
481
466
482
for (;;) {
467
- // Decode token
468
483
if (end0 <= current) {
469
- return - 1 ;
484
+ break ;
470
485
}
486
+
487
+ // Decode token
471
488
u32 literalLength = (current[0 ] >> 4 );
472
489
u32 matchLength = (current[0 ]) & 0xFU ;
473
490
++current;
@@ -478,13 +495,12 @@ s32 decompress(u32 capacity, u8* dst, u32 size, const u8* src)
478
495
}
479
496
480
497
// Read literals
481
- if (end0< (current + literalLength)){
498
+ if (end0 < (current + literalLength)) {
482
499
return -1 ;
483
500
}
484
- if (dend< (d + literalLength)){
501
+ if (dend < (d + literalLength)) {
485
502
return -1 ;
486
503
}
487
- // memcpy(d, current, literalLength);
488
504
copy (d, current, literalLength);
489
505
d += literalLength;
490
506
current += literalLength;
@@ -503,15 +519,15 @@ s32 decompress(u32 capacity, u8* dst, u32 size, const u8* src)
503
519
}
504
520
505
521
// Copy match
506
- if (static_cast <s32>(d- dst) < offset) {
522
+ if (static_cast <s32>(d - dst) < offset) {
507
523
return -1 ;
508
524
}
509
525
matchLength += MIN_MATCH_LENGTH;
510
- if (dend< (d + matchLength)){
526
+ if (dend < (d + matchLength)) {
511
527
return -1 ;
512
528
}
513
529
if (16 <= offset) {
514
- copy (d, d- offset, matchLength);
530
+ copy (d, d - offset, matchLength);
515
531
d += matchLength;
516
532
} else {
517
533
while (0 < matchLength) {
@@ -521,7 +537,8 @@ s32 decompress(u32 capacity, u8* dst, u32 size, const u8* src)
521
537
}
522
538
}
523
539
}
524
- return static_cast <s32>(d- dst);
540
+ return static_cast <s32>(d - dst);
525
541
}
542
+
526
543
} // namespace slz4
527
544
#endif // SLZ4_IMPLEMENTATION
0 commit comments