Skip to content

Commit 5214d3f

Browse files
committed
Simplify AVX512 algorithm
1 parent 6b986a4 commit 5214d3f

File tree

3 files changed

+14
-13
lines changed

3 files changed

+14
-13
lines changed

Diff for: ChangeLog

+7
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
2024-06-29 Kim Walisch <[email protected]>
2+
3+
Version 3.1
4+
5+
* Improve AVX512 algorithm for trailing 64 bytes.
6+
* The AVX512 algorithm no longer requires the AVX512-BITALG extension.
7+
18
2024-06-27 Kim Walisch <[email protected]>
29

310
Version 3.0

Diff for: benchmark.cpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,7 @@ int main(int argc, char* argv[])
102102
#if defined(LIBPOPCNT_HAVE_AVX512) && (defined(__AVX512__) || \
103103
(defined(__AVX512F__) && \
104104
defined(__AVX512BW__) && \
105-
defined(__AVX512VPOPCNTDQ__) && \
106-
defined(__AVX512BITALG__)))
105+
defined(__AVX512VPOPCNTDQ__)))
107106
if (algo.empty() && bytes >= 40)
108107
algo = "AVX512";
109108
#endif

Diff for: libpopcnt.h

+6-11
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,7 @@
154154
((defined(LIBPOPCNT_HAVE_AVX512) && !(defined(__AVX512__) || \
155155
(defined(__AVX512F__) && \
156156
defined(__AVX512BW__) && \
157-
defined(__AVX512VPOPCNTDQ__) && \
158-
defined(__AVX512BITALG__)))) || \
157+
defined(__AVX512VPOPCNTDQ__)))) || \
159158
(defined(LIBPOPCNT_HAVE_AVX2) && !defined(__AVX2__)) || \
160159
(defined(LIBPOPCNT_HAVE_POPCNT) && !defined(__POPCNT__)))
161160
#define LIBPOPCNT_HAVE_CPUID
@@ -265,7 +264,6 @@ static inline uint64_t popcnt64(uint64_t x)
265264
#define LIBPOPCNT_BIT_AVX512BW (1 << 30)
266265

267266
/* %ecx bit flags */
268-
#define LIBPOPCNT_BIT_AVX512_BITALG (1 << 12)
269267
#define LIBPOPCNT_BIT_AVX512_VPOPCNTDQ (1 << 14)
270268
#define LIBPOPCNT_BIT_POPCNT (1 << 23)
271269

@@ -361,8 +359,7 @@ static inline int get_cpuid(void)
361359
/* then we add LIBPOPCNT_BIT_AVX512_VPOPCNTDQ to our CPUID flags. */
362360
if ((abcd[1] & LIBPOPCNT_BIT_AVX512F) == LIBPOPCNT_BIT_AVX512F &&
363361
(abcd[1] & LIBPOPCNT_BIT_AVX512BW) == LIBPOPCNT_BIT_AVX512BW &&
364-
(abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ &&
365-
(abcd[2] & LIBPOPCNT_BIT_AVX512_BITALG) == LIBPOPCNT_BIT_AVX512_BITALG)
362+
(abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ)
366363
flags |= LIBPOPCNT_BIT_AVX512_VPOPCNTDQ;
367364
}
368365
}
@@ -487,7 +484,7 @@ static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size)
487484
#include <immintrin.h>
488485

489486
#if __has_attribute(target)
490-
__attribute__ ((target ("avx512f,avx512bw,avx512vpopcntdq,avx512bitalg")))
487+
__attribute__ ((target ("avx512f,avx512bw,avx512vpopcntdq")))
491488
#endif
492489
static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size)
493490
{
@@ -528,9 +525,8 @@ static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size)
528525
{
529526
__mmask64 mask = (__mmask64) (0xffffffffffffffffull >> (i + 64 - size));
530527
__m512i vec = _mm512_maskz_loadu_epi8(mask, &ptr8[i]);
531-
__m512i cnt8 = _mm512_popcnt_epi8(vec);
532-
cnt8 = _mm512_sad_epu8(cnt8, _mm512_setzero_si512());
533-
cnt = _mm512_add_epi64(cnt, cnt8);
528+
vec = _mm512_popcnt_epi64(vec);
529+
cnt = _mm512_add_epi64(cnt, vec);
534530
}
535531

536532
return _mm512_reduce_add_epi64(cnt);
@@ -581,8 +577,7 @@ static uint64_t popcnt(const void* data, uint64_t size)
581577
#if defined(__AVX512__) || \
582578
(defined(__AVX512F__) && \
583579
defined(__AVX512BW__) && \
584-
defined(__AVX512VPOPCNTDQ__) && \
585-
defined(__AVX512BITALG__))
580+
defined(__AVX512VPOPCNTDQ__))
586581
/* For tiny arrays AVX512 is not worth it */
587582
if (i + 40 <= size)
588583
#else

0 commit comments

Comments
 (0)