|
154 | 154 | ((defined(LIBPOPCNT_HAVE_AVX512) && !(defined(__AVX512__) || \
|
155 | 155 | (defined(__AVX512F__) && \
|
156 | 156 | defined(__AVX512BW__) && \
|
157 |
| - defined(__AVX512VPOPCNTDQ__) && \ |
158 |
| - defined(__AVX512BITALG__)))) || \ |
| 157 | + defined(__AVX512VPOPCNTDQ__)))) || \ |
159 | 158 | (defined(LIBPOPCNT_HAVE_AVX2) && !defined(__AVX2__)) || \
|
160 | 159 | (defined(LIBPOPCNT_HAVE_POPCNT) && !defined(__POPCNT__)))
|
161 | 160 | #define LIBPOPCNT_HAVE_CPUID
|
@@ -265,7 +264,6 @@ static inline uint64_t popcnt64(uint64_t x)
|
265 | 264 | #define LIBPOPCNT_BIT_AVX512BW (1 << 30)
|
266 | 265 |
|
267 | 266 | /* %ecx bit flags */
|
268 |
| -#define LIBPOPCNT_BIT_AVX512_BITALG (1 << 12) |
269 | 267 | #define LIBPOPCNT_BIT_AVX512_VPOPCNTDQ (1 << 14)
|
270 | 268 | #define LIBPOPCNT_BIT_POPCNT (1 << 23)
|
271 | 269 |
|
@@ -361,8 +359,7 @@ static inline int get_cpuid(void)
|
361 | 359 | /* then we add LIBPOPCNT_BIT_AVX512_VPOPCNTDQ to our CPUID flags. */
|
362 | 360 | if ((abcd[1] & LIBPOPCNT_BIT_AVX512F) == LIBPOPCNT_BIT_AVX512F &&
|
363 | 361 | (abcd[1] & LIBPOPCNT_BIT_AVX512BW) == LIBPOPCNT_BIT_AVX512BW &&
|
364 |
| - (abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ && |
365 |
| - (abcd[2] & LIBPOPCNT_BIT_AVX512_BITALG) == LIBPOPCNT_BIT_AVX512_BITALG) |
| 362 | + (abcd[2] & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) == LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) |
366 | 363 | flags |= LIBPOPCNT_BIT_AVX512_VPOPCNTDQ;
|
367 | 364 | }
|
368 | 365 | }
|
@@ -487,7 +484,7 @@ static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size)
|
487 | 484 | #include <immintrin.h>
|
488 | 485 |
|
489 | 486 | #if __has_attribute(target)
|
490 |
| - __attribute__ ((target ("avx512f,avx512bw,avx512vpopcntdq,avx512bitalg"))) |
| 487 | + __attribute__ ((target ("avx512f,avx512bw,avx512vpopcntdq"))) |
491 | 488 | #endif
|
492 | 489 | static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size)
|
493 | 490 | {
|
@@ -528,9 +525,8 @@ static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size)
|
528 | 525 | {
|
529 | 526 | __mmask64 mask = (__mmask64) (0xffffffffffffffffull >> (i + 64 - size));
|
530 | 527 | __m512i vec = _mm512_maskz_loadu_epi8(mask, &ptr8[i]);
|
531 |
| - __m512i cnt8 = _mm512_popcnt_epi8(vec); |
532 |
| - cnt8 = _mm512_sad_epu8(cnt8, _mm512_setzero_si512()); |
533 |
| - cnt = _mm512_add_epi64(cnt, cnt8); |
| 528 | + vec = _mm512_popcnt_epi64(vec); |
| 529 | + cnt = _mm512_add_epi64(cnt, vec); |
534 | 530 | }
|
535 | 531 |
|
536 | 532 | return _mm512_reduce_add_epi64(cnt);
|
@@ -581,8 +577,7 @@ static uint64_t popcnt(const void* data, uint64_t size)
|
581 | 577 | #if defined(__AVX512__) || \
|
582 | 578 | (defined(__AVX512F__) && \
|
583 | 579 | defined(__AVX512BW__) && \
|
584 |
| - defined(__AVX512VPOPCNTDQ__) && \ |
585 |
| - defined(__AVX512BITALG__)) |
| 580 | + defined(__AVX512VPOPCNTDQ__)) |
586 | 581 | /* For tiny arrays AVX512 is not worth it */
|
587 | 582 | if (i + 40 <= size)
|
588 | 583 | #else
|
|
0 commit comments