Skip to content

Commit 2a8a553

Browse files
committed
Fix AVX2 function for x86 arch
1 parent bd5b218 commit 2a8a553

File tree

1 file changed

+10
-2
lines changed

1 file changed

+10
-2
lines changed

source/Platform/x86/AVX2.cpp

+10-2
Original file line number | Diff line number | Diff line change
@@ -111,8 +111,16 @@ class AVX2Vector {
111111
#else
112112
inline uint16_t sum() const {
113113
__m256i sum256 = _mm256_sad_epu8(vector, _mm256_setzero_si256());
114-
__m128i sum128 = _mm_add_epi64(_mm256_extractf128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
115-
return _mm_extract_epi32(sum128, 0) + _mm_extract_epi32(sum128, 1) + _mm_extract_epi32(sum128, 2) + _mm_extract_epi32(sum128, 3);
114+
__m128i sum128 = _mm_add_epi32(_mm256_extractf128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
115+
116+
// Sum the 32-bit elements in the lower 64 bits
117+
sum128 = _mm_add_epi32(sum128, _mm_srli_si128(sum128, 8));
118+
119+
// Sum the 16-bit elements in the lower 32 bits
120+
sum128 = _mm_add_epi16(_mm_unpacklo_epi32(sum128, _mm_setzero_si128()), _mm_unpackhi_epi32(sum128, _mm_setzero_si128()));
121+
122+
// Extract the result as a 16-bit integer
123+
return static_cast<uint16_t>(_mm_extract_epi16(sum128, 0));
116124
}
117125
#endif
118126

0 commit comments

Comments
 (0)