Skip to content

Commit e185c73

Browse files
committed
Fix detection of AVX2 support
Fixes NLnetLabs#222.
1 parent 380abdb commit e185c73

File tree

1 file changed

+73
-29
lines changed

1 file changed

+73
-29
lines changed

src/isadetection.h

+73-29
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*
44
* Slightly modified version of isadetection.h in simdjson.
55
*
6+
* Copyright (c) 2024 NLnet Labs (Jeroen Koekkoek)
67
* Copyright (c) 2020- simdjson (Daniel Lemire,
78
* Geoff Langdale,
89
* John Keiser)
@@ -54,10 +55,12 @@
5455
#ifndef ISADETECTION_H
5556
#define ISADETECTION_H
5657

58+
#include <stdbool.h>
5759
#include <stdint.h>
5860
#include <stdlib.h>
5961
#if defined(_MSC_VER)
6062
#include <intrin.h>
63+
#include <immintrin.h>
6164
#elif defined(HAVE_CPUID)
6265
#include <cpuid.h>
6366
#endif
@@ -107,20 +110,22 @@ static inline uint32_t detect_supported_architectures() {
107110
#elif defined(__x86_64__) || defined(_M_AMD64) // x64
108111

109112
// Can be found on Intel ISA Reference for CPUID
110-
static const uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7
111-
static const uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7
112-
static const uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7
113-
static const uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7
114-
static const uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7
115-
static const uint32_t cpuid_avx512ifma_bit = 1 << 21; ///< @private bit 21 of EBX for EAX=0x7
116-
static const uint32_t cpuid_avx512pf_bit = 1 << 26; ///< @private bit 26 of EBX for EAX=0x7
117-
static const uint32_t cpuid_avx512er_bit = 1 << 27; ///< @private bit 27 of EBX for EAX=0x7
118-
static const uint32_t cpuid_avx512cd_bit = 1 << 28; ///< @private bit 28 of EBX for EAX=0x7
119-
static const uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7
120-
static const uint32_t cpuid_avx512vl_bit = 1U << 31; ///< @private bit 31 of EBX for EAX=0x7
121-
static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7
122-
static const uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1
123-
static const uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1
113+
static const uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7
114+
static const uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7
115+
static const uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7
116+
static const uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7
117+
static const uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7
118+
static const uint32_t cpuid_avx512ifma_bit = 1 << 21; ///< @private bit 21 of EBX for EAX=0x7
119+
static const uint32_t cpuid_avx512pf_bit = 1 << 26; ///< @private bit 26 of EBX for EAX=0x7
120+
static const uint32_t cpuid_avx512er_bit = 1 << 27; ///< @private bit 27 of EBX for EAX=0x7
121+
static const uint32_t cpuid_avx512cd_bit = 1 << 28; ///< @private bit 28 of EBX for EAX=0x7
122+
static const uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7
123+
static const uint32_t cpuid_avx512vl_bit = 1U << 31; ///< @private bit 31 of EBX for EAX=0x7
124+
static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7
125+
static const uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1
126+
static const uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1
127+
static const uint32_t cpuid_have_xgetbv_bit = 1 << 27; ///< @private bit 27 of ECX for EAX=0x1
128+
static const uint32_t cpuid_have_avx_bit = 1 << 28; ///< @private bit 28 of ECX for EAX=0x1
124129

125130
static inline void cpuid(
126131
uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
@@ -145,17 +150,27 @@ static inline void cpuid(
145150
#endif
146151
}
147152

148-
static inline uint32_t detect_supported_architectures(void) {
153+
static inline uint64_t xgetbv(uint32_t ecx)
154+
{
155+
#if defined(_MSC_VER)
156+
return _xgetbv(ecx);
157+
#else
158+
uint32_t a, c = ecx, d;
159+
asm volatile("xgetbv\n\t" : "=d"(d), "=a"(a) : "c"(c));
160+
uint64_t xcr0 = ((uint64_t)d << 32) | (uint64_t)a;
161+
return xcr0;
162+
#endif
163+
}
164+
165+
static inline uint32_t detect_supported_architectures(void)
166+
{
149167
uint32_t eax, ebx, ecx, edx;
150-
uint32_t host_isa = 0x0;
168+
uint32_t host_isa = 0x0, host_avx_isa = 0x0;
151169

152170
// ECX for EAX=0x7
153171
eax = 0x7;
154172
ecx = 0x0;
155173
cpuid(&eax, &ebx, &ecx, &edx);
156-
if (ebx & cpuid_avx2_bit) {
157-
host_isa |= AVX2;
158-
}
159174
if (ebx & cpuid_bmi1_bit) {
160175
host_isa |= BMI1;
161176
}
@@ -164,46 +179,51 @@ static inline uint32_t detect_supported_architectures(void) {
164179
host_isa |= BMI2;
165180
}
166181

182+
if (ebx & cpuid_avx2_bit) {
183+
host_avx_isa |= AVX2;
184+
}
185+
167186
if (ebx & cpuid_avx512f_bit) {
168-
host_isa |= AVX512F;
187+
host_avx_isa |= AVX512F;
169188
}
170189

171190
if (ebx & cpuid_avx512dq_bit) {
172-
host_isa |= AVX512DQ;
191+
host_avx_isa |= AVX512DQ;
173192
}
174193

175194
if (ebx & cpuid_avx512ifma_bit) {
176-
host_isa |= AVX512IFMA;
195+
host_avx_isa |= AVX512IFMA;
177196
}
178197

179198
if (ebx & cpuid_avx512pf_bit) {
180-
host_isa |= AVX512PF;
199+
host_avx_isa |= AVX512PF;
181200
}
182201

183202
if (ebx & cpuid_avx512er_bit) {
184-
host_isa |= AVX512ER;
203+
host_avx_isa |= AVX512ER;
185204
}
186205

187206
if (ebx & cpuid_avx512cd_bit) {
188-
host_isa |= AVX512CD;
207+
host_avx_isa |= AVX512CD;
189208
}
190209

191210
if (ebx & cpuid_avx512bw_bit) {
192-
host_isa |= AVX512BW;
211+
host_avx_isa |= AVX512BW;
193212
}
194213

195214
if (ebx & cpuid_avx512vl_bit) {
196-
host_isa |= AVX512VL;
215+
host_avx_isa |= AVX512VL;
197216
}
198217

199218
if (ecx & cpuid_avx512vbmi2_bit) {
200-
host_isa |= AVX512VBMI2;
219+
host_avx_isa |= AVX512VBMI2;
201220
}
202221

222+
bool have_avx = false, have_xgetbv = false;
223+
203224
// EBX for EAX=0x1
204225
eax = 0x1;
205226
cpuid(&eax, &ebx, &ecx, &edx);
206-
207227
if (ecx & cpuid_sse42_bit) {
208228
host_isa |= SSE42;
209229
}
@@ -212,6 +232,30 @@ static inline uint32_t detect_supported_architectures(void) {
212232
host_isa |= PCLMULQDQ;
213233
}
214234

235+
// Correct detection of AVX2 support requires more than checking the CPUID
236+
// bit. Peter Cordes provides an excellent answer on Stack Overflow
237+
// (https://stackoverflow.com/a/34071400) quoting the article Introduction
238+
// to Intel Advanced Vector Extensions (search Wayback Machine).
239+
//
240+
// 1. Verify that the operating system supports XGETBV using
241+
// CPUID.1:ECX.OSXSAVE bit 27 = 1.
242+
// 2. Verify the processor supports the AVX instruction extensions using:
243+
// CPUID.1:ECX bit 28 = 1.
244+
// 3. Issue XGETBV, and verify that the feature-enabled mask at bits 1 and 2
245+
// are 11b (XMM state and YMM state enabled by the operating system).
246+
247+
248+
// Determine if the CPU supports AVX
249+
have_avx = (ecx & cpuid_have_avx_bit) != 0;
250+
// Determine if the Operating System supports XGETBV
251+
have_xgetbv = (ecx & cpuid_have_xgetbv_bit) != 0;
252+
253+
if (have_avx && have_xgetbv) {
254+
uint64_t xcr0 = xgetbv(0x0);
255+
if ((xcr0 & 0x6) == 0x6)
256+
host_isa |= host_avx_isa;
257+
}
258+
215259
return host_isa;
216260
}
217261
#else // fallback

0 commit comments

Comments
 (0)