Skip to content

Commit 00cb72e

Browse files
committed
Fix detection of AVX2 support
Fixes NLnetLabs#222.
1 parent 380abdb commit 00cb72e

File tree

1 file changed

+74
-29
lines changed

1 file changed

+74
-29
lines changed

src/isadetection.h

+74-29
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*
44
* Slightly modified version of isadetection.h in simdjson.
55
*
6+
* Copyright (c) 2024 NLnet Labs (Jeroen Koekkoek)
67
* Copyright (c) 2020- simdjson (Daniel Lemire,
78
* Geoff Langdale,
89
* John Keiser)
@@ -107,20 +108,22 @@ static inline uint32_t detect_supported_architectures() {
107108
#elif defined(__x86_64__) || defined(_M_AMD64) // x64
108109

109110
// Can be found on Intel ISA Reference for CPUID
110-
static const uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7
111-
static const uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7
112-
static const uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7
113-
static const uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7
114-
static const uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7
115-
static const uint32_t cpuid_avx512ifma_bit = 1 << 21; ///< @private bit 21 of EBX for EAX=0x7
116-
static const uint32_t cpuid_avx512pf_bit = 1 << 26; ///< @private bit 26 of EBX for EAX=0x7
117-
static const uint32_t cpuid_avx512er_bit = 1 << 27; ///< @private bit 27 of EBX for EAX=0x7
118-
static const uint32_t cpuid_avx512cd_bit = 1 << 28; ///< @private bit 28 of EBX for EAX=0x7
119-
static const uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7
120-
static const uint32_t cpuid_avx512vl_bit = 1U << 31; ///< @private bit 31 of EBX for EAX=0x7
121-
static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7
122-
static const uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1
123-
static const uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1
111+
static const uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7
112+
static const uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7
113+
static const uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7
114+
static const uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7
115+
static const uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7
116+
static const uint32_t cpuid_avx512ifma_bit = 1 << 21; ///< @private bit 21 of EBX for EAX=0x7
117+
static const uint32_t cpuid_avx512pf_bit = 1 << 26; ///< @private bit 26 of EBX for EAX=0x7
118+
static const uint32_t cpuid_avx512er_bit = 1 << 27; ///< @private bit 27 of EBX for EAX=0x7
119+
static const uint32_t cpuid_avx512cd_bit = 1 << 28; ///< @private bit 28 of EBX for EAX=0x7
120+
static const uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7
121+
static const uint32_t cpuid_avx512vl_bit = 1U << 31; ///< @private bit 31 of EBX for EAX=0x7
122+
static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7
123+
static const uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1
124+
static const uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1
125+
static const uint32_t cpuid_have_xgetbv_bit = 1 << 27; ///< @private bit 27 of ECX for EAX=0x1
126+
static const uint32_t cpuid_have_avx_bit = 1 << 28; ///< @private bit 28 of ECX for EAX=0x1
124127

125128
static inline void cpuid(
126129
uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
@@ -145,17 +148,26 @@ static inline void cpuid(
145148
#endif
146149
}
147150

148-
static inline uint32_t detect_supported_architectures(void) {
151+
static inline void xgetbv(
152+
uint32_t *edx, uint32_t *eax, uint32_t *ecx)
153+
{
154+
// FIXME: Implement support for _xgetbv intrinsic?
155+
uint32_t a, c = *ecx, d;
156+
asm volatile("xgetbv\n\t" : "=d"(d), "=a"(a), "+c"(c));
157+
*edx = d;
158+
*eax = a;
159+
*ecx = c;
160+
}
161+
162+
static inline uint32_t detect_supported_architectures(void)
163+
{
149164
uint32_t eax, ebx, ecx, edx;
150-
uint32_t host_isa = 0x0;
165+
uint32_t host_isa = 0x0, host_avx_isa = 0x0;
151166

152167
// ECX for EAX=0x7
153168
eax = 0x7;
154169
ecx = 0x0;
155170
cpuid(&eax, &ebx, &ecx, &edx);
156-
if (ebx & cpuid_avx2_bit) {
157-
host_isa |= AVX2;
158-
}
159171
if (ebx & cpuid_bmi1_bit) {
160172
host_isa |= BMI1;
161173
}
@@ -164,46 +176,51 @@ static inline uint32_t detect_supported_architectures(void) {
164176
host_isa |= BMI2;
165177
}
166178

179+
if (ebx & cpuid_avx2_bit) {
180+
host_avx_isa |= AVX2;
181+
}
182+
167183
if (ebx & cpuid_avx512f_bit) {
168-
host_isa |= AVX512F;
184+
host_avx_isa |= AVX512F;
169185
}
170186

171187
if (ebx & cpuid_avx512dq_bit) {
172-
host_isa |= AVX512DQ;
188+
host_avx_isa |= AVX512DQ;
173189
}
174190

175191
if (ebx & cpuid_avx512ifma_bit) {
176-
host_isa |= AVX512IFMA;
192+
host_avx_isa |= AVX512IFMA;
177193
}
178194

179195
if (ebx & cpuid_avx512pf_bit) {
180-
host_isa |= AVX512PF;
196+
host_avx_isa |= AVX512PF;
181197
}
182198

183199
if (ebx & cpuid_avx512er_bit) {
184-
host_isa |= AVX512ER;
200+
host_avx_isa |= AVX512ER;
185201
}
186202

187203
if (ebx & cpuid_avx512cd_bit) {
188-
host_isa |= AVX512CD;
204+
host_avx_isa |= AVX512CD;
189205
}
190206

191207
if (ebx & cpuid_avx512bw_bit) {
192-
host_isa |= AVX512BW;
208+
host_avx_isa |= AVX512BW;
193209
}
194210

195211
if (ebx & cpuid_avx512vl_bit) {
196-
host_isa |= AVX512VL;
212+
host_avx_isa |= AVX512VL;
197213
}
198214

199215
if (ecx & cpuid_avx512vbmi2_bit) {
200-
host_isa |= AVX512VBMI2;
216+
host_avx_isa |= AVX512VBMI2;
201217
}
202218

219+
bool have_avx = false, have_xgetbv = false;
220+
203221
// ECX for EAX=0x1
204222
eax = 0x1;
205223
cpuid(&eax, &ebx, &ecx, &edx);
206-
207224
if (ecx & cpuid_sse42_bit) {
208225
host_isa |= SSE42;
209226
}
@@ -212,6 +229,34 @@ static inline uint32_t detect_supported_architectures(void) {
212229
host_isa |= PCLMULQDQ;
213230
}
214231

232+
// Correct detection of AVX2 support requires more than checking the CPUID
233+
// bit. Peter Cordes provides an excellent answer on Stack Overflow
234+
// (https://stackoverflow.com/a/34071400) quoting the article Introduction
235+
// to Intel Advanced Vector Extensions (search Wayback Machine).
236+
//
237+
// 1. Verify that the operating system supports XGETBV using
238+
// CPUID.1:ECX.OSXSAVE bit 27 = 1.
239+
// 2. Verify the processor supports the AVX instruction extensions using:
240+
// CPUID.1:ECX bit 28 = 1.
241+
// 3. Issue XGETBV, and verify that the feature-enabled mask at bits 1 and 2
242+
// are 11b (XMM state and YMM state enabled by the operating system).
243+
244+
245+
// Determine if the CPU supports AVX
246+
have_avx = (ecx & cpuid_have_avx_bit) != 0;
247+
// Determine if the Operating System supports XGETBV
248+
have_xgetbv = (ecx & cpuid_have_xgetbv_bit) != 0;
249+
250+
if (have_avx && have_xgetbv) {
251+
uint64_t xcr0;
252+
ecx = 0x0;
253+
xgetbv(&edx, &eax, &ecx);
254+
255+
xcr0 = ((uint64_t)edx << 32) | (uint64_t)eax;
256+
if ((xcr0 & 0x6) == 0x6)
257+
host_isa |= host_avx_isa;
258+
}
259+
215260
return host_isa;
216261
}
217262
#else // fallback

0 commit comments

Comments
 (0)