33 *
44 * Slightly modified version of isadetection.h in simdjson.
55 *
6+ * Copyright (c) 2024 NLnet Labs (Jeroen Koekkoek)
67 * Copyright (c) 2020- simdjson (Daniel Lemire,
78 * Geoff Langdale,
89 * John Keiser)
5455#ifndef ISADETECTION_H
5556#define ISADETECTION_H
5657
58+ #include <stdbool.h>
5759#include <stdint.h>
5860#include <stdlib.h>
5961#if defined(_MSC_VER )
6062#include <intrin.h>
63+ #include <immintrin.h>
6164#elif defined(HAVE_CPUID )
6265#include <cpuid.h>
6366#endif
@@ -107,20 +110,22 @@ static inline uint32_t detect_supported_architectures() {
107110#elif defined(__x86_64__ ) || defined(_M_AMD64 ) // x64
108111
109112// Can be found on Intel ISA Reference for CPUID
110- static const uint32_t cpuid_avx2_bit = 1 << 5 ; ///< @private Bit 5 of EBX for EAX=0x7
111- static const uint32_t cpuid_bmi1_bit = 1 << 3 ; ///< @private bit 3 of EBX for EAX=0x7
112- static const uint32_t cpuid_bmi2_bit = 1 << 8 ; ///< @private bit 8 of EBX for EAX=0x7
113- static const uint32_t cpuid_avx512f_bit = 1 << 16 ; ///< @private bit 16 of EBX for EAX=0x7
114- static const uint32_t cpuid_avx512dq_bit = 1 << 17 ; ///< @private bit 17 of EBX for EAX=0x7
115- static const uint32_t cpuid_avx512ifma_bit = 1 << 21 ; ///< @private bit 21 of EBX for EAX=0x7
116- static const uint32_t cpuid_avx512pf_bit = 1 << 26 ; ///< @private bit 26 of EBX for EAX=0x7
117- static const uint32_t cpuid_avx512er_bit = 1 << 27 ; ///< @private bit 27 of EBX for EAX=0x7
118- static const uint32_t cpuid_avx512cd_bit = 1 << 28 ; ///< @private bit 28 of EBX for EAX=0x7
119- static const uint32_t cpuid_avx512bw_bit = 1 << 30 ; ///< @private bit 30 of EBX for EAX=0x7
120- static const uint32_t cpuid_avx512vl_bit = 1U << 31 ; ///< @private bit 31 of EBX for EAX=0x7
121- static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6 ; ///< @private bit 6 of ECX for EAX=0x7
122- static const uint32_t cpuid_sse42_bit = 1 << 20 ; ///< @private bit 20 of ECX for EAX=0x1
123- static const uint32_t cpuid_pclmulqdq_bit = 1 << 1 ; ///< @private bit 1 of ECX for EAX=0x1
113+ static const uint32_t cpuid_avx2_bit = 1 << 5 ; ///< @private Bit 5 of EBX for EAX=0x7
114+ static const uint32_t cpuid_bmi1_bit = 1 << 3 ; ///< @private bit 3 of EBX for EAX=0x7
115+ static const uint32_t cpuid_bmi2_bit = 1 << 8 ; ///< @private bit 8 of EBX for EAX=0x7
116+ static const uint32_t cpuid_avx512f_bit = 1 << 16 ; ///< @private bit 16 of EBX for EAX=0x7
117+ static const uint32_t cpuid_avx512dq_bit = 1 << 17 ; ///< @private bit 17 of EBX for EAX=0x7
118+ static const uint32_t cpuid_avx512ifma_bit = 1 << 21 ; ///< @private bit 21 of EBX for EAX=0x7
119+ static const uint32_t cpuid_avx512pf_bit = 1 << 26 ; ///< @private bit 26 of EBX for EAX=0x7
120+ static const uint32_t cpuid_avx512er_bit = 1 << 27 ; ///< @private bit 27 of EBX for EAX=0x7
121+ static const uint32_t cpuid_avx512cd_bit = 1 << 28 ; ///< @private bit 28 of EBX for EAX=0x7
122+ static const uint32_t cpuid_avx512bw_bit = 1 << 30 ; ///< @private bit 30 of EBX for EAX=0x7
123+ static const uint32_t cpuid_avx512vl_bit = 1U << 31 ; ///< @private bit 31 of EBX for EAX=0x7
124+ static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6 ; ///< @private bit 6 of ECX for EAX=0x7
125+ static const uint32_t cpuid_sse42_bit = 1 << 20 ; ///< @private bit 20 of ECX for EAX=0x1
126+ static const uint32_t cpuid_pclmulqdq_bit = 1 << 1 ; ///< @private bit 1 of ECX for EAX=0x1
127+ static const uint32_t cpuid_have_xgetbv_bit = 1 << 27 ; ///< @private bit 27 of ECX for EAX=0x1
128+ static const uint32_t cpuid_have_avx_bit = 1 << 28 ; ///< @private bit 28 of ECX for EAX=0x1
124129
125130static inline void cpuid (
126131 uint32_t * eax , uint32_t * ebx , uint32_t * ecx , uint32_t * edx )
@@ -145,17 +150,27 @@ static inline void cpuid(
145150#endif
146151}
147152
148- static inline uint32_t detect_supported_architectures (void ) {
153+ static inline uint64_t xgetbv (uint32_t ecx )
154+ {
155+ #if defined(_MSC_VER )
156+ return _xgetbv (ecx );
157+ #else
158+ uint32_t a , c = ecx , d ;
159+ asm volatile ("xgetbv\n\t" : "=d" (d ), "=a" (a ) : "c" (c ));
160+ uint64_t xcr0 = ((uint64_t )d << 32 ) | (uint64_t )a ;
161+ return xcr0 ;
162+ #endif
163+ }
164+
165+ static inline uint32_t detect_supported_architectures (void )
166+ {
149167 uint32_t eax , ebx , ecx , edx ;
150- uint32_t host_isa = 0x0 ;
168+ uint32_t host_isa = 0x0 , host_avx_isa = 0x0 ;
151169
152170 // ECX for EAX=0x7
153171 eax = 0x7 ;
154172 ecx = 0x0 ;
155173 cpuid (& eax , & ebx , & ecx , & edx );
156- if (ebx & cpuid_avx2_bit ) {
157- host_isa |= AVX2 ;
158- }
159174 if (ebx & cpuid_bmi1_bit ) {
160175 host_isa |= BMI1 ;
161176 }
@@ -164,46 +179,51 @@ static inline uint32_t detect_supported_architectures(void) {
164179 host_isa |= BMI2 ;
165180 }
166181
182+ if (ebx & cpuid_avx2_bit ) {
183+ host_avx_isa |= AVX2 ;
184+ }
185+
167186 if (ebx & cpuid_avx512f_bit ) {
168- host_isa |= AVX512F ;
187+ host_avx_isa |= AVX512F ;
169188 }
170189
171190 if (ebx & cpuid_avx512dq_bit ) {
172- host_isa |= AVX512DQ ;
191+ host_avx_isa |= AVX512DQ ;
173192 }
174193
175194 if (ebx & cpuid_avx512ifma_bit ) {
176- host_isa |= AVX512IFMA ;
195+ host_avx_isa |= AVX512IFMA ;
177196 }
178197
179198 if (ebx & cpuid_avx512pf_bit ) {
180- host_isa |= AVX512PF ;
199+ host_avx_isa |= AVX512PF ;
181200 }
182201
183202 if (ebx & cpuid_avx512er_bit ) {
184- host_isa |= AVX512ER ;
203+ host_avx_isa |= AVX512ER ;
185204 }
186205
187206 if (ebx & cpuid_avx512cd_bit ) {
188- host_isa |= AVX512CD ;
207+ host_avx_isa |= AVX512CD ;
189208 }
190209
191210 if (ebx & cpuid_avx512bw_bit ) {
192- host_isa |= AVX512BW ;
211+ host_avx_isa |= AVX512BW ;
193212 }
194213
195214 if (ebx & cpuid_avx512vl_bit ) {
196- host_isa |= AVX512VL ;
215+ host_avx_isa |= AVX512VL ;
197216 }
198217
199218 if (ecx & cpuid_avx512vbmi2_bit ) {
200- host_isa |= AVX512VBMI2 ;
219+ host_avx_isa |= AVX512VBMI2 ;
201220 }
202221
222+ bool have_avx = false, have_xgetbv = false;
223+
203224 // EBX for EAX=0x1
204225 eax = 0x1 ;
205226 cpuid (& eax , & ebx , & ecx , & edx );
206-
207227 if (ecx & cpuid_sse42_bit ) {
208228 host_isa |= SSE42 ;
209229 }
@@ -212,6 +232,30 @@ static inline uint32_t detect_supported_architectures(void) {
212232 host_isa |= PCLMULQDQ ;
213233 }
214234
235+ // Correct detection of AVX2 support requires more than checking the CPUID
236+ // bit. Peter Cordes provides an excellent answer on Stack Overflow
237+ // (https://stackoverflow.com/a/34071400) quoting the article Introduction
238+ // to Intel Advanced Vector Extensions (search Wayback Machine).
239+ //
240+ // 1. Verify that the operating system supports XGETBV using
241+ // CPUID.1:ECX.OSXSAVE bit 27 = 1.
242+ // 2. Verify the processor supports the AVX instruction extensions using:
243+ // CPUID.1:ECX bit 28 = 1.
244+ // 3. Issue XGETBV, and verify that the feature-enabled mask at bits 1 and 2
245+ // are 11b (XMM state and YMM state enabled by the operating system).
246+
247+
248+ // Determine if the CPU supports AVX
249+ have_avx = (ecx & cpuid_have_avx_bit ) != 0 ;
250+ // Determine if the Operating System supports XGETBV
251+ have_xgetbv = (ecx & cpuid_have_xgetbv_bit ) != 0 ;
252+
253+ if (have_avx && have_xgetbv ) {
254+ uint64_t xcr0 = xgetbv (0x0 );
255+ if ((xcr0 & 0x6 ) == 0x6 )
256+ host_isa |= host_avx_isa ;
257+ }
258+
215259 return host_isa ;
216260}
217261#else // fallback
0 commit comments