3
3
*
4
4
* Slightly modified version of isadetection.h in simdjson.
5
5
*
6
+ * Copyright (c) 2024 NLnet Labs (Jeroen Koekkoek)
6
7
* Copyright (c) 2020- simdjson (Daniel Lemire,
7
8
* Geoff Langdale,
8
9
* John Keiser)
54
55
#ifndef ISADETECTION_H
55
56
#define ISADETECTION_H
56
57
58
+ #include <stdbool.h>
57
59
#include <stdint.h>
58
60
#include <stdlib.h>
59
61
#if defined(_MSC_VER )
60
62
#include <intrin.h>
63
+ #include <immintrin.h>
61
64
#elif defined(HAVE_CPUID )
62
65
#include <cpuid.h>
63
66
#endif
@@ -107,20 +110,22 @@ static inline uint32_t detect_supported_architectures() {
107
110
#elif defined(__x86_64__ ) || defined(_M_AMD64 ) // x64
108
111
109
112
// Can be found on Intel ISA Reference for CPUID
110
- static const uint32_t cpuid_avx2_bit = 1 << 5 ; ///< @private Bit 5 of EBX for EAX=0x7
111
- static const uint32_t cpuid_bmi1_bit = 1 << 3 ; ///< @private bit 3 of EBX for EAX=0x7
112
- static const uint32_t cpuid_bmi2_bit = 1 << 8 ; ///< @private bit 8 of EBX for EAX=0x7
113
- static const uint32_t cpuid_avx512f_bit = 1 << 16 ; ///< @private bit 16 of EBX for EAX=0x7
114
- static const uint32_t cpuid_avx512dq_bit = 1 << 17 ; ///< @private bit 17 of EBX for EAX=0x7
115
- static const uint32_t cpuid_avx512ifma_bit = 1 << 21 ; ///< @private bit 21 of EBX for EAX=0x7
116
- static const uint32_t cpuid_avx512pf_bit = 1 << 26 ; ///< @private bit 26 of EBX for EAX=0x7
117
- static const uint32_t cpuid_avx512er_bit = 1 << 27 ; ///< @private bit 27 of EBX for EAX=0x7
118
- static const uint32_t cpuid_avx512cd_bit = 1 << 28 ; ///< @private bit 28 of EBX for EAX=0x7
119
- static const uint32_t cpuid_avx512bw_bit = 1 << 30 ; ///< @private bit 30 of EBX for EAX=0x7
120
- static const uint32_t cpuid_avx512vl_bit = 1U << 31 ; ///< @private bit 31 of EBX for EAX=0x7
121
- static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6 ; ///< @private bit 6 of ECX for EAX=0x7
122
- static const uint32_t cpuid_sse42_bit = 1 << 20 ; ///< @private bit 20 of ECX for EAX=0x1
123
- static const uint32_t cpuid_pclmulqdq_bit = 1 << 1 ; ///< @private bit 1 of ECX for EAX=0x1
113
// CPUID feature-flag masks (see the Intel ISA reference for CPUID).

// Leaf EAX=0x7: flags reported in EBX
static const uint32_t cpuid_bmi1_bit = UINT32_C(1) << 3;        ///< @private bit 3 of EBX for EAX=0x7
static const uint32_t cpuid_avx2_bit = UINT32_C(1) << 5;        ///< @private bit 5 of EBX for EAX=0x7
static const uint32_t cpuid_bmi2_bit = UINT32_C(1) << 8;        ///< @private bit 8 of EBX for EAX=0x7
static const uint32_t cpuid_avx512f_bit = UINT32_C(1) << 16;    ///< @private bit 16 of EBX for EAX=0x7
static const uint32_t cpuid_avx512dq_bit = UINT32_C(1) << 17;   ///< @private bit 17 of EBX for EAX=0x7
static const uint32_t cpuid_avx512ifma_bit = UINT32_C(1) << 21; ///< @private bit 21 of EBX for EAX=0x7
static const uint32_t cpuid_avx512pf_bit = UINT32_C(1) << 26;   ///< @private bit 26 of EBX for EAX=0x7
static const uint32_t cpuid_avx512er_bit = UINT32_C(1) << 27;   ///< @private bit 27 of EBX for EAX=0x7
static const uint32_t cpuid_avx512cd_bit = UINT32_C(1) << 28;   ///< @private bit 28 of EBX for EAX=0x7
static const uint32_t cpuid_avx512bw_bit = UINT32_C(1) << 30;   ///< @private bit 30 of EBX for EAX=0x7
static const uint32_t cpuid_avx512vl_bit = UINT32_C(1) << 31;   ///< @private bit 31 of EBX for EAX=0x7

// Leaf EAX=0x7: flags reported in ECX
static const uint32_t cpuid_avx512vbmi2_bit = UINT32_C(1) << 6; ///< @private bit 6 of ECX for EAX=0x7

// Leaf EAX=0x1: flags reported in ECX
static const uint32_t cpuid_pclmulqdq_bit = UINT32_C(1) << 1;    ///< @private bit 1 of ECX for EAX=0x1
static const uint32_t cpuid_sse42_bit = UINT32_C(1) << 20;       ///< @private bit 20 of ECX for EAX=0x1
static const uint32_t cpuid_have_xgetbv_bit = UINT32_C(1) << 27; ///< @private bit 27 of ECX for EAX=0x1
static const uint32_t cpuid_have_avx_bit = UINT32_C(1) << 28;    ///< @private bit 28 of ECX for EAX=0x1
124
129
125
130
static inline void cpuid (
126
131
uint32_t * eax , uint32_t * ebx , uint32_t * ecx , uint32_t * edx )
@@ -145,17 +150,27 @@ static inline void cpuid(
145
150
#endif
146
151
}
147
152
148
- static inline uint32_t detect_supported_architectures (void ) {
153
/**
 * Read an extended control register using the XGETBV instruction.
 *
 * @param ecx  Index of the extended control register to read (loaded into
 *             ECX before the instruction executes); 0x0 selects XCR0.
 * @return     The 64-bit register value, combined from EDX (upper 32 bits)
 *             and EAX (lower 32 bits).
 *
 * The caller must first confirm that CPUID.1:ECX.OSXSAVE (bit 27) is set;
 * XGETBV must not be executed otherwise.
 */
static inline uint64_t xgetbv(uint32_t ecx)
{
#if defined(_MSC_VER)
  return _xgetbv(ecx);
#else
  uint32_t lo, hi;
  // Use the __asm__/__volatile__ spellings: the plain `asm` keyword is a GNU
  // extension and is not recognized in strict ISO modes (-std=c99/-std=c11).
  __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(ecx));
  return ((uint64_t)hi << 32) | (uint64_t)lo;
#endif
}
164
+
165
+ static inline uint32_t detect_supported_architectures (void )
166
+ {
149
167
uint32_t eax , ebx , ecx , edx ;
150
- uint32_t host_isa = 0x0 ;
168
+ uint32_t host_isa = 0x0 , host_avx_isa = 0x0 ;
151
169
152
170
// ECX for EAX=0x7
153
171
eax = 0x7 ;
154
172
ecx = 0x0 ;
155
173
cpuid (& eax , & ebx , & ecx , & edx );
156
- if (ebx & cpuid_avx2_bit ) {
157
- host_isa |= AVX2 ;
158
- }
159
174
if (ebx & cpuid_bmi1_bit ) {
160
175
host_isa |= BMI1 ;
161
176
}
@@ -164,46 +179,51 @@ static inline uint32_t detect_supported_architectures(void) {
164
179
host_isa |= BMI2 ;
165
180
}
166
181
182
+ if (ebx & cpuid_avx2_bit ) {
183
+ host_avx_isa |= AVX2 ;
184
+ }
185
+
167
186
if (ebx & cpuid_avx512f_bit ) {
168
- host_isa |= AVX512F ;
187
+ host_avx_isa |= AVX512F ;
169
188
}
170
189
171
190
if (ebx & cpuid_avx512dq_bit ) {
172
- host_isa |= AVX512DQ ;
191
+ host_avx_isa |= AVX512DQ ;
173
192
}
174
193
175
194
if (ebx & cpuid_avx512ifma_bit ) {
176
- host_isa |= AVX512IFMA ;
195
+ host_avx_isa |= AVX512IFMA ;
177
196
}
178
197
179
198
if (ebx & cpuid_avx512pf_bit ) {
180
- host_isa |= AVX512PF ;
199
+ host_avx_isa |= AVX512PF ;
181
200
}
182
201
183
202
if (ebx & cpuid_avx512er_bit ) {
184
- host_isa |= AVX512ER ;
203
+ host_avx_isa |= AVX512ER ;
185
204
}
186
205
187
206
if (ebx & cpuid_avx512cd_bit ) {
188
- host_isa |= AVX512CD ;
207
+ host_avx_isa |= AVX512CD ;
189
208
}
190
209
191
210
if (ebx & cpuid_avx512bw_bit ) {
192
- host_isa |= AVX512BW ;
211
+ host_avx_isa |= AVX512BW ;
193
212
}
194
213
195
214
if (ebx & cpuid_avx512vl_bit ) {
196
- host_isa |= AVX512VL ;
215
+ host_avx_isa |= AVX512VL ;
197
216
}
198
217
199
218
if (ecx & cpuid_avx512vbmi2_bit ) {
200
- host_isa |= AVX512VBMI2 ;
219
+ host_avx_isa |= AVX512VBMI2 ;
201
220
}
202
221
222
+ bool have_avx = false, have_xgetbv = false;
223
+
203
224
// ECX for EAX=0x1
204
225
eax = 0x1 ;
205
226
cpuid (& eax , & ebx , & ecx , & edx );
206
-
207
227
if (ecx & cpuid_sse42_bit ) {
208
228
host_isa |= SSE42 ;
209
229
}
@@ -212,6 +232,30 @@ static inline uint32_t detect_supported_architectures(void) {
212
232
host_isa |= PCLMULQDQ ;
213
233
}
214
234
235
+ // Correct detection of AVX2 support requires more than checking the CPUID
236
+ // bit. Peter Cordes provides an excellent answer on Stack Overflow
237
+ // (https://stackoverflow.com/a/34071400) quoting the article Introduction
238
+ // to Intel Advanced Vector Extensions (search Wayback Machine).
239
+ //
240
+ // 1. Verify that the operating system supports XGETBV using
241
+ // CPUID.1:ECX.OSXSAVE bit 27 = 1.
242
+ // 2. Verify the processor supports the AVX instruction extensions using:
243
+ // CPUID.1:ECX bit 28 = 1.
244
+ // 3. Issue XGETBV, and verify that the feature-enabled mask at bits 1 and 2
245
+ // are 11b (XMM state and YMM state enabled by the operating system).
246
+
247
+
248
+ // Determine if the CPU supports AVX
249
+ have_avx = (ecx & cpuid_have_avx_bit ) != 0 ;
250
+ // Determine if the Operating System supports XGETBV
251
+ have_xgetbv = (ecx & cpuid_have_xgetbv_bit ) != 0 ;
252
+
253
+ if (have_avx && have_xgetbv ) {
254
+ uint64_t xcr0 = xgetbv (0x0 );
255
+ if ((xcr0 & 0x6 ) == 0x6 )
256
+ host_isa |= host_avx_isa ;
257
+ }
258
+
215
259
return host_isa ;
216
260
}
217
261
#else // fallback
0 commit comments