@@ -3,6 +3,7 @@
  *
  * Slightly modified version of isadetection.h in simdjson.
  *
+ * Copyright (c) 2024 NLnet Labs (Jeroen Koekkoek)
  * Copyright (c) 2020- simdjson (Daniel Lemire,
  *                               Geoff Langdale,
  *                               John Keiser)
@@ -107,20 +108,22 @@ static inline uint32_t detect_supported_architectures() {
 #elif defined(__x86_64__) || defined(_M_AMD64) // x64
 
 // Can be found on Intel ISA Reference for CPUID
-static const uint32_t cpuid_avx2_bit = 1 << 5;         ///< @private Bit 5 of EBX for EAX=0x7
-static const uint32_t cpuid_bmi1_bit = 1 << 3;         ///< @private bit 3 of EBX for EAX=0x7
-static const uint32_t cpuid_bmi2_bit = 1 << 8;         ///< @private bit 8 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512f_bit = 1 << 16;     ///< @private bit 16 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512dq_bit = 1 << 17;    ///< @private bit 17 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512ifma_bit = 1 << 21;  ///< @private bit 21 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512pf_bit = 1 << 26;    ///< @private bit 26 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512er_bit = 1 << 27;    ///< @private bit 27 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512cd_bit = 1 << 28;    ///< @private bit 28 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512bw_bit = 1 << 30;    ///< @private bit 30 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512vl_bit = 1U << 31;   ///< @private bit 31 of EBX for EAX=0x7
-static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6;  ///< @private bit 6 of ECX for EAX=0x7
-static const uint32_t cpuid_sse42_bit = 1 << 20;       ///< @private bit 20 of ECX for EAX=0x1
-static const uint32_t cpuid_pclmulqdq_bit = 1 << 1;    ///< @private bit 1 of ECX for EAX=0x1
+static const uint32_t cpuid_avx2_bit = 1 << 5;          ///< @private Bit 5 of EBX for EAX=0x7
+static const uint32_t cpuid_bmi1_bit = 1 << 3;          ///< @private bit 3 of EBX for EAX=0x7
+static const uint32_t cpuid_bmi2_bit = 1 << 8;          ///< @private bit 8 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512f_bit = 1 << 16;      ///< @private bit 16 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512dq_bit = 1 << 17;     ///< @private bit 17 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512ifma_bit = 1 << 21;   ///< @private bit 21 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512pf_bit = 1 << 26;     ///< @private bit 26 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512er_bit = 1 << 27;     ///< @private bit 27 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512cd_bit = 1 << 28;     ///< @private bit 28 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512bw_bit = 1 << 30;     ///< @private bit 30 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512vl_bit = 1U << 31;    ///< @private bit 31 of EBX for EAX=0x7
+static const uint32_t cpuid_avx512vbmi2_bit = 1 << 6;   ///< @private bit 6 of ECX for EAX=0x7
+static const uint32_t cpuid_sse42_bit = 1 << 20;        ///< @private bit 20 of ECX for EAX=0x1
+static const uint32_t cpuid_pclmulqdq_bit = 1 << 1;     ///< @private bit 1 of ECX for EAX=0x1
+static const uint32_t cpuid_have_xgetbv_bit = 1 << 27;  ///< @private bit 27 of ECX for EAX=0x1
+static const uint32_t cpuid_have_avx_bit = 1 << 28;     ///< @private bit 28 of ECX for EAX=0x1
 
 static inline void cpuid(
   uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
@@ -145,17 +148,26 @@ static inline void cpuid(
 #endif
 }
 
-static inline uint32_t detect_supported_architectures(void) {
+static inline void xgetbv(
+  uint32_t *edx, uint32_t *eax, uint32_t *ecx)
+{
+  // FIXME: Implement support for _xgetbv intrinsic?
+  uint32_t a, c = *ecx, d;
+  asm volatile ("xgetbv\n\t" : "=d" (d), "=a" (a), "+c" (c));
+  *edx = d;
+  *eax = a;
+  *ecx = c;
+}
+
+static inline uint32_t detect_supported_architectures(void)
+{
   uint32_t eax, ebx, ecx, edx;
-  uint32_t host_isa = 0x0;
+  uint32_t host_isa = 0x0, host_avx_isa = 0x0;
 
   // ECX for EAX=0x7
   eax = 0x7;
   ecx = 0x0;
   cpuid(&eax, &ebx, &ecx, &edx);
-  if (ebx & cpuid_avx2_bit) {
-    host_isa |= AVX2;
-  }
   if (ebx & cpuid_bmi1_bit) {
     host_isa |= BMI1;
   }
@@ -164,46 +176,51 @@ static inline uint32_t detect_supported_architectures(void) {
     host_isa |= BMI2;
   }
 
+  if (ebx & cpuid_avx2_bit) {
+    host_avx_isa |= AVX2;
+  }
+
   if (ebx & cpuid_avx512f_bit) {
-    host_isa |= AVX512F;
+    host_avx_isa |= AVX512F;
   }
 
   if (ebx & cpuid_avx512dq_bit) {
-    host_isa |= AVX512DQ;
+    host_avx_isa |= AVX512DQ;
   }
 
   if (ebx & cpuid_avx512ifma_bit) {
-    host_isa |= AVX512IFMA;
+    host_avx_isa |= AVX512IFMA;
   }
 
   if (ebx & cpuid_avx512pf_bit) {
-    host_isa |= AVX512PF;
+    host_avx_isa |= AVX512PF;
   }
 
   if (ebx & cpuid_avx512er_bit) {
-    host_isa |= AVX512ER;
+    host_avx_isa |= AVX512ER;
   }
 
   if (ebx & cpuid_avx512cd_bit) {
-    host_isa |= AVX512CD;
+    host_avx_isa |= AVX512CD;
  }
 
   if (ebx & cpuid_avx512bw_bit) {
-    host_isa |= AVX512BW;
+    host_avx_isa |= AVX512BW;
   }
 
   if (ebx & cpuid_avx512vl_bit) {
-    host_isa |= AVX512VL;
+    host_avx_isa |= AVX512VL;
   }
 
   if (ecx & cpuid_avx512vbmi2_bit) {
-    host_isa |= AVX512VBMI2;
+    host_avx_isa |= AVX512VBMI2;
   }
 
+  bool have_avx = false, have_xgetbv = false;
+
   // EBX for EAX=0x1
   eax = 0x1;
   cpuid(&eax, &ebx, &ecx, &edx);
-
   if (ecx & cpuid_sse42_bit) {
     host_isa |= SSE42;
   }
@@ -212,6 +229,34 @@ static inline uint32_t detect_supported_architectures(void) {
     host_isa |= PCLMULQDQ;
   }
 
+  // Correct detection of AVX2 support requires more than checking the CPUID
+  // bit. Peter Cordes provides an excellent answer on Stack Overflow
+  // (https://stackoverflow.com/a/34071400) quoting the article Introduction
+  // to Intel Advanced Vector Extensions (search Wayback Machine).
+  //
+  // 1. Verify that the operating system supports XGETBV using
+  //    CPUID.1:ECX.OSXSAVE bit 27 = 1.
+  // 2. Verify the processor supports the AVX instruction extensions using:
+  //    CPUID.1:ECX bit 28 = 1.
+  // 3. Issue XGETBV, and verify that the feature-enabled mask at bits 1 and 2
+  //    are 11b (XMM state and YMM state enabled by the operating system).
+
+
+  // Determine if the CPU supports AVX
+  have_avx = (ecx & cpuid_have_avx_bit) != 0;
+  // Determine if the Operating System supports XGETBV
+  have_xgetbv = (ecx & cpuid_have_xgetbv_bit) != 0;
+
+  if (have_avx && have_xgetbv) {
+    uint64_t xcr0;
+    ecx = 0x0;
+    xgetbv(&edx, &eax, &ecx);
+
+    xcr0 = ((uint64_t)edx << 32) | (uint64_t)eax;
+    if ((xcr0 & 0x6) == 0x6)
+      host_isa |= host_avx_isa;
+  }
+
   return host_isa;
 }
 #else // fallback
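Note (not part of the patch): the FIXME in the new xgetbv() helper asks about the _xgetbv intrinsic. As a rough sketch under stated assumptions — MSVC declares _xgetbv in <intrin.h>, and GCC/Clang declare it in <immintrin.h> only when the xsave feature is enabled (__XSAVE__) — the inline assembly could be wrapped roughly as below. The helper name read_xcr0 is made up for illustration, it assumes it sits next to the xgetbv() added in this patch, and the preprocessor guards would need to be verified against the compilers the project supports:

#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>     // assumption: MSVC provides _xgetbv here
#elif defined(__GNUC__) && defined(__XSAVE__)
#include <immintrin.h>  // assumption: GCC/Clang provide _xgetbv when built with -mxsave
#endif

// Hypothetical helper: read XCR0 via the intrinsic where available,
// otherwise fall back to the inline-asm xgetbv() from the patch above.
static inline uint64_t read_xcr0(void)
{
#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__XSAVE__))
  return _xgetbv(0);  // 0 selects XCR0, the feature-enabled mask
#else
  uint32_t eax, edx, ecx = 0x0;
  xgetbv(&edx, &eax, &ecx);
  return ((uint64_t)edx << 32) | (uint64_t)eax;
#endif
}

The detection code would then test (read_xcr0() & 0x6) == 0x6 exactly as the patch does with the manually assembled xcr0 value.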
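For context, a minimal usage sketch (not from the repository): detect_supported_architectures() returns a bitmask of the ISA flags tested above (SSE42, PCLMULQDQ, BMI1, BMI2, AVX2, AVX512*), and with this patch the AVX-class flags are only reported when XCR0 confirms the operating system saves XMM and YMM state. A caller selecting a kernel at start-up might look like this; the include path and kernel names are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include "isadetection.h"  // hypothetical path for the header shown in this diff

static void parse_avx2(void)     { puts("AVX2 kernel"); }
static void parse_sse42(void)    { puts("SSE4.2 kernel"); }
static void parse_fallback(void) { puts("portable kernel"); }

int main(void)
{
  const uint32_t isa = detect_supported_architectures();

  // Prefer the widest ISA that both the CPU and the OS support.
  if (isa & AVX2)
    parse_avx2();
  else if (isa & SSE42)
    parse_sse42();
  else
    parse_fallback();
  return 0;
}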