@@ -114,6 +114,35 @@ namespace xsimd
114
114
#endif
115
115
116
116
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
117
+
118
// Returns the low 32 bits of the XCR0 extended control register, read with
// the XGETBV instruction (register index 0).
//
// XGETBV is only usable once the OS has set CR4.OSXSAVE, so callers must
// check the OSXSAVE bit (CPUID leaf 1, ECX bit 27) before invoking this.
auto get_xcr0_low = []() noexcept
{
    uint32_t xcr0;

#if defined(_MSC_VER) && _MSC_VER >= 1400

    // The intrinsic yields the whole register; only the low half is kept.
    xcr0 = static_cast<uint32_t>(_xgetbv(0));

#elif defined(__GNUC__)

    // XGETBV takes the register index in ECX and returns the value in
    // EDX:EAX. Expressing all three registers as operands lets the compiler
    // do the register allocation, so no architecture-specific clobber list
    // (ecx/edx vs rcx/rdx) is needed.
    uint32_t xcr0_high;
    __asm__("xgetbv"
            : "=a"(xcr0), "=d"(xcr0_high)
            : "c"(0));
    (void)xcr0_high; // the upper half is intentionally discarded

#else /* neither MSVC >= 1400 nor a GNU-compatible compiler */
#error "XGETBV is not supported by this compiler (need MSVC >= 1400 or a GNU-compatible compiler)"
#endif
    return xcr0;
};
145
+
117
146
auto get_cpuid = [](int reg[4 ], int level, int count = 0 ) noexcept
118
147
{
119
148
@@ -148,43 +177,67 @@ namespace xsimd
148
177
149
178
get_cpuid (regs1, 0x1 );
150
179
151
- sse2 = regs1[3 ] >> 26 & 1 ;
152
- sse3 = regs1[2 ] >> 0 & 1 ;
153
- ssse3 = regs1[2 ] >> 9 & 1 ;
154
- sse4_1 = regs1[2 ] >> 19 & 1 ;
155
- sse4_2 = regs1[2 ] >> 20 & 1 ;
156
- fma3_sse42 = regs1[2 ] >> 12 & 1 ;
180
+ // The OS can explicitly disable the usage of SSE/AVX extensions
181
+ // by leaving the corresponding state-enable bits cleared in the XCR0 register
182
+ //
183
+ // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html
184
+
185
+ unsigned sse_state_os_enabled = 1 ;
186
+ unsigned avx_state_os_enabled = 1 ;
187
+ unsigned avx512_state_os_enabled = 1 ;
188
+
189
+ // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit
190
+ // 18] to enable XSETBV/XGETBV instructions to access XCR0 and
191
+ // to support processor extended state management using
192
+ // XSAVE/XRSTOR.
193
+ bool osxsave = regs1[2 ] >> 27 & 1 ;
194
+ if (osxsave)
195
+ {
196
+
197
+ uint32_t xcr0 = get_xcr0_low ();
198
+
199
+ sse_state_os_enabled = xcr0 >> 1 & 1 ;
200
+ avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled;
201
+ avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled;
202
+ }
203
+
204
+ sse2 = regs1[3 ] >> 26 & sse_state_os_enabled;
205
+ sse3 = regs1[2 ] >> 0 & sse_state_os_enabled;
206
+ ssse3 = regs1[2 ] >> 9 & sse_state_os_enabled;
207
+ sse4_1 = regs1[2 ] >> 19 & sse_state_os_enabled;
208
+ sse4_2 = regs1[2 ] >> 20 & sse_state_os_enabled;
209
+ fma3_sse42 = regs1[2 ] >> 12 & sse_state_os_enabled;
157
210
158
- avx = regs1[2 ] >> 28 & 1 ;
211
+ avx = regs1[2 ] >> 28 & avx_state_os_enabled ;
159
212
fma3_avx = avx && fma3_sse42;
160
213
161
214
int regs8[4 ];
162
215
get_cpuid (regs8, 0x80000001 );
163
- fma4 = regs8[2 ] >> 16 & 1 ;
216
+ fma4 = regs8[2 ] >> 16 & avx_state_os_enabled ;
164
217
165
218
// sse4a = regs[2] >> 6 & 1;
166
219
167
220
// xop = regs[2] >> 11 & 1;
168
221
169
222
int regs7[4 ];
170
223
get_cpuid (regs7, 0x7 );
171
- avx2 = regs7[1 ] >> 5 & 1 ;
224
+ avx2 = regs7[1 ] >> 5 & avx_state_os_enabled ;
172
225
173
226
int regs7a[4 ];
174
227
get_cpuid (regs7a, 0x7 , 0x1 );
175
- avxvnni = regs7a[0 ] >> 4 & 1 ;
228
+ avxvnni = regs7a[0 ] >> 4 & avx_state_os_enabled ;
176
229
177
230
fma3_avx2 = avx2 && fma3_sse42;
178
231
179
- avx512f = regs7[1 ] >> 16 & 1 ;
180
- avx512cd = regs7[1 ] >> 28 & 1 ;
181
- avx512dq = regs7[1 ] >> 17 & 1 ;
182
- avx512bw = regs7[1 ] >> 30 & 1 ;
183
- avx512er = regs7[1 ] >> 27 & 1 ;
184
- avx512pf = regs7[1 ] >> 26 & 1 ;
185
- avx512ifma = regs7[1 ] >> 21 & 1 ;
186
- avx512vbmi = regs7[2 ] >> 1 & 1 ;
187
- avx512vnni_bw = regs7[2 ] >> 11 & 1 ;
232
+ avx512f = regs7[1 ] >> 16 & avx512_state_os_enabled ;
233
+ avx512cd = regs7[1 ] >> 28 & avx512_state_os_enabled ;
234
+ avx512dq = regs7[1 ] >> 17 & avx512_state_os_enabled ;
235
+ avx512bw = regs7[1 ] >> 30 & avx512_state_os_enabled ;
236
+ avx512er = regs7[1 ] >> 27 & avx512_state_os_enabled ;
237
+ avx512pf = regs7[1 ] >> 26 & avx512_state_os_enabled ;
238
+ avx512ifma = regs7[1 ] >> 21 & avx512_state_os_enabled ;
239
+ avx512vbmi = regs7[2 ] >> 1 & avx512_state_os_enabled ;
240
+ avx512vnni_bw = regs7[2 ] >> 11 & avx512_state_os_enabled ;
188
241
avx512vnni_vbmi = avx512vbmi && avx512vnni_bw;
189
242
#endif
190
243
}
0 commit comments