Skip to content

Commit 4b03a27

Browse files
Use __attribute__((target(...))) for SSE4.2 CRC-32C support.
Presently, we check for compiler support for the required intrinsics both with and without the -msse4.2 compiler flag, and then depending on the results of those checks, we pick which files to compile with which flags. This is tedious and complicated, and it results in unsustainable coding patterns such as separate files for each portion of code that may need to be built with different compiler flags.

This commit makes use of the newly-added support for __attribute__((target(...))) in the SSE4.2 CRC-32C code. This simplifies both the configure-time checks and the build scripts, and it allows us to place the functions that use the intrinsics in files that we otherwise do not want to build with special CPU instructions (although this commit refrains from doing so). This is also preparatory work for a proposed follow-up commit that will further optimize the CRC-32C code with AVX-512 instructions.

While at it, this commit modifies meson's checks for SSE4.2 CRC support to be the same as autoconf's. meson was choosing whether to use a runtime check based purely on whether -msse4.2 is required, while autoconf has long checked for the __SSE4_2__ preprocessor symbol to decide. meson's previous approach seems to work just fine, but this change avoids needing to build multiple test programs and to keep track of whether to actually use pg_attribute_target().

Ideally we'd use __attribute__((target(...))) for ARMv8 CRC support, too, but there's little point in doing so because until clang 16, using the ARM intrinsics still requires special compiler flags. Perhaps we can re-evaluate this decision after some time has passed.

Author: Raghuveer Devulapalli
Discussion: https://postgr.es/m/PH8PR11MB8286BE735A463468415D46B5FB5C2%40PH8PR11MB8286.namprd11.prod.outlook.com
1 parent 6ba9892 commit 4b03a27

File tree

7 files changed

+72
-102
lines changed

7 files changed

+72
-102
lines changed

Diff for: config/c-compiler.m4

+17-15
Original file line numberDiff line numberDiff line change
@@ -605,24 +605,26 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
605605
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
606606
# the other ones are, on x86-64 platforms)
607607
#
608-
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
609-
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
608+
# If the intrinsics are supported, sets pgac_sse42_crc32_intrinsics.
610609
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
611-
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
612-
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
613-
[pgac_save_CFLAGS=$CFLAGS
614-
CFLAGS="$pgac_save_CFLAGS $1"
615-
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
616-
[unsigned int crc = 0;
617-
crc = _mm_crc32_u8(crc, 0);
618-
crc = _mm_crc32_u32(crc, 0);
619-
/* return computed value, to prevent the above being optimized away */
620-
return crc == 0;])],
610+
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
611+
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar],
612+
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
613+
#if defined(__has_attribute) && __has_attribute (target)
614+
__attribute__((target("sse4.2")))
615+
#endif
616+
static int crc32_sse42_test(void)
617+
{
618+
unsigned int crc = 0;
619+
crc = _mm_crc32_u8(crc, 0);
620+
crc = _mm_crc32_u32(crc, 0);
621+
/* return computed value, to prevent the above being optimized away */
622+
return crc == 0;
623+
}],
624+
[return crc32_sse42_test();])],
621625
[Ac_cachevar=yes],
622-
[Ac_cachevar=no])
623-
CFLAGS="$pgac_save_CFLAGS"])
626+
[Ac_cachevar=no])])
624627
if test x"$Ac_cachevar" = x"yes"; then
625-
CFLAGS_CRC="$1"
626628
pgac_sse42_crc32_intrinsics=yes
627629
fi
628630
undefine([Ac_cachevar])dnl

Diff for: configure

+29-64
Original file line numberDiff line numberDiff line change
@@ -17364,87 +17364,47 @@ fi
1736417364

1736517365
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
1736617366
#
17367-
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
17368-
# with the default compiler flags. If not, check if adding the -msse4.2
17369-
# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
17370-
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
17371-
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
17372-
if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
17367+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
17368+
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; }
17369+
if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
1737317370
$as_echo_n "(cached) " >&6
1737417371
else
17375-
pgac_save_CFLAGS=$CFLAGS
17376-
CFLAGS="$pgac_save_CFLAGS "
17377-
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17378-
/* end confdefs.h. */
17379-
#include <nmmintrin.h>
17380-
int
17381-
main ()
17382-
{
17383-
unsigned int crc = 0;
17384-
crc = _mm_crc32_u8(crc, 0);
17385-
crc = _mm_crc32_u32(crc, 0);
17386-
/* return computed value, to prevent the above being optimized away */
17387-
return crc == 0;
17388-
;
17389-
return 0;
17390-
}
17391-
_ACEOF
17392-
if ac_fn_c_try_link "$LINENO"; then :
17393-
pgac_cv_sse42_crc32_intrinsics_=yes
17394-
else
17395-
pgac_cv_sse42_crc32_intrinsics_=no
17396-
fi
17397-
rm -f core conftest.err conftest.$ac_objext \
17398-
conftest$ac_exeext conftest.$ac_ext
17399-
CFLAGS="$pgac_save_CFLAGS"
17400-
fi
17401-
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
17402-
$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
17403-
if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
17404-
CFLAGS_CRC=""
17405-
pgac_sse42_crc32_intrinsics=yes
17406-
fi
17407-
17408-
if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
17409-
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
17410-
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
17411-
if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
17412-
$as_echo_n "(cached) " >&6
17413-
else
17414-
pgac_save_CFLAGS=$CFLAGS
17415-
CFLAGS="$pgac_save_CFLAGS -msse4.2"
17416-
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
17372+
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
1741717373
/* end confdefs.h. */
1741817374
#include <nmmintrin.h>
17375+
#if defined(__has_attribute) && __has_attribute (target)
17376+
__attribute__((target("sse4.2")))
17377+
#endif
17378+
static int crc32_sse42_test(void)
17379+
{
17380+
unsigned int crc = 0;
17381+
crc = _mm_crc32_u8(crc, 0);
17382+
crc = _mm_crc32_u32(crc, 0);
17383+
/* return computed value, to prevent the above being optimized away */
17384+
return crc == 0;
17385+
}
1741917386
int
1742017387
main ()
1742117388
{
17422-
unsigned int crc = 0;
17423-
crc = _mm_crc32_u8(crc, 0);
17424-
crc = _mm_crc32_u32(crc, 0);
17425-
/* return computed value, to prevent the above being optimized away */
17426-
return crc == 0;
17389+
return crc32_sse42_test();
1742717390
;
1742817391
return 0;
1742917392
}
1743017393
_ACEOF
1743117394
if ac_fn_c_try_link "$LINENO"; then :
17432-
pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
17395+
pgac_cv_sse42_crc32_intrinsics=yes
1743317396
else
17434-
pgac_cv_sse42_crc32_intrinsics__msse4_2=no
17397+
pgac_cv_sse42_crc32_intrinsics=no
1743517398
fi
1743617399
rm -f core conftest.err conftest.$ac_objext \
1743717400
conftest$ac_exeext conftest.$ac_ext
17438-
CFLAGS="$pgac_save_CFLAGS"
1743917401
fi
17440-
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
17441-
$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
17442-
if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
17443-
CFLAGS_CRC="-msse4.2"
17402+
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
17403+
$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
17404+
if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
1744417405
pgac_sse42_crc32_intrinsics=yes
1744517406
fi
1744617407

17447-
fi
1744817408

1744917409
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
1745017410
# define __SSE4_2__ in that case.
@@ -17647,15 +17607,20 @@ fi
1764717607
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
1764817608
# use the special CRC instructions for calculating CRC-32C. If we're not
1764917609
# targeting such a processor, but we can nevertheless produce code that uses
17650-
# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
17651-
# implementations and select which one to use at runtime, depending on whether
17652-
# SSE 4.2 is supported by the processor we're running on.
17610+
# the SSE intrinsics, compile both implementations and select which one to use
17611+
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
17612+
# running on.
1765317613
#
1765417614
# Similarly, if we are targeting an ARM processor that has the CRC
1765517615
# instructions that are part of the ARMv8 CRC Extension, use them. And if
1765617616
# we're not targeting such a processor, but can nevertheless produce code that
1765717617
# uses the CRC instructions, compile both, and select at runtime.
1765817618
#
17619+
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
17620+
# instructions because until clang 16, using the ARM intrinsics still requires
17621+
# special -march flags. Perhaps we can re-evaluate this decision after some
17622+
# time has passed.
17623+
#
1765917624
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
1766017625
# in the template or configure command line.
1766117626
#

Diff for: configure.ac

+9-10
Original file line numberDiff line numberDiff line change
@@ -2068,13 +2068,7 @@ fi
20682068

20692069
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
20702070
#
2071-
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
2072-
# with the default compiler flags. If not, check if adding the -msse4.2
2073-
# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
2074-
PGAC_SSE42_CRC32_INTRINSICS([])
2075-
if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
2076-
PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
2077-
fi
2071+
PGAC_SSE42_CRC32_INTRINSICS()
20782072

20792073
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
20802074
# define __SSE4_2__ in that case.
@@ -2111,15 +2105,20 @@ AC_SUBST(CFLAGS_CRC)
21112105
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
21122106
# use the special CRC instructions for calculating CRC-32C. If we're not
21132107
# targeting such a processor, but we can nevertheless produce code that uses
2114-
# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
2115-
# implementations and select which one to use at runtime, depending on whether
2116-
# SSE 4.2 is supported by the processor we're running on.
2108+
# the SSE intrinsics, compile both implementations and select which one to use
2109+
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
2110+
# running on.
21172111
#
21182112
# Similarly, if we are targeting an ARM processor that has the CRC
21192113
# instructions that are part of the ARMv8 CRC Extension, use them. And if
21202114
# we're not targeting such a processor, but can nevertheless produce code that
21212115
# uses the CRC instructions, compile both, and select at runtime.
21222116
#
2117+
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
2118+
# instructions because until clang 16, using the ARM intrinsics still requires
2119+
# special -march flags. Perhaps we can re-evaluate this decision after some
2120+
# time has passed.
2121+
#
21232122
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
21242123
# in the template or configure command line.
21252124
#

Diff for: meson.build

+15-7
Original file line numberDiff line numberDiff line change
@@ -2211,14 +2211,19 @@ endif
22112211
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
22122212
# use the special CRC instructions for calculating CRC-32C. If we're not
22132213
# targeting such a processor, but we can nevertheless produce code that uses
2214-
# the SSE intrinsics, perhaps with some extra CFLAGS, compile both
2215-
# implementations and select which one to use at runtime, depending on whether
2216-
# SSE 4.2 is supported by the processor we're running on.
2214+
# the SSE intrinsics, compile both implementations and select which one to use
2215+
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
2216+
# running on.
22172217
#
22182218
# Similarly, if we are targeting an ARM processor that has the CRC
22192219
# instructions that are part of the ARMv8 CRC Extension, use them. And if
22202220
# we're not targeting such a processor, but can nevertheless produce code that
22212221
# uses the CRC instructions, compile both, and select at runtime.
2222+
#
2223+
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
2224+
# instructions because until clang 16, using the ARM intrinsics still requires
2225+
# special -march flags. Perhaps we can re-evaluate this decision after some
2226+
# time has passed.
22222227
###############################################################
22232228

22242229
have_optimized_crc = false
@@ -2234,6 +2239,9 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
22342239
prog = '''
22352240
#include <nmmintrin.h>
22362241
2242+
#if defined(__has_attribute) && __has_attribute (target)
2243+
__attribute__((target("sse4.2")))
2244+
#endif
22372245
int main(void)
22382246
{
22392247
unsigned int crc = 0;
@@ -2244,16 +2252,16 @@ int main(void)
22442252
}
22452253
'''
22462254

2247-
if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
2255+
if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32',
22482256
args: test_c_args)
2257+
# Do not use Intel SSE 4.2
2258+
elif (cc.get_define('__SSE4_2__') != '')
22492259
# Use Intel SSE 4.2 unconditionally.
22502260
cdata.set('USE_SSE42_CRC32C', 1)
22512261
have_optimized_crc = true
2252-
elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
2253-
args: test_c_args + ['-msse4.2'])
2262+
else
22542263
# Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
22552264
# the runtime check.
2256-
cflags_crc += '-msse4.2'
22572265
cdata.set('USE_SSE42_CRC32C', false)
22582266
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
22592267
have_optimized_crc = true

Diff for: src/port/Makefile

-5
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
8282
rm -f $@
8383
$(AR) $(AROPT) $@ $^
8484

85-
# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
86-
pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
87-
pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
88-
pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
89-
9085
# all versions of pg_crc32c_armv8.o need CFLAGS_CRC
9186
pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
9287
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)

Diff for: src/port/meson.build

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ endif
8282
replace_funcs_pos = [
8383
# x86/x64
8484
['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
85-
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
85+
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
8686
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
8787
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
8888

Diff for: src/port/pg_crc32c_sse42.c

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "port/pg_crc32c.h"
2020

2121
pg_attribute_no_sanitize_alignment()
22+
pg_attribute_target("sse4.2")
2223
pg_crc32c
2324
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
2425
{

0 commit comments

Comments
 (0)