Skip to content

Commit a87d8e9

Browse files
authored
[X86][AVX512FP16] Decouple AVX512VL and AVX512DQ from AVX512FP16 (#137450)
Fixes: #136209
1 parent f11d46c commit a87d8e9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+571
-342
lines changed

clang/lib/Headers/avx512fp16intrin.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,8 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
553553
}
554554

555555
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
556-
return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
556+
return (__m512h)_mm512_xor_epi32((__m512i)__A,
557+
_mm512_set1_epi32(-2147483648));
557558
}
558559

559560
static __inline__ __m512h __DEFAULT_FN_ATTRS512

clang/test/CodeGen/X86/avx512fp16-builtins-constrained-cmp.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512fp16 -emit-llvm -ffp-exception-behavior=strict -o - -Wall -Werror | FileCheck %s
1+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512fp16 -target-feature +avx512vl -emit-llvm -ffp-exception-behavior=strict -o - -Wall -Werror | FileCheck %s
22

33
#include <immintrin.h>
44

clang/test/CodeGen/X86/avx512fp16-builtins.c

+15-15
Original file line numberDiff line numberDiff line change
@@ -689,24 +689,24 @@ __m512h test_mm512_abs_ph(__m512h a) {
689689

690690
__m512h test_mm512_conj_pch(__m512h __A) {
691691
// CHECK-LABEL: @test_mm512_conj_pch
692-
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
693-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
694-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
692+
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <8 x i64>
693+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
694+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
695695
// CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
696-
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
697-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
696+
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <8 x i64>
697+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x half>
698698
return _mm512_conj_pch(__A);
699699
}
700700

701701
__m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
702702
// CHECK-LABEL: @test_mm512_mask_conj_pch
703703
// CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
704-
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
705-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
706-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
704+
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <8 x i64>
705+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
706+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
707707
// CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
708-
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
709-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
708+
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <8 x i64>
709+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x half>
710710
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
711711
// CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
712712
// CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -717,12 +717,12 @@ __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
717717
__m512h test_mm512_maskz_conj_pch(__mmask32 __U, __m512h __A) {
718718
// CHECK-LABEL: @test_mm512_maskz_conj_pch
719719
// CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
720-
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
721-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
722-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
720+
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <8 x i64>
721+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
722+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
723723
// CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
724-
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
725-
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
724+
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <8 x i64>
725+
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x half>
726726
// CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
727727
// CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
728728
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>

clang/test/Preprocessor/x86_target_features.c

+2-9
Original file line numberDiff line numberDiff line change
@@ -596,31 +596,24 @@
596596
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16 %s
597597

598598
// AVX512FP16: #define __AVX512BW__ 1
599-
// AVX512FP16: #define __AVX512DQ__ 1
600599
// AVX512FP16: #define __AVX512FP16__ 1
601-
// AVX512FP16: #define __AVX512VL__ 1
602-
// AVX512FP16: #define __EVEX256__ 1
603600
// AVX512FP16: #define __EVEX512__ 1
604601

605602
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512VL %s
606603

607-
// AVX512FP16NOAVX512VL-NOT: #define __AVX512FP16__ 1
608-
// AVX512FP16NOAVX512VL-NOT: #define __AVX512VL__ 1
609-
// AVX512FP16NOAVX512VL-NOT: #define __EVEX256__ 1
604+
// AVX512FP16NOAVX512VL: #define __AVX512FP16__ 1
610605
// AVX512FP16NOAVX512VL: #define __EVEX512__ 1
611606

612607
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512BW %s
613608

614609
// AVX512FP16NOAVX512BW-NOT: #define __AVX512BW__ 1
615610
// AVX512FP16NOAVX512BW-NOT: #define __AVX512FP16__ 1
616-
// AVX512FP16NOAVX512BW: #define __EVEX256__ 1
617611
// AVX512FP16NOAVX512BW: #define __EVEX512__ 1
618612

619613
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512DQ %s
620614

621615
// AVX512FP16NOAVX512DQ-NOT: #define __AVX512DQ__ 1
622-
// AVX512FP16NOAVX512DQ-NOT: #define __AVX512FP16__ 1
623-
// AVX512FP16NOAVX512DQ: #define __EVEX256__ 1
616+
// AVX512FP16NOAVX512DQ: #define __AVX512FP16__ 1
624617
// AVX512FP16NOAVX512DQ: #define __EVEX512__ 1
625618

626619
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s

llvm/lib/Target/X86/X86.td

+2-7
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,9 @@ def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
166166
"HasVP2INTERSECT", "true",
167167
"Enable AVX-512 vp2intersect",
168168
[FeatureAVX512]>;
169-
// FIXME: FP16 scalar intrinsics use the type v8f16, which is supposed to be
170-
// guarded under condition hasVLX. So we imply it in FeatureFP16 currently.
171-
// FIXME: FP16 conversion between f16 and i64 customize type v8i64, which is
172-
// supposed to be guarded under condition hasDQI. So we imply it in FeatureFP16
173-
// currently.
174169
def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true",
175170
"Support 16-bit floating point",
176-
[FeatureBWI, FeatureVLX, FeatureDQI]>;
171+
[FeatureBWI]>;
177172
def FeatureAVXVNNIINT8 : SubtargetFeature<"avxvnniint8",
178173
"HasAVXVNNIINT8", "true",
179174
"Enable AVX-VNNI-INT8",
@@ -338,7 +333,7 @@ def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true",
338333
"Support AVX10.1 up to 256-bit instruction",
339334
[FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI,
340335
FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG,
341-
FeatureFP16]>;
336+
FeatureFP16, FeatureVLX, FeatureDQI]>;
342337
def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true",
343338
"Support AVX10.1 up to 512-bit instruction",
344339
[FeatureAVX10_1, FeatureEVEX512]>;

llvm/lib/Target/X86/X86ISelLowering.cpp

+10-6
Original file line numberDiff line numberDiff line change
@@ -2024,13 +2024,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
20242024
setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
20252025
setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
20262026

2027-
if (Subtarget.hasDQI()) {
2027+
if (Subtarget.hasDQI() || Subtarget.hasFP16())
20282028
for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
20292029
ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
20302030
ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
20312031
setOperationAction(Opc, MVT::v8i64, Custom);
2032+
2033+
if (Subtarget.hasDQI())
20322034
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2033-
}
20342035

20352036
if (Subtarget.hasCDI()) {
20362037
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
@@ -19860,7 +19861,7 @@ static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
1986019861
DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
1986119862
}
1986219863

19863-
static bool isLegalConversion(MVT VT, bool IsSigned,
19864+
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
1986419865
const X86Subtarget &Subtarget) {
1986519866
if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
1986619867
return true;
@@ -19871,6 +19872,8 @@ static bool isLegalConversion(MVT VT, bool IsSigned,
1987119872
if (Subtarget.useAVX512Regs()) {
1987219873
if (VT == MVT::v16i32)
1987319874
return true;
19875+
if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
19876+
return true;
1987419877
if (VT == MVT::v8i64 && Subtarget.hasDQI())
1987519878
return true;
1987619879
}
@@ -19892,7 +19895,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
1989219895

1989319896
if (isSoftF16(VT, Subtarget))
1989419897
return promoteXINT_TO_FP(Op, dl, DAG);
19895-
else if (isLegalConversion(SrcVT, true, Subtarget))
19898+
else if (isLegalConversion(SrcVT, VT, true, Subtarget))
1989619899
return Op;
1989719900

1989819901
if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
@@ -20396,7 +20399,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
2039620399

2039720400
if (isSoftF16(DstVT, Subtarget))
2039820401
return promoteXINT_TO_FP(Op, dl, DAG);
20399-
else if (isLegalConversion(SrcVT, false, Subtarget))
20402+
else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
2040020403
return Op;
2040120404

2040220405
if (DstVT.isVector())
@@ -21419,7 +21422,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
2141921422
{NVT, MVT::Other}, {Chain, Src})});
2142021423
return DAG.getNode(Op.getOpcode(), dl, VT,
2142121424
DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21422-
} else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
21425+
} else if (isTypeLegal(SrcVT) &&
21426+
isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
2142321427
return Op;
2142421428
}
2142521429

llvm/lib/TargetParser/X86TargetParser.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ constexpr FeatureBitset FeaturesSapphireRapids =
135135
FeatureAVX512BF16 | FeatureAVX512FP16 | FeatureAVXVNNI | FeatureCLDEMOTE |
136136
FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE |
137137
FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR |
138-
FeatureWAITPKG;
138+
FeatureWAITPKG | FeatureAVX512DQ | FeatureAVX512VL;
139139
constexpr FeatureBitset FeaturesGraniteRapids =
140140
FeaturesSapphireRapids | FeatureAMX_FP16 | FeaturePREFETCHI;
141141
constexpr FeatureBitset FeaturesDiamondRapids =
@@ -624,8 +624,7 @@ constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2;
624624
constexpr FeatureBitset ImpliedFeaturesAVXIFMA = FeatureAVX2;
625625
constexpr FeatureBitset ImpliedFeaturesAVXNECONVERT = FeatureAVX2;
626626
constexpr FeatureBitset ImpliedFeaturesSHA512 = FeatureAVX2;
627-
constexpr FeatureBitset ImpliedFeaturesAVX512FP16 =
628-
FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL;
627+
constexpr FeatureBitset ImpliedFeaturesAVX512FP16 = FeatureAVX512BW;
629628
// Key Locker Features
630629
constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2;
631630
constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL;
@@ -637,7 +636,8 @@ constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2;
637636
constexpr FeatureBitset ImpliedFeaturesAVX10_1 =
638637
FeatureAVX512CD | FeatureAVX512VBMI | FeatureAVX512IFMA |
639638
FeatureAVX512VNNI | FeatureAVX512BF16 | FeatureAVX512VPOPCNTDQ |
640-
FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureAVX512FP16;
639+
FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureAVX512FP16 |
640+
FeatureAVX512DQ | FeatureAVX512VL;
641641
constexpr FeatureBitset ImpliedFeaturesAVX10_1_512 =
642642
FeatureAVX10_1 | FeatureEVEX512;
643643
constexpr FeatureBitset ImpliedFeaturesAVX10_2 = FeatureAVX10_1;

llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
33
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=f16c,fma | FileCheck %s --check-prefix=F16C
44
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=F16C
5-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 | FileCheck %s --check-prefix=FP16
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefix=FP16
66

77
define <2 x half> @foo(<2 x half> %0) "unsafe-fp-math"="true" nounwind {
88
; AVX2-LABEL: foo:

llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,NO-SZ
3-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,HAS-SZ
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ
44

55
; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag set.
66
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {

llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
33

44
define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) {
55
; CHECK-LABEL: test1:

llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
33

44
define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
55
; CHECK-LABEL: test1:
@@ -84,7 +84,7 @@ entry:
8484
define dso_local <8 x half> @test6(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
8585
; CHECK-LABEL: test6:
8686
; CHECK: # %bb.0: # %entry
87-
; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
87+
; CHECK-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
8888
; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
8989
; CHECK-NEXT: retq
9090
entry:

llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
33

44
define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
55
; CHECK-LABEL: test1:

llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -761,7 +761,7 @@ define <4 x half> @test_s17tofp4(<4 x i17> %arg0) {
761761
define <2 x half> @test_u33tofp2(<2 x i33> %arg0) {
762762
; CHECK-LABEL: test_u33tofp2:
763763
; CHECK: # %bb.0:
764-
; CHECK-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
764+
; CHECK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
765765
; CHECK-NEXT: vcvtqq2ph %xmm0, %xmm0
766766
; CHECK-NEXT: retq
767767
%res = uitofp <2 x i33> %arg0 to <2 x half>

llvm/test/CodeGen/X86/avx512fp16-cvt.ll

+4-2
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ define <8 x half> @f32to4f16_mask(<4 x float> %a, <8 x half> %b, i8 %mask) {
8282
;
8383
; X86-LABEL: f32to4f16_mask:
8484
; X86: # %bb.0:
85-
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
85+
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
86+
; X86-NEXT: kmovd %eax, %k1
8687
; X86-NEXT: vcvtps2phx %xmm0, %xmm1 {%k1}
8788
; X86-NEXT: vmovaps %xmm1, %xmm0
8889
; X86-NEXT: retl
@@ -101,7 +102,8 @@ define <8 x half> @f32to8f16_mask(<8 x float> %a, <8 x half> %b, i8 %mask) {
101102
;
102103
; X86-LABEL: f32to8f16_mask:
103104
; X86: # %bb.0:
104-
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
105+
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
106+
; X86-NEXT: kmovd %eax, %k1
105107
; X86-NEXT: vcvtps2phx %ymm0, %xmm1 {%k1}
106108
; X86-NEXT: vmovaps %xmm1, %xmm0
107109
; X86-NEXT: vzeroupper

llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll

+13-10
Original file line numberDiff line numberDiff line change
@@ -469,16 +469,17 @@ define <8 x half>@test_int_x86_avx512_mask3_vfmadd_sh(<8 x half> %x0, <8 x half>
469469
; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sh:
470470
; X86: # %bb.0:
471471
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
472-
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
472+
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
473+
; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
473474
; X86-NEXT: vfmadd231sh (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x08]
474-
; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
475+
; X86-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
475476
; X86-NEXT: retl # encoding: [0xc3]
476477
;
477478
; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sh:
478479
; X64: # %bb.0:
479480
; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
480481
; X64-NEXT: vfmadd231sh (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x0f]
481-
; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
482+
; X64-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
482483
; X64-NEXT: retq # encoding: [0xc3]
483484
%q = load half, ptr %ptr_b
484485
%vecinit.i = insertelement <8 x half> undef, half %q, i32 0
@@ -496,7 +497,8 @@ define <8 x half>@test_int_x86_avx512_mask3_vfmadd_sh(<8 x half> %x0, <8 x half>
496497
define <8 x half>@test_int_x86_avx512_maskz_vfmadd_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3, i32 %x4 ){
497498
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sh:
498499
; X86: # %bb.0:
499-
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
500+
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
501+
; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
500502
; X86-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa9,0xc2]
501503
; X86-NEXT: retl # encoding: [0xc3]
502504
;
@@ -528,16 +530,17 @@ define <8 x half>@test_int_x86_avx512_maskz_vfmadd_sh(<8 x half> %x0, <8 x half>
528530
define void @fmadd_sh_mask_memfold(ptr %a, ptr %b, i8 %c) {
529531
; X86-LABEL: fmadd_sh_mask_memfold:
530532
; X86: # %bb.0:
531-
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c]
532-
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
533-
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
533+
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
534+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
535+
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
534536
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
535-
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x01]
537+
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x02]
536538
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
537-
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x08]
539+
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x09]
538540
; X86-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8]
541+
; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
539542
; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1]
540-
; X86-NEXT: vmovsh %xmm0, (%ecx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x01]
543+
; X86-NEXT: vmovsh %xmm0, (%edx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x02]
541544
; X86-NEXT: retl # encoding: [0xc3]
542545
;
543546
; X64-LABEL: fmadd_sh_mask_memfold:

0 commit comments

Comments
 (0)