[X86][AVX512FP16] Decouple AVX512VL and AVX512DQ from AVX512FP16 #137450

Merged: 2 commits, Apr 27, 2025

3 changes: 2 additions & 1 deletion clang/lib/Headers/avx512fp16intrin.h
@@ -553,7 +553,8 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
return (__m512h)_mm512_xor_epi32((__m512i)__A,
_mm512_set1_epi32(-2147483648));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
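
Note on the hunk above: the new body of _mm512_conj_pch flips the sign bit of each 32-bit lane with an integer XOR, so the compiler can select vpxord (available with plain AVX512F) instead of going through _mm512_xor_ps, whose 512-bit vxorps form requires AVX512DQ. A minimal per-lane sketch of the bit manipulation, assuming the usual complex layout with the imaginary _Float16 in the upper 16 bits of each lane (the helper below is illustrative, not part of the patch):

#include <stdint.h>

/* Model of what the XOR in _mm512_conj_pch does to one 32-bit lane:
   bit 31 is the sign bit of the imaginary half-precision element, so
   flipping it negates the imaginary part and leaves the real part alone. */
static inline uint32_t conj_pch_lane(uint32_t lane) {
  return lane ^ 0x80000000u; /* same constant as _mm512_set1_epi32(-2147483648) */
}

The old expression XORed the same bit pattern via _mm512_set1_ps(-0.0f); dropping the float-typed intrinsic is what removes the AVX512DQ dependency, and it is why the CHECK lines below change from <16 x float> to <8 x i64> bitcasts.
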
@@ -1,4 +1,4 @@
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512fp16 -emit-llvm -ffp-exception-behavior=strict -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512fp16 -target-feature +avx512vl -emit-llvm -ffp-exception-behavior=strict -o - -Wall -Werror | FileCheck %s

#include <immintrin.h>

30 changes: 15 additions & 15 deletions clang/test/CodeGen/X86/avx512fp16-builtins.c
@@ -689,24 +689,24 @@ __m512h test_mm512_abs_ph(__m512h a) {

__m512h test_mm512_conj_pch(__m512h __A) {
// CHECK-LABEL: @test_mm512_conj_pch
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <8 x i64>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <8 x i64>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x half>
return _mm512_conj_pch(__A);
}

__m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
// CHECK-LABEL: @test_mm512_mask_conj_pch
// CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <8 x i64>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <8 x i64>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x half>
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
// CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
// CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
@@ -717,12 +717,12 @@ __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
__m512h test_mm512_maskz_conj_pch(__mmask32 __U, __m512h __A) {
// CHECK-LABEL: @test_mm512_maskz_conj_pch
// CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <8 x i64>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <16 x i32>
// CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
// CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <8 x i64>
// CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x half>
// CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
// CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
// CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
11 changes: 2 additions & 9 deletions clang/test/Preprocessor/x86_target_features.c
@@ -596,31 +596,24 @@
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16 %s

// AVX512FP16: #define __AVX512BW__ 1
// AVX512FP16: #define __AVX512DQ__ 1
// AVX512FP16: #define __AVX512FP16__ 1
// AVX512FP16: #define __AVX512VL__ 1
// AVX512FP16: #define __EVEX256__ 1
// AVX512FP16: #define __EVEX512__ 1

// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512VL %s

// AVX512FP16NOAVX512VL-NOT: #define __AVX512FP16__ 1
// AVX512FP16NOAVX512VL-NOT: #define __AVX512VL__ 1
// AVX512FP16NOAVX512VL-NOT: #define __EVEX256__ 1
// AVX512FP16NOAVX512VL: #define __AVX512FP16__ 1
// AVX512FP16NOAVX512VL: #define __EVEX512__ 1

// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512BW %s

// AVX512FP16NOAVX512BW-NOT: #define __AVX512BW__ 1
// AVX512FP16NOAVX512BW-NOT: #define __AVX512FP16__ 1
// AVX512FP16NOAVX512BW: #define __EVEX256__ 1
// AVX512FP16NOAVX512BW: #define __EVEX512__ 1

// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512DQ %s

// AVX512FP16NOAVX512DQ-NOT: #define __AVX512DQ__ 1
// AVX512FP16NOAVX512DQ-NOT: #define __AVX512FP16__ 1
// AVX512FP16NOAVX512DQ: #define __EVEX256__ 1
// AVX512FP16NOAVX512DQ: #define __AVX512FP16__ 1
// AVX512FP16NOAVX512DQ: #define __EVEX512__ 1

// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
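
With these test updates, __AVX512FP16__ can now be defined while __AVX512VL__ and __AVX512DQ__ are not. A hedged sketch of the guard user code may want once the implications are gone (the macros are real, the function is illustrative only):

#include <immintrin.h>

#if defined(__AVX512FP16__) && defined(__AVX512VL__)
/* 128-/256-bit packed FP16 intrinsics such as _mm_add_ph still sit behind
   AVX512VL; a bare __AVX512FP16__ check is no longer enough for them. */
static __inline__ __m128h add_ph128(__m128h a, __m128h b) {
  return _mm_add_ph(a, b);
}
#endif
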
9 changes: 2 additions & 7 deletions llvm/lib/Target/X86/X86.td
@@ -166,14 +166,9 @@ def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
"HasVP2INTERSECT", "true",
"Enable AVX-512 vp2intersect",
[FeatureAVX512]>;
// FIXME: FP16 scalar intrinsics use the type v8f16, which is supposed to be
// guarded under condition hasVLX. So we imply it in FeatureFP16 currently.
// FIXME: FP16 conversion between f16 and i64 customize type v8i64, which is
// supposed to be guarded under condition hasDQI. So we imply it in FeatureFP16
// currently.
def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true",
"Support 16-bit floating point",
[FeatureBWI, FeatureVLX, FeatureDQI]>;
[FeatureBWI]>;
def FeatureAVXVNNIINT8 : SubtargetFeature<"avxvnniint8",
"HasAVXVNNIINT8", "true",
"Enable AVX-VNNI-INT8",
@@ -338,7 +333,7 @@ def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true",
"Support AVX10.1 up to 256-bit instruction",
[FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI,
FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG,
FeatureFP16]>;
FeatureFP16, FeatureVLX, FeatureDQI]>;
def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true",
"Support AVX10.1 up to 512-bit instruction",
[FeatureAVX10_1, FeatureEVEX512]>;
16 changes: 10 additions & 6 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2024,13 +2024,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
setOperationAction(ISD::FSHR, MVT::v16i32, Custom);

if (Subtarget.hasDQI()) {
if (Subtarget.hasDQI() || Subtarget.hasFP16())
for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
setOperationAction(Opc, MVT::v8i64, Custom);

if (Subtarget.hasDQI())
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}

if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
@@ -19860,7 +19861,7 @@ static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
}

static bool isLegalConversion(MVT VT, bool IsSigned,
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
const X86Subtarget &Subtarget) {
if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
return true;
@@ -19871,6 +19872,8 @@ static bool isLegalConversion(MVT VT, bool IsSigned,
if (Subtarget.useAVX512Regs()) {
if (VT == MVT::v16i32)
return true;
if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
return true;
if (VT == MVT::v8i64 && Subtarget.hasDQI())
return true;
}
@@ -19892,7 +19895,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,

if (isSoftF16(VT, Subtarget))
return promoteXINT_TO_FP(Op, dl, DAG);
else if (isLegalConversion(SrcVT, true, Subtarget))
else if (isLegalConversion(SrcVT, VT, true, Subtarget))
return Op;

if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
@@ -20396,7 +20399,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,

if (isSoftF16(DstVT, Subtarget))
return promoteXINT_TO_FP(Op, dl, DAG);
else if (isLegalConversion(SrcVT, false, Subtarget))
else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
return Op;

if (DstVT.isVector())
@@ -21419,7 +21422,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
{NVT, MVT::Other}, {Chain, Src})});
return DAG.getNode(Op.getOpcode(), dl, VT,
DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
} else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
} else if (isTypeLegal(SrcVT) &&
isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
return Op;
}

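
The isLegalConversion change above keys the v8i64-to-fp16 path on hasFP16 directly instead of relying on the old DQI implication (see the FIXME removed from X86.td). A small sketch of the kind of source this affects, assuming the standard AVX512FP16 conversion intrinsic (compile with -mavx512fp16 only; exact codegen is what the updated tests in this patch cover):

#include <immintrin.h>

/* vcvtqq2ph converts eight signed 64-bit integers to eight fp16 values;
   after this patch the conversion should be treated as legal with
   AVX512FP16 alone, without the previously implied AVX512DQ. */
__m128h qwords_to_half(__m512i v) {
  return _mm512_cvtepi64_ph(v);
}
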
8 changes: 4 additions & 4 deletions llvm/lib/TargetParser/X86TargetParser.cpp
@@ -135,7 +135,7 @@ constexpr FeatureBitset FeaturesSapphireRapids =
FeatureAVX512BF16 | FeatureAVX512FP16 | FeatureAVXVNNI | FeatureCLDEMOTE |
FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE |
FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR |
FeatureWAITPKG;
FeatureWAITPKG | FeatureAVX512DQ | FeatureAVX512VL;
constexpr FeatureBitset FeaturesGraniteRapids =
FeaturesSapphireRapids | FeatureAMX_FP16 | FeaturePREFETCHI;
constexpr FeatureBitset FeaturesDiamondRapids =
@@ -624,8 +624,7 @@ constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVXIFMA = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVXNECONVERT = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesSHA512 = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVX512FP16 =
FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL;
constexpr FeatureBitset ImpliedFeaturesAVX512FP16 = FeatureAVX512BW;
// Key Locker Features
constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2;
constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL;
@@ -637,7 +636,8 @@ constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVX10_1 =
FeatureAVX512CD | FeatureAVX512VBMI | FeatureAVX512IFMA |
FeatureAVX512VNNI | FeatureAVX512BF16 | FeatureAVX512VPOPCNTDQ |
FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureAVX512FP16;
FeatureAVX512VBMI2 | FeatureAVX512BITALG | FeatureAVX512FP16 |
FeatureAVX512DQ | FeatureAVX512VL;
constexpr FeatureBitset ImpliedFeaturesAVX10_1_512 =
FeatureAVX10_1 | FeatureEVEX512;
constexpr FeatureBitset ImpliedFeaturesAVX10_2 = FeatureAVX10_1;
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=f16c,fma | FileCheck %s --check-prefix=F16C
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=F16C
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 | FileCheck %s --check-prefix=FP16
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefix=FP16

define <2 x half> @foo(<2 x half> %0) "unsafe-fp-math"="true" nounwind {
; AVX2-LABEL: foo:
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,NO-SZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,HAS-SZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ

; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag set.
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s

define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) {
; CHECK-LABEL: test1:
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s

define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test1:
@@ -84,7 +84,7 @@ entry:
define dso_local <8 x half> @test6(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test6:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; CHECK-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s

define dso_local <32 x half> @test1(<32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test1:
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
@@ -761,7 +761,7 @@ define <4 x half> @test_s17tofp4(<4 x i17> %arg0) {
define <2 x half> @test_u33tofp2(<2 x i33> %arg0) {
; CHECK-LABEL: test_u33tofp2:
; CHECK: # %bb.0:
; CHECK-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; CHECK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; CHECK-NEXT: vcvtqq2ph %xmm0, %xmm0
; CHECK-NEXT: retq
%res = uitofp <2 x i33> %arg0 to <2 x half>
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -82,7 +82,8 @@ define <8 x half> @f32to4f16_mask(<4 x float> %a, <8 x half> %b, i8 %mask) {
;
; X86-LABEL: f32to4f16_mask:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: vcvtps2phx %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: retl
@@ -101,7 +102,8 @@ define <8 x half> @f32to8f16_mask(<8 x float> %a, <8 x half> %b, i8 %mask) {
;
; X86-LABEL: f32to8f16_mask:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: vcvtps2phx %ymm0, %xmm1 {%k1}
; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: vzeroupper
23 changes: 13 additions & 10 deletions llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll
@@ -469,16 +469,17 @@ define <8 x half>@test_int_x86_avx512_mask3_vfmadd_sh(<8 x half> %x0, <8 x half>
; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sh:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
; X86-NEXT: vfmadd231sh (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x08]
; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X86-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sh:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
; X64-NEXT: vfmadd231sh (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x0f]
; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X64-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%q = load half, ptr %ptr_b
%vecinit.i = insertelement <8 x half> undef, half %q, i32 0
@@ -496,7 +497,8 @@ define <8 x half>@test_int_x86_avx512_mask3_vfmadd_sh(<8 x half> %x0, <8 x half>
define <8 x half>@test_int_x86_avx512_maskz_vfmadd_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3, i32 %x4 ){
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sh:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT: vfmadd213sh %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf6,0x75,0x89,0xa9,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -528,16 +530,17 @@ define <8 x half>@test_int_x86_avx512_maskz_vfmadd_sh(<8 x half> %x0, <8 x half>
define void @fmadd_sh_mask_memfold(ptr %a, ptr %b, i8 %c) {
; X86-LABEL: fmadd_sh_mask_memfold:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x01]
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x02]
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x08]
; X86-NEXT: # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x09]
; X86-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8]
; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1]
; X86-NEXT: vmovsh %xmm0, (%ecx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x01]
; X86-NEXT: vmovsh %xmm0, (%edx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x02]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: fmadd_sh_mask_memfold: