Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU][True16][CodeGen] true16 codegen pattern for fma #122950

Merged
merged 2 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F16_t16_e64:
return AMDGPU::V_FMA_F16_gfx9_t16_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
Expand Down
74 changes: 49 additions & 25 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3544,6 +3544,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
Expand All @@ -3564,6 +3565,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
Expand Down Expand Up @@ -3597,16 +3599,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,

unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
? AMDGPU::V_FMAMK_F16_t16
: AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;

// V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
// V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
// takes VGPR_32_Lo128 operands, so the rewrite would also require
// restricting their register classes. For now just bail out.
if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;

const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
Expand All @@ -3621,7 +3626,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());

if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Expand Down Expand Up @@ -3676,23 +3681,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,

unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
? AMDGPU::V_FMAAK_F16_t16
: AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;

// V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
// V_FMAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// V_FMAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
// V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

// takes VGPR_32_Lo128 operands, so the rewrite would also require
// restricting their register classes. For now just bail out.
if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;

// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.

if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Expand Down Expand Up @@ -3879,8 +3887,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F16_e32:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
? AMDGPU::V_FMA_F16_gfx9_t16_e64
: AMDGPU::V_FMA_F16_gfx9_fake16_e64
: AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F32_e32:
case AMDGPU::V_FMAC_F32_e64:
Expand Down Expand Up @@ -3946,19 +3957,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}

assert(
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
"V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
"pre-RA");
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
"present "
"pre-RA");

// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
Expand All @@ -3973,6 +3987,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
Expand Down Expand Up @@ -4058,8 +4073,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
? ST.useRealTrue16Insts()
? AMDGPU::V_FMAAK_F16_t16
: AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
if (pseudoToMCOpcode(NewOpc) != -1) {
Expand All @@ -4076,11 +4094,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
unsigned NewOpc =
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
unsigned NewOpc = IsFMA
? (IsF16 ? (ST.hasTrue16BitInsts()
? ST.useRealTrue16Insts()
? AMDGPU::V_FMAMK_F16_t16
: AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
Expand Down Expand Up @@ -4526,6 +4547,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
Expand Down Expand Up @@ -5578,7 +5600,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAC_F16:
return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
: AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3237,6 +3237,14 @@ def : GCNPat <
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
(f16 (VOP3NoMods f16:$src1)),
(f16 (VOP3NoMods f16:$src2))),
(V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
Expand Down
17 changes: 13 additions & 4 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
NewOpcode = AMDGPU::V_FMAAK_F16;
break;
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
NewOpcode = AMDGPU::V_FMAAK_F16_t16;
break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
}
}
Expand Down Expand Up @@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
NewOpcode = AMDGPU::V_FMAMK_F16;
break;
case AMDGPU::V_FMA_F16_gfx9_t16_e64:
NewOpcode = AMDGPU::V_FMAMK_F16_t16;
break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
}
}
Expand Down Expand Up @@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
Expand Down
68 changes: 47 additions & 21 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s

define float @v_fma_f32(float %x, float %y, float %z) {
; GFX6-LABEL: v_fma_f32:
Expand Down Expand Up @@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fma_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_fma_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it should be optimized in the True16 case. I notice we have not optimized it downstream either. If it is not easy to fix, I'd be ok landing this if the optimization was tracked for a later fix.

Copy link
Contributor Author

@broxigarchen broxigarchen Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Joe. I ran a quick check on this and it seems there is a problem in the two-address convert pass that it failed to map the dst register and thus failed to convert 2 address to 3 address.

It seems it's related with the register class setting for vgpr_16. Since gisel change is not upstreamed, it's better to fix this in the downstream branch. I'll file a case to track this and we can just merge it as it now in the upstream

; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fma_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %fma
}
Expand Down Expand Up @@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fma_f16_fneg_lhs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
ret half %fma
Expand Down Expand Up @@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fma_f16_fneg_rhs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg half %y
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
ret half %fma
Expand Down Expand Up @@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fma_f16_fneg_add:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.z = fneg half %z
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
ret half %fma
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s

---
Expand Down
Loading
Loading