Skip to content

Commit 88aec5d

Browse files
committed
[AMDGPU][True16][MC] support more VOP3 inst in true16/fake16 format
1 parent 0964328 commit 88aec5d

File tree

4 files changed

+86
-43
lines changed

4 files changed

+86
-43
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+29-5
Original file line numberDiff line numberDiff line change
@@ -5483,8 +5483,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
54835483
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
54845484
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
54855485
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5486-
case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5487-
case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5486+
case AMDGPU::S_MINIMUM_F16:
5487+
return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5488+
: AMDGPU::V_MINIMUM_F16_fake16_e64;
5489+
case AMDGPU::S_MAXIMUM_F16:
5490+
return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5491+
: AMDGPU::V_MAXIMUM_F16_fake16_e64;
54885492
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
54895493
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
54905494
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
@@ -7448,9 +7452,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
74487452
return;
74497453
}
74507454
case AMDGPU::S_MINIMUM_F32:
7451-
case AMDGPU::S_MAXIMUM_F32:
7452-
case AMDGPU::S_MINIMUM_F16:
7453-
case AMDGPU::S_MAXIMUM_F16: {
7455+
case AMDGPU::S_MAXIMUM_F32: {
74547456
const DebugLoc &DL = Inst.getDebugLoc();
74557457
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
74567458
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
@@ -7467,6 +7469,28 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
74677469
Inst.eraseFromParent();
74687470
return;
74697471
}
7472+
case AMDGPU::S_MINIMUM_F16:
7473+
case AMDGPU::S_MAXIMUM_F16: {
7474+
const DebugLoc &DL = Inst.getDebugLoc();
7475+
Register NewDst;
7476+
if (ST.useRealTrue16Insts())
7477+
NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7478+
else
7479+
NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7480+
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7481+
.addImm(0) // src0_modifiers
7482+
.add(Inst.getOperand(1))
7483+
.addImm(0) // src1_modifiers
7484+
.add(Inst.getOperand(2))
7485+
.addImm(0) // clamp
7486+
.addImm(0) // omod
7487+
.addImm(0); // opsel0
7488+
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7489+
legalizeOperands(*NewInstr, MDT);
7490+
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7491+
Inst.eraseFromParent();
7492+
return;
7493+
}
74707494
}
74717495

74727496
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {

llvm/lib/Target/AMDGPU/SIInstructions.td

+3
Original file line numberDiff line numberDiff line change
@@ -3659,7 +3659,10 @@ multiclass Int16Med3Pat<Instruction med3Inst,
36593659
defm : FPMed3Pat<f32, V_MED3_F32_e64>;
36603660

36613661
let SubtargetPredicate = HasMed3_16 in {
3662+
let True16Predicate = NotHasTrue16BitInsts in
36623663
defm : FPMed3Pat<f16, V_MED3_F16_e64>;
3664+
let True16Predicate = UseFakeTrue16Insts in
3665+
defm : FPMed3Pat<f16, V_MED3_F16_fake16_e64>;
36633666
}
36643667

36653668
class

llvm/lib/Target/AMDGPU/VOP3Instructions.td

+40-36
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,8 @@ defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs
170170
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
171171
defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
172172
defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>;
173-
defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>;
174-
defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>;
173+
defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, DivergentBinFrag<fminimum>>;
174+
defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, DivergentBinFrag<fmaximum>>;
175175

176176
let SchedRW = [WriteDoubleAdd] in {
177177
defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
@@ -371,8 +371,8 @@ let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in {
371371
} // End SubtargetPredicate = isGFX9Only, FPDPRounding = 1
372372

373373
let SubtargetPredicate = isGFX9Plus in {
374-
defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
375-
defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
374+
defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>;
375+
defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>;
376376
let OtherPredicates = [isNotGFX90APlus] in
377377
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
378378
} // End SubtargetPredicate = isGFX9Plus
@@ -437,16 +437,20 @@ defm: Ternary_i16_Pats<imad, V_MAD_U16_e64>;
437437

438438
} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
439439

440+
// Emits one anonymous GCNPat that matches the i16 DAG (op2 (op1 $src0, $src1), $src2)
// and selects `inst`, passing SRCMODS.NONE for every source-modifier operand and
// DSTCLAMP.NONE for clamp. Written as a multiclass so it is instantiated with `defm`.
multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
441+
Instruction inst> {
442+
def : GCNPat <
443+
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
444+
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
445+
>;
446+
}
440447

441-
class Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
442-
Instruction inst> : GCNPat <
443-
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
444-
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
445-
>;
446-
447-
let Predicates = [Has16BitInsts, isGFX10Plus] in {
448-
def: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
449-
} // End Predicates = [Has16BitInsts, isGFX10Plus]
448+
let True16Predicate = UseFakeTrue16Insts in {
449+
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_fake16_e64>;
450+
} // End True16Predicate = UseFakeTrue16Insts
451+
let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
452+
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
453+
} // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts
450454

451455
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
452456
(ops node:$x, node:$y, node:$z),
@@ -616,17 +620,17 @@ let isCommutable = 1, isReMaterializable = 1 in {
616620
} // End isCommutable = 1, isReMaterializable = 1
617621
// TODO src0 contains the opsel bit for dst, so if we commute, need to mask and swap this
618622
// to the new src0.
619-
defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
620-
defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
621-
defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
623+
defm V_MED3_F16 : VOP3Inst_t16 <"v_med3_f16", VOP_F16_F16_F16_F16, AMDGPUfmed3>;
624+
defm V_MED3_I16 : VOP3Inst_t16 <"v_med3_i16", VOP_I16_I16_I16_I16, AMDGPUsmed3>;
625+
defm V_MED3_U16 : VOP3Inst_t16 <"v_med3_u16", VOP_I16_I16_I16_I16, AMDGPUumed3>;
622626

623-
defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
624-
defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
625-
defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
627+
defm V_MIN3_F16 : VOP3Inst_t16 <"v_min3_f16", VOP_F16_F16_F16_F16, AMDGPUfmin3>;
628+
defm V_MIN3_I16 : VOP3Inst_t16 <"v_min3_i16", VOP_I16_I16_I16_I16, AMDGPUsmin3>;
629+
defm V_MIN3_U16 : VOP3Inst_t16 <"v_min3_u16", VOP_I16_I16_I16_I16, AMDGPUumin3>;
626630

627-
defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
628-
defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
629-
defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
631+
defm V_MAX3_F16 : VOP3Inst_t16 <"v_max3_f16", VOP_F16_F16_F16_F16, AMDGPUfmax3>;
632+
defm V_MAX3_I16 : VOP3Inst_t16 <"v_max3_i16", VOP_I16_I16_I16_I16, AMDGPUsmax3>;
633+
defm V_MAX3_U16 : VOP3Inst_t16 <"v_max3_u16", VOP_I16_I16_I16_I16, AMDGPUumax3>;
630634

631635
let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
632636
defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
@@ -1553,7 +1557,7 @@ defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
15531557
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
15541558
defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x230>;
15551559
defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">;
1556-
defm V_MED3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x232, "V_MED3_F16", "v_med3_num_f16">;
1560+
defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
15571561
defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
15581562
defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
15591563
defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
@@ -1578,8 +1582,8 @@ defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
15781582
defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12<0x342>;
15791583
defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>;
15801584
defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>;
1581-
defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>;
1582-
defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
1585+
defm V_MINIMUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x367, "v_minimum_f16">;
1586+
defm V_MAXIMUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x368, "v_maximum_f16">;
15831587

15841588
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
15851589
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
@@ -1669,22 +1673,22 @@ defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23a>;
16691673
defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23b>;
16701674
defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11_gfx12<0x23d>;
16711675
defm V_XOR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x240>;
1672-
defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x241, "V_MAD_U16_gfx9", "v_mad_u16">;
1676+
defm V_MAD_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x241, "v_mad_u16", "V_MAD_U16_gfx9">;
16731677
defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>;
16741678
defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>;
16751679
defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>;
16761680
defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>;
16771681
defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
1678-
defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>;
1679-
defm V_MIN3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24a>;
1680-
defm V_MIN3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24b>;
1681-
defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>;
1682-
defm V_MAX3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24d>;
1683-
defm V_MAX3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24e>;
1684-
defm V_MED3_F16 : VOP3_Realtriple_gfx11<0x24f>;
1685-
defm V_MED3_I16 : VOP3_Realtriple_gfx11_gfx12<0x250>;
1686-
defm V_MED3_U16 : VOP3_Realtriple_gfx11_gfx12<0x251>;
1687-
defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x253, "V_MAD_I16_gfx9", "v_mad_i16">;
1682+
defm V_MIN3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x249, "v_min3_f16">;
1683+
defm V_MIN3_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24a, "v_min3_i16">;
1684+
defm V_MIN3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24b, "v_min3_u16">;
1685+
defm V_MAX3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24c, "v_max3_f16">;
1686+
defm V_MAX3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24d, "v_max3_i16">;
1687+
defm V_MAX3_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24e, "v_max3_u16">;
1688+
defm V_MED3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24f, "v_med3_f16">;
1689+
defm V_MED3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x250, "v_med3_i16">;
1690+
defm V_MED3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x251, "v_med3_u16">;
1691+
defm V_MAD_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x253, "v_mad_i16", "V_MAD_I16_gfx9">;
16881692
defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
16891693
defm V_ADD3_U32 : VOP3_Realtriple_gfx11_gfx12<0x255>;
16901694
defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x256>;

llvm/lib/Target/AMDGPU/VOPInstructions.td

+14-2
Original file line numberDiff line numberDiff line change
@@ -1894,10 +1894,22 @@ multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName
18941894
string pseudo_mnemonic = "", bit isSingle = 0> :
18951895
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
18961896

1897+
// Instantiates the GFX11 real-triple records for both the `_t16` and the `_fake16`
// pseudo of one instruction, sharing the same opcode and assembly mnemonic.
//   op              - 10-bit VOP3 opcode.
//   asmName         - assembly mnemonic, forwarded unchanged to both variants.
//   opName          - base pseudo name; "_t16" / "_fake16" is appended per variant.
//   pseudo_mnemonic - forwarded unchanged.
//   isSingle        - forwarded unchanged.
// Argument order must follow VOP3_Realtriple_t16_gfx11: (op, asmName, opName, ...).
multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME,
1898+
string pseudo_mnemonic = "", bit isSingle = 0> {
1899+
defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
1900+
defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
1901+
}
1902+
18971903
multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
18981904
string opName = NAME, string pseudo_mnemonic = "">
18991905
: VOP3_Realtriple_t16_gfx11<op, asmName, opName, pseudo_mnemonic, 1>;
19001906

1907+
// Instantiates VOP3_Realtriple_t16_gfx11 twice, once for the opName#"_t16" pseudo and
// once for the opName#"_fake16" pseudo, with the same op, asmName and pseudo_mnemonic.
// The final argument (isSingle in VOP3_Realtriple_t16_gfx11) is fixed to 1 for both.
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName,
1908+
string opName = NAME, string pseudo_mnemonic = ""> {
1909+
defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, 1>;
1910+
defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, 1>;
1911+
}
1912+
19011913
multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
19021914
bit isSingle = 0> :
19031915
VOP3be_Real<GFX11Gen, op, opName, asmName, isSingle>;
@@ -1931,8 +1943,8 @@ multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName
19311943

19321944
multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, string opName = NAME,
19331945
string pseudo_mnemonic = "", bit isSingle = 0> {
1934-
defm opName#"_t16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
1935-
defm opName#"_fake16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
1946+
defm _t16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
1947+
defm _fake16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
19361948
}
19371949

19381950
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,

0 commit comments

Comments
 (0)