Skip to content

Commit 4e66e6a

Browse files
committed
[AMDGPU][True16][MC] support more VOP3 inst in true16/fake16 format
1 parent a4ace3d commit 4e66e6a

16 files changed

+8780
-3504
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+29-5
Original file line numberDiff line numberDiff line change
@@ -5479,8 +5479,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
54795479
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
54805480
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
54815481
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5482-
case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5483-
case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5482+
case AMDGPU::S_MINIMUM_F16:
5483+
return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5484+
: AMDGPU::V_MINIMUM_F16_fake16_e64;
5485+
case AMDGPU::S_MAXIMUM_F16:
5486+
return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5487+
: AMDGPU::V_MAXIMUM_F16_fake16_e64;
54845488
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
54855489
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
54865490
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
@@ -7393,9 +7397,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
73937397
return;
73947398
}
73957399
case AMDGPU::S_MINIMUM_F32:
7396-
case AMDGPU::S_MAXIMUM_F32:
7397-
case AMDGPU::S_MINIMUM_F16:
7398-
case AMDGPU::S_MAXIMUM_F16: {
7400+
case AMDGPU::S_MAXIMUM_F32: {
73997401
const DebugLoc &DL = Inst.getDebugLoc();
74007402
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
74017403
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
@@ -7412,6 +7414,28 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
74127414
Inst.eraseFromParent();
74137415
return;
74147416
}
7417+
case AMDGPU::S_MINIMUM_F16:
7418+
case AMDGPU::S_MAXIMUM_F16: {
7419+
const DebugLoc &DL = Inst.getDebugLoc();
7420+
Register NewDst;
7421+
if (ST.useRealTrue16Insts())
7422+
NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7423+
else
7424+
NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7425+
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7426+
.addImm(0) // src0_modifiers
7427+
.add(Inst.getOperand(1))
7428+
.addImm(0) // src1_modifiers
7429+
.add(Inst.getOperand(2))
7430+
.addImm(0) // clamp
7431+
.addImm(0) // omod
7432+
.addImm(0); // opsel0
7433+
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7434+
legalizeOperands(*NewInstr, MDT);
7435+
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7436+
Inst.eraseFromParent();
7437+
return;
7438+
}
74157439
}
74167440

74177441
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {

llvm/lib/Target/AMDGPU/SIInstructions.td

+3
Original file line numberDiff line numberDiff line change
@@ -3665,7 +3665,10 @@ multiclass Int16Med3Pat<Instruction med3Inst,
36653665
defm : FPMed3Pat<f32, V_MED3_F32_e64>;
36663666

36673667
let SubtargetPredicate = HasMed3_16 in {
3668+
let True16Predicate = NotHasTrue16BitInsts in
36683669
defm : FPMed3Pat<f16, V_MED3_F16_e64>;
3670+
let True16Predicate = UseFakeTrue16Insts in
3671+
defm : FPMed3Pat<f16, V_MED3_F16_fake16_e64>;
36693672
}
36703673

36713674
class

llvm/lib/Target/AMDGPU/VOP3Instructions.td

+44-36
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,8 @@ defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs
167167
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
168168
defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
169169
defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>;
170-
defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>;
171-
defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>;
170+
defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, DivergentBinFrag<fminimum>>;
171+
defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, DivergentBinFrag<fmaximum>>;
172172

173173
let SchedRW = [WriteDoubleAdd] in {
174174
defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
@@ -368,8 +368,8 @@ let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in {
368368
} // End SubtargetPredicate = isGFX9Only, FPDPRounding = 1
369369

370370
let SubtargetPredicate = isGFX9Plus in {
371-
defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
372-
defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
371+
defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>;
372+
defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>;
373373
let OtherPredicates = [isNotGFX90APlus] in
374374
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
375375
} // End SubtargetPredicate = isGFX9Plus
@@ -434,16 +434,24 @@ defm: Ternary_i16_Pats<imad, V_MAD_U16_e64>;
434434

435435
} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
436436

437+
// Selection pattern for a two-operator i16 expression of the form
// op2(op1(src0, src1), src2), mapped onto the three-source instruction `inst`.
// Every source is passed with SRCMODS.NONE and the result with DSTCLAMP.NONE.
// Written as a multiclass so callers instantiate it with `defm`.
multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
438+
Instruction inst> {
439+
def : GCNPat <
440+
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
441+
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
442+
>;
443+
}
437444

438-
class Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
439-
Instruction inst> : GCNPat <
440-
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
441-
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
442-
>;
443-
444-
let Predicates = [Has16BitInsts, isGFX10Plus] in {
445-
def: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
446-
} // End Predicates = [Has16BitInsts, isGFX10Plus]
445+
let Predicates = [UseRealTrue16Insts] in {
446+
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_t16_e64>;
447+
/*defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_t16_e64>;*/
448+
} // End Predicates = [UseRealTrue16Insts]
449+
let Predicates = [UseFakeTrue16Insts] in {
450+
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_fake16_e64>;
451+
} // End Predicates = [UseFakeTrue16Insts]
452+
let Predicates = [Has16BitInsts, NotHasTrue16BitInsts, isGFX10Plus] in {
453+
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
454+
} // End Predicates = [Has16BitInsts, NotHasTrue16BitInsts, isGFX10Plus]
447455

448456
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
449457
(ops node:$x, node:$y, node:$z),
@@ -613,17 +621,17 @@ let isCommutable = 1, isReMaterializable = 1 in {
613621
} // End isCommutable = 1, isReMaterializable = 1
614622
// TODO src0 contains the opsel bit for dst, so if we commute, need to mask and swap this
615623
// to the new src0.
616-
defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
617-
defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
618-
defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
624+
defm V_MED3_F16 : VOP3Inst_t16 <"v_med3_f16", VOP_F16_F16_F16_F16, AMDGPUfmed3>;
625+
defm V_MED3_I16 : VOP3Inst_t16 <"v_med3_i16", VOP_I16_I16_I16_I16, AMDGPUsmed3>;
626+
defm V_MED3_U16 : VOP3Inst_t16 <"v_med3_u16", VOP_I16_I16_I16_I16, AMDGPUumed3>;
619627

620-
defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
621-
defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
622-
defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
628+
defm V_MIN3_F16 : VOP3Inst_t16 <"v_min3_f16", VOP_F16_F16_F16_F16, AMDGPUfmin3>;
629+
defm V_MIN3_I16 : VOP3Inst_t16 <"v_min3_i16", VOP_I16_I16_I16_I16, AMDGPUsmin3>;
630+
defm V_MIN3_U16 : VOP3Inst_t16 <"v_min3_u16", VOP_I16_I16_I16_I16, AMDGPUumin3>;
623631

624-
defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
625-
defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
626-
defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
632+
defm V_MAX3_F16 : VOP3Inst_t16 <"v_max3_f16", VOP_F16_F16_F16_F16, AMDGPUfmax3>;
633+
defm V_MAX3_I16 : VOP3Inst_t16 <"v_max3_i16", VOP_I16_I16_I16_I16, AMDGPUsmax3>;
634+
defm V_MAX3_U16 : VOP3Inst_t16 <"v_max3_u16", VOP_I16_I16_I16_I16, AMDGPUumax3>;
627635

628636
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
629637
defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
@@ -1056,7 +1064,7 @@ defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
10561064
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
10571065
defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x230>;
10581066
defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">;
1059-
defm V_MED3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x232, "V_MED3_F16", "v_med3_num_f16">;
1067+
defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
10601068
defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
10611069
defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
10621070
defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
@@ -1081,8 +1089,8 @@ defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
10811089
defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12<0x342>;
10821090
defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>;
10831091
defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>;
1084-
defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>;
1085-
defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
1092+
defm V_MINIMUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x367, "v_minimum_f16">;
1093+
defm V_MAXIMUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x368, "v_maximum_f16">;
10861094

10871095
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
10881096
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
@@ -1172,22 +1180,22 @@ defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23a>;
11721180
defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23b>;
11731181
defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11_gfx12<0x23d>;
11741182
defm V_XOR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x240>;
1175-
defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x241, "V_MAD_U16_gfx9", "v_mad_u16">;
1183+
defm V_MAD_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x241, "v_mad_u16", "V_MAD_U16_gfx9">;
11761184
defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>;
11771185
defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>;
11781186
defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>;
11791187
defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>;
11801188
defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
1181-
defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>;
1182-
defm V_MIN3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24a>;
1183-
defm V_MIN3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24b>;
1184-
defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>;
1185-
defm V_MAX3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24d>;
1186-
defm V_MAX3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24e>;
1187-
defm V_MED3_F16 : VOP3_Realtriple_gfx11<0x24f>;
1188-
defm V_MED3_I16 : VOP3_Realtriple_gfx11_gfx12<0x250>;
1189-
defm V_MED3_U16 : VOP3_Realtriple_gfx11_gfx12<0x251>;
1190-
defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x253, "V_MAD_I16_gfx9", "v_mad_i16">;
1189+
defm V_MIN3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x249, "v_min3_f16">;
1190+
defm V_MIN3_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24a, "v_min3_i16">;
1191+
defm V_MIN3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24b, "v_min3_u16">;
1192+
defm V_MAX3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24c, "v_max3_f16">;
1193+
defm V_MAX3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24d, "v_max3_i16">;
1194+
defm V_MAX3_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24e, "v_max3_u16">;
1195+
defm V_MED3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24f, "v_med3_f16">;
1196+
defm V_MED3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x250, "v_med3_i16">;
1197+
defm V_MED3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x251, "v_med3_u16">;
1198+
defm V_MAD_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x253, "v_mad_i16", "V_MAD_I16_gfx9">;
11911199
defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
11921200
defm V_ADD3_U32 : VOP3_Realtriple_gfx11_gfx12<0x255>;
11931201
defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x256>;

llvm/lib/Target/AMDGPU/VOPInstructions.td

+14-2
Original file line numberDiff line numberDiff line change
@@ -1799,10 +1799,22 @@ multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName
17991799
string pseudo_mnemonic = "", bit isSingle = 0> :
18001800
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
18011801

1802+
// Instantiates the GFX11 real encodings for both the `_t16` and `_fake16`
// pseudos of `opName`, sharing one opcode and one assembler mnemonic.
// VOP3_Realtriple_t16_gfx11 takes (op, asmName, opName, ...): the mnemonic
// goes first and the suffixed pseudo name second, matching the gfx12 and
// VOP3Only variants of this multiclass.
multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME,
1803+
string pseudo_mnemonic = "", bit isSingle = 0> {
1804+
defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
1805+
defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
1806+
}
1807+
18021808
multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
18031809
string opName = NAME, string pseudo_mnemonic = "">
18041810
: VOP3_Realtriple_t16_gfx11<op, asmName, opName, pseudo_mnemonic, 1>;
18051811

1812+
// Instantiates the GFX11 real encodings for the `_t16` and `_fake16` pseudos
// of `opName` under a single opcode and mnemonic. Unlike a caller-selectable
// variant, this always passes 1 for the trailing `isSingle` argument of
// VOP3_Realtriple_t16_gfx11.
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName,
1813+
string opName = NAME, string pseudo_mnemonic = ""> {
1814+
defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, 1>;
1815+
defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, 1>;
1816+
}
1817+
18061818
multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
18071819
bit isSingle = 0> :
18081820
VOP3be_Real<GFX11Gen, op, opName, asmName, isSingle>;
@@ -1836,8 +1848,8 @@ multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName
18361848

18371849
// Instantiates the GFX12 real encodings for the `_t16` and `_fake16` pseudos
// of `opName`, both under the same opcode `op` and mnemonic `asmName`.
// The change shown here replaces the `opName#"_t16"` / `opName#"_fake16"`
// defm prefixes with the plain `_t16` / `_fake16` suffixes; the arguments
// forwarded to VOP3_Realtriple_t16_gfx12 are unchanged.
multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, string opName = NAME,
18381850
string pseudo_mnemonic = "", bit isSingle = 0> {
1839-
defm opName#"_t16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
1840-
defm opName#"_fake16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
1851+
defm _t16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
1852+
defm _fake16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
18411853
}
18421854

18431855
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,

0 commit comments

Comments
 (0)