-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[AMDGPU][True16][MC] support more VOP3 inst in true16/fake16 format #113603
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][MC] support more VOP3 inst in true16/fake16 format #113603
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-mc Author: Brox Chen (broxigarchen) Changes: Support true16 and fake16 format for more VOP3 instructions in MC. This patch updates the true16 and fake16 vop_profile for the following instructions and updates the asm/dasm tests: Patch is 1.30 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113603.diff 16 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 89a2eb4f18946b..57b42683679fbb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5479,8 +5479,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
- case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
- case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
+ case AMDGPU::S_MINIMUM_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
+ : AMDGPU::V_MINIMUM_F16_fake16_e64;
+ case AMDGPU::S_MAXIMUM_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
+ : AMDGPU::V_MAXIMUM_F16_fake16_e64;
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
@@ -7393,9 +7397,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
case AMDGPU::S_MINIMUM_F32:
- case AMDGPU::S_MAXIMUM_F32:
- case AMDGPU::S_MINIMUM_F16:
- case AMDGPU::S_MAXIMUM_F16: {
+ case AMDGPU::S_MAXIMUM_F32: {
const DebugLoc &DL = Inst.getDebugLoc();
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
@@ -7412,6 +7414,28 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
}
+ case AMDGPU::S_MINIMUM_F16:
+ case AMDGPU::S_MAXIMUM_F16: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register NewDst;
+ if (ST.useRealTrue16Insts())
+ NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ else
+ NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(1))
+ .addImm(0) // src1_modifiers
+ .add(Inst.getOperand(2))
+ .addImm(0) // clamp
+ .addImm(0) // omod
+ .addImm(0); // opsel0
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ legalizeOperands(*NewInstr, MDT);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
+ return;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index faa0b6d6c3f506..9659c38b76316b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3665,7 +3665,10 @@ multiclass Int16Med3Pat<Instruction med3Inst,
defm : FPMed3Pat<f32, V_MED3_F32_e64>;
let SubtargetPredicate = HasMed3_16 in {
+let True16Predicate = NotHasTrue16BitInsts in
defm : FPMed3Pat<f16, V_MED3_F16_e64>;
+let True16Predicate = UseFakeTrue16Insts in
+defm : FPMed3Pat<f16, V_MED3_F16_fake16_e64>;
}
class
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34ecdb56e8689d..7b09945222da2b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -167,8 +167,8 @@ defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>;
-defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>;
-defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>;
+defm V_MINIMUM_F16 : VOP3Inst_t16 <"v_minimum_f16", VOP_F16_F16_F16, DivergentBinFrag<fminimum>>;
+defm V_MAXIMUM_F16 : VOP3Inst_t16 <"v_maximum_f16", VOP_F16_F16_F16, DivergentBinFrag<fmaximum>>;
let SchedRW = [WriteDoubleAdd] in {
defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
@@ -368,8 +368,8 @@ let SubtargetPredicate = isGFX9Only, FPDPRounding = 1 in {
} // End SubtargetPredicate = isGFX9Only, FPDPRounding = 1
let SubtargetPredicate = isGFX9Plus in {
-defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
-defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
+defm V_MAD_U16_gfx9 : VOP3Inst_t16 <"v_mad_u16_gfx9", VOP_I16_I16_I16_I16>;
+defm V_MAD_I16_gfx9 : VOP3Inst_t16 <"v_mad_i16_gfx9", VOP_I16_I16_I16_I16>;
let OtherPredicates = [isNotGFX90APlus] in
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9Plus
@@ -434,16 +434,24 @@ defm: Ternary_i16_Pats<imad, V_MAD_U16_e64>;
} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
+multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst> {
+ def : GCNPat <
+ (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+ (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+ >;
+}
-class Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst> : GCNPat <
- (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
- (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
->;
-
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-def: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
-} // End Predicates = [Has16BitInsts, isGFX10Plus]
+let Predicates = [UseRealTrue16Insts] in {
+ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_t16_e64>;
+ /*defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_t16_e64>;*/
+} // End Predicates = [UseRealTrue16Insts]
+let Predicates = [UseFakeTrue16Insts] in {
+ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_fake16_e64>;
+} // End Predicates = [UseFakeTrue16Insts]
+let Predicates = [Has16BitInsts, NotHasTrue16BitInsts, isGFX10Plus] in {
+ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
+} // End Predicates = [Has16BitInsts, NotHasTrue16BitInsts, isGFX10Plus]
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
(ops node:$x, node:$y, node:$z),
@@ -613,17 +621,17 @@ let isCommutable = 1, isReMaterializable = 1 in {
} // End isCommutable = 1, isReMaterializable = 1
// TODO src0 contains the opsel bit for dst, so if we commute, need to mask and swap this
// to the new src0.
-defm V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
-defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
-defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
+defm V_MED3_F16 : VOP3Inst_t16 <"v_med3_f16", VOP_F16_F16_F16_F16, AMDGPUfmed3>;
+defm V_MED3_I16 : VOP3Inst_t16 <"v_med3_i16", VOP_I16_I16_I16_I16, AMDGPUsmed3>;
+defm V_MED3_U16 : VOP3Inst_t16 <"v_med3_u16", VOP_I16_I16_I16_I16, AMDGPUumed3>;
-defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
-defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
-defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
+defm V_MIN3_F16 : VOP3Inst_t16 <"v_min3_f16", VOP_F16_F16_F16_F16, AMDGPUfmin3>;
+defm V_MIN3_I16 : VOP3Inst_t16 <"v_min3_i16", VOP_I16_I16_I16_I16, AMDGPUsmin3>;
+defm V_MIN3_U16 : VOP3Inst_t16 <"v_min3_u16", VOP_I16_I16_I16_I16, AMDGPUumin3>;
-defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
-defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
-defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
+defm V_MAX3_F16 : VOP3Inst_t16 <"v_max3_f16", VOP_F16_F16_F16_F16, AMDGPUfmax3>;
+defm V_MAX3_I16 : VOP3Inst_t16 <"v_max3_i16", VOP_I16_I16_I16_I16, AMDGPUsmax3>;
+defm V_MAX3_U16 : VOP3Inst_t16 <"v_max3_u16", VOP_I16_I16_I16_I16, AMDGPUumax3>;
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
@@ -1056,7 +1064,7 @@ defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x230>;
defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">;
-defm V_MED3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x232, "V_MED3_F16", "v_med3_num_f16">;
+defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
@@ -1081,8 +1089,8 @@ defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12<0x342>;
defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>;
defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>;
-defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>;
-defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
+defm V_MINIMUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x367, "v_minimum_f16">;
+defm V_MAXIMUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x368, "v_maximum_f16">;
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
@@ -1172,22 +1180,22 @@ defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23a>;
defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23b>;
defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11_gfx12<0x23d>;
defm V_XOR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x240>;
-defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x241, "V_MAD_U16_gfx9", "v_mad_u16">;
+defm V_MAD_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x241, "v_mad_u16", "V_MAD_U16_gfx9">;
defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>;
defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>;
defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>;
defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>;
defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
-defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>;
-defm V_MIN3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24a>;
-defm V_MIN3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24b>;
-defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>;
-defm V_MAX3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24d>;
-defm V_MAX3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24e>;
-defm V_MED3_F16 : VOP3_Realtriple_gfx11<0x24f>;
-defm V_MED3_I16 : VOP3_Realtriple_gfx11_gfx12<0x250>;
-defm V_MED3_U16 : VOP3_Realtriple_gfx11_gfx12<0x251>;
-defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x253, "V_MAD_I16_gfx9", "v_mad_i16">;
+defm V_MIN3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x249, "v_min3_f16">;
+defm V_MIN3_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24a, "v_min3_i16">;
+defm V_MIN3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x24b, "v_min3_u16">;
+defm V_MAX3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24c, "v_max3_f16">;
+defm V_MAX3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24d, "v_max3_i16">;
+defm V_MAX3_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x24e, "v_max3_u16">;
+defm V_MED3_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx11<0x24f, "v_med3_f16">;
+defm V_MED3_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x250, "v_med3_i16">;
+defm V_MED3_U16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x251, "v_med3_u16">;
+defm V_MAD_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x253, "v_mad_i16", "V_MAD_I16_gfx9">;
defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
defm V_ADD3_U32 : VOP3_Realtriple_gfx11_gfx12<0x255>;
defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x256>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index aab5dc7465d938..65f3b0639ce8fa 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1799,10 +1799,22 @@ multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName
string pseudo_mnemonic = "", bit isSingle = 0> :
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME,
+ string pseudo_mnemonic = "", bit isSingle = 0> {
+ defm _t16: VOP3_Realtriple_t16_gfx11<op, opName#"_t16", asmName, pseudo_mnemonic, isSingle>;
+ defm _fake16: VOP3_Realtriple_t16_gfx11<op, opName#"_fake16", asmName, pseudo_mnemonic, isSingle>;
+}
+
multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
string opName = NAME, string pseudo_mnemonic = "">
: VOP3_Realtriple_t16_gfx11<op, asmName, opName, pseudo_mnemonic, 1>;
+multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName,
+ string opName = NAME, string pseudo_mnemonic = ""> {
+ defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, 1>;
+ defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, 1>;
+}
+
multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
bit isSingle = 0> :
VOP3be_Real<GFX11Gen, op, opName, asmName, isSingle>;
@@ -1836,8 +1848,8 @@ multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName
multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, string opName = NAME,
string pseudo_mnemonic = "", bit isSingle = 0> {
- defm opName#"_t16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
- defm opName#"_fake16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
+ defm _t16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+ defm _fake16:VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
}
multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 210d55898367d8..d1e4787f41560c 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2824,50 +2824,62 @@ v_lshrrev_b64 v[5:6], src_scc, src_scc
v_lshrrev_b64 v[254:255], 0xaf123456, 0.5
// GFX11: encoding: [0xfe,0x00,0x3d,0xd7,0xff,0xe0,0x01,0x00,0x56,0x34,0x12,0xaf]
-v_mad_i16 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x01,0x05,0x0e,0x00]
+v_mad_i16 v5.l, v1.l, v2.l, s3
+// GFX11: [0x05,0x00,0x53,0xd6,0x01,0x05,0x0e,0x00]
-v_mad_i16 v5, v255, s2, s105
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0xff,0x05,0xa4,0x01]
+v_mad_i16 v5.l, v255.h, s2, s105
+// GFX11: [0x05,0x08,0x53,0xd6,0xff,0x05,0xa4,0x01]
-v_mad_i16 v5, s1, v255, exec_hi
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x01,0xfe,0xff,0x01]
+v_mad_i16 v5.l, s1, v255.h, exec_hi
+// GFX11: [0x05,0x10,0x53,0xd6,0x01,0xfe,0xff,0x01]
-v_mad_i16 v5, s105, s105, exec_lo
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x69,0xd2,0xf8,0x01]
+v_mad_i16 v5.l, s105, s105, exec_lo
+// GFX11: [0x05,0x00,0x53,0xd6,0x69,0xd2,0xf8,0x01]
-v_mad_i16 v5, vcc_lo, ttmp15, v3
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x6a,0xf6,0x0c,0x04]
+v_mad_i16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: [0x05,0x00,0x53,0xd6,0x6a,0xf6,0x0c,0x04]
-v_mad_i16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_mad_i16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x53,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
-v_mad_i16 v5, ttmp15, src_scc, ttmp15
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x7b,0xfa,0xed,0x01]
+v_mad_i16 v5.l, ttmp15, src_scc, ttmp15
+// GFX11: [0x05,0x00,0x53,0xd6,0x7b,0xfa,0xed,0x01]
-v_mad_i16 v5, m0, 0.5, m0
+v_mad_i16 v5.l, m0, 0.5, m0
// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x7d,0xe0,0xf5,0x01]
-v_mad_i16 v5, exec_lo, -1, vcc_hi
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x7e,0x82,0xad,0x01]
+v_mad_i16 v5.l, exec_lo, -1, vcc_hi
+// GFX11: [0x05,0x00,0x53,0xd6,0x7e,0x82,0xad,0x01]
-v_mad_i16 v5, exec_hi, null, vcc_lo op_sel:[1,1,1,1]
-// GFX11: encoding: [0x05,0x78,0x53,0xd6,0x7f,0xf8,0xa8,0x01]
+v_mad_i16 v5.l, exec_hi, null, vcc_lo
+// GFX11: [0x05,0x00,0x53,0xd6,0x7f,0xf8,0xa8,0x01]
-v_mad_i16 v5, null, exec_lo, 0xfe0b op_sel:[0,0,0,0]
-// GFX11: encoding: [0x05,0x00,0x53,0xd6,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00]
+v_mad_i16 v5.l, null, exec_lo, 0xfe0b
+// GFX11: [0x05,0x00,0x53,0xd6,0x7c,0xfc,0xfc,0x03,0x0b,0xfe,0x00,0x00]
-v_mad_i16 v5, -1, exec_hi, src_scc op_sel:[1,0,0,0]
-// GFX11: encoding: [0x05,0x08,0x53,0xd6,0xc1,0xfe,0xf4,0x03]
+v_mad_i16 v5.l, -1, exec_hi, src_scc
+// GFX11: [0x05,0x00,0x53,0xd6,0xc1,0xfe,0xf4,0x03]
-v_mad_i16 v5, 0.5, m0, 0.5 op_sel:[0,1,0,0]
+v_mad_i16 v5.l, 0.5, m0, 0.5 op_sel:[0,1,0,0]
// GFX11: encoding: [0x05,0x10,0x53,0xd6,0xf0,0xfa,0xc0,0x03]
-v_mad_i16 v5, src_scc, vcc_lo, -1 op_sel:[0,0,1,0]
-// GFX11: encoding: [0x05,0x20,0x53,0xd6,0xfd,0xd4,0x04,0x03]
+v_mad_i16 v5.l, src_scc, vcc_lo, -1
+// GFX11: [0x05,0x00,0x53,0xd6,0xfd,0xd4,0x04,0x03]
-v_mad_i16 v255, 0xfe0b, vcc_hi, null op_sel:[0,0,0,1] clamp
-// GFX11: encoding: [0xff,0xc0,0x53,0xd6,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00]
+v_mad_i16 v255.h, 0xfe0b, vcc_hi, null clamp
+// GFX11: [0xff,0xc0,0x53,0xd6,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00]
+
+v_mad_i16 v5.l, v255.h, s2, s105
+// GFX11: [0x05,0x08,0x53,0xd6,0xff,0x05,0xa4,0x01]
+
+v_mad_i16 v5.l, s1, v255.h, exec_hi
+// GFX11: [0x05,0x10,0x53,0xd6,0x01,0xfe,0xff,0x01]
+
+v_mad_i16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x53,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_mad_i16 v255.h, 0xfe0b, vcc_hi, null clamp
+// GFX11: [0xff,0xc0,0x53,0xd6,0xff,0xd6,0xf0,0x01,0x0b,0xfe,0x00,0x00]
v_mad_i32_i16 v5, v1, v2, v3
// GFX11: encoding: [0x05,0x00,0x5a,0xd6,0x01,0x05,0x0e,0x04]
@@ -3034,50 +3046,62 @@ v_mad_i64_i32 v[5:6], ttmp[14:15], src_scc, vcc_lo, src_scc
v_mad_i64_i32 v[254:255], null, 0xaf123456, vcc_hi, 0.5 clamp
// GFX11: encoding: [0xfe,0xfc,0xff,0xd6,0xff,0xd6,0xc0,0x03,0x56,0x34,0x12,0xaf]
-v_mad_u16 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x41,0xd6,0x01,0x05,0x0e,0x00]
+v_mad_u16 v5.l, v1.l, v2.l, s3
+// GFX11: [0x05,0x00,0x41,0xd6,0x01,0x05,0x0e,0x00]
-v_mad_u16 v5, v255, s2, s105
-// GFX11: encoding: [0x05,0x00,0x41,0xd6,0xff,0x05,0xa4,0x01]
+v_mad_u16 v5.l, v255.h, s2, s105
+// GFX11: [0x05,0x08,0x41,0xd6,0xff,0x05,0xa4,0x01]
-v_mad_u16 v5, s1, v255, exec_hi
-// GFX11: encoding: [0x05,0x00,0x41,0xd6,0x01,0xfe,0xff,0x01]
+v_mad_u16 v5.l, s1, v255.h, exec_hi
+// GFX11: [0x05,0...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
b661f1c
to
4e66e6a
Compare
I think the op_sel test lines might be missing for these instructions. I guess the op_sel test lines downstream are not complete? @Sisyph, can you help confirm? Thanks! |
4e66e6a
to
b545827
Compare
Updated GFX11 asm test with op_sel testlines. Now ready for review! |
3775423
to
e37b111
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Functional changes LGTM, but the tests still seem to need syncing with downstream first.
f8cb9f3
to
4b1545a
Compare
4b1545a
to
58000b4
Compare
Sorry, there was a merge mistake that invited a lot of reviewers to this patch. I have removed them and corrected it. |
@Sisyph as discussed offline, removed v_maximum_f16 and v_minimum_f16 from this patch |
Patch #113603 replaced `V_MED3_I/U16` with `V_MED3_I/U16_fake16` for post-GFX11 targets, but it failed to update the CodeGen pattern. This patch updates and corrects the CodeGen pattern.
# W64-REAL16: v_mad_i16_e64_dpp v255.h, v255.l, v255.l, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x53,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] | ||
# W64-FAKE16: v_mad_i16_e64_dpp v255, v255, v255, src_scc op_sel:[0,0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x53,0xd6,0xea,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] | ||
|
||
0x05,0x78,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This duplicates the case at line 2753. Seems there are other duplicates as well.
V_MAX3/MIN3_NUM_F16 are the GFX12 aliases of the GFX11 V_MAX3/MIN3_F16 instructions, and they should be updated together. This fixes a bug introduced in #113603 where only V_MAX3/MIN3_F16 were replaced in true16 format. Also adds GFX12 run lines for the CodeGen test.
…#120600) Patch llvm/llvm-project#113603 replaced `V_MED3_I/U16` with `V_MED3_I/U16_fake16` for post-GFX11 targets, but it failed to update the CodeGen pattern. This patch updates and corrects the CodeGen pattern.
V_MAX3/MIN3_NUM_F16 are the GFX12 aliases of the GFX11 V_MAX3/MIN3_F16 instructions, and they should be updated together. This fixes a bug introduced in llvm/llvm-project#113603 where only V_MAX3/MIN3_F16 were replaced in true16 format. Also adds GFX12 run lines for the CodeGen test.
Support true16 and fake16 format for more VOP3 instructions in MC
This patch updates the true16 and fake16 vop_profile for the following instructions and updates the asm/dasm tests:
v_mad_u16
v_mad_i16
v_med3_f16
v_med3_i16
v_med3_u16
v_max3_f16
v_max3_i16
v_max3_u16
v_min3_f16
v_min3_i16
v_min3_u16
v_med3_num_f16