From 72ff900554c77f73bae314ce36a3ac22a3a3f6ca Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Mon, 14 Oct 2024 09:41:00 +0000 Subject: [PATCH] Implement relaxed simd operations on x86 Also contains some fixes for non-avx code paths Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com --- src/jit/Backend.cpp | 8 +- src/jit/ByteCodeParser.cpp | 118 ++++++++++++------- src/jit/Compiler.h | 2 + src/jit/SimdArm32Inl.h | 2 +- src/jit/SimdArm64Inl.h | 2 +- src/jit/SimdX86Inl.h | 234 +++++++++++++++++++++++++++++++++---- 6 files changed, 296 insertions(+), 70 deletions(-) diff --git a/src/jit/Backend.cpp b/src/jit/Backend.cpp index 7fc6ef907..f876a53d7 100644 --- a/src/jit/Backend.cpp +++ b/src/jit/Backend.cpp @@ -1242,6 +1242,10 @@ void JITCompiler::compileFunction(JITFunction* jitFunc, bool isExternal) emitShiftSIMD(m_compiler, item->asInstruction()); break; } + case Instruction::TernarySIMD: { + emitTernarySIMD(m_compiler, item->asInstruction()); + break; + } #endif /* HAS_SIMD */ case Instruction::StackInit: { emitStackInit(m_compiler, item->asInstruction()); @@ -1270,10 +1274,6 @@ void JITCompiler::compileFunction(JITFunction* jitFunc, bool isExternal) break; } #ifdef HAS_SIMD - case ByteCode::V128BitSelectOpcode: { - emitSelectSIMD(m_compiler, item->asInstruction()); - break; - } case ByteCode::I8X16ShuffleOpcode: { emitShuffleSIMD(m_compiler, item->asInstruction()); break; diff --git a/src/jit/ByteCodeParser.cpp b/src/jit/ByteCodeParser.cpp index e8c3ee7aa..a1c0c9abc 100644 --- a/src/jit/ByteCodeParser.cpp +++ b/src/jit/ByteCodeParser.cpp @@ -239,36 +239,37 @@ static bool isFloatGlobal(uint32_t globalIndex, Module* module) OL6(OTAtomicWaitI64, /* SSSDTT */ I32, I64, I64, I32, PTR, I64 | S0) \ OL5(OTAtomicNotify, /* SSDTT */ I32, I32, I32, PTR, I32 | S0) -#define OPERAND_TYPE_LIST_SIMD \ - OL2(OTOp1V128, /* SD */ V128 | NOTMP, V128 | TMP | S0) \ - OL2(OTOpCondV128, /* SD */ V128 | TMP, I32) \ - OL1(OTGlobalGetV128, /* D */ V128) \ - 
OL1(OTGlobalSetV128, /* S */ V128 | NOTMP) \ - OL2(OTSplatI32, /* SD */ I32, V128 | TMP) \ - OL2(OTSplatI64, /* SD */ I64, V128 | TMP) \ - OL2(OTSplatF32, /* SD */ F32 | NOTMP, V128 | TMP) \ - OL2(OTSplatF64, /* SD */ F64 | NOTMP, V128 | TMP) \ - OL2(OTV128ToI32, /* SD */ V128 | TMP, I32) \ - OL4(OTBitSelectV128, /* SSSD */ V128 | TMP, V128 | TMP, V128 | NOTMP, V128 | TMP | S2) \ - OL2(OTExtractLaneI64, /* SD */ V128 | TMP, I64) \ - OL2(OTExtractLaneF32, /* SD */ V128 | TMP, F32 | S0) \ - OL2(OTExtractLaneF64, /* SD */ V128 | TMP, F64 | S0) \ - OL3(OTReplaceLaneI32, /* SSD */ V128 | NOTMP, I32, V128 | TMP | S0) \ - OL3(OTReplaceLaneI64, /* SSD */ V128 | NOTMP, I64, V128 | TMP | S0) \ - OL3(OTReplaceLaneF32, /* SSD */ V128 | NOTMP, F32 | NOTMP, V128 | TMP | S0) \ - OL3(OTReplaceLaneF64, /* SSD */ V128 | NOTMP, F64 | NOTMP, V128 | TMP | S0) \ +#define OPERAND_TYPE_LIST_SIMD \ + OL2(OTOp1V128, /* SD */ V128 | NOTMP, V128 | TMP | S0) \ + OL2(OTOpCondV128, /* SD */ V128 | TMP, I32) \ + OL1(OTGlobalGetV128, /* D */ V128) \ + OL1(OTGlobalSetV128, /* S */ V128 | NOTMP) \ + OL2(OTSplatI32, /* SD */ I32, V128 | TMP) \ + OL2(OTSplatI64, /* SD */ I64, V128 | TMP) \ + OL2(OTSplatF32, /* SD */ F32 | NOTMP, V128 | TMP) \ + OL2(OTSplatF64, /* SD */ F64 | NOTMP, V128 | TMP) \ + OL2(OTV128ToI32, /* SD */ V128 | TMP, I32) \ + OL4(OTOp3V128, /* SSSD */ V128 | TMP, V128 | TMP, V128 | NOTMP, V128 | TMP | S2) \ + OL2(OTExtractLaneI64, /* SD */ V128 | TMP, I64) \ + OL2(OTExtractLaneF32, /* SD */ V128 | TMP, F32 | S0) \ + OL2(OTExtractLaneF64, /* SD */ V128 | TMP, F64 | S0) \ + OL3(OTReplaceLaneI32, /* SSD */ V128 | NOTMP, I32, V128 | TMP | S0) \ + OL3(OTReplaceLaneI64, /* SSD */ V128 | NOTMP, I64, V128 | TMP | S0) \ + OL3(OTReplaceLaneF32, /* SSD */ V128 | NOTMP, F32 | NOTMP, V128 | TMP | S0) \ + OL3(OTReplaceLaneF64, /* SSD */ V128 | NOTMP, F64 | NOTMP, V128 | TMP | S0) \ OL4(OTSelectV128, /* SSSD */ V128, V128, I32, V128 | S0 | S1) #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) 
-#define OPERAND_TYPE_LIST_SIMD_ARCH \ - OL3(OTOp2V128, /* SSD */ V128 | NOTMP, V128 | TMP, V128 | TMP | S0) \ - OL3(OTOp1V128Tmp, /* SDT */ V128 | NOTMP, V128 | TMP | S0, V128) \ - OL4(OTOp2V128Tmp, /* SSDT */ V128 | NOTMP, V128 | TMP, V128 | TMP | S0, V128) \ - OL3(OTOp2V128Rev, /* SSD */ V128 | TMP, V128 | NOTMP, V128 | TMP | S1) \ - OL3(OTShuffleV128, /* SSD */ V128 | NOTMP, V128 | NOTMP, V128 | TMP | S0) \ - OL3(OTPopcntV128, /* SDT */ V128 | NOTMP, V128 | TMP | S0, V128) \ - OL3(OTShiftV128, /* SSD */ V128 | NOTMP, I32, V128 | TMP | S0) \ +#define OPERAND_TYPE_LIST_SIMD_ARCH \ + OL3(OTOp2V128, /* SSD */ V128 | NOTMP, V128 | TMP, V128 | TMP | S0) \ + OL3(OTOp1V128Tmp, /* SDT */ V128 | NOTMP, V128 | TMP | S0, V128) \ + OL4(OTOp2V128Tmp, /* SSDT */ V128 | NOTMP, V128 | TMP, V128 | TMP | S0, V128) \ + OL3(OTOp2V128Rev, /* SSD */ V128 | TMP, V128 | NOTMP, V128 | TMP | S1) \ + OL5(OTOp3DotAddV128, /* SSSDT */ V128 | TMP, V128 | TMP, V128 | NOTMP, V128 | TMP | S2, V128) \ + OL3(OTShuffleV128, /* SSD */ V128 | NOTMP, V128 | NOTMP, V128 | TMP | S0) \ + OL3(OTPopcntV128, /* SDT */ V128 | NOTMP, V128 | TMP | S0, V128) \ + OL3(OTShiftV128, /* SSD */ V128 | NOTMP, I32, V128 | TMP | S0) \ OL4(OTShiftV128Tmp, /* SSDT */ V128 | NOTMP, I32, V128 | TMP | S0, V128) // List of aliases. 
@@ -294,6 +295,7 @@ static bool isFloatGlobal(uint32_t globalIndex, Module* module) #define OTPopcntV128 OTOp1V128 #define OTSwizzleV128 OTOp2V128 #define OTShiftV128Tmp OTShiftV128 +#define OTOp3DotAddV128 OTOp2V128 #elif (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) @@ -312,6 +314,7 @@ static bool isFloatGlobal(uint32_t globalIndex, Module* module) #define OTPMinMaxV128 OTOp2V128 #define OTPopcntV128 OTOp1V128 #define OTShiftV128Tmp OTShiftV128 +#define OTOp3DotAddV128 OTOp2V128 #endif /* SLJIT_CONFIG_ARM */ @@ -387,6 +390,7 @@ enum ParamTypes { ParamSrc2Value, ParamSrc2Dst, ParamSrc3, + ParamSrc3Dst, }; static void compileFunction(JITCompiler* compiler) @@ -1596,6 +1600,7 @@ static void compileFunction(JITCompiler* compiler) case ByteCode::I16X8NarrowI32X4SOpcode: case ByteCode::I16X8NarrowI32X4UOpcode: case ByteCode::I16X8Q15mulrSatSOpcode: + case ByteCode::I16X8RelaxedQ15mulrSOpcode: case ByteCode::I32X4AddOpcode: case ByteCode::I32X4SubOpcode: case ByteCode::I32X4MulOpcode: @@ -1644,7 +1649,8 @@ static void compileFunction(JITCompiler* compiler) case ByteCode::F64X2LeOpcode: case ByteCode::V128AndOpcode: case ByteCode::V128OrOpcode: - case ByteCode::V128XorOpcode: { + case ByteCode::V128XorOpcode: + case ByteCode::I8X16RelaxedSwizzleOpcode: { group = Instruction::BinarySIMD; paramType = ParamTypes::ParamSrc2Dst; requiredInit = OTOp2V128; @@ -1669,7 +1675,11 @@ static void compileFunction(JITCompiler* compiler) case ByteCode::F32X4MaxOpcode: case ByteCode::F32X4MinOpcode: case ByteCode::F64X2MaxOpcode: - case ByteCode::F64X2MinOpcode: { + case ByteCode::F64X2MinOpcode: + case ByteCode::F32X4RelaxedMaxOpcode: + case ByteCode::F32X4RelaxedMinOpcode: + case ByteCode::F64X2RelaxedMaxOpcode: + case ByteCode::F64X2RelaxedMinOpcode: { group = Instruction::BinarySIMD; paramType = ParamTypes::ParamSrc2Dst; #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) @@ -1692,7 +1702,8 @@ static void compileFunction(JITCompiler* compiler) case 
ByteCode::F32X4GeOpcode: case ByteCode::F64X2GtOpcode: case ByteCode::F64X2GeOpcode: - case ByteCode::V128AndnotOpcode: { + case ByteCode::V128AndnotOpcode: + case ByteCode::I16X8DotI8X16I7X16SOpcode: { group = Instruction::BinarySIMD; paramType = ParamTypes::ParamSrc2Dst; requiredInit = OTOp2V128Rev; @@ -1717,8 +1728,11 @@ static void compileFunction(JITCompiler* compiler) case ByteCode::I32X4ExtendLowI16X8UOpcode: case ByteCode::I32X4ExtendHighI16X8UOpcode: case ByteCode::I32X4TruncSatF32X4SOpcode: + case ByteCode::I32X4RelaxedTruncF32X4SOpcode: case ByteCode::I32X4TruncSatF64X2SZeroOpcode: case ByteCode::I32X4TruncSatF64X2UZeroOpcode: + case ByteCode::I32X4RelaxedTruncF64X2SZeroOpcode: + case ByteCode::I32X4RelaxedTruncF64X2UZeroOpcode: case ByteCode::I64X2NegOpcode: case ByteCode::I64X2AbsOpcode: case ByteCode::I64X2ExtendLowI32X4SOpcode: @@ -1759,6 +1773,7 @@ static void compileFunction(JITCompiler* compiler) break; } case ByteCode::I32X4TruncSatF32X4UOpcode: + case ByteCode::I32X4RelaxedTruncF32X4UOpcode: case ByteCode::F32X4ConvertI32X4UOpcode: case ByteCode::F64X2ConvertLowI32X4UOpcode: { group = Instruction::UnarySIMD; @@ -1805,17 +1820,24 @@ static void compileFunction(JITCompiler* compiler) requiredInit = OTPopcntV128; break; } - case ByteCode::V128BitSelectOpcode: { - Instruction* instr = compiler->append(byteCode, Instruction::Any, opcode, 3, 1); - instr->setRequiredRegsDescriptor(OTBitSelectV128); - - V128BitSelect* bitSelect = reinterpret_cast<V128BitSelect*>(byteCode); - Operand* operands = instr->operands(); - - operands[0] = STACK_OFFSET(bitSelect->srcOffsets()[0]); - operands[1] = STACK_OFFSET(bitSelect->srcOffsets()[1]); - operands[2] = STACK_OFFSET(bitSelect->srcOffsets()[2]); - operands[3] = STACK_OFFSET(bitSelect->dstOffset()); + case ByteCode::V128BitSelectOpcode: + case ByteCode::I8X16RelaxedLaneSelectOpcode: + case ByteCode::I16X8RelaxedLaneSelectOpcode: + case ByteCode::I32X4RelaxedLaneSelectOpcode: + case ByteCode::I64X2RelaxedLaneSelectOpcode: + case 
ByteCode::F32X4RelaxedMaddOpcode: + case ByteCode::F32X4RelaxedNmaddOpcode: + case ByteCode::F64X2RelaxedMaddOpcode: + case ByteCode::F64X2RelaxedNmaddOpcode: { + group = Instruction::TernarySIMD; + paramType = ParamTypes::ParamSrc3Dst; + requiredInit = OTOp3V128; + break; + } + case ByteCode::I32X4DotI8X16I7X16AddSOpcode: { + group = Instruction::TernarySIMD; + paramType = ParamTypes::ParamSrc3Dst; + requiredInit = OTOp3DotAddV128; break; } case ByteCode::I8X16ShuffleOpcode: { @@ -2083,6 +2105,22 @@ static void compileFunction(JITCompiler* compiler) operands[2] = STACK_OFFSET(offset3Operation->stackOffset3()); break; } + case ParamSrc3Dst: { + ASSERT(group != Instruction::Any); + + Instruction* instr = compiler->append(byteCode, group, opcode, 3, 1); + instr->addInfo(info); + instr->setRequiredRegsDescriptor(requiredInit); + + ByteCodeOffset4* offset4Operation = reinterpret_cast<ByteCodeOffset4*>(byteCode); + Operand* operands = instr->operands(); + + operands[0] = STACK_OFFSET(offset4Operation->src0Offset()); + operands[1] = STACK_OFFSET(offset4Operation->src1Offset()); + operands[2] = STACK_OFFSET(offset4Operation->src2Offset()); + operands[3] = STACK_OFFSET(offset4Operation->dstOffset()); + break; + } default: { ASSERT(paramType == ParamTypes::NoParam); break; diff --git a/src/jit/Compiler.h b/src/jit/Compiler.h index 9fd1670c8..33002ce2a 100644 --- a/src/jit/Compiler.h +++ b/src/jit/Compiler.h @@ -105,6 +105,8 @@ class InstructionListItem { BitMaskSIMD, // Shift SIMD opcodes (e.g. I8X16SHL) ShiftSIMD, + // Ternary SIMD opcodes (e.g. V128BitSelect) + TernarySIMD, // Special type for initializing values from the stack StackInit, // Atomic memory operations (e.g. 
I32AtomicRmwAdd, I64AtomicRmw16OrU) diff --git a/src/jit/SimdArm32Inl.h b/src/jit/SimdArm32Inl.h index 7a621b7cd..b821925e4 100644 --- a/src/jit/SimdArm32Inl.h +++ b/src/jit/SimdArm32Inl.h @@ -1672,7 +1672,7 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) } } -static void emitSelectSIMD(sljit_compiler* compiler, Instruction* instr) +static void emitTernarySIMD(sljit_compiler* compiler, Instruction* instr) { Operand* operands = instr->operands(); JITArg args[3]; diff --git a/src/jit/SimdArm64Inl.h b/src/jit/SimdArm64Inl.h index b2e740c7b..304290f52 100644 --- a/src/jit/SimdArm64Inl.h +++ b/src/jit/SimdArm64Inl.h @@ -1166,7 +1166,7 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) } } -static void emitSelectSIMD(sljit_compiler* compiler, Instruction* instr) +static void emitTernarySIMD(sljit_compiler* compiler, Instruction* instr) { Operand* operands = instr->operands(); JITArg args[3]; diff --git a/src/jit/SimdX86Inl.h b/src/jit/SimdX86Inl.h index 42f3e69aa..766f7a8da 100644 --- a/src/jit/SimdX86Inl.h +++ b/src/jit/SimdX86Inl.h @@ -42,6 +42,7 @@ enum Type : uint32_t { andnpd = 0x55 | SimdOp::prefix66, andnps = 0x55, blendps = 0x0c | SimdOp::opcode3A | SimdOp::prefix66, + blendvps = 0x4a | SimdOp::opcode3A | SimdOp::prefix66 /* VEX only */, blendvpd = 0x4b | SimdOp::opcode3A | SimdOp::prefix66 /* VEX only */, cmpeqpd = OPCODE_AND_IMM(0xc2, 0) | SimdOp::prefix66, cmpeqps = OPCODE_AND_IMM(0xc2, 0), @@ -91,6 +92,7 @@ enum Type : uint32_t { pandn = 0xdf | SimdOp::prefix66, pavgb = 0xe0 | SimdOp::prefix66, pavgw = 0xe3 | SimdOp::prefix66, + pblendvb = 0x4c | SimdOp::opcode3A | SimdOp::prefix66 /* VEX only */, pblendw = 0x0e | SimdOp::opcode3A | SimdOp::prefix66, pcmpeqb = 0x74 | SimdOp::prefix66, pcmpeqd = 0x76 | SimdOp::prefix66, @@ -745,11 +747,15 @@ static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) break; case ByteCode::I32X4TruncSatF32X4SOpcode: case ByteCode::I32X4TruncSatF32X4UOpcode: + case 
ByteCode::I32X4RelaxedTruncF32X4SOpcode: + case ByteCode::I32X4RelaxedTruncF32X4UOpcode: srcType = SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_FLOAT; dstType = SLJIT_SIMD_ELEM_32; break; case ByteCode::I32X4TruncSatF64X2SZeroOpcode: case ByteCode::I32X4TruncSatF64X2UZeroOpcode: + case ByteCode::I32X4RelaxedTruncF64X2SZeroOpcode: + case ByteCode::I32X4RelaxedTruncF64X2UZeroOpcode: srcType = SLJIT_SIMD_ELEM_64 | SLJIT_SIMD_FLOAT; dstType = SLJIT_SIMD_ELEM_32; break; @@ -903,12 +909,24 @@ static void emitUnarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::I32X4TruncSatF32X4UOpcode: simdEmitTruncSatF32x4U(compiler, dst, args[0].arg, instr->requiredReg(1)); break; + case ByteCode::I32X4RelaxedTruncF32X4SOpcode: + simdEmitSSEOp(compiler, SimdOp::cvttps2dq, dst, args[0].arg); + break; + case ByteCode::I32X4RelaxedTruncF32X4UOpcode: + simdEmitTruncSatF32x4U(compiler, dst, args[0].arg, instr->requiredReg(1)); + break; case ByteCode::I32X4TruncSatF64X2SZeroOpcode: simdEmitTruncSatS(compiler, dst, args[0].arg, false); break; case ByteCode::I32X4TruncSatF64X2UZeroOpcode: simdEmitTruncSatF64x2U(compiler, dst, args[0].arg); break; + case ByteCode::I32X4RelaxedTruncF64X2SZeroOpcode: + simdEmitSSEOp(compiler, SimdOp::cvttpd2dq, dst, args[0].arg); + break; + case ByteCode::I32X4RelaxedTruncF64X2UZeroOpcode: + simdEmitTruncSatF64x2U(compiler, dst, args[0].arg); + break; case ByteCode::I64X2NegOpcode: simdEmitINeg(compiler, 0, SimdOp::psubq, dst, args[0].arg); break; @@ -1409,6 +1427,7 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::I8X16MaxUOpcode: case ByteCode::I8X16AvgrUOpcode: case ByteCode::I8X16SwizzleOpcode: + case ByteCode::I8X16RelaxedSwizzleOpcode: srcType = SLJIT_SIMD_ELEM_8; dstType = SLJIT_SIMD_ELEM_8; break; @@ -1440,9 +1459,13 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::I16X8MaxUOpcode: case ByteCode::I16X8AvgrUOpcode: case ByteCode::I16X8Q15mulrSatSOpcode: + case 
ByteCode::I16X8RelaxedQ15mulrSOpcode: srcType = SLJIT_SIMD_ELEM_16; dstType = SLJIT_SIMD_ELEM_16; break; + case ByteCode::I16X8DotI8X16I7X16SOpcode: + reverseArgs = 1; + FALLTHROUGH; case ByteCode::I16X8ExtmulLowI8X16SOpcode: case ByteCode::I16X8ExtmulHighI8X16SOpcode: case ByteCode::I16X8ExtmulLowI8X16UOpcode: @@ -1518,6 +1541,8 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F32X4PMaxOpcode: case ByteCode::F32X4MaxOpcode: case ByteCode::F32X4MinOpcode: + case ByteCode::F32X4RelaxedMaxOpcode: + case ByteCode::F32X4RelaxedMinOpcode: srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; break; @@ -1537,6 +1562,8 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F64X2PMaxOpcode: case ByteCode::F64X2MaxOpcode: case ByteCode::F64X2MinOpcode: + case ByteCode::F64X2RelaxedMaxOpcode: + case ByteCode::F64X2RelaxedMinOpcode: srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; break; @@ -1562,7 +1589,7 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) sljit_s32 dst = GET_TARGET_REG(args[2].arg, instr->requiredReg(1)); if (!sljit_has_cpu_feature(SLJIT_HAS_AVX) && dst != args[reverseArgs].arg) { - sljit_emit_simd_mov(compiler, srcType, dst, args[reverseArgs].arg, 0); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | srcType, dst, args[reverseArgs].arg, 0); args[reverseArgs].arg = dst; } @@ -1637,6 +1664,9 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::I8X16SwizzleOpcode: simdEmitSwizzle(compiler, dst, args[0].arg, args[1].arg); break; + case ByteCode::I8X16RelaxedSwizzleOpcode: + simdEmitOp(compiler, SimdOp::pshufb, dst, args[0].arg, args[1].arg); + break; case ByteCode::I8X16NarrowI16X8SOpcode: simdEmitOp(compiler, SimdOp::packsswb, dst, args[0].arg, args[1].arg); break; @@ -1716,6 +1746,12 @@ static void 
emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::I16X8Q15mulrSatSOpcode: simdEmitQ15mulrSatS(compiler, dst, args[0].arg, args[1].arg); break; + case ByteCode::I16X8RelaxedQ15mulrSOpcode: + simdEmitOp(compiler, SimdOp::pmulhrsw, dst, args[0].arg, args[1].arg); + break; + case ByteCode::I16X8DotI8X16I7X16SOpcode: + simdEmitOp(compiler, SimdOp::pmaddubsw, dst, args[1].arg, args[0].arg); + break; case ByteCode::I16X8ExtmulLowI8X16SOpcode: simdEmitI16X8Extmul(compiler, SimdOp::isSigned, dst, args[0].arg, args[1].arg); break; @@ -1886,6 +1922,12 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F32X4MinOpcode: simdEmitFloatMinMax(compiler, SimdOp::minps, dst, args[0].arg, args[1].arg); break; + case ByteCode::F32X4RelaxedMaxOpcode: + simdEmitOp(compiler, SimdOp::maxps, dst, args[0].arg, args[1].arg); + break; + case ByteCode::F32X4RelaxedMinOpcode: + simdEmitOp(compiler, SimdOp::minps, dst, args[0].arg, args[1].arg); + break; case ByteCode::F64X2AddOpcode: simdEmitOp(compiler, SimdOp::addpd, dst, args[0].arg, args[1].arg); break; @@ -1928,6 +1970,12 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) case ByteCode::F64X2MinOpcode: simdEmitFloatMinMax(compiler, SimdOp::minpd, dst, args[0].arg, args[1].arg); break; + case ByteCode::F64X2RelaxedMaxOpcode: + simdEmitOp(compiler, SimdOp::maxpd, dst, args[0].arg, args[1].arg); + break; + case ByteCode::F64X2RelaxedMinOpcode: + simdEmitOp(compiler, SimdOp::minpd, dst, args[0].arg, args[1].arg); + break; case ByteCode::V128AndOpcode: simdEmitOp(compiler, SimdOp::pand, dst, args[0].arg, args[1].arg); break; @@ -1938,11 +1986,16 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) simdEmitOp(compiler, SimdOp::pxor, dst, args[0].arg, args[1].arg); break; case ByteCode::V128AndnotOpcode: - if (dst != args[0].arg) { - simdEmitVexOp(compiler, SimdOp::pandn, dst, args[1].arg, args[0].arg); - } else { - 
simdEmitSSEOp(compiler, SimdOp::pandn, dst, args[1].arg); + if (dst != args[1].arg) { + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, SimdOp::pandn, dst, args[1].arg, args[0].arg); + break; + } + + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | srcType, dst, args[1].arg, 0); } + + simdEmitSSEOp(compiler, SimdOp::pandn, dst, args[0].arg); break; default: ASSERT_NOT_REACHED(); @@ -1954,37 +2007,170 @@ static void emitBinarySIMD(sljit_compiler* compiler, Instruction* instr) } } -static void emitSelectSIMD(sljit_compiler* compiler, Instruction* instr) +static void simdEmitShuffle(sljit_compiler* compiler, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, sljit_s32 ro) { - Operand* operands = instr->operands(); sljit_s32 tmp = SLJIT_TMP_DEST_FREG; - JITArg args[4]; - simdOperandToArg(compiler, operands, args[0], SLJIT_SIMD_ELEM_128, instr->requiredReg(0)); - simdOperandToArg(compiler, operands + 1, args[1], SLJIT_SIMD_ELEM_128, instr->requiredReg(1)); - simdOperandToArg(compiler, operands + 2, args[2], SLJIT_SIMD_ELEM_128, instr->requiredReg(2)); + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, SimdOp::pandn, tmp, ro, rm); + simdEmitVexOp(compiler, SimdOp::pand, rd, rn, ro); + } else { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, tmp, ro, 0); + simdEmitSSEOp(compiler, SimdOp::pandn, tmp, rm); + + ASSERT(rd == ro); + simdEmitSSEOp(compiler, SimdOp::pand, rd, rn); + } - args[3].set(operands + 3); - sljit_s32 dst = GET_TARGET_REG(args[3].arg, instr->requiredReg(2)); + simdEmitSSEOp(compiler, SimdOp::por, rd, tmp); +} + +static void simdEmitDotAdd(sljit_compiler* compiler, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, sljit_s32 ro, sljit_s32 tmp1) +{ + sljit_s32 tmp2 = SLJIT_TMP_DEST_FREG; if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { - simdEmitVexOp(compiler, SimdOp::pandn, tmp, args[2].arg, args[1].arg); - simdEmitVexOp(compiler, SimdOp::pand, dst, args[0].arg, args[2].arg); + simdEmitVexOp(compiler, SimdOp::pmaddubsw, tmp1, rm, 
rn); } else { - sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, tmp, args[2].arg, 0); - simdEmitSSEOp(compiler, SimdOp::pandn, tmp, args[1].arg); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, tmp1, rm, 0); + simdEmitSSEOp(compiler, SimdOp::pmaddubsw, tmp1, rn); + } - if (dst != args[2].arg) { - sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, dst, args[2].arg, 0); - } + sljit_emit_simd_replicate(compiler, SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_16, tmp2, SLJIT_IMM, 1); + simdEmitSSEOp(compiler, SimdOp::pmaddwd, tmp1, tmp2); + + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, SimdOp::paddd, rd, ro, tmp1); + } else { + ASSERT(rd == ro); + simdEmitSSEOp(compiler, SimdOp::paddd, rd, tmp1); + } +} + +static void simdEmitLaneSelect(sljit_compiler* compiler, uint32_t opcode, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, sljit_s32 ro) +{ + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, opcode, rd, rm, rn); + + // Append a /is4 argument. + uint8_t reg = sljit_get_register_index(SLJIT_SIMD_REG_128, ro) << 4; + sljit_emit_op_custom(compiler, &reg, 1); + return; + } + + simdEmitShuffle(compiler, rd, rn, rm, ro); +} + +static void simdEmitMadd(sljit_compiler* compiler, uint32_t mulOp, uint32_t addOp, sljit_s32 rd, sljit_s32 rn, sljit_s32 rm, sljit_s32 ro) +{ + sljit_s32 tmp = SLJIT_TMP_DEST_FREG; - simdEmitSSEOp(compiler, SimdOp::pand, dst, args[0].arg); + if (sljit_has_cpu_feature(SLJIT_HAS_AVX)) { + simdEmitVexOp(compiler, mulOp, tmp, rn, rm); + } else { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128, tmp, rn, 0); + simdEmitSSEOp(compiler, mulOp, tmp, rm); } - simdEmitSSEOp(compiler, SimdOp::por, dst, tmp); + simdEmitOp(compiler, addOp, rd, ro, tmp); +} + +static void emitTernarySIMD(sljit_compiler* compiler, Instruction* instr) +{ + Operand* operands = instr->operands(); + JITArg args[4]; + + sljit_s32 srcType = SLJIT_SIMD_ELEM_128; + sljit_s32 dstType = SLJIT_SIMD_ELEM_128; + + switch (instr->opcode()) { + case 
ByteCode::V128BitSelectOpcode: + srcType = SLJIT_SIMD_ELEM_128; + dstType = SLJIT_SIMD_ELEM_128; + break; + case ByteCode::I8X16RelaxedLaneSelectOpcode: + srcType = SLJIT_SIMD_ELEM_8; + dstType = SLJIT_SIMD_ELEM_8; + break; + case ByteCode::I16X8RelaxedLaneSelectOpcode: + srcType = SLJIT_SIMD_ELEM_16; + dstType = SLJIT_SIMD_ELEM_16; + break; + case ByteCode::I32X4RelaxedLaneSelectOpcode: + srcType = SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::I64X2RelaxedLaneSelectOpcode: + srcType = SLJIT_SIMD_ELEM_64; + dstType = SLJIT_SIMD_ELEM_64; + break; + case ByteCode::I32X4DotI8X16I7X16AddSOpcode: + srcType = SLJIT_SIMD_ELEM_8; + dstType = SLJIT_SIMD_ELEM_32; + break; + case ByteCode::F32X4RelaxedMaddOpcode: + case ByteCode::F32X4RelaxedNmaddOpcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_32; + break; + case ByteCode::F64X2RelaxedMaddOpcode: + case ByteCode::F64X2RelaxedNmaddOpcode: + srcType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + dstType = SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + break; + default: + ASSERT_NOT_REACHED(); + break; + } + + simdOperandToArg(compiler, operands, args[0], srcType, instr->requiredReg(0)); + simdOperandToArg(compiler, operands + 1, args[1], srcType, instr->requiredReg(1)); + simdOperandToArg(compiler, operands + 2, args[2], dstType, instr->requiredReg(2)); + + args[3].set(operands + 3); + sljit_s32 dst = GET_TARGET_REG(args[3].arg, instr->requiredReg(2)); + + if (!sljit_has_cpu_feature(SLJIT_HAS_AVX) && dst != args[2].arg) { + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | srcType, dst, args[2].arg, 0); + args[2].arg = dst; + } + + switch (instr->opcode()) { + case ByteCode::V128BitSelectOpcode: + simdEmitShuffle(compiler, dst, args[0].arg, args[1].arg, args[2].arg); + break; + case ByteCode::I8X16RelaxedLaneSelectOpcode: + case ByteCode::I16X8RelaxedLaneSelectOpcode: + simdEmitLaneSelect(compiler, SimdOp::pblendvb, dst, args[0].arg, args[1].arg, 
args[2].arg); + break; + case ByteCode::I32X4RelaxedLaneSelectOpcode: + simdEmitLaneSelect(compiler, SimdOp::blendvps, dst, args[0].arg, args[1].arg, args[2].arg); + break; + case ByteCode::I64X2RelaxedLaneSelectOpcode: + simdEmitLaneSelect(compiler, SimdOp::blendvpd, dst, args[0].arg, args[1].arg, args[2].arg); + break; + case ByteCode::I32X4DotI8X16I7X16AddSOpcode: + simdEmitDotAdd(compiler, dst, args[0].arg, args[1].arg, args[2].arg, instr->requiredReg(3)); + break; + case ByteCode::F32X4RelaxedMaddOpcode: + simdEmitMadd(compiler, SimdOp::mulps, SimdOp::addps, dst, args[0].arg, args[1].arg, args[2].arg); + break; + case ByteCode::F32X4RelaxedNmaddOpcode: + simdEmitMadd(compiler, SimdOp::mulps, SimdOp::subps, dst, args[0].arg, args[1].arg, args[2].arg); + break; + case ByteCode::F64X2RelaxedMaddOpcode: + simdEmitMadd(compiler, SimdOp::mulpd, SimdOp::addpd, dst, args[0].arg, args[1].arg, args[2].arg); + break; + case ByteCode::F64X2RelaxedNmaddOpcode: + simdEmitMadd(compiler, SimdOp::mulpd, SimdOp::subpd, dst, args[0].arg, args[1].arg, args[2].arg); + break; + default: + ASSERT_NOT_REACHED(); + break; + } if (SLJIT_IS_MEM(args[3].arg)) { - sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128, dst, args[3].arg, args[3].argw); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | dstType, dst, args[3].arg, args[3].argw); } } @@ -2190,7 +2376,7 @@ static void emitShiftSIMD(sljit_compiler* compiler, Instruction* instr) } if (!sljit_has_cpu_feature(SLJIT_HAS_AVX) && dst != args[0].arg) { - sljit_emit_simd_mov(compiler, type, dst, args[0].arg, 0); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_REG_128 | type, dst, args[0].arg, 0); args[0].arg = dst; }