diff --git a/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_add_int32.h b/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_add_int32.h index 2f01b92..6016c1d 100644 --- a/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_add_int32.h +++ b/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_add_int32.h @@ -48,8 +48,6 @@ inline void _add_int32_(const uint dst_offset) { } TTI_SFPIADD(0 /*imm*/, 1 /*lreg_c*/, 0 /*lreg_dest*/, 4 /*imod*/); - // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result - TTI_NOP; // LREG_0 -> dest as int32 if constexpr (SIGN_MAGNITUDE_FORMAT) { diff --git a/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_sub_int32.h b/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_sub_int32.h index c739674..c9a510f 100644 --- a/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_sub_int32.h +++ b/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_sub_int32.h @@ -49,8 +49,6 @@ inline void _sub_int32_(const uint dst_offset) { // Set instruction modifier to 6 to get B's 2's complement TTI_SFPIADD(0 /*imm*/, 1 /*lreg_c*/, 0 /*lreg_dest*/, 6 /*imod*/); - // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result - TTI_NOP; // LREG_0 -> dest as int32 if constexpr (SIGN_MAGNITUDE_FORMAT) { diff --git a/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_add_int32.h b/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_add_int32.h index 195df79..410d5fc 100644 --- a/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_add_int32.h +++ b/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_add_int32.h @@ -33,8 +33,6 @@ inline void _add_int32_(const uint dst_offset) { // operand B - int32 TT_SFPLOAD(1, sfpload_instr_mod, 3, dst_offset * 64); TTI_SFPIADD(0, 1, 0, 4); - // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result - TTI_NOP; // LREG_0 -> dest as int32 TTI_SFPSTORE(0, sfpload_instr_mod, 3, 0); dst_reg++; diff --git a/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_sub_int32.h b/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_sub_int32.h index 8921011..b1262a7 100644 --- a/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_sub_int32.h +++ b/tt_llk_wormhole_b0/common/inc/sfpu/ckernel_sfpu_sub_int32.h @@ -34,8 +34,6 @@ inline void _sub_int32_(const uint dst_offset) { TT_SFPLOAD(0 /*lreg*/, sfpload_instr_mod, 3 /*addr_mode*/, dst_offset * 64 /*dest*/); // Use 6 as imod to convert operand B to 2's complement TTI_SFPIADD(0 /*imm*/, 1 /*lreg*/, 0 /*ldest*/, 6 /*imod*/); - // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result - TTI_NOP; // LREG_0 -> dest as int32 TTI_SFPSTORE(0 /*lreg_ind*/, sfpload_instr_mod, 3 /*addr_mode*/, 0 /*dest*/); dst_reg++;