Skip to content

Commit a1d4566

Browse files
authored
[Snippets][CPU] Moved N_tail processing to the end in BrgemmCopyBKernel (#28664)
### Details: - *Performance experiments (see the ticket referenced below) show that `N_tail` processing should be performed at the end of `BrgemmCopyBKernel`. This PR moves the tail processing from the beginning to the end of the kernel.* ### Tickets: - *CVS-161315*
1 parent a7f45ba commit a1d4566

File tree

6 files changed

+18
-33
lines changed

6 files changed

+18
-33
lines changed

src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::Expres
258258
OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config");
259259
// In case of data repacking LDB is chosen in accordance with repacking buffer size
260260
if (with_repacking(brgemm_node->get_type())) {
261-
LDB = DIM_CAST(brgemm_utils::repacking::compute_LDB(LDB, brgemm_node->get_input_element_type(1)));
261+
LDB = DIM_CAST(brgemm_utils::repacking::compute_repacked_n_dim(LDB, brgemm_node->get_input_element_type(1)));
262262
}
263263

264264
config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta);

src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -227,21 +227,13 @@ void BrgemmCopyBKernel::generate() {
227227
size_t start_out = 0;
228228
size_t start_comp = 0;
229229

230-
auto add_ptr_increments = [&](size_t current_N) {
230+
for (size_t nb = 0; nb < div_up(N_blk, wei_N_blk); nb++) {
231+
const auto current_N = N_blk - nb * wei_N_blk < wei_N_blk ? wei_N_tail : wei_N_blk;
232+
emit_brgemm_copy_b_kernel_call(current_N, K, start_in, start_out, start_comp);
233+
231234
start_in += is_transpose ? K * current_N * wei_data_size : current_N * wei_data_size;
232235
start_out += current_N * vnni_factor * wei_data_size;
233236
start_comp += is_with_comp ? current_N * sizeof(int32_t) : 0;
234-
};
235-
236-
// OneDNN requires tail handling before main iterations
237-
if (wei_N_tail != 0) {
238-
emit_brgemm_copy_b_kernel_call(wei_N_tail, K, start_in, start_out, start_comp);
239-
add_ptr_increments(wei_N_tail);
240-
}
241-
242-
for (auto nb = wei_N_tail; nb < N_blk; nb += wei_N_blk) {
243-
emit_brgemm_copy_b_kernel_call(wei_N_blk, K, start_in, start_out, start_comp);
244-
add_ptr_increments(wei_N_blk);
245237
}
246238

247239
postamble();
@@ -389,7 +381,7 @@ void BrgemmCopyBKernelExecutor::update_config(const ov::snippets::lowered::Expre
389381
init(N_dim, N_blk, 0);
390382

391383
const auto& brg_weight_etype = expr->get_node()->get_input_element_type(0);
392-
const auto LDB = brgemm_utils::repacking::compute_LDB(N_dim, brg_weight_etype);
384+
const auto LDB = brgemm_utils::repacking::compute_repacked_n_dim(N_dim, brg_weight_etype);
393385
const auto copy_B_wei_stride =
394386
ov::snippets::utils::get_dim_stride(expr->get_input_port(0), config.is_transposed_B() ? 0 : 1) *
395387
brg_weight_etype.size();

src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,18 +60,13 @@ namespace repacking {
6060
size_t compute_inner_n_block(const ov::element::Type& precision);
6161
/// \brief Computes inner K block size used by OneDNN implementation. Depends on tensor precision
6262
size_t compute_inner_k_block(const ov::element::Type& precision);
63-
/**
64-
* @brief Computes leading dimension (LDB) which must be used in brgemm and brgemm_copy_b emitters
65-
* @param n_block N block size shared between BrgemmCPU and BrgemmCopyB node
66-
* @param precision tensor precision
67-
*/
63+
64+
/// \brief Computes N dim in output blocked shape of BrgemmCopyB. Depends on tensor precision
6865
template <
6966
typename T,
7067
typename = typename std::enable_if<(std::is_same<T, size_t>::value || std::is_same<T, int64_t>::value), bool>::type>
71-
T compute_LDB(T n_block, const ov::element::Type& precision) {
72-
return snippets::utils::is_dynamic_value<T>(n_block)
73-
? n_block
74-
: std::max(n_block, static_cast<T>(compute_inner_n_block(precision)));
68+
inline T compute_repacked_n_dim(T n, const ov::element::Type& precision) {
69+
return ov::snippets::utils::rnd_up(n, static_cast<T>(compute_inner_n_block(precision)));
7570
}
7671
/**
7772
* @brief Retrieves the expression pointer for the brgemm_copy_b expression corresponding to the given BrgemmCPU

src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ bool pass::AdjustBrgemmCopyBLoopPorts::update_loop_info(
3535
// K blocking loop: account for zero padding
3636
if (loop_port.get_dim_idx() == 1) {
3737
const auto ptr_incr = loop_desc.ptr_increment;
38-
const auto blocked_shape_ptr_inc = brgemm_utils::repacking::compute_LDB(ptr_incr, precision);
38+
const auto blocked_shape_ptr_inc =
39+
brgemm_utils::repacking::compute_repacked_n_dim(ptr_incr, precision);
3940
if (ptr_incr != 0 && ptr_incr != blocked_shape_ptr_inc) {
4041
loop_desc.ptr_increment = blocked_shape_ptr_inc;
4142
OPENVINO_ASSERT(loop_desc.finalization_offset % ptr_incr == 0,

src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/expressions/brgemm_copy_b_buffer_expressions.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ void RepackedWeightsBufferExpression::init_allocation_size(
4545

4646
const auto& precision = get_node()->get_input_element_type(0);
4747
// Repacking buffer shape is set in accordance to OneDNN requirements
48-
const size_t N_dim = std::max(n_blk, compute_inner_n_block(precision));
48+
const size_t N_dim = compute_repacked_n_dim(n_blk, precision);
4949
if (!in_layout.empty() && in_layout.back() != in_layout.size() - 1) {
5050
// In case of transpose, K dimension must be rounded-up to number of elems in vector register
5151
// For the details, please see 'transpose16x8' and 'fixup16x16' implementations and usage in
@@ -88,13 +88,9 @@ void CompensationsBufferExpression::init_allocation_size(
8888
// Compensations are computed during repacking, so we need to round-up allocation shape according to m_inner_n_block
8989
// because of OneDNN implementation nuances (as in get_repacking_buffer_size).
9090
// However, the compensations are computed by N dimension, so K dimension doesn't affect the compensations buffer
91+
const auto& precision = parent_expr->get_node()->get_input_element_type(0);
9192
const size_t n_blk = *ov::snippets::utils::get_projected_subtensor(parent_expr->get_input_port(0)).rbegin();
92-
if (snippets::utils::is_dynamic_value(n_blk)) {
93-
m_allocation_size = snippets::utils::get_dynamic_value<size_t>();
94-
} else {
95-
const auto& precision = parent_expr->get_node()->get_input_element_type(0);
96-
m_allocation_size = std::max(n_blk, compute_inner_n_block(precision));
97-
}
93+
m_allocation_size = compute_repacked_n_dim(n_blk, precision);
9894
}
9995

10096
} // namespace intel_cpu

src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ VectorDims BrgemmExternalRepackingAdjuster::get_blk_shape(const VectorDims& plan
5757
const auto K = *++planar_shape.rbegin();
5858
const auto N = *planar_shape.rbegin();
5959
const auto new_K = snippets::utils::div_up(K, vnni_factor);
60-
const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(prc));
60+
const auto new_N = brgemm_utils::repacking::compute_repacked_n_dim(N, prc);
6161
VectorDims blk_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank);
6262
blk_shape.insert(blk_shape.end(), {new_K, new_N, vnni_factor});
6363
return blk_shape;
@@ -73,7 +73,8 @@ void BrgemmExternalRepackingAdjuster::update_kernel(const RepackExecutorPtr& exe
7373
auto config = static_cast<BrgemmCopyBKernelConfig*>(generic_config.get());
7474
const auto idx = config->is_transposed_B() ? 0 : 1;
7575
const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, layout, idx) * prc.size();
76-
config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, prc));
76+
const auto LDB = brgemm_utils::repacking::compute_repacked_n_dim(N, prc);
77+
config->update(N, N, K, K, copy_wei_stride, LDB);
7778
executor->update_by_config(*config);
7879
}
7980

0 commit comments

Comments
 (0)