
Commit c7500c9

change the reorder tensor from init to execute OP
1 parent 7a395f6 commit c7500c9
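
In short: the one-shot optimize_graph_once pass (guarded by a per-context optimized_graph flag) is removed, and Q4_0 weights are instead reordered lazily the first time a mul_mat OP that supports the reordered layout actually executes, tracked by a per-tensor flag. Below is a minimal, self-contained sketch of that pattern; supports_reorder() and do_reorder() are hypothetical stand-ins for the real opt_for_reorder() checks and the reorder_qw() kernel shown in the diff.

// Sketch only: illustrates the lazy, execute-time reorder pattern this commit
// switches to; not the actual ggml-sycl code.
#include <cstdio>

struct tensor_extra {
    bool reordered = false;  // per-tensor flag replaces the per-context optimized_graph flag
};

static bool supports_reorder() { return true; }  // stand-in for: MUL_MAT on Q4_0, single batch, device allows it
static void do_reorder(const char * name) { std::printf("reorder %s\n", name); }

static void execute_mul_mat(const char * name, tensor_extra * extra) {
    if (supports_reorder() && extra != nullptr && !extra->reordered) {
        do_reorder(name);         // one-time device-side layout conversion
        extra->reordered = true;  // later executions skip straight to the OP
    }
    std::printf("mul_mat(%s)\n", name);
}

int main() {
    tensor_extra w;
    execute_mul_mat("w", &w);  // first call reorders, then runs the OP
    execute_mul_mat("w", &w);  // second call runs the OP directly
}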

File tree: 2 files changed, +61 −65 lines changed

ggml/src/ggml-sycl/common.hpp

Lines changed: 0 additions & 1 deletion

@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
     int device;
     std::string name;
     optimize_feature opt_feature;
-    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 61 additions & 64 deletions

@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
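
Note the default flip above: with GGML_SYCL_DISABLE_OPT now defaulting to 0, the reorder optimization is enabled unless the user opts out. A minimal sketch of an env-with-default reader follows, assuming the real get_sycl_env() behaves this way:

// Assumption: get_sycl_env(name, def) reads an integer environment variable,
// falling back to `def` when the variable is unset.
#include <cstdlib>

static int env_or_default(const char * name, int def) {
    const char * v = std::getenv(name);
    return v != nullptr ? std::atoi(v) : def;
}

// After this commit an unset GGML_SYCL_DISABLE_OPT yields 0 (optimization ON);
// export GGML_SYCL_DISABLE_OPT=1 to restore the previous behavior.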
@@ -2863,6 +2863,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+static void reorder_qw(char * data_device, const int ncols, const int nrows,
+                       size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t *) data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half *) (qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0 * x = (const block_q4_0 *) tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0 / 2; j++) {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char * data_device = (char *) src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+ * Call this only from OP (mul_mat) paths that support the reorder optimization.
+ */
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                            ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize &&   // optimization allowed, controlled by GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder &&        // reorder performs well on this device; skip devices where it does not
+        dst->op == GGML_OP_MUL_MAT &&      // limited to supported Q4_0 cases for now; more cases are TODO
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2] == 1 && src1->ne[3] == 1) {
+
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
+        if (!extra) return;  // only happens in the CI/UT permute case
+
+        if (extra->optimized_feature.reorder) return;  // skip tensors that have already been reordered
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true;  // consumed by dequantization in later steps
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
@@ -2925,13 +2983,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst);  // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst);  // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
@@ -3561,71 +3621,8 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                       size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0* x = (const block_q4_0*)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0/2; j ++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor *src0 = dst->src[0];
-    ggml_tensor *src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
-    }
-}
-
-static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
-    dpct::queue_ptr stream = ctx->stream();
-    if (ctx->optimized_graph) {
-        return;
-    }
-    ctx->optimized_graph = true;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
-    }
-}
-
 static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
     ggml_sycl_set_main_device(sycl_ctx->device);
-    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
