@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {

     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
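
With the default for GGML_SYCL_DISABLE_OPT flipped from 1 to 0, the reorder optimization is now enabled unless the user opts out by setting GGML_SYCL_DISABLE_OPT=1 in the environment. The helper below is a minimal sketch of the assumed behaviour of such an integer env-var reader; the name get_env_int is illustrative and is not the get_sycl_env used by the patch.

    // Minimal sketch (assumption): read an integer environment variable with a
    // fallback default, mirroring what get_sycl_env is expected to do here.
    #include <cstdlib>
    #include <stdexcept>
    #include <string>

    static int get_env_int(const char * name, int default_val) {
        const char * value = std::getenv(name);   // nullptr when the variable is unset
        if (value == nullptr) {
            return default_val;                   // e.g. 0 -> reorder optimization enabled
        }
        try {
            return std::stoi(value);              // "1" disables the optimization
        } catch (const std::exception &) {
            return default_val;                   // ignore malformed values
        }
    }
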
@@ -2863,6 +2863,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }

+static void reorder_qw(char *data_device, const int ncols, const int nrows,
+                       size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+                            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0* x = (const block_q4_0*)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0/2; j++)
+            {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char *data_device = (char *)src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+* This function should be called when the OP (mul_mat) function supports the reorder optimization.
+*/
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                            ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize &&  // optimization allowed, controlled by $GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder &&       // allow only devices where this path performs well, skip the rest
+        dst->op == GGML_OP_MUL_MAT &&     // limited to supported Q4_0 cases for now, more cases to do
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2]==1 && src1->ne[3]==1) {
+
+        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
+        if (!extra) return;  // only happens in the CI/UT permute case
+
+        if (extra->optimized_feature.reorder) return;  // skip tensors that have already been reordered
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true;  // consulted by the decode/dequantize steps that follow
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
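
The kernel added above rewrites a contiguous run of block_q4_0 blocks from array-of-structs into struct-of-arrays: all packed 4-bit quants first, followed by all fp16 scales, so that later dequantization can read quants and scales as two contiguous arrays. Below is a minimal host-side sketch of the same transform; it assumes ggml's Q4_0 definition (QK4_0 == 32, a block being one fp16 scale plus 16 bytes of quants), and the names reorder_q4_0_host and the uint16_t stand-in for sycl::half are illustrative only.

    // Host-side sketch of the AoS -> SoA transform performed by reorder_qw.
    // Assumption: block layout matches ggml's Q4_0 (one fp16 scale + 16 quant bytes).
    #include <cstdint>
    #include <cstring>
    #include <vector>

    constexpr int QK4_0_SKETCH = 32;

    struct block_q4_0_sketch {
        uint16_t d;                        // per-block scale (raw fp16 bits)
        uint8_t  qs[QK4_0_SKETCH / 2];     // 32 4-bit quants packed into 16 bytes
    };

    // In place: [d0 qs0 | d1 qs1 | ...] becomes [qs0 qs1 ... | d0 d1 ...].
    static void reorder_q4_0_host(uint8_t * data, size_t nblocks) {
        std::vector<block_q4_0_sketch> tmp(nblocks);
        std::memcpy(tmp.data(), data, nblocks * sizeof(block_q4_0_sketch));

        uint8_t  * qs_ptr = data;                                               // quants region
        uint16_t * d_ptr  = (uint16_t *) (qs_ptr + nblocks * QK4_0_SKETCH / 2); // scales follow

        for (size_t ib = 0; ib < nblocks; ++ib) {
            std::memcpy(qs_ptr + ib * QK4_0_SKETCH / 2, tmp[ib].qs, QK4_0_SKETCH / 2);
            d_ptr[ib] = tmp[ib].d;
        }
    }
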
@@ -2925,13 +2983,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst);  // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst);  // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
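
With these two call sites, the reorder now happens lazily at the first eligible mul_mat dispatch, and the per-tensor flag set in opt_for_reorder keeps it from running twice. A consumer of the reordered weights has to recompute the two base pointers from the tensor shape; the helper below sketches that pointer math, derived from reorder_qw above (the name and signature are illustrative, not part of the patch).

    // Sketch: locate the quant and scale regions of a reordered Q4_0 tensor.
    // ncols/nrows correspond to src0->ne[0]/ne[1]; the layout is [all qs][all d].
    #include <cstdint>

    static void reordered_q4_0_regions(const uint8_t * data, int64_t ncols, int64_t nrows,
                                       const uint8_t ** qs_out, const uint16_t ** d_out) {
        const int64_t nblocks = ncols * nrows / 32;          // QK4_0 == 32 values per block
        *qs_out = data;                                      // nblocks * 16 bytes of packed quants
        *d_out  = (const uint16_t *) (data + nblocks * 16);  // nblocks fp16 scales (raw bits)
    }
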
@@ -3561,71 +3621,8 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }

-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                       size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-                            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0* x = (const block_q4_0*)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0/2; j++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char *data_device = (char *)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor *src0 = dst->src[0];
-    ggml_tensor *src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; // used to decode/dequan in next steps.
-    }
-}
-
-static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
-    dpct::queue_ptr stream = ctx->stream();
-    if (ctx->optimized_graph) {
-        return;
-    }
-    ctx->optimized_graph = true;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
-    }
-}
-
 static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
     ggml_sycl_set_main_device(sycl_ctx->device);
-    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);

     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
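
The removed optimize_graph_once walked every graph node once per context and reordered eligible tensors up front; with this patch the same decision is made per dispatch inside opt_for_reorder, keyed on the tensor's optimized_feature.reorder flag. The fragment below sketches that lazy, run-once guard in isolation (the types and names are simplified stand-ins, not the backend's real structs).

    // Sketch of the run-once guard that replaces the graph-level pass.
    // The structs are simplified stand-ins for ggml_tensor_extra_gpu and friends.
    struct optimize_feature_sketch { bool reorder = false; };
    struct tensor_extra_sketch     { optimize_feature_sketch optimized_feature; };

    template <typename ReorderFn>
    static void reorder_once(tensor_extra_sketch * extra, ReorderFn do_reorder) {
        if (extra == nullptr) return;                    // e.g. CI/UT permute case: no extra attached
        if (extra->optimized_feature.reorder) return;    // already reordered by an earlier dispatch
        do_reorder();                                    // one-time layout transform
        extra->optimized_feature.reorder = true;         // later dispatches skip the work
    }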