Commit 3cd3d5d

Merge branch 'ggml-org:master' into master
2 parents: d33330f + 295354e

File tree: 7 files changed, +117 -103 lines

7 files changed

+117
-103
lines changed

examples/llava/clip.cpp

Lines changed: 46 additions & 22 deletions
@@ -554,15 +554,15 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
 }
 
 // implementation of the 2D RoPE without adding a new op in ggml
+// this is not efficient (it uses double the memory), but it works on all backends
+// TODO: there was a more efficient version relying on ggml_view and ggml_rope_ext_inplace, but the inplace rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
 static ggml_tensor * build_rope_2d(
-    ggml_cgraph * gf,
     ggml_context * ctx0,
     ggml_tensor * cur,
     ggml_tensor * pos_h,
     ggml_tensor * pos_w,
     const float freq_base
 ) {
-    ggml_tensor * tmp;
     const int64_t n_dim  = cur->ne[0];
     const int64_t n_head = cur->ne[1];
     const int64_t n_pos  = cur->ne[2];
@@ -571,18 +571,23 @@ static ggml_tensor * build_rope_2d(
     // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
     // first half of cur will use 1e-0, 1e-2 (even)
     // second half of cur will use 1e-1, 1e-3 (odd)
-    //
-    // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even
+    // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
     //   ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
     // then for the second half, we use freq_scale to shift the inv_freq
     //   ^ why? replace (2i) with (2i+1) in the above equation
     const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
 
     // first half
+    ggml_tensor * first;
     {
-        cur = ggml_rope_ext_inplace(
+        first = ggml_view_3d(ctx0, cur,
+            n_dim/2, n_head, n_pos,
+            ggml_row_size(cur->type, n_dim),
+            ggml_row_size(cur->type, n_dim*n_head),
+            0);
+        first = ggml_rope_ext(
             ctx0,
-            cur,
+            first,
             pos_h,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
@@ -592,26 +597,27 @@ static ggml_tensor * build_rope_2d(
     }
 
     // second half
+    ggml_tensor * second;
     {
-        tmp = ggml_view_3d(ctx0, cur,
+        second = ggml_view_3d(ctx0, cur,
             n_dim/2, n_head, n_pos,
             ggml_row_size(cur->type, n_dim),
             ggml_row_size(cur->type, n_dim*n_head),
             n_dim/2 * ggml_element_size(cur));
-        tmp = ggml_rope_ext_inplace(
+        second = ggml_cont(ctx0, second); // copy, because ggml_rope does not play well with non-contiguous tensors
+        second = ggml_rope_ext(
             ctx0,
-            tmp,
+            second,
             pos_w,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
             0, 0, freq_base,
             freq_scale_odd,
             0.0f, 1.0f, 0.0f, 0.0f
             );
-        // calculate inplace (modify cur directly)
-        ggml_build_forward_expand(gf, tmp);
     }
 
+    cur = ggml_concat(ctx0, first, second, 0);
     return cur;
 }
 
@@ -680,13 +686,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i
     struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur);
 
     Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
-    Q = build_rope_2d(gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta);
+    Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta);
     Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
 
     struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur);
 
     K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
-    K = build_rope_2d(gf, ctx0, K, pos_h, pos_w, hparams.rope_theta);
+    K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta);
     K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
 
     struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur);
@@ -2796,10 +2802,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
+    // TODO @ngxson : this is ugly, need to refactor later
+    bool support_dynamic_size = ctx->has_minicpmv_projector
+                             || ctx->has_qwen2vl_merger
+                             || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
+
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
+    if (support_dynamic_size) {
         image_size_width  = imgs.entries[0]->nx;
         image_size_height = imgs.entries[0]->ny;
     }
@@ -2811,9 +2822,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
-        float * data = (float *)malloc(ggml_nbytes(inp_raw));
+        std::vector<float> inp_data(ggml_nelements(inp_raw));
+        float * data = inp_data.data();
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        // ┌──W──┐
+        // │  H  │  channel = R
+        // ├─────┤      │
+        // │  H  │  channel = G
+        // ├─────┤      │
+        // │  H  │  channel = B
+        // └─────┘      │
+        //    ──────────┘ x B
 
-        // TODO @ngxson : this whole code block is ugly, will need to be refactored
         for (size_t i = 0; i < imgs.entries.size(); i++) {
             const int nx = imgs.entries[i]->nx;
             const int ny = imgs.entries[i]->ny;
@@ -2828,17 +2850,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             const int n = nx * ny;
 
             for (int b = 0; b < batch_size; b++) {
-                for (int k = 0; k < 3; k++) {
-                    for (int y = 0; y < ny; y++) {
-                        for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
-                        }
+                float * batch_entry = data + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel
+                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
                     }
                 }
             }
         }
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
-        free(data);
     }
     if (ctx->has_minicpmv_projector) {
         {
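The rewritten packing loop walks each pixel once and writes the three planar channel slices directly, instead of sweeping the destination channel-major. A minimal standalone check (not part of the commit; toy sizes, random values) that both orders fill the buffer identically:

#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
    const int nx = 4, ny = 3, n = nx * ny;
    std::vector<float> buf(3 * n);   // interleaved RGBRGB... (HWC), as in the source image buffer
    for (size_t i = 0; i < buf.size(); i++) buf[i] = (float) std::rand() / RAND_MAX;

    std::vector<float> old_layout(3 * n), new_layout(3 * n);

    // old: channel-major sweep over the destination
    for (int k = 0; k < 3; k++)
        for (int y = 0; y < ny; y++)
            for (int x = 0; x < nx; x++)
                old_layout[k * n + y * nx + x] = buf[3 * (y * nx + x) + k];

    // new: one pass over pixels, three planar writes each
    for (int y = 0; y < ny; y++)
        for (int x = 0; x < nx; x++) {
            const size_t base_src = 3 * (y * nx + x);
            const size_t base_dst = y * nx + x;
            new_layout[      base_dst] = buf[base_src    ];
            new_layout[n   + base_dst] = buf[base_src + 1];
            new_layout[2*n + base_dst] = buf[base_src + 2];
        }

    for (int i = 0; i < 3 * n; i++) {
        if (old_layout[i] != new_layout[i]) { std::printf("mismatch at %d\n", i); return 1; }
    }
    std::printf("old and new layouts match\n");
    return 0;
}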

ggml/src/ggml-sycl/common.hpp

Lines changed: 0 additions & 1 deletion
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
     int device;
     std::string name;
     optimize_feature opt_feature;
-    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 61 additions & 64 deletions
@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
 
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
@@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
+        size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0* x = (const block_q4_0*)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0/2; j++) {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char * data_device = (char*)src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+* This function can be called when the OP (mul_mat) function supports the reorder optimization.
+*/
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+        ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize && // allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder &&      // allow only on devices where the reorder performs well; skip devices with bad perf
+        dst->op == GGML_OP_MUL_MAT &&    // limited to some supported cases of Q4_0 for now; more cases are TODO
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2]==1 && src1->ne[3]==1) {
+
+        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
+        if (!extra) return; // only happens in the CI/UT permute case
+
+        if (extra->optimized_feature.reorder) return; // skip tensors that have already been reordered
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true; // used to decode/dequantize in the next steps
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
@@ -2914,13 +2972,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
@@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-        size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0* x = (const block_q4_0*)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0/2; j ++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor *src0 = dst->src[0];
-    ggml_tensor *src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
-    }
-}
-
-static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
-    dpct::queue_ptr stream = ctx->stream();
-    if (ctx->optimized_graph) {
-        return;
-    }
-    ctx->optimized_graph = true;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
-    }
-}
-
 static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
     ggml_sycl_set_main_device(sycl_ctx->device);
-    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
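For context on what reorder_qw does: a Q4_0 tensor is stored as an array of blocks, each holding one half-precision scale and 32 packed 4-bit quants, and the kernel rewrites it in place into two contiguous runs, all quant bytes first and then all scales, which is the layout the reorder-aware dequant paths expect. A host-side sketch of the same transformation (illustrative only; the real struct is ggml's block_q4_0 and the real kernel runs through stream->parallel_for above):

#include <cstdint>
#include <cstring>

constexpr int QK4_0 = 32;

// illustrative stand-in for ggml's block_q4_0 (the f16 scale is kept as raw bits here)
struct block_q4_0 {
    uint16_t d;              // per-block scale (f16 bits)
    uint8_t  qs[QK4_0 / 2];  // 32 4-bit quants packed into 16 bytes
};

// AoS -> SoA: all qs bytes first, then all scales, as reorder_qw produces
void reorder_q4_0_host(const block_q4_0 * src, uint8_t * dst, size_t n_blocks) {
    uint8_t  * qs_ptr = dst;                                       // n_blocks * 16 quant bytes
    uint16_t * d_ptr  = (uint16_t *)(dst + n_blocks * QK4_0 / 2);  // then n_blocks scales
    for (size_t ib = 0; ib < n_blocks; ib++) {
        std::memcpy(qs_ptr + ib * QK4_0 / 2, src[ib].qs, QK4_0 / 2);
        d_ptr[ib] = src[ib].d;
    }
}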

src/llama-context.cpp

Lines changed: 3 additions & 14 deletions
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 
     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
 
-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
             shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
             yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
 
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
             ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
             0);
 
-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }

src/llama-context.h

Lines changed: 1 addition & 2 deletions
@@ -170,8 +170,7 @@ struct llama_context {
             ggml_tensor * shift,
             ggml_tensor * factors,
             float freq_base,
-            float freq_scale,
-            ggml_backend_buffer * bbuf) const;
+            float freq_scale) const;
 
     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,

src/llama-graph.cpp

Lines changed: 4 additions & 0 deletions
@@ -803,6 +803,10 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (down_b) {
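ggml_mul_mat_set_prec marks an already-created matmul node so that backends accumulate it in f32 instead of f16. A minimal sketch (not from the commit) of the same pattern in isolation:

#include "ggml.h"

// build a matmul node and request full-precision accumulation on it
static ggml_tensor * mul_mat_f32_prec(ggml_context * ctx, ggml_tensor * w, ggml_tensor * x) {
    ggml_tensor * out = ggml_mul_mat(ctx, w, x);
    ggml_mul_mat_set_prec(out, GGML_PREC_F32); // force f32 accumulators on this node
    return out;
}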

tests/test-backend-ops.cpp

Lines changed: 2 additions & 0 deletions
@@ -2606,6 +2606,8 @@ struct test_rope : public test_case {
         } else {
             out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
         }
+
+        // TODO: add test with a non-contiguous view as input; this case is needed for build_rope_2d in clip.cpp
     }
     ggml_set_name(out, "out");
 
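The TODO in test-backend-ops.cpp refers to the pattern used by build_rope_2d: RoPE applied to an offset view, which is non-contiguous. A rough sketch (not part of the commit) of the graph fragment such a test would need to cover, mirroring the clip.cpp code; pos is assumed to be an I32 tensor with one entry per position:

#include "ggml.h"

// mirror of build_rope_2d's "second half" path: an offset view is
// non-contiguous, so it must be made contiguous before ggml_rope_ext
static ggml_tensor * rope_second_half(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * pos) {
    const int64_t n_dim  = cur->ne[0];
    const int64_t n_head = cur->ne[1];
    const int64_t n_pos  = cur->ne[2];
    ggml_tensor * half = ggml_view_3d(ctx, cur,
        n_dim/2, n_head, n_pos,
        ggml_row_size(cur->type, n_dim),
        ggml_row_size(cur->type, n_dim*n_head),
        n_dim/2 * ggml_element_size(cur)); // offset view -> non-contiguous
    half = ggml_cont(ctx, half);           // required until ggml_rope handles such views
    return ggml_rope_ext(ctx, half, pos, nullptr,
        n_dim/2, 0 /* mode */, 0, 10000.0f,
        1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
}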