sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs #12858

Open · wants to merge 6 commits into base: master
17 changes: 9 additions & 8 deletions ggml/src/ggml-sycl/backend.hpp
@@ -13,23 +13,24 @@
#ifndef GGML_SYCL_BACKEND_HPP
#define GGML_SYCL_BACKEND_HPP

#include "concat.hpp"
#include "common.hpp"
#include "concat.hpp"
#include "conv.hpp"
#include "convert.hpp"
#include "cpy.hpp"
#include "dequantize.hpp"
#include "dmmv.hpp"
#include "element_wise.hpp"
#include "gla.hpp"
#include "im2col.hpp"
#include "mmq.hpp"
#include "mmvq.hpp"
#include "rope.hpp"
#include "norm.hpp"
#include "outprod.hpp"
#include "quants.hpp"
#include "rope.hpp"
#include "softmax.hpp"
#include "tsembd.hpp"
#include "im2col.hpp"
#include "wkv.hpp"
#include "outprod.hpp"
#include "element_wise.hpp"
#include "cpy.hpp"
#include "gla.hpp"

#endif // GGML_SYCL_BACKEND_HPP
#endif // GGML_SYCL_BACKEND_HPP
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
@@ -776,4 +776,9 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
}

bool gpu_has_xmx(sycl::device &dev);

constexpr size_t ceil_div(const size_t m, const size_t n) {
return (m + n - 1) / n;
}

#endif // GGML_SYCL_COMMON_HPP
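
As a side note, the new ceil_div helper added here is the usual rounding-up integer division. A minimal, self-contained usage sketch (the element count and work-group size below are made up for illustration, not taken from the PR):

#include <cstddef>

constexpr size_t ceil_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}

int main() {
    // Hypothetical sizing example: how many work-groups of 256 work-items
    // are needed to cover 1000 elements.
    constexpr size_t n_elements = 1000;
    constexpr size_t wg_size    = 256;
    static_assert(ceil_div(n_elements, wg_size) == 4, "1000 / 256 rounded up is 4");
    return 0;
}
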
45 changes: 33 additions & 12 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2887,6 +2887,15 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
return false;
}

inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return true;
default:
return false;
}
}

static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
@@ -2906,13 +2915,14 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
}
}

static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
int64_t min_compute_capability = INT_MAX;
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
int64_t min_compute_capability = INT_MAX;

if (split) {
ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
ggml_backend_sycl_split_buffer_type_context * buft_ctx =
(ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
auto & tensor_split = buft_ctx->tensor_split;
for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
// skip devices that are not going to do any work:
@@ -2925,7 +2935,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
}
}
} else {
min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
}

// check data types and tensor shapes for custom matrix multiplication kernels:
@@ -2947,9 +2957,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // SYCL_USE_XMX

const bool reorder = static_cast<ggml_tensor_extra_gpu *>(dst->src[0]->extra) &&
static_cast<ggml_tensor_extra_gpu *>(dst->src[0]->extra)->optimized_feature.reorder;

// mmvq path is faster in the CUDA backend.
if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
// is enabled takes precedence over DMMV, the current if-else implementation
// requires disabling DMMV if both conditions are met
|| (reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) {

Collaborator:

This PR is named for Intel GPUs, so why change the code for CUDA?
In fact, the reorder of src0 won't happen for non-Intel GPUs, so this code has no impact.
I suggest removing it.

Collaborator (Author):

Your comment aligns with my suspicion that this change is obscure. This line changes the kernels from DMMV to MMVQ if reorder is enabled and it's supported, so it's no longer only for CUDA devices.

I need to rethink how the dispatcher does the work.
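
For clarity, a simplified sketch of the precedence this condition introduces (not the literal code from the PR; is_cuda_backend stands in for the backend check on ctx.stream()):

// Assumes the use_* booleans were computed earlier in ggml_sycl_mul_mat, as on master.
const bool prefer_mmvq =
    is_cuda_backend                                               // original CUDA-only condition
    || (reorder && ggml_sycl_supports_reorder_mmvq(src0->type));  // new: reordered Q4_0 on any backend

if (prefer_mmvq) {
    // MMVQ takes precedence: DMMV is disabled whenever both would apply.
    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
}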

Collaborator:

Yes, please ignore this comment.

Collaborator:

Another comment:
The reorder behavior impacts the code path in this PR: use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;

This code works well for CUDA but not for Intel GPUs; that's why it was limited to the CUDA backend.
Some cases (models) will benefit from it, while others will get worse on Intel GPUs.

I suggest removing this behavior: only optimize the OPs with the reorder, and do not change the code path.

Collaborator:

From what we have measured, the new mmvq code path with the reorder optimization is faster on Intel devices as well (cf. the PR description), which is why we suggest enabling it by default now. Can you let us know if you find a model or device where this causes a performance regression?

Collaborator:

Can you let me know which device you are using exactly? I've been using an Arc B580 and the example works fine with GGML_SYCL_DISABLE_OPT=0, definitely using the new reorder_mul_mat_vec_q4_0_q8_1_sycl:

|  |                   |                                       |       |Max    |        |Max  |Global |                     |
|  |                   |                                       |       |compute|Max work|sub  |mem    |                     |
|ID|        Device Type|                                   Name|Version|units  |group   |group|size   |       Driver version|
|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
| 0| [level_zero:gpu:0]|                Intel Arc B580 Graphics|   20.1|    160|    1024|   32| 12168M|         1.6.32567+16|
SYCL Optimization Feature:
|ID|        Device Type|Reorder|
|--|-------------------|-------|
| 0| [level_zero:gpu:0]|      Y|
llama_context:  SYCL_Host  output buffer size =     0.12 MiB
init: kv_size = 4096, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1
init:      SYCL0 KV buffer size =  2048.00 MiB
llama_context: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
llama_context:      SYCL0 compute buffer size =   296.00 MiB
llama_context:  SYCL_Host compute buffer size =    16.01 MiB
llama_context: graph nodes  = 1094
llama_context: graph splits = 2
common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
main: llama threadpool init, n_threads = 8

system_info: n_threads = 8 (n_threads_batch = 8) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 |

sampler seed: 0
sampler params:
        repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
        dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
        top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
        mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
generate: n_ctx = 4096, n_batch = 2048, n_predict = 400, n_keep = 1

 Building a website can be done in 10 simple steps:
Step 1: Select the Website Name
Step 2: Select the Website Type
Step 3: Choose a Web Host
Step 4: Select the Website Theme
Step 5: Add a Homepage
Step 6: Add Pages
Step 7: Add Images
Step 8: Add Content
Step 9: Add Contact Information
Step 10: Add Social Media Links
Once you’ve made your decisions, you’ll be ready to start building your website. We’ll go over each step in more detail in this article.
When selecting the name of your website, it’s essential to choose something that reflects what your site will be about. A good website name should be:
Easy to remember and spell
Short and catchy
Unique and not already taken by another site
When choosing your website name, it’s important to keep these factors in mind. A website name that is too long or hard to remember will not help your site’s SEO. Likewise, a website name that is too similar to another will confuse users and make it difficult to differentiate between the two sites.
Website names should also be unique and not already taken by another site. This is to prevent confusion and ensure that your site is the one people are looking for when they search for your domain name.
A domain name is an essential part of your website. It’s what people type into their web browsers to visit your site. A domain name should be:
Easy to remember and spell.
Short and catchy.
Unique and not already taken by another site.
When choosing your domain name, it’s essential to keep these factors in mind. A domain name that is too long or hard to remember will not help your site’s SEO. Likewise, a domain name that is too similar to another will confuse users and make it difficult to

llama_perf_sampler_print:    sampling time =       7.61 ms /   419 runs   (    0.02 ms per token, 55088.09 tokens per second)
llama_perf_context_print:        load time =    1687.55 ms
llama_perf_context_print: prompt eval time =     245.45 ms /    19 tokens (   12.92 ms per token,    77.41 tokens per second)
llama_perf_context_print:        eval time =    6255.84 ms /   399 runs   (   15.68 ms per token,    63.78 tokens per second)
llama_perf_context_print:       total time =    6518.86 ms /   418 tokens

I've dug a bit deeper comparing the various sizes of mul_mat_vec used in this example. In short, I can confirm that reorder_mul_mat_vec_q4_0_q8_1_sycl outputs the exact same values as the existing mmvq kernel (without the reorder), which is itself very close to the CUDA mmvq implementation (from ggml-cuda).
I am computing the top 10 absolute and relative errors compared to the CPU backend, with a fixed seed. With that I can confirm the outputs of the different mmvq implementations are identical. See the example below for the sizes m=k=4096 and n=1, comparing SYCL reorder mmvq with native CUDA mmvq:

  MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=1,k=4096,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0): call ggml_sycl_mul_mat
mul_mat use_dequantize_mul_mat_vec=0 use_mul_mat_vec_q=1 use_mul_mat_q=0 reorder=1 split=0 m=4096 n=1 k=4096 batch0=1 batch1=1
Calling reorder_mul_mat_vec_q4_0_q8_1_sycl
call ggml_sycl_mul_mat done

Avg abs err=0.107298 Top 10 abs err:
i=3599 a=-4.69262 b=-5.2335 abs err=0.540883
i=2359 a=12.1033 b=12.5865 abs err=0.483185
i=2646 a=7.78191 b=8.24094 abs err=0.459034
i=3928 a=9.89294 b=10.3471 abs err=0.454193
i=378 a=-3.01588 b=-3.45036 abs err=0.434485
i=696 a=9.1979 b=9.62932 abs err=0.43142
i=1759 a=1.37318 b=1.80059 abs err=0.427414
i=3654 a=24.938 b=24.5315 abs err=0.406462
i=678 a=14.8213 b=14.4157 abs err=0.405678
i=1025 a=-31.8456 b=-32.2399 abs err=0.394257

Avg rel err=-0.00800887 Top 10 rel err:
i=1461 a=0.33337 b=0.0154138 rel err=20.6281
i=2038 a=0.136977 b=-0.076067 rel err=2.80074
i=2346 a=0.163639 b=-0.14278 rel err=2.14609
i=2927 a=0.0830765 b=-0.120305 rel err=1.69055
i=3197 a=0.249004 b=0.123245 rel err=1.0204
i=3405 a=0.140829 b=0.0717204 rel err=0.963579
i=1548 a=-0.0529716 b=-0.197922 rel err=0.732362
i=1850 a=-0.0073204 b=-0.0248034 rel err=0.704863
i=3495 a=-0.0530348 b=-0.179241 rel err=0.704115
i=1056 a=0.511249 b=0.303663 rel err=0.683608

CUDA:
  MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=1,k=4096,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0):
Avg abs err=0.107297 Top 10 abs err:
i=3599 a=-4.69262 b=-5.2335 abs err=0.540884
i=2359 a=12.1033 b=12.5865 abs err=0.483183
i=2646 a=7.78191 b=8.24094 abs err=0.459031
i=3928 a=9.89294 b=10.3471 abs err=0.454191
i=378 a=-3.01588 b=-3.45036 abs err=0.434482
i=696 a=9.1979 b=9.62932 abs err=0.431419
i=1759 a=1.37319 b=1.80059 abs err=0.427406
i=3654 a=24.938 b=24.5315 abs err=0.40646
i=678 a=14.8213 b=14.4157 abs err=0.405679
i=1025 a=-31.8456 b=-32.2399 abs err=0.394257

Avg rel err=-0.00800872 Top 10 rel err:
i=1461 a=0.333373 b=0.0154138 rel err=20.6282
i=2038 a=0.136982 b=-0.076067 rel err=2.80081
i=2346 a=0.163637 b=-0.14278 rel err=2.14608
i=2927 a=0.0830841 b=-0.120305 rel err=1.69061
i=3197 a=0.249005 b=0.123245 rel err=1.02041
i=3405 a=0.140827 b=0.0717204 rel err=0.963553
i=1548 a=-0.0529749 b=-0.197922 rel err=0.732345
i=1850 a=-0.00731659 b=-0.0248034 rel err=0.705017
i=3495 a=-0.0530338 b=-0.179241 rel err=0.70412
i=1056 a=0.511251 b=0.303663 rel err=0.683617

// For reference this configuration using the dmmv path:
  MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=1,k=4096,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0): call ggml_sycl_mul_mat
mul_mat use_dequantize_mul_mat_vec=1 use_mul_mat_vec_q=1 use_mul_mat_q=0 reorder=1 split=0 m=4096 n=1 k=4096 batch0=1 batch1=1
Calling dequantize_mul_mat_vec_q4_0_sycl_reorder
call ggml_sycl_mul_mat done

Avg abs err=0.0640822 Top 10 abs err:
i=473 a=-25.0884 b=-25.3996 abs err=0.31119
i=2896 a=9.36747 b=9.67615 abs err=0.308678
i=4075 a=-0.00384331 b=0.297059 abs err=0.300902
i=725 a=26.7606 b=26.4645 abs err=0.296185
i=914 a=10.4823 b=10.7594 abs err=0.277103
i=1546 a=8.51613 b=8.24743 abs err=0.268696
i=1304 a=36.3309 b=36.0639 abs err=0.266941
i=925 a=3.2164 b=2.95618 abs err=0.26022
i=745 a=52.9018 b=53.1593 abs err=0.257549
i=3229 a=-13.4299 b=-13.1729 abs err=0.256965

Avg rel err=-0.00598986 Top 10 rel err:
i=2346 a=0.0593772 b=-0.14278 rel err=1.41587
i=2328 a=-0.0412564 b=-0.168865 rel err=0.755684
i=1548 a=-0.0534334 b=-0.197922 rel err=0.730028
i=3879 a=0.360157 b=0.224717 rel err=0.602713
i=2038 a=-0.0306633 b=-0.076067 rel err=0.596891
i=1421 a=-0.064465 b=-0.155207 rel err=0.584652
i=1941 a=-0.123309 b=-0.248092 rel err=0.50297
i=3495 a=-0.0921001 b=-0.179241 rel err=0.486166
i=3026 a=-0.038188 b=-0.0717697 rel err=0.46791
i=88 a=-0.111841 b=-0.209998 rel err=0.467418

The accuracy is expected to be lower with mmvq since src1 is quantized, but that does not necessarily translate to lower model accuracy.
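
To illustrate where that small extra error comes from, here is a rough sketch of the per-block 8-bit quantization applied to src1 on the mmvq path. It mirrors the idea of ggml's q8_1 blocks but is not the exact layout (the real block also stores the block sum and keeps the scale in half precision):

#include <cmath>
#include <cstdint>

// One block of 32 values quantized to 8 bits with a single scale.
struct block_q8_sketch {
    float  d;       // per-block scale
    int8_t qs[32];  // quantized values
};

static block_q8_sketch quantize_block(const float * x) {
    float amax = 0.0f;
    for (int i = 0; i < 32; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));
    }
    block_q8_sketch out{};
    out.d = amax / 127.0f;
    const float id = out.d != 0.0f ? 1.0f / out.d : 0.0f;
    for (int i = 0; i < 32; ++i) {
        // Rounding to the nearest integer is where the mmvq path loses a little
        // precision compared to dmmv, which keeps src1 in floating point.
        out.qs[i] = (int8_t) std::lround(x[i] * id);
    }
    return out;
}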

I've done the same for the configurations m=11008 n=1 k=4096; m=11008 n=2 k=4096; m=4096 n=1 k=11008; m=4096 n=2 k=11008 and m=4096 n=2 k=4096. Attaching the files below:
test_06.txt
test_05.txt
test_04.txt
test_03.txt
test_02.txt
test_01.txt
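
For reference, a minimal sketch (not part of the PR) of how the average and top-10 absolute error rankings above can be reproduced from two output buffers, with a being the backend under test and b the CPU reference; the relative-error ranking is analogous using (a - b) / b:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// Assumes a.size() == b.size().
static void print_top10_abs_err(const std::vector<float> & a, const std::vector<float> & b) {
    double sum = 0.0;
    for (size_t i = 0; i < a.size(); ++i) {
        sum += std::fabs(a[i] - b[i]);
    }
    std::printf("Avg abs err=%g Top 10 abs err:\n", sum / a.size());

    // Rank indices by absolute error, largest first.
    std::vector<size_t> idx(a.size());
    std::iota(idx.begin(), idx.end(), size_t{0});
    const size_t top = std::min<size_t>(10, idx.size());
    std::partial_sort(idx.begin(), idx.begin() + top, idx.end(), [&](size_t i, size_t j) {
        return std::fabs(a[i] - b[i]) > std::fabs(a[j] - b[j]);
    });
    for (size_t k = 0; k < top; ++k) {
        const size_t i = idx[k];
        std::printf("i=%zu a=%g b=%g abs err=%g\n", i, a[i], b[i], std::fabs(a[i] - b[i]));
    }
}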

With these results I don't understand how you could get incorrect output; do you have an idea of what could go wrong? We could add another option to disable mmvq for debugging purposes, but I think mmvq should become the default because of the better performance.

Collaborator:

I use Arc 770.

Your result above is the same as with this code change in my test case:

if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda || true)

This forces the following code to execute:
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;

It changes the legacy code path: the code calling dequantize_mul_mat_vec() or use_mul_mat_vec_q() changes, and that will impact the result.

The OP functions on GPU and CPU should give the same result (with little error), as you said, but the code path impacts the final result.

I think the optimization should keep the same result as the legacy code.

It's possible to change the code path to get better performance.
I suggest dividing this into two parts (PRs): optimizing the OP function, and optimizing the code path.

It looks like optimizing the code path impacts the correctness of the final result.

@Alcpz (Collaborator, Author), Apr 21, 2025:

@NeoZhangJianyu @Rbiessy Thanks for putting the effort into reviewing the PR. I'm happy to split the PR in two, but let's first try to understand why this gives worse performance.
Could it be a driver/operating system issue, maybe? I'll try to gather more information.

@NeoZhangJianyu, are you testing the PR on Linux or Windows?

@NeoZhangJianyu (Collaborator), Apr 22, 2025:

Driver and OS generally only impact the performance.
You must pay attention to the shape of the tensor: for example, some code works well for 32 * n but badly for 24 * n.

I test it on Linux.

Collaborator (Author):

Thanks for the clarification. It's hard to understand the issues you are finding because I don't fully know what you are testing. I'll try to replicate the results locally and, depending on the findings, see whether the PR has to be split or the dispatch can be slightly improved.

use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
}

if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// TODO: Refactor and cleanup of mul mat dispatching.
@@ -2968,14 +2986,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// KQ + KQV multi-batch
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
// save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
constexpr bool convert_src1_to_q8_1 = false;
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
} else if (use_mul_mat_vec_q) {
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
constexpr bool convert_src1_to_q8_1 = true;
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
} else if (use_mul_mat_q) {
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
constexpr bool convert_src1_to_q8_1 = true;
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
} else {
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
constexpr bool convert_src1_to_q8_1 = false;
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
}
}
