Skip to content

Commit 0728c5a

Browse files
CUDA: mmq CLI option, fixed mmq build issues (ggml-org#2453)
1 parent 1215ed7 commit 0728c5a

File tree

10 files changed

+67
-27
lines changed

10 files changed

+67
-27
lines changed

Diff for: CMakeLists.txt

+10-6
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework
6868
option(LLAMA_BLAS "llama: use BLAS" OFF)
6969
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
7070
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
71-
option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
71+
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
7272
set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
7373
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
7474
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -253,9 +253,9 @@ if (LLAMA_CUBLAS)
253253
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
254254

255255
add_compile_definitions(GGML_USE_CUBLAS)
256-
if (LLAMA_CUDA_CUBLAS)
257-
add_compile_definitions(GGML_CUDA_CUBLAS)
258-
endif()
256+
# if (LLAMA_CUDA_CUBLAS)
257+
# add_compile_definitions(GGML_CUDA_CUBLAS)
258+
# endif()
259259
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
260260
if (LLAMA_CUDA_FORCE_DMMV)
261261
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
@@ -277,10 +277,14 @@ if (LLAMA_CUBLAS)
277277
endif()
278278

279279
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
280+
# 52 == lowest CUDA 12 standard
281+
# 60 == f16 CUDA intrinsics
282+
# 61 == integer CUDA intrinsics
283+
# 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
280284
if (LLAMA_CUDA_DMMV_F16)
281-
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
285+
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
282286
else()
283-
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
287+
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
284288
endif()
285289
endif()
286290
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

Diff for: Makefile

+3-3
Original file line number | Diff line number | Diff line change
@@ -236,9 +236,9 @@ ifdef LLAMA_CUDA_MMQ_Y
236236
else
237237
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
238238
endif # LLAMA_CUDA_MMQ_Y
239-
ifdef LLAMA_CUDA_CUBLAS
240-
NVCCFLAGS += -DGGML_CUDA_CUBLAS
241-
endif # LLAMA_CUDA_CUBLAS
239+
#ifdef LLAMA_CUDA_CUBLAS
240+
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
241+
#endif # LLAMA_CUDA_CUBLAS
242242
ifdef LLAMA_CUDA_CCBIN
243243
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
244244
endif

Diff for: README.md

+3-1
Original file line number | Diff line number | Diff line change
@@ -400,9 +400,11 @@ Building the program with BLAS support may lead to some performance improvements
400400

401401
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
402402

403+
<!---
404+
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
405+
--->
403406
| Option | Legal values | Default | Description |
404407
|-------------------------|------------------------|---------|-------------|
405-
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
406408
| LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
407409
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
408410
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |

Diff for: examples/common.cpp

+13-3
Original file line number | Diff line number | Diff line change
@@ -352,7 +352,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
352352
#ifdef GGML_USE_CUBLAS
353353
params.main_gpu = std::stoi(argv[i]);
354354
#else
355-
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
355+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
356356
#endif
357357
} else if (arg == "--tensor-split" || arg == "-ts") {
358358
if (++i >= argc) {
@@ -376,13 +376,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
376376
}
377377
}
378378
#else
379-
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
379+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
380+
#endif // GGML_USE_CUBLAS
381+
} else if (arg == "--mul-mat-q" || arg == "-mmq") {
382+
#ifdef GGML_USE_CUBLAS
383+
params.mul_mat_q = true;
384+
#else
385+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
380386
#endif // GGML_USE_CUBLAS
381387
} else if (arg == "--low-vram" || arg == "-lv") {
382388
#ifdef GGML_USE_CUBLAS
383389
params.low_vram = true;
384390
#else
385-
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
391+
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
386392
#endif // GGML_USE_CUBLAS
387393
} else if (arg == "--no-mmap") {
388394
params.use_mmap = false;
@@ -585,6 +591,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
585591
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
586592
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
587593
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
594+
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
595+
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
596+
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
588597
#endif
589598
fprintf(stdout, " --mtest compute maximum memory usage\n");
590599
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
@@ -637,6 +646,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
637646
lparams.main_gpu = params.main_gpu;
638647
lparams.tensor_split = params.tensor_split;
639648
lparams.low_vram = params.low_vram;
649+
lparams.mul_mat_q = params.mul_mat_q;
640650
lparams.seed = params.seed;
641651
lparams.f16_kv = params.memory_f16;
642652
lparams.use_mmap = params.use_mmap;

Diff for: examples/common.h

+1
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ struct gpt_params {
7474
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
7575

7676
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
77+
bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
7778
bool memory_f16 = true; // use f16 instead of f32 for memory kv
7879
bool random_prompt = false; // do not randomize prompt if none provided
7980
bool use_color = false; // use color to distinguish generations and inputs

Diff for: examples/server/server.cpp

+13-2
Original file line number | Diff line number | Diff line change
@@ -631,6 +631,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
631631
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
632632
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
633633
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
634+
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
635+
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
636+
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
634637
#endif
635638
fprintf(stdout, " -m FNAME, --model FNAME\n");
636639
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@@ -827,15 +830,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
827830
}
828831
}
829832
#else
830-
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
833+
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
831834
#endif // GGML_USE_CUBLAS
832835
}
833836
else if (arg == "--low-vram" || arg == "-lv")
834837
{
835838
#ifdef GGML_USE_CUBLAS
836839
params.low_vram = true;
837840
#else
838-
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
841+
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
842+
#endif // GGML_USE_CUBLAS
843+
}
844+
else if (arg == "--mul-mat-q" || arg == "-mmq")
845+
{
846+
#ifdef GGML_USE_CUBLAS
847+
params.mul_mat_q = true;
848+
#else
849+
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
839850
#endif // GGML_USE_CUBLAS
840851
}
841852
else if (arg == "--main-gpu" || arg == "-mg")

Diff for: ggml-cuda.cu

+14-10
Original file line number | Diff line number | Diff line change
@@ -3898,10 +3898,9 @@ static size_t g_scratch_offset = 0;
38983898

38993899
static int g_device_count = -1;
39003900
static int g_main_device = 0;
3901-
#ifndef GGML_CUDA_FORCE_DMMV
39023901
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
3903-
#endif
39043902
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
3903+
static bool g_mul_mat_q = false;
39053904

39063905
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
39073906

@@ -3923,9 +3922,7 @@ void ggml_init_cublas() {
39233922
g_tensor_split[id] = total_vram;
39243923
total_vram += prop.totalGlobalMem;
39253924

3926-
#ifndef GGML_CUDA_FORCE_DMMV
39273925
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
3928-
#endif
39293926
}
39303927
for (int id = 0; id < g_device_count; ++id) {
39313928
g_tensor_split[id] /= total_vram;
@@ -4278,6 +4275,7 @@ inline void ggml_cuda_op_mul_mat_vec(
42784275

42794276
#ifdef GGML_CUDA_FORCE_DMMV
42804277
const bool use_mul_mat_vec_q = false;
4278+
(void) g_compute_capabilities[0];
42814279
#else
42824280
int id;
42834281
CUDA_CHECK(cudaGetDevice(&id));
@@ -5021,12 +5019,14 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
50215019
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
50225020
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
50235021
} else {
5024-
#ifdef GGML_CUDA_CUBLAS
5025-
const bool use_mul_mat_q = false;
5026-
#else
5027-
const bool use_mul_mat_q = ggml_is_quantized(src0->type);
5028-
#endif // GGML_CUDA_CUBLAS
5029-
if (use_mul_mat_q) {
5022+
int min_compute_capability = INT_MAX;
5023+
for (int id = 0; id < g_device_count; ++id) {
5024+
if (min_compute_capability > g_compute_capabilities[id]) {
5025+
min_compute_capability = g_compute_capabilities[id];
5026+
}
5027+
}
5028+
5029+
if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
50305030
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
50315031
} else {
50325032
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
@@ -5320,6 +5320,10 @@ void ggml_cuda_set_main_device(int main_device) {
53205320
}
53215321
}
53225322

5323+
void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
5324+
g_mul_mat_q = mul_mat_q;
5325+
}
5326+
53235327
void ggml_cuda_set_scratch_size(size_t scratch_size) {
53245328
g_scratch_size = scratch_size;
53255329
}

Diff for: ggml-cuda.h

+1
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
2727
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
2828
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
2929
void ggml_cuda_set_main_device(int main_device);
30+
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
3031
void ggml_cuda_set_scratch_size(size_t scratch_size);
3132
void ggml_cuda_free_scratch(void);
3233
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

Diff for: llama.cpp

+8-2
Original file line number | Diff line number | Diff line change
@@ -901,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
901901
/*.progress_callback =*/ nullptr,
902902
/*.progress_callback_user_data =*/ nullptr,
903903
/*.low_vram =*/ false,
904+
/*.mul_mat_q =*/ false,
904905
/*.f16_kv =*/ true,
905906
/*.logits_all =*/ false,
906907
/*.vocab_only =*/ false,
@@ -1028,6 +1029,7 @@ static void llama_model_load_internal(
10281029
int n_gpu_layers,
10291030
int main_gpu,
10301031
const float * tensor_split,
1032+
const bool mul_mat_q,
10311033
float rope_freq_base,
10321034
float rope_freq_scale,
10331035
bool low_vram,
@@ -1156,9 +1158,11 @@ static void llama_model_load_internal(
11561158
}
11571159

11581160
(void) main_gpu;
1161+
(void) mul_mat_q;
11591162
#if defined(GGML_USE_CUBLAS)
11601163
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
11611164
ggml_cuda_set_main_device(main_gpu);
1165+
ggml_cuda_set_mul_mat_q(mul_mat_q);
11621166
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
11631167
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
11641168
#elif defined(GGML_USE_CLBLAST)
@@ -1367,6 +1371,7 @@ static bool llama_model_load(
13671371
int n_gpu_layers,
13681372
int main_gpu,
13691373
const float * tensor_split,
1374+
const bool mul_mat_q,
13701375
float rope_freq_base,
13711376
float rope_freq_scale,
13721377
bool low_vram,
@@ -1377,7 +1382,8 @@ static bool llama_model_load(
13771382
llama_progress_callback progress_callback,
13781383
void *progress_callback_user_data) {
13791384
try {
1380-
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1385+
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
1386+
main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
13811387
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
13821388
return true;
13831389
} catch (const std::exception & err) {
@@ -3192,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
31923198
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
31933199

31943200
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
3195-
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3201+
params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
31963202
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
31973203
params.progress_callback_user_data)) {
31983204
delete model;

Diff for: llama.h

+1
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,7 @@ extern "C" {
108108

109109
// Keep the booleans together to avoid misalignment during copy-by-value.
110110
bool low_vram; // if true, reduce VRAM usage at the cost of performance
111+
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
111112
bool f16_kv; // use fp16 for KV cache
112113
bool logits_all; // the llama_eval() call computes all logits, not just the last one
113114
bool vocab_only; // only load the vocabulary, no weights

0 commit comments

Comments
 (0)