From cac00cff1816e818fc103d9d9a37ddc18045c1f6 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Thu, 10 Apr 2025 10:50:21 +0000 Subject: [PATCH 1/5] update llama-bench with prompt and gen throughput metrics --- examples/llama-bench/llama-bench.cpp | 114 ++++++++++++++++++++------- 1 file changed, 87 insertions(+), 27 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index cbcbfcee861ee..258842f402a56 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -901,7 +901,9 @@ struct test { int n_prompt; int n_gen; std::string test_time; - std::vector samples_ns; + std::vector samples_e2e_ns; // e2e latency including prompt processing + token generation + std::vector samples_prompt_ns; // prompt processing latency + std::vector samples_gen_ns; // token generation latency test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) : cpu_info(get_cpu_info()), @@ -939,21 +941,42 @@ struct test { (void) ctx; } - uint64_t avg_ns() const { return ::avg(samples_ns); } + uint64_t avg_e2e_ns() const { return ::avg(samples_e2e_ns); } + uint64_t avg_prompt_ns() const { return ::avg(samples_prompt_ns); } + uint64_t avg_gen_ns() const { return ::avg(samples_gen_ns); } - uint64_t stdev_ns() const { return ::stdev(samples_ns); } + uint64_t stddev_e2e_ns() const { return ::stdev(samples_e2e_ns); } + uint64_t stddev_prompt_ns() const { return ::stdev(samples_prompt_ns); } + uint64_t stddev_gen_ns() const { return ::stdev(samples_gen_ns); } - std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + std::vector get_ts(const std::vector & samples_ns, int n_tokens) const { + if(n_tokens==0) + return {0}; std::vector ts; std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } + + std::vector get_e2e_ts() const { + // for only prompt processing, atleast 1 token is 
generated + int n_tokens = n_gen==0 ? 1 : n_gen; + return get_ts(samples_e2e_ns, n_tokens); + } + std::vector get_prompt_ts() const { + return get_ts(samples_prompt_ns, n_prompt); + } + std::vector get_gen_ts() const { + return get_ts(samples_gen_ns, n_gen); + } - double avg_ts() const { return ::avg(get_ts()); } + double avg_e2e_ts() const { return ::avg(get_e2e_ts()); } + double avg_prompt_ts() const { return ::avg(get_prompt_ts()); } + double avg_gen_ts() const { return ::avg(get_gen_ts()); } - double stdev_ts() const { return ::stdev(get_ts()); } + double stdev_e2e_ts() const { return ::stdev(get_e2e_ts()); } + double stdev_prompt_ts() const { return ::stdev(get_prompt_ts()); } + double stdev_gen_ts() const { return ::stdev(get_gen_ts()); } static std::string get_backend() { std::vector backends; @@ -973,8 +996,10 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "embeddings", "n_prompt", "n_gen", "test_time", + "avg_e2e_ns", "stddev_e2e_ns", "avg_e2e_ts", "stddev_e2e_ts", + "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts", + "avg_gen_ns", "stddev_gen_ns", "avg_gen_ts", "stddev_gen_ts" }; return fields; } @@ -984,15 +1009,16 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || - field == "stddev_ns") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_e2e_ns" || + field == "stddev_e2e_ns" || field 
== "avg_prompt_ns" || field == "stddev_prompt_ns" || + field == "avg_gen_ns" || field == "stddev_gen_ns") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_ts" || field == "stddev_ts") { + if (field == "avg_e2e_ts" || field == "stddev_e2e_ts" || field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") { return FLOAT; } return STRING; @@ -1042,10 +1068,18 @@ struct test { std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), - std::to_string(stdev_ns()), - std::to_string(avg_ts()), - std::to_string(stdev_ts()) }; + std::to_string(avg_e2e_ns()), + std::to_string(stddev_e2e_ns()), + std::to_string(avg_e2e_ts()), + std::to_string(stdev_e2e_ts()), + std::to_string(avg_prompt_ns()), + std::to_string(stddev_prompt_ns()), + std::to_string(avg_prompt_ts()), + std::to_string(stdev_prompt_ts()), + std::to_string(avg_gen_ns()), + std::to_string(stddev_gen_ns()), + std::to_string(avg_gen_ts()), + std::to_string(stdev_gen_ts()) }; return values; } @@ -1153,8 +1187,12 @@ struct json_printer : public printer { } fprintf(fout, " {\n"); print_fields(test::get_fields(), t.get_values()); - fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); - fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " \"samples_e2e_ns\": [ %s ],\n", join(t.samples_e2e_ns, ", ").c_str()); + fprintf(fout, " \"samples_e2e_ts\": [ %s ]\n", join(t.get_e2e_ts(), ", ").c_str()); + fprintf(fout, " \"samples_prompt_ns\": [ %s ],\n", join(t.samples_prompt_ns, ", ").c_str()); + fprintf(fout, " \"samples_prompt_ts\": [ %s ]\n", join(t.get_prompt_ts(), ", ").c_str()); + fprintf(fout, " \"samples_gen_ns\": [ %s ],\n", join(t.samples_gen_ns, ", ").c_str()); + fprintf(fout, " \"samples_gen_ts\": [ %s ]\n", 
join(t.get_gen_ts(), ", ").c_str()); fprintf(fout, " }"); fflush(fout); } @@ -1173,8 +1211,12 @@ struct jsonl_printer : public printer { void print_test(const test & t) override { fprintf(fout, "{"); print_fields(test::get_fields(), t.get_values()); - fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str()); - fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "\"samples_e2e_ns\": [ %s ],", join(t.samples_e2e_ns, ", ").c_str()); + fprintf(fout, "\"samples_e2e_ts\": [ %s ]", join(t.get_e2e_ts(), ", ").c_str()); + fprintf(fout, "\"samples_prompt_ns\": [ %s ],", join(t.samples_prompt_ns, ", ").c_str()); + fprintf(fout, "\"samples_prompt_ts\": [ %s ]", join(t.get_prompt_ts(), ", ").c_str()); + fprintf(fout, "\"samples_gen_ns\": [ %s ],", join(t.samples_gen_ns, ", ").c_str()); + fprintf(fout, "\"samples_gen_ts\": [ %s ]", join(t.get_gen_ts(), ", ").c_str()); fprintf(fout, "}\n"); fflush(fout); } @@ -1187,7 +1229,7 @@ struct markdown_printer : public printer { if (field == "model") { return -30; } - if (field == "t/s") { + if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { return 20; } if (field == "size" || field == "params") { @@ -1314,7 +1356,9 @@ struct markdown_printer : public printer { fields.emplace_back("embeddings"); } fields.emplace_back("test"); - fields.emplace_back("t/s"); + fields.emplace_back("e2e t/s"); + fields.emplace_back("prompt t/s"); + fields.emplace_back("gen t/s"); fprintf(fout, "|"); for (const auto & field : fields) { @@ -1363,8 +1407,14 @@ struct markdown_printer : public printer { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } value = buf; - } else if (field == "t/s") { - snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); + } else if (field == "e2e t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_e2e_ts(), t.stdev_e2e_ts()); + value = buf; + } else if (field == "prompt t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", 
t.avg_prompt_ts(), t.stdev_prompt_ts()); + value = buf; + } else if (field == "gen t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_gen_ts(), t.stdev_gen_ts()); value = buf; } else if (vmap.find(field) != vmap.end()) { value = vmap.at(field); @@ -1374,7 +1424,7 @@ struct markdown_printer : public printer { } int width = get_field_width(field); - if (field == "t/s") { + if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { // HACK: the utf-8 character is 2 bytes width += 1; } @@ -1629,6 +1679,9 @@ int main(int argc, char ** argv) { } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } + + uint64_t t_gen_start = get_time_ns(); + if (t.n_gen > 0) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, @@ -1637,8 +1690,15 @@ int main(int argc, char ** argv) { test_gen(ctx, t.n_gen, t.n_threads); } - uint64_t t_ns = get_time_ns() - t_start; - t.samples_ns.push_back(t_ns); + uint64_t t_end = get_time_ns(); + + uint64_t e2e_ns = t_end - t_start; + uint64_t prompt_ns = t_gen_start - t_start; + uint64_t gen_ns = t_end - t_gen_start; + + t.samples_e2e_ns.push_back(e2e_ns); + t.samples_prompt_ns.push_back(prompt_ns); + t.samples_gen_ns.push_back(gen_ns); } if (p) { From df46ea53bc0d0d7fe1542bcb62b71d1fb4fd8fbe Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Tue, 15 Apr 2025 11:44:07 +0530 Subject: [PATCH 2/5] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- examples/llama-bench/llama-bench.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 258842f402a56..447e978f09bc3 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -901,7 +901,7 @@ struct test { int n_prompt; int n_gen; std::string test_time; - std::vector 
samples_e2e_ns; // e2e latency including prompt processing + token generation + std::vector samples_e2e_ns; // e2e latency, i.e. prompt processing + token generation std::vector samples_prompt_ns; // prompt processing latency std::vector samples_gen_ns; // token generation latency @@ -959,7 +959,7 @@ struct test { } std::vector get_e2e_ts() const { - // for only prompt processing, atleast 1 token is generated + // for only prompt processing, at least 1 token is generated int n_tokens = n_gen==0 ? 1 : n_gen; return get_ts(samples_e2e_ns, n_tokens); } From fa6cb8aecc923cbbb23c57e04b3949c451a98996 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 18 Apr 2025 14:08:34 +0530 Subject: [PATCH 3/5] update readme --- examples/llama-bench/README.md | 274 ++++++++++++++++----------- examples/llama-bench/llama-bench.cpp | 54 ++---- 2 files changed, 175 insertions(+), 153 deletions(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 6bbe4bb75fbf8..c77566f806dd5 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -74,20 +74,32 @@ Note: ## Examples +### Prompt processing and text generation + +```sh +$ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -p 0 -n 0 -pg 100,100 -pg 500,100 +``` + +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp100+tg100 | 14303.91 ± 362.95 | 455.50 ± 11.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp500+tg100 | 28026.49 ± 970.98 | 440.05 ± 4.57 | + + ### Text generation with different models ```sh -$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +$ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -m models/Llama-3.2-3B-Instruct-Q4_K_M.gguf -p 0 -n 128,256,512 ``` -| model | size | params | backend | ngl | test 
| t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 469.99 ± 2.69 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 454.10 ± 9.76 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 444.62 ± 11.83 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 219.82 ± 0.37 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 215.15 ± 2.04 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 211.12 ± 1.43 | ### Prompt processing with different batch sizes @@ -95,12 +107,12 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0. 
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 ``` -| model | size | params | backend | ngl | n_batch | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | +| model | params | backend | ngl | n_batch | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp1024 | 16751.82 ± 667.31 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp1024 | 23255.17 ± 446.86 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp1024 | 25544.36 ± 571.16 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp1024 | 25610.04 ± 606.37 | 0.00 ± 0.00 | ### Different numbers of threads @@ -108,20 +120,20 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 ``` -| model | size | params | backend | threads | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 
| tg 16 | 12.22 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || +| model | params | backend | ngl | threads | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp64 | 10322.32 ± 193.62 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | tg16 | 0.00 ± 0.00 | 444.22 ± 6.66 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp64 | 7313.31 ± 145.33 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | tg16 | 0.00 ± 0.00 | 468.99 ± 12.30 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp64 | 10111.46 ± 1261.15 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | tg16 | 0.00 ± 0.00 | 464.07 ± 18.15 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp64 | 9605.79 ± 1684.50 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | tg16 | 0.00 ± 0.00 | 469.92 ± 16.23 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp64 | 10336.80 ± 740.34 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | tg16 | 0.00 ± 0.00 | 472.06 ± 10.29 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp64 | 8819.08 ± 1529.51 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | tg16 | 0.00 ± 0.00 | 458.20 ± 15.14 | ### Different numbers of layers offloaded to the GPU @@ -129,24 +141,24 @@ $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 ``` 
-| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | pp512 | 12082.67 ± 403.77 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | tg128 | 0.00 ± 0.00 | 106.37 ± 2.72 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | pp512 | 16742.49 ± 8252.51 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | 
tg128 | 0.00 ± 0.00 | 454.01 ± 8.64 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | pp512 | 29580.40 ± 106.86 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | tg128 | 0.00 ± 0.00 | 457.68 ± 9.88 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | pp512 | 29594.52 ± 154.46 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | tg128 | 0.00 ± 0.00 | 465.27 ± 9.24 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | pp512 | 29503.27 ± 174.82 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | tg128 | 0.00 ± 0.00 | 467.16 ± 2.22 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | pp512 | 29479.41 ± 180.78 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | tg128 | 0.00 ± 0.00 | 465.67 ± 6.10 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | pp512 | 29446.50 ± 59.09 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | tg128 | 0.00 ± 0.00 | 470.60 ± 2.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | pp512 | 29369.74 ± 229.29 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | tg128 | 0.00 ± 0.00 | 467.41 ± 6.81 | ## Output formats @@ -158,10 +170,10 @@ By default, llama-bench outputs the results in markdown format. 
The results can $ ./llama-bench -o md ``` -| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp512 | 27663.05 ± 90.18 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 467.13 ± 5.21 | ### CSV @@ -170,9 +182,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" 
+build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,test_time,avg_prompt_ns,stddev_prompt_ns,avg_prompt_ts,stddev_prompt_ts,avg_gen_ns,stddev_gen_ns,avg_gen_ts,stddev_gen_ts +"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","2025-04-18T07:40:28Z","18599580","87220","27527.987050","128.945972","60","54","0.000000","0.000000" +"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","2025-04-18T07:40:28Z","0","0","0.000000","0.000000","273595440","4100226","467.926504","6.877051" ``` ### JSON @@ -184,64 +196,88 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "df46ea53", + "build_number": 5099, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", + "model_type": "llama 1B Q4_K - Medium", + "model_size": 799862912, + "model_n_params": 1235814432, + 
"n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 512, "n_gen": 0, - "test_time": "2023-09-23T12:09:57Z", - "avg_ns": 212365953, - "stddev_ns": 985423, - "avg_ts": 2410.974041, - "stddev_ts": 11.163766, - "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], - "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] + "test_time": "2025-04-18T07:41:24Z", + "avg_prompt_ns": 19432500, + "stddev_prompt_ns": 1155276, + "avg_prompt_ts": 26420.253006, + "stddev_prompt_ts": 1527.724050, + "avg_gen_ns": 60, + "stddev_gen_ns": 89, + "avg_gen_ts": 0.000000, + "stddev_gen_ts": 0.000000, + "samples_prompt_ns": [ 18723500, 18641500, 18476200, 21034200, 20287100 ], + "samples_prompt_ts": [ 27345.3, 27465.6, 27711.3, 24341.3, 25237.7 ] + "samples_gen_ns": [ 0, 100, 0, 200, 0 ], + "samples_gen_ts": [ 0 ] }, { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "df46ea53", + "build_number": 5099, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", + "model_type": "llama 1B Q4_K - Medium", + "model_size": 799862912, + "model_n_params": 1235814432, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": 
false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 0, "n_gen": 128, - "test_time": "2023-09-23T12:09:59Z", - "avg_ns": 977425219, - "stddev_ns": 9268593, - "avg_ts": 130.965708, - "stddev_ts": 1.238924, - "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], - "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] + "test_time": "2025-04-18T07:41:24Z", + "avg_prompt_ns": 20, + "stddev_prompt_ns": 44, + "avg_prompt_ts": 0.000000, + "stddev_prompt_ts": 0.000000, + "avg_gen_ns": 279581280, + "stddev_gen_ns": 7013491, + "avg_gen_ts": 458.054981, + "stddev_gen_ts": 11.337387, + "samples_prompt_ns": [ 0, 0, 0, 0, 100 ], + "samples_prompt_ts": [ 0 ] + "samples_gen_ns": [ 290465300, 280112200, 280351000, 274751300, 272226600 ], + "samples_gen_ts": [ 440.672, 456.96, 456.571, 465.876, 470.197 ] } ] ``` @@ -254,8 +290,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 
3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "test_time": "2025-04-18T07:42:16Z", "avg_prompt_ns": 18637700, "stddev_prompt_ns": 164536, "avg_prompt_ts": 27472.914745, "stddev_prompt_ts": 242.676976, "avg_gen_ns": 0, "stddev_gen_ns": 0, "avg_gen_ts": 0.000000, "stddev_gen_ts": 0.000000, "samples_prompt_ns": [ 18782700, 18654200, 18812300, 18466100, 18473200 ],"samples_prompt_ts": [ 27259.1, 27446.9, 27216.2, 27726.5, 27715.8 ]"samples_gen_ns": [ 0, 0, 0, 0, 0 ],"samples_gen_ts": [ 0 ]} +{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, 
"cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "test_time": "2025-04-18T07:42:17Z", "avg_prompt_ns": 100, "stddev_prompt_ns": 122, "avg_prompt_ts": 0.000000, "stddev_prompt_ts": 0.000000, "avg_gen_ns": 273159220, "stddev_gen_ns": 2025528, "avg_gen_ts": 468.611680, "stddev_gen_ts": 3.456568, "samples_prompt_ns": [ 300, 100, 100, 0, 0 ],"samples_prompt_ts": [ 0 ]"samples_gen_ns": [ 276312400, 272096600, 271459600, 274053300, 271874200 ],"samples_gen_ts": [ 463.244, 470.421, 471.525, 467.062, 470.806 ]} ``` @@ -271,32 +307,42 @@ $ ./llama-bench -o sql CREATE TABLE IF NOT EXISTS test ( build_commit TEXT, build_number INTEGER, - cuda INTEGER, - metal INTEGER, - gpu_blas INTEGER, - blas INTEGER, cpu_info TEXT, gpu_info TEXT, + backends TEXT, model_filename TEXT, model_type TEXT, model_size INTEGER, model_n_params INTEGER, n_batch INTEGER, + n_ubatch INTEGER, n_threads INTEGER, - f16_kv INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, n_gpu_layers INTEGER, + split_mode TEXT, main_gpu INTEGER, - mul_mat_q INTEGER, + no_kv_offload INTEGER, + flash_attn INTEGER, tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, n_prompt INTEGER, n_gen INTEGER, test_time TEXT, - avg_ns INTEGER, - stddev_ns INTEGER, - avg_ts REAL, - stddev_ts REAL + avg_prompt_ns INTEGER, + stddev_prompt_ns INTEGER, + avg_prompt_ts REAL, + stddev_prompt_ts REAL, + avg_gen_ns INTEGER, + stddev_gen_ns INTEGER, + avg_gen_ts REAL, + stddev_gen_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, 
avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '2025-04-18T07:42:43Z', '18543960', '131206', '27611.175041', '195.547424', '60', '54', '0.000000', '0.000000'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, 
model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '2025-04-18T07:42:43Z', '20', '44', '0.000000', '0.000000', '274190080', '2765950', '466.867210', '4.680900'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 447e978f09bc3..2607eae9ae8d8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -901,7 +901,6 @@ struct test { int n_prompt; int n_gen; std::string test_time; - std::vector samples_e2e_ns; // e2e latency, i.e. prompt processing + token generation std::vector samples_prompt_ns; // prompt processing latency std::vector samples_gen_ns; // token generation latency @@ -941,11 +940,9 @@ struct test { (void) ctx; } - uint64_t avg_e2e_ns() const { return ::avg(samples_e2e_ns); } uint64_t avg_prompt_ns() const { return ::avg(samples_prompt_ns); } uint64_t avg_gen_ns() const { return ::avg(samples_gen_ns); } - uint64_t stddev_e2e_ns() const { return ::stdev(samples_e2e_ns); } uint64_t stddev_prompt_ns() const { return ::stdev(samples_prompt_ns); } uint64_t stddev_gen_ns() const { return ::stdev(samples_gen_ns); } @@ -958,11 +955,6 @@ struct test { return ts; } - std::vector get_e2e_ts() const { - // for only prompt processing, at least 1 token is generated - int n_tokens = n_gen==0 ? 
1 : n_gen; - return get_ts(samples_e2e_ns, n_tokens); - } std::vector get_prompt_ts() const { return get_ts(samples_prompt_ns, n_prompt); } @@ -970,11 +962,9 @@ struct test { return get_ts(samples_gen_ns, n_gen); } - double avg_e2e_ts() const { return ::avg(get_e2e_ts()); } double avg_prompt_ts() const { return ::avg(get_prompt_ts()); } double avg_gen_ts() const { return ::avg(get_gen_ts()); } - double stdev_e2e_ts() const { return ::stdev(get_e2e_ts()); } double stdev_prompt_ts() const { return ::stdev(get_prompt_ts()); } double stdev_gen_ts() const { return ::stdev(get_gen_ts()); } @@ -992,14 +982,13 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", - "avg_e2e_ns", "stddev_e2e_ns", "avg_e2e_ts", "stddev_e2e_ts", - "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts", - "avg_gen_ns", "stddev_gen_ns", "avg_gen_ts", "stddev_gen_ts" + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "test_time", + "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts", + "avg_gen_ns", "stddev_gen_ns", "avg_gen_ts", "stddev_gen_ts" }; return fields; } @@ -1009,8 +998,7 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" 
|| field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_e2e_ns" || - field == "stddev_e2e_ns" || field == "avg_prompt_ns" || field == "stddev_prompt_ns" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_prompt_ns" || field == "stddev_prompt_ns" || field == "avg_gen_ns" || field == "stddev_gen_ns") { return INT; } @@ -1018,7 +1006,7 @@ struct test { field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_e2e_ts" || field == "stddev_e2e_ts" || field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") { + if (field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") { return FLOAT; } return STRING; @@ -1068,10 +1056,6 @@ struct test { std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_e2e_ns()), - std::to_string(stddev_e2e_ns()), - std::to_string(avg_e2e_ts()), - std::to_string(stdev_e2e_ts()), std::to_string(avg_prompt_ns()), std::to_string(stddev_prompt_ns()), std::to_string(avg_prompt_ts()), @@ -1187,8 +1171,6 @@ struct json_printer : public printer { } fprintf(fout, " {\n"); print_fields(test::get_fields(), t.get_values()); - fprintf(fout, " \"samples_e2e_ns\": [ %s ],\n", join(t.samples_e2e_ns, ", ").c_str()); - fprintf(fout, " \"samples_e2e_ts\": [ %s ]\n", join(t.get_e2e_ts(), ", ").c_str()); fprintf(fout, " \"samples_prompt_ns\": [ %s ],\n", join(t.samples_prompt_ns, ", ").c_str()); fprintf(fout, " \"samples_prompt_ts\": [ %s ]\n", join(t.get_prompt_ts(), ", ").c_str()); fprintf(fout, " \"samples_gen_ns\": [ %s ],\n", join(t.samples_gen_ns, ", ").c_str()); @@ -1211,8 +1193,6 @@ struct jsonl_printer : public printer { void print_test(const test & t) override { fprintf(fout, "{"); print_fields(test::get_fields(), 
t.get_values()); - fprintf(fout, "\"samples_e2e_ns\": [ %s ],", join(t.samples_e2e_ns, ", ").c_str()); - fprintf(fout, "\"samples_e2e_ts\": [ %s ]", join(t.get_e2e_ts(), ", ").c_str()); fprintf(fout, "\"samples_prompt_ns\": [ %s ],", join(t.samples_prompt_ns, ", ").c_str()); fprintf(fout, "\"samples_prompt_ts\": [ %s ]", join(t.get_prompt_ts(), ", ").c_str()); fprintf(fout, "\"samples_gen_ns\": [ %s ],", join(t.samples_gen_ns, ", ").c_str()); @@ -1229,8 +1209,11 @@ struct markdown_printer : public printer { if (field == "model") { return -30; } - if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { - return 20; + if (field == "prompt t/s") { + return 18; + } + if (field == "gen t/s") { + return 15; } if (field == "size" || field == "params") { return 10; @@ -1302,7 +1285,6 @@ struct markdown_printer : public printer { void print_header(const cmd_params & params) override { // select fields to print fields.emplace_back("model"); - fields.emplace_back("size"); fields.emplace_back("params"); fields.emplace_back("backend"); bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos || @@ -1356,7 +1338,6 @@ struct markdown_printer : public printer { fields.emplace_back("embeddings"); } fields.emplace_back("test"); - fields.emplace_back("e2e t/s"); fields.emplace_back("prompt t/s"); fields.emplace_back("gen t/s"); @@ -1407,9 +1388,6 @@ struct markdown_printer : public printer { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } value = buf; - } else if (field == "e2e t/s") { - snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_e2e_ts(), t.stdev_e2e_ts()); - value = buf; } else if (field == "prompt t/s") { snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_prompt_ts(), t.stdev_prompt_ts()); value = buf; @@ -1424,7 +1402,7 @@ struct markdown_printer : public printer { } int width = get_field_width(field); - if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { + if (field == "prompt t/s" || field == "gen t/s") { 
// HACK: the utf-8 character is 2 bytes width += 1; } @@ -1692,11 +1670,9 @@ int main(int argc, char ** argv) { uint64_t t_end = get_time_ns(); - uint64_t e2e_ns = t_end - t_start; uint64_t prompt_ns = t_gen_start - t_start; uint64_t gen_ns = t_end - t_gen_start; - t.samples_e2e_ns.push_back(e2e_ns); t.samples_prompt_ns.push_back(prompt_ns); t.samples_gen_ns.push_back(gen_ns); } From f3d697a3d7153a92bc7d075c246a58505abcbec4 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 18 Apr 2025 16:00:25 +0530 Subject: [PATCH 4/5] update default llama-bench params --- examples/llama-bench/llama-bench.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2607eae9ae8d8..29c45b55b904b 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -189,9 +189,9 @@ struct cmd_params { static const cmd_params cmd_params_defaults = { /* model */ { "models/7B/ggml-model-q4_0.gguf" }, - /* n_prompt */ { 512 }, - /* n_gen */ { 128 }, - /* n_pg */ {}, + /* n_prompt */ { 0 }, + /* n_gen */ { 32 }, + /* n_pg */ { { 4096, 32 } }, /* n_batch */ { 2048 }, /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, @@ -210,7 +210,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* reps */ 5, + /* reps */ 3, /* prio */ GGML_SCHED_PRIO_NORMAL, /* delay */ 0, /* verbose */ false, From 5f01fabd354020a59aec834d9a610989e802b5b5 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 18 Apr 2025 16:56:22 +0530 Subject: [PATCH 5/5] update llama-bench readme to reflect new default params --- examples/llama-bench/README.md | 178 ++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 83 deletions(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index c77566f806dd5..008f284fef519 100644 --- a/examples/llama-bench/README.md 
+++ b/examples/llama-bench/README.md @@ -25,9 +25,9 @@ usage: ./llama-bench [options] options: -h, --help -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -pg (default: ) + -p, --n-prompt (default: 0) + -n, --n-gen (default: 32) + -pg (default: 4096,32) -b, --batch-size (default: 2048) -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) @@ -94,12 +94,14 @@ $ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -m models/Llama-3.2- | model | params | backend | ngl | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 469.99 ± 2.69 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 454.10 ± 9.76 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 444.62 ± 11.83 | -| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 219.82 ± 0.37 | -| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 215.15 ± 2.04 | -| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 211.12 ± 1.43 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 469.34 ± 2.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 459.78 ± 9.43 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 449.25 ± 11.74 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp4096+tg32 | 15545.82 ± 8.35 | 385.90 ± 3.47 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 212.78 ± 5.12 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 214.56 ± 2.16 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 212.84 ± 1.41 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | pp4096+tg32 | 8825.07 ± 100.28 | 177.25 ± 1.89 | ### Prompt processing with different batch 
sizes @@ -109,10 +111,14 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 | model | params | backend | ngl | n_batch | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp1024 | 16751.82 ± 667.31 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp1024 | 23255.17 ± 446.86 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp1024 | 25544.36 ± 571.16 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp1024 | 25610.04 ± 606.37 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp1024 | 17125.18 ± 731.13 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp4096+tg32 | 12139.39 ± 446.63 | 378.76 ± 8.18 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp1024 | 24112.17 ± 161.18 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp4096+tg32 | 14508.80 ± 53.00 | 386.58 ± 0.42 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp1024 | 25534.56 ± 368.03 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp4096+tg32 | 15388.41 ± 13.06 | 386.30 ± 0.53 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp1024 | 25654.61 ± 772.86 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp4096+tg32 | 15487.92 ± 8.59 | 385.20 ± 0.50 | ### Different numbers of threads @@ -122,18 +128,24 @@ $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 | model | params | backend | ngl | threads | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp64 | 10322.32 ± 193.62 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | tg16 | 0.00 ± 0.00 | 444.22 ± 6.66 | -| llama 1B Q4_K 
- Medium | 1.24 B | CUDA | 99 | 2 | pp64 | 7313.31 ± 145.33 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | tg16 | 0.00 ± 0.00 | 468.99 ± 12.30 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp64 | 10111.46 ± 1261.15 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | tg16 | 0.00 ± 0.00 | 464.07 ± 18.15 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp64 | 9605.79 ± 1684.50 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | tg16 | 0.00 ± 0.00 | 469.92 ± 16.23 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp64 | 10336.80 ± 740.34 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | tg16 | 0.00 ± 0.00 | 472.06 ± 10.29 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp64 | 8819.08 ± 1529.51 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | tg16 | 0.00 ± 0.00 | 458.20 ± 15.14 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp64 | 9229.99 ± 1897.41 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | tg16 | 0.00 ± 0.00 | 444.33 ± 25.11 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp4096+tg32 | 15357.53 ± 27.52 | 373.90 ± 7.03 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp64 | 10799.57 ± 33.90 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | tg16 | 0.00 ± 0.00 | 461.43 ± 10.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp4096+tg32 | 15371.18 ± 57.24 | 372.59 ± 4.02 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp64 | 11033.35 ± 177.05 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | tg16 | 0.00 ± 0.00 | 448.57 ± 8.66 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp4096+tg32 | 15371.12 ± 43.70 | 376.71 ± 0.93 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp64 | 11206.45 ± 187.47 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | tg16 | 0.00 ± 0.00 | 457.99 ± 6.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 
99 | 8 | pp4096+tg32 | 15022.14 ± 161.68 | 369.76 ± 4.71 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp64 | 10397.19 ± 304.08 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | tg16 | 0.00 ± 0.00 | 457.53 ± 7.06 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp4096+tg32 | 15434.32 ± 158.08 | 372.00 ± 3.34 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp64 | 10588.34 ± 1043.71 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | tg16 | 0.00 ± 0.00 | 468.10 ± 9.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp4096+tg32 | 15544.54 ± 4.30 | 374.14 ± 7.18 | ### Different numbers of layers offloaded to the GPU @@ -143,22 +155,22 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 | model | params | backend | ngl | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | pp512 | 12082.67 ± 403.77 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | tg128 | 0.00 ± 0.00 | 106.37 ± 2.72 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | pp512 | 16742.49 ± 8252.51 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | tg128 | 0.00 ± 0.00 | 454.01 ± 8.64 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | pp512 | 29580.40 ± 106.86 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | tg128 | 0.00 ± 0.00 | 457.68 ± 9.88 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | pp512 | 29594.52 ± 154.46 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | tg128 | 0.00 ± 0.00 | 465.27 ± 9.24 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | pp512 | 29503.27 ± 174.82 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | tg128 | 0.00 ± 0.00 | 467.16 ± 2.22 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | pp512 | 29479.41 ± 180.78 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | tg128 | 0.00 ± 0.00 | 
465.67 ± 6.10 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | pp512 | 29446.50 ± 59.09 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | tg128 | 0.00 ± 0.00 | 470.60 ± 2.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | pp512 | 29369.74 ± 229.29 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | tg128 | 0.00 ± 0.00 | 467.41 ± 6.81 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | tg32 | 0.00 ± 0.00 | 107.29 ± 1.37 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | pp4096+tg32 | 8458.79 ± 154.44 | 70.84 ± 0.10 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | tg32 | 0.00 ± 0.00 | 484.02 ± 0.93 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | pp4096+tg32 | 15303.20 ± 120.74 | 372.57 ± 6.32 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | tg32 | 0.00 ± 0.00 | 473.82 ± 4.27 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | pp4096+tg32 | 15372.85 ± 239.94 | 378.99 ± 4.72 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | tg32 | 0.00 ± 0.00 | 474.76 ± 7.11 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | pp4096+tg32 | 15373.12 ± 263.84 | 377.83 ± 12.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | tg32 | 0.00 ± 0.00 | 482.19 ± 0.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | pp4096+tg32 | 15515.24 ± 15.85 | 369.73 ± 0.23 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | tg32 | 0.00 ± 0.00 | 482.07 ± 0.63 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | pp4096+tg32 | 15299.93 ± 261.50 | 373.32 ± 9.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | tg32 | 0.00 ± 0.00 | 482.89 ± 0.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | pp4096+tg32 | 15551.65 ± 14.10 | 381.00 ± 6.75 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | tg32 | 0.00 ± 0.00 | 481.55 ± 1.15 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | pp4096+tg32 | 15565.34 ± 5.96 | 385.77 ± 0.25 | ## Output formats @@ -172,8 +184,8 @@ $ ./llama-bench -o md | model | params | backend | ngl | test | prompt t/s | gen t/s | | 
------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp512 | 27663.05 ± 90.18 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 467.13 ± 5.21 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg32 | 0.00 ± 0.00 | 455.34 ± 13.25 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp4096+tg32 | 15479.05 ± 93.15 | 383.70 ± 2.79 | ### CSV @@ -183,8 +195,8 @@ $ ./llama-bench -o csv ```csv build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,test_time,avg_prompt_ns,stddev_prompt_ns,avg_prompt_ts,stddev_prompt_ts,avg_gen_ns,stddev_gen_ns,avg_gen_ts,stddev_gen_ts -"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","2025-04-18T07:40:28Z","18599580","87220","27527.987050","128.945972","60","54","0.000000","0.000000" -"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","2025-04-18T07:40:28Z","0","0","0.000000","0.000000","273595440","4100226","467.926504","6.877051" +"fa6cb8ae","5100","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - 
Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","32","2025-04-18T11:21:18Z","66","58","0.000000","0.000000","71886000","7590","445.149267","0.046999" +"fa6cb8ae","5100","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","4096","32","2025-04-18T11:21:18Z","272293733","3247466","15044.014817","180.586130","87201066","125581","366.968490","0.525734" ``` ### JSON @@ -196,8 +208,8 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "df46ea53", - "build_number": 5099, + "build_commit": "fa6cb8ae", + "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", @@ -221,25 +233,25 @@ $ ./llama-bench -o json "tensor_split": "0.00", "use_mmap": true, "embeddings": false, - "n_prompt": 512, - "n_gen": 0, - "test_time": "2025-04-18T07:41:24Z", - "avg_prompt_ns": 19432500, - "stddev_prompt_ns": 1155276, - "avg_prompt_ts": 26420.253006, - "stddev_prompt_ts": 1527.724050, - "avg_gen_ns": 60, - "stddev_gen_ns": 89, - "avg_gen_ts": 0.000000, - "stddev_gen_ts": 0.000000, - "samples_prompt_ns": [ 18723500, 18641500, 18476200, 21034200, 20287100 ], - "samples_prompt_ts": [ 27345.3, 27465.6, 27711.3, 24341.3, 25237.7 ] - "samples_gen_ns": [ 0, 100, 0, 200, 0 ], - "samples_gen_ts": [ 0 ] + "n_prompt": 0, + "n_gen": 32, + "test_time": "2025-04-18T11:21:45Z", + "avg_prompt_ns": 66, + "stddev_prompt_ns": 58, + "avg_prompt_ts": 0.000000, + "stddev_prompt_ts": 0.000000, + "avg_gen_ns": 67903233, + "stddev_gen_ns": 498856, + "avg_gen_ts": 471.275875, + "stddev_gen_ts": 3.475513, + "samples_prompt_ns": [ 100, 0, 100 ], + "samples_prompt_ts": [ 0 ], + "samples_gen_ns": [ 68251300, 68126600, 67331800 ], + "samples_gen_ts": [ 468.856, 469.714, 475.258 ]
}, { - "build_commit": "df46ea53", - "build_number": 5099, + "build_commit": "fa6cb8ae", + "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", @@ -263,21 +275,21 @@ $ ./llama-bench -o json "tensor_split": "0.00", "use_mmap": true, "embeddings": false, - "n_prompt": 0, - "n_gen": 128, - "test_time": "2025-04-18T07:41:24Z", - "avg_prompt_ns": 20, - "stddev_prompt_ns": 44, - "avg_prompt_ts": 0.000000, - "stddev_prompt_ts": 0.000000, - "avg_gen_ns": 279581280, - "stddev_gen_ns": 7013491, - "avg_gen_ts": 458.054981, - "stddev_gen_ts": 11.337387, - "samples_prompt_ns": [ 0, 0, 0, 0, 100 ], - "samples_prompt_ts": [ 0 ] - "samples_gen_ns": [ 290465300, 280112200, 280351000, 274751300, 272226600 ], - "samples_gen_ts": [ 440.672, 456.96, 456.571, 465.876, 470.197 ] + "n_prompt": 4096, + "n_gen": 32, + "test_time": "2025-04-18T11:21:46Z", + "avg_prompt_ns": 263273600, + "stddev_prompt_ns": 273278, + "avg_prompt_ts": 15557.970647, + "stddev_prompt_ts": 16.143068, + "avg_gen_ns": 85820333, + "stddev_gen_ns": 4372337, + "avg_gen_ts": 373.500825, + "stddev_gen_ts": 18.514532, + "samples_prompt_ns": [ 263043600, 263201500, 263575700 ], + "samples_prompt_ts": [ 15571.6, 15562.2, 15540.1 ], + "samples_gen_ns": [ 82844300, 83776400, 90840300 ], + "samples_gen_ts": [ 386.267, 381.969, 352.267 ] } ] ``` @@ -290,8 +302,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false,
"flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "test_time": "2025-04-18T07:42:16Z", "avg_prompt_ns": 18637700, "stddev_prompt_ns": 164536, "avg_prompt_ts": 27472.914745, "stddev_prompt_ts": 242.676976, "avg_gen_ns": 0, "stddev_gen_ns": 0, "avg_gen_ts": 0.000000, "stddev_gen_ts": 0.000000, "samples_prompt_ns": [ 18782700, 18654200, 18812300, 18466100, 18473200 ],"samples_prompt_ts": [ 27259.1, 27446.9, 27216.2, 27726.5, 27715.8 ]"samples_gen_ns": [ 0, 0, 0, 0, 0 ],"samples_gen_ts": [ 0 ]} -{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "test_time": "2025-04-18T07:42:17Z", "avg_prompt_ns": 100, "stddev_prompt_ns": 122, "avg_prompt_ts": 0.000000, "stddev_prompt_ts": 0.000000, "avg_gen_ns": 273159220, "stddev_gen_ns": 2025528, "avg_gen_ts": 468.611680, "stddev_gen_ts": 3.456568, "samples_prompt_ns": [ 300, 100, 100, 0, 0 ],"samples_prompt_ts": [ 0 ]"samples_gen_ns": [ 276312400, 272096600, 271459600, 274053300, 271874200 ],"samples_gen_ts": [ 463.244, 470.421, 471.525, 467.062, 470.806 ]} +{"build_commit": "fa6cb8ae", "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, 
"n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 32, "test_time": "2025-04-18T11:22:14Z", "avg_prompt_ns": 100, "stddev_prompt_ns": 0, "avg_prompt_ts": 0.000000, "stddev_prompt_ts": 0.000000, "avg_gen_ns": 71156300, "stddev_gen_ns": 912152, "avg_gen_ts": 449.763857, "stddev_gen_ts": 5.808090, "samples_prompt_ns": [ 100, 100, 100 ],"samples_prompt_ts": [ 0 ],"samples_gen_ns": [ 71725200, 71639500, 70104200 ],"samples_gen_ts": [ 446.147, 446.681, 456.463 ]} +{"build_commit": "fa6cb8ae", "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 4096, "n_gen": 32, "test_time": "2025-04-18T11:22:14Z", "avg_prompt_ns": 267673800, "stddev_prompt_ns": 4917668, "avg_prompt_ts": 15305.627579, "stddev_prompt_ts": 279.255714, "avg_gen_ns": 83914500, "stddev_gen_ns": 1515058, "avg_gen_ts": 381.422650, "stddev_gen_ts": 6.822569, "samples_prompt_ns": [ 266315000, 273128000, 263578400 ],"samples_prompt_ts": [ 15380.3, 14996.6, 15540 ],"samples_gen_ns": [ 85644600, 83274100, 82824800 ],"samples_gen_ts": [ 373.637, 384.273, 386.358 ]} ``` @@ -343,6 +355,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_gen_ts REAL ); -INSERT INTO test (build_commit, build_number, cpu_info,
gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '2025-04-18T07:42:43Z', '18543960', '131206', '27611.175041', '195.547424', '60', '54', '0.000000', '0.000000'); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '2025-04-18T07:42:43Z', '20', '44', '0.000000', '0.000000', '274190080', '2765950', '466.867210', '4.680900'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, 
embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('fa6cb8ae', '5100', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '32', '2025-04-18T11:22:37Z', '66', '58', '0.000000', '0.000000', '70741266', '2050337', '452.606173', '13.122321'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('fa6cb8ae', '5100', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '4096', '32', '2025-04-18T11:22:37Z', '270934866', '4466069', '15120.737903', '246.900896', '85258733', '2156168', '375.487736', '9.468350'); ```