From cac00cff1816e818fc103d9d9a37ddc18045c1f6 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Thu, 10 Apr 2025 10:50:21 +0000 Subject: [PATCH 1/5] update llama-bench with prompt and gen throughput metrics --- examples/llama-bench/llama-bench.cpp | 114 ++++++++++++++++++++------- 1 file changed, 87 insertions(+), 27 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index cbcbfcee861ee..258842f402a56 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -901,7 +901,9 @@ struct test { int n_prompt; int n_gen; std::string test_time; - std::vector samples_ns; + std::vector samples_e2e_ns; // e2e latency including prompt processing + token generation + std::vector samples_prompt_ns; // prompt processing latency + std::vector samples_gen_ns; // token generation latency test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) : cpu_info(get_cpu_info()), @@ -939,21 +941,42 @@ struct test { (void) ctx; } - uint64_t avg_ns() const { return ::avg(samples_ns); } + uint64_t avg_e2e_ns() const { return ::avg(samples_e2e_ns); } + uint64_t avg_prompt_ns() const { return ::avg(samples_prompt_ns); } + uint64_t avg_gen_ns() const { return ::avg(samples_gen_ns); } - uint64_t stdev_ns() const { return ::stdev(samples_ns); } + uint64_t stddev_e2e_ns() const { return ::stdev(samples_e2e_ns); } + uint64_t stddev_prompt_ns() const { return ::stdev(samples_prompt_ns); } + uint64_t stddev_gen_ns() const { return ::stdev(samples_gen_ns); } - std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + std::vector get_ts(const std::vector & samples_ns, int n_tokens) const { + if(n_tokens==0) + return {0}; std::vector ts; std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } + + std::vector get_e2e_ts() const { + // for only prompt processing, atleast 1 token is 
generated + int n_tokens = n_gen==0 ? 1 : n_gen; + return get_ts(samples_e2e_ns, n_tokens); + } + std::vector get_prompt_ts() const { + return get_ts(samples_prompt_ns, n_prompt); + } + std::vector get_gen_ts() const { + return get_ts(samples_gen_ns, n_gen); + } - double avg_ts() const { return ::avg(get_ts()); } + double avg_e2e_ts() const { return ::avg(get_e2e_ts()); } + double avg_prompt_ts() const { return ::avg(get_prompt_ts()); } + double avg_gen_ts() const { return ::avg(get_gen_ts()); } - double stdev_ts() const { return ::stdev(get_ts()); } + double stdev_e2e_ts() const { return ::stdev(get_e2e_ts()); } + double stdev_prompt_ts() const { return ::stdev(get_prompt_ts()); } + double stdev_gen_ts() const { return ::stdev(get_gen_ts()); } static std::string get_backend() { std::vector backends; @@ -973,8 +996,10 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "embeddings", "n_prompt", "n_gen", "test_time", + "avg_e2e_ns", "stddev_e2e_ns", "avg_e2e_ts", "stddev_e2e_ts", + "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts", + "avg_gen_ns", "stddev_gen_ns", "avg_gen_ts", "stddev_gen_ts" }; return fields; } @@ -984,15 +1009,16 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || - field == "stddev_ns") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_e2e_ns" || + field == "stddev_e2e_ns" || field 
== "avg_prompt_ns" || field == "stddev_prompt_ns" || + field == "avg_gen_ns" || field == "stddev_gen_ns") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_ts" || field == "stddev_ts") { + if (field == "avg_e2e_ts" || field == "stddev_e2e_ts" || field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") { return FLOAT; } return STRING; @@ -1042,10 +1068,18 @@ struct test { std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), - std::to_string(stdev_ns()), - std::to_string(avg_ts()), - std::to_string(stdev_ts()) }; + std::to_string(avg_e2e_ns()), + std::to_string(stddev_e2e_ns()), + std::to_string(avg_e2e_ts()), + std::to_string(stdev_e2e_ts()), + std::to_string(avg_prompt_ns()), + std::to_string(stddev_prompt_ns()), + std::to_string(avg_prompt_ts()), + std::to_string(stdev_prompt_ts()), + std::to_string(avg_gen_ns()), + std::to_string(stddev_gen_ns()), + std::to_string(avg_gen_ts()), + std::to_string(stdev_gen_ts()) }; return values; } @@ -1153,8 +1187,12 @@ struct json_printer : public printer { } fprintf(fout, " {\n"); print_fields(test::get_fields(), t.get_values()); - fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); - fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " \"samples_e2e_ns\": [ %s ],\n", join(t.samples_e2e_ns, ", ").c_str()); + fprintf(fout, " \"samples_e2e_ts\": [ %s ]\n", join(t.get_e2e_ts(), ", ").c_str()); + fprintf(fout, " \"samples_prompt_ns\": [ %s ],\n", join(t.samples_prompt_ns, ", ").c_str()); + fprintf(fout, " \"samples_prompt_ts\": [ %s ]\n", join(t.get_prompt_ts(), ", ").c_str()); + fprintf(fout, " \"samples_gen_ns\": [ %s ],\n", join(t.samples_gen_ns, ", ").c_str()); + fprintf(fout, " \"samples_gen_ts\": [ %s ]\n", 
join(t.get_gen_ts(), ", ").c_str()); fprintf(fout, " }"); fflush(fout); } @@ -1173,8 +1211,12 @@ struct jsonl_printer : public printer { void print_test(const test & t) override { fprintf(fout, "{"); print_fields(test::get_fields(), t.get_values()); - fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str()); - fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "\"samples_e2e_ns\": [ %s ],", join(t.samples_e2e_ns, ", ").c_str()); + fprintf(fout, "\"samples_e2e_ts\": [ %s ]", join(t.get_e2e_ts(), ", ").c_str()); + fprintf(fout, "\"samples_prompt_ns\": [ %s ],", join(t.samples_prompt_ns, ", ").c_str()); + fprintf(fout, "\"samples_prompt_ts\": [ %s ]", join(t.get_prompt_ts(), ", ").c_str()); + fprintf(fout, "\"samples_gen_ns\": [ %s ],", join(t.samples_gen_ns, ", ").c_str()); + fprintf(fout, "\"samples_gen_ts\": [ %s ]", join(t.get_gen_ts(), ", ").c_str()); fprintf(fout, "}\n"); fflush(fout); } @@ -1187,7 +1229,7 @@ struct markdown_printer : public printer { if (field == "model") { return -30; } - if (field == "t/s") { + if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { return 20; } if (field == "size" || field == "params") { @@ -1314,7 +1356,9 @@ struct markdown_printer : public printer { fields.emplace_back("embeddings"); } fields.emplace_back("test"); - fields.emplace_back("t/s"); + fields.emplace_back("e2e t/s"); + fields.emplace_back("prompt t/s"); + fields.emplace_back("gen t/s"); fprintf(fout, "|"); for (const auto & field : fields) { @@ -1363,8 +1407,14 @@ struct markdown_printer : public printer { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } value = buf; - } else if (field == "t/s") { - snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); + } else if (field == "e2e t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_e2e_ts(), t.stdev_e2e_ts()); + value = buf; + } else if (field == "prompt t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", 
t.avg_prompt_ts(), t.stdev_prompt_ts()); + value = buf; + } else if (field == "gen t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_gen_ts(), t.stdev_gen_ts()); value = buf; } else if (vmap.find(field) != vmap.end()) { value = vmap.at(field); @@ -1374,7 +1424,7 @@ struct markdown_printer : public printer { } int width = get_field_width(field); - if (field == "t/s") { + if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { // HACK: the utf-8 character is 2 bytes width += 1; } @@ -1629,6 +1679,9 @@ int main(int argc, char ** argv) { } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } + + uint64_t t_gen_start = get_time_ns(); + if (t.n_gen > 0) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, @@ -1637,8 +1690,15 @@ int main(int argc, char ** argv) { test_gen(ctx, t.n_gen, t.n_threads); } - uint64_t t_ns = get_time_ns() - t_start; - t.samples_ns.push_back(t_ns); + uint64_t t_end = get_time_ns(); + + uint64_t e2e_ns = t_end - t_start; + uint64_t prompt_ns = t_gen_start - t_start; + uint64_t gen_ns = t_end - t_gen_start; + + t.samples_e2e_ns.push_back(e2e_ns); + t.samples_prompt_ns.push_back(prompt_ns); + t.samples_gen_ns.push_back(gen_ns); } if (p) { From df46ea53bc0d0d7fe1542bcb62b71d1fb4fd8fbe Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Tue, 15 Apr 2025 11:44:07 +0530 Subject: [PATCH 2/5] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- examples/llama-bench/llama-bench.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 258842f402a56..447e978f09bc3 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -901,7 +901,7 @@ struct test { int n_prompt; int n_gen; std::string test_time; - std::vector 
samples_e2e_ns; // e2e latency including prompt processing + token generation + std::vector samples_e2e_ns; // e2e latency, i.e. prompt processing + token generation std::vector samples_prompt_ns; // prompt processing latency std::vector samples_gen_ns; // token generation latency @@ -959,7 +959,7 @@ struct test { } std::vector get_e2e_ts() const { - // for only prompt processing, atleast 1 token is generated + // for only prompt processing, at least 1 token is generated int n_tokens = n_gen==0 ? 1 : n_gen; return get_ts(samples_e2e_ns, n_tokens); } From fa6cb8aecc923cbbb23c57e04b3949c451a98996 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 18 Apr 2025 14:08:34 +0530 Subject: [PATCH 3/5] update readme --- examples/llama-bench/README.md | 274 ++++++++++++++++----------- examples/llama-bench/llama-bench.cpp | 54 ++---- 2 files changed, 175 insertions(+), 153 deletions(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 6bbe4bb75fbf8..c77566f806dd5 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -74,20 +74,32 @@ Note: ## Examples +### Prompt processing and text generation + +```sh +$ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -p 0 -n 0 -pg 100,100 -pg 500,100 +``` + +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp100+tg100 | 14303.91 ± 362.95 | 455.50 ± 11.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp500+tg100 | 28026.49 ± 970.98 | 440.05 ± 4.57 | + + ### Text generation with different models ```sh -$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +$ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -m models/Llama-3.2-3B-Instruct-Q4_K_M.gguf -p 0 -n 128,256,512 ``` -| model | size | params | backend | ngl | test 
| t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 469.99 ± 2.69 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 454.10 ± 9.76 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 444.62 ± 11.83 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 219.82 ± 0.37 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 215.15 ± 2.04 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 211.12 ± 1.43 | ### Prompt processing with different batch sizes @@ -95,12 +107,12 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0. 
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 ``` -| model | size | params | backend | ngl | n_batch | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | +| model | params | backend | ngl | n_batch | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp1024 | 16751.82 ± 667.31 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp1024 | 23255.17 ± 446.86 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp1024 | 25544.36 ± 571.16 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp1024 | 25610.04 ± 606.37 | 0.00 ± 0.00 | ### Different numbers of threads @@ -108,20 +120,20 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 ``` -| model | size | params | backend | threads | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 
| tg 16 | 12.22 ± 0.07 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 || +| model | params | backend | ngl | threads | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp64 | 10322.32 ± 193.62 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | tg16 | 0.00 ± 0.00 | 444.22 ± 6.66 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp64 | 7313.31 ± 145.33 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | tg16 | 0.00 ± 0.00 | 468.99 ± 12.30 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp64 | 10111.46 ± 1261.15 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | tg16 | 0.00 ± 0.00 | 464.07 ± 18.15 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp64 | 9605.79 ± 1684.50 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | tg16 | 0.00 ± 0.00 | 469.92 ± 16.23 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp64 | 10336.80 ± 740.34 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | tg16 | 0.00 ± 0.00 | 472.06 ± 10.29 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp64 | 8819.08 ± 1529.51 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | tg16 | 0.00 ± 0.00 | 458.20 ± 15.14 | ### Different numbers of layers offloaded to the GPU @@ -129,24 +141,24 @@ $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 ``` 
-| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | pp512 | 12082.67 ± 403.77 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | tg128 | 0.00 ± 0.00 | 106.37 ± 2.72 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | pp512 | 16742.49 ± 8252.51 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | 
tg128 | 0.00 ± 0.00 | 454.01 ± 8.64 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | pp512 | 29580.40 ± 106.86 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | tg128 | 0.00 ± 0.00 | 457.68 ± 9.88 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | pp512 | 29594.52 ± 154.46 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | tg128 | 0.00 ± 0.00 | 465.27 ± 9.24 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | pp512 | 29503.27 ± 174.82 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | tg128 | 0.00 ± 0.00 | 467.16 ± 2.22 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | pp512 | 29479.41 ± 180.78 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | tg128 | 0.00 ± 0.00 | 465.67 ± 6.10 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | pp512 | 29446.50 ± 59.09 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | tg128 | 0.00 ± 0.00 | 470.60 ± 2.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | pp512 | 29369.74 ± 229.29 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | tg128 | 0.00 ± 0.00 | 467.41 ± 6.81 | ## Output formats @@ -158,10 +170,10 @@ By default, llama-bench outputs the results in markdown format. 
The results can $ ./llama-bench -o md ``` -| model | size | params | backend | ngl | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | +| model | params | backend | ngl | test | prompt t/s | gen t/s | +| ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp512 | 27663.05 ± 90.18 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 467.13 ± 5.21 | ### CSV @@ -170,9 +182,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" 
+build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,test_time,avg_prompt_ns,stddev_prompt_ns,avg_prompt_ts,stddev_prompt_ts,avg_gen_ns,stddev_gen_ns,avg_gen_ts,stddev_gen_ts +"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","2025-04-18T07:40:28Z","18599580","87220","27527.987050","128.945972","60","54","0.000000","0.000000" +"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","2025-04-18T07:40:28Z","0","0","0.000000","0.000000","273595440","4100226","467.926504","6.877051" ``` ### JSON @@ -184,64 +196,88 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "df46ea53", + "build_number": 5099, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", + "model_type": "llama 1B Q4_K - Medium", + "model_size": 799862912, + "model_n_params": 1235814432, + 
"n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 512, "n_gen": 0, - "test_time": "2023-09-23T12:09:57Z", - "avg_ns": 212365953, - "stddev_ns": 985423, - "avg_ts": 2410.974041, - "stddev_ts": 11.163766, - "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], - "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] + "test_time": "2025-04-18T07:41:24Z", + "avg_prompt_ns": 19432500, + "stddev_prompt_ns": 1155276, + "avg_prompt_ts": 26420.253006, + "stddev_prompt_ts": 1527.724050, + "avg_gen_ns": 60, + "stddev_gen_ns": 89, + "avg_gen_ts": 0.000000, + "stddev_gen_ts": 0.000000, + "samples_prompt_ns": [ 18723500, 18641500, 18476200, 21034200, 20287100 ], + "samples_prompt_ts": [ 27345.3, 27465.6, 27711.3, 24341.3, 25237.7 ] + "samples_gen_ns": [ 0, 100, 0, 200, 0 ], + "samples_gen_ts": [ 0 ] }, { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "df46ea53", + "build_number": 5099, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", + "model_type": "llama 1B Q4_K - Medium", + "model_size": 799862912, + "model_n_params": 1235814432, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": 
false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 0, "n_gen": 128, - "test_time": "2023-09-23T12:09:59Z", - "avg_ns": 977425219, - "stddev_ns": 9268593, - "avg_ts": 130.965708, - "stddev_ts": 1.238924, - "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], - "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] + "test_time": "2025-04-18T07:41:24Z", + "avg_prompt_ns": 20, + "stddev_prompt_ns": 44, + "avg_prompt_ts": 0.000000, + "stddev_prompt_ts": 0.000000, + "avg_gen_ns": 279581280, + "stddev_gen_ns": 7013491, + "avg_gen_ts": 458.054981, + "stddev_gen_ts": 11.337387, + "samples_prompt_ns": [ 0, 0, 0, 0, 100 ], + "samples_prompt_ts": [ 0 ] + "samples_gen_ns": [ 290465300, 280112200, 280351000, 274751300, 272226600 ], + "samples_gen_ts": [ 440.672, 456.96, 456.571, 465.876, 470.197 ] } ] ``` @@ -254,8 +290,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 
3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "test_time": "2025-04-18T07:42:16Z", "avg_prompt_ns": 18637700, "stddev_prompt_ns": 164536, "avg_prompt_ts": 27472.914745, "stddev_prompt_ts": 242.676976, "avg_gen_ns": 0, "stddev_gen_ns": 0, "avg_gen_ts": 0.000000, "stddev_gen_ts": 0.000000, "samples_prompt_ns": [ 18782700, 18654200, 18812300, 18466100, 18473200 ],"samples_prompt_ts": [ 27259.1, 27446.9, 27216.2, 27726.5, 27715.8 ]"samples_gen_ns": [ 0, 0, 0, 0, 0 ],"samples_gen_ts": [ 0 ]} +{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, 
"cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "test_time": "2025-04-18T07:42:17Z", "avg_prompt_ns": 100, "stddev_prompt_ns": 122, "avg_prompt_ts": 0.000000, "stddev_prompt_ts": 0.000000, "avg_gen_ns": 273159220, "stddev_gen_ns": 2025528, "avg_gen_ts": 468.611680, "stddev_gen_ts": 3.456568, "samples_prompt_ns": [ 300, 100, 100, 0, 0 ],"samples_prompt_ts": [ 0 ]"samples_gen_ns": [ 276312400, 272096600, 271459600, 274053300, 271874200 ],"samples_gen_ts": [ 463.244, 470.421, 471.525, 467.062, 470.806 ]} ``` @@ -271,32 +307,42 @@ $ ./llama-bench -o sql CREATE TABLE IF NOT EXISTS test ( build_commit TEXT, build_number INTEGER, - cuda INTEGER, - metal INTEGER, - gpu_blas INTEGER, - blas INTEGER, cpu_info TEXT, gpu_info TEXT, + backends TEXT, model_filename TEXT, model_type TEXT, model_size INTEGER, model_n_params INTEGER, n_batch INTEGER, + n_ubatch INTEGER, n_threads INTEGER, - f16_kv INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, n_gpu_layers INTEGER, + split_mode TEXT, main_gpu INTEGER, - mul_mat_q INTEGER, + no_kv_offload INTEGER, + flash_attn INTEGER, tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, n_prompt INTEGER, n_gen INTEGER, test_time TEXT, - avg_ns INTEGER, - stddev_ns INTEGER, - avg_ts REAL, - stddev_ts REAL + avg_prompt_ns INTEGER, + stddev_prompt_ns INTEGER, + avg_prompt_ts REAL, + stddev_prompt_ts REAL, + avg_gen_ns INTEGER, + stddev_gen_ns INTEGER, + avg_gen_ts REAL, + stddev_gen_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, 
avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '2025-04-18T07:42:43Z', '18543960', '131206', '27611.175041', '195.547424', '60', '54', '0.000000', '0.000000'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, 
model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '2025-04-18T07:42:43Z', '20', '44', '0.000000', '0.000000', '274190080', '2765950', '466.867210', '4.680900'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 447e978f09bc3..2607eae9ae8d8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -901,7 +901,6 @@ struct test { int n_prompt; int n_gen; std::string test_time; - std::vector samples_e2e_ns; // e2e latency, i.e. prompt processing + token generation std::vector samples_prompt_ns; // prompt processing latency std::vector samples_gen_ns; // token generation latency @@ -941,11 +940,9 @@ struct test { (void) ctx; } - uint64_t avg_e2e_ns() const { return ::avg(samples_e2e_ns); } uint64_t avg_prompt_ns() const { return ::avg(samples_prompt_ns); } uint64_t avg_gen_ns() const { return ::avg(samples_gen_ns); } - uint64_t stddev_e2e_ns() const { return ::stdev(samples_e2e_ns); } uint64_t stddev_prompt_ns() const { return ::stdev(samples_prompt_ns); } uint64_t stddev_gen_ns() const { return ::stdev(samples_gen_ns); } @@ -958,11 +955,6 @@ struct test { return ts; } - std::vector get_e2e_ts() const { - // for only prompt processing, at least 1 token is generated - int n_tokens = n_gen==0 ? 
1 : n_gen; - return get_ts(samples_e2e_ns, n_tokens); - } std::vector get_prompt_ts() const { return get_ts(samples_prompt_ns, n_prompt); } @@ -970,11 +962,9 @@ struct test { return get_ts(samples_gen_ns, n_gen); } - double avg_e2e_ts() const { return ::avg(get_e2e_ts()); } double avg_prompt_ts() const { return ::avg(get_prompt_ts()); } double avg_gen_ts() const { return ::avg(get_gen_ts()); } - double stdev_e2e_ts() const { return ::stdev(get_e2e_ts()); } double stdev_prompt_ts() const { return ::stdev(get_prompt_ts()); } double stdev_gen_ts() const { return ::stdev(get_gen_ts()); } @@ -992,14 +982,13 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", - "avg_e2e_ns", "stddev_e2e_ns", "avg_e2e_ts", "stddev_e2e_ts", - "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts", - "avg_gen_ns", "stddev_gen_ns", "avg_gen_ts", "stddev_gen_ts" + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "test_time", + "avg_prompt_ns", "stddev_prompt_ns", "avg_prompt_ts", "stddev_prompt_ts", + "avg_gen_ns", "stddev_gen_ns", "avg_gen_ts", "stddev_gen_ts" }; return fields; } @@ -1009,8 +998,7 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" 
|| field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_e2e_ns" || - field == "stddev_e2e_ns" || field == "avg_prompt_ns" || field == "stddev_prompt_ns" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_prompt_ns" || field == "stddev_prompt_ns" || field == "avg_gen_ns" || field == "stddev_gen_ns") { return INT; } @@ -1018,7 +1006,7 @@ struct test { field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_e2e_ts" || field == "stddev_e2e_ts" || field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") { + if (field == "avg_prompt_ts" || field == "stddev_prompt_ts" || field == "avg_gen_ts" || field == "stddev_gen_ts") { return FLOAT; } return STRING; @@ -1068,10 +1056,6 @@ struct test { std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_e2e_ns()), - std::to_string(stddev_e2e_ns()), - std::to_string(avg_e2e_ts()), - std::to_string(stdev_e2e_ts()), std::to_string(avg_prompt_ns()), std::to_string(stddev_prompt_ns()), std::to_string(avg_prompt_ts()), @@ -1187,8 +1171,6 @@ struct json_printer : public printer { } fprintf(fout, " {\n"); print_fields(test::get_fields(), t.get_values()); - fprintf(fout, " \"samples_e2e_ns\": [ %s ],\n", join(t.samples_e2e_ns, ", ").c_str()); - fprintf(fout, " \"samples_e2e_ts\": [ %s ]\n", join(t.get_e2e_ts(), ", ").c_str()); fprintf(fout, " \"samples_prompt_ns\": [ %s ],\n", join(t.samples_prompt_ns, ", ").c_str()); fprintf(fout, " \"samples_prompt_ts\": [ %s ]\n", join(t.get_prompt_ts(), ", ").c_str()); fprintf(fout, " \"samples_gen_ns\": [ %s ],\n", join(t.samples_gen_ns, ", ").c_str()); @@ -1211,8 +1193,6 @@ struct jsonl_printer : public printer { void print_test(const test & t) override { fprintf(fout, "{"); print_fields(test::get_fields(), 
t.get_values()); - fprintf(fout, "\"samples_e2e_ns\": [ %s ],", join(t.samples_e2e_ns, ", ").c_str()); - fprintf(fout, "\"samples_e2e_ts\": [ %s ]", join(t.get_e2e_ts(), ", ").c_str()); fprintf(fout, "\"samples_prompt_ns\": [ %s ],", join(t.samples_prompt_ns, ", ").c_str()); fprintf(fout, "\"samples_prompt_ts\": [ %s ]", join(t.get_prompt_ts(), ", ").c_str()); fprintf(fout, "\"samples_gen_ns\": [ %s ],", join(t.samples_gen_ns, ", ").c_str()); @@ -1229,8 +1209,11 @@ struct markdown_printer : public printer { if (field == "model") { return -30; } - if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { - return 20; + if (field == "prompt t/s") { + return 18; + } + if (field == "gen t/s") { + return 15; } if (field == "size" || field == "params") { return 10; @@ -1302,7 +1285,6 @@ struct markdown_printer : public printer { void print_header(const cmd_params & params) override { // select fields to print fields.emplace_back("model"); - fields.emplace_back("size"); fields.emplace_back("params"); fields.emplace_back("backend"); bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos || @@ -1356,7 +1338,6 @@ struct markdown_printer : public printer { fields.emplace_back("embeddings"); } fields.emplace_back("test"); - fields.emplace_back("e2e t/s"); fields.emplace_back("prompt t/s"); fields.emplace_back("gen t/s"); @@ -1407,9 +1388,6 @@ struct markdown_printer : public printer { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } value = buf; - } else if (field == "e2e t/s") { - snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_e2e_ts(), t.stdev_e2e_ts()); - value = buf; } else if (field == "prompt t/s") { snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_prompt_ts(), t.stdev_prompt_ts()); value = buf; @@ -1424,7 +1402,7 @@ struct markdown_printer : public printer { } int width = get_field_width(field); - if (field == "e2e t/s" || field == "prompt t/s" || field == "gen t/s") { + if (field == "prompt t/s" || field == "gen t/s") { 
// HACK: the utf-8 character is 2 bytes width += 1; } @@ -1692,11 +1670,9 @@ int main(int argc, char ** argv) { uint64_t t_end = get_time_ns(); - uint64_t e2e_ns = t_end - t_start; uint64_t prompt_ns = t_gen_start - t_start; uint64_t gen_ns = t_end - t_gen_start; - t.samples_e2e_ns.push_back(e2e_ns); t.samples_prompt_ns.push_back(prompt_ns); t.samples_gen_ns.push_back(gen_ns); } From f3d697a3d7153a92bc7d075c246a58505abcbec4 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 18 Apr 2025 16:00:25 +0530 Subject: [PATCH 4/5] update default llama-bench params --- examples/llama-bench/llama-bench.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2607eae9ae8d8..29c45b55b904b 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -189,9 +189,9 @@ struct cmd_params { static const cmd_params cmd_params_defaults = { /* model */ { "models/7B/ggml-model-q4_0.gguf" }, - /* n_prompt */ { 512 }, - /* n_gen */ { 128 }, - /* n_pg */ {}, + /* n_prompt */ { 0 }, + /* n_gen */ { 32 }, + /* n_pg */ { { 4096, 32 } }, /* n_batch */ { 2048 }, /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, @@ -210,7 +210,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ { true }, /* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* reps */ 5, + /* reps */ 3, /* prio */ GGML_SCHED_PRIO_NORMAL, /* delay */ 0, /* verbose */ false, From 5f01fabd354020a59aec834d9a610989e802b5b5 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 18 Apr 2025 16:56:22 +0530 Subject: [PATCH 5/5] update llama-bench readme to reflect new default params --- examples/llama-bench/README.md | 178 ++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 83 deletions(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index c77566f806dd5..008f284fef519 100644 --- a/examples/llama-bench/README.md 
+++ b/examples/llama-bench/README.md @@ -25,9 +25,9 @@ usage: ./llama-bench [options] options: -h, --help -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -pg (default: ) + -p, --n-prompt (default: 0) + -n, --n-gen (default: 32) + -pg (default: 4096,32) -b, --batch-size (default: 2048) -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) @@ -94,12 +94,14 @@ $ ./llama-bench -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -m models/Llama-3.2- | model | params | backend | ngl | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 469.99 ± 2.69 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 454.10 ± 9.76 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 444.62 ± 11.83 | -| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 219.82 ± 0.37 | -| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 215.15 ± 2.04 | -| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 211.12 ± 1.43 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 469.34 ± 2.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 459.78 ± 9.43 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 449.25 ± 11.74 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp4096+tg32 | 15545.82 ± 8.35 | 385.90 ± 3.47 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 212.78 ± 5.12 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg256 | 0.00 ± 0.00 | 214.56 ± 2.16 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | tg512 | 0.00 ± 0.00 | 212.84 ± 1.41 | +| llama 3B Q4_K - Medium | 3.21 B | CUDA | 99 | pp4096+tg32 | 8825.07 ± 100.28 | 177.25 ± 1.89 | ### Prompt processing with different batch 
sizes @@ -109,10 +111,14 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 | model | params | backend | ngl | n_batch | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp1024 | 16751.82 ± 667.31 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp1024 | 23255.17 ± 446.86 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp1024 | 25544.36 ± 571.16 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp1024 | 25610.04 ± 606.37 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp1024 | 17125.18 ± 731.13 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 128 | pp4096+tg32 | 12139.39 ± 446.63 | 378.76 ± 8.18 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp1024 | 24112.17 ± 161.18 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 256 | pp4096+tg32 | 14508.80 ± 53.00 | 386.58 ± 0.42 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp1024 | 25534.56 ± 368.03 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 512 | pp4096+tg32 | 15388.41 ± 13.06 | 386.30 ± 0.53 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp1024 | 25654.61 ± 772.86 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1024 | pp4096+tg32 | 15487.92 ± 8.59 | 385.20 ± 0.50 | ### Different numbers of threads @@ -122,18 +128,24 @@ $ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 | model | params | backend | ngl | threads | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp64 | 10322.32 ± 193.62 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | tg16 | 0.00 ± 0.00 | 444.22 ± 6.66 | -| llama 1B Q4_K 
- Medium | 1.24 B | CUDA | 99 | 2 | pp64 | 7313.31 ± 145.33 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | tg16 | 0.00 ± 0.00 | 468.99 ± 12.30 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp64 | 10111.46 ± 1261.15 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | tg16 | 0.00 ± 0.00 | 464.07 ± 18.15 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp64 | 9605.79 ± 1684.50 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | tg16 | 0.00 ± 0.00 | 469.92 ± 16.23 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp64 | 10336.80 ± 740.34 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | tg16 | 0.00 ± 0.00 | 472.06 ± 10.29 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp64 | 8819.08 ± 1529.51 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | tg16 | 0.00 ± 0.00 | 458.20 ± 15.14 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp64 | 9229.99 ± 1897.41 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | tg16 | 0.00 ± 0.00 | 444.33 ± 25.11 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 1 | pp4096+tg32 | 15357.53 ± 27.52 | 373.90 ± 7.03 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp64 | 10799.57 ± 33.90 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | tg16 | 0.00 ± 0.00 | 461.43 ± 10.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 2 | pp4096+tg32 | 15371.18 ± 57.24 | 372.59 ± 4.02 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp64 | 11033.35 ± 177.05 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | tg16 | 0.00 ± 0.00 | 448.57 ± 8.66 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 4 | pp4096+tg32 | 15371.12 ± 43.70 | 376.71 ± 0.93 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | pp64 | 11206.45 ± 187.47 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 8 | tg16 | 0.00 ± 0.00 | 457.99 ± 6.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 
99 | 8 | pp4096+tg32 | 15022.14 ± 161.68 | 369.76 ± 4.71 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp64 | 10397.19 ± 304.08 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | tg16 | 0.00 ± 0.00 | 457.53 ± 7.06 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 16 | pp4096+tg32 | 15434.32 ± 158.08 | 372.00 ± 3.34 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp64 | 10588.34 ± 1043.71 | 0.00 ± 0.00 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | tg16 | 0.00 ± 0.00 | 468.10 ± 9.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | 32 | pp4096+tg32 | 15544.54 ± 4.30 | 374.14 ± 7.18 | ### Different numbers of layers offloaded to the GPU @@ -143,22 +155,22 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 | model | params | backend | ngl | test | prompt t/s | gen t/s | | ------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | pp512 | 12082.67 ± 403.77 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | tg128 | 0.00 ± 0.00 | 106.37 ± 2.72 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | pp512 | 16742.49 ± 8252.51 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | tg128 | 0.00 ± 0.00 | 454.01 ± 8.64 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | pp512 | 29580.40 ± 106.86 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | tg128 | 0.00 ± 0.00 | 457.68 ± 9.88 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | pp512 | 29594.52 ± 154.46 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | tg128 | 0.00 ± 0.00 | 465.27 ± 9.24 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | pp512 | 29503.27 ± 174.82 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | tg128 | 0.00 ± 0.00 | 467.16 ± 2.22 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | pp512 | 29479.41 ± 180.78 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | tg128 | 0.00 ± 0.00 | 
465.67 ± 6.10 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | pp512 | 29446.50 ± 59.09 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | tg128 | 0.00 ± 0.00 | 470.60 ± 2.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | pp512 | 29369.74 ± 229.29 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | tg128 | 0.00 ± 0.00 | 467.41 ± 6.81 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | tg32 | 0.00 ± 0.00 | 107.29 ± 1.37 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 10 | pp4096+tg32 | 8458.79 ± 154.44 | 70.84 ± 0.10 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | tg32 | 0.00 ± 0.00 | 484.02 ± 0.93 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 20 | pp4096+tg32 | 15303.20 ± 120.74 | 372.57 ± 6.32 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | tg32 | 0.00 ± 0.00 | 473.82 ± 4.27 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 30 | pp4096+tg32 | 15372.85 ± 239.94 | 378.99 ± 4.72 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | tg32 | 0.00 ± 0.00 | 474.76 ± 7.11 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 31 | pp4096+tg32 | 15373.12 ± 263.84 | 377.83 ± 12.16 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | tg32 | 0.00 ± 0.00 | 482.19 ± 0.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 32 | pp4096+tg32 | 15515.24 ± 15.85 | 369.73 ± 0.23 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | tg32 | 0.00 ± 0.00 | 482.07 ± 0.63 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 33 | pp4096+tg32 | 15299.93 ± 261.50 | 373.32 ± 9.92 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | tg32 | 0.00 ± 0.00 | 482.89 ± 0.99 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 34 | pp4096+tg32 | 15551.65 ± 14.10 | 381.00 ± 6.75 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | tg32 | 0.00 ± 0.00 | 481.55 ± 1.15 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 35 | pp4096+tg32 | 15565.34 ± 5.96 | 385.77 ± 0.25 | ## Output formats @@ -172,8 +184,8 @@ $ ./llama-bench -o md | model | params | backend | ngl | test | prompt t/s | gen t/s | | 
------------------------------ | ---------: | ---------- | --: | ------------: | -----------------: | --------------: | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp512 | 27663.05 ± 90.18 | 0.00 ± 0.00 | -| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg128 | 0.00 ± 0.00 | 467.13 ± 5.21 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | tg32 | 0.00 ± 0.00 | 455.34 ± 13.25 | +| llama 1B Q4_K - Medium | 1.24 B | CUDA | 99 | pp4096+tg32 | 15479.05 ± 93.15 | 383.70 ± 2.79 | ### CSV @@ -183,8 +195,8 @@ $ ./llama-bench -o csv ```csv build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,test_time,avg_prompt_ns,stddev_prompt_ns,avg_prompt_ts,stddev_prompt_ts,avg_gen_ns,stddev_gen_ns,avg_gen_ts,stddev_gen_ts -"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","2025-04-18T07:40:28Z","18599580","87220","27527.987050","128.945972","60","54","0.000000","0.000000" -"df46ea53","5099","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","2025-04-18T07:40:28Z","0","0","0.000000","0.000000","273595440","4100226","467.926504","6.877051" +"fa6cb8ae","5100","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - 
Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","32","2025-04-18T11:21:18Z","66","58","0.000000","0.000000","71886000","7590","445.149267","0.046999" +"fa6cb8ae","5100","AMD Ryzen 7 7800X3D 8-Core Processor ","NVIDIA GeForce RTX 4080","CUDA","models/Llama-3.2-1B-Instruct-Q4_K_M.gguf","llama 1B Q4_K - Medium","799862912","1235814432","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","4096","32","2025-04-18T11:21:18Z","272293733","3247466","15044.014817","180.586130","87201066","125581","366.968490","0.525734" ``` ### JSON @@ -196,8 +208,8 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "df46ea53", - "build_number": 5099, + "build_commit": "fa6cb8ae", + "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", @@ -221,25 +233,25 @@ $ ./llama-bench -o json "tensor_split": "0.00", "use_mmap": true, "embeddings": false, - "n_prompt": 512, - "n_gen": 0, - "test_time": "2025-04-18T07:41:24Z", - "avg_prompt_ns": 19432500, - "stddev_prompt_ns": 1155276, - "avg_prompt_ts": 26420.253006, - "stddev_prompt_ts": 1527.724050, - "avg_gen_ns": 60, - "stddev_gen_ns": 89, - "avg_gen_ts": 0.000000, - "stddev_gen_ts": 0.000000, - "samples_prompt_ns": [ 18723500, 18641500, 18476200, 21034200, 20287100 ], - "samples_prompt_ts": [ 27345.3, 27465.6, 27711.3, 24341.3, 25237.7 ] - "samples_gen_ns": [ 0, 100, 0, 200, 0 ], - "samples_gen_ts": [ 0 ] + "n_prompt": 0, + "n_gen": 32, + "test_time": "2025-04-18T11:21:45Z", + "avg_prompt_ns": 66, + "stddev_prompt_ns": 58, + "avg_prompt_ts": 0.000000, + "stddev_prompt_ts": 0.000000, + "avg_gen_ns": 67903233, + "stddev_gen_ns": 498856, + "avg_gen_ts": 471.275875, + "stddev_gen_ts": 3.475513, + "samples_prompt_ns": [ 100, 0, 100 ], + "samples_prompt_ts": [ 0 ], + "samples_gen_ns": [ 68251300, 68126600, 67331800 ], + "samples_gen_ts": [ 468.856, 469.714, 475.258 ]
}, { - "build_commit": "df46ea53", - "build_number": 5099, + "build_commit": "fa6cb8ae", + "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", @@ -263,21 +275,21 @@ $ ./llama-bench -o json "tensor_split": "0.00", "use_mmap": true, "embeddings": false, - "n_prompt": 0, - "n_gen": 128, - "test_time": "2025-04-18T07:41:24Z", - "avg_prompt_ns": 20, - "stddev_prompt_ns": 44, - "avg_prompt_ts": 0.000000, - "stddev_prompt_ts": 0.000000, - "avg_gen_ns": 279581280, - "stddev_gen_ns": 7013491, - "avg_gen_ts": 458.054981, - "stddev_gen_ts": 11.337387, - "samples_prompt_ns": [ 0, 0, 0, 0, 100 ], - "samples_prompt_ts": [ 0 ] - "samples_gen_ns": [ 290465300, 280112200, 280351000, 274751300, 272226600 ], - "samples_gen_ts": [ 440.672, 456.96, 456.571, 465.876, 470.197 ] + "n_prompt": 4096, + "n_gen": 32, + "test_time": "2025-04-18T11:21:46Z", + "avg_prompt_ns": 263273600, + "stddev_prompt_ns": 273278, + "avg_prompt_ts": 15557.970647, + "stddev_prompt_ts": 16.143068, + "avg_gen_ns": 85820333, + "stddev_gen_ns": 4372337, + "avg_gen_ts": 373.500825, + "stddev_gen_ts": 18.514532, + "samples_prompt_ns": [ 263043600, 263201500, 263575700 ], + "samples_prompt_ts": [ 15571.6, 15562.2, 15540.1 ], + "samples_gen_ns": [ 82844300, 83776400, 90840300 ], + "samples_gen_ts": [ 386.267, 381.969, 352.267 ] } ] ``` @@ -290,8 +302,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false,
"flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "test_time": "2025-04-18T07:42:16Z", "avg_prompt_ns": 18637700, "stddev_prompt_ns": 164536, "avg_prompt_ts": 27472.914745, "stddev_prompt_ts": 242.676976, "avg_gen_ns": 0, "stddev_gen_ns": 0, "avg_gen_ts": 0.000000, "stddev_gen_ts": 0.000000, "samples_prompt_ns": [ 18782700, 18654200, 18812300, 18466100, 18473200 ],"samples_prompt_ts": [ 27259.1, 27446.9, 27216.2, 27726.5, 27715.8 ]"samples_gen_ns": [ 0, 0, 0, 0, 0 ],"samples_gen_ts": [ 0 ]} -{"build_commit": "df46ea53", "build_number": 5099, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "test_time": "2025-04-18T07:42:17Z", "avg_prompt_ns": 100, "stddev_prompt_ns": 122, "avg_prompt_ts": 0.000000, "stddev_prompt_ts": 0.000000, "avg_gen_ns": 273159220, "stddev_gen_ns": 2025528, "avg_gen_ts": 468.611680, "stddev_gen_ts": 3.456568, "samples_prompt_ns": [ 300, 100, 100, 0, 0 ],"samples_prompt_ts": [ 0 ]"samples_gen_ns": [ 276312400, 272096600, 271459600, 274053300, 271874200 ],"samples_gen_ts": [ 463.244, 470.421, 471.525, 467.062, 470.806 ]} +{"build_commit": "fa6cb8ae", "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, 
"n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 32, "test_time": "2025-04-18T11:22:14Z", "avg_prompt_ns": 100, "stddev_prompt_ns": 0, "avg_prompt_ts": 0.000000, "stddev_prompt_ts": 0.000000, "avg_gen_ns": 71156300, "stddev_gen_ns": 912152, "avg_gen_ts": 449.763857, "stddev_gen_ts": 5.808090, "samples_prompt_ns": [ 100, 100, 100 ],"samples_prompt_ts": [ 0 ],"samples_gen_ns": [ 71725200, 71639500, 70104200 ],"samples_gen_ts": [ 446.147, 446.681, 456.463 ]} +{"build_commit": "fa6cb8ae", "build_number": 5100, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor ", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf", "model_type": "llama 1B Q4_K - Medium", "model_size": 799862912, "model_n_params": 1235814432, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 4096, "n_gen": 32, "test_time": "2025-04-18T11:22:14Z", "avg_prompt_ns": 267673800, "stddev_prompt_ns": 4917668, "avg_prompt_ts": 15305.627579, "stddev_prompt_ts": 279.255714, "avg_gen_ns": 83914500, "stddev_gen_ns": 1515058, "avg_gen_ts": 381.422650, "stddev_gen_ts": 6.822569, "samples_prompt_ns": [ 266315000, 273128000, 263578400 ],"samples_prompt_ts": [ 15380.3, 14996.6, 15540 ],"samples_gen_ns": [ 85644600, 83274100, 82824800 ],"samples_gen_ts": [ 373.637, 384.273, 386.358 ]} ``` @@ -343,6 +355,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_gen_ts REAL ); -INSERT INTO test (build_commit, build_number, cpu_info,
gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '2025-04-18T07:42:43Z', '18543960', '131206', '27611.175041', '195.547424', '60', '54', '0.000000', '0.000000'); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('df46ea53', '5099', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '2025-04-18T07:42:43Z', '20', '44', '0.000000', '0.000000', '274190080', '2765950', '466.867210', '4.680900'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, 
embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('fa6cb8ae', '5100', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '32', '2025-04-18T11:22:37Z', '66', '58', '0.000000', '0.000000', '70741266', '2050337', '452.606173', '13.122321'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, test_time, avg_prompt_ns, stddev_prompt_ns, avg_prompt_ts, stddev_prompt_ts, avg_gen_ns, stddev_gen_ns, avg_gen_ts, stddev_gen_ts) VALUES ('fa6cb8ae', '5100', 'AMD Ryzen 7 7800X3D 8-Core Processor ', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Llama-3.2-1B-Instruct-Q4_K_M.gguf', 'llama 1B Q4_K - Medium', '799862912', '1235814432', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '4096', '32', '2025-04-18T11:22:37Z', '270934866', '4466069', '15120.737903', '246.900896', '85258733', '2156168', '375.487736', '9.468350'); ```