
Commit 16d2467

DanBlanaru authored and kaiyux committed

Update TensorRT-LLM (#2755)

* Update TensorRT-LLM

---------

Co-authored-by: Denis Kayshev <[email protected]>
Co-authored-by: akhoroshev <[email protected]>
Co-authored-by: Patrick Reiter Horn <[email protected]>

Update
1 parent d93a2dd commit 16d2467

2,228 files changed, +20048610 -19118 lines changed


.github/workflows/module-owners.json

+1 -1

@@ -9,6 +9,6 @@
     "Performance": ["kaiyux", "jiahanc", "hypdeb"],
     "Lora/P-tuning":["byshiue", "Naveassaf"],
     "Disaggregated Serving":["Shixiaowei02", "joyang-nv", "chuangz0", "schetlur-nv"],
-    "Documentation":["nv-guomingz", "mikemckiernan"],
+    "Documentation":["nv-guomingz"],
     "Windows":["pamelap-nvidia"]
 }

.gitignore

+3

@@ -4,6 +4,7 @@ __pycache__/
 *.engine.config
 *.cache
 *.nsys-rep
+*.npy
 .VSCodeCounter
 cpp/build*
 build
@@ -15,6 +16,7 @@ build
 tmp/
 venv/
 .venv/
+.python-version
 .local/
 .hypothesis/
 .idea/
@@ -58,3 +60,4 @@ cpp/include/tensorrt_llm/executor/version.h
 CMakeUserPresets.json
 compile_commands.json
 *.bin
+.dir-locals.el

.pre-commit-config.yaml

+2 -2

@@ -7,7 +7,7 @@ repos:
     rev: v1.1.13
     hooks:
       - id: remove-crlf
-  - repo: https://github.com/google/yapf
+  - repo: https://github.com/google/yapf
     rev: v0.43.0
     hooks:
       - id: yapf
@@ -47,4 +47,4 @@ repos:
          - --skip=".git,3rdparty"
          - --exclude-file=examples/whisper/tokenizer.py
          - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid,improbe
-        exclude: 'tests/llm-test-defs/turtle/test_input_files'
+        exclude: 'tests/llm-test-defs/turtle/test_input_files|.*/test_star_attention_input.jsonl'

3rdparty/cutlass

Submodule cutlass updated 425 files

README.md

+2 -1

@@ -6,9 +6,10 @@ TensorRT-LLM

 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
 [![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
+[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.17.0.dev-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.18.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

benchmarks/cpp/README.md

+2

@@ -368,13 +368,15 @@ cd cpp/build
 `disaggServerBenchmark` only supports `decoder-only` models.
 Here is the basic usage:
 ```
+export TRTLLM_USE_MPI_KVCACHE=1
 mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \
 --generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path}
 ```
 This command will launch m context engines and n generation engines. You need to ensure `proc` is equal to the sum of the number of processes required for each engine plus 1. Since we use orchestrator mode for `disaggServerBenchmark` we need an additional process as the orchestrator. For example, if there are two context engines (one is TP2_PP1,another is TP1_PP1) and two generation engines(one is TP2_PP1,another is TP1_PP1), then the `proc` value should be set to 7.

 for example:
 ```
+export TRTLLM_USE_MPI_KVCACHE=1
 mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path}

 # need 6 gpus and 7 processes to launch the benchmark.
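For clarity, the `proc` arithmetic described in the README excerpt above can be sketched as follows. This is illustrative only; the engine list and variable names below are not part of the repository:

```cpp
// Illustrative only: compute the mpirun process count for disaggServerBenchmark
// as described above: one rank per GPU of every engine, plus one orchestrator.
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // Hypothetical engine list as {TP, PP} pairs: two context engines and two
    // generation engines (TP2_PP1, TP1_PP1, TP2_PP1, TP1_PP1).
    std::vector<std::pair<int, int>> engines = {{2, 1}, {1, 1}, {2, 1}, {1, 1}};

    int proc = 1; // one extra process for the orchestrator
    for (auto const& [tp, pp] : engines)
    {
        proc += tp * pp; // each engine needs TP * PP ranks
    }
    std::printf("proc = %d\n", proc); // prints 7 for the example above
    return 0;
}
```

Two TP2_PP1 engines and two TP1_PP1 engines give 2 + 1 + 2 + 1 = 6 ranks (and 6 GPUs), plus one orchestrator process, hence `mpirun -n 7`.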

benchmarks/cpp/disaggServerBenchmark.cpp

+15 -2

@@ -318,8 +318,15 @@ class Recorder

         if (!mGenT2TLatency.mDataTimes.empty())
         {
-
             mGenT2TLatency.calculate();
+            std::vector<float> userTokensPerSecond;
+            userTokensPerSecond.reserve(mGenT2TLatency.mDataTimes.size());
+            for (auto const& latency : mGenT2TLatency.mDataTimes)
+            {
+                userTokensPerSecond.push_back(1000.F / latency);
+            }
+            mAvgUserTokensPerSecond = std::accumulate(userTokensPerSecond.begin(), userTokensPerSecond.end(), 0.F)
+                / userTokensPerSecond.size();
         }
         if (!mGenExcludeFirstIterT2TLatency.mDataTimes.empty())
         {
@@ -348,6 +355,10 @@ class Recorder
         printf("[BENCHMARK] total_latency(ms) %.2f\n", mTotalLatency);
         printf("[BENCHMARK] seq_throughput(seq/sec) %.2f\n", mSeqThroughput);
         printf("[BENCHMARK] token_throughput(token/sec) %.2f\n", mTokenThroughput);
+        if (mStreaming)
+        {
+            printf("[BENCHMARK] user_tokens_per_second(tokens/sec/user) %.2f\n", mAvgUserTokensPerSecond);
+        }
         printf("[BENCHMARK] avg_acceptance_rate(tokens/decoding steps) %.2f\n\n", mAcceptanceRate);

         mSeqLatency.report();
@@ -396,6 +407,7 @@ class Recorder
             auto excludeFirstIterIngterHeader = mGenExcludeFirstIterT2TLatency.genHeaders();
             headers.insert(headers.end(), std::make_move_iterator(excludeFirstIterIngterHeader.begin()),
                 std::make_move_iterator(excludeFirstIterIngterHeader.end()));
+            headers.push_back("avg_user_tokens_per_second(tokens/sec/user)");
         }
         if (mCalculateKVCacheTransferTime)
         {
@@ -421,7 +433,7 @@
         {

             outputFile << "," << mGenFirstTokenLatency << "," << mGenT2TLatency << ","
-                       << mGenExcludeFirstIterT2TLatency;
+                       << mGenExcludeFirstIterT2TLatency << "," << mAvgUserTokensPerSecond;
         }
         if (mCalculateKVCacheTransferTime)
         {
@@ -499,6 +511,7 @@ class Recorder
     bool mOutputHasInput;
     bool mCalculateKVCacheTransferTime;
     bool mCalculateQueueTime;
+    float mAvgUserTokensPerSecond{};
 };

 texec::Request makeExecutorContextRequest(Sample const& sample, SizeType32 const& beamWidth,
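The `user_tokens_per_second` metric added above inverts each request's average inter-token latency (in milliseconds) into a per-user decode rate, then averages across requests. A minimal standalone sketch of that conversion is below; the latency values and variable names are made up for illustration and are not the benchmark's actual members:

```cpp
// Sketch of the tokens/sec/user conversion used by the new metric above.
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    // Hypothetical per-request average inter-token latencies in milliseconds.
    std::vector<float> latenciesMs = {12.5F, 10.0F, 20.0F};

    std::vector<float> userTokensPerSecond;
    userTokensPerSecond.reserve(latenciesMs.size());
    for (float latency : latenciesMs)
    {
        userTokensPerSecond.push_back(1000.F / latency); // ms per token -> tokens per second
    }
    float const avg = std::accumulate(userTokensPerSecond.begin(), userTokensPerSecond.end(), 0.F)
        / userTokensPerSecond.size();
    std::printf("avg_user_tokens_per_second %.2f\n", avg);
    return 0;
}
```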

benchmarks/cpp/gptManagerBenchmark.cpp

+74 -9

@@ -304,6 +304,7 @@ class Recorder
         std::vector<float> reqLatencies;
         std::vector<float> ftLatencies;
         std::vector<float> genT2TLatencies;
+        std::vector<float> userTokensPerSecond;

         int totalOutputTokens{0};
         int totalDecodingIter{0};
@@ -325,6 +326,10 @@ class Recorder
             {
                 genT2TLatencies.push_back(reqInfo.second.avgGenT2TLatency.value());
             }
+            if (reqInfo.second.avgGenT2TLatency.value() > 0)
+            {
+                userTokensPerSecond.push_back(1000.F / reqInfo.second.avgGenT2TLatency.value());
+            }
         }
         ++mNumSamples;
     }
@@ -377,6 +382,18 @@ class Recorder
             mMinGenT2TLatency = genT2TLatencies.front();
         }

+        if (!userTokensPerSecond.empty())
+        {
+            mAvgUserTokensPerSecond = std::accumulate(userTokensPerSecond.begin(), userTokensPerSecond.end(), 0.F)
+                / userTokensPerSecond.size();
+            std::sort(userTokensPerSecond.begin(), userTokensPerSecond.end());
+            mP99UserTokensPerSecond = calcPercentile(userTokensPerSecond, 99);
+            mP90UserTokensPerSecond = calcPercentile(userTokensPerSecond, 90);
+            mP50UserTokensPerSecond = calcPercentile(userTokensPerSecond, 50);
+            mMaxUserTokensPerSecond = userTokensPerSecond.back();
+            mMinUserTokensPerSecond = userTokensPerSecond.front();
+        }
+
         mAvgReqQueueingLatency
             = std::accumulate(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end(), 0.F)
             / mRequestsQueueingLatencies.size();
@@ -423,6 +440,13 @@ class Recorder
         printf("[BENCHMARK] p90_inter_token_latency(ms) %.2f\n", mP90GenT2TLatency);
         printf("[BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n", mP50GenT2TLatency);

+        printf("[BENCHMARK] avg_user_tokens_per_second(tokens/sec/user) %.2f\n", mAvgUserTokensPerSecond);
+        printf("[BENCHMARK] max_user_tokens_per_second(tokens/sec/user) %.2f\n", mMaxUserTokensPerSecond);
+        printf("[BENCHMARK] min_user_tokens_per_second(tokens/sec/user) %.2f\n", mMinUserTokensPerSecond);
+        printf("[BENCHMARK] p99_user_tokens_per_second(tokens/sec/user) %.2f\n", mP99UserTokensPerSecond);
+        printf("[BENCHMARK] p90_user_tokens_per_second(tokens/sec/user) %.2f\n", mP90UserTokensPerSecond);
+        printf("[BENCHMARK] p50_user_tokens_per_second(tokens/sec/user) %.2f\n\n", mP50UserTokensPerSecond);
+
         printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", mAvgReqQueueingLatency);
         printf("[BENCHMARK] max_request_queueing_latency(ms) %.2f\n", mMaxReqQueueingLatency);
         printf("[BENCHMARK] min_request_queueing_latency(ms) %.2f\n", mMinReqQueueingLatency);
@@ -443,11 +467,26 @@ class Recorder

         if (mStreaming)
         {
-            std::vector<std::string> streamingHeaders
-                = {"avg_time_to_first_token(ms)", "max_time_to_first_token(ms)", "min_time_to_first_token(ms)",
-                    "p99_time_to_first_token(ms)", "p90_time_to_first_token(ms)", "p50_time_to_first_token(ms)",
-                    "avg_inter_token_latency(ms)", "max_inter_token_latency(ms)", "min_inter_token_latency(ms)",
-                    "p99_inter_token_latency(ms)", "p90_inter_token_latency(ms)", "p50_inter_token_latency(ms)"};
+            std::vector<std::string> streamingHeaders = {
+                "avg_time_to_first_token(ms)",
+                "max_time_to_first_token(ms)",
+                "min_time_to_first_token(ms)",
+                "p99_time_to_first_token(ms)",
+                "p90_time_to_first_token(ms)",
+                "p50_time_to_first_token(ms)",
+                "avg_inter_token_latency(ms)",
+                "max_inter_token_latency(ms)",
+                "min_inter_token_latency(ms)",
+                "p99_inter_token_latency(ms)",
+                "p90_inter_token_latency(ms)",
+                "p50_inter_token_latency(ms)",
+                "avg_user_tokens_per_second(tokens/sec/user)",
+                "max_user_tokens_per_second(tokens/sec/user)",
+                "min_user_tokens_per_second(tokens/sec/user)",
+                "p99_user_tokens_per_second(tokens/sec/user)",
+                "p90_user_tokens_per_second(tokens/sec/user)",
+                "p50_user_tokens_per_second(tokens/sec/user)",
+            };

             headers.insert(headers.end(), streamingHeaders.begin(), streamingHeaders.end());
         }
@@ -470,7 +509,10 @@ class Recorder
             outputFile << "," << mAvgFtLatency << "," << mMaxFtLatency << "," << mMinFtLatency << ","
                        << mP99FtLatency << "," << mP90FtLatency << "," << mP50FtLatency << ","
                        << mAvgGenT2TLatency << "," << mMaxGenT2TLatency << "," << mMinGenT2TLatency << ","
-                       << mP99GenT2TLatency << "," << mP90GenT2TLatency << "," << mP50GenT2TLatency;
+                       << mP99GenT2TLatency << "," << mP90GenT2TLatency << "," << mP50GenT2TLatency << ","
+                       << mAvgUserTokensPerSecond << "," << mMaxUserTokensPerSecond << ","
+                       << mMinUserTokensPerSecond << "," << mP99UserTokensPerSecond << ","
+                       << mP90UserTokensPerSecond << "," << mP50UserTokensPerSecond << ",";
         }

         outputFile << "\n";
@@ -524,6 +566,7 @@ class Recorder
     float mSeqThroughput{};
     float mAvgSeqLatency{};
    float mAvgGenT2TLatency{};
+    float mAvgUserTokensPerSecond{};
     float mAvgFtLatency{};
     float mTokenThroughput{};
     float mAcceptanceRate{};
@@ -542,6 +585,11 @@ class Recorder
     float mP50GenT2TLatency{};
     float mMaxGenT2TLatency{};
     float mMinGenT2TLatency{};
+    float mP99UserTokensPerSecond{};
+    float mP90UserTokensPerSecond{};
+    float mP50UserTokensPerSecond{};
+    float mMaxUserTokensPerSecond{};
+    float mMinUserTokensPerSecond{};
     float mAvgReqQueueingLatency{};
     float mP99ReqQueueingLatency{};
     float mP90ReqQueueingLatency{};
@@ -1054,7 +1102,7 @@ int main(int argc, char* argv[])
         "Operate in streaming mode. Note: it reflects time-to-first-token and inter-token-latency",
         cxxopts::value<bool>()->default_value("false"));
     options.add_options()(
-        "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("false"));
+        "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "enable_chunked_context", "Whether to enable context chunking.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
@@ -1096,6 +1144,11 @@ int main(int argc, char* argv[])
         "Minimum token probability threshold for typical acceptance. Enables typical acceptance in Eagle",
         cxxopts::value<float>());
     options.add_options()("temperature", "Sampling temperature for each request", cxxopts::value<float>());
+    options.add_options()(
+        "eagle_use_dynamic_tree", "Whether to use Eagle-2", cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("eagle_dynamic_tree_max_top_k",
+        "The max topK for dynamic tree, also the number of draft tokens that will expand for each node",
+        cxxopts::value<SizeType32>());

     options.add_options()("multi_block_mode",
         "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel",
@@ -1305,7 +1358,8 @@ int main(int argc, char* argv[])
         benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
     }
     // Argument: Eagle choices for the Eagle speculative decoding.
-    if (result.count("eagle_choices") || result.count("eagle_posterior_threshold"))
+    if (result.count("eagle_choices") || result.count("eagle_posterior_threshold")
+        || result.count("eagle_use_dynamic_tree") || result.count("eagle_dynamic_tree_max_top_k"))
     {
         std::optional<float> posteriorThreshold;
         if (result.count("eagle_posterior_threshold"))
@@ -1317,7 +1371,18 @@ int main(int argc, char* argv[])
         {
             choices = parseVectorOfVectors(result["eagle_choices"].as<std::string>());
         }
-        benchmarkParams.eagleConfig = texec::EagleConfig(choices, !posteriorThreshold.has_value(), posteriorThreshold);
+        bool eagleUseDynamicTree = false;
+        if (result.count("eagle_use_dynamic_tree"))
+        {
+            eagleUseDynamicTree = result["eagle_use_dynamic_tree"].as<bool>();
+        }
+        std::optional<SizeType32> eagleDynamicTreeMaxTopK;
+        if (result.count("eagle_dynamic_tree_max_top_k"))
+        {
+            eagleDynamicTreeMaxTopK = result["eagle_dynamic_tree_max_top_k"].as<SizeType32>();
+        }
+        benchmarkParams.eagleConfig = texec::EagleConfig(
+            choices, !posteriorThreshold.has_value(), posteriorThreshold, eagleUseDynamicTree, eagleDynamicTreeMaxTopK);
     }
     if (result.count("temperature"))
     {
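The per-user throughput percentiles added above follow the same pattern as the existing latency statistics: collect one tokens/sec/user value per request, sort the vector, and read off p50/p90/p99 via `calcPercentile`. A self-contained sketch of that flow is below; the nearest-rank formula in this local `calcPercentile` stand-in is an assumption for illustration, not the repository's implementation:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for the benchmark's calcPercentile helper; nearest-rank on a
// sorted vector is an assumption, not the repository's implementation.
float calcPercentile(std::vector<float> const& sortedValues, int percentile)
{
    std::size_t const index = static_cast<std::size_t>(
        std::min<double>(sortedValues.size() - 1, sortedValues.size() * percentile / 100.0));
    return sortedValues[index];
}

int main()
{
    // Hypothetical per-request throughput samples (tokens/sec/user).
    std::vector<float> userTokensPerSecond = {55.F, 80.F, 95.F, 100.F, 120.F};
    std::sort(userTokensPerSecond.begin(), userTokensPerSecond.end());

    std::printf("p50 %.2f p90 %.2f p99 %.2f\n", calcPercentile(userTokensPerSecond, 50),
        calcPercentile(userTokensPerSecond, 90), calcPercentile(userTokensPerSecond, 99));
    return 0;
}
```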

benchmarks/cpp/gptSessionBenchmark.cpp

+2 -2

@@ -187,7 +187,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
             generationOutput.contextLogits
                 = bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
         }
-        if (session.getModelConfig().computeGenerationLogits())
+        if (session.getGatherGenerationLogits())
         {
             generationOutput.generationLogits
                 = bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
@@ -306,7 +306,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
             std::cout << "generationOutput.contextLogits: " << *generationOutput.contextLogits << std::endl;
         }

-        if (session.getModelConfig().computeGenerationLogits() && printAllLogits)
+        if (session.getGatherGenerationLogits() && printAllLogits)
         {
             std::cout << "generationOutput.generationLogits.shape: "
                       << generationOutput.generationLogits->getShape()

benchmarks/python/enc_dec_benchmark.py

+2

@@ -69,6 +69,7 @@ def read_config(component):
     lora_config = builder_config['lora_config']
     auto_parallel_config = builder_config['auto_parallel_config']
     use_gpt_attention_plugin = plugin_config["gpt_attention_plugin"]
+    gemm_allreduce_plugin = plugin_config["gemm_allreduce_plugin"]
     remove_input_padding = plugin_config["remove_input_padding"]
     use_lora_plugin = plugin_config["lora_plugin"]
     tp_size = pretrained_config['mapping']['tp_size']
@@ -123,6 +124,7 @@ def read_config(component):
         vocab_size=vocab_size,
         num_layers=num_layers,
         gpt_attention_plugin=use_gpt_attention_plugin,
+        gemm_allreduce_plugin=gemm_allreduce_plugin,
         remove_input_padding=remove_input_padding,
         kv_cache_type=kv_cache_type,
         tokens_per_block=tokens_per_block,

benchmarks/python/gpt_benchmark.py

+1 -1

@@ -101,7 +101,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
             remove_input_padding=self.remove_input_padding,
             quant_mode=self.quant_mode,
             tokens_per_block=self.tokens_per_block if hasattr(
-                self, 'tokens_per_block') else 64,
+                self, 'tokens_per_block') else 32,
             mamba_conv1d_plugin=self.use_mamba_conv1d_plugin,
             gpu_weights_percent=list(sorted(gpu_weights_percents))[0],
             **rnn_configs_kwargs,
