
Commit 16d2467

DanBlanaru authored and kaiyux committed

Update TensorRT-LLM (#2755)

* Update TensorRT-LLM

---------

Co-authored-by: Denis Kayshev <[email protected]>
Co-authored-by: akhoroshev <[email protected]>
Co-authored-by: Patrick Reiter Horn <[email protected]>

Update
1 parent d93a2dd commit 16d2467

2,228 files changed, +20048610 -19118 lines changed


.github/workflows/module-owners.json

+1 -1

@@ -9,6 +9,6 @@
     "Performance": ["kaiyux", "jiahanc", "hypdeb"],
     "Lora/P-tuning":["byshiue", "Naveassaf"],
     "Disaggregated Serving":["Shixiaowei02", "joyang-nv", "chuangz0", "schetlur-nv"],
-    "Documentation":["nv-guomingz", "mikemckiernan"],
+    "Documentation":["nv-guomingz"],
     "Windows":["pamelap-nvidia"]
 }

.gitignore

+3

@@ -4,6 +4,7 @@ __pycache__/
 *.engine.config
 *.cache
 *.nsys-rep
+*.npy
 .VSCodeCounter
 cpp/build*
 build
@@ -15,6 +16,7 @@ build
 tmp/
 venv/
 .venv/
+.python-version
 .local/
 .hypothesis/
 .idea/
@@ -58,3 +60,4 @@ cpp/include/tensorrt_llm/executor/version.h
 CMakeUserPresets.json
 compile_commands.json
 *.bin
+.dir-locals.el

.pre-commit-config.yaml

+2 -2

@@ -7,7 +7,7 @@ repos:
     rev: v1.1.13
     hooks:
       - id: remove-crlf
-  - repo: https://github.com/google/yapf
+  - repo: https://github.com/google/yapf
     rev: v0.43.0
     hooks:
       - id: yapf
@@ -47,4 +47,4 @@ repos:
          - --skip=".git,3rdparty"
          - --exclude-file=examples/whisper/tokenizer.py
          - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid,improbe
-        exclude: 'tests/llm-test-defs/turtle/test_input_files'
+        exclude: 'tests/llm-test-defs/turtle/test_input_files|.*/test_star_attention_input.jsonl'

3rdparty/cutlass

Submodule cutlass updated 425 files

README.md

+2 -1

@@ -6,9 +6,10 @@ TensorRT-LLM

 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
 [![python](https://img.shields.io/badge/python-3.12.3-green)](https://www.python.org/downloads/release/python-3123/)
+[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.6.3-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.7.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.17.0.dev-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.18.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/architecture/overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](./examples/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

benchmarks/cpp/README.md

+2

@@ -368,13 +368,15 @@ cd cpp/build
 `disaggServerBenchmark` only supports `decoder-only` models.
 Here is the basic usage:
 ```
+export TRTLLM_USE_MPI_KVCACHE=1
 mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \
 --generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path}
 ```
 This command will launch m context engines and n generation engines. You need to ensure `proc` is equal to the sum of the number of processes required for each engine plus 1. Since we use orchestrator mode for `disaggServerBenchmark` we need an additional process as the orchestrator. For example, if there are two context engines (one is TP2_PP1,another is TP1_PP1) and two generation engines(one is TP2_PP1,another is TP1_PP1), then the `proc` value should be set to 7.

 for example:
 ```
+export TRTLLM_USE_MPI_KVCACHE=1
 mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path}

 # need 6 gpus and 7 processes to launch the benchmark.
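For clarity, the `proc` arithmetic described in the README excerpt above can be sketched as follows. This is illustrative only; the engine list and variable names below are not part of the repository:

```cpp
// Illustrative only: compute the mpirun process count for disaggServerBenchmark
// as described above: one rank per GPU of every engine, plus one orchestrator.
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // Hypothetical engine list as {TP, PP} pairs: two context engines and two
    // generation engines (TP2_PP1, TP1_PP1, TP2_PP1, TP1_PP1).
    std::vector<std::pair<int, int>> engines = {{2, 1}, {1, 1}, {2, 1}, {1, 1}};

    int proc = 1; // one extra process for the orchestrator
    for (auto const& [tp, pp] : engines)
    {
        proc += tp * pp; // each engine needs TP * PP ranks
    }
    std::printf("proc = %d\n", proc); // prints 7 for the example above
    return 0;
}
```

Two TP2_PP1 engines and two TP1_PP1 engines give 2 + 1 + 2 + 1 = 6 ranks (and 6 GPUs), plus one orchestrator process, hence `mpirun -n 7`.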

benchmarks/cpp/disaggServerBenchmark.cpp

+15 -2

@@ -318,8 +318,15 @@ class Recorder

         if (!mGenT2TLatency.mDataTimes.empty())
         {
-
             mGenT2TLatency.calculate();
+            std::vector<float> userTokensPerSecond;
+            userTokensPerSecond.reserve(mGenT2TLatency.mDataTimes.size());
+            for (auto const& latency : mGenT2TLatency.mDataTimes)
+            {
+                userTokensPerSecond.push_back(1000.F / latency);
+            }
+            mAvgUserTokensPerSecond = std::accumulate(userTokensPerSecond.begin(), userTokensPerSecond.end(), 0.F)
+                / userTokensPerSecond.size();
         }
         if (!mGenExcludeFirstIterT2TLatency.mDataTimes.empty())
         {
@@ -348,6 +355,10 @@ class Recorder
         printf("[BENCHMARK] total_latency(ms) %.2f\n", mTotalLatency);
         printf("[BENCHMARK] seq_throughput(seq/sec) %.2f\n", mSeqThroughput);
         printf("[BENCHMARK] token_throughput(token/sec) %.2f\n", mTokenThroughput);
+        if (mStreaming)
+        {
+            printf("[BENCHMARK] user_tokens_per_second(tokens/sec/user) %.2f\n", mAvgUserTokensPerSecond);
+        }
         printf("[BENCHMARK] avg_acceptance_rate(tokens/decoding steps) %.2f\n\n", mAcceptanceRate);

         mSeqLatency.report();
@@ -396,6 +407,7 @@ class Recorder
             auto excludeFirstIterIngterHeader = mGenExcludeFirstIterT2TLatency.genHeaders();
             headers.insert(headers.end(), std::make_move_iterator(excludeFirstIterIngterHeader.begin()),
                 std::make_move_iterator(excludeFirstIterIngterHeader.end()));
+            headers.push_back("avg_user_tokens_per_second(tokens/sec/user)");
         }
         if (mCalculateKVCacheTransferTime)
         {
@@ -421,7 +433,7 @@
         {

             outputFile << "," << mGenFirstTokenLatency << "," << mGenT2TLatency << ","
-                       << mGenExcludeFirstIterT2TLatency;
+                       << mGenExcludeFirstIterT2TLatency << "," << mAvgUserTokensPerSecond;
         }
         if (mCalculateKVCacheTransferTime)
         {
@@ -499,6 +511,7 @@ class Recorder
     bool mOutputHasInput;
     bool mCalculateKVCacheTransferTime;
     bool mCalculateQueueTime;
+    float mAvgUserTokensPerSecond{};
 };

 texec::Request makeExecutorContextRequest(Sample const& sample, SizeType32 const& beamWidth,
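The `user_tokens_per_second` metric added above inverts each request's average inter-token latency (in milliseconds) into a per-user decode rate, then averages across requests. A minimal standalone sketch of that conversion is below; the latency values and variable names are made up for illustration and are not the benchmark's actual members:

```cpp
// Sketch of the tokens/sec/user conversion used by the new metric above.
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    // Hypothetical per-request average inter-token latencies in milliseconds.
    std::vector<float> latenciesMs = {12.5F, 10.0F, 20.0F};

    std::vector<float> userTokensPerSecond;
    userTokensPerSecond.reserve(latenciesMs.size());
    for (float latency : latenciesMs)
    {
        userTokensPerSecond.push_back(1000.F / latency); // ms per token -> tokens per second
    }
    float const avg = std::accumulate(userTokensPerSecond.begin(), userTokensPerSecond.end(), 0.F)
        / userTokensPerSecond.size();
    std::printf("avg_user_tokens_per_second %.2f\n", avg);
    return 0;
}
```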

benchmarks/cpp/gptManagerBenchmark.cpp

+74 -9

@@ -304,6 +304,7 @@ class Recorder
         std::vector<float> reqLatencies;
         std::vector<float> ftLatencies;
         std::vector<float> genT2TLatencies;
+        std::vector<float> userTokensPerSecond;

         int totalOutputTokens{0};
         int totalDecodingIter{0};
@@ -325,6 +326,10 @@ class Recorder
             {
                 genT2TLatencies.push_back(reqInfo.second.avgGenT2TLatency.value());
             }
+            if (reqInfo.second.avgGenT2TLatency.value() > 0)
+            {
+                userTokensPerSecond.push_back(1000.F / reqInfo.second.avgGenT2TLatency.value());
+            }
         }
         ++mNumSamples;
     }
@@ -377,6 +382,18 @@ class Recorder
             mMinGenT2TLatency = genT2TLatencies.front();
         }

+        if (!userTokensPerSecond.empty())
+        {
+            mAvgUserTokensPerSecond = std::accumulate(userTokensPerSecond.begin(), userTokensPerSecond.end(), 0.F)
+                / userTokensPerSecond.size();
+            std::sort(userTokensPerSecond.begin(), userTokensPerSecond.end());
+            mP99UserTokensPerSecond = calcPercentile(userTokensPerSecond, 99);
+            mP90UserTokensPerSecond = calcPercentile(userTokensPerSecond, 90);
+            mP50UserTokensPerSecond = calcPercentile(userTokensPerSecond, 50);
+            mMaxUserTokensPerSecond = userTokensPerSecond.back();
+            mMinUserTokensPerSecond = userTokensPerSecond.front();
+        }
+
         mAvgReqQueueingLatency
             = std::accumulate(mRequestsQueueingLatencies.begin(), mRequestsQueueingLatencies.end(), 0.F)
             / mRequestsQueueingLatencies.size();
@@ -423,6 +440,13 @@ class Recorder
         printf("[BENCHMARK] p90_inter_token_latency(ms) %.2f\n", mP90GenT2TLatency);
         printf("[BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n", mP50GenT2TLatency);

+        printf("[BENCHMARK] avg_user_tokens_per_second(tokens/sec/user) %.2f\n", mAvgUserTokensPerSecond);
+        printf("[BENCHMARK] max_user_tokens_per_second(tokens/sec/user) %.2f\n", mMaxUserTokensPerSecond);
+        printf("[BENCHMARK] min_user_tokens_per_second(tokens/sec/user) %.2f\n", mMinUserTokensPerSecond);
+        printf("[BENCHMARK] p99_user_tokens_per_second(tokens/sec/user) %.2f\n", mP99UserTokensPerSecond);
+        printf("[BENCHMARK] p90_user_tokens_per_second(tokens/sec/user) %.2f\n", mP90UserTokensPerSecond);
+        printf("[BENCHMARK] p50_user_tokens_per_second(tokens/sec/user) %.2f\n\n", mP50UserTokensPerSecond);
+
         printf("[BENCHMARK] avg_request_queueing_latency(ms) %.2f\n", mAvgReqQueueingLatency);
         printf("[BENCHMARK] max_request_queueing_latency(ms) %.2f\n", mMaxReqQueueingLatency);
         printf("[BENCHMARK] min_request_queueing_latency(ms) %.2f\n", mMinReqQueueingLatency);
@@ -443,11 +467,26 @@ class Recorder

         if (mStreaming)
         {
-            std::vector<std::string> streamingHeaders
-                = {"avg_time_to_first_token(ms)", "max_time_to_first_token(ms)", "min_time_to_first_token(ms)",
-                    "p99_time_to_first_token(ms)", "p90_time_to_first_token(ms)", "p50_time_to_first_token(ms)",
-                    "avg_inter_token_latency(ms)", "max_inter_token_latency(ms)", "min_inter_token_latency(ms)",
-                    "p99_inter_token_latency(ms)", "p90_inter_token_latency(ms)", "p50_inter_token_latency(ms)"};
+            std::vector<std::string> streamingHeaders = {
+                "avg_time_to_first_token(ms)",
+                "max_time_to_first_token(ms)",
+                "min_time_to_first_token(ms)",
+                "p99_time_to_first_token(ms)",
+                "p90_time_to_first_token(ms)",
+                "p50_time_to_first_token(ms)",
+                "avg_inter_token_latency(ms)",
+                "max_inter_token_latency(ms)",
+                "min_inter_token_latency(ms)",
+                "p99_inter_token_latency(ms)",
+                "p90_inter_token_latency(ms)",
+                "p50_inter_token_latency(ms)",
+                "avg_user_tokens_per_second(tokens/sec/user)",
+                "max_user_tokens_per_second(tokens/sec/user)",
+                "min_user_tokens_per_second(tokens/sec/user)",
+                "p99_user_tokens_per_second(tokens/sec/user)",
+                "p90_user_tokens_per_second(tokens/sec/user)",
+                "p50_user_tokens_per_second(tokens/sec/user)",
+            };

             headers.insert(headers.end(), streamingHeaders.begin(), streamingHeaders.end());
         }
@@ -470,7 +509,10 @@ class Recorder
             outputFile << "," << mAvgFtLatency << "," << mMaxFtLatency << "," << mMinFtLatency << ","
                        << mP99FtLatency << "," << mP90FtLatency << "," << mP50FtLatency << ","
                        << mAvgGenT2TLatency << "," << mMaxGenT2TLatency << "," << mMinGenT2TLatency << ","
-                       << mP99GenT2TLatency << "," << mP90GenT2TLatency << "," << mP50GenT2TLatency;
+                       << mP99GenT2TLatency << "," << mP90GenT2TLatency << "," << mP50GenT2TLatency << ","
+                       << mAvgUserTokensPerSecond << "," << mMaxUserTokensPerSecond << ","
+                       << mMinUserTokensPerSecond << "," << mP99UserTokensPerSecond << ","
+                       << mP90UserTokensPerSecond << "," << mP50UserTokensPerSecond << ",";
         }

         outputFile << "\n";
@@ -524,6 +566,7 @@ class Recorder
     float mSeqThroughput{};
     float mAvgSeqLatency{};
    float mAvgGenT2TLatency{};
+    float mAvgUserTokensPerSecond{};
     float mAvgFtLatency{};
     float mTokenThroughput{};
     float mAcceptanceRate{};
@@ -542,6 +585,11 @@ class Recorder
     float mP50GenT2TLatency{};
     float mMaxGenT2TLatency{};
     float mMinGenT2TLatency{};
+    float mP99UserTokensPerSecond{};
+    float mP90UserTokensPerSecond{};
+    float mP50UserTokensPerSecond{};
+    float mMaxUserTokensPerSecond{};
+    float mMinUserTokensPerSecond{};
     float mAvgReqQueueingLatency{};
     float mP99ReqQueueingLatency{};
     float mP90ReqQueueingLatency{};
@@ -1054,7 +1102,7 @@ int main(int argc, char* argv[])
         "Operate in streaming mode. Note: it reflects time-to-first-token and inter-token-latency",
         cxxopts::value<bool>()->default_value("false"));
     options.add_options()(
-        "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("false"));
+        "enable_kv_cache_reuse", "Enables the KV cache reuse.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "enable_chunked_context", "Whether to enable context chunking.", cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
@@ -1096,6 +1144,11 @@ int main(int argc, char* argv[])
         "Minimum token probability threshold for typical acceptance. Enables typical acceptance in Eagle",
         cxxopts::value<float>());
     options.add_options()("temperature", "Sampling temperature for each request", cxxopts::value<float>());
+    options.add_options()(
+        "eagle_use_dynamic_tree", "Whether to use Eagle-2", cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("eagle_dynamic_tree_max_top_k",
+        "The max topK for dynamic tree, also the number of draft tokens that will expand for each node",
+        cxxopts::value<SizeType32>());

     options.add_options()("multi_block_mode",
         "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel",
@@ -1305,7 +1358,8 @@ int main(int argc, char* argv[])
         benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
     }
     // Argument: Eagle choices for the Eagle speculative decoding.
-    if (result.count("eagle_choices") || result.count("eagle_posterior_threshold"))
+    if (result.count("eagle_choices") || result.count("eagle_posterior_threshold")
+        || result.count("eagle_use_dynamic_tree") || result.count("eagle_dynamic_tree_max_top_k"))
     {
         std::optional<float> posteriorThreshold;
         if (result.count("eagle_posterior_threshold"))
@@ -1317,7 +1371,18 @@ int main(int argc, char* argv[])
         {
             choices = parseVectorOfVectors(result["eagle_choices"].as<std::string>());
         }
-        benchmarkParams.eagleConfig = texec::EagleConfig(choices, !posteriorThreshold.has_value(), posteriorThreshold);
+        bool eagleUseDynamicTree = false;
+        if (result.count("eagle_use_dynamic_tree"))
+        {
+            eagleUseDynamicTree = result["eagle_use_dynamic_tree"].as<bool>();
+        }
+        std::optional<SizeType32> eagleDynamicTreeMaxTopK;
+        if (result.count("eagle_dynamic_tree_max_top_k"))
+        {
+            eagleDynamicTreeMaxTopK = result["eagle_dynamic_tree_max_top_k"].as<SizeType32>();
+        }
+        benchmarkParams.eagleConfig = texec::EagleConfig(
+            choices, !posteriorThreshold.has_value(), posteriorThreshold, eagleUseDynamicTree, eagleDynamicTreeMaxTopK);
     }
     if (result.count("temperature"))
     {
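The per-user throughput percentiles added above follow the same pattern as the existing latency statistics: collect one tokens/sec/user value per request, sort the vector, and read off p50/p90/p99 via `calcPercentile`. A self-contained sketch of that flow is below; the nearest-rank formula in this local `calcPercentile` stand-in is an assumption for illustration, not the repository's implementation:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for the benchmark's calcPercentile helper; nearest-rank on a
// sorted vector is an assumption, not the repository's implementation.
float calcPercentile(std::vector<float> const& sortedValues, int percentile)
{
    std::size_t const index = static_cast<std::size_t>(
        std::min<double>(sortedValues.size() - 1, sortedValues.size() * percentile / 100.0));
    return sortedValues[index];
}

int main()
{
    // Hypothetical per-request throughput samples (tokens/sec/user).
    std::vector<float> userTokensPerSecond = {55.F, 80.F, 95.F, 100.F, 120.F};
    std::sort(userTokensPerSecond.begin(), userTokensPerSecond.end());

    std::printf("p50 %.2f p90 %.2f p99 %.2f\n", calcPercentile(userTokensPerSecond, 50),
        calcPercentile(userTokensPerSecond, 90), calcPercentile(userTokensPerSecond, 99));
    return 0;
}
```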

benchmarks/cpp/gptSessionBenchmark.cpp

+2 -2

@@ -187,7 +187,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
             generationOutput.contextLogits
                 = bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
         }
-        if (session.getModelConfig().computeGenerationLogits())
+        if (session.getGatherGenerationLogits())
         {
             generationOutput.generationLogits
                 = bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
@@ -306,7 +306,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
             std::cout << "generationOutput.contextLogits: " << *generationOutput.contextLogits << std::endl;
         }

-        if (session.getModelConfig().computeGenerationLogits() && printAllLogits)
+        if (session.getGatherGenerationLogits() && printAllLogits)
         {
             std::cout << "generationOutput.generationLogits.shape: "
                       << generationOutput.generationLogits->getShape()

benchmarks/python/enc_dec_benchmark.py

+2

@@ -69,6 +69,7 @@ def read_config(component):
     lora_config = builder_config['lora_config']
     auto_parallel_config = builder_config['auto_parallel_config']
     use_gpt_attention_plugin = plugin_config["gpt_attention_plugin"]
+    gemm_allreduce_plugin = plugin_config["gemm_allreduce_plugin"]
     remove_input_padding = plugin_config["remove_input_padding"]
     use_lora_plugin = plugin_config["lora_plugin"]
     tp_size = pretrained_config['mapping']['tp_size']
@@ -123,6 +124,7 @@ def read_config(component):
         vocab_size=vocab_size,
         num_layers=num_layers,
         gpt_attention_plugin=use_gpt_attention_plugin,
+        gemm_allreduce_plugin=gemm_allreduce_plugin,
         remove_input_padding=remove_input_padding,
         kv_cache_type=kv_cache_type,
         tokens_per_block=tokens_per_block,

benchmarks/python/gpt_benchmark.py

+1 -1

@@ -101,7 +101,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
             remove_input_padding=self.remove_input_padding,
             quant_mode=self.quant_mode,
             tokens_per_block=self.tokens_per_block if hasattr(
-                self, 'tokens_per_block') else 64,
+                self, 'tokens_per_block') else 32,
             mamba_conv1d_plugin=self.use_mamba_conv1d_plugin,
             gpu_weights_percent=list(sorted(gpu_weights_percents))[0],
             **rnn_configs_kwargs,
