
Commit 535c9cc

Update TensorRT-LLM (#2460)
1 parent c629546

File tree: 244 files changed (+6961, −3528 lines). This is a large commit, so only a subset of the changed files is reproduced below.


README.md

Lines changed: 10 additions & 4 deletions
@@ -8,7 +8,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.6.2-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.6.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.16.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
@@ -18,12 +18,18 @@ TensorRT-LLM
 
 ## Latest News
 
+* [2024/11/09] 🚀🚀🚀 3x Faster AllReduce with NVSwitch and TensorRT-LLM MultiShot
+[➡️ link](https://developer.nvidia.com/blog/3x-faster-allreduce-with-nvswitch-and-tensorrt-llm-multishot/)
+<div align="center">
+<img src="https://developer-blogs.nvidia.com/wp-content/uploads/2024/08/HGX-H200-tech-blog-1920x1080-1.jpg" width="50%">
+<div align="left">
+
+* [2024/11/09] ✨ NVIDIA advances the AI ecosystem with the AI model of LG AI Research 🙌
+[➡️ link](https://blogs.nvidia.co.kr/blog/nvidia-lg-ai-research/)
+
 * [2024/11/02] 🌟🌟🌟 NVIDIA and LlamaIndex Developer Contest
 🙌 Enter for a chance to win prizes including an NVIDIA® GeForce RTX™ 4080 SUPER GPU, DLI credits, and more🙌
 [➡️ link](https://developer.nvidia.com/llamaindex-developer-contest)
-<div align="center">
-<img src="docs/source/media/image-11-02-2024.png" width="50%">
-<div align="left">
 
 * [2024/10/28] 🏎️🏎️🏎️ NVIDIA GH200 Superchip Accelerates Inference by 2x in Multiturn Interactions with Llama Models
 [➡️ link](https://developer.nvidia.com/blog/nvidia-gh200-superchip-accelerates-inference-by-2x-in-multiturn-interactions-with-llama-models/)

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 31 additions & 24 deletions
@@ -664,7 +664,7 @@ class ExecutorServer
 {
 public:
     ExecutorServer(std::optional<std::filesystem::path> const& decoderTrtEnginePath,
-        std::optional<std::filesystem::path> const& encoderTrtEnginePath, TrtGptModelType modelType,
+        std::optional<std::filesystem::path> const& encoderTrtEnginePath, texec::BatchingType batchingType,
         int32_t maxBeamWidth, texec::CapacitySchedulerPolicy capacitySchedulerPolicy,
         BenchmarkParams const& benchmarkParams, std::shared_ptr<Recorder> recorder, std::chrono::milliseconds waitSleep,
         bool logIterationData, texec::ModelType executorModelType)
@@ -692,8 +692,7 @@ class ExecutorServer
             maxBeamWidth, schedulerConfig, kvCacheConfig, benchmarkParams.enableChunkedContext, true);
         executorConfig.setGpuWeightsPercent(benchmarkParams.gpuWeightsPercent);
         executorConfig.setPeftCacheConfig(peftCacheConfig);
-        executorConfig.setBatchingType(
-            modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT);
+        executorConfig.setBatchingType(batchingType);
         if (benchmarkParams.maxBatchSize)
         {
             executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
@@ -947,6 +946,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
         std::nullopt, // embeddingBias
         std::nullopt, // speculativeDecoding
         std::nullopt, // pTuning
+        std::nullopt, // mRopeConfig
         loraConfig, // loraConfig
         lookaheadConfig, // lookaheadConfig
         std::nullopt, // kvCacheRetentionConfig
@@ -955,7 +955,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
 }
 
 void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngineDir,
-    std::optional<std::filesystem::path> const& encoderEngineDir, TrtGptModelType modelType,
+    std::optional<std::filesystem::path> const& encoderEngineDir, texec::BatchingType batchingType,
     std::string const& datasetPath, std::string const& opCsvFile, int maxNumSamples, int beamWidth, int warmUp,
     std::optional<int32_t> const& eosId, std::optional<int32_t> const& padId, BenchmarkParams const& benchmarkParams,
     texec::CapacitySchedulerPolicy capacitySchedulerPolicy, std::chrono::milliseconds waitSleep,
@@ -977,16 +977,17 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     {
         TLLM_CHECK_WITH_INFO(
             decoderEngineDir.has_value(), "decoder models require a path to decoder engine in executor benchmark.");
-        executorServer = std::make_shared<ExecutorServer>(decoderEngineDir.value(), std::nullopt, modelType, beamWidth,
-            capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
+        executorServer
+            = std::make_shared<ExecutorServer>(decoderEngineDir.value(), std::nullopt, batchingType, beamWidth,
+                capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
     }
     else if (executorModelType == texec::ModelType::kENCODER_DECODER)
     {
         TLLM_CHECK_WITH_INFO(encoderEngineDir.has_value(),
             "encoder-decoder models require a path to encoder engine in executor benchmark.");
-        executorServer
-            = std::make_shared<ExecutorServer>(decoderEngineDir.value(), encoderEngineDir.value(), modelType, beamWidth,
-                capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
+        executorServer = std::make_shared<ExecutorServer>(decoderEngineDir.value(), encoderEngineDir.value(),
+            batchingType, beamWidth, capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData,
+            executorModelType);
         try
         {
             std::ifstream decoderJsonConfigPath(decoderEngineDir.value() / "config.json");
@@ -1011,8 +1012,9 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     {
         TLLM_CHECK_WITH_INFO(
             encoderEngineDir.has_value(), "encoder models require a path to encoder engine in executor benchmark.");
-        executorServer = std::make_shared<ExecutorServer>(std::nullopt, encoderEngineDir.value(), modelType, beamWidth,
-            capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
+        executorServer
+            = std::make_shared<ExecutorServer>(std::nullopt, encoderEngineDir.value(), batchingType, beamWidth,
+                capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
     }
     else
     {
@@ -1219,8 +1221,9 @@ int main(int argc, char* argv[])
         "encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value<std::string>());
     options.add_options()(
         "api", "API type: gptManager or executor.", cxxopts::value<std::string>()->default_value("executor"));
-    options.add_options()("type", "Batching type: IFB, UIFB (unfused IFB) or V1 (non-IFB) batching.",
-        cxxopts::value<std::string>()->default_value("IFB"));
+    options.add_options()("type",
+        "Batching type: choose between inflight/static. (IFB/V1 options are going to be deprecated)",
+        cxxopts::value<std::string>()->default_value("inflight"));
     options.add_options()("dataset", "Dataset that is used for benchmarking BatchManager.",
         cxxopts::value<std::string>()->default_value(""));
     options.add_options()(
@@ -1332,18 +1335,22 @@ int main(int argc, char* argv[])
 
     // Argument: Batching Type
     auto const type = result["type"].as<std::string>();
-    TrtGptModelType modelType{TrtGptModelType::V1};
-    if (type == "V1")
+    texec::BatchingType batchingType{texec::BatchingType::kINFLIGHT};
+    if (type == "V1" || type == "static")
     {
-        modelType = TrtGptModelType::V1;
-    }
-    else if (type == "UIFB")
-    {
-        modelType = TrtGptModelType::InflightBatching;
+        if (type == "V1")
+        {
+            TLLM_LOG_WARNING("type option \"V1\" is going to be renamed to \"static\".");
+        }
+        batchingType = texec::BatchingType::kSTATIC;
     }
-    else if (type == "IFB")
+    else if (type == "IFB" || type == "inflight")
     {
-        modelType = TrtGptModelType::InflightFusedBatching;
+        if (type == "IFB")
+        {
+            TLLM_LOG_WARNING("type option \"IFB\" is going to be renamed to \"inflight\".");
+        }
+        batchingType = texec::BatchingType::kINFLIGHT;
     }
     else
     {
@@ -1604,7 +1611,7 @@ int main(int argc, char* argv[])
     {
         TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api.");
         TLLM_CHECK_WITH_INFO(
-            modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching.");
+            batchingType == texec::BatchingType::kINFLIGHT, "encoder-decoder only support inflight batching.");
         executorModelType = texec::ModelType::kENCODER_DECODER;
         encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
         decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
@@ -1621,7 +1628,7 @@ int main(int argc, char* argv[])
     }
     try
     {
-        benchmarkExecutor(decoderEngineDir, encoderEngineDir, modelType, datasetPath, opCsvFile, maxNumSamples,
+        benchmarkExecutor(decoderEngineDir, encoderEngineDir, batchingType, datasetPath, opCsvFile, maxNumSamples,
             beamWidth, result["warm_up"].as<int>(), eosId, padId, benchmarkParams, capacitySchedulerPolicy,
             waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, logIterationData,
             maxPromptLen, executorModelType);
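
With this change the benchmark resolves the `--type` string directly to `texec::BatchingType` instead of the removed `TrtGptModelType`. As a rough illustration of the new mapping, here is a hypothetical standalone helper (the commit itself inlines this logic in `main()`); the option strings and enum values are taken from the diff above, while the helper name, the header path, and the error handling are assumptions:

```cpp
#include <stdexcept>
#include <string>

#include "tensorrt_llm/executor/executor.h" // assumed location of texec::BatchingType

namespace texec = tensorrt_llm::executor;

// Hypothetical helper: map the benchmark's --type option onto the executor batching enum.
texec::BatchingType parseBatchingType(std::string const& type)
{
    if (type == "V1" || type == "static")
    {
        // "V1" is still accepted but is slated to be renamed to "static".
        return texec::BatchingType::kSTATIC;
    }
    if (type == "IFB" || type == "inflight")
    {
        // "IFB" is still accepted but is slated to be renamed to "inflight".
        return texec::BatchingType::kINFLIGHT;
    }
    throw std::invalid_argument("Unexpected batching type: " + type);
}
```

The resulting value is forwarded unchanged through `benchmarkExecutor` and `ExecutorServer` into `executorConfig.setBatchingType(batchingType)`, replacing the earlier ternary on `TrtGptModelType::V1`.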

benchmarks/cpp/utils/utils.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ def print_dataset(input_ids, output_lens):
     for i, input_tokens in enumerate(input_ids):
         d = {
             "task_id": i,
-            "logits": input_tokens,
+            "input_ids": input_tokens,
             "output_tokens": output_lens[i]
         }
         print(json.dumps(d, separators=(',', ':'), ensure_ascii=False))

cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ class KvCacheConfig
     std::optional<SizeType32> sinkTokenLength;
     std::optional<float> freeGpuMemoryFraction;
     bool enableBlockReuse;
-    static constexpr auto kDefaultGpuMemFraction = 0.9f;
+    static constexpr auto kDefaultGpuMemFraction = 0.9F;
     bool useUvm;
     std::optional<size_t> hostCacheSize;
     bool onboardBlocks;

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 1 addition & 1 deletion
@@ -835,7 +835,7 @@ class KVCacheManager
             * 2 * modelConfig.getSizePerHead();
     }
 
-    [[nodiscard]] static std::tuple<SizeType32, SizeType32> const calculateMaxNumBlocks(KvCacheConfig const& config,
+    [[nodiscard]] static std::tuple<SizeType32, SizeType32> calculateMaxNumBlocks(KvCacheConfig const& config,
         nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
         tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
 

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 32 additions & 13 deletions
@@ -92,6 +92,8 @@ class GenericLlmRequest
         std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> mropeRotarySinCos = std::nullopt,
+        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
         std::optional<TensorPtr> loraConfig = std::nullopt,
         std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
@@ -131,6 +133,8 @@ class GenericLlmRequest
         , mPositionIds(std::move(positionIds))
         , mPromptEmbeddingTable(std::move(promptEmbeddingTable))
         , mPromptVocabSize(promptVocabSize)
+        , mMropeRotarySinCos(std::move(mropeRotarySinCos))
+        , mMropePositionDeltas(std::move(mropePositionDeltas))
         , mLoraTaskId(loraTaskId)
         , mLoraWeights(std::move(loraWeights))
         , mLoraConfig(std::move(loraConfig))
@@ -188,6 +192,8 @@ class GenericLlmRequest
         , mPositionIds(std::nullopt)
         , mPromptEmbeddingTable(std::nullopt)
         , mPromptVocabSize(std::nullopt)
+        , mMropeRotarySinCos(std::nullopt)
+        , mMropePositionDeltas(std::nullopt)
         , mLoraTaskId(std::nullopt)
         , mLoraWeights(std::nullopt)
         , mLoraConfig(std::nullopt)
@@ -285,6 +291,12 @@ class GenericLlmRequest
                 = std::make_shared<VecTokenExtraIds>(pTuningConfig->getInputTokenExtraIds().value());
             }
         }
+        auto mRopeConfig = req.getMropeConfig();
+        if (mRopeConfig)
+        {
+            mMropeRotarySinCos = executor::detail::toITensor(mRopeConfig.value().getMRopeRotarySinCos());
+            mMropePositionDeltas = mRopeConfig.value().getMRopePositionDeltas();
+        }
 
         auto loraConfig = req.getLoraConfig();
         if (loraConfig)
@@ -447,16 +459,6 @@ class GenericLlmRequest
         mContextPhaseParams = std::move(contextPhaseParams);
     }
 
-    [[nodiscard]] bool isLayerWiseKvCacheEnabled() const
-    {
-        return isContextOnlyRequest() && mLayerWiseKvCacheEnabled;
-    }
-
-    void setLayerWiseKvCacheEnabled(bool enabled)
-    {
-        mLayerWiseKvCacheEnabled = enabled;
-    }
-
     /// @brief Get the state params of the context
     /// @return The state params of the context
     [[nodiscard]] executor::DataTransceiverState const& getDataTransceiverState() const
@@ -798,6 +800,16 @@ class GenericLlmRequest
         return mPromptVocabSize;
     }
 
+    [[nodiscard]] std::optional<TensorPtr> getMropeRotarySinCos() const
+    {
+        return mMropeRotarySinCos;
+    }
+
+    [[nodiscard]] std::optional<SizeType32> getMropePositionDeltas() const
+    {
+        return mMropePositionDeltas;
+    }
+
     [[nodiscard]] std::optional<LoraTaskIdType> getLoraTaskId() const
     {
         return mLoraTaskId;
@@ -1604,6 +1616,8 @@ class GenericLlmRequest
 
     std::optional<TensorPtr> mPromptEmbeddingTable;
     std::optional<SizeType32> mPromptVocabSize;
+    std::optional<TensorPtr> mMropeRotarySinCos;
+    std::optional<SizeType32> mMropePositionDeltas;
 
     std::optional<LoraTaskIdType> mLoraTaskId;
     std::optional<TensorPtr> mLoraWeights;
@@ -1654,7 +1668,6 @@ class GenericLlmRequest
     std::optional<TensorPtr> mCrossAttentionMask; // Input cross attention mask
     LlmRequestType mLlmRequestType;
     std::optional<executor::ContextPhaseParams> mContextPhaseParams;
-    bool mLayerWiseKvCacheEnabled = false;
 
     std::optional<std::shared_ptr<VecTokenExtraIds>> mInputTokenExtraIds;
     BeamUniqueTokens mUniqueTokens;
@@ -1819,6 +1832,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> mropeRotarySinCos = std::nullopt,
+        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
         std::optional<TensorPtr> loraConfig = std::nullopt,
         std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
@@ -1840,7 +1855,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<TensorPtr> skipCrossAttnBlocks = std::nullopt)
         : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
             std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
-            std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
+            std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotarySinCos),
+            std::move(mropePositionDeltas), loraTaskId, std::move(loraWeights), std::move(loraConfig),
             std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits,
             returnGenerationLogits, std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput,
             std::move(logitsPostProcessor), applyLogitsPostProcessorBatched, std::move(encoderInputTokens),
@@ -1857,6 +1873,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<std::vector<SizeType32>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> mropeRotarySinCos = std::nullopt,
+        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
         std::optional<TensorPtr> loraConfig = std::nullopt,
         std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
@@ -1879,7 +1897,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
             std::move(stopWordsList),
             positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value()))
                                     : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),
-            std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
+            std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotarySinCos),
+            std::move(mropePositionDeltas), loraTaskId, std::move(loraWeights), std::move(loraConfig),
             std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits,
             returnGenerationLogits,
             draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))
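
The new mRoPE (multimodal rotary position embedding) inputs follow the same pattern as the existing prompt-tuning fields: optional members that default to `std::nullopt`, populated from the request's `getMropeConfig()` when present, threaded through the constructors right after `promptVocabSize`, and exposed through getters. As a minimal sketch of consuming those getters (not from the commit; the function, the logging calls, and the logger header path are assumptions, while the getter names and the `TensorPtr`/`SizeType32` types come from the diff):

```cpp
#include <optional>

#include "tensorrt_llm/common/logger.h" // assumed location of TLLM_LOG_DEBUG

// Illustrative only: report whether a request carries the new mRoPE inputs.
// RequestT stands in for GenericLlmRequest<runtime::ITensor::SharedPtr>.
template <typename RequestT>
void logMropeInputs(RequestT const& request)
{
    if (auto const sinCos = request.getMropeRotarySinCos())
    {
        // The rotary sin/cos table is carried as a runtime tensor (TensorPtr).
        TLLM_LOG_DEBUG("mRoPE rotary sin/cos tensor with %zu elements", (*sinCos)->getSize());
    }
    if (auto const deltas = request.getMropePositionDeltas())
    {
        // The position delta is a plain SizeType32 scalar.
        TLLM_LOG_DEBUG("mRoPE position deltas: %d", static_cast<int>(*deltas));
    }
}
```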

cpp/include/tensorrt_llm/common/mpiUtils.h

Lines changed: 5 additions & 5 deletions
@@ -35,11 +35,11 @@
 #include <mpi.h>
 #else
 // Dummy defines to avoid #if in wider places.
-typedef int MPI_Datatype;
-typedef int MPI_Comm;
-typedef int MPI_Request;
-typedef int MPI_Message;
-typedef int MPI_Op;
+typedef void* MPI_Datatype;
+typedef void* MPI_Comm;
+typedef void* MPI_Request;
+typedef void* MPI_Message;
+typedef void* MPI_Op;
 
 typedef struct MPI_Status
 {
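
When TensorRT-LLM is built without MPI, this header stubs out the MPI handle types so the rest of the code can avoid `#if` guards; the commit switches those stubs from `int` to `void*`, making them pointer-sized, presumably to line up better with MPI implementations whose handles are themselves pointers to opaque structs (Open MPI, for example). A small illustrative snippet, assuming only the stub typedefs above (the struct and member names are made up for the example):

```cpp
// Compiles against the void* stubs from the #else branch above; in an MPI build
// the same code compiles against the real handle types from <mpi.h> instead.
struct PendingTransfer
{
    MPI_Comm mComm{};       // value-initialized: nullptr with the void* stubs
    MPI_Request mRequest{}; // likewise
};
```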
