
Commit 535c9cc

Update TensorRT-LLM (#2460)
1 parent c629546

File tree: 244 files changed (+6961, −3528 lines). This is a large commit, so only a subset of the changed files is reproduced below.


README.md

Lines changed: 10 additions & 4 deletions
@@ -8,7 +8,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.6.2-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.6.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.16.0.dev-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)
@@ -18,12 +18,18 @@ TensorRT-LLM
 
 ## Latest News
 
+* [2024/11/09] 🚀🚀🚀 3x Faster AllReduce with NVSwitch and TensorRT-LLM MultiShot
+[➡️ link](https://developer.nvidia.com/blog/3x-faster-allreduce-with-nvswitch-and-tensorrt-llm-multishot/)
+<div align="center">
+<img src="https://developer-blogs.nvidia.com/wp-content/uploads/2024/08/HGX-H200-tech-blog-1920x1080-1.jpg" width="50%">
+<div align="left">
+
+* [2024/11/09] ✨ NVIDIA advances the AI ecosystem with the AI model of LG AI Research 🙌
+[➡️ link](https://blogs.nvidia.co.kr/blog/nvidia-lg-ai-research/)
+
 * [2024/11/02] 🌟🌟🌟 NVIDIA and LlamaIndex Developer Contest
 🙌 Enter for a chance to win prizes including an NVIDIA® GeForce RTX™ 4080 SUPER GPU, DLI credits, and more🙌
 [➡️ link](https://developer.nvidia.com/llamaindex-developer-contest)
-<div align="center">
-<img src="docs/source/media/image-11-02-2024.png" width="50%">
-<div align="left">
 
 * [2024/10/28] 🏎️🏎️🏎️ NVIDIA GH200 Superchip Accelerates Inference by 2x in Multiturn Interactions with Llama Models
 [➡️ link](https://developer.nvidia.com/blog/nvidia-gh200-superchip-accelerates-inference-by-2x-in-multiturn-interactions-with-llama-models/)

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 31 additions & 24 deletions
@@ -664,7 +664,7 @@ class ExecutorServer
 {
 public:
     ExecutorServer(std::optional<std::filesystem::path> const& decoderTrtEnginePath,
-        std::optional<std::filesystem::path> const& encoderTrtEnginePath, TrtGptModelType modelType,
+        std::optional<std::filesystem::path> const& encoderTrtEnginePath, texec::BatchingType batchingType,
         int32_t maxBeamWidth, texec::CapacitySchedulerPolicy capacitySchedulerPolicy,
         BenchmarkParams const& benchmarkParams, std::shared_ptr<Recorder> recorder, std::chrono::milliseconds waitSleep,
         bool logIterationData, texec::ModelType executorModelType)
@@ -692,8 +692,7 @@ class ExecutorServer
             maxBeamWidth, schedulerConfig, kvCacheConfig, benchmarkParams.enableChunkedContext, true);
         executorConfig.setGpuWeightsPercent(benchmarkParams.gpuWeightsPercent);
         executorConfig.setPeftCacheConfig(peftCacheConfig);
-        executorConfig.setBatchingType(
-            modelType == TrtGptModelType::V1 ? texec::BatchingType::kSTATIC : texec::BatchingType::kINFLIGHT);
+        executorConfig.setBatchingType(batchingType);
         if (benchmarkParams.maxBatchSize)
         {
             executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
@@ -947,6 +946,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
         std::nullopt, // embeddingBias
         std::nullopt, // speculativeDecoding
         std::nullopt, // pTuning
+        std::nullopt, // mRopeConfig
         loraConfig, // loraConfig
         lookaheadConfig, // lookaheadConfig
         std::nullopt, // kvCacheRetentionConfig
@@ -955,7 +955,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
 }
 
 void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngineDir,
-    std::optional<std::filesystem::path> const& encoderEngineDir, TrtGptModelType modelType,
+    std::optional<std::filesystem::path> const& encoderEngineDir, texec::BatchingType batchingType,
     std::string const& datasetPath, std::string const& opCsvFile, int maxNumSamples, int beamWidth, int warmUp,
     std::optional<int32_t> const& eosId, std::optional<int32_t> const& padId, BenchmarkParams const& benchmarkParams,
     texec::CapacitySchedulerPolicy capacitySchedulerPolicy, std::chrono::milliseconds waitSleep,
@@ -977,16 +977,17 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     {
         TLLM_CHECK_WITH_INFO(
             decoderEngineDir.has_value(), "decoder models require a path to decoder engine in executor benchmark.");
-        executorServer = std::make_shared<ExecutorServer>(decoderEngineDir.value(), std::nullopt, modelType, beamWidth,
-            capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
+        executorServer
+            = std::make_shared<ExecutorServer>(decoderEngineDir.value(), std::nullopt, batchingType, beamWidth,
+                capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
     }
     else if (executorModelType == texec::ModelType::kENCODER_DECODER)
     {
         TLLM_CHECK_WITH_INFO(encoderEngineDir.has_value(),
             "encoder-decoder models require a path to encoder engine in executor benchmark.");
-        executorServer
-            = std::make_shared<ExecutorServer>(decoderEngineDir.value(), encoderEngineDir.value(), modelType, beamWidth,
-                capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
+        executorServer = std::make_shared<ExecutorServer>(decoderEngineDir.value(), encoderEngineDir.value(),
+            batchingType, beamWidth, capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData,
+            executorModelType);
         try
         {
             std::ifstream decoderJsonConfigPath(decoderEngineDir.value() / "config.json");
@@ -1011,8 +1012,9 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
     {
         TLLM_CHECK_WITH_INFO(
             encoderEngineDir.has_value(), "encoder models require a path to encoder engine in executor benchmark.");
-        executorServer = std::make_shared<ExecutorServer>(std::nullopt, encoderEngineDir.value(), modelType, beamWidth,
-            capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
+        executorServer
+            = std::make_shared<ExecutorServer>(std::nullopt, encoderEngineDir.value(), batchingType, beamWidth,
+                capacitySchedulerPolicy, benchmarkParams, recorder, waitSleep, logIterationData, executorModelType);
     }
     else
     {
@@ -1219,8 +1221,9 @@ int main(int argc, char* argv[])
         "encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value<std::string>());
     options.add_options()(
         "api", "API type: gptManager or executor.", cxxopts::value<std::string>()->default_value("executor"));
-    options.add_options()("type", "Batching type: IFB, UIFB (unfused IFB) or V1 (non-IFB) batching.",
-        cxxopts::value<std::string>()->default_value("IFB"));
+    options.add_options()("type",
+        "Batching type: choose between inflight/static. (IFB/V1 options are going to be deprecated)",
+        cxxopts::value<std::string>()->default_value("inflight"));
     options.add_options()("dataset", "Dataset that is used for benchmarking BatchManager.",
         cxxopts::value<std::string>()->default_value(""));
     options.add_options()(
@@ -1332,18 +1335,22 @@ int main(int argc, char* argv[])
 
     // Argument: Batching Type
     auto const type = result["type"].as<std::string>();
-    TrtGptModelType modelType{TrtGptModelType::V1};
-    if (type == "V1")
+    texec::BatchingType batchingType{texec::BatchingType::kINFLIGHT};
+    if (type == "V1" || type == "static")
     {
-        modelType = TrtGptModelType::V1;
-    }
-    else if (type == "UIFB")
-    {
-        modelType = TrtGptModelType::InflightBatching;
+        if (type == "V1")
+        {
+            TLLM_LOG_WARNING("type option \"V1\" is going to be renamed to \"static\".");
+        }
+        batchingType = texec::BatchingType::kSTATIC;
     }
-    else if (type == "IFB")
+    else if (type == "IFB" || type == "inflight")
     {
-        modelType = TrtGptModelType::InflightFusedBatching;
+        if (type == "IFB")
+        {
+            TLLM_LOG_WARNING("type option \"IFB\" is going to be renamed to \"inflight\".");
+        }
+        batchingType = texec::BatchingType::kINFLIGHT;
     }
     else
     {
@@ -1604,7 +1611,7 @@ int main(int argc, char* argv[])
     {
         TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api.");
         TLLM_CHECK_WITH_INFO(
-            modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching.");
+            batchingType == texec::BatchingType::kINFLIGHT, "encoder-decoder only support inflight batching.");
         executorModelType = texec::ModelType::kENCODER_DECODER;
         encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
         decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
@@ -1621,7 +1628,7 @@ int main(int argc, char* argv[])
     }
     try
     {
-        benchmarkExecutor(decoderEngineDir, encoderEngineDir, modelType, datasetPath, opCsvFile, maxNumSamples,
+        benchmarkExecutor(decoderEngineDir, encoderEngineDir, batchingType, datasetPath, opCsvFile, maxNumSamples,
             beamWidth, result["warm_up"].as<int>(), eosId, padId, benchmarkParams, capacitySchedulerPolicy,
             waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, logIterationData,
             maxPromptLen, executorModelType);
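
With this change the benchmark resolves the `--type` string directly to `texec::BatchingType` instead of the removed `TrtGptModelType`. As a rough illustration of the new mapping, here is a hypothetical standalone helper (the commit itself inlines this logic in `main()`); the option strings and enum values are taken from the diff above, while the helper name, the header path, and the error handling are assumptions:

```cpp
#include <stdexcept>
#include <string>

#include "tensorrt_llm/executor/executor.h" // assumed location of texec::BatchingType

namespace texec = tensorrt_llm::executor;

// Hypothetical helper: map the benchmark's --type option onto the executor batching enum.
texec::BatchingType parseBatchingType(std::string const& type)
{
    if (type == "V1" || type == "static")
    {
        // "V1" is still accepted but is slated to be renamed to "static".
        return texec::BatchingType::kSTATIC;
    }
    if (type == "IFB" || type == "inflight")
    {
        // "IFB" is still accepted but is slated to be renamed to "inflight".
        return texec::BatchingType::kINFLIGHT;
    }
    throw std::invalid_argument("Unexpected batching type: " + type);
}
```

The resulting value is forwarded unchanged through `benchmarkExecutor` and `ExecutorServer` into `executorConfig.setBatchingType(batchingType)`, replacing the earlier ternary on `TrtGptModelType::V1`.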

benchmarks/cpp/utils/utils.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ def print_dataset(input_ids, output_lens):
     for i, input_tokens in enumerate(input_ids):
         d = {
             "task_id": i,
-            "logits": input_tokens,
+            "input_ids": input_tokens,
             "output_tokens": output_lens[i]
         }
         print(json.dumps(d, separators=(',', ':'), ensure_ascii=False))

cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ class KvCacheConfig
     std::optional<SizeType32> sinkTokenLength;
     std::optional<float> freeGpuMemoryFraction;
     bool enableBlockReuse;
-    static constexpr auto kDefaultGpuMemFraction = 0.9f;
+    static constexpr auto kDefaultGpuMemFraction = 0.9F;
     bool useUvm;
     std::optional<size_t> hostCacheSize;
     bool onboardBlocks;

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 1 addition & 1 deletion
@@ -835,7 +835,7 @@ class KVCacheManager
             * 2 * modelConfig.getSizePerHead();
     }
 
-    [[nodiscard]] static std::tuple<SizeType32, SizeType32> const calculateMaxNumBlocks(KvCacheConfig const& config,
+    [[nodiscard]] static std::tuple<SizeType32, SizeType32> calculateMaxNumBlocks(KvCacheConfig const& config,
         nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
         tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
 

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 32 additions & 13 deletions
@@ -92,6 +92,8 @@ class GenericLlmRequest
         std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> mropeRotarySinCos = std::nullopt,
+        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
         std::optional<TensorPtr> loraConfig = std::nullopt,
         std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
@@ -131,6 +133,8 @@ class GenericLlmRequest
         , mPositionIds(std::move(positionIds))
         , mPromptEmbeddingTable(std::move(promptEmbeddingTable))
         , mPromptVocabSize(promptVocabSize)
+        , mMropeRotarySinCos(std::move(mropeRotarySinCos))
+        , mMropePositionDeltas(std::move(mropePositionDeltas))
         , mLoraTaskId(loraTaskId)
         , mLoraWeights(std::move(loraWeights))
         , mLoraConfig(std::move(loraConfig))
@@ -188,6 +192,8 @@ class GenericLlmRequest
         , mPositionIds(std::nullopt)
         , mPromptEmbeddingTable(std::nullopt)
         , mPromptVocabSize(std::nullopt)
+        , mMropeRotarySinCos(std::nullopt)
+        , mMropePositionDeltas(std::nullopt)
         , mLoraTaskId(std::nullopt)
         , mLoraWeights(std::nullopt)
         , mLoraConfig(std::nullopt)
@@ -285,6 +291,12 @@ class GenericLlmRequest
                 = std::make_shared<VecTokenExtraIds>(pTuningConfig->getInputTokenExtraIds().value());
             }
         }
+        auto mRopeConfig = req.getMropeConfig();
+        if (mRopeConfig)
+        {
+            mMropeRotarySinCos = executor::detail::toITensor(mRopeConfig.value().getMRopeRotarySinCos());
+            mMropePositionDeltas = mRopeConfig.value().getMRopePositionDeltas();
+        }
 
         auto loraConfig = req.getLoraConfig();
         if (loraConfig)
@@ -447,16 +459,6 @@ class GenericLlmRequest
         mContextPhaseParams = std::move(contextPhaseParams);
     }
 
-    [[nodiscard]] bool isLayerWiseKvCacheEnabled() const
-    {
-        return isContextOnlyRequest() && mLayerWiseKvCacheEnabled;
-    }
-
-    void setLayerWiseKvCacheEnabled(bool enabled)
-    {
-        mLayerWiseKvCacheEnabled = enabled;
-    }
-
     /// @brief Get the state params of the context
     /// @return The state params of the context
     [[nodiscard]] executor::DataTransceiverState const& getDataTransceiverState() const
@@ -798,6 +800,16 @@ class GenericLlmRequest
         return mPromptVocabSize;
     }
 
+    [[nodiscard]] std::optional<TensorPtr> getMropeRotarySinCos() const
+    {
+        return mMropeRotarySinCos;
+    }
+
+    [[nodiscard]] std::optional<SizeType32> getMropePositionDeltas() const
+    {
+        return mMropePositionDeltas;
+    }
+
     [[nodiscard]] std::optional<LoraTaskIdType> getLoraTaskId() const
     {
         return mLoraTaskId;
@@ -1604,6 +1616,8 @@ class GenericLlmRequest
 
     std::optional<TensorPtr> mPromptEmbeddingTable;
     std::optional<SizeType32> mPromptVocabSize;
+    std::optional<TensorPtr> mMropeRotarySinCos;
+    std::optional<SizeType32> mMropePositionDeltas;
 
     std::optional<LoraTaskIdType> mLoraTaskId;
     std::optional<TensorPtr> mLoraWeights;
@@ -1654,7 +1668,6 @@ class GenericLlmRequest
     std::optional<TensorPtr> mCrossAttentionMask; // Input cross attention mask
     LlmRequestType mLlmRequestType;
     std::optional<executor::ContextPhaseParams> mContextPhaseParams;
-    bool mLayerWiseKvCacheEnabled = false;
 
     std::optional<std::shared_ptr<VecTokenExtraIds>> mInputTokenExtraIds;
     BeamUniqueTokens mUniqueTokens;
@@ -1819,6 +1832,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<std::shared_ptr<std::vector<SizeType32>>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> mropeRotarySinCos = std::nullopt,
+        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
         std::optional<TensorPtr> loraConfig = std::nullopt,
         std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
@@ -1840,7 +1855,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<TensorPtr> skipCrossAttnBlocks = std::nullopt)
         : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
             std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
-            std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
+            std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotarySinCos),
+            std::move(mropePositionDeltas), loraTaskId, std::move(loraWeights), std::move(loraConfig),
             std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits,
             returnGenerationLogits, std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput,
             std::move(logitsPostProcessor), applyLogitsPostProcessorBatched, std::move(encoderInputTokens),
@@ -1857,6 +1873,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
         std::optional<std::vector<SizeType32>> positionIds = std::nullopt,
         std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
         std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<TensorPtr> mropeRotarySinCos = std::nullopt,
+        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
         std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
         std::optional<TensorPtr> loraConfig = std::nullopt,
         std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
@@ -1879,7 +1897,8 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
             std::move(stopWordsList),
             positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value()))
                                     : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),
-            std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
+            std::move(promptEmbeddingTable), promptVocabSize, std::move(mropeRotarySinCos),
+            std::move(mropePositionDeltas), loraTaskId, std::move(loraWeights), std::move(loraConfig),
             std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits,
             returnGenerationLogits,
             draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))
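
The new mRoPE (multimodal rotary position embedding) inputs follow the same pattern as the existing prompt-tuning fields: optional members that default to `std::nullopt`, populated from the request's `getMropeConfig()` when present, threaded through the constructors right after `promptVocabSize`, and exposed through getters. As a minimal sketch of consuming those getters (not from the commit; the function, the logging calls, and the logger header path are assumptions, while the getter names and the `TensorPtr`/`SizeType32` types come from the diff):

```cpp
#include <optional>

#include "tensorrt_llm/common/logger.h" // assumed location of TLLM_LOG_DEBUG

// Illustrative only: report whether a request carries the new mRoPE inputs.
// RequestT stands in for GenericLlmRequest<runtime::ITensor::SharedPtr>.
template <typename RequestT>
void logMropeInputs(RequestT const& request)
{
    if (auto const sinCos = request.getMropeRotarySinCos())
    {
        // The rotary sin/cos table is carried as a runtime tensor (TensorPtr).
        TLLM_LOG_DEBUG("mRoPE rotary sin/cos tensor with %zu elements", (*sinCos)->getSize());
    }
    if (auto const deltas = request.getMropePositionDeltas())
    {
        // The position delta is a plain SizeType32 scalar.
        TLLM_LOG_DEBUG("mRoPE position deltas: %d", static_cast<int>(*deltas));
    }
}
```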

cpp/include/tensorrt_llm/common/mpiUtils.h

Lines changed: 5 additions & 5 deletions
@@ -35,11 +35,11 @@
 #include <mpi.h>
 #else
 // Dummy defines to avoid #if in wider places.
-typedef int MPI_Datatype;
-typedef int MPI_Comm;
-typedef int MPI_Request;
-typedef int MPI_Message;
-typedef int MPI_Op;
+typedef void* MPI_Datatype;
+typedef void* MPI_Comm;
+typedef void* MPI_Request;
+typedef void* MPI_Message;
+typedef void* MPI_Op;
 
 typedef struct MPI_Status
 {
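
When TensorRT-LLM is built without MPI, this header stubs out the MPI handle types so the rest of the code can avoid `#if` guards; the commit switches those stubs from `int` to `void*`, making them pointer-sized, presumably to line up better with MPI implementations whose handles are themselves pointers to opaque structs (Open MPI, for example). A small illustrative snippet, assuming only the stub typedefs above (the struct and member names are made up for the example):

```cpp
// Compiles against the void* stubs from the #else branch above; in an MPI build
// the same code compiles against the real handle types from <mpi.h> instead.
struct PendingTransfer
{
    MPI_Comm mComm{};       // value-initialized: nullptr with the void* stubs
    MPI_Request mRequest{}; // likewise
};
```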
