pytorch
diff --git a/CMakeLists.txt
+6-2 b/CMakeLists.txt
+6-2
diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm
+5-1 b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm
+5-1
diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp
+2-4 b/examples/mediatek/executor_runner/mtk_llama_runner.cpp
+2-4
diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h
+2-4 b/examples/mediatek/executor_runner/mtk_llama_runner.h
+2-4
diff --git a/examples/models/llama/main.cpp
+9-4 b/examples/models/llama/main.cpp
+9-4
diff --git a/examples/models/llama/runner/runner.cpp
+39-42 b/examples/models/llama/runner/runner.cpp
+39-42
diff --git a/examples/models/llama/runner/runner.h
+6-10 b/examples/models/llama/runner/runner.h
+6-10
diff --git a/examples/models/llava/runner/llava_runner.cpp
+9-3 b/examples/models/llava/runner/llava_runner.cpp
+9-3
diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h
+2-5 b/examples/models/llava/runner/llava_text_decoder_runner.h
+2-5
diff --git a/extension/android/jni/jni_layer_llama.cpp
+7-6 b/extension/android/jni/jni_layer_llama.cpp
+7-6
@@ -761,12 +761,16 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_MODULE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_LLM)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizers)
 endif()
 
-if(EXECUTORCH_BUILD_EXTENSION_MODULE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
+if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
 
@@ -12,6 +12,7 @@
 #import <executorch/examples/models/llama/runner/runner.h>
 #import <executorch/examples/models/llava/runner/llava_runner.h>
 
+using executorch::extension::llm::GenerationConfig;
 using executorch::extension::llm::Image;
 using executorch::runtime::Error;
 
@@ -61,8 +62,11 @@ - (BOOL)generate:(NSString*)prompt
        sequenceLength:(NSInteger)seq_len
     withTokenCallback:(nullable void (^)(NSString*))callback
                 error:(NSError**)error {
+  const GenerationConfig config{
+    .seq_len = static_cast<int32_t>(seq_len)
+  };
   const auto status = _runner->generate(
-      prompt.UTF8String, seq_len, [callback](const std::string& token) {
+      prompt.UTF8String, config, [callback](const std::string& token) {
         callback(@(token.c_str()));
       });
   if (status != Error::Ok) {
 
@@ -80,11 +80,9 @@ bool MTKLlamaRunner::is_loaded() const {
 
 Error MTKLlamaRunner::generate(
     const std::string& prompt,
-    int32_t seq_len,
+    const executorch::extension::llm::GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
-    std::function<void(const Stats&)> stats_callback,
-    bool echo,
-    bool warming) {
+    std::function<void(const Stats&)> stats_callback) {
   if (!is_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(load());
   }
 
@@ -43,11 +43,12 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner {
   Error load();
+  // Generates text for `prompt` as configured by `config`. The config is
+  // taken by const reference to match the generate() signature that
+  // example::Runner declares as an override of the same IRunner interface.
   Error generate(
       const std::string& prompt,
-      int32_t seq_len = 128,
+      const executorch::extension::llm::GenerationConfig& config,
       std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const Stats&)> stats_callback = {},
-      bool echo = true,
-      bool warming = false);
+      std::function<void(const Stats&)> stats_callback = {});
   void stop();
 
   LlamaModelOptions get_model_options();
 
@@ -53,7 +53,7 @@ int32_t main(int32_t argc, char** argv) {
 
   const char* prompt = FLAGS_prompt.c_str();
 
-  double temperature = FLAGS_temperature;
+  float temperature = FLAGS_temperature;
 
   int32_t seq_len = FLAGS_seq_len;
 
@@ -73,13 +73,18 @@ int32_t main(int32_t argc, char** argv) {
   }
 #endif
   // create llama runner
-  example::Runner runner(model_path, tokenizer_path, temperature);
+  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+  example::Runner runner(model_path, tokenizer_path);
 
   if (warmup) {
-    runner.warmup(prompt, seq_len);
+    // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+    runner.warmup(prompt, /*max_new_tokens=*/seq_len);
   }
   // generate
-  runner.generate(prompt, seq_len);
+  executorch::extension::llm::GenerationConfig config{
+      .seq_len = seq_len, .temperature = temperature};
+  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+  runner.generate(prompt, config);
 
   return 0;
 }
@@ -41,13 +41,11 @@ static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 Runner::Runner(
     const std::string& model_path,
     const std::string& tokenizer_path,
-    const float temperature,
     std::optional<const std::string> data_path)
     // NOTE: we observed ~2x loading performance increase on iPhone 15
     // and a ~5% improvement on Galaxy S22 by switching to
     // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-    : temperature_(temperature),
-      tokenizer_path_(tokenizer_path),
+    : tokenizer_path_(tokenizer_path),
       metadata_({
           {kEnableDynamicShape, false},
           {kMaxSeqLen, 128},
@@ -133,11 +131,9 @@ Error Runner::load() {
       ET_LOG(Info, "eos_id = %" PRId64, value);
     }
   }
+  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
   text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
-      module_.get(),
-      metadata_.at(kUseKVCache),
-      metadata_.at(kVocabSize),
-      temperature_);
+      module_.get(), metadata_.at(kUseKVCache));
   text_prefiller_ = std::make_unique<llm::TextPrefiller>(
       text_decoder_runner_.get(),
       metadata_.at(kUseKVCache),
@@ -164,11 +160,9 @@ Error Runner::load() {
 
 Error Runner::generate(
     const std::string& prompt,
-    int32_t seq_len,
+    const ::executorch::extension::llm::GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
-    std::function<void(const llm::Stats&)> stats_callback,
-    bool echo,
-    bool warmup) {
+    std::function<void(const llm::Stats&)> stats_callback) {
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
@@ -178,19 +172,19 @@ Error Runner::generate(
     stats_.model_load_end_ms = llm::time_in_ms();
   }
 
-  if (warmup) {
+  if (config.warming) {
     ET_LOG(Info, "Doing a warmup run...");
   }
 
   RUNNER_ET_LOG(
-      warmup,
+      config.warming,
       "RSS after loading model: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
 
   // Wrap the token_callback with print function
   std::function<void(const std::string&)> wrapped_callback =
-      [token_callback, warmup](const std::string& piece) {
-        if (!warmup) {
+      [token_callback, warming = config.warming](const std::string& piece) {
+        if (!warming) {  // only the flag is captured, not the whole config
           llm::safe_printf(piece.c_str());
           fflush(stdout);
         }
@@ -204,11 +198,6 @@ Error Runner::generate(
   stats_.inference_start_ms = llm::time_in_ms();
   shouldStop_ = false;
 
-  // Set the sequence length to the max seq length if not provided
-  seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxContextLen))
-      ? seq_len
-      : metadata_.at(kMaxContextLen);
-
   ::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
       prompt,
       /* bos */ 0,
@@ -225,21 +214,22 @@ Error Runner::generate(
   ET_CHECK_MSG(
       num_prompt_tokens < metadata_.at(kMaxContextLen),
       "num_prompt_tokens %d >= max_seq_len_ %" PRId64
-      ", Max seq length exceeded - please increase max seq len value in .../llama2/model.py",
+      ", Max seq length exceeded - please increase max seq len value in your export script",
       num_prompt_tokens,
       metadata_.at(kMaxContextLen));
-  ET_CHECK_MSG(
-      num_prompt_tokens < seq_len,
-      "num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - please increase the seq_len value passed to generate()",
-      num_prompt_tokens,
-      seq_len);
+
+  // Determine max_new_tokens using the GenerationConfig's resolve method
+  int max_new_tokens = config.resolve_max_new_tokens(
+      metadata_.at(kMaxContextLen), num_prompt_tokens);
+
+  ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens);
 
   // Prefill first
   // Here feed all tokens to the model and get the next predicted token
   // after the prompt. After that we will enter generate loop.
 
   // print prompts
-  if (echo) {
+  if (config.echo) {
     wrapped_callback(prompt);
   }
   int64_t pos = 0;
@@ -253,32 +243,38 @@ Error Runner::generate(
   wrapped_callback(
       ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
   RUNNER_ET_LOG(
-      warmup,
+      config.warming,
       "RSS after prompt prefill: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
 
   // start the main loop
   prompt_tokens.push_back(cur_token);
+
+  // Generate max_new_tokens - 1 because prefill already generated 1 token.
   int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
-      prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));
+      prompt_tokens,
+      num_prompt_tokens,
+      max_new_tokens - 1,
+      config.temperature,
+      wrapped_callback));
 
   stats_.inference_end_ms = llm::time_in_ms();
-  if (!warmup) {
+  if (!config.warming) {
     printf("\n");
   }
   RUNNER_ET_LOG(
-      warmup,
+      config.warming,
       "RSS after finishing text generation: %f MiB (0 if unsupported)",
       llm::get_rss_bytes() / 1024.0 / 1024.0);
 
-  if (num_prompt_tokens + num_generated_tokens == seq_len) {
-    RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len);
+  if (num_generated_tokens + 1 >= max_new_tokens) {  // +1: prefill's token
+    RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
   }
 
   stats_.num_prompt_tokens = num_prompt_tokens;
   stats_.num_generated_tokens = num_generated_tokens;
 
-  if (warmup) {
+  if (config.warming) {
     ET_LOG(Info, "Warmup run finished!");
   } else {
     // Do not print report during warmup
@@ -291,14 +287,15 @@ Error Runner::generate(
   return Error::Ok;
 }
 
-Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
-  Error err = generate(
-      prompt,
-      seq_len,
-      /*token_callback=*/nullptr,
-      /*stats_callbak=*/nullptr,
-      /*echo=*/false,
-      /*warmup=*/true);
+Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) {
+  // Warmup is generate() with prompt echo off and the warming flag set.
+  const llm::GenerationConfig warmup_config{
+      .echo = false,
+      .max_new_tokens = max_new_tokens,
+      .warming = true,
+  };
+  Error err = generate(prompt, warmup_config);
+  // Drop whatever stats the warmup run recorded.
   stats_.reset();
   return err;
 }
 
@@ -33,26 +33,22 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
   explicit Runner(
       const std::string& model_path,
       const std::string& tokenizer_path,
-      const float temperature = 0.8f,
       std::optional<const std::string> data_path = std::nullopt);
 
-  bool is_loaded() const;
-  ::executorch::runtime::Error load();
+  bool is_loaded() const override;
+  ::executorch::runtime::Error load() override;
   ::executorch::runtime::Error generate(
       const std::string& prompt,
-      int32_t seq_len = 128,
+      const ::executorch::extension::llm::GenerationConfig& config,
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const ::executorch::extension::llm::Stats&)>
-          stats_callback = {},
-      bool echo = true,
-      bool warming = false);
+          stats_callback = {}) override;
   ::executorch::runtime::Error warmup(
       const std::string& prompt,
-      int32_t seq_len = 128);
-  void stop();
+      int32_t max_new_tokens);
+  void stop() override;
 
  private:
-  float temperature_;
   bool shouldStop_{false};
 
   // model
 
@@ -47,8 +47,10 @@ Error LlavaRunner::load() {
   tokenizer_->load(tokenizer_path_);
 
   // Load the text decoder runner
-  text_decoder_runner_ = std::make_unique<LlavaTextDecoderRunner>(
-      module_.get(), tokenizer_->vocab_size(), temperature_);
+  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+  text_decoder_runner_ =
+      std::make_unique<LlavaTextDecoderRunner>(module_.get());
+  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
   text_decoder_runner_->load();
 
   // Load the text prefiller
@@ -117,7 +119,11 @@ Error LlavaRunner::generate_from_pos(
 
   // Generate tokens
   int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
-      {prefill_next_token}, start_pos, seq_len, token_callback));
+      /*tokens=*/{prefill_next_token},
+      /*start_pos=*/start_pos,
+      /*max_new_tokens=*/seq_len - start_pos + 1,
+      /*temperature=*/temperature_,
+      /*token_callback=*/token_callback));
 
   // Bookkeeping
   stats_.num_generated_tokens = num_generated_tokens;
 
@@ -17,11 +17,10 @@ namespace example {
 class ET_EXPERIMENTAL LlavaTextDecoderRunner
     : public executorch::extension::llm::TextDecoderRunner {
  public:
-  LlavaTextDecoderRunner(
-      executorch::extension::Module* module,
-      int32_t vocab_size,
-      float temperature)
-      : TextDecoderRunner(module, true, vocab_size, temperature){};
+  // Wraps `module` for LLaVA text decoding. `explicit` keeps a bare Module*
+  // from converting implicitly into a runner.
+  explicit LlavaTextDecoderRunner(executorch::extension::Module* module)
+      : TextDecoderRunner(module, true) {}
 
   inline executorch::runtime::Result<executorch::aten::Tensor> step(
       executorch::extension::TensorPtr& tokens,
 
@@ -169,13 +169,11 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
         runner_ = std::make_unique<example::Runner>(
             model_path->toStdString().c_str(),
             tokenizer_path->toStdString().c_str(),
-            temperature,
             data_path->toStdString().c_str());
       } else {
         runner_ = std::make_unique<example::Runner>(
             model_path->toStdString().c_str(),
-            tokenizer_path->toStdString().c_str(),
-            temperature);
+            tokenizer_path->toStdString().c_str());
       }
 #if defined(EXECUTORCH_BUILD_MEDIATEK)
     } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) {
@@ -219,12 +217,15 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
           [callback](const llm::Stats& result) { callback->onStats(result); },
           echo);
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
+      executorch::extension::llm::GenerationConfig config{
+          .echo = static_cast<bool>(echo),
+          .seq_len = seq_len,
+      };
       runner_->generate(
           prompt->toStdString(),
-          seq_len,
+          config,
           [callback](std::string result) { callback->onResult(result); },
-          [callback](const llm::Stats& result) { callback->onStats(result); },
-          echo);
+          [callback](const llm::Stats& result) { callback->onStats(result); });
     }
     return 0;
   }